# for emacs: -*- mode: sh; -*-

# This file describes how we made the browser database on
# NCBI build 37 (February 2009 freeze) aka:
#	GRCh37 - Genome Reference Consortium Human Reference 37
#	Assembly Accession: GCA_000001405.1

#	"$Id: hg19.txt,v 1.118 2010/06/10 16:34:40 chinhli Exp $";

#############################################################################

# NOTE FOR NEXT HUMAN ASSEMBLY (2009-07-29 - Brooke): hg19 contains the wrong
# sequence for chrM. The accession NC_001807 was replaced in GenBank with
# NC_012920, with the note: "This sequence was removed since the accepted
# reference sequence for the Homo sapiens mitochondrion is the rCRS/Mitomap
# sequence, which is now available as the record NC_012920".
# Also, from http://www.mitomap.org/mitoseq.html:
# "IMPORTANT:  Do not use NC_001807 as "the rCRS" as it is an African
# (Yoruban) sequence with over 40 variant nucleotides from the rCRS. As of
# July 8, 2009 it has been removed from GenBank as a reference sequence but
# may be found, if needed, as  AF347015, one of 53 African sequence deposited
# in Genbank by Ingman et al in 2001."
# Use NC_012920 for the chrM sequence for the next build!

# Download sequence (DONE - 2009-02-04 - Hiram)
    mkdir -p /hive/data/genomes/hg19/download
    cd /hive/data/genomes/hg19/download
    mkdir -p assembled_chromosomes
    wget --cut-dirs=8 --no-parent --timestamping --no-remove-listing -m \
        --directory-prefix=assembled_chromosomes \
        -nH --ftp-user=anonymous --ftp-password=yourEmail@your.domain \
ftp://ftp.ncbi.nlm.nih.gov/genbank/genomes/Eukaryotes/vertebrates_mammals/Homo_sapiens/GRCh37/Primary_Assembly/assembled_chromosomes

    mkdir -p alternate_loci
for N in 1 2 3 4 5 6 7 8 9
do
wget --cut-dirs=6 --no-parent --timestamping --no-remove-listing -m \
    --directory-prefix=alternate_loci \
        -nH --ftp-user=anonymous --ftp-password=yourEmail@your.domain \
ftp://ftp.ncbi.nlm.nih.gov/genbank/genomes/Eukaryotes/vertebrates_mammals/Homo_sapiens/GRCh37/ALT_REF_LOCI_${N}
done

    mkdir -p unlocalized_scaffolds
    wget --cut-dirs=8 --no-parent --timestamping --no-remove-listing -m \
        --directory-prefix=unlocalized_scaffolds \
	    -nH --ftp-user=anonymous --ftp-password=yourEmail@your.domain \
ftp://ftp.ncbi.nlm.nih.gov/genbank/genomes/Eukaryotes/vertebrates_mammals/Homo_sapiens/GRCh37/Primary_Assembly/unlocalized_scaffolds

    mkdir -p unplaced_scaffolds
    wget --cut-dirs=8 --no-parent --timestamping --no-remove-listing -m \
        --directory-prefix=unplaced_scaffolds \
	    -nH --ftp-user=anonymous --ftp-password=yourEmail@your.domain \
ftp://ftp.ncbi.nlm.nih.gov/genbank/genomes/Eukaryotes/vertebrates_mammals/Homo_sapiens/GRCh37/Primary_Assembly/unplaced_scaffolds

    mkdir -p placed_scaffolds
    wget --cut-dirs=8 --no-parent --timestamping --no-remove-listing -m \
        --directory-prefix=placed_scaffolds \
	    -nH --ftp-user=anonymous --ftp-password=hiram@soe.ucsc.edu \
ftp://ftp.ncbi.nlm.nih.gov/genbank/genomes/Eukaryotes/vertebrates_mammals/Homo_sapiens/GRCh37/Primary_Assembly/placed_scaffolds

    mkdir ucscChr
    cd ucscChr
    for F in ../assembled_chromosomes/FASTA/chr*.fa
do
    C=`basename $F`
    C=${C/.fa}
    echo -n "${C} "
    H=`head -1 "${F}"`
    chrN=`echo $H | sed -e "s/.*Homo sapiens chromosome /chr/; s/, .*//"`
    A=`echo $H | sed -e "s/. Homo.*//; s/.*gb.//"`
    echo $chrN $A
    grep -v "^#" ../assembled_chromosomes/AGP/${chrN}.comp.agp \
        | sed -e "s/^${A}/${chrN}/" > ${chrN}.agp
    echo ">${chrN}" > ${chrN}.fa
    grep -v "^>" ../assembled_chromosomes/FASTA/${chrN}.fa >> ${chrN}.fa
done

    rm -f scaffolds.agp
    find ../alternate_loci -type f | grep ".agp$" | while read F
do
    grep "^GL" $F | sed -e \
"s/^GL000250.1/chr6_apd_hap1/" -e \
"s/^GL000251.1/chr6_cox_hap2/" -e \
"s/^GL000252.1/chr6_dbb_hap3/" -e \
"s/^GL000253.1/chr6_mann_hap4/" -e \
"s/^GL000254.1/chr6_mcf_hap5/" -e \
"s/^GL000255.1/chr6_qbl_hap6/" -e \
"s/^GL000256.1/chr6_ssto_hap7/" -e \
"s/^GL000257.1/chr4_ctg9_hap1/" -e \
"s/^GL000258.1/chr17_ctg5_hap1/"
done > scaffolds.agp

    find ../unlocalized_scaffolds -type f | grep ".agp$" \
| while read F
do
    C=`basename ${F}`
    C=${C/.unlocalized.scaf.agp}
    grep "^GL" ${F} | sed -e "s/^GL\([0-9]*\).1/${C}_gl\1_random/"
done >> scaffolds.agp

    find ../unplaced_scaffolds -type f | grep ".agp$" \
| while read F
do
    grep "^GL" ${F} | sed -e "s/^GL\([0-9]*\).1/chrUn_gl\1/"
done >> scaffolds.agp

    rm -f scaffolds.fa
    find ../alternate_loci -type f | grep ".fa$" | while read F
do
    sed -e \
"s/>.*GL000250.*/>chr6_apd_hap1/" -e \
"s/>.*GL000251.*/>chr6_cox_hap2/" -e \
"s/>.*GL000252.*/>chr6_dbb_hap3/" -e \
"s/>.*GL000253.*/>chr6_mann_hap4/" -e \
"s/>.*GL000254.*/>chr6_mcf_hap5/" -e \
"s/>.*GL000255.*/>chr6_qbl_hap6/" -e \
"s/>.*GL000256.*/>chr6_ssto_hap6/" -e \
"s/>.*GL000257.*/>chr4_ctg9_hap1/" -e \
"s/>.*GL000258.*/>chr17_ctg5_hap1/" ${F}
done > scaffolds.fa

    find ../unlocalized_scaffolds -type f | grep ".fa$" | while read F
do
    sed -e \
"s/^>.*GL\([0-9]*\).* chromosome \([0-9]*\).*/>chr\2_gl\1_random/" ${F}
done >> scaffolds.fa

    find ../unplaced_scaffolds -type f | grep ".fa$" | while read F
do
    sed -e "s/.*\(GL[0-9]*\).*/\1/; s/GL/>chrUn_gl/" $F
done >> scaffolds.fa


############################################################################
## Create database (DONE - 2009-03-04 - Hiram)
    cd /hive/data/genomes/hg19
    cat << '_EOF_' > hg19.config.ra
# Config parameters for makeGenomeDb.pl:
db hg19
scientificName Homo sapiens
commonName Human
assemblyDate Feb. 2009
assemblyLabel GRCh37 Genome Reference Consortium Human Reference 37 (GCA_000001405.1)
orderKey 14
mitoAcc NC_001807
fastaFiles /hive/data/genomes/hg19/download/ucscChr/*.fa
agpFiles /hive/data/genomes/hg19/download/ucscChr/*.agp
# qualFiles /dev/null
dbDbSpeciesDir human
taxId	9606
'_EOF_'
    # << happy emacs

    time makeGenomeDb.pl hg19.config.ra > makeGenomeDb.log 2>&1
    #	real    14m8.958s
     featureBits -countGaps hg19 gap
    #	239845127 bases of 3137161264 (7.645%) in intersection
    featureBits -noRandom -noHap -countGaps hg19 gap
    #	234344806 bases of 3095693983 (7.570%) in intersection
    #	verify featureBits is properly ignorning haps and randoms:
    egrep -v "_" chrom.sizes | awk '{sum+=$2;print sum,$0}'
    #	3095693983 chrM 16571
    #	same total as in featureBits

    #	much later on, discovered that we needed a chrM definition in the
    #	agp files, added by hand to hg19/M/chrM.agp and hg19/hg19.agp the line:
# chrM    1       16571   1       F       NC001807        1       16571   +
    #	the spaces there are tabs

############################################################################
# running repeat masker (DONE - 2009-03-05 - Hiram)
    screen # use screen to manage this day-long job
    mkdir /hive/data/genomes/hg19/bed/repeatMasker
    cd /hive/data/genomes/hg19/bed/repeatMasker
    time doRepeatMasker.pl -bigClusterHub=swarm -buildDir=`pwd` hg19 \
	> do.log 2>&1
    #	real    525m23.521s
    cat faSize.rmsk.txt
    #	3137161264 bases (239850802 N's 2897310462 real 1431585691
    #	upper 1465724771 lower) in 93 sequences in 1 files
    #	%46.72 masked total, %50.59 masked real
    featureBits -countGaps hg19 rmsk
    #	1465724774 bases of 3137161264 (46.721%) in intersection
    #	this is odd, 3 bases more in featureBits than were masked ?
    #	check it out, make a bed file from the featureBits:
    featureBits -countGaps -bed=rmsk.bed hg19 rmsk
    #	went down a sequence of intersections with this idea, but could
    #	not get it resolved.  It appears there are 75 bases in the rmsk
    #	table that were not masked in the 2bit file ?
    #	Later on, realized that featureBits does not count lower case N's
    #	in the "lower" category, but only in the N's category.

    #	trying a non-split table:
    hgsql -e "show tables;" hg19 | grep _rmsk | while read T
do
    hgsql -e "drop table ${T};" hg19
done
    hgLoadOut -nosplit -verbose=2 -table=rmsk hg19 hg19.fa.out
bad rep range [4385, 4384] line 1348605 of hg19.fa.out
bad rep range [5563, 5562] line 1563988 of hg19.fa.out
bad rep range [4539, 4538] line 3111186 of hg19.fa.out
    #	featureBits still reports 1465724774 bases in rmsk table
    #	cleaning the hg19.fa.out file:
    cp hg19.fa.out hg19.clean.out
    # edit hg19.clean.out and remove the three lines:
# 1467  20.7  1.2 17.6  chr14     35056767 35056794 (72292746) +  L1ME1          LINE/L1               4385 4384 (1761) 1120962
# 1943  23.8  5.0 12.6  chr15     65775909 65775924 (36755468) +  L1MC4          LINE/L1               5563 5562 (2480) 1299299
# 2463  25.1  5.0 11.6  chr3      121291056 121291083 (76731347) +  L1M3           LINE/L1               4539 4538 (1608) 2589267

    #	reload the table
    hgsql -e "drop table rmsk;" hg19
    hgLoadOut -nosplit -verbose=2 -table=rmsk hg19 hg19.clean.out

    #	try masking with this clean file:
    twoBitMask /hive/data/genomes/hg19/hg19.unmasked.2bit hg19.clean.out \
	hg19.clean.2bit
    twoBitToFa hg19.clean.2bit stdout | faSize stdin > faSize.clean.txt
    cat faSize.clean.txt
    #	this gives the lower by 75 bases result:
    #	3137161264 bases (239850802 N's 2897310462 real 1431585763 upper
    #	1465724699 lower) in 93 sequences in 1 files
    #	%46.72 masked total, %50.59 masked real
    featureBits -countGaps hg19 rmsk
    #	1465724774 bases of 3137161264 (46.721%) in intersection
    #	is the countGaps interferring ?
    featureBits hg19 rmsk
    #	1465724774 bases of 2897316137 (50.589%) in intersection
    #	nope, lets' see what the .out file has:
    grep chr hg19.clean.out | sed -e "s/^  *//" | awk '{print $5,$6-1,$7}' \
	| sort -k1,1 -k2,2n > hg19.clean.out.bed
    featureBits -countGaps hg19 hg19.clean.out.bed
    #	1465724774 bases of 3137161264 (46.721%) in intersection
    #	is it perhaps not masking N's ?
    twoBitToFa hg19.clean.2bit stdout | grep n | less
    #	that does find some lower case n's, find all N's:
    findMotif -strand=+ -motif=gattaca -verbose=4 hg19.clean.2bit \
	2> findMotif.out
    grep "^#GAP" findMotif.out | sed -e "s/#GAP //" > nLocations.bed
    #	which cover:
    featureBits -countGaps hg19 nLocations.bed
    #	251299071 bases of 3137161264 (8.010%) in intersection
    #	overlapping rmsk business with these N locations:
    featureBits -countGaps hg19 hg19.clean.out.bed nLocations.bed
    #	6494740 bases of 3137161264 (0.207%) in intersection
    #	and overlapping with gap:
    featureBits -countGaps hg19 gap nLocations.bed
    #	239845127 bases of 3137161264 (7.645%) in intersection

############################################################################
# running TRF simple repeats (DONE - 2009-03-05 - Hiram)
    screen # use screen to manage this day-long job
    mkdir /hive/data/genomes/hg19/bed/simpleRepeat
    cd /hive/data/genomes/hg19/bed/simpleRepeat
    time doSimpleRepeat.pl -bigClusterHub=pk -workhorse=hgwdev \
	-smallClusterHub=pk -buildDir=`pwd` hg19 > do.log 2>&1
    #	real    33m25.815s

    twoBitMask bed/repeatMasker/hg19.clean.2bit \
	-add bed/simpleRepeat/trfMask.bed hg19.2bit
    twoBitToFa hg19.2bit stdout | faSize stdin > faSize.hg19.2bit.txt
# 3137161264 bases (239850802 N's 2897310462 real 1430387259 upper
# 1466923203 lower) in 93 sequences in 1 files
# %46.76 masked total, %50.63 masked real

############################################################################
#	prepare cluster data (DONE - 2009-03-06 - Hiram)
    cd /hive/data/genomes/hg19
    rm /gbdb/hg19/hg19.2bit
    ln -s `pwd`/hg19.2bit /gbdb/hg19/hg19.2bit

    time blat hg19.2bit \
	/dev/null /dev/null -tileSize=11 -makeOoc=11.ooc -repMatch=1024
    #	Wrote 30675 overused 11-mers to 11.ooc
    #	real    3m11.302s

    mkdir /hive/data/staging/data/hg19
    cp -p hg19.2bit /hive/data/staging/data/hg19
    cp -p 11.ooc /hive/data/staging/data/hg19
    cp -p chrom.sizes /hive/data/staging/data/hg19

    mkdir separateChrs
    cd separateChrs
    grep -v "_" ../chrom.sizes | awk '{print $1}' | while read C
do
    twoBitToFa -seq="${C}" ../hg19.2bit stdout
done | faToTwoBit stdin hg19.chrOnly.2bit
    twoBitInfo hg19.chrOnly.2bit stdout | sort -k2,2nr > chrOnly.chrom.sizes

    grep "_hap" ../chrom.sizes | awk '{print $1}' | while read C
do
    twoBitToFa -seq="${C}" ../hg19.2bit stdout
done | faToTwoBit stdin hg19.hapOnly.2bit
    twoBitInfo hg19.hapOnly.2bit stdout | sort -k2,2nr > hapOnly.chrom.sizes

    grep "_" ../chrom.sizes | grep -v "_hap" | awk '{print $1}' | while read C
do
    twoBitToFa -seq="${C}" ../hg19.2bit stdout
done | faToTwoBit stdin hg19.scaffolds.2bit
    twoBitInfo hg19.scaffolds.2bit stdout | sort -k2,2nr > scaffolds.chrom.sizes

    cp -p *.2bit *.sizes /hive/data/staging/data/hg19

    # ask admin to sync this directory: /hive/data/staging/data/hg19/
    #	to the kluster nodes /scratch/data/hg19/

############################################################################
# running cpgIsland business (DONE - 2009-03-06 - Hiram)
    mkdir /hive/data/genomes/hg19/bed/cpgIsland
    cd /hive/data/genomes/hg19/bed/cpgIsland
    cvs -d /projects/compbio/cvsroot checkout -P hg3rdParty/cpgIslands
    cd hg3rdParty/cpgIslands
    # comment out the following two lines if it compiles cleanly
    # some day  (there were some other fixups too, adding include lines)
    sed -e "s#\(extern char\* malloc\)#// \1#" cpg_lh.c > tmp.c
    mv tmp.c cpg_lh.c
    make
    cd ../../
    ln -s hg3rdParty/cpgIslands/cpglh.exe
    mkdir -p hardMaskedFa
    cut -f1 ../../chrom.sizes | while read C
do
    echo ${C}
    twoBitToFa ../../hg19.2bit:$C stdout \
	| maskOutFa stdin hard hardMaskedFa/${C}.fa
done

    cut -f1 ../../chrom.sizes > chr.list
    cat << '_EOF_' > template
#LOOP
./runOne $(root1) {check out line results/$(root1).cpg}
#ENDLOOP
'_EOF_'
    # << happy emacs

    cat << '_EOF_' > runOne
#!/bin/csh -fe
./cpglh.exe hardMaskedFa/$1.fa > /scratch/tmp/$1.$$
mv /scratch/tmp/$1.$$ $2
'_EOF_'
    # << happy emacs

    gensub2 chr.list single template jobList
    para create jobList
    para try
    para check ... etc
    para time
# Completed: 93 of 93 jobs
# CPU time in finished jobs:        172s       2.86m     0.05h    0.00d  0.000 y
# IO & Wait Time:                  1748s      29.14m     0.49h    0.02d  0.000 y
# Average job time:                  21s       0.34m     0.01h    0.00d
# Longest finished job:              34s       0.57m     0.01h    0.00d
# Submission to last job:            83s       1.38m     0.02h    0.00d

    # Transform cpglh output to bed +
    catDir results | awk '{
$2 = $2 - 1;
width = $3 - $2;
printf("%s\t%d\t%s\t%s %s\t%s\t%s\t%0.0f\t%0.1f\t%s\t%s\n",
       $1, $2, $3, $5,$6, width,
       $6, width*$7*0.01, 100.0*2*$6/width, $7, $9);
}' > cpgIsland.bed

    cd /hive/data/genomes/hg19/bed/cpgIsland
    hgLoadBed hg19 cpgIslandExt -tab \
      -sqlTable=$HOME/kent/src/hg/lib/cpgIslandExt.sql cpgIsland.bed

# Reading cpgIsland.bed
# Loaded 28226 elements of size 10
# Sorted
# Saving bed.tab
# Loading hg19

############################################################################
# create lift file on unBridged gaps for genbank splits (2009-03-09 - Hiram)
    mkdir /hive/data/genomes/hg19/bed/gap
    cd /hive/data/genomes/hg19/bed/gap
    gapToLift hg19 hg19.unBridged.lift -bedFile=unBridged.lift.bed
    cp -p hg19.unBridged.lift ../../jkStuff
    cp -p hg19.unBridged.lift /hive/data/staging/data/hg19

############################################################################
# AUTO UPDATE GENBANK RUN  (DONE - 2009-03-07,13 - Hiram)
    # align with latest genbank process.
    cd ~/kent/src/hg/makeDb/genbank
    cvsup
    # edit etc/genbank.conf to add hg19 just after hg18

# hg19 - GRCh37 - Genome Reference Consortium Human Reference 37
#       Assembly Accession: GCA_000001405.1
hg19.serverGenome = /hive/data/genomes/hg19/hg19.2bit
hg19.clusterGenome = /scratch/data/hg19/hg19.2bit
hg19.ooc = /scratch/data/hg19/11.ooc
hg19.lift = /hive/data/genomes/hg19/jkStuff/hg19.unBridged.lift
hg19.hapRegions = /hive/data/genomes/hg19/jkStuff/hg19.haplotypes.psl
hg19.refseq.mrna.native.pslCDnaFilter  = ${finished.refseq.mrna.native.pslCDnaFilter}
hg19.refseq.mrna.xeno.pslCDnaFilter    = ${finished.refseq.mrna.xeno.pslCDnaFilter}
hg19.genbank.mrna.native.pslCDnaFilter = ${finished.genbank.mrna.native.pslCDnaFilter}
hg19.genbank.mrna.xeno.pslCDnaFilter   = ${finished.genbank.mrna.xeno.pslCDnaFilter}
hg19.genbank.est.native.pslCDnaFilter = ${finished.genbank.est.native.pslCDnaFilter}
hg19.genbank.est.xeno.pslCDnaFilter   = ${finished.genbank.est.xeno.pslCDnaFilter}
hg19.genbank.est.xeno.load = yes
hg19.refseq.mrna.xeno.load  = yes
hg19.refseq.mrna.xeno.loadDesc = yes
hg19.mgc = yes
hg19.orfeome = yes
hg19.downloadDir = hg19
hg19.ccds.ncbiBuild = 37.1
hg19.upstreamGeneTbl = refGene
hg19.upstreamMaf = multiz46way /hive/data/genomes/hg19/bed/multiz46way/species.list
hg19.genbank.mrna.blatTargetDb = yes
hg19.perChromTables = no

    cvs ci -m "Added hg19." etc/genbank.conf
    # update /cluster/data/genbank/:
    make etc-update

    ssh genbank
    screen		#	use a screen to manage this job
    cd /cluster/data/genbank
    time nice -n +19 bin/gbAlignStep -initial hg19 &
    #	logFile: var/build/logs/2009.03.10-20:28:44.hg19.initalign.log
    #	real    2761m13.680s
    #	that ran on the swarm with little interference and no problems

    # load database when finished
    ssh hgwdev
    screen	# use screen to manage this long running command
    cd /cluster/data/genbank
    time nice -n +19 ./bin/gbDbLoadStep -drop -initialLoad hg19 &
    # logFile: var/dbload/hgwdev/logs/2009.03.12-21:10:02.dbload.log
    #	real    369m11.941s

    # enable daily alignment and update of hgwdev (DONE - 2009-02-24 - Hiram)
    cd ~/kent/src/hg/makeDb/genbank
    cvsup
    # add hg19 to:
        etc/align.dbs
        etc/hgwdev.dbs
    cvs ci -m "Added hg19 - Human - GRCh37" etc/align.dbs etc/hgwdev.dbs
    make etc-update

#########################################################################
#  BLATSERVERS ENTRY (DONE - 2009-03-09 - Hiram)
#	After getting a blat server assigned by the Blat Server Gods,
    ssh hgwdev

    hgsql -e 'INSERT INTO blatServers (db, host, port, isTrans, canPcr) \
	VALUES ("hg19", "blat13", "17778", "1", "0"); \
	INSERT INTO blatServers (db, host, port, isTrans, canPcr) \
	VALUES ("hg19", "blat13", "17779", "0", "1");' \
	    hgcentraltest
    #	test it with some sequence

############################################################################
# Making download files (DONE - 2009-03-13 - Hiram)
    cd /hive/data/genomes/hg19
    makeDownloads.pl -allowMissedTrfs -noChromRoot hg19 \
	> downloads.log 2>&1
############################################################################
# Venter1 chain, net experiment (DONE - Hiram - 2009-03-15)
doBlastzChainNet.pl `pwd`/DEF \
        -stop=partition -bigClusterHub=swarm \
        -smallClusterHub=swarm -chainMinScore=1000 -chainLinearGap=medium \
        -workhorse=hgwdev -fileServer=hgwdev > partition.log 2>&1

doBlastzChainNet.pl `pwd`/DEF \
        -continue=blastz -stop=blastz -bigClusterHub=swarm \
        -smallClusterHub=swarm -chainMinScore=1000 -chainLinearGap=medium \
        -workhorse=hgwdev -fileServer=hgwdev > blastz.log 2>&1

doBlastzChainNet.pl `pwd`/DEF \
        -continue=cat -stop=net -bigClusterHub=swarm \
        -smallClusterHub=swarm -chainMinScore=1000 -chainLinearGap=medium \
        -workhorse=hgwdev -fileServer=hgwdev > net.log 2>&1
real    163m28.438s

    # to load, run it in debug, then check the load script
doBlastzChainNet.pl `pwd`/DEF \
	-noLoadChainSplit -continue=load -stop=load -bigClusterHub=swarm \
	-debug -smallClusterHub=swarm -chainMinScore=1000 \
	-chainLinearGap=medium \
	-workhorse=hgwdev -fileServer=hgwdev > load.log 2>&1

    # and create a synNet for multiz, run in debug, and examine script
    #	to make sure it works correctly
doBlastzChainNet.pl `pwd`/DEF \
	-syntenicNet -continue=syntenicNet -stop=syntenicNet \
	-debug -bigClusterHub=swarm \
	-smallClusterHub=swarm -chainMinScore=1000 -chainLinearGap=medium \
	-workhorse=hgwdev -fileServer=hgwdev > synNet.log 2>&1
    #	real    31m11.216s

############################################################################
# reset position to chr6 haplotype situation
    hgsql -e \
'update dbDb set defaultPos="chr6:28343766-33555363" where name="hg19";' \
	hgcentraltest

# reset to a smaller range (2009-04-24 - Brooke)
# this is the SOD1 gene, implicated in Lou Gehrig's disease.

    hgsql -e \
'update dbDb set defaultPos="chr21:33,031,597-33,041,570" where name="hg19";' \
        hgcentraltest

############################################################################
# Self Lastz run (DONE - 2009-03-19 - Hiram)
    mkdir /hive/data/genomes/hg19/bed/lastzSelf.2009-03-19
    cd /hive/data/genomes/hg19/bed/lastzSelf.2009-03-19
    cat << '_EOF_'
# human vs human
BLASTZ=lastz
# maximum M allowed with lastz is only 255
BLASTZ_M=254
# lastz does not like the O= and E= lines in the matrix file
#       this copy has that removed from /scratch/data/scratch/human_chimp.v2.q
BLASTZ_Q=/hive/data/genomes/hg19/bed/lastzHg19Haps.2009-03-09/human_chimp.v2.q
# and place those items here
BLASTZ_O=600
BLASTZ_E=150
# other parameters from hg18 vs venter1 lastz on advice from Webb
BLASTZ_K=10000
BLASTZ_Y=15000
BLASTZ_T=2

# TARGET: Human Hg19
SEQ1_DIR=/scratch/data/hg19/hg19.2bit
SEQ1_LEN=/scratch/data/hg19/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
SEQ1_IN_CONTIGS=0

# QUERY: Human Hg19
SEQ2_DIR=/scratch/data/hg19/hg19.2bit
SEQ2_LEN=/scratch/data/hg19/chrom.sizes
SEQ2_CHUNK=10000000
SEQ2_LAP=0
SEQ2_IN_CONTIGS=0

BASE=/hive/data/genomes/hg19/bed/lastzSelf.2009-03-19
TMPDIR=/scratch/tmp
'_EOF_'
    # << happy emacs

    screen # use screen to manage this long-running job
    time nice -n +19 doBlastzChainNet.pl `pwd`/DEF -verbose=2 \
	-noLoadChainSplit -chainMinScore=2000 -chainLinearGap=medium \
	-workhorse=hgwdev \
	-stop=net -smallClusterHub=pk -bigClusterHub=swarm > do.log 2>&1 &
    #	cluster difficulties, finished manually, then:
    time nice -n +19 doBlastzChainNet.pl `pwd`/DEF -verbose=2 \
	-noLoadChainSplit -chainMinScore=2000 -chainLinearGap=medium \
	-continue=cat -workhorse=hgwdev \
	-stop=net -smallClusterHub=pk -bigClusterHub=swarm > cat.log 2>&1 &

    time nice -n +19 doBlastzChainNet.pl `pwd`/DEF -verbose=2 \
	-noLoadChainSplit -chainMinScore=2000 -chainLinearGap=medium \
	-continue=load -debug -workhorse=hgwdev \
	-stop=load -smallClusterHub=pk -bigClusterHub=swarm > load.debug.log 2>&1 &
    #	that indicates it would do:
    hgLoadChain -tIndex hg19 chainSelf hg19.hg19.all.chain.gz
    #	adding -normScore
    hgLoadChain -normScore -tIndex hg19 chainSelf hg19.hg19.all.chain.gz

    # a user asked about axtNet files, d
    time nice -n +19 doBlastzChainNet.pl `pwd`/DEF -verbose=2 \
	-noLoadChainSplit -chainMinScore=2000 -chainLinearGap=medium \
	-ignoreSelf -continue=net -workhorse=hgwdev \
	-stop=net -smallClusterHub=encodek -bigClusterHub=swarm > net.log 2>&1 &
    #	about 8m 17s
    cd /hive/data/genomes/hg19/bed/lastzSelf.2009-03-19/axtChain
    netClass -verbose=0 -noAr noClass.net hg19 hg19 hg19.hg19.net
    gzip hg19.hg19.net
    cd /usr/local/apache/htdocs-hgdownload/goldenPath/hg19/vsSelf
    ln -s \
/hive/data/genomes/hg19/bed/lastzSelf.2009-03-19/axtChain/hg19.hg19.net.gz .
    # fixup README.txt and md5sum.txt files
    md5sum hg19.hg19.net.gz >> md5sum.txt
    # Brian wants to see the track:
    netFilter -minGap=10 hg19.hg19.net.gz \
	| hgLoadNet -verbose=0 hg19 netSelf stdin

############################################################################
# Chimp Lastz run (DONE - 2009-03-19 - Hiram)
    mkdir /hive/data/genomes/hg19/bed/lastzPanTro2.2009-03-19
    cd /hive/data/genomes/hg19/bed/lastzPanTro2.2009-03-19
    cat << '_EOF_'
# human vs chimp
BLASTZ=lastz
# maximum M allowed with lastz is only 254
BLASTZ_M=254
# lastz does not like the O= and E= lines in the matrix file
#       this copy has that removed from /scratch/data/scratch/human_chimp.v2.q
BLASTZ_Q=/hive/data/genomes/hg19/bed/lastzHg19Haps.2009-03-09/human_chimp.v2.q
# and place those items here
BLASTZ_O=600
BLASTZ_E=150
# other parameters from panTro2 vs hg18 lastz on advice from Webb
BLASTZ_K=4500
BLASTZ_Y=15000
BLASTZ_T=2

# TARGET: Human Hg19
SEQ1_DIR=/scratch/data/hg19/hg19.2bit
SEQ1_LEN=/scratch/data/hg19/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
SEQ1_IN_CONTIGS=0

# QUERY: Chimp PanTro2
SEQ2_DIR=/scratch/data/panTro2/panTro2.2bit
SEQ2_LEN=/scratch/data/panTro2/chrom.sizes
SEQ2_CHUNK=10000000
SEQ2_LAP=0
SEQ2_IN_CONTIGS=0

BASE=/hive/data/genomes/hg19/bed/lastzPanTro2.2009-03-19
TMPDIR=/scratch/tmp
'_EOF_'
    # << happy emacs

    screen # use screen to manage this long-running job
    time nice -n +19 doBlastzChainNet.pl `pwd`/DEF -verbose=2 \
	-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=medium \
	-workhorse=hgwdev -smallClusterHub=pk -bigClusterHub=swarm > do.log 2>&1 &
    #	real    173m22.880s
    #	cluster problems, continuing after lastz done:
    time nice -n +19 doBlastzChainNet.pl `pwd`/DEF -verbose=2 -continue=cat \
	-stop=net -noLoadChainSplit -chainMinScore=5000 -chainLinearGap=medium \
	-workhorse=hgwdev -smallClusterHub=pk -bigClusterHub=swarm \
	> net.log 2>&1 &
    #	real    81m20.209s
    #	continuing with the load and adding syntenicNet
    time nice -n +19 doBlastzChainNet.pl `pwd`/DEF -verbose=2 -continue=load \
	-syntenicNet -noLoadChainSplit -chainMinScore=5000 \
	-chainLinearGap=medium \
	-workhorse=hgwdev -smallClusterHub=pk -bigClusterHub=swarm \
	> load.log 2>&1 &
    #	real    47m17.871s
    cat fb.hg19.chainPanTro2Link.txt
    #	2747983350 bases of 2897316137 (94.846%) in intersection

    #	running the swap - DONE - 2009-05-24
    ssh swarm
    mkdir /hive/data/genomes/panTro2/bed/blastz.hg19.swap
    cd /hive/data/genomes/panTro2/bed/blastz.hg19.swap
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
	-swap /hive/data/genomes/hg19/bed/lastzPanTro2.2009-03-19/DEF \
	-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=medium \
	-workhorse=hgwdev -smallClusterHub=swarm -bigClusterHub=swarm \
	> swap.log 2>&1 &
    #	real    723m41.377s
    cat fb.panTro2.chainHg19Link.txt
    #	2761343871 bases of 2909485072 (94.908%) in intersection

############################################################################
# Creating the pushQ entry (DONE - 2009-03-20 - Hiram)
    mkdir /hive/data/genomes/hg19/pushQ
    cd /hive/data/genomes/hg19/pushQ
    makePushQSql.pl hg19 > hg19.pushQ.sql 2> make.err
    # many complaints about the chain and net tables from the haplotype
    #	experiments, and this table:
    #	orfeomeGenes
    #	which is probably in genbank, and these usual ones:
    #	hg19 does not have seq
    #	hg19 does not have extFile

############################################################################
# Determine PAR region of X and Y (DONE - 2009-03-20 - Hiram)
    mkdir /hive/data/genomes/hg19/bed/parRegion
    cd /hive/data/genomes/hg19/bed/parRegion
    awk '$5 != "N"' ../../X/chrX.agp | awk '{print $6}' | sort > chrX.cloneList
    awk '$5 != "N"' ../../Y/chrY.agp | awk '{print $6}' | sort > chrY.cloneList
    comm -12 chrX.cloneList chrY.cloneList > chrXY.par.clone.list
    cat chrXY.par.clone.list \
	| while read C; do grep "${C}" ../../X/chrX.agp; done \
	| sort -k1,1 -k2,2n >> chrX.par.region.agp
    cat chrXY.par.clone.list \
	| while read C; do grep "${C}" ../../Y/chrY.agp; done \
	| sort -k1,1 -k2,2n >> chrY.par.region.agp
    awk '{printf "%s\t%d\t%d\t%s\n", $1, $2-1, $3, $6}' chrY.par.region.agp \
	> chrY.par.region.bed
    awk '{printf "%s\t%d\t%d\t%s\n", $1, $2-1, $3, $6}' chrX.par.region.agp \
	> chrX.par.region.bed
    #	use those bed files in custom tracks on hg19 to verify that they
    #	are two continuous regions with only gaps between these items
    #	these location extents are: (zero relative)
    #	chrX 60000 2722842
    #	chrX 154906585 155260560
    #	chrY 10000 2649520
    #	chrY 59034049 59363566

############################################################################
# Gorilla Lastz run (DONE - 2009-03-21,05-13 - Hiram)
    mkdir /hive/data/genomes/hg19/bed/lastzGorGor1.2009-03-21
    cd /hive/data/genomes/hg19/bed/lastzGorGor1.2009-03-21
    cat << '_EOF_'
# human vs gorilla
BLASTZ=lastz
# maximum M allowed with lastz is only 254
BLASTZ_M=254
# lastz does not like the O= and E= lines in the matrix file
#       this copy has that removed from /scratch/data/scratch/human_chimp.v2.q
BLASTZ_Q=/hive/data/genomes/hg19/bed/lastzHg19Haps.2009-03-09/human_chimp.v2.q
# and place those items here
BLASTZ_O=600
BLASTZ_E=150
# other parameters from panTro2 vs hg18 lastz on advice from Webb
BLASTZ_K=4500
BLASTZ_Y=15000
BLASTZ_T=2

# TARGET: Human Hg19
SEQ1_DIR=/scratch/data/hg19/hg19.2bit
SEQ1_LEN=/scratch/data/hg19/chrom.sizes
SEQ1_CHUNK=100000000
SEQ1_LAP=10000
SEQ1_IN_CONTIGS=0

# QUERY: Gorilla gorGor1
SEQ2_DIR=/scratch/data/gorGor1/gorGor1.2bit
SEQ2_LEN=/scratch/data/gorGor1/chrom.sizes
SEQ2_CHUNK=20000000
SEQ2_LIMIT=300
SEQ2_LAP=0
SEQ2_IN_CONTIGS=0

BASE=/hive/data/genomes/hg19/bed/lastzGorGor1.2009-03-21
TMPDIR=/scratch/tmp
'_EOF_'
    # << happy emacs

    screen # use screen to manage this long-running job
    time nice -n +19 doBlastzChainNet.pl `pwd`/DEF -verbose=2 \
	-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=medium \
	-workhorse=hgwdev -smallClusterHub=pk -bigClusterHub=swarm \
	> do.log 2>&1 &
    cat fb.hg19.chainGorGor1Link.txt
    #	1723432141 bases of 2897316137 (59.484%) in intersection
    doRecipBest.pl -buildDir=`pwd` hg19 gorGor1 > rbest.log 2>&1

############################################################################
# PREPARE LINEAGE SPECIFIC REPEAT FILES FOR LASTZ (DONE - 2009-04-02 - Hiram)
    ssh pk
    mkdir /hive/data/genomes/hg19/bed/linSpecRep
    cd /hive/data/genomes/hg19/bed/linSpecRep
    #	create individual .out files from the master record in ../repeatMasker
    mkdir splitOut
    cat << '_EOF_' > split.csh
#!/bin/csh -fe
set C = $1
head -3 ../repeatMasker/hg19.clean.out > splitOut/${C}.out
grep "${C} " ../repeatMasker/hg19.clean.out >> splitOut/${C}.out
'_EOF_'
    # << happy emacs

    cat << '_EOF_' > template
#LOOP
split.csh $(root1) {check out line+ splitOut/$(root1).out}
#ENDLOOP
'_EOF_'
    # << happy emacs

    cut -f1 ../../chrom.sizes > chrom.list
    gensub2 chrom.list single template jobList
    para create jobList
    para try ... check ... push ... etc...
# Completed: 93 of 93 jobs
# CPU time in finished jobs:        127s       2.12m     0.04h    0.00d  0.000 y
# IO & Wait Time:                 17154s     285.90m     4.76h    0.20d  0.001 y
# Average job time:                 186s       3.10m     0.05h    0.00d
# Longest finished job:             224s       3.73m     0.06h    0.00d
# Submission to last job:           280s       4.67m     0.08h    0.00d

    #	now, we can date and process each of those .out files
    #	this really should be a single creation of notInOthers
    #	These four different ones all end up to be the same anyhow
    #	the notInMouse becomes notInOthers below and the others are removed.
    mkdir dateRepeats
    cd dateRepeats
    cat << '_EOF_' > mkLSR
#!/bin/csh -fe
rm -f $1.out_mus-musculus_rattus_canis-familiaris_bos-taurus
ln -s ../splitOut/$1.out .
/scratch/data/RepeatMasker/DateRepeats \
    $1.out -query human -comp mouse -comp rat -comp dog -comp cow
rm $1.out
mkdir -p ../notInMouse ../notInRat ../notInDog ../notInCow
/cluster/bin/scripts/extractRepeats 1 $1.out_mus*-taurus \
	> ../notInMouse/$1.out.spec
/cluster/bin/scripts/extractRepeats 2 $1.out_mus*-taurus \
	> ../notInRat/$1.out.spec
/cluster/bin/scripts/extractRepeats 3 $1.out_mus*-taurus \
	> ../notInDog/$1.out.spec
/cluster/bin/scripts/extractRepeats 4 $1.out_mus*-taurus \
	> ../notInCow/$1.out.spec
'_EOF_'
    #	<< happy emacs
    chmod +x mkLSR

    cat << '_EOF_' > template
#LOOP
./mkLSR $(path1) {check out line+ $(path1).out_mus-musculus_rattus_canis-familiaris_bos-taurus}
#ENDLOOP
'_EOF_'
    #	<< happy emacs

    gensub2 ../chrom.list single template jobList
    para try ... check ... push ... etc...
    para time
# Completed: 93 of 93 jobs
# CPU time in finished jobs:       2441s      40.69m     0.68h    0.03d  0.000 y
# IO & Wait Time:                   332s       5.53m     0.09h    0.00d  0.000 y
# Average job time:                  30s       0.50m     0.01h    0.00d
# Longest finished job:             125s       2.08m     0.03h    0.00d
# Submission to last job:           454s       7.57m     0.13h    0.01d

    done

    #	these four types of out.spec results all turn out to be identical
    #	To check identical
    cd /hive/data/genomes/hg19/bed/linSpecRep
    find . -name "*.out.spec" | \
	while read FN; do echo `cat ${FN} | sum -r` ${FN}; done \
	| sort -k1,1n | sort -t"/" -k3,3 | sed -e "s#./notIn.*/##" \
	| sort | uniq -c | less
    #	You will see they are all a count of 4
    #	Set them up on scratch data and get to all the kluster nodes:
    mkdir /hive/data/staging/data/hg19/lineageSpecificRepeats
    cd notInMouse
    rsync -a --progress ./ /hive/data/staging/data/hg19/lineageSpecificRepeats
    cd ..
    mv notInMouse notInOthers
    #	do not need to keep all of these
    rm -fr notInRat notInDog notInCow

    # We also need the nibs for blastz runs with lineage specific repeats
    mkdir /hive/data/genomes/hg19/bed/nibs
    cd /hive/data/genomes/hg19/bed/nibs
    cut -f1 ../../chrom.sizes | while read C
do
    twoBitToFa -seq=${C} ../../hg19.2bit stdout \
	| faToNib -softMask stdin ${C}.nib
    echo "${C} done"
done
    mkdir /hive/data/staging/data/hg19/nib
    rsync -a --progress ./ /hive/data/staging/data/hg19/nib

    # Ask cluster-admin to sync /scratch/ filesystem to kluster nodes

#############################################################################
# create gc5Base download file (DONE - 2009-04-24 - Hiram)
    cd /hive/data/genomes/hg19/bed/gc5Base
    hgGcPercent -wigOut -doGaps -file=stdout -win=5 -verbose=0 hg19 \
        /cluster/data/hg19/hg19.2bit | gzip -c > hg19.gc5Base.txt.gz

#############################################################################
# Physical Map Contigs - ctgPos (DONE - 2009-04-23 - Hiram) (Alt. haplotypes added 4/12/10 angie)
    mkdir /hive/data/genomes/hg19/bed/ctgPos
    cd /hive/data/genomes/hg19/bed/ctgPos
    cat << '_EOF_' > mkCtgPos.sh
AGP="/hive/data/genomes/hg19/download/assembled_chromosomes/AGP"
export AGP
for F in `(cd ${AGP}; ls chr*.agp | grep -v ".comp.agp")`
do
    C=${F/.agp/}
    grep "^CM" "${AGP}/${F}" | awk '$5 != "N"' | awk '
{
printf "%s\t%d\t%s\t%d\t%d\n", $6, $8-$7+1, "'${C}'", $2-1+$7-1, $2-1+$8
}
'
done
'_EOF_'
    # << happy emacs
    chmod +x mkCtgPos.sh
    ./mkCtgPos.sh > ctgPos.tab

    cat << '_EOF_' > mkRanCtgPos.sh
AGP="/hive/data/genomes/hg19/download/unlocalized_scaffolds/AGP"
export AGP
for F in `(cd ${AGP}; ls chr*.agp)`
do
    C=${F/.unlocalized.scaf.agp/}
    c=${C/chr/}
    export C c
    grep "^GL" "${AGP}/${F}" | awk '$5 != "N"' | awk '
BEGIN {
    ctgName=""
    ctgStart=0
    ctgEnd=0
    chrom="'${c}'"
    ctgNameLower=""
}
{
if (match(ctgName,$1)) {
    ctgEnd = $3
} else {
    if (length(ctgName) > 0) {
        size=ctgEnd - ctgStart
printf "%s\t%d\tchr%s_%s_random\t%d\t%d\n", ctgName, size, chrom, ctgNameLower, 
ctgStart, ctgEnd
    }
    ctgStart = $2 - 1
    ctgEnd = $3
    ctgName = $1
    ctgNameLower = tolower($1)
    sub(".1$","",ctgNameLower)
}
}
END {
size=ctgEnd - ctgStart
printf "%s\t%d\tchr%s_%s_random\t%d\t%d\n", ctgName, size, chrom, ctgNameLower, 
ctgStart, ctgEnd
}
'
done
'_EOF_'
    # << happy emacs
    chmod +x mkRanCtgPos.sh
    ./mkRanCtgPos.sh >> ctgPos.tab

    #	fetch .sql definition from hg18
    chmod 777 .
    hgsqldump --all -c --tab=. hg18 ctgPos
    # Don't confuse us w/hg18 data:
    rm ctgPos.txt
    chmod 775 .
    hgsql hg19 < ctgPos.sql
    hgsql -e 'load data local infile "ctgPos.tab" into table ctgPos;' hg19

    # 4/12/10 (angie): add the alt loci:
    perl -we 'while (<>) { \
                next if (/^#/); chomp; @w = split; \
                $w[0] = lc($w[0]); $w[0] =~ s/^hs//; $w[0] =~ s/_mhc_/_/;  $w[0] =~ s/_ctg1$//; \
                $w[0] =~ s/_apd$/_apd_hap1/; $w[0] =~ s/_cox$/_cox_hap2/; \
                $w[0] =~ s/_dbb$/_dbb_hap3/; $w[0] =~ s/_mann$/_mann_hap4/; \
                $w[0] =~ s/_mcf$/_mcf_hap5/; $w[0] =~ s/_qbl$/_qbl_hap6/; \
                $w[0] =~ s/_ssto$/_ssto_hap7/; \
                $w[0] =~ s/_1(_ctg\d)/${1}_hap1/; \
                if ($w[0] eq "chr6_cox_hap2" && $w[8] == 4873745) { $w[8] = 4795371; } # yep, inconsistent \
                print join("\t", $w[1], $w[8], $w[0], 0, $w[8]) . "\n"; }' \
    /hive/data/genomes/hg19/download/alternate_loci/*/placed_scaffolds/alt_locus_scaf2primary.pos \
      >> ctgPos.tab
    sort -k 3,3 -k4n,4n ctgPos.tab \
    | hgLoadSqlTab hg19 ctgPos ctgPos.sql stdin
    # TODO: tell NCBI alternate_loci/ALT_REF_LOCI_2/placed_scaffolds/alt_locus_scaf2primary.pos
    # has size inconsistent w/AGP, FASTA


#############################################################################
# CLONE ENDS - first step for BACEND/CytoBand tracks
#	(DONE - 2009-04-28 - Hiram)
    mkdir -p /hive/data/genomes/hg19/bed/cloneend/ncbi
    cd /hive/data/genomes/hg19/bed/cloneend/ncbi

    wget --timestamping \
'ftp://ftp.ncbi.nih.gov/genomes/CLONEEND/homo_sapiens/9606_clone_ends*.mfa.gz'
    wget --timestamping \
'ftp://ftp.ncbi.nih.gov/genomes/CLONEEND/homo_sapiens/9606_clone_info*.txt.gz'

    cd /hive/data/genomes/hg19/bed/cloneend
    # seems like the *.mfa files were split just for convenience
    # concatenate

    for F in ncbi/*.mfa.gz
do
    zcat "${F}"
    echo "${F}" 1>&2
done | gzip > all.mfa.gz
    #	that 1>&2 echos to stderr so you can see the file name and not
    #	interfere with the pipe stdout output to gzip

    # Convert the title line of the all.mfa file
    zcat all.mfa.gz \
	| sed -e "s#^>gi.[0-9]*.gb.#>#; s#^>gi.[0-9]*.emb.#>#; s#\.[0-9]|.*##" \
	    | gzip > cloneEnds.fa.gz

    zcat all.mfa | ./convert.pl | gzip > cloneEnds.fa.gz

    #	make sure nothing got broken:
    faSize all.mfa.gz
# 400901385 bases (5941742 N's 394959643 real 255835696 upper 139123947 lower)
# in 833173 sequences in 1 files

    faSize cloneEnds.fa.gz
# 400901385 bases (5941742 N's 394959643 real 255835696 upper 139123947 lower)
# in 833173 sequences in 1 files

    #	identical numbers
    #	you can also carefully check the names:
    zcat all.mfa.gz | grep "^>" | awk -F'|' '{print $4}' \
	| sed -e "s/\.[0-9]$//" | sort > mfa.names
    #	should be the same as:
    zcat cloneEnds.fa.gz | grep "^>" | sed -e "s/>//" | sort > clone.names


    # concatenate the text files, too
    bash
    for F in ncbi/*.txt.gz
do
    zcat "${F}"
    echo "${F}" 1>&2
done | gzip > all.txt.gz

    # generate cloneEndPairs.txt and cloneEndSingles.txt
    zcat all.txt.gz >all.txt
    $HOME/kent/src/hg/utils/cloneEndParse.pl all.txt

    #	Reading in end info
    #	Writing out pair info
    #	Writing out singleton info
    #	302264 pairs and 203094 singles
    #	examined all the clone names and all the bac end names in these two
    #	files and compared with business from all.txt to make sure we properly
    #	classified all of them correctly.  We had 833,173 clone sequences,
    #	and 501,135 bac end names

    #	faSplit does not function correctly if given a .gz source file
    #	AND, we need the unzipped file for sequence loading below
    gunzip cloneEnds.fa.gz
    # split
    mkdir splitdir
    cd splitdir
    faSplit sequence ../cloneEnds.fa 100 cloneEnds
    #	Check to ensure no breakage:
    cat *.fa | faSize stdin
# 400901385 bases (5941742 N's 394959643 real 255835696 upper 139123947 lower)
# in 833173 sequences in 1 files
    #	same numbers as before

    # load sequences
    ssh hgwdev
    mkdir /gbdb/hg19/cloneend
    cd /gbdb/hg19/cloneend
      ln -s /hive/data/genomes/hg19/bed/cloneend/cloneEnds.fa .
    cd /tmp
    hgLoadSeq hg19 /gbdb/hg19/cloneend/cloneEnds.fa
    #  Advisory lock created
    # Creating .tab file
    # Adding /gbdb/hg19/cloneend/cloneEnds.fa
    # 833173 sequences
    # Updating seq table
    # Advisory lock has been released
    # All done

##############################################################################
# BACEND SEQUENCE ALIGNMENTS (DONE - 2009-04-28,05-20 - Hiram)
    mkdir -p /hive/data/genomes/hg19/bed/bacends/run.blat
    cd /hive/data/genomes/hg19/bed/bacends/run.blat
    #	going to run separate runs for the golden path sequence vs. the
    #	randoms, haplotypes, chrUn and chrM
    partitionSequence.pl 5000000 20000 /scratch/data/hg19/hg19.2bit \
	/scratch/data/hg19/chrom.sizes 100 -xdir xdir.sh -lstDir tParts \
	| egrep -v "tParts|random|_hap|chrUn" \
	| sed -e "s/.*2bit://; s/:/./" > hg19.list
    ls -1S /hive/data/genomes/hg19/bed/cloneend/splitdir/cloneEnds*.fa \
	> bacEnds.list

    ssh swarm
    cd /hive/data/genomes/hg19/bed/bacends/run.blat

    cat > template << '_EOF_'
#LOOP
runOne.csh $(file1) $(path2) {check out line+ psl/$(root1)/$(file1).$(root2).psl}
#ENDLOOP
'_EOF_'
    # << happy emacs
    cat > runOne.csh << '_EOF_'
#!/bin/csh -fe

set target = $1
set query = $2
set result = $3
set partSpec = `echo $target | sed -e "s/\./:/"`
set start = `echo $partSpec | sed -e "s/.*://; s/-/ /" | awk '{print $1}'`
set end = `echo $partSpec | sed -e "s/.*://; s/-/ /" | awk '{print $2}'`
set range = `echo $start $end | awk '{print $2-$1}'`
set dir = $result:h
set chr = `echo $target | sed -e "s/\..*//"`
set chrSize = `grep -P "^$chr\t" /scratch/data/hg19/chrom.sizes | cut -f2`
set tmpFile = `echo $result | sed -e "s#psl/$chr/#/scratch/tmp/#; s/.psl//"`

# echo $tmpFile
# echo "chr: $chr $start $end -> size: $chrSize, range: $range"
/bin/echo -e "$start\t$partSpec\t$range\t$chr\t$chrSize" > $tmpFile.lift
/bin/mkdir -p $dir
/cluster/bin/x86_64/blat -ooc=/scratch/data/hg19/11.ooc \
        /scratch/data/hg19/hg19.2bit:$partSpec $query $tmpFile.psl
rm -f $result
liftUp -type=.psl $result $tmpFile.lift error $tmpFile.psl
rm -f $tmpFile.lift $tmpFile.psl
'_EOF_'
    # << happy emacs

    gensub2 hg19.list bacEnds.list template jobList
    para create jobList
# 62034 jobs in batch
    # these jobs run quickly, limit them to 250 at a time
    para try, check, -maxJob=250 push, etc ...
# Completed: 62034 of 62034 jobs
# CPU time in finished jobs:     506023s    8433.72m   140.56h    5.86d  0.016 y
# IO & Wait Time:                175853s    2930.88m    48.85h    2.04d  0.006 y
# Average job time:                  11s       0.18m     0.00h    0.00d
# Longest finished job:             752s      12.53m     0.21h    0.01d
# Submission to last job:          3533s      58.88m     0.98h    0.04d

    #	combine the alignments
    time pslSort dirs raw.psl temp psl/chr*
    #	62034 files in 24 dirs
    #	Got 62034 files 249 files per mid file
    #	real    81m2.820s

    #	-rw-rw-r--  1 13410334441 Apr 29 12:00 raw.psl
    # cleanup
    rmdir temp

    time pslReps -nearTop=0.02 -minCover=0.60 -minAli=0.85 -noIntrons \
                raw.psl  bacEnds.psl /dev/null > pslReps.out 2>&1 &
    #	real    5m55.990s
    #	Processed 106254032 alignments
    #	-rw-rw-r--  1   372734361 Apr 29 12:56 bacEnds.psl


    wc -l bacEnds.psl
    #	2852977 bacEnds.psl

    time pslPairs -tInsert=10000 -minId=0.91 -noBin -min=25000 -max=350000 \
	-slopval=10000 -hardMax=500000 -slop -short -long -orphan \
	-mismatch -verbose bacEnds.psl \
	/cluster/data/hg19/bed/cloneend/cloneEndPairs.txt \
	all_bacends bacEnds
    #	Reading pair file
    #	Reading psl file
    #	Creating Pairs
    #	Writing to files
    #	real    0m18.851s
    #	this creates the files:
    #	-rw-rw-r--  1    21178741 Apr 29 13:00 bacEnds.pairs
    #	-rw-rw-r--  1     5250873 Apr 29 13:00 bacEnds.orphan
    #	-rw-rw-r--  1      738045 Apr 29 13:00 bacEnds.short
    #	-rw-rw-r--  1      463560 Apr 29 13:00 bacEnds.slop
    #	-rw-rw-r--  1      146369 Apr 29 13:00 bacEnds.mismatch
    #	-rw-rw-r--  1        3528 Apr 29 13:00 bacEnds.long

    # filter and sort
    awk '$5 >= 300' bacEnds.pairs | sort -k1,1 -k2,2n > bacEndPairs.bed
    awk '$5 >= 300' bacEnds.slop bacEnds.short bacEnds.long \
	bacEnds.mismatch bacEnds.orphan | sort -k1,1 -k2,2n > bacEndPairsBad.bed

    extractPslLoad -noBin bacEnds.psl bacEndPairs.bed \
	bacEndPairsBad.bed | headRest 2 stdin | sort -k14,14 -k16,16n \
	    > bacEndPairs.load.psl

############################################################################
# BACEND Randoms SEQUENCE ALIGNMENTS (DONE - 2009-04-28,05-20 - Hiram)
    mkdir -p /hive/data/genomes/hg19/bed/bacends/run.randoms
    cd /hive/data/genomes/hg19/bed/bacends/run.randoms
    #	this separate run for the randoms, haplotypes, chrUn and chrM
    partitionSequence.pl 5000000 20000 /scratch/data/hg19/hg19.2bit \
	/scratch/data/hg19/chrom.sizes 100 -xdir xdir.sh -lstDir tParts \
	| egrep "random|_hap|chrUn" \
	| sed -e "s/.*2bit://; s/:/./" > random.list
    cat tParts/*.lst | sed -e "s/.*2bit://; s/:/./" >> random.list

    ls -1S /hive/data/genomes/hg19/bed/cloneend/splitdir/cloneEnds*.fa \
	> bacEnds.list

    ssh swarm
    cd /hive/data/genomes/hg19/bed/bacends/run.randoms
    gensub2 random.list bacEnds.list ../run.blat/template jobList
    # very similar runOne.csh script as above, but it doesn't need to do
    #	the lift
    cat > runOne.csh << '_EOF_'
#!/bin/csh -fe

set target = $1
set query = $2
set result = $3
set partSpec = `echo $target | sed -e "s/\./:/"`
set start = `echo $partSpec | sed -e "s/.*://; s/-/ /" | awk '{print $1}'`
set end = `echo $partSpec | sed -e "s/.*://; s/-/ /" | awk '{print $2}'`
set range = `echo $start $end | awk '{print $2-$1}'`
set dir = $result:h
set chr = `echo $target | sed -e "s/\..*//"`
set chrSize = `grep -P "^$chr\t" /scratch/data/hg19/chrom.sizes | cut -f2`
set tmpFile = `echo $result | sed -e "s#psl/$chr/#/scratch/tmp/#; s/.psl//"`

# echo $tmpFile
# echo "chr: $chr $start $end -> size: $chrSize, range: $range"
/bin/echo -e "$start\t$partSpec\t$range\t$chr\t$chrSize" > $tmpFile.lift
/bin/mkdir -p $dir
/cluster/bin/x86_64/blat -ooc=/scratch/data/hg19/11.ooc \
        /scratch/data/hg19/hg19.2bit:$partSpec $query $tmpFile.psl
rm -f $result
mv $tmpFile.psl $result
echo rm -f $tmpFile.lift
'_EOF_'
    # << happy emacs

    # these jobs run fast, do not let too many of them run
    para -maxJob=100 try...check...push
    para time
# Completed: 6762 of 6762 jobs
# CPU time in finished jobs:      20357s     339.29m     5.65h    0.24d  0.001 y
# IO & Wait Time:                 17839s     297.31m     4.96h    0.21d  0.001 y
# Average job time:                   6s       0.09m     0.00h    0.00d
# Longest finished job:             261s       4.35m     0.07h    0.00d
# Submission to last job:           508s       8.47m     0.14h    0.01d

    time pslSort dirs raw.psl temp psl/chr*
    #	6762 files in 69 dirs
    #	Got 6762 files 82 files per mid file
    #	real    6m37.177s

    #	37044 files in 98 dirs
    #	Got 37044 files 192 files per mid file
    #	real    32m24.804s
    #	-rw-rw-r--    1 6487445210 Feb  2 21:08 raw.psl
    time pslReps -nearTop=0.02 -minCover=0.60 -minAli=0.85 -noIntrons \
                raw.psl randomEnds.psl randomReps.psr > pslReps.out 2>&1 &
    #	real    0m5.761s
    #	Processed 1254273 alignments

    # cleanup
    rmdir temp

    wc -l randomEnds.psl
    #	367567 randomEnds.psl

    time pslPairs -tInsert=10000 -minId=0.91 -noBin -min=25000 -max=350000 \
	-slopval=10000 -hardMax=500000 -slop -short -long -orphan \
	-mismatch -verbose randomEnds.psl \
	/cluster/data/hg19/bed/cloneend/cloneEndPairs.txt \
	all_bacends bacEnds
    #	Reading pair file
    #	Reading psl file
    #	Creating Pairs
    #	Writing to files
    #	real    0m11.221s
    #	this creates the files:
    #	-rw-rw-r--  1         0 Apr 29 14:53 bacEnds.slop
    #	-rw-rw-r--  1         0 Apr 29 14:53 bacEnds.short
    #	-rw-rw-r--  1         0 Apr 29 14:53 bacEnds.mismatch
    #	-rw-rw-r--  1         0 Apr 29 14:53 bacEnds.long
    #	-rw-rw-r--  1    141836 Apr 29 14:53 bacEnds.pairs
    #	-rw-rw-r--  1    649907 Apr 29 14:53 bacEnds.orphan

##############################################################################
# BacEnds track - both results loaded together (DONE - 2009-04-29 - Hiram)
    ssh hgwdev
    cd /hive/data/genomes/hg19/bed/bacends
    # filter and sort
    awk '$5 >= 300' run.blat/bacEnds.pairs run.randoms/bacEnds.pairs \
	| sort -k1,1 -k2,2n > bacEndPairs.bed
    awk '$5 >= 300' run.blat/bacEnds.slop run.blat/bacEnds.short \
	run.blat/bacEnds.long run.blat/bacEnds.mismatch \
	run.blat/bacEnds.orphan run.randoms/bacEnds.slop \
	run.randoms/bacEnds.short run.randoms/bacEnds.long \
	run.randoms/bacEnds.mismatch run.randoms/bacEnds.orphan \
	    | sort -k1,1 -k2,2n > bacEndPairsBad.bed

    head -5 run.blat/bacEnds.psl > bacEnds.psl
    headRest 5 run.blat/bacEnds.psl > t.psl
    headRest 5 run.randoms/randomEnds.psl >> t.psl
    sort -k14,14 -k16,16n t.psl >> bacEnds.psl
    extractPslLoad -noBin bacEnds.psl bacEndPairs.bed \
	bacEndPairsBad.bed | headRest 2 stdin | sort -k14,14 -k16,16n \
	    > bacEnds.load.psl


    #	load them into the database
    ssh hgwdev
    cd /hive/data/genomes/hg19/bed/bacends
    #	CHECK bacEndPairs.bed ID's to make sure they have no blanks in them
    awk '{print $4}' bacEndPairs.bed | grep " "
    awk '{print $5}' bacEndPairs.bed | sort | uniq -c
    #	result should be the scores, no extraneous strings:
    #	156984 1000
    #	   195 300
    #	   316 375
    #	   297 500
    #	  1476 750
    #	edit the file and fix it if it has a bad name.
    hgLoadBed -notItemRgb hg19 bacEndPairs bacEndPairs.bed \
                 -sqlTable=$HOME/kent/src/hg/lib/bacEndPairs.sql
    #	Loaded 208922 elements of size 11
    # note - this track isn't pushed to RR, just used for assembly QA
    hgLoadBed -notItemRgb hg19 bacEndPairsBad bacEndPairsBad.bed \
                 -sqlTable=$HOME/kent/src/hg/lib/bacEndPairsBad.sql
    #	Loaded 79004 elements of size 11
    #hgLoadPsl hg18 -nobin -table=all_bacends bacEnds.load.psl
    # NOTE: truncates file to 0 if -nobin is used
    hgLoadPsl hg19 -table=all_bacends bacEnds.load.psl
    # one complaint, there appears to be a bogus insert count in one
    #	of the blat results:
# < 585   797     67      0       3       2       -63     9       79188   +      AQ743980 852     42      846     chr19_gl000208_random   92689   4045    84100  11       14,124,84,496,53,6,20,28,28,10,4,       42,56,180,200,696,750,756,776,804,832,842,      4045,5767,7086,83449,83946,83999,84006,84027,84056,84085,84096,
Became:
# > 585   797     67      0       3       2       0       9       79188   +	 AQ743980 852     42      846     chr19_gl000208_random   92689   4045	84100  11       14,124,84,496,53,6,20,28,28,10,4,	42,56,180,200,696,750,756,776,804,832,842,	4045,5767,7086,83449,83946,83999,84006,84027,84056,84085,84096,

    hgsql -N -e "select count(*) from all_bacends;" hg19
    #	 2289275
    hgsql -N -e "select count(*) from all_bacends;" hg18
    #	1727387
    hgsql -N -e "select count(*) from all_bacends;" hg17
    #	 1729146

    nice featureBits hg19 all_bacends
# 230917362 bases of 2897316137 (7.970%) in intersection
    nice featureBits hg18 all_bacends
# 227770876 bases of 2881515245 (7.905%) in intersectio
    nice featureBits hg17 all_bacends
# 225763317 bases of 2866216770 (7.877%) in intersection

    nice featureBits hg19 bacEndPairs
# 236889607 bases of 2897316137 (8.176%) in intersection
    nice featureBits hg18 bacEndPairs
# 162690030 bases of 2881515245 (5.646%) in intersection
    nice featureBits hg17 bacEndPairs
# 162099487 bases of 2866216770 (5.656%) in intersection

    nice featureBits hg19 bacEndPairsBad
# 38344094 bases of 2897316137 (1.323%) in intersection
    nice featureBits hg18 bacEndPairsBad
# 37326990 bases of 2881515245 (1.295%) in intersection
    nice featureBits hg17 bacEndPairsBad
# 37437558 bases of 2866216770 (1.306%) in intersection

############################################################################
# STS MARKERS (DONE - 2009-04-30 - 2009-05-06 - Hiram)
    mkdir /hive/data/outside/ncbi/sts.2009-04
    cd /hive/data/outside/ncbi
    ln -s sts.2009-04 sts.11
    cd /hive/data/outside/ncbi/sts.2009-04
    wget --timestamping ftp://ftp.ncbi.nih.gov/repository/UniSTS/UniSTS.sts
    wget --timestamping ftp://ftp.ncbi.nih.gov/repository/UniSTS/UniSTS.aliases
    wget --timestamping ftp://ftp.ncbi.nih.gov/blast/db/FASTA/sts.gz
    gunzip sts.gz
    mv sts dbSTS.fa

    #	these items are copied in from the previous builds
    cp -p /cluster/data/ncbi/sts.10/all.STS.fa ./all.STS.fa.prev
    cp -p /cluster/data/ncbi/sts.10/stsInfo2.bed ./stsInfo2.bed.prev
    #	edit stsInfo2.bed.prev for a
    #	manual fixup of error that is in the hg18 bed file, replace
    #	the line for AFM067XA9 to fix bogus long list of aliases to be:
# 22788^IAFM067XA9^I1^IZ66598^I1^IGDB:1221611,^I5^I067XA9,GDB:1221611,W202,Z66598,SWSS2303^I69047^I0^I^ITCTTGGGGTTTAATTGCTTT^ICTTTGCCACAATCTTACACA^I149^IHomo sapiens^I1^I2^I6453,6454,^I0^I^I^I^I0^I0^I^I^I0^I0^IAFM067XA9^Ichr7^I145^I0^I^I^I0^I0^I^I^I0^I0^I^I^I0^I0^I^I^I0^I0^I^I^I0^I0
    #	as taken directly out of the hg18.stsInfo2 table which was fixed
    #	by Bob and Archana

    # Convert the title line of the dbSTS.fa file
    #	Verify that column 3 only contains gb emb dbj
    grep "^>" dbSTS.fa | awk -F'|' '{print $3}' | sort | uniq -c
#   39124 dbj
#   57375 emb
# 1212541 gb
    #	if that is true, this sed will work:
    cat dbSTS.fa \
	| sed -e "s#^>gi.[0-9]*.gb.#>#; s#^>gi.[0-9]*.emb.#>#; s#^>gi.[0-9]*.dbj.#>#; s#\.[0-9]|.*##" \
	    > UniSTS.convert.fa

    # get accessions
    grep ">" UniSTS.convert.fa | sed -e "s/^>//" | sort > UniSTS.acc
    #	head and tail that to ensure names are reasonable, odd names would
    #	show up at the beginning or end
    wc -l UniSTS.acc
    #	1309040 UniSTS.acc

    # NOTE: updateStsInfo creates new stsInfo2.bed, all.primers,
    #   all.STS.fa, stsAlias.bed files

    updateStsInfo -verbose=1 -gb=UniSTS.acc stsInfo2.bed.prev all.STS.fa.prev \
	UniSTS.sts UniSTS.aliases UniSTS.convert.fa new

    #	verify the number of aliases is reasonable:
    awk '{print $3}' new.alias | sort | uniq -c | sort -rn | less
    #	50 D7S831
    #	34 CHLC.GATA2B06.465
    #	24 CHLC.GATA11E11
    #	23 AFM276ZF5
    #	23 AFM273YH9
    #	22 SHGC-133043
    #	... etc ...
    #	verify there are no unusually long or short lines:
    awk '{printf "%d\n", length($0)}' new.info | sort -n | head -3
    #	143
    #	144
    #	144
    awk '{printf "%d\n", length($0)}' new.info | sort -n | tail -3
    #	552
    #	553
    #	644
    # check for null in the new files:
    grep -i null new.*
    #	if the new files look good, they can become the set to use:
    mv new.info stsInfo2.bed
    mv new.primers all.primers
    mv new.alias stsAlias.bed
    mv new.fa all.STS.fa

    # get list of all STS id's in the fasta file
    sed -n 's/^>\([0-9][0-9]*\) .*/\1/p' all.STS.fa | sort -n >  all.STS.id
    wc -l all.STS.id
    # 100520 total sequences
    # in hg18 this was: 93698 total sequences
    $HOME/kent/src/hg/stsMarkers/convertPrimerToFA all.primers > all.primers.fa
    # check that fasta file for unusual length sequences:
    faSize all.primers.fa
# 97815329 bases (83677626 N's 14137703 real 14137703 upper 0 lower) in 317592 sequences in 1 files
# Total size: mean 308.0 sd 279.3 min 40 (dbSTS_144) max 30000 (dbSTS_156892) median 244

    # Copy stsInfo2.bed and stsAlias.bed to data directory becuase
    # these will be loaded into the database later
    mkdir -p /hive/data/genomes/hg19/bed/sts
    cp -p stsInfo2.bed /hive/data/genomes/hg19/bed/sts/
    cp -p stsAlias.bed /hive/data/genomes/hg19/bed/sts/

    # Create sts sequence alignments
    mkdir /hive/data/genomes/hg19/bed/sts/split

    faSplit sequence all.STS.fa 100 /hive/data/genomes/hg19/bed/sts/split/sts

    ssh swarm
    mkdir /hive/data/genomes/hg19/bed/sts/run
    cd /hive/data/genomes/hg19/bed/sts/run

    #	going to run separate runs for the golden path sequence vs. the
    #	randoms, haplotypes, chrUn and chrM
    #	40,000,000 chunck sizes, 20,000 overlap
    partitionSequence.pl 40000000 20000 /scratch/data/hg19/hg19.2bit \
	/scratch/data/hg19/chrom.sizes 100 -lstDir tParts \
	| egrep -v "tParts|random|_hap|chrUn" \
	| sed -e "s/.*2bit://;" > hg19.list
    ls -1S ../split > sts.list

    cat > template << '_EOF_'
#LOOP
runOne.csh $(file1) $(root2) {check out line+ psl/$(file1)/$(root2).psl}
#ENDLOOP
'_EOF_'
    # << happy emacs

    cat > runOne.csh << '_EOF_'
#!/bin/csh -fe

set partSpec = $1
set query = $2.fa
set result = $3
set tmpFile = "/scratch/tmp/$1.$2"
set start = `echo $partSpec | sed -e "s/.*://; s/-/ /" | awk '{print $1}'`
set end = `echo $partSpec | sed -e "s/.*://; s/-/ /" | awk '{print $2}'`
set range = `echo $start $end | awk '{print $2-$1}'`
set chr = `echo $partSpec | sed -e "s/:.*//"`
set chrSize = `grep -P "^$chr\t" /scratch/data/hg19/chrom.sizes | cut -f2`
/bin/echo -e "$start\t$partSpec\t$range\t$chr\t$chrSize" > $tmpFile.lift
/bin/mkdir -p psl/$partSpec
/bin/rm -f $tmpFile
/cluster/bin/x86_64/blat -ooc=/scratch/data/hg19/11.ooc \
    /scratch/data/hg19/hg19.2bit:$partSpec \
	../split/${query} -stepSize=5 $tmpFile.psl
/bin/rm -f $result
/cluster/bin/x86_64/liftUp -type=.psl $result $tmpFile.lift error $tmpFile.psl
# rm -f $tmpFile.lift $tmpFile.psl
'_EOF_'
    # << happy emacs
    chmod +x runOne.csh

    gensub2 hg19.list sts.list template jobList
    #	these jobs run quickly, allow only 100 at a time
    para -maxJob=100 create jobList
# 8367 jobs in batch
    para try ... check ... push ... etc
# Completed: 8366 of 8366 jobs
# CPU time in finished jobs:      89744s    1495.74m    24.93h    1.04d  0.003 y
# IO & Wait Time:                 25467s     424.44m     7.07h    0.29d  0.001 y
# Average job time:                  14s       0.23m     0.00h    0.00d
# Longest finished job:              53s       0.88m     0.01h    0.00d
# Submission to last job:          1592s      26.53m     0.44h    0.02d

    #	and, run the randoms as a separate run:
    mkdir /hive/data/genomes/hg19/bed/sts/run.randoms
    cd /hive/data/genomes/hg19/bed/sts/run.randoms
    partitionSequence.pl 40000000 20000 /scratch/data/hg19/hg19.2bit \
	/scratch/data/hg19/chrom.sizes 100 -lstDir tParts \
	| egrep "tParts|random|_hap|chrUn"
    cat tParts/* | sed -e "s/.*2bit://;" > hg19.list
    ls -1S ../split > sts.list
    cat > template << '_EOF_'
#LOOP
runOne.csh $(file1) $(root2) {check out line+ psl/$(file1)/$(root2).psl}
#ENDLOOP
'_EOF_'
    # << happy emacs

    cat > runOne.csh << '_EOF_'
#!/bin/csh -fe

set partSpec = $1
set query = $2.fa
set result = $3
set tmpFile = "/scratch/tmp/$1.$2"
/bin/mkdir -p psl/$partSpec
/bin/rm -f $tmpFile
/cluster/bin/x86_64/blat -ooc=/scratch/data/hg19/11.ooc \
    /scratch/data/hg19/hg19.2bit:$partSpec \
	../split/${query} -stepSize=5 $tmpFile.psl
/bin/rm -f $result
mv $tmpFile.psl $result
/bin/rm -f $tmpFile.psl
'_EOF_'
    # << happy emacs
    chmod +x runOne.csh

    gensub2 hg19.list sts.list template jobList
    #	these jobs run quickly, allow only 100 at a time
    para -maxJob=100 create jobList
# 6486 jobs in batch
    para try ... check ... push ... etc
# Completed: 6486 of 6486 jobs
# CPU time in finished jobs:       2206s      36.77m     0.61h    0.03d  0.000 y
# IO & Wait Time:                 16505s     275.08m     4.58h    0.19d  0.001 y
# Average job time:                   3s       0.05m     0.00h    0.00d
# Longest finished job:              21s       0.35m     0.01h    0.00d
# Submission to last job:           601s      10.02m     0.17h    0.01d

    # Compile sts sequence results
    ssh hgwdev
    cd /hive/data/genomes/hg19/bed/sts/run
    time pslSort dirs raw.psl temp psl/chr*
    #	8366 files in 89 dirs
    #	Got 8366 files 91 files per mid file
    #	real    8m50.714s
    #	-rw-rw-r--  1 810438277 May  1 11:45 raw.psl
    cd /hive/data/genomes/hg19/bed/sts/run.randoms
    time pslSort dirs raw.psl temp psl/chr*
    #	6486 files in 69 dirs
    #	Got 6486 files 81 files per mid file
    #	real    1m42.120s
    #	-rw-rw-r--  1 18378188 May  1 11:52 raw.psl

    rmdir temp
    cd /hive/data/genomes/hg19/bed/sts
    cat run*/raw.psl | egrep -v "^$|^psLayout|^match|^ |^-" \
	| pslReps -nearTop=0.0001 -minCover=0.6 -minAli=0.8 -noIntrons stdin \
	stsMarkers.psl /dev/null
    #	Processed 7412166 alignments
    #	-rw-rw-r-- 1 12031760 May  1 11:57 stsMarkers.psl

    $HOME/kent/src/hg/stsMarkers/extractPslInfo -h stsMarkers.psl
    # creates stsMarkers.psl.initial
    #	-rw-rw-r-- 1  4485053 May  1 12:06 stsMarkers.psl.initial
    wc -l stsMarkers.psl.initial
    #	101338  stsMarkers.psl.initial
    #	this command needs a chrom_names file to work correctly with this
    #	new style of layout for hg19:
    cd /hive/data/genomes/hg19
    cut -f1 chrom.sizes | sed -e "s/chr//" > chrom_names
    cd /hive/data/genomes/hg19/bed/sts

    $HOME/kent/src/hg/stsMarkers/findAccession.pl -agp stsMarkers.psl.initial \
	/cluster/data/hg19
    wc -l stsMarkers.psl.initial.acc
    #	101338  stsMarkers.psl.initial.acc

    sort -k4,4n stsMarkers.psl.initial.acc > stsMarkers.final

    # determine found markers (4th field in file)
    cut -f 4 stsMarkers.final | sort -n -u > stsMarkers.found
    wc -l stsMarkers.found
    #	96472 stsMarkers.found
    #	out of 100520 total sequences from:
    wc -l /hive/data/outside/ncbi/sts.2009-04/all.STS.id
    #	There are lots of duplicates:
    wc -l stsMarkers.final
    #	101338 stsMarkers.final
    #	And a lot of them are just completely haywire:
    awk '$3-$2 < 1001' stsMarkers.final | wc -l
    #	98382
    #	filter out markers that are too long
    awk '$3-$2 < 1001' stsMarkers.final > stsMarkers.1K.size.filtered

    #  alignment of primers
    ssh swarm
    cd /hive/data/outside/ncbi/sts.2009-04
    awk '$0 !~ /[^ACGT0-9\-\t]/ && (length($2) > 10) && (length($3) > 10) {printf "dbSTS_%s\t%s\t%s\n", $1,$2,$3}' \
	    all.primers > all.primers.ispcr
    mkdir primerAlign
    cd primerAlign
    mkdir split
    cd split
    split -l 5000 ../../all.primers.ispcr primer_
    ls > ../primer.list

    cd ..
    #	we need a 10.ooc file for this business
    time blat /scratch/data/hg19/hg19.2bit \
	/dev/null /dev/null -tileSize=10 -makeOoc=10.ooc -repMatch=1024
# Wrote 146902 overused 10-mers to 10.ooc
# real    19m16.758s

    # separate runs for whole genome vs. randoms
    mkdir run
    cd run
    partitionSequence.pl 40000000 20000 /scratch/data/hg19/hg19.2bit \
	/scratch/data/hg19/chrom.sizes 100 -lstDir tParts \
	| egrep -v "tParts|random|_hap|chrUn" \
	| sed -e "s/.*2bit://;" > hg19.list
    cat > runOne.csh << '_EOF_'
#!/bin/csh -fe

set partSpec = $1
set primer = ../split/$2
set result = $3
set tmpFile = "/scratch/tmp/$1.$2"
set start = `echo $partSpec | sed -e "s/.*://; s/-/ /" | awk '{print $1}'`
set end = `echo $partSpec | sed -e "s/.*://; s/-/ /" | awk '{print $2}'`
set range = `echo $start $end | awk '{print $2-$1}'`
set chr = `echo $partSpec | sed -e "s/:.*//"`
set chrSize = `grep -P "^$chr\t" /scratch/data/hg19/chrom.sizes | cut -f2`
/bin/echo -e "$start\t$partSpec\t$range\t$chr\t$chrSize" > $tmpFile.lift
/bin/mkdir -p psl/$partSpec
/bin/rm -f $tmpFile.psl
/cluster/bin/x86_64/isPcr -out=psl -minPerfect=2 -maxSize=5000 -tileSize=10 \
    -ooc=/hive/data/outside/ncbi/sts.2009-04/primerAlign/10.ooc -stepSize=5 \
	/scratch/data/hg19/hg19.2bit:$partSpec $primer $tmpFile.psl
/bin/rm -f $result
/cluster/bin/x86_64/liftUp -type=.psl $result $tmpFile.lift error $tmpFile.psl
rm -f $tmpFile.lift $tmpFile.psl
'_EOF_'
    # << happy emacs
    chmod +x runOne.csh

    cat > template << '_EOF_'
#LOOP
runOne.csh $(file1) $(root2) {check out line+ psl/$(file1)/$(root2).psl}
#ENDLOOP
'_EOF_'
    # << happy emacs

    gensub2 hg19.list ../primer.list template jobList
    para create jobList
# 5696 jobs in batch
    para try ... check ... push ... etc
# Completed: 5696 of 5696 jobs
# CPU time in finished jobs:     203899s    3398.32m    56.64h    2.36d  0.006 y
# IO & Wait Time:                 22049s     367.48m     6.12h    0.26d  0.001 y
# Average job time:                  40s       0.66m     0.01h    0.00d
# Longest finished job:            5314s      88.57m     1.48h    0.06d
# Submission to last job:          5418s      90.30m     1.50h    0.06d
# Estimated complete:                 0s       0.00m     0.00h    0.00d

    #	sort and filter the results
    cd psl
    pslSort dirs raw.psl temp chr*
    #	5696 files in 89 dirs
    #	Got 5696 files 75 files per mid file
    #	-rw-rw-r-- 1 456802973 May  4 13:32 raw.psl
    cd ..
    mkdir filter
    pslQuickFilter -minMatch=26 -maxMismatch=5 \
        -maxTinsert=5000 -verbose psl/ filter/
    #	-rw-rw-r-- 1 50302564 May  4 13:35 raw.psl

    #	And, for the randoms
    mkdir /hive/data/outside/ncbi/sts.2009-04/primerAlign/runRandoms
    cd /hive/data/outside/ncbi/sts.2009-04/primerAlign/runRandoms
    
    partitionSequence.pl 40000000 20000 /scratch/data/hg19/hg19.2bit \
	/scratch/data/hg19/chrom.sizes 100 -lstDir tParts \
	| egrep "tParts|random|_hap|chrUn" \
	| sed -e "s/.*2bit://;" > hg19.list
    cat tParts/* | sed -e "s/.*2bit://;" > hg19.list
    cat tParts/* > hg19.list

    cat > runOne.csh << '_EOF_'
#!/bin/csh -fe

set partSpec = $1
set primer = ../split/$2
set result = $3
set tmpFile = "/scratch/tmp/$1.$2"
/bin/mkdir -p psl/$partSpec
/bin/rm -f $tmpFile.psl
/cluster/bin/x86_64/isPcr -out=psl -minPerfect=2 -maxSize=5000 -tileSize=10 \
    -ooc=/hive/data/outside/ncbi/sts.2009-04/primerAlign/10.ooc -stepSize=5 \
	/scratch/data/hg19/hg19.2bit:$partSpec $primer $tmpFile.psl
/bin/rm -f $result
mv $tmpFile.psl $result
'_EOF_'
    # << happy emacs
    chmod +x runOne.csh

    #	can not use line+ check here, many of them are empty
    cat > template << '_EOF_'
#LOOP
runOne.csh $(file1) $(root2) {check out line psl/$(file1)/$(root2).psl}
#ENDLOOP
'_EOF_'
    # << happy emacs

    gensub2 hg19.list ../primer.list template jobList
    #	they run quickly, limit to 100
    para -maxJob=100 create jobList
    para try ... check ... push ... etc
# Completed: 4416 of 4416 jobs
# CPU time in finished jobs:       1746s      29.09m     0.48h    0.02d  0.000 y
# IO & Wait Time:                 11407s     190.12m     3.17h    0.13d  0.000 y
# Average job time:                   3s       0.05m     0.00h    0.00d
# Longest finished job:               8s       0.13m     0.00h    0.00d
# Submission to last job:           147s       2.45m     0.04h    0.00d

    #	sort and filter the results
    cd psl
    pslSort dirs raw.psl temp chr*
    #	4416 files in 69 dirs
    #	Got 4416 files 66 files per mid file
    rmdir temp
    #	-rw-rw-r-- 1 9066053 May  4 13:31 raw.psl

    #	putting the two runs together
    mkdir /hive/data/outside/ncbi/sts.2009-04/primerAlign/psl
    cd /hive/data/outside/ncbi/sts.2009-04/primerAlign/psl
    ln -s ../run/filter/raw.psl run.psl
    ln -s ../runRandoms/filter/raw.psl runRandoms.psl
    #	-rw-rw-r-- 1 50302564 May  4 13:35 run.psl
    #	-rw-rw-r-- 1   825973 May  4 13:35 runRandoms.psl
    cd ..
    pslSort dirs primers.psl temp psl
    #	2 files in 1 dirs
    #	Got 2 files 1 files per mid file
    #	-rw-rw-r-- 1 51128110 May  4 13:39 primers.psl
    wc -l primers.psl
    #	448107 primers.psl
    rmdir temp
    pslFilterPrimers primers.psl ../all.primers primers.filter.psl
    # creates primers.filter.unlifted.psl.notfound.primers
    wc -l primers*
    #	237962 primers.filter.psl
    #	97191 primers.filter.psl.notfound.primers

    #	see if ePCR can find some of these notfound
    ssh swarm
    mkdir /hive/data/outside/ncbi/sts.2009-04/primerAlign/epcr
    cd /hive/data/outside/ncbi/sts.2009-04/primerAlign/epcr

    mkdir split
    cd split
    split -l 5000 ../../primers.filter.psl.notfound.primers  primers_
    cd ..
    ls -1S split > primers.lst
    partitionSequence.pl 40000000 20000 /scratch/data/hg19/hg19.2bit \
	/scratch/data/hg19/chrom.sizes 100 -lstDir tParts \
	| grep -v tParts | sed -e "s/.*2bit://;" > hg19.list
    cat tParts/* | sed -e "s/.*2bit://;" >> hg19.list

    cat > runOne.csh << '_EOF_'
#!/bin/csh -fe

set partSpec = $1
set primer = split/$2
set result = $3
set tmpFile = "/scratch/tmp/$1.$2"
set start = `echo $partSpec | sed -e "s/.*://; s/-/ /" | awk '{print $1}'`
set end = `echo $partSpec | sed -e "s/.*://; s/-/ /" | awk '{print $2}'`
set range = `echo $start $end | awk '{print $2-$1}'`
set chr = `echo $partSpec | sed -e "s/:.*//"`
set chrSize = `grep -P "^$chr\t" /scratch/data/hg19/chrom.sizes | cut -f2`
/bin/echo -e "$start\t$partSpec\t$range\t$chr\t$chrSize" > $tmpFile.lift
/bin/mkdir -p epcr/$partSpec
/bin/rm -f $tmpFile.psl
twoBitToFa /scratch/data/hg19/hg19.2bit:$partSpec $tmpFile.fa
/cluster/bin/scripts/runEpcr64 $primer $tmpFile.fa $tmpFile.epcr
/bin/rm -f $result
/bin/mv $tmpFile.epcr $result
rm -f $tmpFile.fa $tmpFile.lift $tmpFile.psl $tmpFile.*
'_EOF_'
    # << happy emacs
    chmod +x runOne.csh

    cat > template << '_EOF_'
#LOOP
runOne.csh $(file1) $(root2) {check out line epcr/$(file1)/$(root2).epcr}
#ENDLOOP
'_EOF_'
    # << happy emacs

    gensub2 hg19.list primers.lst template jobList
    para create jobList
	# 3160 jobs
    para try ... check ... push ... etc ...
# Completed: 3160 of 3160 jobs
# CPU time in finished jobs:      86253s    1437.54m    23.96h    1.00d  0.003 y
# IO & Wait Time:                 11196s     186.61m     3.11h    0.13d  0.000 y
# Average job time:                  31s       0.51m     0.01h    0.00d
# Longest finished job:              89s       1.48m     0.02h    0.00d
# Submission to last job:           237s       3.95m     0.07h    0.00d

    find ./epcr -type f | xargs cat > all.epcr
    wc -l all.epcr
    #	797286 all.epcr
    # convert the coordinates from the partitionSequence.pl to a lift file
    awk '{print $1}' all.epcr | sort -u > hg19.partSpec.txt
    $HOME/kent/src/hg/stsMarkers/liftFromSpec.pl hg19 hg19.partSpec.txt \
	> all.epcr.lift
    cat all.epcr | sed -e "s/\.\./ /; s/  */\t/g" \
	| liftUp -type=.bed stdout all.epcr.lift error stdin \
	| awk '
{
printf "%s %d..%d %d %d\n", $1, $2, $3, $4, $5
}
' > all.epcr.lifted

    pslFilterPrimers -epcr=all.epcr.lifted -verbose=1 ../primers.psl \
    /cluster/home/hiram/bin/x86_64/pslFilterPrimers -epcr=all.epcr.lifted \
	-verbose=1 ../primers.psl ../../all.primers epcr.primers.psl
    #	this took a long time, many hours
# -rw-rw-r--   1  2785254 May  5 17:28 epcr.not.found
# -rw-rw-r--   1 27343510 May  5 17:28 epcr.primers.psl
# -rw-rw-r--   1  1616885 May  5 17:28 epcr.primers.psl.notfound.primers

    time ./epcrToHgPsl.pl epcr.not.found ../../all.primers \
    time $HOME/kent/src/hg/stsMarkers/epcrToPsl epcr.not.found \
	../../all.primers /hive/data/genomes/hg19
    #	real    69m38.444s
    #	-rw-rw-r--   1        0 May  6 14:18 epcr.not.found.nomatch
    #	-rw-rw-r--   1  8369138 May  6 15:26 epcr.not.found.psl

    #	combining everything together now
    cd /hive/data/outside/ncbi/sts.2009-04/primerAlign

    sort -u primers.filter.psl epcr/epcr.primers.psl epcr/epcr.not.found.psl \
                | sort -k15,15 -k17,17n > primers.final.psl
    wc -l primers.final.psl
    #	310705 primers.final.psl

    time $HOME/kent/src/hg/stsMarkers/fixPrimersQueryGaps.pl \
        ../all.primers primers.final.psl > primers.final.fix.psl
    #	real    0m19.580s
    wc -l primers.final.fix.psl
    #	310705 primers.final.fix.psl

    # Extract relevant info, make alignments unique, and create final file to
    #	be merged with full sequence alignments
    $HOME/kent/src/hg/stsMarkers/extractPslInfo -h primers.final.fix.psl
    #	real    0m15.303s
    #	-rw-rw-r-- 1 15660447 May  6 15:44 primers.final.fix.psl.initial
    wc -l primers.final.fix.psl.initial
    #	308210 primers.final.fix.psl.initial
    $HOME/kent/src/hg/stsMarkers/findAccession.pl -agp \
	primers.final.fix.psl.initial /hive/data/genomes/hg19
    wc -l primers.final.fix.psl.initial.acc
    #	308210 primers.final.fix.psl.initial.acc

    $HOME/kent/src/hg/stsMarkers/getStsId ../stsInfo2.bed \
	primers.final.fix.psl.initial.acc | sort -k 4n > primers.final
    wc -l primers.final
    # 308210 primers.final
    #	There doesn't appear to be any use for this primers.ids list
    #	except for curiosity.  Check the head and tail of this list to
    #	verify no garbage is in here.  There should just be numbers.
    awk '{print $4}' primers.final | sort -n | uniq > primers.ids
    wc -l primers.ids
    #	290961 primers.ids

    # Merge primer and sequence files to create final bed file
    # Merge (combineSeqPrimerPos) takes about an hour to run
    cd /hive/data/genomes/hg19/bed/sts
    time $HOME/kent/src/hg/stsMarkers/combineSeqPrimerPos stsMarkers.final \
	/hive/data/outside/ncbi/sts.2009-04/primerAlign/primers.final
    #	real    0m12.310s
    #	-rw-rw-r-- 1 15222346 May  6 15:55 stsMarkers_pos.rdb
    wc -l stsMarkers_pos.rdb
    #	315308 stsMarkers_pos.rdb

    time /cluster/bin/scripts/createSTSbed \
	/hive/data/outside/ncbi/sts.2009-04/stsInfo2.bed  \
	stsMarkers_pos.rdb > stsMap.bed
    #	real    0m31.886s
    #	-rw-rw-r-- 1 38244880 May  6 16:25 stsMap.bed
    wc -l stsMap.bed
    #	305914 stsMap.bed

    # Set up sequence files
    ssh hgwdev
    mkdir /gbdb/hg19/sts.11/
    ln -s /hive/data/outside/ncbi/sts.11/all.STS.fa \
	/gbdb/hg19/sts.11/all.STS.fa
    ln -s /hive/data/outside/ncbi/sts.11/all.primers.fa \
        /gbdb/hg19/sts.11/all.primers.fa

    # Load all files
    cd /hive/data/genomes/hg19/bed/sts
    hgLoadSeq hg19 /gbdb/hg19/sts.11/all.STS.fa /gbdb/hg19/sts.11/all.primers.fa
    #	Creating seq.tab file
    #	Adding /gbdb/hg19/sts.11/all.STS.fa
    #	100520 sequences
    #	Adding /gbdb/hg19/sts.11/all.primers.fa
    #	317592 sequences
    #	Updating seq table
    #	Advisory lock has been released
    #	All done


    hgsql hg19 < $HOME/kent/src/hg/lib/stsInfo2.sql
    hgsql hg19 < $HOME/kent/src/hg/lib/stsAlias.sql
    #	these files already exist here from previous operations
    # cp -p /hive/data/outside/ncbi/sts.11/{stsInfo2.bed,stsAlias.bed} .
    hgsql hg19 -e 'load data local infile "stsInfo2.bed" into table stsInfo2'
    hgsql hg19 -e 'load data local infile "stsAlias.bed" into table stsAlias'
    #	a couple minutes for each load above
    #	filter the stsMap.bed to eliminate items longer than 5,000 bases,
    #	takes out about 850:
    awk '$3-$2 < 5001' stsMap.bed | sort -k1,1 -k2,2n \
	> stsMap.filtered.5000.bed

    hgLoadBed -notItemRgb -noBin -tab \
	-sqlTable=$HOME/kent/src/hg/lib/stsMap.sql hg19 stsMap \
	    stsMap.filtered.5000.bed
    #	Loaded 305064 elements of size 28

    ln -s \
/hive/data/outside/ncbi/sts.2009-04/primerAlign/primers.final.fix.psl \
	primers.psl

    hgLoadPsl -nobin -table=all_sts_primer hg19 primers.psl
    hgLoadPsl -nobin -table=all_sts_seq hg19 stsMarkers.psl

##############################################################################
# FISH CLONES (WORKING - 2009-04-29 - Hiram)
# The STS Marker and BAC End Pairs tracks must be completed prior to
# creating this track.  

    mkdir /hive/data/outside/ncbi/fishClones/fishClones.2009-04/
    cd /hive/data/outside/ncbi/fishClones/fishClones.2009-04/

# Download information from NCBI
        # point browser at:
#   http://www.ncbi.nlm.nih.gov/genome/cyto/cytobac.cgi?CHR=all&VERBOSE=ctg
# change "Sequence tag:" to "placed on contig"
        # change "Show details on sequence-tag" to "yes"
        # change "Download or Display" to "Download table for UNIX"
        # press Submit - save as
# /hive/data/outside/ncbi/fishClones/fishClones.2009-04/hbrc.txt
    chmod 664 /hive/data/outside/ncbi/fishClones/fishClones.2009-04/hbrc.txt

#	Unfortunately the format of this hbrc file has changed since
#	last time.  The columns have been rearranged, and one important
#	column is missing, the contig information.  So, let's see if we
#	can recover the original format by putting this together with
#	some other things we have here.
    $HOME/kent/src/hg/fishClones/fixup.hbrc.pl hbrc.txt \
	/hive/data/genomes/hg19/bed/fishClones/seq_clone.pmd > fixed.hbrc.txt \
	    2> dbg
XXX - need to get this seq_clone.pmd from NCBI, maybe Paul Kitts
    #	the seq_clone.pmd file was obtained via email from Wonhee Jang
    #	jang at ncbi.nlm.nih.gov - I have asked for clarification where
    #	such a file can be fetched without resorting to email.

# Get current clone/accession information
    wget --timestamping ftp://ftp.ncbi.nih.gov/repository/clone/reports/clac.out
http://www.ncbi.nlm.nih.gov/genome/clone/DATA/clac.out

# Create initial Fish Clones bed file
    ssh kkstore02
    mkdir /hive/data/genomes/hg19/bed/fishClones
    cd /hive/data/genomes/hg19/bed/fishClones

    # Copy previous sts info from fhcrc
    cp -p /hive/data/genomes/hg18/bed/fishClones/fhcrc.sts .
    #	This fhcrc.sts listing doesn't change.  It is merely a listing
    #	of aliases that remain in effect.

    #	Create cl_acc_gi_len file form cloneend information:
    grep -v "^#" /hive/data/genomes/hg19/bed/cloneend/all.txt \
    | awk '{gsub(".[0-9]*$", "", $2);
	printf "%s\t%s\t%s\t%s\t%s\t%s\n", $1,$2,$3,$4,$5,$8}' > cl_acc_gi_len

    hgsql -N \
	-e "select chrom,chromStart,chromEnd,contig from ctgPos;" hg19 \
	| sort -k1,1 -k2,2n > ctgPos.bed
    hgsql -N \
-e "select chrom,chromStart,chromEnd,frag,0,strand from gold;" hg19 \
	| sort -k1,1 -k2,2n > gold.bed
    hgsql -N \
-e "select tName,tStart,tEnd,qName,0,strand from all_bacends;" hg19 \
	| sort -k1,1 -k2,2n > all_bacends.bed
    hgsql -N \
-e "select chrom,chromStart,chromEnd,name,score,strand from bacEndPairs;" hg19 \
	| sort -k1,1 -k2,2n > bacEndPairs.bed


    ssh hgwdev
    #	have to be on hgwdev for this since it is going to read from the
    #	database.  Had to work on this program to get it past what is
    #	evidently a bad entry in hbrc.fixed where columns of information
    #	are missing for one clone in particular
    time fishClones -verbose=2 -fhcrc=fhcrc.sts -noBin hg19 \
	/hive/data/genomes/hg19/bed/ncbiCytoBand/contig/fixed.hbrc.txt \
	/hive/data/outside/ncbi/fishClones/fishClones.2009-04/clac.out \
         ./cl_acc_gi_len \
	/hive/data/genomes/hg19/bed/bacends/bacEnds.load.psl \
            fishClones
    #	real    2m4.708s
# Reading Fish Clones file /hive/data/genomes/ncbi/fishClones/fishClones.2006-01/hbrc.fixed
# reading fishInfo file /hive/data/genomes/ncbi/fishClones/fishClones.2006-01/fixed.hbrc.txt
# Reading Clone/Acc (clac.out) file /hive/data/genomes/ncbi/fishClones/fishClones.2006-01/clac.out
# Reading BAC Ends file ./cl_acc_gi_len
# Reading BAC Ends psl file /hive/data/genomes/hg19/bed/bacends/bacEnds.lifted.psl
# Reading additional STS Marker links fhcrc.sts
# Determining good positions
#	findClonePos: determining positions of fish clones
# Writing output file
# ERROR: at line # 170, no cytoband info for chrX:104048913-104206974
# RP11-79L11
# ERROR: at line # 171, no cytoband info for chrX:104048913-104206974
# RP11-79L11

    # Load the track
    ssh hgwdev
    cd /hive/data/genomes/hg19/bed/fishClones
    hgLoadBed -notItemRgb -noBin -tab \
        -sqlTable=$HOME/kent/src/hg/lib/fishClones.sql \
	hg19 fishClones fishClones.bed
    #	Loaded 9461 elements of size 16

##############################################################################
# CytoBands from Wonhee Jang at NCBI (DONE - 2009-06-10 - Hiram)

    mkdir /hive/data/genomes/hg19/bed/ncbiCytoBand
    cd /hive/data/genomes/hg19/bed/ncbiCytoBand
    #	received the following files via email:
    ls -ogrt
# -rw-rw-r-- 1 187930 Jun 10 13:53 ideogram
# -rw-rw-r-- 1 672327 Jun  8 09:55 fish.markers.bed

    #	created cytobands.bed from the ideogram file with:
    cat << '_EOF_' > ideoToCytoBand.pl
#!/usr/bin/env perl

use strict;
use warnings;

open (FH,"<ideogram") or die "can not read ideogram";

while (my $line = <FH>) {
    next if $line =~ m/^#/;
    chomp $line;
    my ($chr, $arm, $location, $a, $b, $start, $end, $stain) =
        split('\s+',$line);
    next if ($location =~ m/[a-z]$/);
//g;$stain =~ s/
    $start -= 1 if ($start == 1);
    printf "chr%s\t%d\t%d\t%s%s\t%s\n", $chr, $start, $end, $arm, $location,
        $stain;
}

close (FH);
'_EOF_'
    # << happy emacs
    chmod +x ideoToCytoBand.pl
    ./ideoToCytoBand.pl > cytobands.bed

    hgLoadBed -noBin -tab -sqlTable=${HOME}/src/hg/lib/cytoBand.sql \
        hg19 cytoBand cytobands.bed

    hgLoadBed -noBin -tab -sqlTable=${HOME}/src/hg/lib/cytoBandIdeo.sql \
        hg19 cytoBandIdeo cytobands.bed
    #	checking coverage:
    featureBits -noRandom -noHap -countGaps hg19 cytoBand
    #	3095677412 bases of 3095693983 (99.999%) in intersection
    #	that is everything except chrM:
    echo 3095693983-3095677412 | bc -q
    #	16571

##########################################################################
# CYTOBANDIDEO update -  (DONE - 2013-02-27 - kuhn)
# adding rows for chroms with no cytology
# this is just for navigation/orientation on those chroms

    set db=hg19
    set sql=~/kent/src/hg/lib/cytoBandIdeo.sql
    # make backup of existing table
    hgsql -e "CREATE TABLE cytoBandIdeoCopy SELECT * FROM cytoBandIdeo" $db
    # dump existing table
    hgsql -N -e "SELECT * FROM cytoBandIdeo" $db > $db.cytoBandIdeo

    # find chroms already covered
    hgsql -N -e 'SELECT chrom FROM cytoBandIdeo' $db \
       | sort -u > $db.coveredNames
    # make cytoBand records for chroms not already covered
    hgsql -N -e 'SELECT chrom, size FROM chromInfo' $db \
      | grep -wvf $db.coveredNames \
      | awk '{print $1"\t0\t"$2"\t\tgneg"}' > $db.cytoBandNew
    # check
    wc -l $db.*
    # combine and sort
    cat $db.cytoBandNew $db.cytoBandIdeo > $db.cytoBandIdeoFull
    bedSort $db.cytoBandIdeoFull $db.cytoBandIdeoFull
    # replace exsting table
    hgsql -e "DROP TABLE cytoBandIdeo" $db
    hgLoadSqlTab $db cytoBandIdeo $sql $db.cytoBandIdeoFull
    # check and then drop copy

##############################################################################
# UCSC to Ensembl chr name mapping (DONE - 2009-05-08 - Hiram)
    #	new names as of Ensembl version 57, see below
    mkdir /hive/data/genomes/hg19/ensembl
    cd /hive/data/genomes/hg19/ensembl
    wget --timestamping \
	'ftp://ftp.ensembl.org/pub/pre/homo_sapiens/GRCh37/dna/*'
    #	do not need the repeat masker sequence (although it would be
    #	interesting to measure to see how it compares)
    rm -f *.dna_rm.*
    #	fortunately we have the same sizes as Ensembl for everything
    #	(except the haplotypes) and the sizes are unique for each sequence
    #	so we can relate the names via their sizes
    mkdir /hive/data/genomes/hg19/bed/ucscToEnsembl
    cd /hive/data/genomes/hg19/bed/ucscToEnsembl
    #	the toplevel file is a duplicate of everything else
    ls /hive/data/genomes/hg19/ensembl/*.fa.gz | grep -v toplevel \
	| while read F
do
    zcat "${F}"
done | faCount stdin > faCount.txt

    cat << '_EOF_' > relateUcscEnsembl.pl
#!/usr/bin/env perl

use strict;
use warnings;

my %ucscChrs;   # key is size, value is UCSC chr name

open (FH,"<../../chrom.sizes") or die "can not read ../../chrom.sizes";
while (my $line = <FH>) {
    chomp $line;
    my ($chr, $size) = split('\s+', $line);
    die "'$line\n'duplicate size in ../chrom.sizes" if (exists($ucscChrs{$size})
);
    $ucscChrs{$size} = $chr;
}
close (FH);

my %ensemblChrs;        # key is size, value is Ensembl chr name

open (FH,"<faCount.txt") or die "can not read faCount.txt";
while (my $line = <FH>) {
    next if ($line =~ m/#/);
    next if ($line =~ m/total/);
    chomp $line;
    my ($chr, $size, $rest) = split('\s+', $line, 3);
    die "'$line\n'duplicate size in faCount.txt" if (exists($ensemblChrs{$size})
);
    $ensemblChrs{$size} = $chr;
}
close (FH);

my %usedUcscChrs;
my %usedEnsemblChrs;
my %ensemblTranslate; # key is Ensembl name, value is UCSC size
foreach my $size (keys %ucscChrs) {
    if (exists($ensemblChrs{$size})) {
        $usedUcscChrs{$size} = $ucscChrs{$size};
        $usedEnsemblChrs{$size} = $ensemblChrs{$size};
        printf "%s\t%s\t%d\n", $ucscChrs{$size}, $ensemblChrs{$size}, $size;
    } else {
        my $ucscName = $ucscChrs{$size};
        my $ensemblName = "unknown";
        if ($ucscName =~ m/^chr6/) {
            $ucscName =~ s/_hap.//;
            $ucscName =~ s/chr6_/chr6_mhc_/;
            $ensemblName = "HS" . uc($ucscName);
        } elsif ($ucscName =~ m/^chr17_/ || $ucscName =~ m/^chr4_/) {
            $ucscName =~ s/_.*/_1/;
            $ensemblName = "HS" . uc($ucscName);
        } elsif ($ucscName =~ m/^chrM/) {
            print "# no translation for chrM\n";
        } else {
            die "unknown UCSC chr name: $ucscName";
        }
        printf "# ucsc $ucscChrs{$size} -> $ensemblName\n";
        $ensemblTranslate{$ensemblName} = $size;
    }
}

foreach my $size (keys %ensemblChrs) {
    if (!exists($usedEnsemblChrs{$size})) {
        my $ensemblName = $ensemblChrs{$size};
        if (! exists($ensemblTranslate{$ensemblName})) {
            die "can not translate Ensembl name $ensemblName";
        } else {
            my $ucscSize = $ensemblTranslate{$ensemblName};
            printf "%s\t%s\t%d\t%d\n", $ucscChrs{$ucscSize}, $ensemblChrs{$size}
, $ucscSize, $size;
        }
    }
}

printf "chrM\tMT\n";
'_EOF_'
    # << happy emacs
    chmod +x relateUcscEnsembl.pl

    ./relateUcscEnsembl.pl  2>&1 | grep -v "^#" \
	| awk '{printf "%s\t%s\n", $1, $2}' | sort > ucscToEnsembl.tab

    cat << '_EOF_' > ucscToEnsembl.sql
# UCSC to Ensembl chr name translation
CREATE TABLE ucscToEnsembl (
    ucsc varchar(255) not null,        # UCSC chromosome name
    ensembl varchar(255) not null,     # Ensembl chromosome name
              #Indices
    PRIMARY KEY(ucsc(21))
);
'_EOF_'

    hgsql hg19 < ucscToEnsembl.sql
    hgsql hg19 \
-e 'LOAD DATA LOCAL INFILE "ucscToEnsembl.tab" INTO TABLE ucscToEnsembl'

    awk '{printf "%s\t%d\n", $2, -$1}' ../../jkStuff/ensGene.haplotype.lift \
	> ensemblLift.tab

    cat << '_EOF_' > ensemblLift.sql
# UCSC offset to Ensembl coordinates
CREATE TABLE ensemblLift (
    chrom varchar(255) not null,      # Ensembl chromosome name
    offset int unsigned not null,     # offset to add to UCSC position 
              #Indices
    PRIMARY KEY(chrom(15))
);
'_EOF_'

    hgsql hg19 < ensemblLift.sql
    hgsql hg19 \
-e 'LOAD DATA LOCAL INFILE "ensemblLift.tab" INTO TABLE ensemblLift'

##############################################################################
# LASTZ MOUSE Mm9 (DONE - 2009-05-13 - Hiram)
    mkdir /hive/data/genomes/hg19/bed/lastzMm9.2009-05-13
    cd /hive/data/genomes/hg19/bed/lastzMm9.2009-05-13

    cat << '_EOF_' > DEF
# human vs mouse
BLASTZ_ABRIDGE_REPEATS=1

# TARGET: Human Hg19
SEQ1_DIR=/scratch/data/hg19/nib
SEQ1_SMSK=/scratch/data/hg19/linSpecRep/lineageSpecificRepeats
SEQ1_LEN=/scratch/data/hg19/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000

# QUERY: Mouse Mm9
SEQ2_DIR=/scratch/data/mm9/nib
SEQ2_SMSK=/scratch/data/mm9/notInOthers
SEQ2_LEN=/scratch/data/mm9/chrom.sizes
SEQ2_CHUNK=10000000
SEQ2_LAP=0
    
BASE=/hive/data/genomes/hg19/bed/lastzMm9.2009-05-13
TMPDIR=/scratch/tmp
'_EOF_'
    # << happy emacs

    #	establish a screen to control this job
    screen
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
	`pwd`/DEF \
	-noLoadChainSplit -syntenicNet \
	-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
	-chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 &
    cat fb.hg19.chainMm9Link.txt 
    #	1022734273 bases of 2897316137 (35.299%) in intersection

    #	and the swap
    mkdir /hive/data/genomes/mm9/bed/blastz.hg19.swap
    cd /hive/data/genomes/mm9/bed/blastz.hg19.swap
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
	/hive/data/genomes/hg19/bed/lastzMm9.2009-05-13/DEF \
	-swap -noLoadChainSplit -syntenicNet \
	-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
	-chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 &
    #	real    131m58.763s
    cat fb.mm9.chainHg19Link.txt 
    #	1013880568 bases of 2620346127 (38.693%) in intersection

#########################################################################
# LASTZ Dog CanFam2 (DONE - 2009-05-13 - Hiram)
    mkdir /hive/data/genomes/hg19/bed/lastzCanFam2.2009-05-13
    cd /hive/data/genomes/hg19/bed/lastzCanFam2.2009-05-13

    cat << '_EOF_' > DEF
# human vs dog
BLASTZ_ABRIDGE_REPEATS=1

# TARGET: Human Hg19
SEQ1_DIR=/scratch/data/hg19/nib
SEQ1_SMSK=/scratch/data/hg19/linSpecRep/lineageSpecificRepeats
SEQ1_LEN=/scratch/data/hg19/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000

# QUERY: Dog CanFam2
SEQ2_DIR=/scratch/data/canFam2/nib
SEQ2_LEN=/scratch/data/canFam2/chrom.sizes
SEQ2_SMSK=/scratch/scratch/data/canFam2/linSpecRep.notInHuman
SEQ2_IN_CONTIGS=0
SEQ2_CHUNK=20000000
SEQ2_LAP=0

BASE=/hive/data/genomes/hg19/bed/lastzCanFam2.2009-05-13
TMPDIR=/scratch/tmp
'_EOF_'
    # << happy emacs

    #	establish a screen to control this job
    screen
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
	`pwd`/DEF \
	-noLoadChainSplit -syntenicNet \
	-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=pk \
	-chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 &
    cat fb.hg19.chainCanFam2Link.txt 
    #	1532073507 bases of 2897316137 (52.879%) in intersection

    #	running the swap - DONE - 2009-06-02
    mkdir /hive/data/genomes/canFam2/bed/blastz.hg19.swap
    cd /hive/data/genomes/canFam2/bed/blastz.hg19.swap
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
	/hive/data/genomes/hg19/bed/lastzCanFam2.2009-05-13/DEF \
	-noLoadChainSplit -swap \
	-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=pk \
	-chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 &
    #	real    200m17.158s
    cat fb.canFam2.chainHg19Link.txt 
    #	1480018167 bases of 2384996543 (62.055%) in intersection
#########################################################################
# LASTZ Chicken GalGal3 (DONE - 2009-05-13 - Hiram)
    mkdir /hive/data/genomes/hg19/bed/lastzGalGal3.2009-05-13
    cd /hive/data/genomes/hg19/bed/lastzGalGal3.2009-05-13

    cat << '_EOF_' > DEF
# human vs chicken
# Specific settings for chicken (per Webb email to Brian Raney)
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=10000
BLASTZ_K=2200
BLASTZ_Q=/scratch/data/blastz/HoxD55.q
BLASTZ_ABRIDGE_REPEATS=1

# TARGET: Human hg19
SEQ1_DIR=/scratch/data/hg19/nib
SEQ1_SMSK=/scratch/data/hg19/lineageSpecificRepeats
SEQ1_LEN=/scratch/data/hg19/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000

# QUERY: Chicken galGal3 - single chunk big enough to run entire chrom
SEQ2_DIR=/scratch/data/galGal3/nib
SEQ2_LEN=/scratch/data/galGal3/chrom.sizes
SEQ2_SMSK=/scratch/data/galGal3/linSpecRep
SEQ2_CHUNK=200000000
SEQ2_LAP=0

BASE=/hive/data/genomes/hg19/bed/lastzGalGal3.2009-05-13
TMPDIR=/scratch/tmp
'_EOF_'
    # << happy emacs

    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
	`pwd`/DEF \
	-syntenicNet \
	-noLoadChainSplit \
	-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=pk \
	-chainMinScore=5000 -chainLinearGap=loose > do.log 2>&1
    cat fb.hg19.chainGalGal3Link.txt 
    #	104053179 bases of 2897316137 (3.591%) in intersection

    #	running the swap - DONE - 2009-06-02
    mkdir /hive/data/genomes/galGal3/bed/blastz.hg19.swap
    cd /hive/data/genomes/galGal3/bed/blastz.hg19.swap
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
	/hive/data/genomes/hg19/bed/lastzGalGal3.2009-05-13/DEF \
	-swap \
	-noLoadChainSplit \
	-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=pk \
	-chainMinScore=5000 -chainLinearGap=loose > swap.log 2>&1
    #	real    16m45.090s
    cat fb.galGal3.chainHg19Link.txt 
    #	91605899 bases of 1042591351 (8.786%) in intersection

#########################################################################
# LASTZ Macaca Mulatta RheMac2 (DONE - 2009-05-13 - Hiram)
    mkdir /hive/data/genomes/hg19/bed/lastzRheMac2.2009-05-13
    cd /hive/data/genomes/hg19/bed/lastzRheMac2.2009-05-13

    cat << '_EOF_' > DEF
# human vs macaca mulatta
BLASTZ=lastz
# maximum M allowed with lastz is only 254
BLASTZ_M=254
BLASTZ_Q=/scratch/data/blastz/human_chimp.v2.q
# and place those items here
BLASTZ_O=600
BLASTZ_E=150
# other parameters from panTro2 vs hg18 lastz on advice from Webb
BLASTZ_K=4500
BLASTZ_Y=15000
BLASTZ_T=2

# TARGET: Human Hg19
SEQ1_DIR=/scratch/data/hg19/hg19.2bit
SEQ1_LEN=/scratch/data/hg19/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
SEQ1_IN_CONTIGS=0

# QUERY: Macaca Mulatta RheMac2
SEQ2_DIR=/scratch/data/rheMac2/rheMac2.2bit
SEQ2_LEN=/scratch/data/rheMac2/chrom.sizes
SEQ2_CHUNK=20000000
SEQ2_LAP=0
SEQ2_IN_CONTIGS=0

BASE=/hive/data/genomes/hg19/bed/lastzRheMac2.2009-05-13
TMPDIR=/scratch/tmp
'_EOF_'
    # << happy emacs

    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
	`pwd`/DEF \
	-syntenicNet \
	-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=medium \
	-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
	> do.log 2>&1 &
    #	real    760m22.810s
    cat fb.hg19.chainRheMac2Link.txt 
    #	2397361211 bases of 2897316137 (82.744%) in intersection

    #	running the swap - DONE - 2009-06-02
    mkdir /hive/data/genomes/rheMac2/bed/blastz.hg19.swap
    cd /hive/data/genomes/rheMac2/bed/blastz.hg19.swap
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
	/hive/data/genomes/hg19/bed/lastzRheMac2.2009-05-13/DEF \
	-swap \
	-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=medium \
	-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
	> swap.log 2>&1 &
    #	real    83m51.483s
    cat fb.rheMac2.chainHg19Link.txt 
    #	2313806886 bases of 2646704109 (87.422%) in intersection
#########################################################################
# LASTZ Rat Rn4 (DONE - 2009-05-13 - Hiram)
    mkdir /hive/data/genomes/hg19/bed/lastzRn4.2009-05-13
    cd /hive/data/genomes/hg19/bed/lastzRn4.2009-05-13

    cat << '_EOF_' > DEF
# human vs rat
BLASTZ_ABRIDGE_REPEATS=1

# TARGET: Human Hg19
SEQ1_DIR=/scratch/data/hg19/nib
SEQ1_SMSK=/scratch/data/hg19/lineageSpecificRepeats
SEQ1_LEN=/scratch/data/hg19/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000

# QUERY: Rat Rn4
SEQ2_DIR=/scratch/data/rn4/nib
SEQ2_SMSK=/scratch/data/rn4/linSpecRep.notInHuman
SEQ2_LEN=/scratch/data/rn4/chrom.sizes
SEQ2_CHUNK=10000000
SEQ2_LAP=0
    
BASE=/hive/data/genomes/hg19/bed/lastzRn4.2009-05-13
TMPDIR=/scratch/tmp
'_EOF_'
    # << happy emacs

    #	establish a screen to control this job
    screen
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
	`pwd`/DEF \
	-syntenicNet -noLoadChainSplit \
	-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
	-chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 &
    #	real    314m18.227s
    cat fb.hg19.chainRn4Link.txt 
    #	952605822 bases of 2897316137 (32.879%) in intersection

    #	running the swap - DONE - 2009-06-02
    mkdir /hive/data/genomes/rn4/bed/blastz.hg19.swap
    cd /hive/data/genomes/rn4/bed/blastz.hg19.swap
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
	/hive/data/genomes/hg19/bed/lastzRn4.2009-05-13/DEF \
	-swap -noLoadChainSplit \
	-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
	-chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 &
    #	real    188m0.163s
    cat fb.rn4.chainHg19Link.txt 
    #	947862300 bases of 2571531505 (36.860%) in intersection
##############################################################################
# LASTZ Orangutan PonAbe2 (DONE - 2009-05-13 - Hiram)
    mkdir /hive/data/genomes/hg19/bed/lastzPonAbe2.2009-05-13
    cd /hive/data/genomes/hg19/bed/lastzPonAbe2.2009-05-13

    cat << '_EOF_' > DEF
# human vs orangutan
BLASTZ=lastz
# maximum M allowed with lastz is only 254
BLASTZ_M=254
BLASTZ_Q=/scratch/data/blastz/human_chimp.v2.q
# and place those items here
BLASTZ_O=600
BLASTZ_E=150
# other parameters from panTro2 vs hg18 lastz on advice from Webb
BLASTZ_K=4500
BLASTZ_Y=15000
BLASTZ_T=2

# TARGET: Human Hg19
SEQ1_DIR=/scratch/data/hg19/hg19.2bit
SEQ1_LEN=/scratch/data/hg19/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
SEQ1_IN_CONTIGS=0

# QUERY: Orangutan PonAbe1
SEQ2_DIR=/scratch/data/ponAbe2/ponAbe2.2bit
SEQ2_LEN=/scratch/data/ponAbe2/chrom.sizes
SEQ2_CHUNK=20000000
SEQ2_LAP=0
SEQ2_IN_CONTIGS=0

BASE=/hive/data/genomes/hg19/bed/lastzPonAbe2.2009-05-13
TMPDIR=/scratch/tmp
'_EOF_'
    # << happy emacs

    #	establish a screen to control this job
    screen
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
	`pwd`/DEF \
	-syntenicNet \
	-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=medium \
	-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=pk \
	> do.log 2>&1 &
    cat fb.hg19.chainPonAbe2Link.txt 
    #	2646687531 bases of 2897316137 (91.350%) in intersection

    #	running the swap - DONE - 2009-06-02
    mkdir /hive/data/genomes/ponAbe2/bed/blastz.hg19.swap
    cd /hive/data/genomes/ponAbe2/bed/blastz.hg19.swap
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
	/hive/data/genomes/hg19/bed/lastzPonAbe2.2009-05-13/DEF \
	-swap \
	-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=medium \
	-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=pk \
	> swap.log 2>&1 &
    #	real    124m3.610s
    cat fb.ponAbe2.chainHg19Link.txt 
    #	2772351468 bases of 3093572278 (89.617%) in intersection
##############################################################################
# LASTZ Lamprey PetMar1 (DONE - 2009-05-14 - Hiram)
    mkdir /hive/data/genomes/hg19/bed/lastzPetMar1.2009-05-14
    cd /hive/data/genomes/hg19/bed/lastzPetMar1.2009-05-14

    cat << '_EOF_' > DEF
# Human vs. Lamprey
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_M=50
BLASTZ_Q=/scratch/data/blastz/HoxD55.q

# TARGET: Human Hg19
SEQ1_DIR=/scratch/data/hg19/hg19.2bit
SEQ1_LEN=/scratch/data/hg19/chrom.sizes
SEQ1_CHUNK=100000000
SEQ1_LAP=10000
SEQ2_LIMIT=5

# QUERY: Lamprey petMar1
SEQ2_DIR=/scratch/data/petMar1/petMar1.2bit
SEQ2_LEN=/scratch/data/petMar1/chrom.sizes
SEQ2_CHUNK=20000000
SEQ2_LIMIT=300
SEQ2_LAP=0

BASE=/hive/data/genomes/hg19/bed/lastzPetMar1.2009-05-14
TMPDIR=/scratch/tmp
'_EOF_'
    # << happy emacs

    #	establish a screen to control this job
    screen
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
	`pwd`/DEF \
	-qRepeats=windowmaskerSdust \
	-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \
	-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=pk \
	> do.log 2>&1 &
    #	real    113m20.116s
    cat fb.hg19.chainPetMar1Link.txt 
    #	31347143 bases of 2897316137 (1.082%) in intersection

    #	running the swap - DONE - 2009-06-02
    mkdir /hive/data/genomes/petMar1/bed/blastz.hg19.swap
    cd /hive/data/genomes/petMar1/bed/blastz.hg19.swap
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
	/hive/data/genomes/hg19/bed/lastzPetMar1.2009-05-14/DEF \
	-qRepeats=windowmaskerSdust \
	-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \
	-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=pk \
	-swap > swap.log 2>&1 &
    #	real    59m14.813s
    cat fb.petMar1.chainHg19Link.txt 
    #	26615001 bases of 831696438 (3.200%) in intersection
##############################################################################
# LASTZ Fugu Fr2 (DONE - 2009-05-14 - Hiram)
    mkdir /hive/data/genomes/hg19/bed/lastzFr2.2009-05-14
    cd /hive/data/genomes/hg19/bed/lastzFr2.2009-05-14

    cat << '_EOF_' > DEF
# Human vs. Fugu
# Try "human-fugu" (more distant, less repeat-killed than mammal) params
# +M=50:
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_M=50
BLASTZ_Q=/scratch/data/blastz/HoxD55.q

# TARGET: Human Hg19
SEQ1_DIR=/scratch/data/hg19/hg19.2bit
SEQ1_LEN=/scratch/data/hg19/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
SEQ1_LIMIT=5

# QUERY: Fugu fr2
#       Align to the scaffolds, results lifed up to chrUn.sdTrf coordinates
SEQ2_DIR=/scratch/data/fr2/fr2.2bit
SEQ2_LEN=/hive/data/genomes/fr2/chrom.sizes
SEQ2_CTGDIR=/hive/data/genomes/fr2/noUn/fr2.scaffolds.2bit
SEQ2_CTGLEN=/hive/data/genomes/fr2/noUn/fr2.scaffolds.sizes
SEQ2_LIFT=/hive/data/genomes/fr2/jkStuff/liftAll.lft
SEQ2_CHUNK=20000000
SEQ2_LIMIT=30
SEQ2_LAP=0

BASE=/hive/data/genomes/hg19/bed/lastzFr2.2009-05-14
TMPDIR=/scratch/tmp
'_EOF_'
    # << happy emacs

    #	establish a screen to control this job
    screen
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
	`pwd`/DEF \
	-qRepeats=windowmaskerSdust \
	-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \
	-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=encodek \
	> do.log 2>&1 &
    #	real    5797m9.288s
    #	had a small problem finishing the fundamental batch run, continuing:
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
	`pwd`/DEF \
	-continue=cat -qRepeats=windowmaskerSdust \
	-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \
	-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=encodek \
	> cat.log 2>&1 &
    cat fb.hg19.chainFr2Link.txt 
    #	49309456 bases of 2897316137 (1.702%) in intersection

    #	running the swap - DONE - 2009-06-02
    mkdir /hive/data/genomes/fr2/bed/blastz.hg19.swap
    cd /hive/data/genomes/fr2/bed/blastz.hg19.swap
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
	/hive/data/genomes/hg19/bed/lastzFr2.2009-05-14/DEF \
	-qRepeats=windowmaskerSdust \
	-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \
	-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=encodek \
	-swap > swap.log 2>&1 &
    #	real    25m8.491s
    cat fb.fr2.chainHg19Link.txt 
    #	42984130 bases of 393312790 (10.929%) in intersection

##############################################################################
# LASTZ Tetraodon TetNig1 (DONE - 2009-05-14 - Hiram)
    mkdir /hive/data/genomes/hg19/bed/lastzTetNig1.2009-05-14
    cd /hive/data/genomes/hg19/bed/lastzTetNig1.2009-05-14

    cat << '_EOF_' > DEF
# human vs tetraodon
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_M=50
BLASTZ_Q=/scratch/data/blastz/HoxD55.q

# TARGET: Human Hg19
SEQ1_DIR=/scratch/data/hg19/hg19.2bit
SEQ1_LEN=/scratch/data/hg19/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
SEQ1_LIMIT=5

# QUERY: Tetraodon TetNig1 - single chunk big enough to run entire genome
SEQ2_DIR=/scratch/data/tetNig1/tetNig1.2bit
SEQ2_LEN=/hive/data/genomes/tetNig1/chrom.sizes
SEQ2_CHUNK=410000000
SEQ2_LAP=0

BASE=/hive/data/genomes/hg19/bed/lastzTetNig1.2009-05-14
TMPDIR=/scratch/tmp
'_EOF_'
    # << happy emacs

    #	establish a screen to control this job
    screen
    time nice -n +19 $HOME/kent/src/hg/utils/automation/doBlastzChainNet.pl \
	-verbose=2 \
	`pwd`/DEF \
	-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \
	-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=pk \
	> do.log 2>&1 &
    #	real    166m19.745s
    cat fb.hg19.chainTetNig1Link.txt 
    #	58038079 bases of 2897316137 (2.003%) in intersection

    #	running the swap - DONE - 2009-06-02
    mkdir /hive/data/genomes/tetNig1/bed/blastz.hg19.swap
    cd /hive/data/genomes/tetNig1/bed/blastz.hg19.swap
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
	/hive/data/genomes/hg19/bed/lastzTetNig1.2009-05-14/DEF \
	-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \
	-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=pk \
	-swap > swap.log 2>&1 &
    #	real    29m20.968s
    cat fb.tetNig1.chainHg19Link.txt 
    #	49453375 bases of 342403326 (14.443%) in intersection

##############################################################################
# LASTZ Stickleback GasAcu1 (DONE - 2009-05-14 - Hiram)
    mkdir /hive/data/genomes/hg19/bed/lastzGasAcu1.2009-05-14
    cd /hive/data/genomes/hg19/bed/lastzGasAcu1.2009-05-14

    cat << '_EOF_' > DEF
# Human vs. Stickleback
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_M=50
BLASTZ_Q=/scratch/data/blastz/HoxD55.q

# TARGET: Human Hg19
SEQ1_DIR=/scratch/data/hg19/hg19.2bit
SEQ1_LEN=/scratch/data/hg19/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
SEQ1_LIMIT=5

# TARGET: Stickleback gasAcu1
SEQ2_DIR=/scratch/data/gasAcu1/gasAcu1.2bit
SEQ2_LEN=/hive/data/genomes/gasAcu1/chrom.sizes
SEQ2_CHUNK=10000000
SEQ2_LAP=0

BASE=/hive/data/genomes/hg19/bed/lastzGasAcu1.2009-05-14
TMPDIR=/scratch/tmp
'_EOF_'
    # << happy emacs

    #	establish a screen to control this job
    screen
    time nice -n +19 $HOME/kent/src/hg/utils/automation/doBlastzChainNet.pl \
	-verbose=2 \
	`pwd`/DEF \
	-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \
	-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=pk \
	> do.log 2>&1 &
    #	real    174m40.659s
    cat fb.hg19.chainGasAcu1Link.txt 
    #	55509003 bases of 2897316137 (1.916%) in intersection

    #	running the swap - DONE - 2009-06-02
    mkdir /hive/data/genomes/gasAcu1/bed/blastz.hg19.swap
    cd /hive/data/genomes/gasAcu1/bed/blastz.hg19.swap
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
	/hive/data/genomes/hg19/bed/lastzGasAcu1.2009-05-14/DEF \
	-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \
	-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=pk \
	-swap > swap.log 2>&1 &
    #	real    29m41.433s
    cat fb.gasAcu1.chainHg19Link.txt 
    #	49909819 bases of 446627861 (11.175%) in intersection
##############################################################################
# LASTZ Marmoset CalJac1 (DONE - 2009-05-14,22 - Hiram)
    mkdir /hive/data/genomes/hg19/bed/lastzCalJac1.2009-05-14
    cd /hive/data/genomes/hg19/bed/lastzCalJac1.2009-05-14

    cat << '_EOF_' > DEF
# human vs. marmoset
BLASTZ=lastz
# maximum M allowed with lastz is only 254
BLASTZ_M=254
BLASTZ_Q=/scratch/data/blastz/human_chimp.v2.q
# and place those items here
BLASTZ_O=600
BLASTZ_E=150
# other parameters from panTro2 vs hg18 lastz on advice from Webb
BLASTZ_K=4500
BLASTZ_Y=15000
BLASTZ_T=2

# TARGET: Human Hg19
SEQ1_DIR=/scratch/data/hg19/hg19.2bit
SEQ1_LEN=/scratch/data/hg19/chrom.sizes
SEQ1_CHUNK=20000000
SEQ1_LAP=10000
SEQ1_LIMIT=5

# QUERY: Marmoset (calJac1)
SEQ2_DIR=/scratch/data/calJac1/calJac1.2bit
SEQ2_LEN=/scratch/data/calJac1/chrom.sizes
SEQ2_LIMIT=200
SEQ2_CHUNK=30000000
SEQ2_LAP=0

BASE=/hive/data/genomes/hg19/bed/lastzCalJac1.2009-05-14
TMPDIR=/scratch/tmp
'_EOF_'
    # << happy emacs

    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
	`pwd`/DEF \
	-syntenicNet \
	-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=medium \
	-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
	> do.log 2>&1 &
    #	real    214m16.294s
    cat fb.hg19.chainCalJac1Link.txt 
    #	2053025318 bases of 2897316137 (70.860%) in intersection
    time doRecipBest.pl -buildDir=`pwd` hg19 calJac1 > rbest.log 2>&1 &
    #	real    97m17.207s

    #	running the swap - DONE - 2009-06-02
    mkdir /hive/data/genomes/calJac1/bed/blastz.hg19.swap
    cd /hive/data/genomes/calJac1/bed/blastz.hg19.swap
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
	/hive/data/genomes/hg19/bed/lastzCalJac1.2009-05-14/DEF \
	-swap \
	-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=medium \
	-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
	> swap.log 2>&1 &
    #	real    162m52.189s
    cat fb.calJac1.chainHg19Link.txt 
    #	2105959656 bases of 2929139385 (71.897%) in intersection

#########################################################################
# LASTZ Tarsier TarSyr1 (DONE - 2009-05-14,30 - Hiram)
    mkdir /hive/data/genomes/hg19/bed/lastzTarSyr1.2009-05-14
    cd /hive/data/genomes/hg19/bed/lastzTarSyr1.2009-05-14

    cat << '_EOF_' > DEF
# Human vs. Tarsier

# TARGET: Human Hg19
SEQ1_DIR=/scratch/data/hg19/hg19.2bit
SEQ1_LEN=/scratch/data/hg19/chrom.sizes
SEQ1_CHUNK=200000000
SEQ1_LAP=10000
SEQ1_LIMIT=5

# QUERY: Tarsier
SEQ2_DIR=/scratch/data/tarSyr1/tarSyr1.2bit
SEQ2_LEN=/scratch/data/tarSyr1/chrom.sizes
SEQ2_CHUNK=20000000
SEQ2_LIMIT=50
SEQ2_LAP=0

BASE=/hive/data/genomes/hg19/bed/lastzTarSyr1.2009-05-14
TMPDIR=/scratch/tmp
'_EOF_'
    # << happy emacs

    time nice -n +19 $HOME/kent/src/hg/utils/automation/doBlastzChainNet.pl \
	-verbose=2 \
	`pwd`/DEF \
	-noLoadChainSplit -chainMinScore=3000 -chainLinearGap=medium \
	-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
	> do.log 2>&1 &
    #	real    1724m48.032s
    #	need to load the chain table manually:
    #	mySQL error 1114: The table 'chainTarSyr1Link' is full
    cd /hive/data/genomes/hg19/bed/lastzTarSyr1.2009-05-14/axtChain
    wc -l *.tab
    #	 21882142 chain.tab
    #	165017606 link.tab
    #	186899748 total
    awk '{print length($0)}' link.tab | sort | uniq -c | less
      4 23
      9 24
     27 25
    105 26
    767 27
   1401 28
   5020 29
   8472 30
  24390 31
 117666 32
 264774 33
 776095 34
1632393 35
2672187 36
7125988 37
16831901 38
34905113 39
45218159 40
31570706 41
13746548 42
5868689 43
2460114 44
1118556 45
 420826 46
 106674 47
  36770 48
  40719 49
  36955 50
  19389 51
   5571 52
   1557 53
     61 54

    time nice -n +19 hgsql -e "DROP TABLE chainTarSyr1Link;" hg19

    cat << '_EOF_' | hgsql hg19
    CREATE TABLE chainTarSyr1Link (
      bin smallint(5) unsigned NOT NULL default 0,
      tName varchar(255) NOT NULL default '',
      tStart int(10) unsigned NOT NULL default 0,
      tEnd int(10) unsigned NOT NULL default 0,
      qStart int(10) unsigned NOT NULL default 0,
      chainId int(10) unsigned NOT NULL default 0,
      KEY tName (tName(16),bin),
      KEY chainId (chainId)
    ) ENGINE=MyISAM max_rows=166000000 avg_row_length=42 pack_keys=1 CHARSET=latin1;
'_EOF_'
    # << happy emacs

    time nice -n +19 hgsql -e \
      "load data local infile \"link.tab\" into table chainTarSyr1Link;" hg19
    #	real    157m0.230s
    #	the running the rest of loadUp.csh after the hgLoadChain
    #	real    26m8.263s
    cat fb.hg19.chainTarSyr1Link.txt 
    #	1385797066 bases of 2897316137 (47.830%) in intersection
    #	Continuing:
    time nice -n +19 $HOME/kent/src/hg/utils/automation/doBlastzChainNet.pl \
	-continue=download -verbose=2 \
	`pwd`/DEF \
	-noLoadChainSplit -chainMinScore=3000 -chainLinearGap=medium \
	-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
	> download.log 2>&1 &
    #	real    48m6.573s
    #	ran the script on swarm to recover after hive outages
    time doRecipBest.pl -buildDir=`pwd` hg19 tarSyr1 > rbest.log 2>&1 &
    #	real    404m0.201s
    time doRecipBest.pl -continue=download -buildDir=`pwd` \
	hg19 tarSyr1 > rbest.download.log 2>&1 &

#########################################################################
# LASTZ Bushbaby OtoGar1 (DONE - 2009-05-14,22 - Hiram)
    mkdir /hive/data/genomes/hg19/bed/lastzOtoGar1.2009-05-14
    cd /hive/data/genomes/hg19/bed/lastzOtoGar1.2009-05-14

    cat << '_EOF_' > DEF
# Human vs. Bushbaby

# TARGET: Human Hg19
SEQ1_DIR=/scratch/data/hg19/hg19.2bit
SEQ1_LEN=/scratch/data/hg19/chrom.sizes
SEQ1_CHUNK=200000000
SEQ1_LAP=10000
SEQ1_LIMIT=5

# QUERY: Bushbaby otoGar1 - single chunk big enough to run largest scaffold
SEQ2_DIR=/scratch/data/otoGar1/otoGar1.rmsk.2bit
SEQ2_LEN=/hive/data/genomes/otoGar1/chrom.sizes
SEQ2_LIMIT=200
SEQ2_CHUNK=30000000
SEQ2_LAP=0

BASE=/hive/data/genomes/hg19/bed/lastzOtoGar1.2009-05-14
TMPDIR=/scratch/tmp
'_EOF_'
    # << happy emacs

    time nice -n +19 $HOME/kent/src/hg/utils/automation/doBlastzChainNet.pl \
	-verbose=2 \
	`pwd`/DEF \
	-noLoadChainSplit -chainMinScore=3000 -chainLinearGap=medium \
	-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
	> do.log 2>&1 &
    #	real    762m56.055s
    cat fb.hg19.chainOtoGar1Link.txt 
    #	1264492372 bases of 2897316137 (43.644%) in intersection
    time doRecipBest.pl -buildDir=`pwd` hg19 otoGar1 > rbest.log 2>&1 &
    #	real    271m39.925s

#########################################################################
# LASTZ Mouse lemur MicMur1 (DONE - 2009-05-14,26 - Hiram)
    mkdir /hive/data/genomes/hg19/bed/lastzMicMur1.2009-05-14
    cd /hive/data/genomes/hg19/bed/lastzMicMur1.2009-05-14

    cat << '_EOF_' > DEF
# Human vs. Mouse lemur

# TARGET: Human Hg19
SEQ1_DIR=/scratch/data/hg19/hg19.2bit
SEQ1_LEN=/scratch/data/hg19/chrom.sizes
SEQ1_CHUNK=200000000
SEQ1_LAP=10000
SEQ1_LIMIT=5

# QUERY: Mouse lemur
SEQ2_DIR=/hive/data/genomes/micMur1/bed/repeatMasker/micMur1.rmsk.2bit
SEQ2_LEN=/hive/data/genomes/micMur1/chrom.sizes
SEQ2_CHUNK=20000000
SEQ2_LIMIT=100
SEQ2_LAP=0

BASE=/hive/data/genomes/hg19/bed/lastzMicMur1.2009-05-14
TMPDIR=/scratch/tmp
'_EOF_'
    # << happy emacs

    time nice -n +19 $HOME/kent/src/hg/utils/automation/doBlastzChainNet.pl \
	-verbose=2 \
	`pwd`/DEF \
	-noLoadChainSplit -chainMinScore=3000 -chainLinearGap=medium \
	-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=pk \
	> do.log 2>&1 &
    #	real    5429m52.082s
    #	there is one unusual long running job having trouble
    #	continuing after finishing the lastz run manually:
    time nice -n +19 $HOME/kent/src/hg/utils/automation/doBlastzChainNet.pl \
	-continue=cat -verbose=2 \
	`pwd`/DEF \
	-noLoadChainSplit -chainMinScore=3000 -chainLinearGap=medium \
	-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=pk \
	> cat.log 2>&1 &
    #	real    388m25.032s
    cat fb.hg19.chainMicMur1Link.txt 
    #	1347792207 bases of 2897316137 (46.519%) in intersection
    time doRecipBest.pl -buildDir=`pwd` hg19 micMur1 > rbest.log 2>&1
    #	about 4h30m

#########################################################################
# LASTZ Baboon PapHam1 (DONE - 2009-05-20,22 - Hiram)
    mkdir /hive/data/genomes/hg19/bed/lastzPapHam1.2009-05-20
    cd /hive/data/genomes/hg19/bed/lastzPapHam1.2009-05-20

    cat << '_EOF_' > DEF
# human vs baboon
BLASTZ=lastz
# maximum M allowed with lastz is only 254
BLASTZ_M=254
BLASTZ_Q=/scratch/data/blastz/human_chimp.v2.q
# and place those items here
BLASTZ_O=600
BLASTZ_E=150
# other parameters from panTro2 vs hg18 lastz on advice from Webb
BLASTZ_K=4500
BLASTZ_Y=15000
BLASTZ_T=2

# TARGET: Human Hg19
SEQ1_DIR=/scratch/data/hg19/hg19.2bit
SEQ1_LEN=/scratch/data/hg19/chrom.sizes
SEQ1_CHUNK=100000000
SEQ1_LAP=10000
SEQ1_IN_CONTIGS=0

# QUERY: Baboon papHam1
SEQ2_DIR=/scratch/data/papHam1/papHam1.2bit
SEQ2_LEN=/scratch/data/papHam1/chrom.sizes
SEQ2_CHUNK=20000000
SEQ2_LIMIT=300
SEQ2_LAP=0
SEQ2_IN_CONTIGS=0

BASE=/hive/data/genomes/hg19/bed/lastzPapHam1.2009-05-20
TMPDIR=/scratch/tmp
'_EOF_'
    # << happy emacs

    #	forgot that the synNet was not needed here, use recip best as below
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
	`pwd`/DEF \
	-syntenicNet \
	-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=medium \
	-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
	> do.log 2>&1 &
    cat fb.hg19.chainPapHam1Link.txt 
    #	2399269031 bases of 2897316137 (82.810%) in intersection

    time doRecipBest.pl -buildDir=`pwd` hg19 papHam1 > rbest.log 2>&1
    #	real    182m0.276s

#########################################################################
# SGP GENES (DONE - 2009-05-22 - Hiram)
    mkdir /hive/data/genomes/hg19/bed/sgpGene
    cd /hive/data/genomes/hg19/bed/sgpGene
    mkdir download
    cd download
for C in `cut -f1 ../../../chrom.sizes`
do
    echo $C
    wget --timestamping \
http://genome.crg.es/genepredictions/H.sapiens/golden_path_200902_x_mm9/SGP/${C}.gtf
    wget --timestamping \
http://genome.crg.es/genepredictions/H.sapiens/golden_path_200902_x_mm9/SGP/${C}.prot
done

    cd ..
    cat download/*.gtf | ldHgGene -gtf -genePredExt hg19 sgpGene stdin

    #	Read 33994 transcripts in 291782 lines in 1 files
    #	33994 groups 85 seqs 1 sources 3 feature types
    #	33994 gene predictions
    nice -n +19 featureBits -enrichment hg19 refGene:CDS sgpGene
# refGene:CDS 1.181%, sgpGene 1.295%, both 1.011%, cover 85.59%, enrich 66.08x

###########################################################################
# GENEID GENE PREDICTIONS (DONE - 2009-05-22 - Hiram)
    ssh hgwdev
    mkdir /hive/data/genomes/hg19/bed/geneid
    cd /hive/data/genomes/hg19/bed/geneid
    mkdir download
    cd download
    for C in `cut -f1 ../../../chrom.sizes`
    do
	echo $C
 wget --timestamping \
http://genome.crg.es/genepredictions/H.sapiens/golden_path_200902/geneid_v1.3/${C}.gtf
    wget --timestamping \
http://genome.crg.es/genepredictions/H.sapiens/golden_path_200902/geneid_v1.3/${C}.prot
    done

    cd ..
    cat download/*.gtf | ldHgGene -gtf -genePredExt hg19 geneid stdin
    #	Read 33428 transcripts in 277332 lines in 1 files
    #	33428 groups 92 seqs 1 sources 3 feature types
    #	33428 gene predictions

##########################################################################
## 4-Way Multiz for UCSC Genes construction (DONE - 2009-05-22 - Hiram)
    ssh hgwdev
    mkdir /hive/data/genomes/hg19/bed/multiz4way
    cd /hive/data/genomes/hg19/bed/multiz4way

    #	extract our 4 organisms from the 44-way on hg18:
    ln -s /hive/data/genomes/hg18/bed/multiz44way/44way.4d.nh ./44way.nh

    /cluster/bin/phast/tree_doctor \
	--prune-all-but hg18,mm9,canFam2,rheMac2 44way.nh \
	| sed -e "s/hg18/hg19/" > 4way.nh

    #	this looks like:
    cat 4way.nh
(((hg19:0.032973,rheMac2:0.036199):0.109706,mm9:0.352605):0.020666,canFam2:0.193569);


    #	Use this specification in the phyloGif tool:
    #	http://genome.ucsc.edu/cgi-bin/phyloGif
    #	to obtain a gif image for htdocs/images/phylo/hg19_4way.gif

    /cluster/bin/phast/all_dists 4way.nh > 4way.distances.txt
    #	Use this output to create the table below
    grep -y hg19 4way.distances.txt | sort -k3,3n
#
#	If you can fill in all the numbers in this table, you are ready for
#	the multiple alignment procedure
#
#                         featureBits chainLink measures
#                                        chainHg19Link   chain    linearGap
#    distance                      on hg19    on other   minScore
#  1  0.069172 - rhesus rheMac2 (% 82.744) (% xx.xxx)       5000     medium
#  2  0.356914 - dog canFam2    (% 52.879) (% xx.xxx)       3000     medium
#  3  0.495284 - mouse mm9      (% 35.299) (% 38.693)       3000     medium

    #	using the syntenic nets
    cd /cluster/data/hg19/bed/multiz4way
    mkdir mafLinks
    cd mafLinks
    mkdir rheMac2 canFam2 mm9

    cd mm9
    ln -s ../../../lastz.mm9/mafSynNet/*.maf.gz .
    cd ../canFam2
    ln -s ../../../lastz.canFam2/mafSynNet/*.maf.gz .
    cd ../rheMac2
    ln -s ../../../lastz.rheMac2/mafSynNet/*.maf.gz .

    #	determine what is the newest version of multiz and use that
    cd /hive/data/genomes/hg19/bed/multiz4way
    mkdir penn
    cp -p /cluster/bin/penn/multiz.v11.2007-03-19/multiz-tba/multiz penn
    cp -p /cluster/bin/penn/multiz.v11.2007-03-19/multiz-tba/maf_project penn
    cp -p /cluster/bin/penn/multiz.v11.2007-03-19/multiz-tba/autoMZ penn

    # the autoMultiz cluster run
    ssh swarm
    cd /hive/data/genomes/hg19/bed/multiz4way

    # create species list and stripped down tree for autoMZ
    sed 's/[a-z][a-z]*_//g; s/:[0-9\.][0-9\.]*//g; s/;//; /^ *$/d' \
	4way.nh > tmp.nh
    echo `cat tmp.nh` | sed 's/ //g; s/,/ /g' > tree.nh
    sed 's/[()]//g; s/,/ /g' tree.nh > species.lst

    mkdir run maf
    cd run

    #	NOTE: you need to set the db and multiz dirname properly in this script
    cat > autoMultiz << '_EOF_'
#!/bin/csh -ef
set db = hg19
set c = $1
set maf = $2
set binDir = /hive/data/genomes/hg19/bed/multiz4way/penn
set tmp = /scratch/tmp/$db/multiz.$c
set pairs = /hive/data/genomes/hg19/bed/multiz4way/mafLinks
rm -fr $tmp
mkdir -p $tmp
cp ../{tree.nh,species.lst} $tmp
pushd $tmp
foreach s (`cat species.lst`)
    set in = $pairs/$s/$c.maf
    set out = $db.$s.sing.maf
    if ($s == $db) then
	continue
    endif
    if (-e $in.gz) then
	zcat $in.gz > $out
    else if (-e $in) then
	cp $in $out
    else
	echo "##maf version=1 scoring=autoMZ" > $out
    endif
end
set path = ($binDir $path); rehash
$binDir/autoMZ + T=$tmp E=$db "`cat tree.nh`" $db.*.sing.maf $c.maf
popd
cp $tmp/$c.maf $maf
rm -fr $tmp
'_EOF_'
    # << happy emacs
    chmod +x autoMultiz

cat  << '_EOF_' > template
#LOOP
./autoMultiz $(root1) {check out line+ /hive/data/genomes/hg19/bed/multiz4way/maf/$(root1).maf}
#ENDLOOP
'_EOF_'
    # << happy emacs

    cut -f1 /cluster/data/hg19/chrom.sizes > chrom.lst
    gensub2 chrom.lst single template jobList
    para create jobList
    # 93 jobs
    para try ... check ... push ... etc ...
# Completed: 93 of 93 jobs
# CPU time in finished jobs:      24282s     404.70m     6.75h    0.28d  0.001 y
# IO & Wait Time:                  2362s      39.36m     0.66h    0.03d  0.000 y
# Average job time:                 286s       4.77m     0.08h    0.00d
# Longest finished job:            2235s      37.25m     0.62h    0.03d
# Submission to last job:          2241s      37.35m     0.62h    0.03d

    #	combine results into a single file for loading and gbdb reference
    cd /hive/data/genomes/hg19/bed/multiz4way
    time nice -n +19 catDir maf > multiz4way.maf
    #	real    3m27.561s

    #	makes a 8.5 Gb file:
    #	-rw-rw-r-- 1 9026080732 May 22 11:11 multiz4way.maf

    # Load into database
    ssh hgwdev
    cd /hive/data/genomes/hg19/bed/multiz4way
    mkdir /gbdb/hg19/multiz4way
    ln -s /hive/data/genomes/hg19/bed/multiz4way/multiz4way.maf \
	/gbdb/hg19/multiz4way
    #	the hgLoadMaf generates huge tmp files, locate them in /scratch/tmp/
    cd /scratch/tmp
    time nice -n +19 hgLoadMaf hg19 multiz4way
    #	real    5m31.883s
    #	Loaded 5788627 mafs in 1 files from /gbdb/hg19/multiz4way

    cd /hive/data/genomes/hg19/bed/multiz4way
    time nice -n +19 hgLoadMafSummary -minSize=10000 -mergeGap=500 \
	-maxSize=50000 hg19 multiz4waySummary multiz4way.maf
    #	Created 1238721 summary blocks from 11959676 components
    #	and 5788627 mafs from multiz4way.maf
    #	real    6m33.936s

#########################################################################
# LASTZ Medaka OryLat2 (DONE - 2009-05-22 - Hiram)
    mkdir /hive/data/genomes/hg19/bed/lastzOryLat2.2009-05-22
    cd /hive/data/genomes/hg19/bed/lastzOryLat2.2009-05-22

    cat << '_EOF_' > DEF
# Human vs. Medaka
# typical parameters for a genome that is distant from human
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_M=50
BLASTZ_Q=/scratch/data/blastz/HoxD55.q

# TARGET: Human Hg19
SEQ1_DIR=/scratch/data/hg19/hg19.2bit
SEQ1_LEN=/scratch/data/hg19/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
SEQ1_LIMIT=5

# QUERY: Medaka oryLat2 (40M chunks covers the largest chroms in one gulp)
SEQ2_DIR=/scratch/data/oryLat2/oryLat2.2bit
SEQ2_LEN=/hive/data/genomes/oryLat2/chrom.sizes
SEQ2_CHUNK=40000000
SEQ2_LIMIT=200
SEQ2_LAP=0

BASE=/hive/data/genomes/hg19/bed/lastzOryLat2.2009-05-22
TMPDIR=/scratch/tmp
'_EOF_'
    # << happy emacs

    #	establish a screen to control this job
    screen
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
	`pwd`/DEF \
	-qRepeats=windowmaskerSdust \
	-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \
	-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=pk \
	> do.log 2>&1 &
    #	real    124m5.298s
    cat fb.hg19.chainOryLat2Link.txt 
    #	53571737 bases of 2897316137 (1.849%) in intersection

    #	running the swap - DONE - 2009-06-02
    mkdir /hive/data/genomes/oryLat2/bed/blastz.hg19.swap
    cd /hive/data/genomes/oryLat2/bed/blastz.hg19.swap
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
	/hive/data/genomes/hg19/bed/lastzOryLat2.2009-05-22/DEF \
	-qRepeats=windowmaskerSdust \
	-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \
	-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=pk \
	-swap > swap.log 2>&1 &
    #	real    28m35.174s
    cat fb.oryLat2.chainHg19Link.txt 
    #	46961818 bases of 700386597 (6.705%) in intersection
##############################################################################
# LASTZ Opossum MonDom5 (DONE - 2009-05-23,29 - Hiram)
    mkdir /hive/data/genomes/hg19/bed/lastzMonDom5.2009-05-23
    cd /hive/data/genomes/hg19/bed/lastzMonDom5.2009-05-23

    cat << '_EOF_' > DEF
# human vs. opossum
# settings for more distant organism alignments
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_M=50
BLASTZ_Q=/scratch/data/blastz/HoxD55.q

# TARGET: Human Hg19
SEQ1_DIR=/scratch/data/hg19/hg19.2bit
SEQ1_LEN=/scratch/data/hg19/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
SEQ1_LIMIT=5

# QUERY: Opossum monDom5
SEQ2_DIR=/scratch/data/monDom5/monDom5.2bit
SEQ2_LEN=/hive/data/genomes/monDom5/chrom.sizes
SEQ2_CHUNK=30000000
SEQ2_LAP=0

BASE=/hive/data/genomes/hg19/bed/lastzMonDom5.2009-05-23
TMPDIR=/scratch/tmp
'_EOF_'
    # << happy emacs

    #	establish a screen to control this job
    screen
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
	`pwd`/DEF \
	-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \
	-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
	> do.log 2>&1 &
    #	One job took a long time to complete, had to run it manually on
    #	swarm:
# /cluster/bin/scripts/blastz-run-ucsc -outFormat psl \
#	/scratch/data/hg19/hg19.2bit:chr19:50000000-59128983 \
#	/scratch/data/monDom5/monDom5.2bit:chr4:390000000-420000000 \
#	../DEF \
#	../psl/hg19.2bit:chr19:50000000-59128983/hg19.2bit:chr19:50000000-59128983_monDom5.2bit:chr4:390000000-420000000.psl
    #	took about 48 hours, continuing:
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
	`pwd`/DEF \
	-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \
	-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
	-continue=cat > cat.log 2>&1 &
    #	real    1508m18.471s ==	about 25h08m
    cat fb.hg19.chainMonDom5Link.txt 
    #	415997117 bases of 2897316137 (14.358%) in intersection
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
	`pwd`/DEF \
	-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \
	-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
	-continue=syntenicNet -syntenicNet > syntenicNet.log 2>&1 &
    #	real    20m29.049s

    mkdir /hive/data/genomes/monDom5/bed/blastz.hg19.swap
    cd /hive/data/genomes/monDom5/bed/blastz.hg19.swap
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
	/hive/data/genomes/hg19/bed/lastzMonDom5.2009-05-23/DEF \
	-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \
	-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
	-swap -syntenicNet > swap.log 2>&1 &
    #	real    297m13.041s
    cat fb.monDom5.chainHg19Link.txt 
    #	406727849 bases of 3501660299 (11.615%) in intersection

##############################################################################
# LASTZ Armadillo DasNov2 (DONE - 2009-05-23,28 - Hiram)
    mkdir /hive/data/genomes/hg19/bed/lastzDasNov2.2009-05-23
    cd /hive/data/genomes/hg19/bed/lastzDasNov2.2009-05-23

    cat << '_EOF_' > DEF
# Human vs. Armadillo
BLASTZ_M=50

# TARGET: Human Hg19
SEQ1_DIR=/scratch/data/hg19/hg19.2bit
SEQ1_LEN=/scratch/data/hg19/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
SEQ1_LIMIT=5

# QUERY: Armadillo
SEQ2_DIR=/scratch/data/dasNov2/dasNov2.2bit
SEQ2_LEN=/scratch/data/dasNov2/chrom.sizes
SEQ2_CHUNK=20000000
SEQ2_LIMIT=100
SEQ2_LAP=0

BASE=/hive/data/genomes/hg19/bed/lastzDasNov2.2009-05-23
TMPDIR=/scratch/tmp
'_EOF_'
    # << happy emacs

    #	establish a screen to control this job
    screen
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
	`pwd`/DEF \
	-noLoadChainSplit -chainMinScore=3000 -chainLinearGap=medium \
	-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=pk \
	> do.log 2>&1 &
    #	finished the lastz run manually after hive maintenance outages
    #	then, continuing:
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
	`pwd`/DEF \
	-noLoadChainSplit -chainMinScore=3000 -chainLinearGap=medium \
	-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=pk \
	-continue=cat > cat.log 2>&1 &
    #	real    458m11.304s
    cat fb.hg19.chainDasNov2Link.txt 
    #	971847303 bases of 2897316137 (33.543%) in intersection
    time nice -n +19 doRecipBest.pl -buildDir=`pwd` hg19 dasNov2 \
	> rbest.log 2>&1
    #	time about 6h30m

##############################################################################
# LASTZ Rock Hyrax ProCap1 (DONE - 2009-05-23,26 - Hiram)
    mkdir /hive/data/genomes/hg19/bed/lastzProCap1.2009-05-23
    cd /hive/data/genomes/hg19/bed/lastzProCap1.2009-05-23

    cat << '_EOF_' > DEF
# Human vs. Rock Hyrax
BLASTZ_M=50

# TARGET: Human Hg19
SEQ1_DIR=/scratch/data/hg19/hg19.2bit
SEQ1_LEN=/scratch/data/hg19/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
SEQ1_LIMIT=5

# QUERY: Rock Hyrax
SEQ2_DIR=/scratch/data/proCap1/proCap1.2bit
SEQ2_LEN=/scratch/data/proCap1/chrom.sizes
SEQ2_CHUNK=20000000
SEQ2_LIMIT=100
SEQ2_LAP=0

BASE=/hive/data/genomes/hg19/bed/lastzProCap1.2009-05-23
TMPDIR=/scratch/tmp
'_EOF_'
    # << happy emacs

    #	establish a screen to control this job
    screen
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
	`pwd`/DEF \
	-noLoadChainSplit -chainMinScore=3000 -chainLinearGap=medium \
	-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
	> do.log 2>&1 &
# Completed: 997438 of 997438 jobs
# CPU time in finished jobs:   32830587s  547176.45m  9119.61h  379.98d  1.041 y
# IO & Wait Time:               9549484s  159158.07m  2652.63h  110.53d  0.303 y
# Average job time:                  42s       0.71m     0.01h    0.00d
# Longest finished job:            1953s      32.55m     0.54h    0.02d
# Submission to last job:         67216s    1120.27m    18.67h    0.78d
    #	finished lastz run manually, then continuing:
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
	`pwd`/DEF \
	-noLoadChainSplit -chainMinScore=3000 -chainLinearGap=medium \
	-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
	-continue=cat > cat.log 2>&1 &
    #	real    369m1.678s
    cat fb.hg19.chainProCap1Link.txt 
    #	894221652 bases of 2897316137 (30.864%) in intersection
    time nice -n +19 doRecipBest.pl -buildDir=`pwd` hg19 proCap1 \
	> rbest.log 2>&1
    #	real    251m59.549s

##############################################################################
# LASTZ Zebra Finch TaeGut1 (DONE - 2009-05-26 - Hiram)
    mkdir /hive/data/genomes/hg19/bed/lastzTaeGut1.2009-05-26
    cd /hive/data/genomes/hg19/bed/lastzTaeGut1.2009-05-26

    cat << '_EOF_' > DEF
# human vs Zebra Finch
# distant from Human settings
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=10000
BLASTZ_K=2200
BLASTZ_Q=/scratch/data/blastz/HoxD55.q

# TARGET: Human hg19
SEQ1_DIR=/scratch/data/hg19/nib
SEQ1_LEN=/scratch/data/hg19/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000

# QUERY: Zebra Finch taeGut1 - single chunk big enough to run entire chrom
SEQ2_DIR=/scratch/data/taeGut1/taeGut1.2bit
SEQ2_LEN=/scratch/data/taeGut1/chrom.sizes
SEQ2_CTGDIR=/hive/data/genomes/taeGut1/taeGut1.blastz.2bit
SEQ2_CTGLEN=/hive/data/genomes/taeGut1/taeGut1.blastz.sizes
SEQ2_LIFT=/hive/data/genomes/taeGut1/jkStuff/liftAll.lft
SEQ2_CHUNK=20000000
SEQ2_LAP=0
SEQ2_LIMIT=100

BASE=/hive/data/genomes/hg19/bed/lastzTaeGut1.2009-05-26
TMPDIR=/scratch/tmp
'_EOF_'
    # << happy emacs

    #	establish a screen to control this job
    screen
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
	`pwd`/DEF \
	-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \
	-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
	-qRepeats=windowmaskerSdust > do.log 2>&1 &
    cat fb.hg19.chainTaeGut1Link.txt 
    #	real    192m48.479s
    #	101295490 bases of 2897316137 (3.496%) in intersection
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
	`pwd`/DEF \
	-syntenicNet -noLoadChainSplit -chainMinScore=5000 \
	-chainLinearGap=loose \
	-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
	-continue=syntenicNet -qRepeats=windowmaskerSdust > synNet.log 2>&1 &
    #	real    4m10.261s

    #	running the swap - DONE - 2009-06-02
    mkdir /hive/data/genomes/taeGut1/bed/blastz.hg19.swap
    cd /hive/data/genomes/taeGut1/bed/blastz.hg19.swap
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
	/hive/data/genomes/hg19/bed/lastzTaeGut1.2009-05-26/DEF \
	-swap -noLoadChainSplit -chainMinScore=5000 \
	-chainLinearGap=loose \
	-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
	-qRepeats=windowmaskerSdust > swap.log 2>&1 &
    #	real    real    16m45.080s
    cat fb.taeGut1.chainHg19Link.txt 
    #	95320369 bases of 1222864691 (7.795%) in intersection

##############################################################################
# LASTZ Lizard AnoCar1 (DONE - 2009-05-30,31 - Hiram)
    mkdir /hive/data/genomes/hg19/bed/lastzAnoCar1.2009-05-30
    cd /hive/data/genomes/hg19/bed/lastzAnoCar1.2009-05-30

    cat << '_EOF_' > DEF
# human vs lizard
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_Q=/scratch/data/blastz/HoxD55.q

# TARGET: Human hg19
SEQ1_DIR=/scratch/data/hg19/nib
SEQ1_LEN=/scratch/data/hg19/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000

# QUERY: Lizard anoCar1
SEQ2_DIR=/scratch/data/anoCar1/anoCar1.2bit
SEQ2_LEN=/scratch/data/anoCar1/chrom.sizes
SEQ2_CHUNK=20000000
SEQ2_LAP=0
SEQ2_LIMIT=50

BASE=/hive/data/genomes/hg19/bed/lastzAnoCar1.2009-05-30
TMPDIR=/scratch/tmp
'_EOF_'
    # << happy emacs

    #	establish a screen to control this job
    screen
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
	`pwd`/DEF \
	-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \
	-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
	-qRepeats=windowmaskerSdust > do.log 2>&1 &
    #	real    168m32.016s
    cat fb.hg19.chainAnoCar1Link.txt 
    #	104045950 bases of 2897316137 (3.591%) in intersection
    time doRecipBest.pl -buildDir=`pwd` hg19 anoCar1 > rbest.log 2>&1
    #	real    45m58.001s

    #	running syntenic Net 2009-08-27 - Hiram
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
	`pwd`/DEF \
	-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \
	-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
	-continue=syntenicNet -syntenicNet \
	-qRepeats=windowmaskerSdust > syntenicNet.log 2>&1 &
    #	real    6m13.304s

    #	running the swap - DONE - 2009-06-02
    mkdir /hive/data/genomes/anoCar1/bed/blastz.hg19.swap
    cd /hive/data/genomes/anoCar1/bed/blastz.hg19.swap
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
	/hive/data/genomes/hg19/bed/lastzAnoCar1.2009-05-30/DEF \
	-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \
	-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
	-swap -qRepeats=windowmaskerSdust > swap.log 2>&1 &
    #	real    34m55.857s
    cat fb.anoCar1.chainHg19Link.txt 
    #	89608316 bases of 1741478929 (5.146%) in intersection
##############################################################################
# LASTZ X. tropicalis XenTro2 (DONE - 2009-05-26 - Hiram)
    mkdir /hive/data/genomes/hg19/bed/lastzXenTro2.2009-05-26
    cd /hive/data/genomes/hg19/bed/lastzXenTro2.2009-05-26

    cat << '_EOF_' > DEF
# human vs X. tropicalis
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_Q=/scratch/data/blastz/HoxD55.q

# TARGET: Human hg19
SEQ1_DIR=/scratch/data/hg19/nib
SEQ1_LEN=/scratch/data/hg19/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000

# QUERY: Lizard anoCar1
SEQ2_DIR=/scratch/data/xenTro2/xenTro2.2bit
SEQ2_LEN=/scratch/data/xenTro2/chrom.sizes
SEQ2_CHUNK=20000000
SEQ2_LAP=0
SEQ2_LIMIT=100

BASE=/hive/data/genomes/hg19/bed/lastzXenTro2.2009-05-26
TMPDIR=/scratch/tmp
'_EOF_'
    # << happy emacs

    #	establish a screen to control this job
    screen
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
	`pwd`/DEF \
	-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \
	-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
	> do.log 2>&1 &
    #	real    1129m11.568s
    #	finished the lastz run manually after hive difficulties, continuing:
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
	`pwd`/DEF \
	-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \
	-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
	-continue=cat > cat.log 2>&1 &
    #	time about 1h30m
    cat fb.hg19.chainXenTro2Link.txt 
    #	92015242 bases of 2897316137 (3.176%) in intersection

    #	running the swap - DONE - 2009-06-02
    mkdir /hive/data/genomes/xenTro2/bed/blastz.hg19.swap
    cd /hive/data/genomes/xenTro2/bed/blastz.hg19.swap
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
	/hive/data/genomes/hg19/bed/lastzXenTro2.2009-05-26/DEF \
	-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \
	-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
	-swap > swap.log 2>&1 &
    #	real    130m53.860s
    cat fb.xenTro2.chainHg19Link.txt 
    #	92070065 bases of 1359412157 (6.773%) in intersection

##############################################################################
# LASTZ Zebrafish DanRer5 (DONE - 2009-05-26 - Hiram)
    mkdir /hive/data/genomes/hg19/bed/lastzDanRer5.2009-05-26
    cd /hive/data/genomes/hg19/bed/lastzDanRer5.2009-05-26

    cat << '_EOF_' > DEF
# human vs X. zebrafish
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_Q=/scratch/data/blastz/HoxD55.q

# TARGET: Human hg19
SEQ1_DIR=/scratch/data/hg19/nib
SEQ1_LEN=/scratch/data/hg19/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000

# QUERY: Zebrafish danRer5
SEQ2_DIR=/scratch/data/danRer5/danRer5.2bit
SEQ2_LEN=/scratch/data/danRer5/chrom.sizes
SEQ2_CHUNK=20000000
SEQ2_LAP=0
SEQ2_LIMIT=40

BASE=/hive/data/genomes/hg19/bed/lastzDanRer5.2009-05-26
TMPDIR=/scratch/tmp
'_EOF_'
    # << happy emacs

    #	establish a screen to control this job
    screen
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
	`pwd`/DEF \
	-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \
	-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
	> do.log 2>&1 &
    #	real    311m39.817s
    cat fb.hg19.chainDanRer5Link.txt 
    #	74229561 bases of 2897316137 (2.562%) in intersection

    #	running the swap - DONE - 2009-06-02
    mkdir /hive/data/genomes/danRer5/bed/blastz.hg19.swap
    cd /hive/data/genomes/danRer5/bed/blastz.hg19.swap
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
	/hive/data/genomes/hg19/bed/lastzDanRer5.2009-05-26/DEF \
	-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \
	-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
	-swap > swap.log 2>&1 &
    #	real    26m54.605s
    cat fb.danRer5.chainHg19Link.txt 
    #	73852780 bases of 1435609608 (5.144%) in intersection

##############################################################################
# LASTZ Platypus OrnAna1 (DONE - 2009-05-26 - Hiram)
    mkdir /hive/data/genomes/hg19/bed/lastzOrnAna1.2009-05-26
    cd /hive/data/genomes/hg19/bed/lastzOrnAna1.2009-05-26

    cat << '_EOF_' > DEF
# human vs platypus
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_Q=/scratch/data/blastz/HoxD55.q

# TARGET: Human hg19
SEQ1_DIR=/scratch/data/hg19/nib
SEQ1_LEN=/scratch/data/hg19/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000

# QUERY: Platypus ornAna1
SEQ2_DIR=/scratch/data/ornAna1/ornAna1.2bit
SEQ2_LEN=/scratch/data/ornAna1/chrom.sizes
SEQ2_CHUNK=40000000
SEQ2_LIMIT=400
SEQ2_LAP=0

BASE=/hive/data/genomes/hg19/bed/lastzOrnAna1.2009-05-26
TMPDIR=/scratch/tmp
'_EOF_'
    # << happy emacs

    #	establish a screen to control this job
    screen
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
	`pwd`/DEF \
	-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \
	-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
	> do.log 2>&1 &
    #	real    572m18.808s
    cat fb.hg19.chainOrnAna1Link.txt 
    #	220977689 bases of 2897316137 (7.627%) in intersection
    time doRecipBest.pl -buildDir=`pwd` hg19 ornAna1 > rbest.log 2>&1
    #	time about 1h32m

    #	running the swap - DONE - 2009-06-02
    mkdir /hive/data/genomes/ornAna1/bed/blastz.hg19.swap
    cd /hive/data/genomes/ornAna1/bed/blastz.hg19.swap
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
	/hive/data/genomes/hg19/bed/lastzOrnAna1.2009-05-26/DEF \
	-swap -noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \
	-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
	> swap.log 2>&1 &
    #	real    146m52.638s
    cat fb.ornAna1.chainHg19Link.txt 
    #	207415519 bases of 1842236818 (11.259%) in intersection

##############################################################################
# LASTZ Elephant LoxAfr2 (DONE - 2009-05-27,29 - Hiram)
    mkdir /hive/data/genomes/hg19/bed/lastzLoxAfr2.2009-05-27
    cd /hive/data/genomes/hg19/bed/lastzLoxAfr2.2009-05-27

    cat << '_EOF_' > DEF
# Human vs. Elephant
BLASTZ_M=50

# TARGET: Human Hg19
SEQ1_DIR=/scratch/data/hg19/nib
SEQ1_LEN=/scratch/data/hg19/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000

# QUERY: Elephant
SEQ2_DIR=/scratch/data/loxAfr2/loxAfr2.2bit
SEQ2_LEN=/scratch/data/loxAfr2/chrom.sizes
SEQ2_CHUNK=20000000
SEQ2_LIMIT=300
SEQ2_LAP=0

BASE=/hive/data/genomes/hg19/bed/lastzLoxAfr2.2009-05-27
TMPDIR=/scratch/tmp
'_EOF_'
    # << happy emacs

    #	establish a screen to control this job
    screen
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
	`pwd`/DEF \
	-noLoadChainSplit -chainMinScore=3000 -chainLinearGap=medium \
	-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
	> do.log 2>&1 &
    # time about 3h23m
    cat fb.hg19.chainLoxAfr2Link.txt 
    #	1018502258 bases of 2897316137 (35.153%) in intersection

    time doRecipBest.pl -buildDir=`pwd` hg19 loxAfr2 > rbest.log 2>&1
    #	real    322m37.502s

##############################################################################
# LASTZ Tenrec EchTel1 (DONE - 2009-05-27 - Hiram)
    mkdir /hive/data/genomes/hg19/bed/lastzEchTel1.2009-05-27
    cd /hive/data/genomes/hg19/bed/lastzEchTel1.2009-05-27

    cat << '_EOF_' > DEF
# Human vs. Tenrec
BLASTZ_M=50

# TARGET: Human Hg19
SEQ1_DIR=/scratch/data/hg19/nib
SEQ1_LEN=/scratch/data/hg19/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000

# QUERY: Tenrec
SEQ2_DIR=/scratch/data/echTel1/echTel1.2bit
SEQ2_LEN=/scratch/data/echTel1/chrom.sizes
SEQ2_CHUNK=30000000
SEQ2_LIMIT=400
SEQ2_LAP=0

BASE=/hive/data/genomes/hg19/bed/lastzEchTel1.2009-05-27
TMPDIR=/scratch/tmp
'_EOF_'
    # << happy emacs

    #	establish a screen to control this job
    screen
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
	`pwd`/DEF \
	-noLoadChainSplit -chainMinScore=3000 -chainLinearGap=medium \
	-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
	> do.log 2>&1 &
    #	real    1153m34.595s
    cat fb.hg19.chainEchTel1Link.txt 
    #	669856841 bases of 2897316137 (23.120%) in intersection
    time doRecipBest.pl -buildDir=`pwd` hg19 echTel1 > rbest.log 2>&1
    # time about 7h13m

##############################################################################
# LASTZ Tree Shrew TupBel1 (DONE - 2009-05-27,06-02 - Hiram)
    mkdir /hive/data/genomes/hg19/bed/lastzTupBel1.2009-05-27
    cd /hive/data/genomes/hg19/bed/lastzTupBel1.2009-05-27

    cat << '_EOF_' > DEF
# Human vs. Tree Shrew
BLASTZ_M=50

# TARGET: Human Hg19
SEQ1_DIR=/scratch/data/hg19/nib
SEQ1_LEN=/scratch/data/hg19/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000

# QUERY: Tree Shrew
SEQ2_DIR=/scratch/data/tupBel1/tupBel1.2bit
SEQ2_LEN=/scratch/data/tupBel1/chrom.sizes
SEQ2_CHUNK=30000000
SEQ2_LIMIT=400
SEQ2_LAP=0

BASE=/hive/data/genomes/hg19/bed/lastzTupBel1.2009-05-27
TMPDIR=/scratch/tmp
'_EOF_'
    # << happy emacs

    #	establish a screen to control this job
    screen
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
	`pwd`/DEF \
	-noLoadChainSplit -chainMinScore=3000 -chainLinearGap=medium \
	-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=pk \
	> do.log 2>&1 &
    #	real 811m54.095s
    #	having trouble with pk, finished manually
    #	XXX there is one job that is taking forever ...
    #	finished it in pieces on swarm in a few minutes, like this:
    mkdir /hive/data/genomes/hg19/bed/lastzTupBel1.2009-05-27/run.blastz/lastJob
    cd /hive/data/genomes/hg19/bed/lastzTupBel1.2009-05-27/run.blastz/lastJob
#!/bin/sh

S=100000000
E=101010000
export S E
for I in 0 1 2 3 4 5 6 7 8 9
do
echo $S $E
/usr/bin/time -p /cluster/bin/scripts/blastz-run-ucsc -outFormat psl \
/scratch/data/hg19/nib/chr1.nib:chr1:${S}-${E} ../qParts/part019.lst \
../../DEF psl/chr1.nib:chr1:${S}-${E}_part019.lst.psl
nextS=`echo $S | awk '{printf "%d", $1 + 1000000}'`
nextE=`echo $E | awk '{printf "%d", $1 + 1000000}'`
S=$nextS
E=$nextE
done

    grep -h "^#" psl/chr* | sort -u > result.psl
    grep -h -v "^#" psl/chr* | sort -k14,14 -k16,16n >> result.psl
    cp -p result.psl \
../../psl/chr1.nib:chr1:100000000-110010000/chr1.nib:chr1:100000000-110010000_part019.lst.psl

    #	then, continuing:
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
	`pwd`/DEF \
	-noLoadChainSplit -chainMinScore=3000 -chainLinearGap=medium \
	-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=pk \
	-continue=cat > cat.log 2>&1 &
    #	real    212m22.707s
    time doRecipBest.pl -buildDir=`pwd` hg19 tupBel1 > rbest.log 2>&1
    #	time about 4h22m

##############################################################################
# LASTZ Shrew SorAra1 (DONE - 2009-05-28,30 - Hiram)
    mkdir /hive/data/genomes/hg19/bed/lastzSorAra1.2009-05-28
    cd /hive/data/genomes/hg19/bed/lastzSorAra1.2009-05-28

    cat << '_EOF_' > DEF
# Human vs. Shrew
BLASTZ_M=50

# TARGET: Human Hg19
SEQ1_DIR=/scratch/data/hg19/nib
SEQ1_LEN=/scratch/data/hg19/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000

# QUERY: Shrew
SEQ2_DIR=/scratch/data/sorAra1/sorAra1.2bit
SEQ2_LEN=/scratch/data/sorAra1/chrom.sizes
SEQ2_CHUNK=30000000
SEQ2_LIMIT=400
SEQ2_LAP=0

BASE=/hive/data/genomes/hg19/bed/lastzSorAra1.2009-05-28
TMPDIR=/scratch/tmp
'_EOF_'
    # << happy emacs

    #	establish a screen to control this job
    screen
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
	`pwd`/DEF \
	-noLoadChainSplit -chainMinScore=3000 -chainLinearGap=medium \
	-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
	> do.log 2>&1 &
    #	time about 23h26m
    cat fb.hg19.chainSorAra1Link.txt 
    #	572519288 bases of 2897316137 (19.760%) in intersection

    time doRecipBest.pl -buildDir=`pwd` hg19 sorAra1 > rbest.log 2>&1
    #	real    251m20.055s

##############################################################################
# LASTZ Rabbit OryCun1 (DONE - 2009-05-28,30 - Hiram)
    mkdir /hive/data/genomes/hg19/bed/lastzOryCun1.2009-05-28
    cd /hive/data/genomes/hg19/bed/lastzOryCun1.2009-05-28

    cat << '_EOF_' > DEF
# Human vs. Rabbit
BLASTZ_M=50

# TARGET: Human Hg19
SEQ1_DIR=/scratch/data/hg19/nib
SEQ1_LEN=/scratch/data/hg19/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000

# QUERY: Rabbit
SEQ2_DIR=/scratch/data/oryCun1/oryCun1.2bit
SEQ2_LEN=/scratch/data/oryCun1/chrom.sizes
SEQ2_CHUNK=30000000
SEQ2_LIMIT=400
SEQ2_LAP=0

BASE=/hive/data/genomes/hg19/bed/lastzOryCun1.2009-05-28
TMPDIR=/scratch/tmp
'_EOF_'
    # << happy emacs

    #	establish a screen to control this job
    screen
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
	`pwd`/DEF \
	-noLoadChainSplit -chainMinScore=3000 -chainLinearGap=medium \
	-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
	> do.log 2>&1 &
    #	time about 23h09m
    cat fb.hg19.chainOryCun1Link.txt 
    #	975693323 bases of 2897316137 (33.676%) in intersection

    time doRecipBest.pl -buildDir=`pwd` hg19 oryCun1 > rbest.log 2>&1
    #	real    318m1.142s

##############################################################################
# LASTZ Hedgehog EriEur1 (DONE - 2009-05-28,30 - Hiram)
    mkdir /hive/data/genomes/hg19/bed/lastzEriEur1.2009-05-28
    cd /hive/data/genomes/hg19/bed/lastzEriEur1.2009-05-28

    cat << '_EOF_' > DEF
# Human vs. Hedgehog
BLASTZ_M=50

# TARGET: Human Hg19
SEQ1_DIR=/scratch/data/hg19/nib
SEQ1_LEN=/scratch/data/hg19/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000

# QUERY: Hedgehog
SEQ2_DIR=/scratch/data/eriEur1/eriEur1.2bit
SEQ2_LEN=/scratch/data/eriEur1/chrom.sizes
SEQ2_CHUNK=40000000
SEQ2_LIMIT=500
SEQ2_LAP=0

BASE=/hive/data/genomes/hg19/bed/lastzEriEur1.2009-05-28
TMPDIR=/scratch/tmp
'_EOF_'
    # << happy emacs

    #	establish a screen to control this job
    screen
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
	`pwd`/DEF \
	-noLoadChainSplit -chainMinScore=3000 -chainLinearGap=medium \
	-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=pk \
	> do.log 2>&1 &
    #	real    2043m33.198s
    cat fb.hg19.chainEriEur1Link.txt 
    #	560965051 bases of 2897316137 (19.362%) in intersection
    time doRecipBest.pl -buildDir=`pwd` hg19 eriEur1 > rbest.log 2>&1
    #	real    350m17.737s

##############################################################################
# LASTZ Pika OchPri2 (DONE - 2009-05-29,30 - Hiram)
    mkdir /hive/data/genomes/hg19/bed/lastzOchPri2.2009-05-29
    cd /hive/data/genomes/hg19/bed/lastzOchPri2.2009-05-29

    cat << '_EOF_' > DEF
# Human vs. Pika
BLASTZ_M=50

# TARGET: Human Hg19
SEQ1_DIR=/scratch/data/hg19/nib
SEQ1_LEN=/scratch/data/hg19/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000

# QUERY: Pika
SEQ2_DIR=/scratch/data/ochPri2/ochPri2.2bit
SEQ2_LEN=/scratch/data/ochPri2/chrom.sizes
SEQ2_CHUNK=40000000
SEQ2_LIMIT=400
SEQ2_LAP=0

BASE=/hive/data/genomes/hg19/bed/lastzOchPri2.2009-05-29
TMPDIR=/scratch/tmp
'_EOF_'
    # << happy emacs

    #	establish a screen to control this job
    screen
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
	`pwd`/DEF \
	-noLoadChainSplit -chainMinScore=3000 -chainLinearGap=medium \
	-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
	> do.log 2>&1 &
    #	real    393m42.569s
    cat fb.hg19.chainOchPri2Link.txt 
    #	804516397 bases of 2897316137 (27.768%) in intersection
    time doRecipBest.pl -buildDir=`pwd` hg19 ochPri2 > rbest.log 2>&1
    #	real    224m47.979s

##############################################################################
# LASTZ Kangaroo Rat DipOrd1 (DONE - 2009-05-29,30 - Hiram)
    mkdir /hive/data/genomes/hg19/bed/lastzDipOrd1.2009-05-29
    cd /hive/data/genomes/hg19/bed/lastzDipOrd1.2009-05-29

    cat << '_EOF_' > DEF
# Human vs. Kangaroo Rat
BLASTZ_M=50

# TARGET: Human Hg19
SEQ1_DIR=/scratch/data/hg19/nib
SEQ1_LEN=/scratch/data/hg19/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000

# QUERY: Kangaroo Rat
SEQ2_DIR=/scratch/data/dipOrd1/dipOrd1.2bit
SEQ2_LEN=/scratch/data/dipOrd1/chrom.sizes
SEQ2_CHUNK=30000000
SEQ2_LIMIT=300
SEQ2_LAP=0

BASE=/hive/data/genomes/hg19/bed/lastzDipOrd1.2009-05-29
TMPDIR=/scratch/tmp
'_EOF_'
    # << happy emacs

    #	establish a screen to control this job
    screen
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
	`pwd`/DEF \
	-noLoadChainSplit -chainMinScore=3000 -chainLinearGap=medium \
	-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
	> do.log 2>&1 &
    #	real    688m47.595s
    time doRecipBest.pl -buildDir=`pwd` hg19 dipOrd1 > rbest.log 2>&1
    #	real    140m42.014s

##############################################################################
# LIFTOVER TO Hg18 (DONE - 2009-06-04 - Hiram )
    mkdir /hive/data/genomes/hg19/bed/blat.hg18.2009-06-04
    cd /hive/data/genomes/hg19/bed/blat.hg18.2009-06-04
    # -debug run to create run dir, preview scripts...
    #	verifies files can be found
    doSameSpeciesLiftOver.pl -debug hg19 hg18
    # Real run:
    time nice -n +19 doSameSpeciesLiftOver.pl -verbose=2 \
	-bigClusterHub=swarm -dbHost=hgwdev -workhorse=hgwdev \
	 hg19 hg18 > do.log 2>&1
    #	real    115m26.071s

#############################################################################
# BLASTZ/CHAIN/NET/ETC 11 GENOMES TO HG19 (DONE, Andy 2009-06-06)
ssh hgwdev
cd /hive/data/genomes/hg19/bed
mkdir lastz{SpeTri1,FelCat3,CavPor3,BosTau4,PteVam1,EquCab2,VicPac1,MyoLuc1,TurTru1,ChoHof1}.2009-06-04
ln -s lastzSpeTri1.2009-06-04 lastz.speTri1
ln -s lastzFelCat3.2009-06-04 lastz.felCat3
ln -s lastzCavPor3.2009-06-04 lastz.cavPor3
ln -s lastzBosTau4.2009-06-04 lastz.bosTau4
ln -s lastzPteVam1.2009-06-04 lastz.pteVam1
ln -s lastzEquCab2.2009-06-04 lastz.equCab2
ln -s lastzVicPac1.2009-06-04 lastz.vicPac1
ln -s lastzMyoLuc1.2009-06-04 lastz.myoLuc1
ln -s lastzTurTru1.2009-06-04 lastz.turTru1
ln -s lastzChoHof1.2009-06-04 lastz.choHof1
cat > lastz.speTri1/DEF << 'EOF'
# human vs squirrel

# TARGET: human hg19
SEQ1_DIR=/scratch/data/hg19/nib
SEQ1_LEN=/scratch/data/hg19/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000

# QUERY: squirrel speTri1 
SEQ2_DIR=/hive/data/genomes/speTri1/speTri1.2bit
SEQ2_LEN=/hive/data/genomes/speTri1/chrom.sizes
SEQ2_LIMIT=100
SEQ2_CHUNK=10000000
SEQ2_LAP=0

BASE=/hive/data/genomes/hg19/bed/lastz.speTri1
TMPDIR=/scratch/tmp
EOF

sed 's/speTri1/felCat3/g; s/squirrel/cat/;' lastz.speTr1/DEF | \
   sed 's/SEQ1_CHUNK=1/SEQ1_CHUNK=2/; s/SEQ2_LIMIT=1/SEQ2_LIMIT=3/' \
   > lastz.felCat3/DEF
sed 's/speTri1/cavPor3/g; s/squirrel/guinea pig/;' lastz.speTr1/DEF | \
   sed 's/SEQ1_CHUNK=1/SEQ1_CHUNK=2/' | \
   sed 's/hive\/data\/genomes\/cavPor3/scratch\/data\/cavPor3/' \
   > lastz.cavPor3/DEF
sed 's/speTri1/bosTau4/g; s/squirrel/cow/;' lastz.speTr1/DEF | \
   sed 's/SEQ1_CHUNK=1/SEQ1_CHUNK=2/; s/SEQ2_LIMIT=1/SEQ2_LIMIT=3/' \
   > lastz.bosTau4/DEF
sed 's/speTri1/pteVam1/g; s/squirrel/megabat/;' lastz.speTr1/DEF | \
   sed 's/SEQ1_CHUNK=1/SEQ1_CHUNK=2/; s/SEQ2_LIMIT=1/SEQ2_LIMIT=2/' \
   > lastz.pteVam1/DEF
sed 's/cavPor3/equCab2/g; s/guinea pig/horse/' lastz.cavPor3/DEF | \ 
   sed 's/SEQ2_LIMIT=1/SEQ2_LIMIT=3/' > lastz.equCab2/DEF
sed 's/equCab2/vicPac1/g; s/horse/alpaca/' lastz.equCab2/DEF > lastz.vicPac1/DEF
sed 's/pteVam1/myoLuc1/g; s/megabat/microbat/' lastz.pteVam1/DEF | \
   sed 's/SEQ2_LIMIT=3/SEQ2_LIMIT=2/' > lastz.myoLuc1/DEF
sed 's/equCab2/turTru1/g; s/horse/dolphin/' lastz.equCab2/DEF | \
   sed 's/SEQ2_LIMIT=3/SEQ2_LIMIT=2/' > lastz.turTru1/DEF
sed 's/equCab2/choHof11/g; s/horse/sloth/' lastz.equCab2/DEF > lastz.choHof1/DEF

cd andy/
for db in speTri1 felCat3 cavPor3 bosTau4 pteVam1 equCab2 vicPac1 myoLuc1 turTru1 choHof1; do
    ln -s ../lastz.${db}/DEF ${db}.DEF
done

screen -S speTri1
time nice -n +19 doBlastzChainNet.pl -verbose=2 -noLoadChainSplit \
  -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
  -chainMinScore=3000 -chainLinearGap=medium speTri1.DEF >& speTri1.do.log
# [detach screen]
#real    2059m30.699s

screen -S felCat3
time nice -n +19 doBlastzChainNet.pl -verbose=2 -noLoadChainSplit \
  -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
  -chainMinScore=3000 -chainLinearGap=medium felCat3.DEF >& felCat3.do.log
# [detach screen]
#real    1574m47.522s

screen -S bosTau4
time nice -n +19 doBlastzChainNet.pl -verbose=2 -noLoadChainSplit \
  -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
  -chainMinScore=3000 -chainLinearGap=medium bosTau4.DEF >& bosTau4.do.log
# [detach screen]
#real    1474m54.655s

screen -S pteVam1
time nice -n +19 doBlastzChainNet.pl -verbose=2 -noLoadChainSplit \
  -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm
  -chainMinScore=3000 -chainLinearGap=medium pteVam1.DEF >& pteVam1.do.log
# [detach screen]
#real    1168m33.923s

screen -S equCab2
time nice -n +19 doBlastzChainNet.pl -verbose=2 -noLoadChainSplit \
  -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
  -chainMinScore=3000 -chainLinearGap=medium -syntenicNet equCab2.DEF >& equCab2.do.log
# [detach screen]
#real    1662m56.158s
# (included syntenic net)

screen -S vicPac1
time nice -n +19 doBlastzChainNet.pl -verbose=2 -noLoadChainSplit \
  -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
  -chainMinScore=3000 -chainLinearGap=medium vicPac1.DEF >& vicPac1.do.log
# [detach screen]
#real    1495m48.173s

screen -S turTru1
time nice -n +19 doBlastzChainNet.pl -verbose=2 -noLoadChainSplit \
  -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
  -chainMinScore=3000 -chainLinearGap=medium turTru1.DEF >& turTru1.do.log
# [detach screen]
#real    1079m17.234s

screen -S choHof1
time nice -n +19 doBlastzChainNet.pl -verbose=2 -noLoadChainSplit \
  -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=pk \
  -chainMinScore=3000 -chainLinearGap=medium choHof1.DEF >& choHof1.do.log
# [detach screen]
#real    1310m49.287s (script and cluster run stopped after halfway...
# pk was too slow... remaining jobs started on swarm)
time nice -n +19 doBlastzChainNet.pl -verbose=2 -noLoadChainSplit \
  -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
  -chainMinScore=3000 -chainLinearGap=medium -continue=cat \
  choHof1.DEF >& choHof1.doAfterBlastz.log
#real    257m32.701s

screen -S cavPor3
time nice -n +19 doBlastzChainNet.pl -verbose=2 -noLoadChainSplit \
  -workhorse=hgwdev -chainMinScore=3000 -chainLinearGap=medium \
  -smallClusterHub=memk -bigClusterHub=pk cavPor3.DEF >& cavPor3.do.log
# [detach screen]
#real    1370m5.258s
# TROUBLE!  got to the 'load' step and failed.  This one needs a special
# chain table and chainLink table to get loaded.
cd ../lastz.cavPor3/axtChain/
# figure out number of rows and average length
wc -l *.tab
#   27186468 chain.tab
#  240602108 link.tab
randomLines link.tab 10000000 stdout | awk '{print length($0)}' | sort | uniq -c
randomLines chain.tab 1000000 stdout | awk '{print length($0)}' | sort | uniq -c
# about 43 average length for the chainLink and 100 for the chain
sed "s/hgLoadChain.*/hgsqldump hg19 chainSpeTri1Link --no-data --skip-comments | sed \'s\/SpeTri1\/CavPor3\/; s\/TYPE=MyISAM\/ENGINE=MyISAM max_rows=241000000 avg_row_length=43 pack_keys=1 CHARSET=latin1\/\' | hgsql hg19 \n\
hgsqldump hg19 chainSpeTri1 --no-data --skip-comments | sed \'s\/SpeTri1\/CavPor3\/; s\/TYPE=MyISAM\/ENGINE=MyISAM max_rows=27200000 avg_row_length=100 pack_keys=1 CHARSET=latin1\/\' | hgsql hg19 \n\
hgsql hg19 -e \"load data local infile \'chain.tab\' into table chainCavPor3\"\n\
hgsql hg19 -e \"load data local infile \'link.tab\' into table chainCavPor3Link\"\n\
hgsql hg19 -e \"INSERT into history (ix, startId, endId, who, what, modTime, errata) VALUES(NULL,0,0,\'aamp\',\'Loaded 27186468 chains into cavPor3 chain table manually\', NOW(), NULL)\"\
/" loadUp.csh > manualLoadUp.csh
chmod +x manualLoadUp.csh
time nice -n +19 ./manualLoadUp.csh
# [detach screen]
#real    584m4.093s
cd ../../andy/
time nice -n +19 doBlastzChainNet.pl -verbose=2 -noLoadChainSplit \
  -workhorse=hgwdev -chainMinScore=3000 -chainLinearGap=medium \
  -smallClusterHub=memk -bigClusterHub=swarm -continue=download \
  cavPor3.DEF >& cavPor3.doAfterLoad.log
#real    5m45.122s

# syntenic nets 

screen -r bosTau4
time nice -n +19 doBlastzChainNet.pl -verbose=2 -noLoadChainSplit \
  -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
  -chainMinScore=3000 -chainLinearGap=medium -syntenicNet \
  -continue=syntenicNet bosTau4.DEF >& bosTau4.syn.log
#real    31m48.545s

# reciprocal best choHof1 and cavPor3
screen -r choHof1
time nice -n +19 doRecipBest.pl -buildDir=/hive/data/genomes/hg19/bed/lastz.choHof1 \
  -workhorse=hgwdev hg19 choHof1 >& choHof1.doRecip.log
#real    367m52.993s

screen -r cavPor3
time nice -n +19 doRecipBest.pl -buildDir=/hive/data/genomes/hg19/bed/lastz.cavPor3 \
  -workhorse=hgwdev hg19 cavPor3 >& cavPor3.doRecip.log
#real    123m3.795s

# reciprocal best small six genome memk run

screen -S recipRun
mkdir recipRun
cd recipRun/
cat > gsub << 'EOF'
#LOOP
./doRecip.sh $(path1) 
#ENDLOOP
'EOF'
cat > doRecip.sh << 'EOF'
#!/bin/csh -ef
set db = $1
/cluster/bin/scripts/doRecipBest.pl -workhorse=`uname -n` -stop=recipBest -buildDir=/hive/data/genomes/hg19/bed/lastz.$db hg19 $db >& $db.recipBest.log
'EOF'
chmod +x doRecip.sh
cat > db.lst << 'EOF'
speTri1
vicPac1
myoLuc1
turTru1
pteVam1
felCat3
EOF
ssh memk
cd /hive/data/genomes/hg19/bed/andy/recipRun
gensub2 db.lst single gsub jobList
para create jobList
para push
# finished overnight
exit # to hgwdev
for log in *.recipBest.log; do 
  db=${log%.recipBest.log}; 
  echo $db;
  doRecipBest.pl -workhorse=hgwdev -continue=download \
    -buildDir=/hive/data/genomes/hg19/bed/lastz.$db \
     hg19 $db >& $db.recipBestDownload.log;
done

# swaps for equCab2, felCat3, bostTau4, cavPor3

cd /hive/data/genomes/hg19/bed/andy
screen -r equCab2
time nice -n +19 doBlastzChainNet.pl -verbose=2 -noLoadChainSplit -workhorse=kkr14u01 \
  -chainMinScore=3000 -chainLinearGap=medium -swap equCab2.DEF >& equCab2.doSwap.log
# [detach screen]
#real    486m35.206s

screen -r felCat3
time nice -n +19 doBlastzChainNet.pl -verbose=2 -noLoadChainSplit -workhorse=kkr14u02 \
  -chainMinScore=3000 -chainLinearGap=medium -swap felCat3.DEF >& felCat3.doSwap.log
# [detach screen]
#real    463m5.257s

screen -r bosTau4
time nice -n +19 doBlastzChainNet.pl -verbose=2 -noLoadChainSplit -workhorse=kkr14u03 \
  -chainMinScore=3000 -chainLinearGap=medium -swap bosTau4.DEF >& bosTau4.doSwap.log
# [detach screen]
#real    391m40.132s

screen -r cavPor3
time nice -n +19 doBlastzChainNet.pl -verbose=2 -noLoadChainSplit -workhorse=hgwdev
  -chainMinScore=3000 -chainLinearGap=medium -swap cavPor3.DEF >& cavPor3.doSwap.log
# [detach screen]
real    192m39.792s

##########################################################################
# LASTZ Venter's Poodle canFamPoodle1 (DONE - 2009-06-05,10 - Hiram)
    mkdir /hive/data/genomes/hg19/bed/lastzCanFamPoodle1.2009-06-05
    cd /hive/data/genomes/hg19/bed/lastzCanFamPoodle1.2009-06-05

    cat << '_EOF_' > DEF
# human vs Venter's poodle

# TARGET: Human Hg19
SEQ1_DIR=/scratch/data/hg19/nib
SEQ1_LEN=/scratch/data/hg19/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000

# QUERY: Dog CanFam2
SEQ2_DIR=/scratch/data/canFamPoodle1/canFamPoodle1.2bit
SEQ2_LEN=/scratch/data/canFamPoodle1/chrom.sizes
SEQ2_CHUNK=40000000
SEQ2_LAP=0
SEQ2_LIMIT=600

BASE=/hive/data/genomes/hg19/bed/lastzCanFamPoodle1.2009-06-05
TMPDIR=/scratch/tmp
'_EOF_'
    # << happy emacs

    #	establish a screen to control this job
    screen
    time nice -n +19 doBlastzChainNet.pl \
        -verbose=2 \
        `pwd`/DEF \
        -noDbNameCheck -noLoadChainSplit \
        -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
        -chainMinScore=3000 -chainLinearGap=medium
    #	real    5162m58.743s
    cat fb.hg19.chainCanFamPoodle1Link.txt 
    #	898034247 bases of 2897316137 (30.995%) in intersection
    #	the original canFam2 measured:
    #	1532073507 bases of 2897316137 (52.879%) in intersection

    time nice -n +19 doRecipBest.pl -buildDir=`pwd` \
	hg19 canFamPoodle1 > rbest.log 2>&1 &
    #	real    811m27.965s

##############################################################################
## 46-Way Multiz (DONE - 2009-06-09,2009-11-10 - Hiram)
    mkdir /hive/data/genomes/hg19/bed/multiz46way
    cd /hive/data/genomes/hg19/bed/multiz46way

    #	starting with the 46way tree created from 44 way tree
    cat << '_EOF_' > 46way.nh
(((((((((((((((((
((hg19:0.006591,panTro2:0.006639):0.002184,gorGor1:0.009411):0.009942,
ponAbe2:0.018342):0.014256,rheMac2:0.036199):0.021496,papHam1:0.04):0.02,
calJac1:0.066389):0.056911,tarSyr1:0.135169):0.011307,
(micMur1:0.091452,otoGar1:0.128984):0.035463):0.015304,
tupBel1:0.183583):0.004688,(((((mm9:0.083220,rn4:0.090564):0.196605,
dipOrd1:0.209532):0.022555,cavPor3:0.223415):0.009828,
speTri1:0.146894):0.025042,
(oryCun2:0.116009,ochPri2:0.198295):0.100037):0.015355):0.020666,
(((vicPac1:0.105252,(turTru1:0.064182,bosTau4:0.121911):0.025111):0.039691,
((equCab2:0.107726,(felCat3:0.097971,canFam2:0.100888):0.049486):0.006252,
(myoLuc1:0.141155,pteVam1:0.111787):0.033187):0.004179):0.011699,
(eriEur1:0.220580,sorAra1:0.266859):0.056117):0.021065):0.023276,
(((loxAfr3:0.083775,proCap1:0.152633):0.026190,echTel1:0.240221):0.049905,
(dasNov2:0.115179,choHof1:0.096272):0.052373):0.006713):0.132748,
macEug1:0.3):0.1,
monDom5:0.325899):0.072430,ornAna1:0.453916):0.109903,
((galGal3:0.166386,taeGut1:0.170717):0.199763,
anoCar1:0.509545):0.108130):0.166150,xenTro2:0.852482):0.300396,
(((tetNig2:0.224774,fr2:0.205294):0.191836,
(gasAcu1:0.313967,oryLat2:0.478451):0.058404):0.322824,
danRer6:0.731166):0.155214):0.511293,petMar1:0.511293);
'_EOF_'
    # << happy emacs

    #	Use this specification in the phyloGif tool:
    #	http://genome.ucsc.edu/cgi-bin/phyloGif
    #	to obtain a gif image for htdocs/images/phylo/hg19_46way.gif

    /cluster/bin/phast/all_dists 46way.nh > 46way.distances.txt
    #	Use this output to create the table below, with this perl script:
    cat << '_EOF_' > sizeStats.pl
#!/usr/bin/env perl

use strict;
use warnings;

open (FH, "grep -y hg19 46way.distances.txt | sort -k3,3n|") or
        die "can not read 46way.distances.txt";

my $count = 0;
while (my $line = <FH>) {
    chomp $line;
    my ($hg19, $D, $dist) = split('\s+', $line);
    my $chain = "chain" . ucfirst($D);
    my $B="/hive/data/genomes/hg19/bed/lastz.$D/fb.hg19." .
        $chain . "Link.txt";
    my $chainLinkMeasure =
        `awk '{print \$5}' ${B} 2> /dev/null | sed -e "s/(//; s/)//"`;
    chomp $chainLinkMeasure;
    $chainLinkMeasure = 0.0 if (length($chainLinkMeasure) < 1);
    $chainLinkMeasure =~ s/\%//;
    my $swapFile="/hive/data/genomes/${D}/bed/blastz.hg19.swap/fb.${D}.chainHg19Link.txt";
    my $swapMeasure = "N/A";
    if ( -s $swapFile ) {
	$swapMeasure =
	    `awk '{print \$5}' ${swapFile} 2> /dev/null | sed -e "s/(//; s/)//"`;
	chomp $swapMeasure;
	$swapMeasure = 0.0 if (length($swapMeasure) < 1);
	$swapMeasure =~ s/\%//;
    }
    my $orgName=
    `hgsql -N -e "select organism from dbDb where name='$D';" hgcentraltest`;
    chomp $orgName;
    if (length($orgName) < 1) {
        $orgName="N/A";
    }
    ++$count;
    if ($swapMeasure eq "N/A") {
	printf "# %02d  %.4f - %s %s\t(%% %.3f) (%s)\n", $count, $dist,
	    $orgName, $D, $chainLinkMeasure, $swapMeasure
    } else {
	printf "# %02d  %.4f - %s %s\t(%% %.3f) (%% %.3f)\n", $count, $dist,
	    $orgName, $D, $chainLinkMeasure, $swapMeasure
    }
}
close (FH);
'_EOF_'
    # << happy emacs
    chmod +x ./sizeStats.pl
    ./sizeStats.pl
#
#	If you can fill in all the numbers in this table, you are ready for
#	the multiple alignment procedure
#
#                         featureBits chainLink measures
#                                        chainOryLat1Link   chain    linearGap
#    distance                      on hg19    on other   minScore
# 01  0.0132 - Chimp panTro2    (% 94.846) (% 94.908)
# 02  0.0182 - Gorilla gorGor1  (% 59.484) (N/A)
# 03  0.0371 - Orangutan ponAbe2        (% 91.350) (% 89.617)
# 04  0.0692 - Rhesus rheMac2   (% 82.744) (% 87.422)
# 05  0.0945 - Baboon papHam1   (% 82.810) (N/A)
# 06  0.1409 - Marmoset calJac1 (% 70.860) (% 71.897)
# 07  0.2665 - Tarsier tarSyr1  (% 47.830) (N/A)
# 08  0.2696 - Mouse lemur micMur1      (% 46.519) (N/A)
# 09  0.3071 - Bushbaby otoGar1 (% 43.644) (N/A)
# 10  0.3343 - Horse equCab2    (% 57.050) (% 66.774)
# 11  0.3416 - TreeShrew tupBel1        (% 36.156) (N/A)
# 12  0.3451 - Dolphin turTru1  (% 48.398) (N/A)
# 13  0.3500 - Squirrel speTri1 (% 35.713) (N/A)
# 14  0.3611 - Alpaca vicPac1   (% 39.399) (N/A)
# 15  0.3620 - Sloth choHof1    (% 34.377) (N/A)
# 16  0.3653 - Megabat pteVam1  (% 45.414) (N/A)
# 17  0.3732 - Elephant loxAfr3 (% 46.636) (% 42.430)
# 18  0.3740 - Cat felCat3      (% 35.713) (% 61.104)
# 19  0.3769 - Dog canFam2      (% 52.879) (% 62.055)
# 20  0.3809 - Armadillo dasNov2        (% 33.543) (N/A)
# 21  0.3941 - Rabbit oryCun2   (% 44.317) (58.405)
# 22  0.3946 - Microbat myoLuc1 (% 33.174) (N/A)
# 23  0.4028 - Cow bosTau4      (% 46.506) (% 50.297)
# 24  0.4363 - Guinea Pig cavPor3       (% 43.680) (N/A)
# 25  0.4421 - Rock hyrax proCap1       (% 30.864) (N/A)
# 26  0.4450 - Kangaroo rat dipOrd1     (% 27.161) (N/A)
# 27  0.4764 - Pika ochPri2     (% 27.768) (N/A)
# 28  0.4811 - Hedgehog eriEur1 (% 19.362) (N/A)
# 29  0.5035 - Tenrec echTel1   (% 23.120) (N/A)
# 30  0.5153 - Mouse mm9        (% 35.299) (% 38.693)
# 31  0.5226 - Rat rn4  (% 32.879) (% 36.860)
# 32  0.5274 - Shrew sorAra1    (% 19.760) (N/A)
# 33  0.6394 - Wallaby macEug1  (% 6.011) (N/A)
# 34  0.7653 - Opossum monDom5  (% 14.358) (N/A)
# 35  0.9657 - Platypus ornAna1 (% 7.627) (% 11.259)
# 36  1.0960 - Chicken galGal3  (% 3.591) (% 8.786)
# 37  1.1003 - Zebra finch taeGut1      (% 3.496) (% 7.795)
# 38  1.2394 - Lizard anoCar1   (% 3.591) (% 5.146)
# 39  1.6403 - X. tropicalis xenTro2    (% 3.176) (% 6.773)
# 40  1.9387 - Stickleback gasAcu1      (% 1.916) (% 11.175)
# 41  1.9634 - Fugu fr2 (% 1.702) (% 10.929)
# 42  1.9746 - Zebrafish danRer6        (% 3.051) (% 6.399)
# 43  1.9829 - Tetraodon tetNig2        (% 1.712) (% 14.194)
# 44  2.1031 - Medaka oryLat2   (% 1.849) (% 6.705)
# 45  2.1108 - Lamprey petMar1  (% 1.082) (% 3.200)

    # create species list and stripped down tree for autoMZ
    sed 's/[a-z][a-z]*_//g; s/:[0-9\.][0-9\.]*//g; s/;//; /^ *$/d' \
	46way.nh > tmp.nh
    echo `cat tmp.nh` > tree-commas.nh
    echo `cat tree-commas.nh` | sed 's/ //g; s/,/ /g' > tree.nh
    sed 's/[()]//g; s/,/ /g' tree.nh > species.list

    cd /hive/data/genomes/hg19/bed/multiz46way
    #	bash shell syntax here ...
    export H=/hive/data/genomes/hg19/bed
    mkdir mafLinks
    for G in `sed -e "s/hg19 //" species.list`
    do
	mkdir mafLinks/$G
	if [ -s ${H}/lastz.${G}/mafRBestNet/chr1.maf.gz ]; then
	    echo "$G - recipBest"
	    ln -s ${H}/lastz.$G/mafRBestNet/*.maf.gz ./mafLinks/$G
	else
	    if [ -s ${H}/lastz.${G}/mafSynNet/chr1.maf.gz ]; then
		echo "$G - synNet"
		ln -s ${H}/lastz.$G/mafSynNet/*.maf.gz ./mafLinks/$G
	    else
		if [ -s ${H}/lastz.${G}/mafNet/chr1.maf.gz ]; then
		    echo "$G - mafNet"
		    ln -s ${H}/lastz.$G/mafNet/*.maf.gz ./mafLinks/$G
		else
		    echo "missing directory lastz.${G}/*Net"
		fi
	    fi
	fi
    done

    #	verify the alignment type is correct:
    for D in `cat /hive/users/hiram/bigWayHg19/ordered.list`
do
    ls -l mafLinks/$D/chr1.maf.gz | awk '{print $NF}'
done
    #	compare to the list at:
    #	http://genomewiki.ucsc.edu/index.php/Hg19_Genome_size_statistics

    #	need to split these things up into smaller pieces for
    #	efficient kluster run.
    cd /hive/data/genomes/hg19/bed/multiz46way
    mkdir mafSplit
    cd mafSplit
    #	mafSplitPos splits on gaps or repeat areas that will not have
    #	any chains, approx 5 Mbp intervals, gaps at least 10,000
    mafSplitPos -minGap=10000 hg19 5stdout | sort -u \
	| sort -k1,1 -k2,2n > mafSplit.bed
    #	There is a splitRegions.pl script here (copied from previous 44way)
    #	that can create a custom track from this mafSplit.bed file.
    #	Take a look at that in the browser and see if it looks OK,
    #	check the number of sections on each chrom to verify none are
    #	too large.  Despite the claim above, it does appear that some
    #	areas are split where actual chains exist.

    #	run a small kluster job to split them all
    ssh memk
    cd /hive/data/genomes/hg19/bed/multiz46way/mafSplit
    cat << '_EOF_' > runOne
#!/bin/csh -ef
set G = $1
set C = $2
mkdir -p $G
pushd $G > /dev/null
if ( -s ../../mafLinks/${G}/${C}.maf.gz ) then
    rm -f hg19_${C}.*.maf
    mafSplit ../mafSplit.bed hg19_ ../../mafLinks/${G}/${C}.maf.gz
    gzip hg19_${C}.*.maf
else
    touch hg19_${C}.00.maf
    gzip hg19_${C}.00.maf
endif
popd > /dev/null
'_EOF_'
    # << happy emacs
    chmod +x runOne

    cat << '_EOF_' > template
#LOOP
runOne $(root1) $(root2) {check out line $(root1)/hg19_$(root2).00.maf}
#ENDLOOP
'_EOF_'
    # << happy emacs

    for G in `sed -e "s/hg19 //" ../species.list`
do
    echo $G
done > species.list
    cut -f 1 ../../../chrom.sizes > chr.list

    gensub2 species.list chr.list template jobList
    para -ram=8g create jobList
    para try ... check ... push ... etc...
# Completed: 4185 of 4185 jobs
# CPU time in finished jobs:      25547s     425.78m     7.10h    0.30d  0.001 y
# IO & Wait Time:                268664s    4477.73m    74.63h    3.11d  0.009 y
# Average job time:                  70s       1.17m     0.02h    0.00d
# Longest finished job:            1234s      20.57m     0.34h    0.01d
# Submission to last job:          3048s      50.80m     0.85h    0.04d

    # the autoMultiz cluster run
    ssh swarm
    cd /hive/data/genomes/hg19/bed/multiz46way/

    mkdir splitRun
    cd splitRun
    mkdir maf run
    cd run
    mkdir penn
    cp -p /cluster/bin/penn/multiz.2008-11-25/multiz penn 
    cp -p /cluster/bin/penn/multiz.2008-11-25/maf_project penn 
    cp -p /cluster/bin/penn/multiz.2008-11-25/autoMZ penn 

    #	set the db and pairs directories here
    cat > autoMultiz.csh << '_EOF_'
#!/bin/csh -ef
set db = hg19
set c = $1
set result = $2
set run = `/bin/pwd`
set tmp = /scratch/tmp/$db/multiz.$c
set pairs = /hive/data/genomes/hg19/bed/multiz46way/mafSplit
/bin/rm -fr $tmp
/bin/mkdir -p $tmp
/bin/cp -p ../../tree.nh ../../species.list $tmp
pushd $tmp > /dev/null
foreach s (`/bin/sed -e "s/ $db//" species.list`)
    set in = $pairs/$s/$c.maf
    set out = $db.$s.sing.maf
    if (-e $in.gz) then
        /bin/zcat $in.gz > $out
	if (! -s $out) then
	    echo "##maf version=1 scoring=autoMZ" > $out
	endif
    else if (-e $in) then
        /bin/ln -s $in $out
    else
        echo "##maf version=1 scoring=autoMZ" > $out
    endif
end
set path = ($run/penn $path); rehash
$run/penn/autoMZ + T=$tmp E=$db "`cat tree.nh`" $db.*.sing.maf $c.maf \
	> /dev/null
popd > /dev/null
/bin/rm -f $result
/bin/cp -p $tmp/$c.maf $result
/bin/rm -fr $tmp
/bin/rmdir --ignore-fail-on-non-empty /scratch/tmp/$db
'_EOF_'
# << happy emacs
    chmod +x autoMultiz.csh

    cat  << '_EOF_' > template
#LOOP
./autoMultiz.csh $(root1) {check out line+ /hive/data/genomes/hg19/bed/multiz46way/splitRun/maf/$(root1).maf}
#ENDLOOP
'_EOF_'
# << happy emacs

    find ../../mafSplit -type f | grep hg19_ | xargs -L 1 basename \
	| sed -e "s/.gz//" | sort -u > chr.part.list
    gensub2 chr.part.list single template jobList
    para -ram=8g create jobList
    #	initial run experience suggest some of the big jobs reach 8 Gb
    #	of memory usage, so, tell parasol to limit the number of jobs per
    #	node to avoid thrashing
    para -ram=8g try
    para -ram=8g push
# Completed: 504 of 504 jobs
# CPU time in finished jobs:    1342039s   22367.32m   372.79h   15.53d  0.043 y
# IO & Wait Time:                 63835s    1063.91m    17.73h    0.74d  0.002 y
# Average job time:                2789s      46.49m     0.77h    0.03d
# Longest finished job:           12625s     210.42m     3.51h    0.15d
# Submission to last job:         15300s     255.00m     4.25h    0.18d

    # put the split maf results back together into a single maf file
    #	eliminate duplicate comments
    ssh hgwdev
    cd /hive/data/genomes/hg19/bed/multiz46way/splitRun
    mkdir ../maf
    #	the sed edits take out partitioning name information from the comments
    #	so the multiple parts will condense to smaller number of lines
    #	this takes almost 2 hours of time, resulting in a bit over 150 Gb,
    #	almost all chrom files over 1 Gb, up to almost 10 Gb for chr2
    #	HOWEVER, this is actually not necessary to maintain these comments,
    #	they are lost during the mafAddIRows

    cat << '_EOF_' >> runOne
#!/bin/csh -fe
set C = $1
if ( -s ../maf/${C}.maf.gz ) then
    rm -f ../maf/${C}.maf.gz
endif
head -q -n 1 maf/hg19_${C}.*.maf | sort -u > ../maf/${C}.maf
grep -h "^#" maf/hg19_${C}.*.maf | egrep -v "maf version=1|eof maf" | \
    sed -e "s#${C}.[0-9][0-9]*#${C}#g; s#_MZ_[^ ]* # #g;" \
        | sort -u >> ../maf/${C}.maf 
grep -h -v "^#" `ls maf/hg19_${C}.*.maf | sort -t. -k2,2n` >> ../maf/${C}.maf
tail -q -n 1 maf/hg19_${C}.*.maf | sort -u >> ../maf/${C}.maf
'_EOF_'
    # << happy emacs
    chmod +x runOne

    cat << '_EOF_' >> template
#LOOP
runOne $(root1) {check out exists+ ../maf/$(root1).maf}
#ENDLOOP
'_EOF_'
    # << happy emacs

    cut -f1 ../../../chrom.sizes > chr.list
    ssh encodek
    cd /hive/data/genomes/hg19/bed/multiz46way/splitRun
    gensub2 chr.list single template jobList
    para create jobList
    para try ... check ... push ... etc ...
# Completed: 92 of 93 jobs
# Crashed: 1 jobs
# CPU time in finished jobs:        412s       6.86m     0.11h    0.00d  0.000 y
# IO & Wait Time:                 21187s     353.12m     5.89h    0.25d  0.001 y
# Average job time:                 235s       3.91m     0.07h    0.00d
# Longest finished job:            1529s      25.48m     0.42h    0.02d
# Submission to last job:          1542s      25.70m     0.43h    0.02d

    #	one of the results is completely empty, the grep for results failed
    #	this file ../maf/chrUn_gl000226.maf only has header comments, no result

    # load tables for a look
    ssh hgwdev
    mkdir -p /gbdb/hg19/multiz46way/maf
    cd /hive/data/genomes/hg19/bed/multiz46way/maf
    ln -s `pwd`/*.maf /gbdb/hg19/multiz46way/maf

    # this generates an immense multiz46way.tab file in the directory
    #	where it is running.  Best to run this over in scratch.
    cd /data/tmp
    time nice -n +19 hgLoadMaf \
	-pathPrefix=/gbdb/hg19/multiz46way/maf hg19 multiz46way
    #	Loaded 33558634 mafs in 93 files from /gbdb/hg19/multiz46way/maf
    #	real    512m8.053s

    # load summary table
    time nice -n +19 cat /gbdb/hg19/multiz46way/maf/*.maf \
	| $HOME/bin/$MACHTYPE/hgLoadMafSummary hg19 -minSize=30000 -verbose=2 \
		-mergeGap=1500 -maxSize=200000  multiz46waySummary stdin
    #	real    92m30.700s
# flushSummaryBlocks: output 45 blocks
# Created 8766427 summary blocks from 645238409 components and
#	33558634 mafs from stdin
# blocks too small to be used: 29456
# Loading into hg19 table multiz46waySummary...

    # Gap Annotation
    # prepare bed files with gap info
    mkdir /hive/data/genomes/hg19/bed/multiz46way/anno
    cd /hive/data/genomes/hg19/bed/multiz46way/anno
    mkdir maf run

    #	most of these will already exist from previous multiple alignments
    #	remove the echo from in front of the twoBitInfo command to get them
    #	to run if this loop appears to be correct
    for DB in `cat ../species.list`
do
    CDIR="/hive/data/genomes/${DB}"
    if [ ! -f ${CDIR}/${DB}.N.bed ]; then
	echo "creating ${DB}.N.bed"
	echo twoBitInfo -nBed ${CDIR}/${DB}.2bit ${CDIR}/${DB}.N.bed
    else
	ls -og ${CDIR}/${DB}.N.bed
    fi
done

    cd run
    rm -f nBeds sizes
    for DB in `sed -e "s/hg19 //" ../../species.list`
do
    echo "${DB} "
    ln -s  /hive/data/genomes/${DB}/${DB}.N.bed ${DB}.bed
    echo ${DB}.bed  >> nBeds
    ln -s  /hive/data/genomes/${DB}/chrom.sizes ${DB}.len
    echo ${DB}.len  >> sizes
done

    #	the annotation step requires large memory, run on memk nodes
    ssh memk
    cd /hive/data/genomes/hg19/bed/multiz46way/anno/run
    ls ../../maf | sed -e "s/.maf//" > chr.list
    cat << '_EOF_' > template
#LOOP
./anno.csh $(root1) {check out line+ ../maf/$(root1).maf}
#ENDLOOP
'_EOF_'
    # << happy emacs

    cat << '_EOF_' > anno.csh
#!/bin/csh -fe

set inMaf = ../../maf/$1.maf
set outMaf = ../maf/$1.maf
rm -f $outMaf
mafAddIRows -nBeds=nBeds $inMaf /hive/data/genomes/hg19/hg19.2bit $outMaf
'_EOF_'
    # << happy emacs
    chmod +x anno.csh

    gensub2 chr.list single template jobList
    para -ram=30g create jobList
    #	specify lots of ram to get one job per node
    para -ram=30g push
    #	
# Completed: 93 of 93 jobs
# CPU time in finished jobs:      10371s     172.85m     2.88h    0.12d  0.000 y
# IO & Wait Time:                  3365s      56.09m     0.93h    0.04d  0.000 y
# Average job time:                 148s       2.46m     0.04h    0.00d
# Longest finished job:            1153s      19.22m     0.32h    0.01d
# Submission to last job:          7402s     123.37m     2.06h    0.09d

    ssh hgwdev
    rm -fr /gbdb/hg19/multiz46way/maf
    mkdir /gbdb/hg19/multiz46way/maf
    cd /hive/data/genomes/hg19/bed/multiz46way/anno/maf
    ln -s `pwd`/*.maf /gbdb/hg19/multiz46way/maf/
    #	by loading this into the table multiz46way, it will replace the
    #	previously loaded table with the unannotated mafs
    #	huge temp files are made, do them on local disk
    cd /data/tmp
    time nice -n +19 hgLoadMaf \
	-pathPrefix=/gbdb/hg19/multiz46way/maf hg19 multiz46way
    #	real    113m11.709s
    #	Loaded 33612571 mafs in 93 files from /gbdb/hg19/multiz46way/maf
XXX - done to here

    time nice -n +19 cat /gbdb/hg19/multiz46way/maf/*.maf \
	| hgLoadMafSummary hg19 -minSize=30000 -mergeGap=1500 \
                 -maxSize=200000  multiz46waySummary stdin
    #	with the quality annotated mafs, and mem interference on hgwdev:
    #	Created 8514381 summary blocks from 600504256 components \
    #	and 33320838 mafs from stdin
    #	real    169m56.936s

    #	with the Irow annotations after the multiz fix:
    #	Created 8514380 summary blocks from 600499937
    #		components and 33298894 mafs from stdin
    #	real    184m42.893s
    #	user    70m44.431s
    #	sys     8m7.970s

    #	Created 8514078 summary blocks from 604683213 components
    #	and 35125649 mafs from stdin
    #	real    130m55.115s
    #	user    71m37.409s
    #	sys     8m5.110s

    #	by loading this into the table multiz46waySummary, it will replace
    #	the previously loaded table with the unannotated mafs
    #	remove the multiz46way*.tab files in this /data/tmp directory
# -rw-rw-r--   1 1949221892 Nov 15 14:04 multiz46way.tab
# -rw-rw-r--   1  417994189 Nov 15 20:57 multiz46waySummary.tab
    wc -l multiz46way*.tab
    #	33964377 multiz46way.tab
    #	 8514078 multiz46waySummary.tab
    #	42478455 total
    rm multiz46way*.tab

    # create some downloads
    mkdir -p /hive/data/genomes/hg19/bed/multiz46way/download/maf
    cd /hive/data/genomes/hg19/bed/multiz46way/download/maf
    time cp -p ../../anno/maf/chr*.maf .
    #	real    72m46.514s
    #	user    0m1.293s
    #	sys     5m15.981s
    time gzip --rsyncable *.maf
    time gzip --rsyncable *.maf
    #	real    185m37.884s
    #	user    179m51.161s
    #	sys     3m48.016s
    time md5sum *.gz > md5sum.txt
    #	real    3m59.009s
    #	user    1m19.338s
    #	sys     0m18.976s

##############################################################################
# LASTZ Sea Hare aplCal1 (STARTING - 2009-06-08 - Galt)
# To Do #1813 remove aplCal1 <-> hg19 chain/nets (2011-10-07 Chin)
# However, only tables and download files are physically drop/removed
# All data created stay on hive.
    mkdir /hive/data/genomes/hg19/bed/lastzAplCal1.2009-06-08
    cd /hive/data/genomes/hg19/bed/lastzAplCal1.2009-06-08

    cat << '_EOF_' > DEF
# Human vs. Sea Hare
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_M=50
BLASTZ_Q=/scratch/data/blastz/HoxD55.q

# TARGET: Human Hg19
SEQ1_DIR=/scratch/data/hg19/hg19.2bit
SEQ1_LEN=/scratch/data/hg19/chrom.sizes
SEQ1_CHUNK=100000000
SEQ1_LAP=10000
SEQ2_LIMIT=5

# QUERY: Sea Hare aplCal1
SEQ2_DIR=/scratch/data/aplCal1/aplCal1.2bit
SEQ2_LEN=/scratch/data/aplCal1/chrom.sizes
SEQ2_CHUNK=20000000
SEQ2_LIMIT=300
SEQ2_LAP=0

BASE=/hive/data/genomes/hg19/bed/lastzAplCal1.2009-06-08
TMPDIR=/scratch/tmp
'_EOF_'
    # << happy emacs

    #	establish a screen to control this job
    #   (NOTE I SHOULD NOT HAVE USED  -qRepeats=windowmaskerSdust)
    screen
    time nice +19 doBlastzChainNet.pl -verbose=2 \
	`pwd`/DEF \
	-qRepeats=windowmaskerSdust \
	-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \
	-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=pk \
	>& do.log &
    #	real about one hour but one job hung

    # resuming from failure 
    # edited loadUp.csh, commenting out the first completed step
    # and removing the unneeded -qRepeats=windowmaskerSdust
    # from the next step, now run it to complete the load step.
    /hive/data/genomes/hg19/bed/lastzAplCal1.2009-06-08/axtChain/loadUp.csh \ 
        >& continue-loadUp.log&

    # continue from step 'download'
    time nice +19 doBlastzChainNet.pl -verbose=2 \
        `pwd`/DEF \
        -noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \
        -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=pk \
        -continue download \
        >& continue-download.log &
 
    cat fb.hg19.chainAplCal1Link.txt 
    #   19675762 bases of 2897316137 (0.679%) in intersection

    #	running the swap - DONE - 2009-06-02
    #   (NOTE I SHOULD NOT HAVE USED  -qRepeats=windowmaskerSdust)
    mkdir /hive/data/genomes/aplCal1/bed/blastz.hg19.swap
    cd /hive/data/genomes/aplCal1/bed/blastz.hg19.swap
    time nice +19 doBlastzChainNet.pl -verbose=2 \
	/hive/data/genomes/hg19/bed/lastzAplCal1.2009-06-08/DEF \
	-qRepeats=windowmaskerSdust \
	-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \
	-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=pk \
	-swap >& swap.log &
    #	real  time not long

    # resuming from failure
    # edited loadUp.csh, commenting out the first completed step
    # and removing the unneeded -tRepeats=windowmaskerSdust
    # from the next step, now run it to complete the load step.
    /hive/data/genomes/aplCal1/bed/blastz.hg19.swap/axtChain/loadUp.csh \
        >& continue-loadUp.log&

    time nice +19 doBlastzChainNet.pl -verbose=2 \
	/hive/data/genomes/hg19/bed/lastzAplCal1.2009-06-08/DEF \
	-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \
	-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=pk \
        -continue download \
	-swap >& continue-download.log &

    cat fb.aplCal1.chainHg19Link.txt 
    #   14163455 bases of 619228098 (2.287%) in intersection

#########################################################################
# EXONIPHY Hg19, lifted from hg18 (DONE - 2009-06-19 - Hiram)
#	needed for uscsGenes11 building
    # create a syntenic liftOver chain file
    cd /hive/data/genomes/hg18/bed/blat.hg19.2009-03-06
    time nice -n +19 netSyntenic run.chain/hg18.hg19.noClass.net.gz stdout \
	| netFilter -syn stdin | netChainSubset -verbose=0 stdin \
		run.chain/hg18.hg19.all.chain.gz stdout \
	| chainStitchId stdin stdout | gzip -c > hg18.hg19.syn.chain.gz
    #	memory usage 55492608, utime 3 s/100, stime 3
    #	real    2m35.613s

    #	real    5m55.575s
    #	slightly smaller than the ordinary liftOver chain file:
# -rw-rw-r-- 1 137245 Mar  6 17:37 hg18ToHg19.over.chain.gz
# -rw-rw-r-- 1  96115 Jun 19 14:30 hg18.hg19.syn.chain.gz

    # exoniphyHg19.gp is prepared as follows
    mkdir /cluster/data/hg19/bed/exoniphy
    cd /cluster/data/hg19/bed/exoniphy
    hgsql hg18 -e "select * from exoniphy" -N > exoniphyHg18.gp
    time nice -n +19 liftOver -genePred exoniphyHg18.gp \
      /hive/data/genomes/hg18/bed/blat.hg19.2009-03-06/hg18.hg19.syn.chain.gz \
	    exoniphyHg19.gp unmapped
    wc -l *
    #	178162 exoniphyHg18.gp
    #	178109 exoniphyHg19.gp
    #	   106 unmapped

    mkdir dump
    cd dump
    hgsqldump --all -c --tab=. hg18 exoniphy
    cd ..
    chmod 775 dump
    hgsql hg19 < dump/exoniphy.sql
    hgsql hg19 \
-e "load data local infile \"exoniphyHg19.gp\" into table exoniphy;"
    nice -n +19 featureBits hg19 exoniphy
    #	27421336 bases of 2897316137 (0.946%) in intersection
    nice -n +19 featureBits hg18 exoniphy
    #	27475705 bases of 2881515245 (0.954%) in intersection

#########################################################################
# BIOCYCTABLES NEEDED BY hgGene (DONE - 2009-06-22 - Hiram)

# First register with BioCyc to download their HumanCyc database
# The site will email you the URL for download.  Beware, they supply
#	a URL to a directory chock a block full of data, almost 7 Gb,
#	you only need one file

    mkdir /hive/data/outside/bioCyc/090623
    cd /hive/data/outside/bioCyc/090623

    mkdir download
    cd download
    wget --timestamping --no-directories --recursive \
	"http://bioinformatics.ai.sri.com/ecocyc/dist/flatfiles-52983746/humancyc-flatfiles.tar.Z"
    tar xvzf humancyc-flatfiles.tar.Z

    mkdir /hive/data/genomes/hg19/bed/bioCyc
    cd /hive/data/genomes/hg19/bed/bioCyc
    #	clean the headers from these files
    grep -E -v "^#|^UNIQUE-ID" /hive/data/outside/bioCyc/090623/genes.col \
	> genes.tab
    #	this file isn't consistent in its number of columns
    grep -E -v "^#|^UNIQUE-ID" /hive/data/outside/bioCyc/090623/pathways.col \
| awk -F'\t' '{if (140 == NF) { printf "%s\t\t\n", $0; } else { print $0}}' \
	> pathways.tab

    hgsql hg19 -e 'create database bioCyc090623'

    hgLoadSqlTab bioCyc090623 genes ~/src/hg/lib/bioCycGenes.sql ./genes.tab
    hgLoadSqlTab bioCyc090623 pathways ~/src/hg/lib/bioCycPathways.sql ./pathways.tab

# Create bioCycMapDesc.tab
    hgsql bioCyc090623 -N \
	-e 'select UNIQUE_ID, NAME from pathways' | sort -u >  bioCycMapDesc.tab
XXX see alternative below

    #	this kgBioCyc0 thing needs kgXref and other UCSC gene tables to work
# Create bioCycPathway.tab
    kgBioCyc0 bioCyc090623 hg19 hg19

    hgLoadSqlTab hg19 bioCycPathway ~/kent/src/hg/lib/bioCycPathway.sql ./bioCycPathway.tab
    hgLoadSqlTab hg19 bioCycMapDesc ~/kent/src/hg/lib/bioCycMapDesc.sql ./bioCycMapDesc.tab

XXX maybe instead do this in the gene build procedure
    # from the UCSC genes build procedure
# Do BioCyc Pathways build
    mkdir $dir/bioCyc
    cd $dir/bioCyc
    grep -v '^#' $bioCycPathways > pathways.tab
    grep -v '^#' $bioCycGenes > genes.tab
    kgBioCyc1 genes.tab pathways.tab $db bioCycPathway.tab bioCycMapDesc.tab
    hgLoadSqlTab $tempDb bioCycPathway ~/kent/src/hg/lib/bioCycPathway.sql ./bioCycPathway.tab
    hgLoadSqlTab $tempDb bioCycMapDesc ~/kent/src/hg/lib/bioCycMapDesc.sql ./bioCycMapDesc.tab

##############################################################################
nscanGene (2009-06-22 markd)
   # nscanGene track from WUSTL
   cd /cluster/data/hg19/bed/nscan
   wget http://mblab.wustl.edu/~jeltje/hg19_tracks/hg19.updated.gtf
   wget http://mblab.wustl.edu/~jeltje/hg19_tracks/hg19.readme
   wget -r -np -l 1 http://mblab.wustl.edu/~jeltje/hg19_tracks/hg19_proteins
   bzip2 hg19.updated.gtf hg19_proteins/*.fa

   # load track
   gtfToGenePred -genePredExt hg19.updated.gtf.bz2 stdout| hgLoadGenePred -genePredExt hg19 nscanGene stdin
   bzcat hg19_proteins/chr*.fa.bz2 | hgPepPred hg19 generic nscanPep stdin
   rm *.tab

   # validate same number of transcripts and peptides are loaded
   hgsql -Ne 'select count(*) from nscanGene' hg19
   hgsql -Ne 'select count(*) from nscanPep' hg19

   # validate search expression
   hgc-sql -Ne 'select name from nscanGene' hg19 | egrep -v -e '^chr[0-9a-zA-Z_]+\.([0-9]+|pasa)((\.[0-9a-z]+)?\.[0-9a-z]+)?$' |wc -l

#########################################################################
# Phylogenetic tree from 46-way for chrX  (DONE - 2009-10-26 - Hiram)
# 	We need two trees, one for chrX only, and a second for all other chroms
    mkdir /hive/data/genomes/hg19/bed/multiz46way/4dX
    cd /hive/data/genomes/hg19/bed/multiz46way/4dX

    hgsql hg19 -Ne \
    "select * from refGene,refSeqStatus where refGene.name=refSeqStatus.mrnaAcc and refSeqStatus.status='Reviewed' and mol='mRNA' and refGene.chrom='chrX'" \
	| cut -f 2-20 > refSeqReviewed.gp
    wc -l refSeqReviewed.gp
    # 727 refSeqReviewed.gp
    genePredSingleCover refSeqReviewed.gp stdout | sort > refSeqReviewedNR.gp
    wc -l refSeqReviewedNR.gp
    # 401 refSeqReviewed.gp

    ssh memk
    mkdir /hive/data/genomes/hg19/bed/multiz46way/4dX/run
    cd /hive/data/genomes/hg19/bed/multiz46way/4dX/run
    mkdir ../mfa

# whole chrom mafs version, using new version of 
# uses memory-efficient version of phast, from Melissa Hubisz at Cornell
#	mjhubisz at gmail.com

    cat << '_EOF_' > 4dX.csh
#!/bin/csh -fe
set PHASTBIN = /cluster/bin/phast.build/cornellCVS/phast.2009-10-21/bin
set r = "/hive/data/genomes/hg19/bed/multiz46way"
set c = $1
set infile = $r/maf/$2
set outfile = $3
cd /scratch/tmp
# 'clean' maf
perl -wpe 's/^s ([^.]+)\.\S+/s $1/' $infile > $c.maf    
awk -v C=$c '$2 == C {print}' $r/4dX/refSeqReviewedNR.gp > $c.gp
set PHASTBIN = /cluster/bin/phast.build/cornellCVS/phast.2009-10-21/bin
$PHASTBIN/msa_view --4d --features $c.gp --do-cats 3 -i MAF $c.maf -o SS > $c.ss
$PHASTBIN/msa_view -i SS --tuple-size 1 $c.ss > $r/4dX/$outfile
rm -f $c.gp $c.maf $c.ss
'_EOF_'
    # << happy emacs
    chmod +x 4dX.csh

    ls -1S /hive/data/genomes/hg19/bed/multiz46way/maf/chrX.maf | \
        egrep -E -v "chrM|chrUn|random|_hap" | sed -e "s#.*multiz46way/maf/##" \
	> maf.list

    cat << '_EOF_' > template
#LOOP
4dX.csh $(root1) $(path1) {check out line+ mfa/$(root1).mfa}
#ENDLOOP
'_EOF_'
    # << happy emacs

    gensub2 maf.list single template stdout | tac > jobList
    #	run this one job on hgwdev, takes a few minutes:
    ./4dX.csh chrX chrX.maf mfa/chrX.mfa
    #	not sure what these warnings are about:
# WARNING: ignoring out-of-range feature
# chrX    genepred        CDS     1       -1      .       +       2       transcript_id "NM_000475"
# WARNING: ignoring out-of-range feature
# chrX    genepred        CDS     1       -1      .       +       2       transcript_id "NM_005365.2"

    # combine mfa files
    cd ..
    sed -e "s/ /,/g" ../species.list > species.lst
    /cluster/bin/phast.build/cornellCVS/phast.2009-10-21/bin/msa_view \
	--aggregate `cat species.lst` mfa/*.mfa | sed s/"> "/">"/ > 4dX.chrX.mfa

XXXX ! 2010-12-29 - There is an error in the awk below.
XXXX It ends up only working on the first file mfa/chr1.mfa
XXXX with the result in placentals.mfa only from chr1.mfa
    sed -e 's/,macEug1.*//' species.lst > placentals.lst
    awk '
BEGIN { good = 1 }
{
    if (match($0, "^> macEug1")) { good = 0 }
    if (good) {print}
}
' mfa/*.mfa > placentals.mfa
    /cluster/bin/phast.build/cornellCVS/phast.2009-10-21/bin/msa_view \
	--aggregate `cat placentals.lst` placentals.mfa | sed s/"> "/">"/ \
	> 4dX.placentals.mfa

XXXX ! 2010-12-29 - There is an error in the awk below.
XXXX It ends up only working on the first file mfa/chr1.mfa
XXXX with the result in primates.mfa only from chr1.mfa
    sed -e 's/,tupBel1.*//' species.lst > primates.lst
    awk '
BEGIN { good = 1 }
{
    if (match($0, "^> tupBel1")) { good = 0 }
    if (good) {print}
}
' mfa/*.mfa > primates.mfa
    /cluster/bin/phast.build/cornellCVS/phast.2009-10-21/bin/msa_view \
	--aggregate `cat primates.lst` primates.mfa | sed -e "s/> />/" \
	> 4dX.primates.mfa

    # use phyloFit to create tree model (output is phyloFit.mod)
    time /cluster/bin/phast.build/cornellCVS/phast.2009-10-21/bin/phyloFit \
	--EM --precision MED --msa-format FASTA --subst-mod REV \
	--tree ../tree-commas.nh 4dX.chrX.mfa
    #	real    0.54.139s
    mv phyloFit.mod phyloFit.chrX.mod

    grep TREE phyloFit.chrX.mod | sed 's/TREE\:\ //' > tree_4d.chrX.46way.nh

    /cluster/bin/phast.build/cornellCVS/phast.2009-10-21/bin/tree_doctor \
        --no-branchlen --prune-all-but=`cat primates.lst` ../tree-commas.nh \
                > tree_commas.primates.nh
    /cluster/bin/phast.build/cornellCVS/phast.2009-10-21/bin/tree_doctor \
        --no-branchlen --prune-all-but=`cat placentals.lst` ../tree-commas.nh \
                > tree_commas.placentals.nh
    /cluster/bin/phast.build/cornellCVS/phast.2009-10-21/bin/phyloFit \
	--EM --precision MED --msa-format FASTA --subst-mod REV \
	--tree tree_commas.primates.nh 4dX.primates.mfa
    mv phyloFit.mod phyloFit.chrX.primates.mod
    grep TREE phyloFit.chrX.primates.mod | sed 's/TREE\:\ //' \
	> tree_4d.chrX.primates.46way.nh
    /cluster/bin/phast.build/cornellCVS/phast.2009-10-21/bin/phyloFit \
	--EM --precision MED --msa-format FASTA --subst-mod REV \
	--tree tree_commas.placentals.nh 4dX.placentals.mfa
    mv phyloFit.mod phyloFit.chrX.placentals.mod
    grep TREE phyloFit.chrX.placentals.mod | sed 's/TREE\:\ //' \
	> tree_4d.chrX.placentals.46way.nh

#########################################################################
# Phylogenetic tree from 46-way for non-chrX  (DONE - 2009-10-27 - Hiram)
# 	We need two trees, one for chrX only, and a second for all other chroms
    mkdir /hive/data/genomes/hg19/bed/multiz46way/4dNoX
    cd /hive/data/genomes/hg19/bed/multiz46way/4dNoX

    hgsql hg19 -Ne \
    "select * from refGene,refSeqStatus where refGene.name=refSeqStatus.mrnaAcc and refSeqStatus.status='Reviewed' and mol='mRNA'" \
	| cut -f 2-20 | egrep -E -v "chrM|chrUn|random|_hap|chrX" \
	> refSeqReviewed.gp
    wc -l refSeqReviewed.gp
    # 12977 refSeqReviewed.gp
    genePredSingleCover refSeqReviewed.gp stdout | sort > refSeqReviewedNR.gp
    wc -l refSeqReviewedNR.gp
    # 7252 refSeqReviewed.gp

    ssh memk
    mkdir /hive/data/genomes/hg19/bed/multiz46way/4dNoX/run
    cd /hive/data/genomes/hg19/bed/multiz46way/4dNoX/run
    mkdir ../mfa

# whole chrom mafs version, using new version of 
# uses memory-efficient version of phast, from Melissa Hubisz at Cornell
#	mjhubisz at gmail.com

    cat << '_EOF_' > 4dNoX.csh
#!/bin/csh -fe
set PHASTBIN = /cluster/bin/phast.build/cornellCVS/phast.2009-10-21/bin
set r = "/hive/data/genomes/hg19/bed/multiz46way"
set c = $1
set infile = $r/maf/$2
set outfile = $3
cd /scratch/tmp
# 'clean' maf
perl -wpe 's/^s ([^.]+)\.\S+/s $1/' $infile > $c.maf    
awk -v C=$c '$2 == C {print}' $r/4dNoX/refSeqReviewedNR.gp > $c.gp
set PHASTBIN = /cluster/bin/phast.build/cornellCVS/phast.2009-10-21/bin
$PHASTBIN/msa_view --4d --features $c.gp --do-cats 3 -i MAF $c.maf -o SS > $c.ss
$PHASTBIN/msa_view -i SS --tuple-size 1 $c.ss > $r/4dNoX/$outfile
rm -f $c.gp $c.maf $c.ss
'_EOF_'
    # << happy emacs
    chmod +x 4dNoX.csh

    ls -1S /hive/data/genomes/hg19/bed/multiz46way/maf/chr*.maf | \
        egrep -E -v "chrM|chrUn|random|_hap|chrX" \
	| sed -e "s#.*multiz46way/maf/##" \
	> maf.list

    cat << '_EOF_' > template
#LOOP
4dNoX.csh $(root1) $(path1) {check out line+ ../mfa/$(root1).mfa}
#ENDLOOP
'_EOF_'
    # << happy emacs

    gensub2 maf.list single template stdout | tac > jobList
    para try ... check ... push ... etc
    para time
# Completed: 23 of 23 jobs
# CPU time in finished jobs:       9032s     150.53m     2.51h    0.10d  0.000 y
# IO & Wait Time:                   672s      11.21m     0.19h    0.01d  0.000 y
# Average job time:                 422s       7.03m     0.12h    0.00d
# Longest finished job:             860s      14.33m     0.24h    0.01d
# Submission to last job:          1210s      20.17m     0.34h    0.01d

    # combine mfa files
    cd ..
    sed -e "s/ /,/g" ../species.list > species.lst
    /cluster/bin/phast.build/cornellCVS/phast.2009-10-21/bin/msa_view \
	--aggregate `cat species.lst` mfa/*.mfa | sed s/"> "/">"/ \
	> 4dNoX.all.mfa

    sed -e 's/,macEug1.*//' species.lst > placentals.lst
    awk '
BEGIN { good = 1 }
{
    if (match($0, "^> macEug1")) { good = 0 }
    if (good) {print}
}
' mfa/*.mfa > placentals.mfa
    /cluster/bin/phast.build/cornellCVS/phast.2009-10-21/bin/msa_view \
	--aggregate `cat placentals.lst` placentals.mfa | sed s/"> "/">"/ \
	> 4dNoX.placentals.mfa

    sed -e 's/,tupBel1.*//' species.lst > primates.lst
    awk '
BEGIN { good = 1 }
{
    if (match($0, "^> tupBel1")) { good = 0 }
    if (good) {print}
}
' mfa/*.mfa > primates.mfa
    /cluster/bin/phast.build/cornellCVS/phast.2009-10-21/bin/msa_view \
	--aggregate `cat primates.lst` primates.mfa | sed -e "s/> />/" \
	> 4dNoX.primates.mfa


    # use phyloFit to create tree model (output is phyloFit.mod)
    time /cluster/bin/phast.build/cornellCVS/phast.2009-10-21/bin/phyloFit \
	--EM --precision MED --msa-format FASTA --subst-mod REV \
	--tree ../tree-commas.nh 4dNoX.all.mfa
XXX - running Tue Oct 27 13:21:49 PDT 2009
    #	about 40 minutes
    mv phyloFit.mod phyloFit.NoChrX.mod

    grep TREE phyloFit.chrX.mod | sed 's/TREE\:\ //' > tree_4d.chrX.46way.nh

    /cluster/bin/phast.build/cornellCVS/phast.2009-10-21/bin/tree_doctor \
        --no-branchlen --prune-all-but=`cat primates.lst` ../tree-commas.nh \
                > tree_commas.primates.nh
    /cluster/bin/phast.build/cornellCVS/phast.2009-10-21/bin/tree_doctor \
        --no-branchlen --prune-all-but=`cat placentals.lst` ../tree-commas.nh \
                > tree_commas.placentals.nh
    /cluster/bin/phast.build/cornellCVS/phast.2009-10-21/bin/phyloFit \
	--EM --precision MED --msa-format FASTA --subst-mod REV \
	--tree tree_commas.primates.nh 4dNoX.primates.mfa
    mv phyloFit.mod phyloFit.NoChrX.primates.mod
    grep TREE phyloFit.NoChrX.primates.mod | sed 's/TREE\:\ //' \
	> tree_4d.NoChrX.primates.46way.nh
    /cluster/bin/phast.build/cornellCVS/phast.2009-10-21/bin/phyloFit \
	--EM --precision MED --msa-format FASTA --subst-mod REV \
	--tree tree_commas.placentals.nh 4dNoX.placentals.mfa
    mv phyloFit.mod phyloFit.NoChrX.placentals.mod
    grep TREE phyloFit.NoChrX.placentals.mod | sed 's/TREE\:\ //' \
	> tree_4d.NoChrX.placentals.46way.nh

#########################################################################
# Phylogenetic tree from 46-way  (DONE - 2009-06-25,07-07 - Hiram)
#	This was an early first time experiment.  All this was redone
#	above for chrX only and non-chrX trees

    # Extract 4-fold degenerate sites based on 
    # of RefSeq Reviewed, coding
    mkdir /hive/data/genomes/hg19/bed/multiz46way/4d
    cd /hive/data/genomes/hg19/bed/multiz46way/4d

    hgsql hg19 -Ne \
    "select * from refGene,refSeqStatus where refGene.name=refSeqStatus.mrnaAcc and refSeqStatus.status='Reviewed' and mol='mRNA'" | cut -f 2-20 \
	> refSeqReviewed.gp
    wc -l refSeqReviewed.gp
    # 14077 refSeqReviewed.gp
    genePredSingleCover refSeqReviewed.gp stdout | sort > refSeqReviewedNR.gp
    wc -l refSeqReviewedNR.gp
    # 7951 refSeqReviewedNR.gp

    ssh memk
    mkdir /hive/data/genomes/hg19/bed/multiz46way/4d/run
    cd /hive/data/genomes/hg19/bed/multiz46way/4d/run
    mkdir ../mfa

# whole chrom mafs version, using new version of 
# uses memory-efficient version of phast, from Melissa Hubisz at Cornell
#	mjhubisz at gmail.com
    cat << '_EOF_' > 4d.csh
#!/bin/csh -fe
set r = "/hive/data/genomes/hg19/bed/multiz46way"
set c = $1
set infile = $r/maf/$2
set outfile = $3
cd /scratch/tmp
# 'clean' maf
perl -wpe 's/^s ([^.]+)\.\S+/s $1/' $infile > $c.maf    
awk -v C=$c '$2 == C {print}' $r/4d/refSeqReviewedNR.gp > $c.gp
set PHASTBIN=/cluster/bin/phast.2008-12-18
$PHASTBIN/msa_view --4d --features $c.gp --do-cats 3 -i MAF $c.maf -o SS > $c.ss
$PHASTBIN/msa_view -i SS --tuple-size 1 $c.ss > $r/4d/$outfile
rm -f $c.gp $c.maf $c.ss
'_EOF_'
    # << happy emacs
    chmod +x 4d.csh

    ls -1S /hive/data/genomes/hg19/bed/multiz46way/maf/*.maf | \
        egrep -E -v "chrM|chrUn|random|_hap" | sed -e "s#.*multiz46way/maf/##" \
	> maf.list

    cat << '_EOF_' > template
#LOOP
4d.csh $(root1) {check in line+ $(path1)} {check out line+ mfa/$(root1).mfa}
#ENDLOOP
'_EOF_'
    # << happy emacs

    gensub2 maf.list single template stdout | tac > jobList
XXX - ready to go here - 2009-07-06
    rm -fr /cluster/data/hg19/bed/multiz46way/4d/mfa
    mkdir /cluster/data/hg19/bed/multiz46way/4d/mfa
    para create jobList
    para try
    para check
    para push

    # combine mfa files
    cd ..
    sed -e "s/ /,/g" ../species.list > species.lst
    /cluster/bin/phast/msa_view --aggregate `cat species.lst` mfa/*.mfa | \
        sed s/"> "/">"/ > 4d.all.mfa

    sed -e 's/,macEug1.*//' species.lst > placentals.lst
    #  XXX this didn't work
    /cluster/bin/phast/msa_view --aggregate `cat placentals.lst` mfa/*.mfa | \
        sed s/"> "/">"/ > 4d.placentals.mfa

    # use phyloFit to create tree model (output is phyloFit.mod)
    set PHASTBIN=/cluster/bin/phast.2008-12-18
    time $PHASTBIN/phyloFit --EM --precision MED --msa-format FASTA \
	--subst-mod REV --tree ../tree-commas.nh 4d.all.mfa
    #	real    111m23.119s
    mv phyloFit.mod phyloFit.all.mod
    grep TREE phyloFit.all.mod | sed 's/TREE\:\ //' > tree_4d.46way.nh

    sed -e 's/.*,choHof1,//' species.lst > notPlacentals.list

    $PHASTBIN/tree_doctor \
        --prune=`cat notPlacentals.list` \
                tree_4d.46way.nh > tree_4d.46way.placental.nh

#############################################################################
# phastCons 46-way (DONE - 2009-09-21,2009-11-10 - Hiram)
    #	was unable to split the full chrom MAF files, now working on the
    #	maf files as they were split up during multiz

    # split 46way mafs into 10M chunks and generate sufficient statistics 
    # files for # phastCons
    ssh swarm
    mkdir -p /hive/data/genomes/hg19/bed/multiz46way/cons/msa.split
    cd /hive/data/genomes/hg19/bed/multiz46way/mafSplit
    ./splitRegions.pl mafSplit.bed > \
	/hive/data/genomes/hg19/bed/multiz46way/cons/msa.split/region.list
    mkdir /hive/data/genomes/hg19/bed/multiz46way/cons/ss
    mkdir -p /hive/data/genomes/hg19/bed/multiz46way/cons/msa.split/2009-10-19
    cd /hive/data/genomes/hg19/bed/multiz46way/cons/msa.split/2009-10-19
    mkdir ss

    cat << '_EOF_' > doSplit.csh
#!/bin/csh -ef
set c = $1
set MAF = /hive/data/genomes/hg19/bed/multiz46way/splitRun/maf/hg19_$c.maf
set WINDOWS = /hive/data/genomes/hg19/bed/multiz46way/cons/msa.split/2009-10-19/ss/$c
set WC = `cat $MAF | wc -l`
set NL = `grep "^#" $MAF | wc -l`
if ( -s $2 ) then
    exit 0
endif
if ( -s $2.running ) then
    exit 0
endif

date >> $2.running

rm -fr $WINDOWS
mkdir $WINDOWS
pushd $WINDOWS > /dev/null
if ( $WC != $NL ) then
/cluster/bin/phast.build/cornellCVS/phast.2009-10-19/bin/msa_split \
    $MAF -i MAF -o SS -r $WINDOWS/$c -w 10000000,0 -I 1000 -B 5000
endif
popd > /dev/null
date >> $2
rm -f $2.running
'_EOF_'
    # << happy emacs
    chmod +x doSplit.csh

    cat << '_EOF_' > template
#LOOP
doSplit.csh $(root1) {check out line+ $(root1).done}
#ENDLOOP
'_EOF_'
    # << happy emacs

    #	do the easy ones first to see some immediate results
    ls -1S -r ../../../splitRun/maf | sed -e "s/.maf//; s/hg19_//" > maf.list

    gensub2 maf.list single template jobList
    para -ram=32g create jobList
    para try ... check ... etc
# Completed: 503 of 504 jobs
# Crashed: 1 jobs
# CPU time in finished jobs:      14171s     236.18m     3.94h    0.16d  0.000 y
# IO & Wait Time:                188193s    3136.55m    52.28h    2.18d  0.006 y
# Average job time:                 402s       6.71m     0.11h    0.00d
# Longest finished job:            1597s      26.62m     0.44h    0.02d
# Submission to last job:          2586s      43.10m     0.72h    0.03d
    #	the one crashed job is hg19_chr18_gl000207_random.00.maf

    #	XXX - this did not work
    #	this takes a really long time.  memk was down to 2 usable
    #	machines - got it finished manually on a combination of hgwdevnew CPUs
    #	and other machines

    # Estimate phastCons parameters
    #	experimented with this as a parasol job on hgwdevnew to try a number
    #	of SS files.  With a command of:

/cluster/bin/phast/x86_64/phyloFit -i SS ${SS} \
--tree "(((((((((((((((((hg19,panTro2),gorGor1),ponAbe2),rheMac2),calJac1),tarSyr1),(micMur1,otoGar1)),tupBel1),(((((mm9,rn4),dipOrd1),cavPor3),speTri1),(oryCun1,ochPri2))),(((vicPac1,(turTru1,bosTau4)),((equCab2,(felCat3,canFam2)),(myoLuc1,pteVam1))),(eriEur1,sorAra1))),(((loxAfr2,proCap1),echTel1),(dasNov2,choHof1))),monDom4),ornAna1),((galGal3,taeGut1),anoCar1)),xenTro2),(((tetNig1,fr2),(gasAcu1,oryLat2)),danRer5)),petMar1)" \
--out-root=$OUT/starting_tree

    #	running over the input files ../ss/*/*.ss results to
#.../genomes/hg19/bed/multiz46way/cons/startingTree/result/*/starting-tree.mod

    # add up the C and G:
    find ./result -type f | xargs ls -rt | while read F
do
    D=`dirname $F`
    echo -n `basename $D`" - "
    grep BACKGROUND ${F} | awk '{printf "%0.3f\n", $3 + $4;}'
done
    #	counting number of species seen in the maf file:
    find ./result -type f | xargs ls -rt | while read F
do
    D=`dirname $F`
    echo -n `basename $D`" - "
    grep TREE $F | sed -e \
"s/TREE: //; s/(//g; s/)//g; s/[0-9].[0-9][0-9][0-9][0-9][0-9][0-9]//g; s/://g"  | tr ',' '\n' | wc -l
done

    # Run phastCons
    #	This job is I/O intensive in its output files, beware where this
    #	takes place or do not run too many at once.
    ssh swarm
    mkdir -p /hive/data/genomes/hg19/bed/multiz46way/cons/run.cons
    cd /hive/data/genomes/hg19/bed/multiz46way/cons/run.cons

    #	there are going to be several different phastCons runs using
    #	this same script.  They trigger off of the current working directory
    #	$cwd:t which is the "grp" in this script.  It is one of:
    #	all primates placentals

    cat << '_EOF_' > doPhast.csh
#!/bin/csh -fe
set PHASTBIN = /cluster/bin/phast.build/cornellCVS/phast.2009-10-21/bin
set c = $1
set cX = $1:r
set f = $2
set len = $3
set cov = $4
set rho = $5
set grp = $cwd:t
set cons = /hive/data/genomes/hg19/bed/multiz46way/cons
set tmp = $cons/tmp/$f
mkdir -p $tmp
set ssSrc = $cons
set useGrp = "$grp.mod"
if ( $cX == "chrX" ) then
    set useGrp = "$grp.chrX.mod"
endif
if (-s $cons/$grp/$grp.non-inf) then
  ln -s $cons/$grp/$grp.mod $tmp
  ln -s $cons/$grp/$grp.chrX.mod $tmp
  ln -s $cons/$grp/$grp.non-inf $tmp
  ln -s $ssSrc/msa.split/2009-10-21/ss/$c/$f.ss $tmp
else
  ln -s $ssSrc/msa.split/2009-10-21/ss/$c/$f.ss $tmp
  ln -s $cons/$grp/$grp.mod $tmp
  ln -s $cons/$grp/$grp.chrX.mod $tmp
endif
pushd $tmp > /dev/null
if (-s $grp.non-inf) then
  $PHASTBIN/phastCons $f.ss $useGrp \
    --rho $rho --expected-length $len --target-coverage $cov --quiet \
    --not-informative `cat $grp.non-inf` \
    --seqname $c --idpref $c --most-conserved $f.bed --score > $f.pp
else
  $PHASTBIN/phastCons $f.ss $useGrp \
    --rho $rho --expected-length $len --target-coverage $cov --quiet \
    --seqname $c --idpref $c --most-conserved $f.bed --score > $f.pp
endif
popd > /dev/null
mkdir -p pp/$c bed/$c
sleep 4
touch pp/$c bed/$c
rm -f pp/$c/$f.pp
rm -f bed/$c/$f.bed
mv $tmp/$f.pp pp/$c
mv $tmp/$f.bed bed/$c
rm -fr $tmp
'_EOF_'
    # << happy emacs
    chmod a+x doPhast.csh

    #	this template will serve for all runs
    #	root1 == chrom name, file1 == ss file name without .ss suffix
    cat << '_EOF_' > template
#LOOP
../run.cons/doPhast.csh $(root1) $(file1) 45 0.3 0.3 {check out line+ pp/$(root1)/$(file1).pp}
#ENDLOOP
'_EOF_'
    # << happy emacs

    ls -1S ../msa.split/2009-10-21/ss/chr*/chr* | sed -e "s/.ss$//" > ss.list

    # Create parasol batch and run it
    # run for all species
    cd /hive/data/genomes/hg19/bed/multiz46way/cons
    mkdir -p all
    cd all
    #	Using the two different .mod tree
    cp -p ../../4dNoX/phyloFit.NoChrX.mod ./all.mod
    cp -p ../../4dX/phyloFit.chrX.mod ./all.chrX.mod

    gensub2 ../run.cons/ss.list single ../run.cons/template jobList
    para -ram=8g create jobList
    para try ... check ... push ... etc.
# Completed: 581 of 581 jobs
# CPU time in finished jobs:      41877s     697.95m    11.63h    0.48d  0.001 y
# IO & Wait Time:                 39172s     652.87m    10.88h    0.45d  0.001 y
# Average job time:                 139s       2.32m     0.04h    0.00d
# Longest finished job:             329s       5.48m     0.09h    0.00d
# Submission to last job:          2240s      37.33m     0.62h    0.03d

    # create Most Conserved track
    cd /hive/data/genomes/hg19/bed/multiz46way/cons/all
    cut -f1 ../../../../chrom.sizes | while read C
do  
    ls -d bed/${C}.[0-9][0-9] 2> /dev/null | while read D
    do
        cat ${D}/${C}*.bed
    done | sort -k1,1 -k2,2n \
    | awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", "'${C}'", $2, $3, $5, $5;}'
done > tmpMostConserved.bed
/cluster/bin/scripts/lodToBedScore tmpMostConserved.bed > mostConserved.bed

    # load into database
    ssh hgwdev
    cd /hive/data/genomes/hg19/bed/multiz46way/cons/all
    time nice -n +19 hgLoadBed hg19 phastConsElements46way mostConserved.bed
    #	Loaded 5163775 elements of size 6
    #	real     1m44.439s

    # Try for 5% overall cov, and 70% CDS cov 
    featureBits hg19 -enrichment refGene:cds phastConsElements46way
    #	--rho 0.3 --expected-length 45 --target-coverage 0.3
    #	refGene:cds 1.187%, phastConsElements46way 5.065%,
    #	both 0.884%, cover 74.46%, enrich 14.70x

    # Create merged posterier probability file and wiggle track data files
    cd /hive/data/genomes/hg19/bed/multiz46way/cons/all
    mkdir downloads
    cat << '_EOF_' > phastCat.sh
#!/bin/sh

mkdir -p downloads
cut -f1 ../../../../chrom.sizes | while read C
do
    echo -n "${C} ... working ... "
    ls -d pp/${C}.[0-9][0-9] 2> /dev/null | while read D
    do
        cat ${D}/${C}*.pp | sed -e "s/chrom=${C}.[0-9][0-9]/chrom=${C}/"
    done | gzip > downloads/${C}.phastCons46way.wigFix.gz
    echo "done"
done
'_EOF_'
    #	<< happy emacs
    chmod +x phastCat.sh
    time nice -n +19 ./phastCat.sh
    #	real    30m2.623s

    #	encode those files into wiggle data
    zcat downloads/*.wigFix.gz \
	| wigEncode stdin phastCons46way.wig phastCons46way.wib
    #	Converted stdin, upper limit 1.00, lower limit 0.00
    #	real    18m37.881s
    du -hsc *.wi?
    #	2.7G    phastCons46way.wib
    #	271M    phastCons46way.wig
    #	3.0G    total

    #	encode into a bigWig file:
    #	(warning wigToBigWig process grows to about 36 Gb)
    #	in bash, to avoid the 32 Gb memory limit:
sizeG=188743680
export sizeG
ulimit -d $sizeG 
ulimit -v $sizeG
    zcat downloads/*.wigFix.gz \
        | wigToBigWig stdin ../../../../chrom.sizes phastCons46way.bw
    #	real    52m36.142s
# -rw-rw-r--   1 21667535139 Oct 20 13:59 phastCons46way.bw
    mkdir /gbdb/hg19/bbi
    ln -s `pwd`/phastCons46way.bw /gbdb/hg19/bbi
    #	if you wanted to use the bigWig file, loading bigWig table:
    hgsql hg19 -e 'drop table if exists phastCons46way; \
            create table phastCons46way (fileName varchar(255) not null); \
            insert into phastCons46way values
	("/gbdb/hg19/bbi/phastCons46way.bw");'

    # Load gbdb and database with wiggle.
    ssh hgwdev
    cd /hive/data/genomes/hg19/bed/multiz46way/cons/all
    ln -s `pwd`/phastCons46way.wib /gbdb/hg19/multiz46way/phastCons46way.wib
    time nice -n +19 hgLoadWiggle -pathPrefix=/gbdb/hg19/multiz46way hg19 \
	phastCons46way phastCons46way.wig
    #	real    1m45.381s

    wigTableStats.sh hg19 phastCons46way
# db.table      min max mean count sumData
# hg19.phastCons46way     0 1 0.103653 2845303719 2.94924e+08
#	stdDev viewLimits
#	0.230184 viewLimits=0:1

    #  Create histogram to get an overview of all the data
    ssh hgwdev
    cd /hive/data/genomes/hg19/bed/multiz46way/cons/all
    time nice -n +19 hgWiggle -doHistogram -db=hg19 \
	-hBinSize=0.001 -hBinCount=1000 -hMinVal=0.0 -verbose=2 \
	    phastCons46way > histogram.data 2>&1
    #	real    7m37.212s

    #	create plot of histogram:

    cat << '_EOF_' | gnuplot > histo.png
set terminal png small color x000000 xffffff xc000ff x66ff66 xffff00 x00ffff
set size 1.4, 0.8
set key left box
set grid noxtics
set grid ytics
set title " Human Hg19 Histogram phastCons46way track"
set xlabel " phastCons46way score"
set ylabel " Relative Frequency"
set y2label " Cumulative Relative Frequency (CRF)"
set y2range [0:1]
set y2tics
set yrange [0:0.02]

plot "histogram.data" using 2:5 title " RelFreq" with impulses, \
        "histogram.data" using 2:7 axes x1y2 title " CRF" with lines
'_EOF_'
    #	<< happy emacs

    display histo.png &

    ########################################################################
    ### Create a phastCons data set for Primates

    # setup primates-only run
    ssh swarm
    mkdir /hive/data/genomes/hg19/bed/multiz46way/cons/primates
    cd /hive/data/genomes/hg19/bed/multiz46way/cons/primates
    # primates-only: exclude all but these for phastCons tree:

    cp -p ../../4dNoX/phyloFit.NoChrX.primates.mod primates.mod
    cp -p ../../4dX/phyloFit.chrX.primates.mod primates.chrX.mod
    #	and place the removed ones in the non-inf file so phastCons will
    #	truly ignore them:
    echo "tupBel1,mm9,rn4,dipOrd1,cavPor3,speTri1,oryCun2,ochPri2,vicPac1,turTru1,bosTau4,equCab2,felCat3,canFam2,myoLuc1,pteVam1,eriEur1,sorAra1,loxAfr3,proCap1,echTel1,dasNov2,choHof1,macEug1,monDom5,ornAna1,galGal3,taeGut1,anoCar1,xenTro2,tetNig2,fr2,gasAcu1,oryLat2,danRer6,petMar1" \
	> primates.non-inf

    gensub2 ../run.cons/ss.list single ../run.cons/template jobList
    para -ram=8g create jobList
    para try ... check ... push ... etc.
# Completed: 581 of 581 jobs
# CPU time in finished jobs:      17077s     284.62m     4.74h    0.20d  0.001 y
# IO & Wait Time:                 73693s    1228.21m    20.47h    0.85d  0.002 y
# Average job time:                 156s       2.60m     0.04h    0.00d
# Longest finished job:             402s       6.70m     0.11h    0.00d
# Submission to last job:          2322s      38.70m     0.65h    0.03d

    cd /hive/data/genomes/hg19/bed/multiz46way/cons/primates
    # create Most Conserved track
    cut -f1 ../../../../chrom.sizes | while read C
do  
    ls -d bed/${C}.[0-9][0-9] 2> /dev/null | while read D
    do
        cat ${D}/${C}*.bed
    done | sort -k1,1 -k2,2n \
    | awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", "'${C}'", $2, $3, $5, $5;}'
done > tmpMostConserved.bed
/cluster/bin/scripts/lodToBedScore tmpMostConserved.bed > mostConserved.bed

    featureBits hg19 mostConserved.bed
    #	146285948 bases of 2897316137 (5.049%) in intersection

    # load into database
    ssh hgwdev
    cd /hive/data/genomes/hg19/bed/multiz46way/cons/primates
    time nice -n +19 hgLoadBed hg19 phastConsElements46wayPrimates \
	mostConserved.bed
    #	Loaded 725627 elements of size 6
    #	real    0m8.583s
    # verify coverage
    featureBits hg19 phastConsElements46wayPrimates
    #	116785954 bases of 2897316137 (4.031%) in intersection

    #	--rho 0.3 --expected-length 45 --target-coverage 0.3
    featureBits hg19 -enrichment refGene:cds phastConsElements46wayPrimates
    #	refGene:cds 1.186%, phastConsElements46wayPrimates 4.031%,
    #	both 0.730%, cover 61.54%, enrich 15.27x

    featureBits hg19 -enrichment knownGene:cds phastConsElements46wayPrimates
    #	knownGene:cds 1.252%, phastConsElements46wayPrimates 4.031%,
    #	both 0.743%, cover 59.31%, enrich 14.71x

    #	Create the downloads .pp files, from which the phastCons wiggle data
    #	is calculated
    # sort by chromName, chromStart so that items are in numerical order 
    #  for wigEncode
    cd /hive/data/genomes/hg19/bed/multiz46way/cons/primates
    mkdir downloads
    cat << '_EOF_' > phastCat.sh
#!/bin/sh

mkdir -p downloads
cut -f1 ../../../../chrom.sizes | while read C
do
    echo -n "${C} ... working ... "
    ls -d pp/${C}.[0-9][0-9] 2> /dev/null | while read D
    do
	cat ${D}/${C}*.pp | sed -e "s/chrom=${C}.[0-9][0-9]/chrom=${C}/"
    done | gzip > downloads/${C}.phastCons46way.primates.wigFix.gz
    echo "done"
done
'_EOF_'
    # << happy emacs
    chmod +x ./phastCat.sh
    time nice -n +19 ./phastCat.sh
    #	real    39m47.189s

    # Create merged posterier probability file and wiggle track data files
    zcat downloads/chr*.wigFix.gz \
	 | wigEncode stdin phastCons46wayPrimates.wig phastCons46wayPrimates.wib
    # Converted stdin, upper limit 1.00, lower limit 0.00
    #	real    17m20.601s

    #	encode to bigWig
    #	(warning wigToBigWig process grows to about 36 Gb)
    #	in bash, to avoid the 32 Gb memory limit:
sizeG=188743680
export sizeG
ulimit -d $sizeG 
ulimit -v $sizeG

    zcat downloads/*.wigFix.gz \
        | wigToBigWig stdin ../../../../chrom.sizes phastCons46wayPrimates.bw

    ln -s `pwd`/phastCons46wayPrimates.bw /gbdb/hg19/bbi
    #	if desired to use the bigWig file, loading bigWig table:
    hgsql hg19 -e 'drop table if exists phastCons46wayPrimates; \
            create table phastCons46wayPrimates \
		(fileName varchar(255) not null); \
            insert into phastCons46wayPrimates values
	("/gbdb/hg19/bbi/phastCons46wayPrimates.bw");'

    ## load table with wiggle data
    ssh hgwdev
    cd /hive/data/genomes/hg19/bed/multiz46way/cons/primates
    ln -s `pwd`/phastCons46wayPrimates.wib \
	/gbdb/hg19/multiz46way/phastCons46wayPrimates.wib
    time nice -n +19 hgLoadWiggle -pathPrefix=/gbdb/hg19/multiz46way hg19 \
	phastCons46wayPrimates phastCons46wayPrimates.wig

    wigTableStats.sh hg19 phastCons46wayPrimates
# db.table      min max mean count sumData
hg19.phastCons46wayPrimates     0 1 0.128883 2845303719 3.66712e+08
#	stdDev viewLimits
#	0.214067 viewLimits=0:1

    #  Create histogram to get an overview of all the data
    time nice -n +19 hgWiggle -doHistogram \
	-hBinSize=0.001 -hBinCount=1000 -hMinVal=0.0 -verbose=2 \
	    -db=hg19 phastCons46wayPrimates  > histogram.data 2>&1
    #	real    5m30.086s

    #	create plot of histogram:

    cat << '_EOF_' | gnuplot > histo.png
set terminal png small color \
        x000000 xffffff xc000ff x66ff66 xffff00 x00ffff xff0000
set size 1.4, 0.8
set key left box
set grid noxtics
set grid ytics
set title " Mouse Hg19 Histogram phastCons46wayPrimates track"
set xlabel " phastCons46wayPrimates score"
set ylabel " Relative Frequency"
set y2label " Cumulative Relative Frequency (CRF)"
set y2range [0:1]
set y2tics
set yrange [0:0.02]

plot "histogram.data" using 2:5 title " RelFreq" with impulses, \
        "histogram.data" using 2:7 axes x1y2 title " CRF" with lines
'_EOF_'
    #	<< happy emacs

    display histo.png &

    ########################################################################
    ### Create a phastCons data set for Placentals
    # setup placental-only run
    ssh swarm
    mkdir /hive/data/genomes/hg19/bed/multiz46way/cons/placental
    cd /hive/data/genomes/hg19/bed/multiz46way/cons/placental

    cp -p ../../4dNoX/phyloFit.NoChrX.placentals.mod placental.mod
    cp -p ../../4dX/phyloFit.chrX.placentals.mod placental.chrX.mod
    # placental-only: exclude all but these for phastCons tree:
    #	and place the removed ones in the non-inf file so phastCons will
    #	truly ignore them:
    echo "macEug1,monDom5,ornAna1,galGal3,taeGut1,anoCar1,xenTro2,tetNig2,fr2,gasAcu1,oryLat2,danRer6,petMar1" \
        > placental.non-inf

    gensub2 ../run.cons/ss.list single ../run.cons/template jobList
    para -ram=8g create jobList
    para try ... check ... push ... etc.
# Completed: 581 of 581 jobs
# CPU time in finished jobs:      33942s     565.69m     9.43h    0.39d  0.001 y
# IO & Wait Time:                 75536s    1258.94m    20.98h    0.87d  0.002 y
# Average job time:                 188s       3.14m     0.05h    0.00d
# Longest finished job:             417s       6.95m     0.12h    0.00d
# Submission to last job:          1878s      31.30m     0.52h    0.02d

    # create Most Conserved track
    cut -f1 ../../../../chrom.sizes | while read C
do  
    ls -d bed/${C}.[0-9][0-9] 2> /dev/null | while read D
    do
        cat ${D}/${C}*.bed
    done | sort -k1,1 -k2,2n \
    | awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", "'${C}'", $2, $3, $5, $5;}'
done > tmpMostConserved.bed
/cluster/bin/scripts/lodToBedScore tmpMostConserved.bed > mostConserved.bed

    # load into database
    ssh hgwdev
    cd /hive/data/genomes/hg19/bed/multiz46way/cons/placental
    time nice -n +19 hgLoadBed hg19 phastConsElements46wayPlacental \
	mostConserved.bed
    #	Loaded 3743478 elements of size 6
    #	real    1m15.952s
    # verify coverage
    featureBits hg19 phastConsElements46wayPlacental
    #	118211444 bases of 2897316137 (4.080%) in intersection

    #	--rho 0.3 --expected-length 45 --target-coverage 0.3
    featureBits hg19 -enrichment refGene:cds phastConsElements46wayPlacental
    #	refGene:cds 1.187%, phastConsElements46wayPlacental 4.080%,
    #	both 0.861%, cover 72.59%, enrich 17.79x
    featureBits hg19 -enrichment knownGene:cds phastConsElements46wayPlacental
    #	knownGene:cds 1.252%, phastConsElements46wayPlacental 4.080%,
    #	both 0.879%, cover 70.22%, enrich 17.21x

    #	Create the downloads .pp files, from which the phastCons wiggle data
    #	is calculated
    # sort by chromName, chromStart so that items are in numerical order 
    #  for wigEncode
    cd /hive/data/genomes/hg19/bed/multiz46way/cons/placental
    mkdir downloads
    cat << '_EOF_' > phastCat.sh
#!/bin/sh

mkdir -p downloads
cut -f1 ../../../../chrom.sizes | while read C
do
    echo -n "${C} ... working ... "
    ls -d pp/${C}.[0-9][0-9] 2> /dev/null | while read D
    do
	cat ${D}/${C}*.pp | sed -e "s/chrom=${C}.[0-9][0-9]/chrom=${C}/"
    done | gzip > downloads/${C}.phastCons46way.placental.wigFix.gz
    echo "done"
done
'_EOF_'
    # << happy emacs
    chmod +x ./phastCat.sh
    time nice -n +19 ./phastCat.sh

    # Create merged posterier probability file and wiggle track data files
    zcat downloads/chr*.wigFix.gz \
	| wigEncode stdin phastCons46wayPlacental.wig \
		phastCons46wayPlacental.wib
    #	Converted stdin, upper limit 1.00, lower limit 0.00
    #	real    14m53.395s

    #	encode to bigWig
    #	(warning wigToBigWig process grows to about 36 Gb)
    #	in bash, to avoid the 32 Gb memory limit:
sizeG=188743680
export sizeG
ulimit -d $sizeG 
ulimit -v $sizeG

    zcat downloads/*.wigFix.gz \
        | wigToBigWig stdin ../../../../chrom.sizes phastCons46wayPlacental.bw
    #	real    40m55.568s

    ln -s `pwd`/phastCons46wayPlacental.bw /gbdb/hg19/bbi
    #	loading bigWig table:
    hgsql hg19 -e 'drop table if exists phastCons46wayPlacental; \
            create table phastCons46wayPlacental \
		(fileName varchar(255) not null); \
            insert into phastCons46wayPlacental values
	("/gbdb/hg19/bbi/phastCons46wayPlacental.bw");'

    ## load table with wiggle data
    ssh hgwdev
    cd /hive/data/genomes/hg19/bed/multiz46way/cons/placental
    ln -s `pwd`/phastCons46wayPlacental.wib \
	/gbdb/hg19/multiz46way/phastCons46wayPlacental.wib
    time nice -n +19 hgLoadWiggle -pathPrefix=/gbdb/hg19/multiz46way hg19 \
	phastCons46wayPlacental phastCons46wayPlacental.wig

    wigTableStats.sh hg19 phastCons46wayPlacental
# db.table      min max mean count sumData
hg19.phastCons46wayPlacental    0 1 0.0885757 2845303719 2.52025e+08
#	stdDev viewLimits
#	0.210242 viewLimits=0:1

    #  Create histogram to get an overview of all the data
    time nice -n +19 hgWiggle -doHistogram \
	-hBinSize=0.001 -hBinCount=1000 -hMinVal=0.0 -verbose=2 \
	    -db=hg19 phastCons46wayPlacental > histogram.data 2>&1
    #	real    8m15.623s

    #	create plot of histogram:

    cat << '_EOF_' | gnuplot > histo.png
set terminal png small color x000000 xffffff xc000ff x66ff66 xffff00 x00ffff
set size 1.4, 0.8
set key left box
set grid noxtics
set grid ytics
set title " Human Hg19 Histogram phastCons46wayPlacental track"
set xlabel " phastCons46wayPlacental score"
set ylabel " Relative Frequency"
set y2label " Cumulative Relative Frequency (CRF)"
set y2range [0:1]
set y2tics
set yrange [0:0.02]

plot "histogram.data" using 2:5 title " RelFreq" with impulses, \
        "histogram.data" using 2:7 axes x1y2 title " CRF" with lines
'_EOF_'
    #	<< happy emacs

    display histo.png &

#########################################################################
# phyloP conservation for 46-way (DONE - 2009-10-21,2009-11-10 - Hiram)
#
# Vertebrate, Placental, Primates
#
    # split SS files into 1M chunks, this business needs smaller files
    #   to complete

    ssh swarm
    mkdir /hive/data/genomes/hg19/bed/multiz46way/consPhyloP
    cd /hive/data/genomes/hg19/bed/multiz46way/consPhyloP
    mkdir ss run.split
    cd run.split

    cat << '_EOF_' > doSplit.csh
#!/bin/csh -ef
set c = $1
set MAF = /hive/data/genomes/hg19/bed/multiz46way/splitRun/maf/hg19_$c.maf
set WINDOWS = /hive/data/genomes/hg19/bed/multiz46way/consPhyloP/run.split/ss/$c
set WC = `cat $MAF | wc -l`
set NL = `grep "^#" $MAF | wc -l`
if ( -s $2 ) then
    exit 0
endif
if ( -s $2.running ) then
    exit 0
endif

date >> $2.running

rm -fr $WINDOWS
mkdir $WINDOWS
pushd $WINDOWS > /dev/null
if ( $WC != $NL ) then
/cluster/bin/phast.build/cornellCVS/phast.2009-10-19/bin/msa_split \
    $MAF -i MAF -o SS -r $WINDOWS/$c -w 1000000,0 -I 1000 -B 5000
endif
popd > /dev/null
date >> $2
rm -f $2.running
'_EOF_'
# << happy emacs

    ls -1S -r ../../splitRun/maf | sed -e "s/.maf//; s/hg19_//" > maf.list

    cat << '_EOF_' > template
#LOOP
doSplit.csh $(path1) {check out exists+ done/$(path1).done}
#ENDLOOP
'_EOF_'
# << happy emacs

    mkdir ss done
    gensub2 maf.list single template jobList
    para -ram=8g create jobList
# Completed: 504 of 504 jobs
# CPU time in finished jobs:      14486s     241.43m     4.02h    0.17d  0.000 y
# IO & Wait Time:                306280s    5104.67m    85.08h    3.54d  0.010 y
# Average job time:                 636s      10.61m     0.18h    0.01d
# Longest finished job:            1635s      27.25m     0.45h    0.02d
# Submission to last job:          2965s      49.42m     0.82h    0.03d


    # run phyloP with score=LRT 
    ssh swarm
    cd /cluster/data/hg18/bed/multiz44way/consPhyloP
    mkdir run.phyloP
    cd run.phyloP

    # Adjust model file base composition background and rate matrix to be
    # representative of the chromosomes in play
    grep BACKGROUND ../../cons/all/all.mod | awk '{printf "%0.3f\n", $3 + $4}'
    #	0.542
    /cluster/bin/phast.build/cornellCVS/phast.2009-10-21/bin/modFreqs \
	../../cons/all/all.mod 0.542 > all.mod
    grep BACKGROUND ../../cons/all/all.chrX.mod \
	| awk '{printf "%0.3f\n", $3 + $4}'
    #	0.503
    /cluster/bin/phast.build/cornellCVS/phast.2009-10-21/bin/modFreqs \
	../../cons/all/all.chrX.mod 0.503 > all.chrX.mod
    grep BACKGROUND ../../cons/primates/primates.mod \
	| awk '{printf "%0.3f\n", $3 + $4}'
    #	0.523
    /cluster/bin/phast.build/cornellCVS/phast.2009-10-21/bin/modFreqs \
	../../cons/primates/primates.mod 0.523 > primates.mod
    grep BACKGROUND ../../cons/primates/primates.chrX.mod \
	| awk '{printf "%0.3f\n", $3 + $4}'
    #	0.491
    /cluster/bin/phast.build/cornellCVS/phast.2009-10-21/bin/modFreqs \
	../../cons/primates/primates.chrX.mod 0.491 > primates.chrX.mod
    grep BACKGROUND ../../cons/placental/placental.mod \
	| awk '{printf "%0.3f\n", $3 + $4}'
    #	0.542
    /cluster/bin/phast.build/cornellCVS/phast.2009-10-21/bin/modFreqs \
	../../cons/placental/placental.mod 0.542 > placental.mod
    grep BACKGROUND ../../cons/placental/placental.chrX.mod \
	| awk '{printf "%0.3f\n", $3 + $4}'
    #	0.489
    /cluster/bin/phast.build/cornellCVS/phast.2009-10-21/bin/modFreqs \
	../../cons/placental/placental.chrX.mod 0.489 > placental.chrX.mod


    # repeat for chrX only tree
    cd /cluster/data/hg18/bed/multiz46way/4d
    $PHASTBIN/modFreqs 4d.chrX.mod $gc > 46way.chrX.mod
    ln -s `pwd`/46way.chrX.mod /usr/local/apache/golenPath/hg18/phastCons46way

    cat << '_EOF_' > doPhyloP.csh
#!/bin/csh -fe
set PHASTBIN = /cluster/bin/phast.build/cornellCVS/phast.2009-10-21/bin
set f = $1
set out = $2
set cName = $f:r:r
set chrDir = $f:r
set n = $f:r:e
set grp = $cwd:t
set cons = /hive/data/genomes/hg19/bed/multiz46way/consPhyloP
set tmp = $cons/tmp/$grp/$f
rm -fr $tmp
mkdir -p $tmp
set ssSrc = "$cons/run.split/ss/$chrDir/$f"
set useGrp = "$grp.mod"
if ( $cName == "chrX" ) then
    set useGrp = "$grp.chrX.mod"
endif
ln -s $cons/run.phyloP/$grp.mod $tmp
ln -s $cons/run.phyloP/$grp.chrX.mod $tmp
pushd $tmp > /dev/null
$PHASTBIN/phyloP --method LRT --mode CONACC --wig-scores --chrom $cName \
    -i SS $useGrp $ssSrc.ss > $f.wigFix
popd > /dev/null
mkdir -p $out:h
sleep 4
mv $tmp/$f.wigFix $out
rm -fr $tmp
'_EOF_'
    # << happy emacs

    # Create list of chunks
    find ../run.split/ss -type f | sed -e "s/.ss$//; s#^../run.split/ss/##" \
	> ss.list

    # Create template file
    #	file1 == $chr/$chunk/file name without .ss suffix
    cat << '_EOF_' > template
#LOOP
../run.phyloP/doPhyloP.csh $(file1) {check out line+ wigFix/$(dir1)/$(file1).wigFix}
#ENDLOOP
'_EOF_'
    # << happy emacs

    ######################   Running all species  #######################
    # setup run for all species
    mkdir /hive/data/genomes/hg19/bed/multiz46way/consPhyloP/all
    cd /hive/data/genomes/hg19/bed/multiz46way/consPhyloP/all
    rm -fr wigFix
    mkdir wigFix

    gensub2 ../run.phyloP/ss.list single ../run.phyloP/template jobList
    para create jobList
    para try ... check ... push ... etc ...

    para time
# Completed: 3186 of 3186 jobs
# CPU time in finished jobs:    1306874s   21781.23m   363.02h   15.13d  0.041 y
# IO & Wait Time:                105488s    1758.14m    29.30h    1.22d  0.003 y
# Average job time:                 443s       7.39m     0.12h    0.01d
# Longest finished job:             678s      11.30m     0.19h    0.01d
# Submission to last job:          7789s     129.82m     2.16h    0.09d

    ssh hgwdev
    cd /cluster/data/hg18/bed/multiz46way/consPhyloP/run.phyloP/all
    find ./wigFix -type f \
	| sed -e "s#^./##; s/\./ /g; s/-/ - /g" \
	| sort -k1,1 -k3,3n -k4,4n | sed -e "s/ - /-/g; s/ /./g" > wigFile.list
    cat wigFile.list | xargs cat \
	| wigEncode stdin phyloP46way.wig phyloP46way.wib > wigEncode.log 2>&1 &
    #	Converted stdin, upper limit 6.39, lower limit -13.27
    cat wigFile.list | xargs cat \
	| wigToBigWig stdin ../../../../chrom.sizes phyloP46way.bw
    #	if you wanted to use the bigWig file, loading bigWig table:
    ln -s `pwd`/phyloP46way.bw /gbdb/hg19/bbi
    hgsql hg19 -e 'drop table if exists phyloP46wayAll; \
            create table phyloP46wayAll \
		(fileName varchar(255) not null); \
            insert into phyloP46wayAll values
	("/gbdb/hg19/bbi/phyloP46way.bw");'

    #	loading the wiggle table:
    ln -s `pwd`/phyloP46way.wib /gbdb/hg19/multiz46way
    time nice -n +19 hgLoadWiggle -pathPrefix=/gbdb/hg19/multiz46way hg19 \
	phyloP46wayAll phyloP46way.wig

    #	create download files:
    cat << '_EOF_' > mkDown.csh
#!/bin/csh -fe
foreach F (`cat wigFile.list`)
    set C = $F:h:t:r
    cat $F >> downloads/${C}.wigFix
end
'_EOF_'
    # << happy emacs
    chmod +x ./mkDown.csh
    mkdir downloads
    time ./mkDown.csh
    #	real    16m19.683s

    time gzip downloads/chr*.wigFix
    #	real    47m11.017s

    wigTableStats.sh hg19 phyloP46wayAll
# db.table      min max mean count sumData
# hg19.phyloP46wayAll     -14.08 6.424 0.0896064 2845303719 2.54957e+08
#	stdDev viewLimits
#	0.833186 viewLimits=-4.07632:4.25553
    #	that range is: 14.08+6.424 = 20.504

    #  Create histogram to get an overview of all the data
    time nice -n +19 hgWiggle -doHistogram \
	-hBinSize=0.020504 -hBinCount=1000 -hMinVal=-14.08 -verbose=2 \
	    -db=hg19 phyloP46wayAll > histogram.data 2>&1
    #	real    8m15.623s

    #	create plot of histogram:

    cat << '_EOF_' | gnuplot > histo.png
set terminal png small color x000000 xffffff xc000ff x66ff66 xffff00 x00ffff
set size 1.4, 0.8
set key left box
set grid noxtics
set grid ytics
set title " Human Hg19 Histogram phyloP46way track, all 46 vertebrates"
set xlabel " phyloP46way score, all 46 vertebrates"
set ylabel " Relative Frequency"
set y2label " Cumulative Relative Frequency (CRF)"
set y2range [0:1]
set y2tics
set yrange [0:0.04]
set xrange [-2:2]

plot "histogram.data" using 2:5 title " RelFreq" with impulses, \
        "histogram.data" using 2:7 axes x1y2 title " CRF" with lines
'_EOF_'
    #	<< happy emacs

    display histo.png &

    ######################   Running the primates  #######################
    mkdir /hive/data/genomes/hg19/bed/multiz46way/consPhyloP/primates
    cd /hive/data/genomes/hg19/bed/multiz46way/consPhyloP/primates
    rm -fr wigFix
    mkdir wigFix

    gensub2 ../run.phyloP/ss.list single ../run.phyloP/template jobList
    para create jobList
    para try ... check ... push ... etc ...

    para time
# Completed: 3186 of 3186 jobs
# CPU time in finished jobs:     447177s    7452.95m   124.22h    5.18d  0.014 y
# IO & Wait Time:                 36673s     611.22m    10.19h    0.42d  0.001 y
# Average job time:                 152s       2.53m     0.04h    0.00d
# Longest finished job:             279s       4.65m     0.08h    0.00d
# Submission to last job:          4849s      80.82m     1.35h    0.06d

    cd /cluster/data/hg18/bed/multiz46way/consPhyloP/run.phyloP/primates
    find ./wigFix -type f \
	| sed -e "s#^./##; s/\./ /g; s/-/ - /g" \
	| sort -k1,1 -k3,3n -k4,4n | sed -e "s/ - /-/g; s/ /./g" > wigFile.list
    cat wigFile.list | xargs cat \
	| wigEncode stdin phyloP46wayPrimates.wig phyloP46wayPrimates.wib \
	> wigEncode.log 2>&1 &
    #	Converted stdin, upper limit 0.65, lower limit -9.12
    cat wigFile.list | xargs cat \
	| wigToBigWig stdin ../../../../chrom.sizes phyloP46wayPrimates.bw
    #	if you wanted to use the bigWig file, loading bigWig table:
    ln -s `pwd`/phyloP46wayPrimates.bw /gbdb/hg19/bbi
    hgsql hg19 -e 'drop table if exists phyloP46wayPrimates; \
            create table phyloP46wayPrimates \
		(fileName varchar(255) not null); \
            insert into phyloP46wayPrimates values
	("/gbdb/hg19/bbi/phyloP46wayPrimates.bw");'

    #	loading the wiggle table:
    ln -s `pwd`/phyloP46wayPrimates.wib /gbdb/hg19/multiz46way
    time nice -n +19 hgLoadWiggle -pathPrefix=/gbdb/hg19/multiz46way hg19 \
	phyloP46wayPrimates phyloP46wayPrimates.wig

    #	create download files:
    mkdir downloads
    time ../all/mkDown.csh
    #	real    18m44.186s
    time gzip downloads/chr*.wigFix
    #	real    32m11.461s

    wigTableStats.sh hg19 phyloP46wayPrimates
# db.table      min max mean count
# hg19.phyloP46wayPrimates        -9.065 0.655 0.0448196 2845303719
#	sumData stdDev viewLimits
#	1.27525e+08 0.600051 viewLimits=-2.95544:0.655
    #	that range is: 9.065+0.655 = 9.720

    #  Create histogram to get an overview of all the data
    time nice -n +19 hgWiggle -doHistogram \
	-hBinSize=0.00972 -hBinCount=1000 -hMinVal=-9.065 -verbose=2 \
	    -db=hg19 phyloP46wayPrimates > histogram.data 2>&1
    #	real    8m15.623s

    #	create plot of histogram:

    cat << '_EOF_' | gnuplot > histo.png
set terminal png small color x000000 xffffff xc000ff x66ff66 xffff00 x00ffff
set size 1.4, 0.8
set key left box
set grid noxtics
set grid ytics
set title " Human Hg19 Histogram phyloP46wayPrimates track"
set xlabel " phyloP46wayPrimates score"
set ylabel " Relative Frequency"
set y2label " Cumulative Relative Frequency (CRF)"
set y2range [0:1]
set y2tics
set yrange [0:0.03]
set xrange [-2:0.655]

plot "histogram.data" using 2:5 title " RelFreq" with impulses, \
        "histogram.data" using 2:7 axes x1y2 title " CRF" with lines
'_EOF_'
    #	<< happy emacs

    display histo.png &

    ######################   Running the placentals  #######################
    mkdir /hive/data/genomes/hg19/bed/multiz46way/consPhyloP/placentals
    cd /hive/data/genomes/hg19/bed/multiz46way/consPhyloP/placentals
    rm -fr wigFix
    mkdir wigFix

    gensub2 ../run.phyloP/ss.list single ../run.phyloP/template jobList
    para create jobList
    para try ... check ... push ... etc ...
    para time
# Completed: 3186 of 3186 jobs
# CPU time in finished jobs:    1582989s   26383.14m   439.72h   18.32d  0.050 y
rY.phyloP46way.placental.wigFix.gz
# IO & Wait Time:                 25577s     426.29m     7.10h    0.30d  0.001 y
# Average job time:                 505s       8.41m     0.14h    0.01d
# Longest finished job:             768s      12.80m     0.21h    0.01d
# Submission to last job:         12967s     216.12m     3.60h    0.15d

    cd /cluster/data/hg18/bed/multiz46way/consPhyloP/run.phyloP/placental
    find ./wigFix -type f \
	| sed -e "s#^./##; s/\./ /g; s/-/ - /g" \
	| sort -k1,1 -k3,3n -k4,4n | sed -e "s/ - /-/g; s/ /./g" > wigFile.list
    cat wigFile.list | xargs cat \
	| wigEncode stdin phyloP46wayPlacental.wig phyloP46wayPlacental.wib \
	> wigEncode.log 2>&1 &
    #	Converted stdin, upper limit 2.95, lower limit -13.28
    cat wigFile.list | xargs cat \
	| wigToBigWig stdin ../../../../chrom.sizes phyloP46wayPlacental.bw

    #	loading bigWig table:
    ln -s `pwd`/phyloP46wayPlacental.bw /gbdb/hg19/bbi
    hgsql hg19 -e 'drop table if exists phyloP46wayPlacental; \
            create table phyloP46wayPlacental \
		(fileName varchar(255) not null); \
            insert into phyloP46wayPlacental values
	("/gbdb/hg19/bbi/phyloP46wayPlacental.bw");'

    #	loading the wiggle table:
    ln -s `pwd`/phyloP46wayPlacental.wib /gbdb/hg19/multiz46way
    time hgLoadWiggle
    time nice -n +19 hgLoadWiggle -pathPrefix=/gbdb/hg19/multiz46way hg19 \
	phyloP46wayPlacental phyloP46wayPlacental.wig

    #	create download files:
    mkdir downloads
    time ../all/mkDown.csh
    #	real    18m52.778s
    time gzip downloads/chr*.wigFix
    #	real    46m55.550s

    wigTableStatus.sh hg19 phyloP46wayPlacental
# db.table      min max mean count sumData stdDev viewLimits
# hg19.phyloP46wayPlacental -13.796 2.941 0.0359345 2845303719 1.02245e+08
#	stdDev viewLimits
#	0.779426 viewLimits=-3.86119:2.941
    #	that range is: 13.796+2.941 = 16.737

    #  Create histogram to get an overview of all the data
    time nice -n +19 hgWiggle -doHistogram \
	-hBinSize=0.016737 -hBinCount=1000 -hMinVal=-13.796 -verbose=2 \
	    -db=hg19 phyloP46wayPlacental > histogram.data 2>&1
    #	real    8m15.623s

    #	create plot of histogram:

    cat << '_EOF_' | gnuplot > histo.png
set terminal png small color x000000 xffffff xc000ff x66ff66 xffff00 x00ffff
set size 1.4, 0.8
set key left box
set grid noxtics
set grid ytics
set title " Human Hg19 Histogram phyloP46wayPlacental track"
set xlabel " phyloP46wayPlacental score"
set ylabel " Relative Frequency"
set y2label " Cumulative Relative Frequency (CRF)"
set y2range [0:1]
set y2tics
set yrange [0:0.03]
set xrange [-2.5:2.5]

plot "histogram.data" using 2:5 title " RelFreq" with impulses, \
        "histogram.data" using 2:7 axes x1y2 title " CRF" with lines
'_EOF_'
    #	<< happy emacs

    display histo.png &

#########################################################################
# LASTZ Zebrafish DanRer6 (DONE - 2009-07-08,10 - Hiram)
    mkdir /hive/data/genomes/hg19/bed/lastzDanRer6.2009-07-08
    cd /hive/data/genomes/hg19/bed/lastzDanRer6.2009-07-08

    cat << '_EOF_' > DEF
# human vs X. zebrafish
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_Q=/scratch/data/blastz/HoxD55.q

# TARGET: Human hg19
SEQ1_DIR=/scratch/data/hg19/nib
SEQ1_LEN=/scratch/data/hg19/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000

# QUERY: Zebrafish danRer6
SEQ2_DIR=/scratch/data/danRer6/danRer6.2bit
SEQ2_LEN=/scratch/data/danRer6/chrom.sizes
SEQ2_CHUNK=20000000
SEQ2_LAP=0
SEQ2_LIMIT=40

BASE=/hive/data/genomes/hg19/bed/lastzDanRer6.2009-07-08
TMPDIR=/scratch/tmp
'_EOF_'
    # << happy emacs

    #	establish a screen to control this job
    screen
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
	`pwd`/DEF \
	-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \
	-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
	> do.log 2>&1 &
    #	real    1678m17.827s
    #	failed during the chain step due to encodek cluster problems
    #	finish that manually, then:
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
	`pwd`/DEF \
	-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \
	-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
	-continue=chainMerge > chainMerge.log 2>&1 &
    #	real    167m6.930s
    cat fb.hg19.chainDanRer6Link.txt 
    #	88391631 bases of 2897316137 (3.051%) in intersection

    #	running the swap - DONE - 2009-06-02
    mkdir /hive/data/genomes/danRer6/bed/blastz.hg19.swap
    cd /hive/data/genomes/danRer6/bed/blastz.hg19.swap
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
	/hive/data/genomes/hg19/bed/lastzDanRer6.2009-07-08/DEF \
	-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \
	-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
	-swap > swap.log 2>&1 &
    #	real    183m21.102s
    cat fb.danRer6.chainHg19Link.txt 
    #	96424507 bases of 1506896106 (6.399%) in intersection

##############################################################################
# LASTZ Elephant LoxAfr3 (DONE - 2009-07-21,23 - Hiram)
    mkdir /hive/data/genomes/hg19/bed/lastzLoxAfr3.2009-07-21
    cd /hive/data/genomes/hg19/bed/lastzLoxAfr3.2009-07-21

    cat << '_EOF_' > DEF
# Human vs. Elephant
BLASTZ_M=50

# TARGET: Human Hg19
SEQ1_DIR=/scratch/data/hg19/nib
SEQ1_LEN=/scratch/data/hg19/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000

# QUERY: Elephant
SEQ2_DIR=/scratch/data/loxAfr3/loxAfr3.2bit
SEQ2_LEN=/scratch/data/loxAfr3/chrom.sizes
SEQ2_CHUNK=20000000
SEQ2_LIMIT=50
SEQ2_LAP=0

BASE=/hive/data/genomes/hg19/bed/lastzLoxAfr3.2009-07-21
TMPDIR=/scratch/tmp
'_EOF_'
    # << happy emacs

    #	establish a screen to control this job
    screen
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
	`pwd`/DEF \
	-noLoadChainSplit -chainMinScore=3000 -chainLinearGap=medium \
	-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
	> do.log 2>&1 &
    #	real    317m32.664s
    #	broken when it went to chaining on encodek, finish the chain then:
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
	`pwd`/DEF \
	-noLoadChainSplit -chainMinScore=3000 -chainLinearGap=medium \
	-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
	-continue=chainMerge > chainMerge.log 2>&1 &
    #	real    217m25.159s

    # time about 3h23m
    cat fb.hg19.chainLoxAfr3Link.txt 
    #	1351200080 bases of 2897316137 (46.636%) in intersection

    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
	`pwd`/DEF \
	-syntenicNet -continue=syntenicNet -stop=syntenicNet \
	-noLoadChainSplit -chainMinScore=3000 -chainLinearGap=medium \
	-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
	> synNet.log 2>&1 &
    #	real    32m40.554s

    time doRecipBest.pl -buildDir=`pwd` hg19 loxAfr3 > rbest.log 2>&1
    #	real    184m3.435s

    mkdir /hive/data/genomes/loxAfr3/bed/blastz.hg19.swap
    cd /hive/data/genomes/loxAfr3/bed/blastz.hg19.swap
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
	/hive/data/genomes/hg19/bed/lastzLoxAfr3.2009-07-21/DEF \
	-noLoadChainSplit -chainMinScore=3000 -chainLinearGap=medium \
	-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
	-swap > swap.log 2>&1 &
    #	real    220m16.839s
    cat fb.loxAfr3.chainHg19Link.txt 
    #	1323201500 bases of 3118565340 (42.430%) in intersection

##############################################################################
# TRANSMAP vertebrate.2009-07-01 build  (2009-07-21 markd)

vertebrate-wide transMap alignments were built  Tracks are created and loaded
by a single Makefile. This is available from:
   svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2009-07-01

see doc/builds.txt for specific details.

############################################################################
# AGILENT PROBES LIFTED FROM HG18 (DONE, 2009-07-28 Andy)

ssh hgwdev
bash
mkdir /hive/data/genomes/hg19/bed/agilentProbes
cd /hive/data/genomes/hg19/bed/agilentProbes
for table in `echo show tables like \'agilent%\' | hgsql hg18 | tail -n +2 | grep -v Probe`; do
    echo $table; echo "select * from $table" | hgsql hg18 | \
        tail -n +2 | cut -f2- > ${table}.hg18.bed; liftOver ${table}.hg18.bed \
          /gbdb/hg18/liftOver/hg18ToHg19.over.chain.gz ${table}.hg19.{bed,unmapped};
    hgLoadBed hg19 $table ${table}.hg19.bed; 
    echo done with $table; 
done
for unmap in *.unmapped; do
   table=${unmap%.hg19.unmapped}
   grep Deleted -A1 $unmap | grep -v Deleted | grep -v "^--" > agilentProbesHg18Unmapped/${table}.deleted.bed
   grep Split -A1 $unmap | grep -v Split | grep -v "^--" > agilentProbesHg18Unmapped/${table}.split.bed
   grep Partially -A1 $unmap | grep -v Partially | grep -v "^--" > agilentProbesHg18Unmapped/${table}.partiallyDeleted.bed
done
find agilentProbesHg18Unmapped/ -size 0b | xargs rm
rm *hg18.bed *.unmapped bed.tab
gzip *.bed
tar cfz agilentProbesHg18Unmapped.tar.gz agilentProbesHg18Unmapped
cd /usr/local/apache/htdocs/goldenPath/hg19
mkdir agilentProbes
cd agilentProbes/
ln -s /hive/data/genomes/hg19/bed/agilentProbes/agilentProbesHg18Unmapped beds
ln -s /hive/data/genomes/hg19/bed/agilentProbes/agilentProbesHg18Unmapped.tar.gz

##############################################################################
# LASTZ Tetraodon TetNig2 (DONE - 2009-08-10,11 - Hiram)
    #	This is the incorrect date/time stamp on this directory,
    #	it should be 2009-08-10
    mkdir /hive/data/genomes/hg19/bed/lastzTetNig2.2009-10-10
    cd /hive/data/genomes/hg19/bed/lastzTetNig2.2009-10-10

    cat << '_EOF_' > DEF
# human vs tetraodon
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_M=50
BLASTZ_Q=/scratch/data/blastz/HoxD55.q

# TARGET: Human Hg19
SEQ1_DIR=/scratch/data/hg19/hg19.2bit
SEQ1_LEN=/scratch/data/hg19/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
SEQ1_LIMIT=5

# QUERY: Tetraodon TetNig2 - single chunk big enough to single largest item
SEQ2_DIR=/scratch/data/tetNig2/tetNig2.2bit
SEQ2_LEN=/scratch/data/tetNig2/chrom.sizes
SEQ2_CTGDIR=/scratch/data/tetNig2/tetNig2.contigs.2bit
SEQ2_CTGLEN=/scratch/data/tetNig2/tetNig2.contigs.sizes
SEQ2_LIFT=/scratch/data/tetNig2/tetNig2.contigs.lift
SEQ2_CHUNK=20000000
SEQ2_LAP=0
SEQ2_LIMIT=50

BASE=/hive/data/genomes/hg19/bed/lastzTetNig2.2009-10-10
TMPDIR=/scratch/tmp
'_EOF_'
    # << happy emacs

    #	establish a screen to control this job
    screen
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
	`pwd`/DEF \
	-qRepeats=windowmaskerSdust \
	-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \
	-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
	> do.log 2>&1 &
    #	real    220m36.068s
    #	forgot the qRepeats for tetNig2
    rm axtChain/hg19.tetNig2.net
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
	`pwd`/DEF \
	-continue=load -qRepeats=windowmaskerSdust \
	-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \
	-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
	> load.log 2>&1 &
    #	real    5m53.096s
    cat fb.hg19.chainTetNig2Link.txt 
    #	49611132 bases of 2897316137 (1.712%) in intersection

    #	running the swap
    mkdir /hive/data/genomes/tetNig2/bed/blastz.hg19.swap
    cd /hive/data/genomes/tetNig2/bed/blastz.hg19.swap
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
	/hive/data/genomes/hg19/bed/lastzTetNig2.2009-10-10/DEF \
	-qRepeats=windowmaskerSdust \
	-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \
	-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
	-swap > swap.log 2>&1 &
    #	real    13m21.591s
    #	forgot the qRepeats for tetNig2
    rm axtChain/tetNig2.hg19.net
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
	/hive/data/genomes/hg19/bed/lastzTetNig2.2009-10-10/DEF \
	-continue=load -qRepeats=windowmaskerSdust \
	-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \
	-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
	-swap > load.log 2>&1 &
    #	real    4m7.559s
    cat fb.tetNig2.chainHg19Link.txt 
    #	42910930 bases of 302314788 (14.194%) in intersection


##############################################################################
# dbSNP BUILD 130 - PROVISIONAL REMAPPING TO BUILD 37 (DONE 8/28/09 angie)
    # /hive/data/outside/dbSNP/130/ was already set up during the hg18 run --
    # just add hg19 coord files and go from there.
    cd /hive/data/outside/dbSNP/130/human/data
    alias wg wget --timestamping
    set ftpSnpDb = ftp://ftp.ncbi.nih.gov/snp/organisms/human_9606/misc/exchange
    # These are provisional files in an ad-hoc format.
    wg $ftpSnpDb/README.txt
    wg $ftpSnpDb/Remap_36_3_37_1.info
    wg $ftpSnpDb/Remap_36_3_37_1.txt.gz
    mv README.txt Remap_36_3_37_1_README
    zcat Remap_36_3_37_1.txt.gz | wc -l
#18823990

    # Use the remapping to transform ../ucscNcbiSnp.bed into one for hg19.
    # Useful columns, 1-based: 1=ID, 3=oldChr, 4=oldStart, 5=oldEnd,
    # 10=newChr, 11=newStart, 12=newEnd, 13=newLocType, 14=newWeight, 16=newStrand
    # For mappings to chr*_random, oldStart and oldEnd are empty -- skip.
    # Sort both hg18 snp file and remap file by {rsID,chr,start} to keep them in sync.
    mkdir /hive/data/outside/dbSNP/130/human/hg19
    cd /hive/data/outside/dbSNP/130/human/hg19
    sort -k4n,4n -k1,1 -k2n,2n ../ucscNcbiSnp.bed > /data/tmp/hg18.ucscNcbiSnp.idSorted.bed
    zcat ../data/Remap_36_3_37_1.txt.gz \
    | sort -t "	" -k1n,1n -k3,3 -k4n,4n \
      > /data/tmp/Remap_36_3_37_1.txt
    perl -we \
      'use strict; \
       sub nextMap { \
         my ($rsId, undef, $oChr, $oStart, $oEnd, $nChr, $nStart, $nEnd, \
             $nLocType, $nWt, $nRef, $nStr);\
         do { \
           ($rsId, undef, $oChr, $oStart, $oEnd, undef,undef,undef,undef, \
               $nChr, $nStart, $nEnd, $nLocType, $nWt, $nRef, $nStr) = split("\t", <>); \
           if (defined $nStr) { \
             chomp $nStr; $nStr =~ tr/+-/01/; $oChr = "chr$oChr";  $nChr = "chr$nChr"; \
           } \
           $oStart--;  $oEnd--;  $nStart--;  $nEnd--;  # Yep. 0-based closed vs 1-based closed \
         } while (defined $nStr && ($oEnd < 0 || $nChr eq "chrUn")); \
         return ($rsId, $oChr, $oStart, $oEnd, $nChr, $nStart, $nEnd, \
                 $nLocType, $nWt, $nRef, $nStr); \
       } # nextMap \
       my ($rsId, $oChr, $oStart, $oEnd, $nChr, $nStart, $nEnd, $nLocType, $nWt, $nRef, $nStr) = \
         &nextMap(); \
       my ($rCount, $oCount, $tCount) = 0; \
       open(my $oldF, "/data/tmp/hg18.ucscNcbiSnp.idSorted.bed") || die; \
       while (my ($chr, $s, $e, $id, $str, $rn,$obs,$mt,$cn,$vn,$ah,$ahse,$fc,$lt,$wt) = \
              split("\t", <$oldF>)) { \
         my $thisRCount = 0; \
         while (defined $oChr && $chr eq $oChr && $s == $oStart && $e == $oEnd && $id == $rsId) { \
           print join("\t", $nChr,$nStart,$nEnd,$id,$nStr,$nRef,$obs,$mt,$cn,$vn,$ah,$ahse,$fc, \
                            $nLocType,$nWt,$nStart) \
                      . "\n"; \
           ($rsId, $oChr, $oStart, $oEnd, $nChr, $nStart, $nEnd, $nLocType, $nWt, $nRef, $nStr) = \
             &nextMap(); \
           $thisRCount++; \
         } \
         if (defined $rsId && $id > $rsId) {warn "Slipped a cog"; last;} \
         $tCount += $thisRCount; \
         $rCount++ if ($thisRCount > 0); \
         $oCount++; \
       } \
       close($oldF);  print STDERR "Replaced $rCount of $oCount inputs ($tCount outputs).\n";' \
      /data/tmp/Remap_36_3_37_1.txt \
    | sort -k1,1 -k2n,2n -k4,4 \
    > /data/tmp/hg19.ucscNcbiSnp.bed
#Replaced 18693260 of 19189750 inputs (18697579 outputs).
#504.562u 27.037s 8:59.57 98.5%  0+0k 0+0io 0pf+0w
    wc -l /data/tmp/hg19.ucscNcbiSnp.bed
#  18697579 /data/tmp/hg19.ucscNcbiSnp.bed

    # Drum roll please... translate NCBI's encoding into UCSC's, and
    # perform a bunch of checks.  This is where developer involvement
    # is most likely as NCBI extends the encodings used in dbSNP.
    cd /hive/data/outside/dbSNP/130/human/hg19
    snpNcbiToUcsc /data/tmp/hg19.ucscNcbiSnp.bed /hive/data/genomes/hg19/hg19.2bit \
      -1000GenomesRsIds=../data/1000GenomesRsIds.txt snp130
#spaces stripped from observed:
#chr12   6093134 6093134 rs41402545
#Line 8049395 of /data/tmp/hg19.ucscNcbiSnp.bed: Encountered something that doesn't fit observedMixedFormat: GCAACTTCA
#count of snps with weight  0 = 0
#count of snps with weight  1 = 17042465
#count of snps with weight  2 = 345274
#count of snps with weight  3 = 1017906
#count of snps with weight 10 = 291934
#Skipped 1496 snp mappings due to errors -- see snp130Errors.bed
#146.837u 9.867s 4:21.63 59.8%   0+0k 0+0io 0pf+0w
    # Comparable to hg18.snp130, with some losses due to coord translation, loss of _randoms,
    # and 1496 errors (new locType or refNCBI inconsistent with new size).
    expr 18697579 - 291934 - 1496
#18404149

    # Move hg19.ucscNcbiSnp.bed from fast tmp to slow (today) hive:
    gzip /data/tmp/hg19.ucscNcbiSnp.bed
    mv /data/tmp/hg19.ucscNcbiSnp.bed.gz hg19.ucscNcbiSnp.bed.gz

    # Will try not reuse hg18.snp130's giant 18G fasta file, not duplicate.

    # Load up main track tables.
    cd /hive/data/outside/dbSNP/130/human/hg19
    hgLoadBed -tab -tmpDir=/data/tmp -allowStartEqualEnd \
      hg19 snp130 -sqlTable=snp130.sql snp130.bed
#Loaded 18404149 elements of size 17
#115.086u 21.663s 2:32:09.98 1.4%        0+0k 0+0io 1pf+0w
#that is freakishly long -- lots happening today w/db move, hive recovery,...
    hgLoadBed -tab -onServer -tmpDir=/data/tmp -allowStartEqualEnd \
      hg19 snp130Exceptions -sqlTable=$HOME/kent/src/hg/lib/snp125Exceptions.sql -renameSqlTable \
      snp130Exceptions.bed
#Loaded 1982828 elements of size 5
#10.500u 0.851s 1:13.42 15.4%    0+0k 0+0io 0pf+0w
    hgLoadSqlTab hg19 snp130ExceptionDesc ~/kent/src/hg/lib/snp125ExceptionDesc.sql \
      snp130ExceptionDesc.tab
    # Load up sequences *from hg18 file*:
    hgLoadSqlTab hg19 snp130Seq ~/kent/src/hg/lib/snpSeq.sql ../snp130Seq.tab

    # Put in a link where one would expect to find the track build dir...
    ln -s /hive/data/outside/dbSNP/130/human/hg19 /hive/data/genomes/hg19/bed/snp130

    # Look at the breakdown of exception categories:
    cd /hive/data/outside/dbSNP/130/human/hg19
    cut -f 5 snp130Exceptions.bed | sort | uniq -c | sort -nr
#1350217 MultipleAlignments
# 495981 ObservedMismatch
#  37603 ObservedTooLong
#  26855 SingleClassTriAllelic
#  24443 FlankMismatchGenomeShorter
#  17927 SingleClassLongerSpan
#  13685 SingleClassZeroSpan
#   6238 FlankMismatchGenomeLonger
#   3016 DuplicateObserved
#   2851 SingleClassQuadAllelic
#   1777 MixedObserved
#   1264 NamedDeletionZeroSpan
#    508 FlankMismatchGenomeEqual
#    329 NamedInsertionNonzeroSpan
#    121 ObservedContainsIupac
#     11 RefAlleleMismatch
#      2 ObservedWrongFormat

#TODO: go through those above (esp snp130Errors.bed) and send some bug reports to dbSNP.


##############################################################################
# ORTHOLOGOUS ALLELES IN CHIMP AND MACAQUE FOR SNP130 (DONE 8/31/09 angie)
    mkdir /hive/data/genomes/hg19/bed/snp130Ortho
    cd /hive/data/genomes/hg19/bed/snp130Ortho

    # Following Heather's lead in snp126orthos, filter SNPs to to keep
    # only those with class=single, length=1, chrom!~random;
    # Exclude those with exceptions MultipleAlignments,
    # SingleClassTriAllelic or SingleClassQuadAllelic.
    # Unlike snp masking, we do not filter for weight -- don't know why.
    awk '$5 ~ /^MultipleAlignments|SingleClassTriAllelic|SingleClassQuadAllelic/ {print $4;}' \
      /hive/data/outside/dbSNP/130/human/hg19/snp130Exceptions.bed \
    | sort -u \
      > snp130ExcludeIds.txt
    awk '$3-$2 == 1 && $1 !~ /_random/ && $11 == "single" {print;}' \
      /hive/data/outside/dbSNP/130/human/hg19/snp130.bed \
    | grep -vFwf snp130ExcludeIds.txt \
      > snp130Simple.bed
#203.193u 9.197s 2:57.40 119.7%  0+0k 0+0io 0pf+0w
    wc -l snp130Simple.bed
#12278514 snp130Simple.bed

    # Glom all human info that we need for the final table onto the
    # name, to sneak it through liftOver: rsId|chr|start|end|obs|ref|strand
    awk 'BEGIN{OFS="\t";} \
        {print $1, $2, $3, \
               $4 "|" $1 "|" $2 "|" $3 "|" $9 "|" $8 "|" $6, \
               0, $6;}' \
      snp130Simple.bed > snp130ForLiftOver.bed
    # Map coords to chimp using liftOver.
    # I don't know why chimp took so much longer than macaque... the
    # chimp .over has fewer chains and fewer bytes than the macaque .over.
    mkdir run.liftOChimp
    cd run.liftOChimp
    mkdir split out
    splitFile ../snp130ForLiftOver.bed 25000 split/chunk
    cp /dev/null jobList
    foreach f (split/chunk*)
      echo liftOver $f \
        /hive/data/genomes/hg19/bed/liftOver/hg19ToPanTro2.over.chain.gz \
        \{check out exists out/panTro2.$f:t.bed\} out/hg19.$f:t.unmapped \
        >> jobList
    end
    ssh swarm
    cd /hive/data/genomes/hg19/bed/snp130Ortho/run.liftOChimp
    para make jobList
#Completed: 492 of 492 jobs
#CPU time in finished jobs:      51793s     863.22m    14.39h    0.60d  0.002 y
#IO & Wait Time:                  3825s      63.75m     1.06h    0.04d  0.000 y
#Average job time:                 113s       1.88m     0.03h    0.00d
#Longest finished job:             286s       4.77m     0.08h    0.00d
#Submission to last job:           300s       5.00m     0.08h    0.00d

    # Map coords to orangutan using liftOver.
    mkdir ../run.liftOPon
    cd ../run.liftOPon
    mkdir out
    ln -s ../run.liftOChimp/split .
    cp /dev/null jobList
    foreach f (split/chunk*)
      echo liftOver $f \
        /hive/data/genomes/hg19/bed/liftOver/hg19ToPonAbe2.over.chain.gz \
        \{check out exists out/ponAbe2.$f:t.bed\} out/hg19.$f:t.unmapped \
        >> jobList
    end
    para make jobList
#Completed: 492 of 492 jobs
#CPU time in finished jobs:     125656s    2094.26m    34.90h    1.45d  0.004 y
#IO & Wait Time:                  5413s      90.22m     1.50h    0.06d  0.000 y
#Average job time:                 266s       4.44m     0.07h    0.00d
#Longest finished job:             646s      10.77m     0.18h    0.01d
#Submission to last job:           649s      10.82m     0.18h    0.01d

    # Map coords to macaque using liftOver.
    mkdir ../run.liftOMac
    cd ../run.liftOMac
    mkdir out
    ln -s ../run.liftOChimp/split .
    cp /dev/null jobList
    foreach f (split/chunk*)
      echo liftOver $f \
        /hive/data/genomes/hg19/bed/liftOver/hg19ToRheMac2.over.chain.gz \
        \{check out exists out/rheMac2.$f:t.bed\} out/hg19.$f:t.unmapped \
        >> jobList
    end
    para make jobList
#Completed: 492 of 492 jobs
#CPU time in finished jobs:     161612s    2693.54m    44.89h    1.87d  0.005 y
#IO & Wait Time:                  6218s     103.63m     1.73h    0.07d  0.000 y
#Average job time:                 341s       5.69m     0.09h    0.00d
#Longest finished job:             727s      12.12m     0.20h    0.01d
#Submission to last job:           739s      12.32m     0.21h    0.01d

    cd /hive/data/genomes/hg19/bed/snp130Ortho
    # Concatenate the chimp results, sorting by chimp pos in order to
    # efficiently access 2bit sequence in getOrthoSeq.  The output of
    # that is then sorted by the glommed human info field, so that we
    # can use join to combine chimp and macaque results in the next step.
    # Ditto for macaque and orangutan.  Each command pipe takes ~5 minutes:
    sort -k1,1 -k2n,2n run.liftOChimp/out/panTro2.chunk*.bed \
    | ~/kent/src/hg/snp/snpLoad/getOrthoSeq.pl /cluster/data/panTro2/panTro2.2bit \
    | sort > panTro2.orthoGlom.txt
    sort -k1,1 -k2n,2n run.liftOPon/out/ponAbe2.chunk*.bed \
    | ~/kent/src/hg/snp/snpLoad/getOrthoSeq.pl /cluster/data/ponAbe2/ponAbe2.2bit \
    | sort > ponAbe2.orthoGlom.txt
    sort -k1,1 -k2n,2n run.liftOMac/out/rheMac2.chunk*.bed \
    | ~/kent/src/hg/snp/snpLoad/getOrthoSeq.pl /cluster/data/rheMac2/rheMac2.2bit \
    | sort > rheMac2.orthoGlom.txt
    wc -l panTro2.orthoGlom.txt ponAbe2.orthoGlom.txt rheMac2.orthoGlom.txt
#  11428526 panTro2.orthoGlom.txt
#  10861969 ponAbe2.orthoGlom.txt
#   9694237 rheMac2.orthoGlom.txt

    # Use the glommed name field as a key to join up chimp and macaque
    # allele data.  Include glommed name from both files because if only
    # file 2 has a line for the key in 2.1, then 1.1 is empty.  Then plop
    # in the orthoGlom fields from each file, which are in the same order
    # as the chimp and macaque columns of snp130OrthoPanTro2RheMac2.
    join -o '1.1 2.1 1.2 1.3 1.4 1.5 1.6 2.2 2.3 2.4 2.5 2.6' \
      -a 1 -a 2 -e '?' \
      panTro2.orthoGlom.txt ponAbe2.orthoGlom.txt \
    | awk '{if ($1 != "?") { print $1, $3,$4,$5,$6,$7,$8,$9,$10,$11,$12; } \
            else           { print $2, $3,$4,$5,$6,$7,$8,$9,$10,$11,$12; }}' \
      > tmp.txt
    join -o '1.1 2.1 1.2 1.3 1.4 1.5 1.6 1.7 1.8 1.9 1.10 1.11 2.2 2.3 2.4 2.5 2.6' \
      -a 1 -a 2 -e '?' \
      tmp.txt rheMac2.orthoGlom.txt \
    | perl -wpe 'chomp; \
        ($glom12, $glom3, $o1Chr, $o1Start, $o1End, $o1Al, $o1Strand, \
         $o2Chr, $o2Start, $o2End, $o2Al, $o2Strand, \
         $o3Chr, $o3Start, $o3End, $o3Al, $o3Strand) = split; \
        $glomKey = ($glom12 ne "?") ? $glom12 : $glom3; \
        ($rsId, $hChr, $hStart, $hEnd, $hObs, $hAl, $hStrand) = \
          split(/\|/, $glomKey); \
        $o1Start =~ s/^\?$/0/;  $o2Start =~ s/^\?$/0/;  $o3Start =~ s/^\?$/0/; \
        $o1End   =~ s/^\?$/0/;  $o2End   =~ s/^\?$/0/;  $o3End   =~ s/^\?$/0/; \
        print join("\t", $hChr, $hStart, $hEnd, $rsId, $hObs, $hAl, $hStrand, \
                         $o1Chr, $o1Start, $o1End, $o1Al, $o1Strand, \
                         $o2Chr, $o2Start, $o2End, $o2Al, $o2Strand, \
                         $o3Chr, $o3Start, $o3End, $o3Al, $o3Strand) . "\n"; \
        s/^.*$//;' \
    | sort -k1,1 -k2n,2n > snp130OrthoPt2Pa2Rm2.bed
#304.434u 27.118s 4:31.30 122.2% 0+0k 0+0io 0pf+0w
    wc -l snp130OrthoPt2Pa2Rm2.bed
#11876029 snp130OrthoPt2Pa2Rm2.bed

    cd /hive/data/genomes/hg19/bed/snp130Ortho
    hgLoadBed -tab -onServer -tmpDir=/data/tmp -renameSqlTable \
      -sqlTable=$HOME/kent/src/hg/lib/snpOrthoPanPonRhe.sql \
      hg19 snp130OrthoPt2Pa2Rm2 snp130OrthoPt2Pa2Rm2.bed
#Loaded 11876029 elements of size 22
#75.442u 8.828s 9:50.27 14.2%    0+0k 0+0io 0pf+0w

    # Cleanup fileserver:
    cd /hive/data/genomes/hg19/bed/snp130Ortho
    gzip snp130Simple.bed snp130ExcludeIds.txt snp130ForLiftOver.bed &
    rm -r run*/split tmp.txt *.orthoGlom.txt

##############################################################################
# LASTZ Rabbit OryCun2 (DONE - 2009-08-12 - Hiram)
    mkdir /hive/data/genomes/hg19/bed/lastzOryCun2.2009-08-12
    cd /hive/data/genomes/hg19/bed/lastzOryCun2.2009-08-12

    cat << '_EOF_' > DEF
# Human vs. Rabbit
BLASTZ_M=50

# TARGET: Human Hg19
SEQ1_DIR=/scratch/data/hg19/nib
SEQ1_LEN=/scratch/data/hg19/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000

# QUERY: Rabbit at chunk 20,000,000 all but 36 contigs can fit in a single job
SEQ2_DIR=/scratch/data/oryCun2/oryCun2.2bit
SEQ2_LEN=/scratch/data/oryCun2/chrom.sizes
SEQ2_CTGDIR=/scratch/data/oryCun2/oryCun2.contigs.2bit
SEQ2_CTGLEN=/scratch/data/oryCun2/oryCun2.contigs.sizes
SEQ2_LIFT=/hive/data/genomes/oryCun2/contigs/oryCun2.contigs.lift
SEQ2_CHUNK=20000000
SEQ2_LIMIT=400
SEQ2_LAP=0

BASE=/hive/data/genomes/hg19/bed/lastzOryCun2.2009-08-12
TMPDIR=/scratch/tmp
'_EOF_'
    # << happy emacs

    #	establish a screen to control this job
    screen
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
	`pwd`/DEF \
	-noLoadChainSplit -chainMinScore=3000 -chainLinearGap=medium \
	-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
	> do.log 2>&1 &
    #	real    516m41.981s
    cat fb.hg19.chainOryCun2Link.txt 
    #	1283994337 bases of 2897316137 (44.317%) in intersection
    #	should have run syntenicNet in that first run
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
	`pwd`/DEF \
	-noLoadChainSplit -chainMinScore=3000 -chainLinearGap=medium \
	-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
	-continue=syntenicNet -syntenicNet > syntenicNet.log 2>&1 &
    #	about 1 hour

    mkdir /hive/data/genomes/oryCun2/bed/blastz.hg19.swap
    cd /hive/data/genomes/oryCun2/bed/blastz.hg19.swap
    
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
	/hive/data/genomes/hg19/bed/lastzOryCun2.2009-08-12/DEF \
	-noLoadChainSplit -chainMinScore=3000 -chainLinearGap=medium \
	-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
	-swap -syntenicNet > swap.log 2>&1 &
    #	real    176m35.932s
    cat fb.oryCun2.chainHg19Link.txt 
    #	1260477501 bases of 2604023284 (48.405%) in intersection

##############################################################################
# running syntenicNet on CavPor3 lastz (DONE - 2009-08-27 - Hiram)
    cd /hive/data/genomes/hg19/bed/lastzCavPor3.2009-06-04
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
	`pwd`/DEF \
	-noLoadChainSplit -chainMinScore=3000 -chainLinearGap=medium \
	-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
	-continue=syntenicNet -syntenicNet > syntenicNet.log 2>&1 &
    #	about 44 minutes

##############################################################################
# loading the lastz tables on cavPor3 - (DONE - 2009-08-28 - Hiram)
    # the chain.tab and link.tab files are left over from the failed load
    cd  /hive/data/genomes/cavPor3/bed/blastz.hg19.swap/axtChain

    #	find out their sizes, average and total:
    awk '{print length($0)}' chain.tab | ave stdin
Q1 92.000000 median 93.000000 Q3 96.000000
average 93.651267
min 64.000000 max 109.000000
count 27186468
total 2546047186.000000
    awk '{print length($0)}' link.tab | ave stdin
Q1 45.000000 median 47.000000 Q3 48.000000
average 46.731871
min 22.000000 max 52.000000
count 240602108
total 11243786622.000000

    cat << '_EOF_' > chainHg19Link.sql
CREATE TABLE chainHg19Link (
  bin smallint(5) unsigned NOT NULL default 0,
  tName varchar(255) NOT NULL default '',
  tStart int(10) unsigned NOT NULL default 0,
  tEnd int(10) unsigned NOT NULL default 0,
  qStart int(10) unsigned NOT NULL default 0,
  chainId int(10) unsigned NOT NULL default 0,
  KEY tName (tName(13),bin),
  KEY chainId (chainId)
) ENGINE=MyISAM max_rows=241000000 avg_row_length=50 pack_keys=1 CHARSET=latin1;
'_EOF_'
    # << happy emacs
    hgsql cavPor3 < chainHg19Link.sql

    time hgsql -e \
      'load data local infile "link.tab" into table chainHg19Link;' cavPor3
    #	real    405m15.956s

    cd  /hive/data/genomes/cavPor3/bed/blastz.hg19.swap/axtChain

    #	and the net tracks were not loaded:
    time netClass -verbose=0 -noAr noClass.net cavPor3 hg19 cavPor3.hg19.net
    #	real    40m25.078s

    netFilter -minGap=10 cavPor3.hg19.net \
	| hgLoadNet -verbose=0 cavPor3 netHg19 stdin
    # real    33m24.972s (plus the featureBits below)

    featureBits cavPor3 chainHg19Link > fb.cavPor3.chainHg19Link.txt 2>&1
    cat fb.cavPor3.chainHg19Link.txt
    #	1279572660 bases of 2663369733 (48.043%) in intersection

##############################################################################
# DBSNP CODING ANNOTATIONS (DONE 10/12/10 angie)
# Updated 10/12/10 using rebuilt hg18 snp130CodingDbSnp.bed w/corrected coords.
# Originally done 9/1/09

    # Repeat the coord-remapping performed for snp130 on the hg18 coding anno table.
    cd /hive/data/outside/dbSNP/130/human/hg19
    sed -re 's/\trs([0-9]+)\t/\t\1\t/' ../snp130CodingDbSnp.bed \
    | sort -k4n,4n -k1,1 -k2n,2n > /data/tmp/hg18.snp130Coding.idSorted.bed
    # reuse /data/tmp/Remap_36_3_37_1.txt mapping file created for snp130 above,
    # but first translate its coords (1-based fully-closed with 2-base-long insertions)
    # into ours (0-based half-open with 0-base-long insertions) and discard incompletes.
    perl -we \
      'while (my ($rsId, undef, $oChr, $oStart, $oEnd, $oLocType, undef,undef,undef, \
                                $nChr, $nStart, $nEnd, $nLocType) = split("\t", <>)) { \
         next if ($oStart eq "" || $nStart eq ""); \
         $oChr = "chr$oChr";  $nChr = "chr$nChr"; \
         # 2-base-long insertion (loc_type==3) -> 0-base-long: \
         if ($oLocType == 3) { $oEnd--; } else { $oStart--; } \
         if ($nLocType == 3) { $nEnd--; } else { $nStart--; } \
         print join("\t", $rsId, $oChr, $oStart, $oEnd, $nChr, $nStart, $nEnd) . "\n"; \
       }' /data/tmp/Remap_36_3_37_1.txt \
      > /data/tmp/Remap_36_3_37_1_ucscCoords.txt
    # Apply the cleaned-up mapping to id-sorted hg18 snp130CodingDbSnp:
    perl -we \
      'use strict; \
       my ($rsId, $oChr, $oStart, $oEnd, $nChr, $nStart, $nEnd) = split("\t", <>); \
       my ($rCount, $oCount, $tCount) = 0; \
       open(my $oldF, "/data/tmp/hg18.snp130Coding.idSorted.bed") || die; \
       while (my ($chr, $s, $e, $id, $tx, $frm, $alCount, $funcs, $als, $codons, $peps) = \
              split("\t", <$oldF>)) { \
         my $thisRCount = 0; \
         while (defined $rsId && $rsId < $id) { \
           ($rsId, $oChr, $oStart, $oEnd, $nChr, $nStart, $nEnd) = split("\t", <>); \
         } \
         while (defined $oChr && $chr eq $oChr && $s == $oStart && $e == $oEnd && $id == $rsId) { \
           chomp $nEnd; \
           print join("\t", $nChr, $nStart, $nEnd, "rs$id", $tx, $frm, \
                            $alCount, $funcs, $als, $codons, $peps) unless $nEnd < $nStart; \
           ($rsId, $oChr, $oStart, $oEnd, $nChr, $nStart, $nEnd) = split("\t", <>); \
           $thisRCount++; \
         } \
         $tCount += $thisRCount; \
         $rCount++ if ($thisRCount > 0); \
         $oCount++; \
       } \
       close($oldF);  print STDERR "Replaced $rCount of $oCount inputs ($tCount outputs).\n";' \
      /data/tmp/Remap_36_3_37_1_ucscCoords.txt \
    | sort -k1,1 -k2n,2n -k4,4 \
    > /data/tmp/hg19.snp130Coding.bed
#Replaced 197921 of 279815 inputs (198493 outputs).
#35.486u 1.515s 0:36.70 100.7%   0+0k 0+0io 0pf+0w
   hgLoadBed hg19 snp130CodingDbSnp -sqlTable=$HOME/kent/src/hg/lib/snp125Coding.sql \
      -renameSqlTable -tab -notItemRgb -allowStartEqualEnd \
      /data/tmp/hg19.snp130Coding.bed
#Loaded 198459 elements of size 11
    # A bit fewer than reported 198493 above, but we ditched a few with $nEnd < $nStart
    # (corresponding SNPs ended up in snp130Errors.bed not snp130.bed anyway).
    mv /data/tmp/hg19.snp130Coding.bed hg19.snp130CodingDbSnp.bed

############################################################################
# TRANSMAP vertebrate.2009-09-13 build  (2009-09-20 markd)

vertebrate-wide transMap alignments were built  Tracks are created and loaded
by a single Makefile. This is available from:
   svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2009-09-13

see doc/builds.txt for specific details.
############################################################################
# BURGE	LAB DATA MAPPED WITH GEMMAPPER. PROVIDED BY THOMAS DERRIEN FROM RODERIC
# GUIGO'S LAB AT CRG. (E-MAIL: thomas.derrien@crg.es). Data received on
# 09/14/09. 
# (hartera, 2009-09-28, DONE) 
# 2009-12-14, hartera. Set cdsStart = cdsEnd = 0. Moved track data directory to 
# /hive/data/genomes/hg18/bed.
# 2010-01-04, hartera. Change the data to BED format and re-loaded tables. BED
# is more appropriate for this data type.
# The data is too dense in places (feedback from QA) so it would be more
# appropriate to have a Signal track as for the ENCODE RNA-seq data tracks. 
# 2010-02-09, hartera. Create bedGraph Signal subtracks for each tissue/cell
# using reads/per million mapped reads as the data value.
# 2010-02-17, hartera. Updated trackDb.ra entry to include views.
# 2010-02-18, hartera. Loaded the bedGraph tables for the Raw Signal
# subtracks.
# 2010-05-15 and 2010-05-16, hartera. Re-created the Signal subtracks using 
# the -bed12 option of bedItemOverlapCount so that blocks are used. 
 
   mkdir /hive/groups/gencode/browser/hg19/burgeRnaSeqGemMapperAlign
   cd /hive/groups/gencode/browser/hg19/burgeRnaSeqGemMapperAlign

# Added the statements below to a script so that it can be run to fetch
# all the sequences.

wget --timestamping "http://genome.imim.es/~tderrien/Gencode/For_Rachel/GFF_Burge_32bp_Gencode3/GSM325476_brain_HCT168_hg18.fa.GEM_index.GRCh37.gencode.v3.annotation.GRCh37.exon.gtf.length_32.gff.gz"
wget --timestamping "http://genome.imim.es/~tderrien/Gencode/For_Rachel/GFF_Burge_32bp_Gencode3/GSM325477_liver_HCT169_hg18.fa.GEM_index.GRCh37.gencode.v3.annotation.GRCh37.exon.gtf.length_32.gff.gz"
wget --timestamping "http://genome.imim.es/~tderrien/Gencode/For_Rachel/GFF_Burge_32bp_Gencode3/GSM325478_heart_HCT170_hg18.fa.GEM_index.GRCh37.gencode.v3.annotation.GRCh37.exon.gtf.length_32.gff.gz"
wget --timestamping "http://genome.imim.es/~tderrien/Gencode/For_Rachel/GFF_Burge_32bp_Gencode3/GSM325479_skelMuscle_HCT171_hg18.fa.GEM_index.GRCh37.gencode.v3.annotation.GRCh37.exon.gtf.length_32.gff.gz"
wget --timestamping "http://genome.imim.es/~tderrien/Gencode/For_Rachel/GFF_Burge_32bp_Gencode3/GSM325480_colon_hg18.fa.GEM_index.GRCh37.gencode.v3.annotation.GRCh37.exon.gtf.length_32.gff.gz"
wget --timestamping "http://genome.imim.es/~tderrien/Gencode/For_Rachel/GFF_Burge_32bp_Gencode3/GSM325481_adipose_hg18.fa.GEM_index.GRCh37.gencode.v3.annotation.GRCh37.exon.gtf.length_32.gff.gz"
wget --timestamping "http://genome.imim.es/~tderrien/Gencode/For_Rachel/GFF_Burge_32bp_Gencode3/GSM325482_testes_hg18.fa.GEM_index.GRCh37.gencode.v3.annotation.GRCh37.exon.gtf.length_32.gff.gz"
wget --timestamping "http://genome.imim.es/~tderrien/Gencode/For_Rachel/GFF_Burge_32bp_Gencode3/GSM325483_lymphNode_hg18.fa.GEM_index.GRCh37.gencode.v3.annotation.GRCh37.exon.gtf.length_32.gff.gz"
wget --timestamping "http://genome.imim.es/~tderrien/Gencode/For_Rachel/GFF_Burge_32bp_Gencode3/GSM325484_HCT204_bt474_.fa.GEM_index.GRCh37.gencode.v3.annotation.GRCh37.exon.gtf.length_32.gff.gz"
wget --timestamping "http://genome.imim.es/~tderrien/Gencode/For_Rachel/GFF_Burge_32bp_Gencode3/GSM325485_HCT205_HME_.fa.GEM_index.GRCh37.gencode.v3.annotation.GRCh37.exon.gtf.length_32.gff.gz"
wget --timestamping "http://genome.imim.es/~tderrien/Gencode/For_Rachel/GFF_Burge_32bp_Gencode3/GSM325486_HCT202_s2468_.fa.GEM_index.GRCh37.gencode.v3.annotation.GRCh37.exon.gtf.length_32.gff.gz"
wget --timestamping "http://genome.imim.es/~tderrien/Gencode/For_Rachel/GFF_Burge_32bp_Gencode3/GSM325487_HCT203_s2468.fa.GEM_index.GRCh37.gencode.v3.annotation.GRCh37.exon.gtf.length_32.gff.gz"
wget --timestamping "http://genome.imim.es/~tderrien/Gencode/For_Rachel/GFF_Burge_32bp_Gencode3/GSM325488_HCT206_s2468_.fa.GEM_index.GRCh37.gencode.v3.annotation.GRCh37.exon.gtf.length_32.gff.gz"
wget --timestamping "http://genome.imim.es/~tderrien/Gencode/For_Rachel/GFF_Burge_32bp_Gencode3/GSM325489_HCT207_s2468_.fa.GEM_index.GRCh37.gencode.v3.annotation.GRCh37.exon.gtf.length_32.gff.gz"
  
   # Load this data into tables for hg19.
   # Unzip the files:
   gunzip *.gff.gz 
   # Create a file with the list of file names and tissues.
   ls *.gff > burgeDataFiles.txt
   GSM325486_HCT202_s2468	breast
   GSM325487_HCT203_s2468	MCF-7
   GSM325488_HCT206_s2468	MB435 
   GSM325489_HCT207_s2468	T47D
   # Did not map these two as they are not 32 bp. 
   GSM325490_brain_s1368	MAQC	mixed human brain tissue/cell lines
   GSM325491_UHR_s247		MAQC_UHR mixed human cell lines
   # Edit the file above to add a tab separation between file name and tissue
   # name. Then remove the "read_name: " from the last field in each 
   # file otherwise it gets included in the name and load the data into hg18.
   # Write a script to do this: 
cat << '_EOF_' > formatAndLoadData
#!/bin/bash -e
   
# Assign variables
# Tab-separated file of file names and tissue/cell line names
DATAFILES=$1
# track name used as prefix for subtracks
TRACK=$2
# database
DATABASE=$3

cat $DATAFILES | while read file tissue; do
    subTrack=`echo $TRACK$tissue`
    echo $subTrack   
    sed -e 's/read_name:\s//' $file > ${subTrack}.gff
    ldHgGene -exon=read $DATABASE ${subTrack} ${subTrack}.gff 
done
'_EOF_'
   chmod +x formatAndLoadData
   ./formatAndLoadData burgeDataFiles.txt burgeRnaSeqGemMapperAlign hg19 \
     >& load.log &
   # Took about 2 hours to load the tables.
   # Copy trackDb entry in hg18 trackDb.ra to
   # ccds/trunk/gencode/browser/trackDb/human/hg19/trackDb.ra

   # 2009-12-14, Need to change cdsStart = cdsEnd = 0 in the tables as this
   # data should have no CDS defined. Currently cdsStart = cdsEnd = txEnd. 
   cd /hive/groups/gencode/browser/hg19/burgeRnaSeqGemMapperAlign
   hgsql -Ne 'show tables like "burge%";' hg19 > burgeTables
   foreach t (`cat burgeTables`)
      echo $t
      hgsql -e "update $t set cdsStart = 0;" hg19
      hgsql -e "update $t set cdsEnd = 0;" hg19
   end
   # Then move data to directory in hg19 genome bed directory
   cd /hive/data/genomes/hg19/bed
   mv /hive/groups/gencode/browser/hg19/burgeRnaSeqGemMapperAlign ./

   # 2010-01-04 Change the data to BED format. For genePred format, 
   # there is always a track configuration added for colouring tracks by
   # genomic codons which does not make sense for this data. Also, BED is
   # more appropriate for this data type.
   cd /hive/data/genomes/hg19/bed/burgeRnaSeqGemMapperAlign
   # Convert gff to genePred and then genePred to BED, drop old table and
   # then load database with BED format data. Need to fix the cdStart and
   # cdsEnd fields to be 0. 
   foreach f (`ls burgeRnaSeqGemMapperAlign*.gff`)
     echo $f >> bed.log
     set g=$f:r
     echo $g
     ldHgGene -exon=read -nobin -out=${g}.gp hg19 $g $f >>& bed.log
     awk 'BEGIN {FS="\t"} {OFS="\t"} {print $1,$2,$3,$4,$5,0,0,$8,$9,$10}' \
         ${g}.gp > ${g}Fixed.gp 
     genePredToBed ${g}Fixed.gp > ${g}.bed 
     echo "Dropping table $g"
     hgsql -e "drop table ${g};" hg19
     hgLoadBed hg19 $g ${g}.bed >>& bed.log 
   end
   # Changed track type in trackDb/human/trackDb.ra to bed 12 and 
   # then did make alpha in trackDb directory.

   # trackDb/human/trackDb.ra entry was updated to include views for Raw Signal
   # and Alignment subtracks (2010-02-17)
   # 2010-05-15 and 2010-05-16. Add a Signal track so it is easier to view the data in 
   # regions where there is a high density of reads. 
   cd /hive/data/genomes/hg19/bed/burgeRnaSeqGemMapperAlign
   # Use bedItemOverlapCount to get counts of overlapping items for each base.
   # Need to sort the bed files and then get the number of reads mapped for
   # that tissue. Divide the counts by the number of million mapped reads to 
   # get the number of reads per million mapped reads as the data value. 
   # Re-make the subtracks using the -bed12 option so that blocks are used 
   # instead of just the first three fields of the BED file as is the default.
   rm *.count *.bedGraph
   foreach f (`ls *.bed`)
      echo $f
      set g=$f:r
      sort $f | bedItemOverlapCount -bed12 hg19 stdin > ${f}.count
      set size=`hgsql -Ne "select count(distinct name) from ${g};" hg19`
      awk -v size=${size} 'BEGIN {OFS="\t"} {print $1,$2,$3,($4 / (size/1000000));}' ${f}.count > ${g}.bedGraph
   end
   # Load the bedGraph tables into the database as Raw Signal tracks.
   foreach f (`ls *.bedGraph`)
      echo $f
      set g=$f:r
      hgsql -e "drop table ${g}AllRawSignal;" hg19 
      hgLoadBed -bedGraph=4 hg19 ${g}AllRawSignal $f >>& loadSignal.log
   end

  
##########################################################################
# BUILD ALLEN BRAIN TRACK (DONE 09/30/09 kent)

# Make the working directory
    ssh hgwdev
    cd /cluster/data/hg19/bed
    mkdir allenBrain
    cd allenBrain

# Remap the probe alignments from mm7 to hg19

    zcat /gbdb/mm9/liftOver/mm9ToHg19.over.chain.gz \
        |  pslMap -chainMapFile -swapMap \
	       /cluster/data/mm9/bed/allenBrain/allenBrainAli.psl stdin stdout \
	  |  sort -k 14,14 -k 16,16n > unscored.psl

    pslRecalcMatch unscored.psl /cluster/data/hg19/hg19.2bit \
        /cluster/data/mm9/bed/allenBrain/allenBrainProbes.fa allenBrainAli.psl

# Load the database
   hgsql hg19 < ~/kent/src/hg/lib/allenBrainUrl.sql
   hgsql hg19 -e 'load data local infile "/cluster/data/mm9/bed/allenBrain/allenBrainUrl.tab" into table allenBrainUrl;'
   hgLoadPsl hg19 allenBrainAli.psl
   mkdir /gbdb/hg19/allenBrain
   ln -s /cluster/data/mm9/bed/allenBrain/allenBrainProbes.fa /gbdb/hg19/allenBrain/allenBrainProbes.fa
   hgLoadSeq hg19 /gbdb/hg19/allenBrain/allenBrainProbes.fa

# Make mapping between known genes and allenBrain
   hgMapToGene hg19 allenBrainAli -type=psl knownGene knownToAllenBrain

#############################################################################
# ADD ALLEN BRAIN CORTEXT LINK (DONE, 11/18/09 kent)

# Copy over version from hg18 since we don't have new data from Allen Brain
# Inst.
    cd /cluster/data/hg19/bed/allenBrain
    cp /cluster/data/hg18/bed/allenBrain/allenBrainGene.tab .

# Load it into database.

    hgsql hg19 < ~/src/hg/lib/allenBrainGene.sql
    hgsql hg19 -e \
    'load data local infile "allenBrainGene.tab" into table allenBrainGene'


############################################################################
## Annotate 46-way multiple alignment with gene annotations
##		(DONE - 2008-12-08,23 - Hiram)
    # Gene frames
    ## survey all genomes to see what type of gene track to use
    ssh hgwdev
    mkdir /hive/data/genomes/hg19/bed/multiz46way/frames
    cd /hive/data/genomes/hg19/bed/multiz46way/frames
    #
    #	survey all the genomes to find out what kinds of gene tracks they have
    cat << '_EOF_' > showGenes.csh
#!/bin/csh -fe
foreach db (`cat ../species.list`)
    echo -n "${db}: "
    set tables = `hgsql $db -N -e "show tables like '%Gene%'"`
    foreach table ($tables)
	if ($table == "ensGene" || $table == "refGene" || $table == "mgcGenes" || \
	    $table == "knownGene" || $table == "xenoRefGene" ) then
		set count = `hgsql $db -N -e "select count(*) from $table"`
		echo -n "${table}: ${count}, "
	endif
    end
    set orgName = `hgsql hgcentraltest -N -e \
	    "select scientificName from dbDb where name='$db'"`
    set orgId = `hgsql hg19 -N -e \
	    "select id from organism where name='$orgName'"`
    if ($orgId == "") then
	echo "Mrnas: 0"
    else
	set count = `hgsql hg19 -N -e "select count(*) from gbCdnaInfo where organism=$orgId"`
	echo "Mrnas: ${count}"
    endif
end
'_EOF_'
    # << happy emacs
    chmod +x ./showGenes.csh
    #	rearrange that output to create four sections:
    #	1. knownGenes for hg19, mm9, rn4
    #	2. ensGene for almost everything else
    #	3. xenoRefGene for calJac1, petMar1, loxAfr3, papHam1, macEug1, oryCun2

    mkdir genes
    # knownGene
    for DB in hg19 mm9 rn4
do
    hgsql -N -e "select name,chrom,strand,txStart,txEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds from knownGene" ${DB} \
      | genePredSingleCover stdin stdout | gzip -2c \
        > /scratch/tmp/${DB}.tmp.gz
    mv /scratch/tmp/${DB}.tmp.gz genes/$DB.gp.gz
    echo "${DB} done"
done

    echo "panTro2 gorGor1 ponAbe2 rheMac2 tarSyr1 micMur1 otoGar1 \
	tupBel1 dipOrd1 cavPor3 speTri1 ochPri2 vicPac1 turTru1 \
	bosTau4 equCab2 felCat3 canFam2 myoLuc1 pteVam1 eriEur1 sorAra1 \
	proCap1 echTel1 dasNov2 choHof1 monDom5 ornAna1 galGal3 \
	taeGut1 anoCar1 xenTro2 tetNig2 fr2 gasAcu1 oryLat2 danRer6" \
    | sed -e "s/  */ /g" > ensGene.list


do
    # ensGene
    for DB in panTro2 gorGor1 ponAbe2 rheMac2 tarSyr1 micMur1 otoGar1 \
	tupBel1 dipOrd1 cavPor3 speTri1 ochPri2 vicPac1 turTru1 \
	bosTau4 equCab2 felCat3 canFam2 myoLuc1 pteVam1 eriEur1 sorAra1 \
	proCap1 echTel1 dasNov2 choHof1 monDom5 ornAna1 galGal3 \
	taeGut1 anoCar1 xenTro2 tetNig2 fr2 gasAcu1 oryLat2 danRer6
do
    hgsql -N -e "select name,chrom,strand,txStart,txEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds from ensGene" ${DB} \
      | genePredSingleCover stdin stdout | gzip -2c \
        > /scratch/tmp/${DB}.tmp.gz
    mv /scratch/tmp/${DB}.tmp.gz genes/$DB.gp.gz
    echo "${DB} done"
done

    echo "calJac1 petMar1 loxAfr3 papHam1 macEug1 oryCun2" > xenoRef.list

    # xenoRefGene
    for DB in calJac1 petMar1 loxAfr3 papHam1 macEug1 oryCun2
do
    hgsql -N -e "select name,chrom,strand,txStart,txEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds from xenoRefGene" ${DB} \
      | genePredSingleCover stdin stdout | gzip -2c \
        > /scratch/tmp/${DB}.tmp.gz
    mv /scratch/tmp/${DB}.tmp.gz genes/$DB.gp.gz
    echo "${DB} done"
done

    #	the following single command doesn't work on any 32 Gb computer,
    #	requires much more memory, turn it into a kluster job, see below ...

    #	Create this command with this script:
    cat << '_EOF_' > mkCmd.sh
#!/bin/sh

echo "time (cat ../maf/*.maf | nice -n +19 genePredToMafFrames hg19 stdin stdout \\"
for G in mm9 rn4
do
    if [ ! -s genes/${G}.gp.gz ]; then
	echo "missing genes/${G}.gp.gz"
	exit 255
    fi
    echo -n "${G} genes/${G}.gp.gz "
done
echo "\\"
for D in `sort ensGene.list`
do
    if [ ! -s genes/${D}.gp.gz ]; then
        echo "missing genes/${D}.gp.gz"
        exit 255
    fi
    echo -n "${D} genes/${D}.gp.gz "
done
echo "\\"
for D in `sort xenoRef.list`
do
    if [ ! -s genes/${D}.gp.gz ]; then
        echo "missing genes/${D}.gp.gz"
        exit 255
    fi
    echo -n "${D} genes/${D}.gp.gz "
done
echo "\\"
echo "    | gzip > multiz46way.mafFrames.gz) > frames.log 2>&1"
'_EOF_'
    # << happy emacs
    chmod +x ./mkCmd.sh

    #	this doesn't work on any 32 Gb computer, requires much more memory
    #	turn it into a kluster job, see below
    time (cat ../maf/*.maf | nice -n +19 genePredToMafFrames hg19 stdin stdout \
mm9 genes/mm9.gp.gz rn4 genes/rn4.gp.gz \
panTro2 genes/panTro2.gp.gz gorGor1 genes/gorGor1.gp.gz ponAbe2 genes/ponAbe2.gp.gz rheMac2 genes/rheMac2.gp.gz tarSyr1 genes/tarSyr1.gp.gz micMur1 genes/micMur1.gp.gz otoGar1 genes/otoGar1.gp.gz tupBel1 genes/tupBel1.gp.gz dipOrd1 genes/dipOrd1.gp.gz cavPor3 genes/cavPor3.gp.gz speTri1 genes/speTri1.gp.gz ochPri2 genes/ochPri2.gp.gz vicPac1 genes/vicPac1.gp.gz turTru1 genes/turTru1.gp.gz bosTau4 genes/bosTau4.gp.gz equCab2 genes/equCab2.gp.gz felCat3 genes/felCat3.gp.gz canFam2 genes/canFam2.gp.gz myoLuc1 genes/myoLuc1.gp.gz pteVam1 genes/pteVam1.gp.gz eriEur1 genes/eriEur1.gp.gz sorAra1 genes/sorAra1.gp.gz proCap1 genes/proCap1.gp.gz echTel1 genes/echTel1.gp.gz dasNov2 genes/dasNov2.gp.gz choHof1 genes/choHof1.gp.gz monDom5 genes/monDom5.gp.gz ornAna1 genes/ornAna1.gp.gz galGal3 genes/galGal3.gp.gz taeGut1 genes/taeGut1.gp.gz anoCar1 genes/anoCar1.gp.gz xenTro2 genes/xenTro2.gp.gz tetNig2 genes/tetNig2.gp.gz fr2 genes/fr2.gp.gz gasAcu1 genes/gasAcu1.gp.gz oryLat2 genes/oryLat2.gp.gz danRer6 genes/danRer6.gp.gz \
calJac1 genes/calJac1.gp.gz petMar1 genes/petMar1.gp.gz loxAfr3 genes/loxAfr3.gp.gz papHam1 genes/papHam1.gp.gz macEug1 genes/macEug1.gp.gz oryCun2 genes/oryCun2.gp.gz \
    | gzip > multiz46way.mafFrames.gz) > frames.log 2>&1

    #	that doesn't work on any 32 Gb computer, requires much more memory
    #	turn it into a kluster job
    ssh swarm
    cd /hive/data/genomes/hg19/bed/multiz46way/frames
    cat << '_EOF_' > runOne
#!/bin/csh -fe

set C = $1
set G = $2

cat ../maf/${C}.maf | genePredToMafFrames hg19 stdin stdout \
        ${G} genes/${G}.gp.gz | gzip > parts/${C}.${G}.mafFrames.gz
'_EOF_'
    # << happy emacs
    chmod +x runOne

    ls ../maf | sed -e "s/.maf//" > chr.list
    ls genes | sed -e "s/.gp.gz//" | grep -v hg19 > gene.list

    cat << '_EOF_' > template
#LOOP
runOne $(root1) $(root2) {check out exists+ parts/$(root1).$(root2).mafFrames.gz}
#ENDLOOP
'_EOF_'
    # << happy emacs

    mkdir parts
    gensub2 chr.list gene.list template jobList
    para -ram=8g create jobList
    para try ... check ... push
# Completed: 4185 of 4185 jobs
# CPU time in finished jobs:      72491s    1208.19m    20.14h    0.84d  0.002 y
# IO & Wait Time:               1462162s   24369.36m   406.16h   16.92d  0.046 y
# Average job time:                 367s       6.11m     0.10h    0.00d
# Longest finished job:            3165s      52.75m     0.88h    0.04d
# Submission to last job:          6364s     106.07m     1.77h    0.07d

    # see what it looks like in terms of number of annotations per DB:
    find ./parts -type f | while read F
do
    zcat ${F}
done | cut -f4 | sort | uniq -c | sort -n > annotation.survey.txt
  79191 rn4
 108287 petMar1
 139581 gorGor1
 140487 taeGut1
 143058 choHof1
 143233 vicPac1
 150073 anoCar1
 154462 tarSyr1
 163930 sorAra1
 164575 galGal3
 171191 macEug1
 174221 felCat3
 175831 dasNov2
 177622 ornAna1
 190729 eriEur1
 192285 tupBel1
 198052 speTri1
 199639 micMur1
 201731 papHam1
 201961 panTro2
 206170 oryCun2
 209327 ponAbe2
 209504 otoGar1
 210860 rheMac2
 212533 proCap1
 212848 myoLuc1
 213146 dipOrd1
 213479 calJac1
 215995 echTel1
 220341 ochPri2
 225132 loxAfr3
 226689 turTru1
 230903 monDom5
 232025 pteVam1
 232831 equCab2
 236945 cavPor3
 238167 bosTau4
 239857 mm9
 255727 canFam2
 316850 xenTro2
 359507 danRer6
 375156 oryLat2
 390076 fr2
 426532 gasAcu1
 434619 tetNig2

    #	load the resulting file
    ssh hgwdev
    cd /cluster/data/hg19/bed/multiz46way/frames
    find ./parts -type f | while read F
do
    zcat ${F}
done | sort -k1,1 -k2,2n > multiz46wayFrames.bed

    hgLoadMafFrames hg19 multiz46wayFrames stdin

    featureBits -countGaps hg19 multiz46wayFrames.bed
    #	57146632 bases of 3137161264 (1.822%) in intersection

    #	enable the trackDb entries:
# frames multiz46wayFrames
# irows on
    #	appears to work OK

#############################################################################
## create upstream refGene maf files
    cd /hive/data/genomes/hg19/bed/multiz46way/downloads/maf
    # bash script
#!/bin/sh
for S in 1000 2000 5000
do
    echo "making upstream${S}.maf"
    featureBits hg19 refGene:upstream:${S} -fa=/dev/null -bed=stdout \
        | perl -wpe 's/_up[^\t]+/\t0/' | sort -k1,1 -k2,2n \
        | /cluster/bin/$MACHTYPE/mafFrags hg19 multiz46way \
                stdin stdout \
                -orgs=/hive/data/genomes/hg19/bed/multiz46way/species.list \
        | gzip -c > upstream${S}.maf.gz
    echo "done upstream${S}.maf.gz"
done

    cd /usr/local/apache/htdocs/goldenPath/hg19/multiz46way/maf
    ln -s /hive/data/genomes/hg19/bed/multiz46way/downloads/maf/up*.gz .
    md5sum up*.gz >> md5sum.txt


#############################################################################
# AFFY U133AB (Done - 2009-09-30 - Jim)
    # Align probes 
    ssh swarm
    cd /cluster/data/hg19/bed
    mkdir -p affyProbes/affyU133/run
    cd affyProbes/affyU133/run
    mkdir psl
    ls -1 /scratch/data/hg19/nib/*.nib > genome.lst
    ls -1 /hive/data/outside/affyProbes/HG-U133AB_all.fa > mrna.lst

    cat << '_EOF_' > gsub
#LOOP
/cluster/bin/x86_64/blat -fine -ooc=/scratch/data/hg19/11.ooc  $(path1) $(path2) {check out line+ psl/$(root1)_$(root2).psl}
#ENDLOOP
'_EOF_'
    # << this line makes emacs coloring happy

    gensub2 genome.lst mrna.lst gsub jobList
    para create jobList
    para try
    para check
    para push
    para time
#Completed: 93 of 93 jobs
#CPU time in finished jobs:      21246s     354.09m     5.90h    0.25d  0.001 y
#IO & Wait Time:                   349s       5.82m     0.10h    0.00d  0.000 y
#Average job time:                 232s       3.87m     0.06h    0.00d
#Longest finished job:            1650s      27.50m     0.46h    0.02d
#Submission to last job:          1685s      28.08m     0.47h    0.02d


    # Do sort, best in genome filter.
    # to create affyU133.psl.
    pslSort dirs raw.psl tmp psl
    pslReps -minCover=0.3 -minAli=0.95 -nearTop=0.005 raw.psl ../affyU133.psl /dev/null
    rm -r raw.psl psl

    # Load probes and alignments into database.
    ssh hgwdev
    cd /cluster/data/hg19/bed/affyProbes/affyU133
    hgLoadPsl hg19 affyU133.psl
    hgLoadSeq hg19 /gbdb/hgFixed/affyProbes/HG-U133AB_all.fa

    # Added knownToU133 table
    hgMapToGene hg19 affyU133 knownGene knownToU133
    # trim unwanted chip-prefix to be backwards compatible with hg17 and hg18
    hgsql hg19 -e 'update knownToU133 set value=substring(value,7)'

    # remove the trailing ";" from the value field (redmine #1685)
    hgsql hg19 -e 'update knownToU133 set value=trim(trailing ";" from value);'

##########################################################################
# GNF ATLAS 2 (Done - 2009-09-30 - Jim)
    # Align probes from GNF1H chip.
    ssh swarm
    cd /cluster/data/hg19/bed
    mkdir -p geneAtlas2/run/psl
    cd geneAtlas2/run
    mkdir psl
    ls -1 /scratch/data/hg19/nib/*.nib > genome.lst
    ls -1 /hive/data/outside/gnf/human/atlas2/gnf1h.fa > mrna.lst
    cat << '_EOF_' > gsub
#LOOP
/cluster/bin/x86_64/blat -fine -ooc=/scratch/data/hg19/11.ooc  $(path1) $(path2) {check out line+ psl/$(root1)_$(root2).psl}
#ENDLOOP
'_EOF_'
    # << this line makes emacs coloring happy

    gensub2 genome.lst mrna.lst gsub jobList
    para create jobList
    para try
    para check
    para push
    para time
#Completed: 93 of 93 jobs
#CPU time in finished jobs:       3299s      54.98m     0.92h    0.04d  0.000 y
#IO & Wait Time:                   330s       5.50m     0.09h    0.00d  0.000 y
#Average job time:                  39s       0.65m     0.01h    0.00d
#Longest finished job:             370s       6.17m     0.10h    0.00d
#Submission to last job:           477s       7.95m     0.13h    0.01d


    # Do sort, best in genome filter
    # to create gnf1h.psl.
    pslSort dirs raw.psl tmp psl
    pslReps -minCover=0.3 -minAli=0.95 -nearTop=0.005 raw.psl ../affyGnf1h.psl /dev/null
    rm -r raw.psl psl

    # Load probes and alignments from GNF1H into database.
    ssh hgwdev
    cd /hive/data/genomes/hg19/bed/geneAtlas2
    hgLoadPsl hg19 affyGnf1h.psl
    hgLoadSeq hg19 /gbdb/hgFixed/affyProbes/gnf1h.fa

    grep -v U133B ../affyProbes/affyU133/affyU133.psl \
	| sed -e "s/exemplar://; s/consensus://; s/U133A://" \
	| sed -e "s/;//" > affyU133A.psl

    hgMapMicroarray gnfAtlas2.bed hgFixed.gnfHumanAtlas2MedianRatio \
    	affyU133A.psl  affyGnf1h.psl

    # Loaded 44696 rows of expression data from hgFixed.gnfHumanAtlas2MedianRatio
    # Mapped 33186,  multiply-mapped 3171, missed 48, unmapped 11510

    hgLoadBed hg19 gnfAtlas2 gnfAtlas2.bed
    # Loaded 36357 elements of size 15

    # Added knownToGnf1h table
    hgMapToGene hg19 affyGnf1h knownGene knownToGnf1h

##########################################################################
# BUILD NIBB IMAGE PROBES (DONE 2009-10-12 JK)

# Make directory on san for cluster job and copy in sequence
    ssh swarm
    mkdir /hive/data/genomes/hg19/bed/nibbPics
    cd /hive/data/genomes/hg19/bed/nibbPics
    cp /cluster/data/xenTro1/bed/nibbPics/nibbImageProbes.fa .

# Make parasol job dir and sequence list files
    mkdir run
    cd run
    mkdir psl
    ls -1 /scratch/data/hg19/nib/*.nib > genome.lst
    echo ../nibbImageProbes.fa > mrna.lst

# Create parasol gensub file file
cat << '_EOF_' > gsub
#LOOP
blatz -rna -minScore=6000 -out=psl $(path1) $(path2) psl/$(root1)_$(root2).psl
#ENDLOOP
'_EOF_'
# << emacs

# Create parasol batch
    gensub2 genome.lst mrna.lst gsub spec
    para create spec

# Do para try/push/time etc.
#Completed: 93 of 93 jobs
#CPU time in finished jobs:       8008s     133.47m     2.22h    0.09d  0.000 y
#IO & Wait Time:                   364s       6.07m     0.10h    0.00d  0.000 y
#Average job time:                  90s       1.50m     0.03h    0.00d
#Longest finished job:             765s      12.75m     0.21h    0.01d
#Submission to last job:           824s      13.73m     0.23h    0.01d

# Make sort and filter
    catDir psl | sort -k 10 \
        | pslReps stdin stdout /dev/null -nohead -minAli=0.60 -nearTop=0.001 -minCover=0.10 -minNearTopSize=80 \
	| sort -k 14,14 -k 16,16n \
	| sed 's#/scratch/data/hg19/nib/chr#chr#' \
	| sed 's/.nib//' > ../nibbImageProbes.psl

# Make bed file and copy in stuff
    ssh hgwdev
    cd /hive/data/genomes/hg19/bed/nibbPics
    cp /cluster/data/xenTro1/bed/nibbPics/nibbImageProbes.fa .

# Load into database
    ln -s /cluster/data/hg19/bed/nibbPics/nibbImageProbes.fa /gbdb/hg19/nibbImageProbes.fa
    hgLoadSeq hg19 /gbdb/hg19/nibbImageProbes.fa
    hgLoadPsl hg19 nibbImageProbes.psl

##########################################################################
# Initial vgProbeTrack run for hg19 (galt 2009-10-15)
# see visiGene.txt make doc
# uses nibbImageProbes and vgProbeTrack utility
# creates vgAllProbes and knownToVisiGene
#    25931 
# updates visiGene.vgPrbAliAll.
# creates and runs hgLoadSeq on /gbdb/hg19/visiGene/*.fa

##########################################################################
# make new grp table to match hg18 (DONE  2009-10-01 kuhn)
# to split regulation from expression 
# phenDis group is also missing in hg19
# and add one more column: defaultIsClosed

# get the hg18.grp table into hg19 

# copy the hg18.grp table into hg19.grpNew and edit 
   hgsql hg19
   CREATE TABLE grpNew SELECT * FROM hg18.grp;
   # 24 rows in set (0.00 sec)
   
   DELETE FROM grpNew WHERE name LIKE "encode%";
   DELETE FROM grpNew WHERE name LIKE "remc%"; 
   DELETE FROM grpNew WHERE name LIKE "tcga%";
   DELETE FROM grpNew WHERE name LIKE "cancer%"; 
   DELETE FROM grpNew WHERE name LIKE "jk%";
   # 10 rows in set (0.00 sec)
   
# move the new table into place quickly
   DROP TABLE grp;
   RENAME TABLE grpNew TO grp;
 
#########################################################################
# BUILD OMIM RELATED GENES TRACK (done 2009-10-13 jk)

ssh hgwdev
cd /hive/data/genomes/hg19/bed
mkdir omimGene
cd omimGene

# download the file morbidmap and genemap from OMIM

mkdir omim
cd omim
wget --timestamping ftp://ftp.ncbi.nih.gov/repository/OMIM/morbidmap
wget --timestamping ftp://ftp.ncbi.nih.gov/repository/OMIM/genemap
cat genemap|sed -e 's/|/\t/g' > genemap.tab
autoSql ~/src/hg/lib/omimGeneMap.as x
cat x.sql |sed -e 's/PRIMARY KEY(numbering)/KEY(omimId)/' >omimGeneMap.sql
hgLoadSqlTab -warn hg19 omimGeneMap omimGeneMap.sql genemap.tab

# got warning on 3 records, just ignore them
# Warning: load of omimGeneMap did not go as planned: 12216 record(s), 0 row(s)

rm x.c x.h
cd ..
cat omim/morbidmap|sed -e 's/|/\t/g' > mobidmap.tab
autoSql ~/src/hg/lib/omimMorbidMap.as x 
cat x.sql |sed -e 's/PRIMARY KEY(description)/KEY(omimId)/' >omimMorbidMap.sql
hgLoadSqlTab -warn hg19 omimMorbidMap omimMorbidMap.sql mobidmap.tab

# get all UCSC genes (from the knownGene table) that cross-reference to a RefSeq gene 
# that has a non-empty OMIM ID according to the refLink table.  And use OMIM ID as
# the gene name for this new table.  Please note the alignId field still holds the KG ID.

hgsql hg19 -N -e \
'select omimId, kg.* from knownGene kg, knownToRefSeq kr, refLink l where omimId != 0 and mrnaAcc=kr.value and kg.name=kr.name ' \
|cut -f 1,3-13 >o1.tab

# collect more OMIM related genes via the MIM external DB links from UniProt

hgsql hg19 -N -e \
'select extAC, kg.* from knownGene kg, kgXref k, proteome.spXref2 p where spId=p.accession and extDB="MIM" and kg.name=kgId ' \
|cut -f 1,3-13 >o2.tab

# concatenate the above two gene sets and remove duplications.

cat o1.tab o2.tab |sort -u >o3.tab

# load the result into a temp table, fanO3
hgLoadSqlTab hg19 fanO3 ~/src/hg/lib/knownGene.sql o3.tab

# while holding onto the OMIM ID, get the canonical gene (via the knownGene, knowIsoforms, 
# and knownCanonical tables) that represent a cluster which contains 
# initial OMIM gene in the fanO3 table

hgsql hg19 -N -e \
'select f3.name, kg.* from fanO3 f3, knownGene kg, knownCanonical c, knownIsoforms i where f3.alignId=i.transcript and kg.name=c.transcript and c.clusterId=i.clusterId'\
> o4.tab

# first column is the OMIM ID
cut -f 1 o4.tab >j1.tmp

# col 3-13 is the gene structure of the canonical KG
cut -f 3-13 o4.tab >j2.tmp

# stitch them together and remove duplicates, load the result into fanO4 table
paste j1.tmp j2.tmp |sort -u >fanO4.tab
hgLoadSqlTab hg19 fanO4  ~/src/hg/lib/knownGene.sql fanO4.tab

# finally sort the table and create bed 4 file and load it as the omimGene table

hgsql hg19 -N -e 'select chrom, txStart, txEnd, name from fanO4 order by chrom, txStart, txEnd' |sort -u >omimGene.bed
hgLoadBed hg19 omimGene omimGene.bed

# create and load the omimToKnownCanonical table.

hgsql hg19 -N -e 'select name, alignId from fanO4 order by name'\
> omimToKnownCanonical.tab

hgLoadSqlTab hg19 omimToKnownCanonical  \
~/src/hg/lib/omimToKnownCanonical.sql omimToKnownCanonical.tab

# The following clean up could be done.
# hgsql hg19 -e 'drop table fanO3'
# hgsql hg19 -e 'drop table fanO4'
# rm j*.tmp
# rm o1.tab o2.tab o3.tab o4.tab

#########################################################################
# BUILD HPRD DATA FOR KNOWN GENE DETAILS PAGE LINKS (in progress 2009-10-14 jk)

# Make the directory to work in
   cd /hive/data/genomes/hg19/bed
   mkdir hprd
   cd hprd

# Download HPRD_XML_070609.tar.gz from www.hprd.org. Unfortunately this
# requires registration, so can't just wget it.

    zcat HPRD_XML_070609.tar.gz | tar -xv

# This will create 20000 or more  xxxx.xml files under HPRD_XML_070609

# Create hprdToCdna table
    echo HPRD_XML_070609/*.xml | xargs grep entry_cdna  > j.cdna
    cat j.cdna| sed -e 's/\//\t/' | sed -e 's/.xml/\t/' |\
	sed -e 's/<entry_cdna>/\t/' | sed -e 's/<\//\t/'| sed -e 's/\./\t/'| cut -f 2,4|\
	grep -v None >hprdToCdna.tab

    hgsql hg19 <~/src/hg/lib/hprdToCdna.sql
    hgsql hg19 -e 'load data local infile "hprdToCdna.tab" into table hprdToCdna'

# Create hprdToUniProt table

    echo 'fgrep -H Swiss  HPRD_XML_070609/$1.xml' >do1

    ls HPRD_XML_070609 >j
    cat j |sed -e 's/.xml/\tdo1/g' >jj
    cut -f 1 jj >j.2
    cut -f 2 jj >j.1
    paste j.1 j.2 >doall
    chmod +x do*

    ./doall >j.out
    cat j.out|grep SwissProt | sed -e 's/\//\t/' | sed -e 's/.xml/\t/' | \
    sed -e 's/Prot>/\t/' | sed -e 's/<\//\t/'| cut -f 2,4|grep -v None >hprdToUniProt.tab

    hgsql hg19 <~/src/hg/lib/hprdToUniProt.sql
    hgsql hg19 -e 'load data local infile "hprdToUniProt.tab" into table hprdToUniProt'

# build knownToHprd table

    hgsql hg19 -N -e 'select kgId,hprdId from hprdToCdna, kgXref where cdnaId=refseq' >j.kg1
    hgsql hg19 -N -e 'select kgId,hprdId from hprdToUniProt, kgXref where uniProtId=spId' >j.kg2

    cat j.kg1 j.kg2 | sed 's/_.//' | sort -u >knownToHprd.tab
    wc knownToHprd.tab

    hgsql hg19 <~/src/hg/lib/knownToHprd.sql

    hgsql hg19 -e 'load data local infile "knownToHprd.tab" into table knownToHprd'
    hgsql hg19 -e 'select count(*) from knownToHprd'

# 21,516 records created

# remove temporary files.

    rm j*

#########################################################################
# hgPal downloads (DONE braney 2009-11-03)
#   FASTA from 46way for refGene, knownGene, knownCanonical 

    ssh hgwdev
    screen
    bash
    rm -rf /cluster/data/hg19/bed/multiz46way/pal
    mkdir /cluster/data/hg19/bed/multiz46way/pal
    cd /cluster/data/hg19/bed/multiz46way/pal
    for i in `cat ../species.list`; do echo $i; done > order.lst

    mz=multiz46way
    gp=refGene
    db=hg19
    mkdir exonAA exonNuc ppredAA ppredNuc
    for j in `sort -nk 2 /cluster/data/$db/chrom.sizes | awk '{print $1}'`
    do
	echo "date"
	echo "mafGene -chrom=$j  $db $mz $gp order.lst stdout | \
	    gzip -c > ppredAA/$j.ppredAA.fa.gz"
	echo "mafGene -chrom=$j -noTrans $db $mz $gp order.lst stdout | \
	    gzip -c > ppredNuc/$j.ppredNuc.fa.gz"
	echo "mafGene -chrom=$j -exons -noTrans $db $mz $gp order.lst stdout | \
	    gzip -c > exonNuc/$j.exonNuc.fa.gz"
	echo "mafGene -chrom=$j -exons $db $mz $gp order.lst stdout | \
	    gzip -c > exonAA/$j.exonAA.fa.gz"
    done > $gp.jobs

    time sh -x $gp.jobs > $gp.jobs.log 2>&1 &
    sleep 1
    tail -f $gp.jobs.log

# real    350m10.116s
# user    38m38.731s
# sys     7m22.106s


    mz=multiz46way
    gp=refGene
    db=hg19
    zcat exonAA/*.gz | gzip -c > $gp.$mz.exonAA.fa.gz
    zcat exonNuc/*.gz | gzip -c > $gp.$mz.exonNuc.fa.gz
    zcat ppredAA/*.gz | gzip -c > $gp.$mz.ppredAA.fa.gz
    zcat ppredNuc/*.gz | gzip -c > $gp.$mz.ppredNuc.fa.gz

    rm -rf exonAA exonNuc ppredAA ppredNuc

    # we're only distributing exons at the moment
    mz=multiz46way
    gp=refGene
    db=hg19
    pd=/usr/local/apache/htdocs/goldenPath/$db/$mz/alignments
    mkdir -p $pd
    ln -s `pwd`/$gp.$mz.exonAA.fa.gz $pd/$gp.exonAA.fa.gz
    ln -s `pwd`/$gp.$mz.exonNuc.fa.gz $pd/$gp.exonNuc.fa.gz

    mz=multiz46way
    gp=knownGene
    db=hg19
    mkdir exonAA exonNuc ppredAA ppredNuc
    for j in `sort -nk 2 /cluster/data/$db/chrom.sizes | awk '{print $1}'`
    do
	echo "date"
	echo "mafGene -chrom=$j  $db $mz $gp order.lst stdout | \
	    gzip -c > ppredAA/$j.ppredAA.fa.gz"
	echo "mafGene -chrom=$j -noTrans $db $mz $gp order.lst stdout | \
	    gzip -c > ppredNuc/$j.ppredNuc.fa.gz"
	echo "mafGene -chrom=$j -exons -noTrans $db $mz $gp order.lst stdout | \
	    gzip -c > exonNuc/$j.exonNuc.fa.gz"
	echo "mafGene -chrom=$j -exons $db $mz $gp order.lst stdout | \
	    gzip -c > exonAA/$j.exonAA.fa.gz"
    done > $gp.$mz.jobs

    time sh -x $gp.$mz.jobs > $gp.$mz.job.log 2>&1 &
    sleep 1
    tail -f $gp.$mz.job.log

# oops... missed the timing


    mz=multiz46way
    gp=knownGene
    db=hg19

    zcat exonAA/c*.gz | gzip -c > $gp.$mz.exonAA.fa.gz
    zcat exonNuc/c*.gz | gzip -c > $gp.$mz.exonNuc.fa.gz
    zcat ppredAA/c*.gz | gzip -c > $gp.$mz.ppredAA.fa.gz
    zcat ppredNuc/c*.gz | gzip -c > $gp.$mz.ppredNuc.fa.gz

    rm -rf exonAA exonNuc ppredAA ppredNuc

    mz=multiz46way
    gp=knownGene
    db=hg19
    pd=/usr/local/apache/htdocs/goldenPath/$db/$mz/alignments
    mkdir -p $pd
    ln -s `pwd`/$gp.$mz.exonAA.fa.gz $pd/$gp.exonAA.fa.gz
    ln -s `pwd`/$gp.$mz.exonNuc.fa.gz $pd/$gp.exonNuc.fa.gz

    # now do the canonical set
    cd /cluster/data/hg19/bed/multiz46way/pal
    mz=multiz46way
    gp=knownCanonical
    db=hg19
    for j in `awk '{print $1}' /cluster/data/hg19/chrom.sizes`
    do
	echo "select chrom, chromStart, chromEnd, transcript from knownCanonical where chrom='$j'" | hgsql $db | tail -n +2 > $j.known.bed
    done

    mkdir exonAA exonNuc ppredAA ppredNuc
    for j in `sort -nk 2 /cluster/data/$db/chrom.sizes | awk '{print $1}'`
    do
	echo "date"
	echo "mafGene -geneBeds=$j.known.bed  $db $mz knownGene order.lst stdout | \
	    gzip -c > ppredAA/$j.ppredAA.fa.gz"
	echo "mafGene -geneBeds=$j.known.bed -noTrans $db $mz knownGene order.lst stdout | \
	    gzip -c > ppredNuc/$j.ppredNuc.fa.gz"
	echo "mafGene -geneBeds=$j.known.bed -exons -noTrans $db $mz knownGene order.lst stdout | \
	    gzip -c > exonNuc/$j.exonNuc.fa.gz"
	echo "mafGene -geneBeds=$j.known.bed -exons $db $mz knownGene order.lst stdout | \
	    gzip -c > exonAA/$j.exonAA.fa.gz"
    done > $gp.$mz.jobs

    time sh -x $gp.$mz.jobs > $gp.$mz.job.log 2>&1 &
    sleep 1
    tail -f $gp.$mz.job.log

# real    302m20.489s
# user    27m31.179s
# sys     5m30.071s


    rm *.known.bed
    mz=multiz46way
    gp=knownCanonical
    db=hg19
    zcat exonAA/c*.gz | gzip -c > $gp.$mz.exonAA.fa.gz
    zcat exonNuc/c*.gz | gzip -c > $gp.$mz.exonNuc.fa.gz
    zcat ppredAA/c*.gz | gzip -c > $gp.$mz.ppredAA.fa.gz
    zcat ppredNuc/c*.gz | gzip -c > $gp.$mz.ppredNuc.fa.gz

    rm -rf exonAA exonNuc ppredAA ppredNuc

    mz=multiz46way
    gp=knownCanonical
    db=hg19
    pd=/usr/local/apache/htdocs/goldenPath/$db/$mz/alignments
    mkdir -p $pd
    ln -s `pwd`/$gp.$mz.exonAA.fa.gz $pd/$gp.exonAA.fa.gz
    ln -s `pwd`/$gp.$mz.exonNuc.fa.gz $pd/$gp.exonNuc.fa.gz

############################################################################
# SEGMENTAL DUPLICATIONS (2010-02-02 - 2010-02-04, hartera, DONE)
# (2011-09-26, Fan, REBUILT using corrected data from provider)
# File emailed from Tin Louie <tinlouie at u.washington.edu>
# in Evan Eichler's lab on 01/28/10. This is a data update since it was
# thought that the last data set was incorrect so the pipeline had to be
# re-run.
# NOTE: Received e-mail from Tin Louie suggesting that the otherSize 
# column could be dropped. It is just the size of the otherChrom and it 
# does not seem to be used for the track display or details page. It has the
# correct description in the table schema so it is ok to keep it for now. 
# In the future, this column could be dropped if it not useful. 
# There are a number of columns that could be dropped as they are 
# meaningless but decided to keep them as the code for the details page
# expect them to be there.
# 01/28/10 Received new data as previous run of the pipeline may have
# produced incorrect results. 
# 2010-02-02 Loader aborted on data since in some lines there was an empty
# field so the loader read only 28 words instead of 29. E-mailed Tin to
# ask for the data to be fixed. 
# 2010-02-03 Received new data as the previous data had empty fields.
# 2010-02-04 Loaded new data into hg19 database.
# 2010-02-09 Received new data on 02/08/10 as there were more errors in the
# code that caused the data to have empty fields.
# 2010-02-19 Changed the posBasesHit column values to match those for hg18.
# 2011-09-26 Rebuilt, using corrected (10th col) from data provider.
# In hg18, they are all 1000, but this is meaningless. 
    mkdir /hive/data/genomes/hg19/bed/genomicSuperDups/0926_2011
    cd /hive/data/genomes/hg19/bed/genomicSuperDups/0926_2011
    wget --timestamping \
         ftp://mesh.gs.washington.edu/pub/UCSC/hg19genomicSuperDups.fixed.tab.gz
    gunzip hg19genomicSuperDups.fixed.tab.gz
    # Fix incorrect chromosome names in data. Check both chrom and otherChrom.
    # Previously, found several cases where the last letter of random was
    # missing for the names of the random contigs. They all look good this
    # time.
    awk '{print $1}' hg19genomicSuperDups.fixed.tab | sort | uniq > chroms
    awk '{print $7}' hg19genomicSuperDups.fixed.tab | sort | uniq > otherChroms
    hgsql -Ne 'select chrom from chromInfo;' hg19 | sort | uniq > chromInfo.txt 
    comm -23 chroms chromInfo.txt
    comm -23 otherChroms chromInfo.txt
    # chroms and otherChroms match chromosome names in chromInfo.

    # The sed command is necessary to fix "_" used as strand to "-".
    # The awk command was necessary for some recent other species
    # genomicSuperDups that had some too-short regions.  It does not seem
    # to be necessary here, but doesn't hurt and may be useful in
    # future builds.
    hgsql -e 'drop table genomicSuperDups;' hg19
    sed -e 's/\t_\t/\t-\t/' hg19genomicSuperDups.fixed.tab \
    | awk '($3 - $2) >= 1000 && ($9 - $8) >= 1000 {print;}' \
    | hgLoadBed hg19 genomicSuperDups stdin \
      -sqlTable=$HOME/kent/src/hg/lib/genomicSuperDups.sql
# Loaded 51599 elements of size 29
# Sorted
# Creating table definition for genomicSuperDups
# Saving bed.tab
# Loading hg19

    # 2009-11-05: 
    # Updated details page with suggested text and an additional reference. 
    # src/hg/makeDb/trackDb/genomicSuperDups.html
    # 2010-02-04: Updated the schema description as below in
    # src/hg/lib/genomicSuperDups.sql. Kept score as it is used in older
    # datasets e.g. on hg18 -  
    # Suggestions by Tin Louie for the schema description:
# I suggest that the description of those meaningless columns (on the webpage
# 'Schema for Segmental Dups') be changed to "for future use". The meaningless
# columns are:  score, posBasesHit, testResult, verdict, chits, ccov
# The descriptions of other columns should be changed for clarification:
# otherSize -- equal to otherEnd minus otherStart
# uid -- id shared by the query & subject of a hit 
     # 2010-02-19 Changed the posBasesHit column to be 1000. Checked with 
     # data provider about doing this so that the values are the same as for
     # those in the hg18 table.
     # hgsql -e 'update genomicSuperDups set posBasesHit = 1000;' hg19
# New corrected data fixed the above problem. 
############################################################################
# ADD LINK TO GENENETWORK (DONE. 12/02/09 Fan).

# Received geneNetwork ID list file, GN_human_RefSeq.txt, for hg19 from GeneNetwork, Zhou Xiaodong [xiaodong.zhou@gmail.com].

    ssh hgwdev
    mkdir -p /cluster/data/hg19/bed/geneNetwork
    cd /cluster/data/hg19/bed/geneNetwork

    hgsql hg19 < ~/src/hg/lib/geneNetworkId.sql
    hgsql hg19 -e \
    'load data local infile "GN_human_RefSeq.txt" into table geneNetworkId'

######################################################################## 
# EXONIPHY (2009-12-28, hartera, DONE)
# 2010-01-05, hartera. Moved trackDb entry for exoniphy up to human directory
# level as it is the same for all assemblies.
    # New predictions for hg19 run by Melissa Hubisz 
    # (mjhubisz at gmail.com) for hg19 and sent by 
    # Adam Siepel (acs4 at cornell.edu) on 2009-12-18
    mkdir -p /hive/data/genomes/hg19/bed/exoniphy.2009-12-18
    cd /hive/data/genomes/hg19/bed/exoniphy.2009-12-18
    # Download predictions file, exoniphy.gff
    wget --timestamping \
         "http://compgen.bscb.cornell.edu/~acs/exoniphy.gff.gz"
    gunzip exoniphy.gff.gz
    # Remove table of lifted predictions from hg18
    hgsql -e 'drop table exoniphy;' hg19
    ldHgGene -genePredExt -gtf hg19 exoniphy exoniphy.gff
    # Read 620689 transcripts in 647299 lines in 1 files
    # 620689 groups 24 seqs 1 sources 4 feature types
    # 186601 gene predictions
    # Added a copy of the hg18 track description to trackDb/human/hg19 and 
    # updated it and added a trackDb entry to the trackDb.ra there.
    # 2010-01-04. Moved exoniphy trackDb entry up to the human level
    # trackDb.ra since the entry is the same for hg16-19. Removed the entry in
    # trackDb.ra in each of those assembly directories.

######################################################################## 
# Vega gene update (DONE - 2010-01-15 - Hiram)
    #	lookup version number at the Vega WEB site:
    #	http://vega.sanger.ac.uk/index.html
    #	and FTP site:
    #	ftp://ftp.sanger.ac.uk/pub/vega/
    cd /hive/data/genomes/hg19
    #	step wise to verify operation
    doEnsGeneUpdate.pl -vegaGene -ensVersion=36 -stop=download hg19.ensGene.ra
    doEnsGeneUpdate.pl -vegaGene -ensVersion=36 \
	-continue=process -stop=process hg19.ensGene.ra
    doEnsGeneUpdate.pl -vegaGene -ensVersion=36 \
	-continue=load -stop=load hg19.ensGene.ra
    doEnsGeneUpdate.pl -vegaGene -ensVersion=36 \
	-continue=cleanup hg19.ensGene.ra
    featureBits hg19 vegaGene
    # 64888909 bases of 2897316137 (2.240%) in intersection
    featureBits hg19 vegaPseudoGene
    # 6885145 bases of 2897316137 (0.238%) in intersection

########################################################################
# NHGRI GWAS CATALOG (DONE 2/4/13 angie)
# NOTE: This assumes that the corresponding section in hg18.txt has just been run.
#       It depends on the noCoords.tab file in the corresponding hg18 build directory.
# 2013 updates: 2/4
# 2012 updates: 12/10, 10/4, 8/1, 6/4, 4/4, 2/21 (remove extra whitespace, translate non-ASCII to html), 2/6
# Updated 12/7/11, 11/2/11, 10/3/11, 9/2/11, 8/1/11, 6/9/11, 4/1/11 (last one to use snp131), 3/1/11, 2/1/11
# Updated 12/7/10, 11/1/10, 10/6/10, 9/1/10, 8/2/10, 6/2/10, 5/12/10 (last one to use snp130)
# Updated 4/1/10, 3/1/10
# Originally done 1/19/10
    mkdir /hive/data/genomes/hg19/bed/gwasCatalog
    cd /hive/data/genomes/hg19/bed/gwasCatalog
    # Done once per dbSNP build, don't need to redo until next dbSNP build is released:
    zcat ../snp137/snp137.bed.gz | cut -f 1-4,6,8,18,21-24  \
    | sort -k4,4 \
      > snp137Coords.bed
    set today = `date +%y%m%d`
    mkdir /hive/data/genomes/hg19/bed/gwasCatalog/$today
    cd /hive/data/genomes/hg19/bed/gwasCatalog/$today

    # Mapping to hg19 by joining hg19 SNP coords with catalog flatfile (see hg18.txt)
    join -t "	" -1 4 ../snp137Coords.bed /hive/data/genomes/hg18/bed/gwasCatalog/$today/noCoords.tab \
        -o 1.1,1.2,1.3,1.4,2.2,2.3,2.4,2.5,2.6,2.7,2.8,2.9,2.10,2.11,2.12,2.13,2.14,2.15,2.16,2.17,2.18,2.19,1.5,1.6,1.7,1.8,1.9,1.10,1.11 \
    | sort -k1,1 -k2n,2n \
        > gwasCatalogPlus.bed
    cut -f 1-22 gwasCatalogPlus.bed \
    | hgLoadBed hg19 gwasCatalog stdin \
        -tab -sqlTable=$HOME/kent/src/hg/lib/gwasCatalog.sql -notItemRgb -allowStartEqualEnd
#Read 12194 elements of size 22 from stdin
    # For David: find examples of risk alleles for which dbSNP observed
    # alleles are complementary (A/T or C/G) -- how do we know what strand the
    # risk allele is on??  -- asked corresp. author Teri Manolio.  Info is not
    # always available in the original publication, so sadly there is not always
    # a way to resolve these. GWAS catalog folks aren't going to modify their
    # database to add a column for these cases.
    hgsql hg19 -NBe 'select snp.name,gc.riskAllele,snp.strand,snp.refNcbi,snp.observed \
                     from gwasCatalog as gc, snp137 as snp \
                     where gc.riskAllele rlike "^rs[0-9]+-[ACGT]" and \
                           gc.name = snp.name and snp.observed in ("C/G", "A/T") \
                     order by gc.name;' > ambigStrand.txt
    wc -l ambigStrand.txt
#1249 ambigStrand.txt


######################################################################## 
# ailMel1 Panda alignment (DONE - 2010-02-04 - Hiram)
    mkdir /hive/data/genomes/hg19/bed/lastzAilMel1.2010-02-04
    cd /hive/data/genomes/hg19/bed/lastzAilMel1.2010-02-04

    cat << '_EOF_' > DEF
# Human vs. Panda
#	parameters from the Panda paper supplemental where they describe
#	their lastz parameters
BLASTZ_K=2200
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_H=2000
BLASTZ_C=2
BLASTZ_T=2

# our usual M
BLASTZ_M=50

# TARGET: Human Hg19
SEQ1_DIR=/scratch/data/hg19/nib
SEQ1_LEN=/scratch/data/hg19/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000

# QUERY: Panda
SEQ2_DIR=/scratch/data/ailMel1/ailMel1.2bit
SEQ2_LEN=/scratch/data/ailMel1/chrom.sizes
SEQ2_CHUNK=10000000
SEQ2_LIMIT=50
SEQ2_LAP=0

BASE=/hive/data/genomes/hg19/bed/lastzAilMel1.2010-02-04
TMPDIR=/scratch/tmp
'_EOF_'
    # << happy emacs

    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
	`pwd`/DEF \
	-noLoadChainSplit -syntenicNet \
	-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
	-chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 &
    #	real    434m21.792s
    cat fb.hg19.chainAilMel1Link.txt 
    #	1453400264 bases of 2897316137 (50.164%) in intersection

    mkdir /hive/data/genomes/ailMel1/bed/blastz.hg19.swap
    cd /hive/data/genomes/ailMel1/bed/blastz.hg19.swap
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
	/hive/data/genomes/hg19/bed/lastzAilMel1.2010-02-04/DEF \
	-swap -noLoadChainSplit -syntenicNet \
	-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
	-chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 &
    real    124m14.393s
    cat fb.ailMel1.chainHg19Link.txt 
    #	1411953704 bases of 2245312831 (62.884%) in intersection

#########################################################################
# susScr1 Pig BLASTZ/CHAIN/NET (DONE - 2010-01-21 - Hiram)
    screen # use a screen to manage this multi-day job
    mkdir /hive/data/genomes/hg19/bed/lastzSusScr1.2010-01-21
    cd /hive/data/genomes/hg19/bed/lastzSusScr1.2010-01-21

    cat << '_EOF_' > DEF
# Pig vs. Human
BLASTZ_M=50

# TARGET: Human Hg19
SEQ1_DIR=/scratch/data/hg19/nib
SEQ1_LEN=/scratch/data/hg19/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000

# QUERY: Pig SusScr1
SEQ2_DIR=/scratch/data/susScr1/susScr1.2bit
SEQ2_LEN=/scratch/data/susScr1/chrom.sizes
SEQ2_CHUNK=10000000
SEQ2_LAP=0

BASE=/hive/data/genomes/hg19/bed/lastzSusScr1.2010-01-21
TMPDIR=/scratch/tmp
'_EOF_'
    # << this line keeps emacs coloring happy

    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
	`pwd`/DEF \
	-noLoadChainSplit -syntenicNet \
	-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=pk \
	-chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 &
    #	real    1072m48.949s
    cat fb.hg19.chainSusScr1Link.txt 
    #	1198793067 bases of 2897316137 (41.376%) in intersection

    mkdir /hive/data/genomes/susScr1/bed/blastz.hg19.swap
    cd /hive/data/genomes/susScr1/bed/blastz.hg19.swap
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
	/hive/data/genomes/hg19/bed/lastzSusScr1.2010-01-21/DEF \
	-swap -noLoadChainSplit -syntenicNet \
	-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=pk \
	-chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 &
    #	real    119m14.040s
    cat fb.susScr1.chainHg19Link.txt 
    #	1272787231 bases of 2231332019 (57.042%) in intersection

#########################################################################
# PERSONAL GENOME VARIANTS (DONE 12/29/09 giardine)

    # This is Angie's attempt to reconstruct Belinda's steps:
    mkdir /hive/data/genomes/hg19/bed/pgSnpLiftOver
    cd /hive/data/genomes/hg19/bed/pgSnpLiftOver
    # liftOver track files in /hive/data/genomes/hg18/bed/pgSnp/
    set hg18Dir = /hive/data/genomes/hg18/bed/pgSnp
    set chainFile = /hive/data/genomes/hg18/bed/liftOver/hg18ToHg19.over.chain.gz

    #*** Are we losing insertions here?:
    foreach f (NA12878.pgSnp NA12891.pgSnp NA12892.pgSnp NA19240.pgSnp \
               pgWatson.bed pgYri3.txt pgSnpYh.txt)
      liftOver $hg18Dir/$f $chainFile $f:r.hg19.pgSnp{,.unmapped}
    end
    liftOver $hg18Dir/koref.sub.pgSnp $chainFile koref.hg19.pgSnp{,.unmapped}
    # Why pgVenter2?
    liftOver $hg18Dir/pgVenter.bed $chainFile pgVenter2.hg19.pgSnp{,.unmapped}
 
    # remove variants that are homozygous matches to hg19
cat > addRefNt.pl <<'_EOF_'
#!/usr/bin/perl -w
use strict;

my $build = 'hg19';
my $nib = "/hive/data/genomes/hg19/nib/";
my $nibFrag = "nibFrag";

while (<>) {
   chomp;
   my @f = split(/\t/);
   my $ref = '';
   if ($f[1] eq $f[2]) {
      $ref = '.'; #insertion nothing in ref
   }else {
      open(NIB, "$nibFrag $nib$f[0].nib $f[1] $f[2] + stdout |")
         or die "Couldn't run $nibFrag, $!\n";
      while(<NIB>) {
         chomp;
         if (/^>/) { next; }
         $ref .= $_;
      }
      close NIB or die "Couldn't close $nibFrag, $!\n";
   }
   #splice(@f, 3, 0, uc($ref));
   #print join("\t", @f), "\n";
   print join("\t", @f), "\t", uc($ref), "\n";
}
'_EOF_'
    # << emacs
    chmod a+x addRefNt.pl
    foreach f (*.pgSnp)
      addRefNt.pl $f \
      | perl -wpe '@w=split; s/^.*\n$// if ($w[3] eq $w[6]); s/\t\w+$//;' \
        > $f:r.filtered.pgSnp
    end
#TODO: complete attempt to reverse-engineer Belinda's work
    BelindasFix pgYri3.hg19.filtered.pgSnp > pgYri3.hg19.filtered.fixed.pgSnp

    # Load into db:
    foreach i (NA12878 NA12891 NA12892 NA19240 Watson)
      hgLoadBed -sqlTable=$HOME/kent/src/hg/lib/pgSnp.sql -renameSqlTable \
        hg19 pg$i $i.hg19.filtered.pgSnp
    end
    hgLoadBed -sqlTable=$HOME/kent/src/hg/lib/pgSnp.sql -renameSqlTable \
      hg19 pgVenter pgVenter2.hg19.filtered.pgSnp
    hgLoadBed -sqlTable=$HOME/kent/src/hg/lib/pgSnp.sql -renameSqlTable \
      hg19 pgYh1 pgSnpYh1.hg19.filtered.pgSnp
    hgLoadBed -sqlTable=$HOME/kent/src/hg/lib/pgSnp.sql -renameSqlTable \
      hg19 pgSjk koref.hg19.filtered.pgSnp
    hgLoadBed -sqlTable=$HOME/kent/src/hg/lib/pgSnp.sql -renameSqlTable \
      hg19 pgYoruban3 pgYri3.hg19.filtered.fixed.pgSnp


########################################################################
# CRG MAPABILITY (2010-01-19 - 2010-01-28, hartera, DONE)
# Data was provided by Thomas Derrien (thomas.derrien.crg.es) and Paolo Ribeca
# from the Guigo lab at the Center for Genomic Regulation (CRG) in Barcelona. 
# Data was produced using their GEM mapper aligner taking sliding k-mers 
# window of the human genome that were mapped back onto the genome with up 
# to 2mismatches. For each window, a mappability score is computed 
# S = 1/(nb of match_found) and the BigWig index was created according to 
# this score. 
# 2010-01-26 Loaded tables and added data to /gbdb/
# 2010-01-28 Changed the table names to have the "enc" prefix for consistency
# going forward with hg19 ENCODE tracks. Added trackDb entry for this 
# ENCODE Mapability track. 
# 2010-02-05 Added a 40mer sequence subtrack received on 2010-02-04.
# 2010-03-16 - 2010-03-18. Added metadata to trackDb for the subtracks and
# added downloads for the bigWig data files.
# 2010-04-28. Received new data from Thomas Derrien. Downloaded data and 
# added it to /gbdb/. A bug was found in a library used by bedGraphToBigWig so
# sent a new binary to data providers and they re-created the bigWig files. 
# 2010-05-11. All ENCODE tracks need to be preceded by the wgEncode prefix now
# on all assemblies. Update the file names in /gbdb/hg19/bbi and the 
# table names. (hartera)
# 2010-05-12. Added 24mer track to trackDb entry. Updated downloads with the
# new data.
    mkdir -p /hive/data/genomes/hg19/bed/crgMapability
    cd /hive/data/genomes/hg19/bed/crgMapability
cat << 'EOF' > temp
#!/bin/tcsh -ef
http://genome.imim.es/~tderrien/UCSC_Tracks/ALL_mappablity_hg19_H.sapiens.genome.hg19.main.mappability-36.bw.bz2
http://genome.imim.es/~tderrien/UCSC_Tracks/ALL_mappablity_hg19_H.sapiens.genome.hg19.main.mappability-50.bw.bz2
http://genome.imim.es/~tderrien/UCSC_Tracks/ALL_mappablity_hg19_H.sapiens.genome.hg19.main.mappability-75.bw.bz2
http://genome.imim.es/~tderrien/UCSC_Tracks/ALL_mappablity_hg19_H.sapiens.genome.hg19.main.mappability-100.bw.bz2
'EOF'

    awk '{if ($0 ~ /#/) print; else print "wget --timestamping \"" $0 "\"";}' \
        temp > download.csh
    rm temp
    chmod +x download.csh
    ./download.csh >& download.log &

     # Add the data to /gbdb/ and load the file names into tables (2010-01-26)
     cd /hive/data/genomes/hg19/bed/crgMapability
     bunzip2 *.bz2

     # Add data to gbdb
     mkdir -p /gbdb/hg19/bbi/
     # Symlink files with names as crgMapabilityAlignXmer.bw to /gbdb/hg18/bbi
     # and load file name into a table - one per dataset. Each table 
     # represents a subtrack.
     foreach f (`ls *.bw`)
        echo $f
        set g=`echo $f | cut -d "-" -f2`
        set num=`echo $g | cut -d "." -f1`
        set mer=`echo "${num}mer"`
        set nf=`echo "crgMapabilityAlign${mer}.bw"`
        echo $nf
        ln -s `pwd`/${f} /gbdb/hg19/bbi/${nf}
        hgsql hg19 -e "drop table if exists crgMapabilityAlign${mer}; \
     create table crgMapabilityAlign${mer} (fileName varchar(255) not null); \
     insert into crgMapabilityAlign${mer} values ('/gbdb/hg19/bbi/${nf}');"
     end

     # 2010-01-28
     # Renamed the tables to have a enc prefix for consistency going 
     # forward with hg19.
     cd /hive/data/genomes/hg19/bed/crgMapability
     hgsql -Ne 'show tables like "crg%";' hg19 > tables.txt
     foreach t (`cat tables.txt`)
        set g=`echo $t | sed -e 's/c/C/'`
        hgsql -e "alter table ${t} rename enc${g};" hg19
     end 
     # Added a trackDb entry for this ENCODE Mapability 
     # track in kent/src/hg/makeDb/trackDb/human/hg19/trackDb.enc.ra 
     # Copied track from the hg18/trackDb.wgEncode.ra entry.  
     # use bigWigInfo to check min and max values. 

     # Added data for a 40mer subtrack - 2010-02-05
     cd /hive/data/genomes/hg19/bed/crgMapability
     wget --timestamping \
http://genome.crg.es/~tderrien/UCSC_Tracks/H.sapiens.genome.hg19.main.mappability-40.bw.bz2
     bunzip2 H.sapiens.genome.hg19.main.mappability-40.bw.bz2
     ln -s `pwd`/H.sapiens.genome.hg19.main.mappability-40.bw \
/gbdb/hg19/bbi/crgMapabilityAlign40mer
        hgsql hg19 -e "drop table if exists encCrgMapabilityAlign40mer; \
     create table encCrgMapabilityAlign40mer (fileName varchar(255) not null); \
     insert into encCrgMapabilityAlign40mer values \
('/gbdb/hg19/bbi/crgMapabilityAlign40mer');"
    # Added a subtrack to trackDb/human/hg19/trackDb.enc.ra to the 
    # Mapability track.

    # 2010-03-16 - 2010-03-18
    # Added metadata to the trackDb entries for the subtracks and 
    # added downloads for these data files.
    mkdir -p /usr/local/apache/htdocs/goldenPath/hg19/encodeDCC/encMapability
    cd /usr/local/apache/htdocs/goldenPath/hg19/encodeDCC/encMapability
    cp -p /gbdb/hg19/bbi/crg*.bw .
    gzip crg*.bw
    # Copied over hg18/encodeDCC/wgEncodeMapability/preamble.html 
    # and edited it to only mention the CRG dataset. 
    # Run encodeDownloadsPage.pl to generate the index page for downloads.
    # It does not capture all the information probably because the subtrack
    # name is different to the downloads name so change the file names and 
    # re-load the tables and make the downloads. 
    cd /hive/data/genomes/hg19/bed/crgMapability
    foreach f (`ls *.bw`)
       echo $f
       set g=`echo $f | cut -d "-" -f2`
       set num=`echo $g | cut -d "." -f1`
       set mer=`echo "${num}mer"`
       set of=`echo "crgMapabilityAlign${mer}.bw"`
       set nf=`echo "encCrgMapabilityAlign${mer}.bw"`
       echo $nf
       rm /gbdb/hg19/bbi/${of}
       ln -s `pwd`/${f} /gbdb/hg19/bbi/${nf}
       hgsql hg19 -e "drop table if exists encCrgMapabilityAlign${mer}; \
     create table encCrgMapabilityAlign${mer} (fileName varchar(255) not null); \
     insert into encCrgMapabilityAlign${mer} values ('/gbdb/hg19/bbi/${nf}');"
     end
     
     cd /usr/local/apache/htdocs/goldenPath/hg19/encodeDCC/encMapability
     rm crg*
     cp -p /gbdb/hg19/bbi/encCrg*.bw .
     gzip encCrg*.bw
     # Then run encodeDownloadsPages.pl
     /cluster/home/hartera/bin/encodeDownloadsPage.pl -db=hg19 -checksum \
          -preamble=preamble.html index.html . 

     # Downloaded and added new bigWig files to /gbdb/hg19/bbi 
     # (2010-04-28 and 2010-04-30, hartera). New files were created as 
     # there was a bug in the older version of bedGraphToBigWig.          
     cd /hive/data/genomes/hg19/bed/crgMapability
     rm temp download.csh download.log 
cat << 'EOF' > temp
#!/bin/tcsh -ef
http://genome.crg.es/~tderrien/UCSC_Tracks/H.sapiens.genome.hg19.main.mappability-100.bw.bz2
http://genome.crg.es/~tderrien/UCSC_Tracks/H.sapiens.genome.hg19.main.mappability-24.bw.bz2
http://genome.crg.es/~tderrien/UCSC_Tracks/H.sapiens.genome.hg19.main.mappability-36.bw.bz2
http://genome.crg.es/~tderrien/UCSC_Tracks/H.sapiens.genome.hg19.main.mappability-40.bw.bz2
http://genome.crg.es/~tderrien/UCSC_Tracks/H.sapiens.genome.hg19.main.mappability-50.bw.bz2
http://genome.crg.es/~tderrien/UCSC_Tracks/H.sapiens.genome.hg19.main.mappability-75.bw.bz2
'EOF'
     awk '{if ($0 ~ /#/) print; else print "wget --timestamping \"" $0 "\"";}' \
         temp > download.csh
     rm temp
     chmod +x download.csh
     ./download.csh >& download.log &

     # Add data to /gbdb/. The file names in /gbdb/ are the same as before 
     # so the tables do not need to be reloaded.
     cd /hive/data/genomes/hg19/bed/crgMapability
     bunzip2 *.bz2
     foreach f (`ls *.bw`)
       echo $f
       set g=`echo $f | cut -d "-" -f2`
       set num=`echo $g | cut -d "." -f1`
       set mer=`echo "${num}mer"`
       set nf=`echo "encCrgMapabilityAlign${mer}.bw"`
       echo $nf
       rm /gbdb/hg19/bbi/${nf}
       ln -s `pwd`/${f} /gbdb/hg19/bbi/${nf}
     end

     # 2010-05-11, hartera. Re-name bigWig files and update tables
     # as all ENCODE tracks should now have the wgEncode prefix on all 
     # assemblies.
    cd /hive/data/genomes/hg19/bed/crgMapability
    foreach f (`ls *.bw`)
       echo $f
       set g=`echo $f | cut -d "-" -f2`
       set num=`echo $g | cut -d "." -f1`
       set mer=`echo "${num}mer"`
       set of=`echo "encCrgMapabilityAlign${mer}.bw"`
       set nf=`echo "wgEncodeCrgMapabilityAlign${mer}.bw"`
       echo $nf
       rm /gbdb/hg19/bbi/${of}
       ln -s `pwd`/${f} /gbdb/hg19/bbi/${nf}
       hgsql hg19 -e "drop table if exists encCrgMapabilityAlign${mer}; \
     create table wgEncodeCrgMapabilityAlign${mer} (fileName varchar(255) not null); \
     insert into wgEncodeCrgMapabilityAlign${mer} values ('/gbdb/hg19/bbi/${nf}');"
     end
     # Then change the subtrack names to match the new table names in
     # kent/src/hg/makeDb/trackDb/human/hg19/trackDb.wgEncode.ra as 
     # the contents of trackDb.enc.ra has been moved there. 

     # 2010-05-12
     # Added subtrack for the new 24mer table. 
     # Updated the downloads for the new data.
     cd /usr/local/apache/htdocs/goldenPath/hg19/encodeDCC
     # Change name of downloads directory to be consistent with the 
     # new track name.
     mv encMapability wgEncodeMapability
     cd wgEncodeMapability
     rm encCrg* md5sum.txt
     cp -p /gbdb/hg19/bbi/wgEncodeCrg*.bw .
     gzip wgEncodeCrg*.bw
     # Then run encodeDownloadsPage.pl script to update the index.html and
     # regenerate the md5sum.txt file.
     encodeDownloadsPage.pl -db=hg19 -checksum index.html 
 
#####################################################################
# tRNAs track (2010-01-13, Fan DONE)
# tRNAs track (2010-03-10, Fan RE-BUILT WITH UPDATED DATA FROM TODD LOWE)
#
    ssh hgwdev
    cd /hive/data/genomes/hg19/bed
    mkdir tRNAs
    cd tRNAs

# Get data files from /projects/lowelab/users/lowe/Browser/vertebrates/

    cp -p /projects/lowelab/users/lowe/Browser/vertebrates/hg19-tRNAs.bed .
    cp -p \
    /projects/lowelab/users/lowe/Browser/vertebrates/hg19_tRNAs_images.tar\
    .

    hgsql hg19 -e 'drop table if exists tRNAs'
    hgLoadBed -tab hg19 tRNAs hg19-tRNAs.bed -sqlTable=$HOME/kent/src/hg/lib/tRNAs.sql

    mkdir gif
    cd gif
    tar -xvf ../hg19_tRNAs_images.tar
    mv image/* .
    rmdir image
    mkdir /hive/data/gbdb/hg19/RNA-img
    cp -p * /hive/data/gbdb/hg19/RNA-img

#####################################################################
# calJac3 Marmoset BLASTZ/CHAIN/NET (DONE - 2010-01-21 - Hiram)
    screen # use a screen to manage this multi-day job
    mkdir /hive/data/genomes/hg19/bed/lastzCalJac3.2010-02-11
    cd /hive/data/genomes/hg19/bed/lastzCalJac3.2010-02-11

    cat << '_EOF_' > DEF
# human vs. marmoset
BLASTZ=lastz
# maximum M allowed with lastz is only 254
BLASTZ_M=254
BLASTZ_Q=/scratch/data/blastz/human_chimp.v2.q
# and place those items here
BLASTZ_O=600
BLASTZ_E=150
# other parameters from panTro2 vs hg18 lastz on advice from Webb
BLASTZ_K=4500
BLASTZ_Y=15000
BLASTZ_T=2

# TARGET: Human Hg19
SEQ1_DIR=/scratch/data/hg19/hg19.2bit
SEQ1_LEN=/scratch/data/hg19/chrom.sizes
SEQ1_CHUNK=20000000
SEQ1_LAP=10000
SEQ1_LIMIT=5

# QUERY: Marmoset (calJac3)
SEQ2_DIR=/scratch/data/calJac3/calJac3.2bit
SEQ2_LEN=/scratch/data/calJac3/chrom.sizes
SEQ2_LIMIT=50
SEQ2_CHUNK=10000000
SEQ2_LAP=0

BASE=/hive/data/genomes/hg19/bed/lastzCalJac3.2010-02-11
TMPDIR=/scratch/tmp
'_EOF_'
    # << this line keeps emacs coloring happy

    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
	`pwd`/DEF \
	-syntenicNet \
	-chainMinScore=5000 -chainLinearGap=medium \
	-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
	> do.log 2>&1 &
    #	real    287m24.258s
    cat fb.hg19.chainCalJac3Link.txt 
    #	2047068864 bases of 2897316137 (70.654%) in intersection

    mkdir /hive/data/genomes/calJac3/bed/blastz.hg19.swap
    cd /hive/data/genomes/calJac3/bed/blastz.hg19.swap
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
	/hive/data/genomes/hg19/bed/lastzCalJac3.2010-02-11/DEF \
	-swap -syntenicNet \
	-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
	-chainMinScore=5000 -chainLinearGap=medium > swap.log 2>&1 &
    #	real    120m42.991s
    cat fb.calJac3.chainHg19Link.txt 
    #	2030475813 bases of 2752505800 (73.768%) in intersection

#############################################################################
# MAKE PCR TARGET FOR UCSC GENES (Done Feb 26, 2010 -Jim)
    ssh hgwdev
    mkdir /cluster/data/hg19/bed/mrnaPcr
    cd /cluster/data/hg19/bed/mrnaPcr
    # First, get consistent FA and PSL for UCSC Genes.
    # Initially I tried to use files from /cluster/data/hg19/bed/ucsc.10/:
    # subColumn 10 /cluster/data/hg19/bed/ucsc.10/rnaToGenome.psl
    #   /cluster/data/hg19/bed/ucsc.10/txToAcc.tab ucscGenes.hg19.psl
    # /cluster/data/hg19/bed/ucsc.10/ucscGenes.fa
    # But the psl was not from exactly the same seq's as in the fa.
    # Jim's suggestion: use sequenceForBed to get genomic-translated
    # sequences, and then genePredToFakePsl.  sequenceToBed must be
    # run on hgwdev.
    genePredToBed /cluster/data/hg19/bed/ucsc.12/ucscGenes.gp > ucscGenes.bed
    hgsql hg19 -NBe 'select kgId,geneSymbol from kgXref' \
    | perl -wpe 's/^(\S+)\t(\S+)/$1\t${1}__$2/ || die;' \
      > idSub.txt
    subColumn 4 ucscGenes.bed idSub.txt ucscGenesIdSubbed.bed
    sequenceForBed -keepName -db=hg19 -bedIn=ucscGenesIdSubbed.bed \
      -fastaOut=stdout \
    | faToTwoBit stdin kgTargetSeq.2bit
    cut -f 1-10 /cluster/data/hg19/bed/ucsc.12/ucscGenes.gp \
    | genePredToFakePsl hg19 stdin kgTargetAli.psl /dev/null

    # Load up the UCSC Genes target PSL table and put 2bit in /gbdb::
    cd /cluster/data/hg19/bed/mrnaPcr
    hgLoadPsl hg19 kgTargetAli.psl
    mkdir /gbdb/hg19/targetDb
    ln -s /cluster/data/hg19/bed/mrnaPcr/kgTargetSeq.2bit /gbdb/hg19/targetDb/

    # Ask cluster-admin to start an untranslated, -stepSize=5 gfServer on
    # /gbdb/hg19/targetDb/kgTargetSeq.2bit .

    ssh hgwdev
    # Add records to hgcentraltest blatServers and targetDb:
    hgsql hgcentraltest -e \
      'INSERT into blatServers values ("hg19Kg", "blat12", 17807, 0, 1);'
    hgsql hgcentraltest -e \
      'INSERT into targetDb values("hg19Kg", "UCSC Genes", \
         "hg19", "kgTargetAli", "", "", \
         "/gbdb/hg19/targetDb/kgTargetSeq.2bit", 1, now(), "");'


########################################################################
# DGV V10 (DATABASE OF GENOMIC VARIANTS) (DONE 11/10/10 angie - color change 2/22/11 #2917)
# 2/22/11 color change (Bug #2917): swap blue and red; green -> brown
# Old DGV format is obsolete; see the following section.

#######################################################################
# DGV BETA (DATABASE OF GENOMIC VARIANTS) (DONE 2/11/13 angie)
    # DGV has changed their data format, and for the time being the data are
    # served by a beta web site, http://dgvbeta.tcag.ca/ ; in time that will
    # replace their current site.
    set today = `date +%y%m%d`
    mkdir -p /hive/data/genomes/hg19/bed/dgv/$today
    cd /hive/data/genomes/hg19/bed/dgv/$today
    wget http://dgvbeta.tcag.ca/dgv/docs/GRCh37_hg19_2012-11-23.txt
    head -1 GRCh37_hg19*.txt
#variantaccession        chr     start   end     varianttype     variantsubtype  reference       pubmedid        method  platform        mergeid mergedorsample  frequency       samplesize      cohortdescription       genes
    # It's more complicated than Gain/Loss/Complex or Inversion now (+ stray commas):
    cut -f 5,6 GRCh37_hg19*.txt | sort | uniq -c | head -100
#  20105 CNV
#   1297 CNV     ""
#  26687 CNV     CNV
#   2980 CNV     Complex
# 185262 CNV     Deletion
#  17032 CNV     Duplication
# 122635 CNV     Gain
#   4120 CNV     Gain+Loss
#  27120 CNV     Insertion
# 447672 CNV     Loss
#    277 OTHER
#     31 OTHER   ""
#     42 OTHER   Complex
#   2477 OTHER   Inversion
#    202 OTHER   Tandem duplication
#      1 varianttype     variantsubtype
    # shuffle fields into bed9+ w/itemRgb
    set purple = "200,0,200"
    set red = "200,0,0"
    set blue = "0,0,200"
    set brown = "139,69,19"
    tail -n +2 GRCh37_hg19*.txt \
    | perl -wpe 'chomp; \
      s/""//; \
      ($id, $chr, $start, $end, $varType, $varSubType, $ref, $pmid, $method, $platform, \
       undef, undef, undef, $sampleSize, $sampleDesc, $genes) = split("\t"); \
      $start-- unless ($start == 0); \
      $landmark = $genes; \
      $landmark =~ s/,/, /g; \
      $varSubType =~ s/^,//;  $varSubType =~ s/,$//; \
      $varTypeOut = "$varType ($varSubType)"; \
      $ref =~ s/_/ /g; \
      $method =~ s/_/ /g;  $method =~ s/,/, /g; \
      $sample = $sampleDesc; \
      $sample .= " (sample size: $sampleSize)" if ($sampleSize); \
      $method .= " ($platform)" if ($platform && $platform ne "Not Provided"); \
      $rgb = "0,0,0"; \
      if ($varType eq "CNV") { \
        if ($varSubType eq "Gain" || $varSubType eq "Insertion" || $varSubType eq "Duplication") {\
          $rgb = "'$blue'"; \
        } elsif ($varSubType eq "Loss" ||$varSubType eq "Deletion") { \
          $rgb = "'$red'"; \
        } elsif ($varSubType eq "") { \
          $varTypeOut = $varType; \
        } else { \
          $rgb = "'$brown'"; \
        } \
      } elsif ($varType eq "OTHER") { \
        if ($varSubType eq "Inversion") { \
          $rgb =  "'$purple'"; \
        } elsif ($varSubType eq "Tandem Duplication") { \
          $rgb = "'$blue'"; \
        } else { \
          $varTypeOut = $varType; \
        } \
      } \
      $_ = join("\t", "chr$chr", $start, $end, $id, 0, "+", \
                $start, $start, $rgb, $landmark, $varTypeOut, \
                $ref, $pmid, $method, $sample) . "\n";' \
        > dgv.bed
    hgLoadBed hg19 dgv dgv.bed \
      -sqlTable=$HOME/kent/src/hg/lib/dgv.sql -renameSqlTable -tab
#Read 857939 elements of size 15 from dgv.bed


#######################################################################
# felCat4 Cat BLASTZ/CHAIN/NET (DONE  - 2010-06-07 - Chin)
    screen # use a screen to manage this multi-day job
    mkdir /hive/data/genomes/hg19/bed/lastzFelCat4.2010-06-07
    cd /hive/data/genomes/hg19/bed/lastzFelCat4.2010-06-07

    cat << '_EOF_' > DEF
# human vs. cat
# maximum M allowed with lastz is only 254
BLASTZ_M=254

# TARGET: Human Hg19
SEQ1_DIR=/scratch/data/hg19/hg19.2bit
SEQ1_LEN=/scratch/data/hg19/chrom.sizes
SEQ1_CHUNK=20000000
SEQ1_LAP=10000
SEQ1_LIMIT=5

# QUERY: Cat (felCat4)
SEQ2_DIR=/scratch/data/felCat4/felCat4.2bit
SEQ2_LEN=/scratch/data/felCat4/chrom.sizes
SEQ2_LIMIT=50
SEQ2_CHUNK=10000000
SEQ2_LAP=0

BASE=/hive/data/genomes/hg19/bed/lastzFelCat4.2010-06-07
TMPDIR=/scratch/tmp
'_EOF_'
    # << this line keeps emacs coloring happy

    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
	`pwd`/DEF \
	-syntenicNet -noDbNameCheck \
	-chainMinScore=3000 -chainLinearGap=medium \
	-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
	> do.log 2>&1 &
    # doBlastzChainNet from step chainRun after para stop, para freeBatch 
    # After para stop para freeBatch in 
    # /hive/data/genomes/hg19/bed/lastzFelCat4.2010-06-07/axtChain/run] 
    # rm the run directory,  and use memk/swarm this time

    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
        `pwd`/DEF \
         -continue chainRun \
         -syntenicNet -noDbNameCheck \
         -chainMinScore=3000 -chainLinearGap=medium \
         -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
         > do_chainRun.log 2>&1 &
    # real    245m43.063s
    # *** All done !  Elapsed time: 245m43s
    # *** Make sure that goldenPath/hg19/vsFelCat4/README.txt is accurate.
    # *** Add {chain,net}FelCat4 tracks to trackDb.ra if necessary.

    cat fb.hg19.chainFelCat4Link.txt 
    #  1266003011 bases of 2897316137 (43.696%) in intersection
    # make it time independent and indicate that it is really done
    cd /hive/data/genomes/hg19/bed 
    ln -s  lastzFelCat4.2010-06-07 lastz.felCat4

    # Swap
    mkdir /hive/data/genomes/felCat4/bed/blastz.hg19.swap
    cd /hive/data/genomes/felCat4/bed/blastz.hg19.swap

    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
	/hive/data/genomes/hg19/bed/lastzFelCat4.2010-06-07/DEF \
	-swap -syntenicNet -noDbNameCheck \
	-workhorse=hgwdev -smallClusterHub=pk -bigClusterHub=swarm \
	-chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 &
    # real    432m36.917s
    cat fb.felCat4.chainHg19Link.txt 
    #   1211702270 bases of 1990635005 (60.870%) in intersection

#####################################################################
# susScr2 Pig BLASTZ/CHAIN/NET (DONE - 2010-03-26,27 - Hiram)
    screen # use a screen to manage this multi-day job
    mkdir /hive/data/genomes/hg19/bed/lastzSusScr2.2010-03-26
    cd /hive/data/genomes/hg19/bed/lastzSusScr2.2010-03-26

    cat << '_EOF_' > DEF
# Pig vs. Human
BLASTZ_M=50

# TARGET: Human Hg19
SEQ1_DIR=/scratch/data/hg19/nib
SEQ1_LEN=/scratch/data/hg19/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000

# QUERY: Pig SusScr2
SEQ2_DIR=/scratch/data/susScr2/susScr2.2bit
SEQ2_LEN=/scratch/data/susScr2/chrom.sizes
SEQ2_CHUNK=10000000
SEQ2_LAP=0

BASE=/hive/data/genomes/hg19/bed/lastzSusScr2.2010-03-26
TMPDIR=/scratch/tmp
'_EOF_'
    # << this line keeps emacs coloring happy

    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
	`pwd`/DEF \
	-noLoadChainSplit -syntenicNet \
	-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
	-chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 &
    #	Elapsed time: 842m23s
    cat fb.hg19.chainSusScr2Link.txt 
    #	1198794058 bases of 2897316137 (41.376%) in intersection

    mkdir /hive/data/genomes/susScr2/bed/blastz.hg19.swap
    cd /hive/data/genomes/susScr2/bed/blastz.hg19.swap
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
	/hive/data/genomes/hg19/bed/lastzSusScr2.2010-03-26/DEF \
	-swap -noLoadChainSplit -syntenicNet \
	-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=pk \
	-chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 &
    #	Elapsed time: 112m40s

    cat fb.susScr2.chainHg19Link.txt 
    #	1272785114 bases of 2231298548 (57.042%) in intersection

#########################################################################
# Vega gene update (DONE - 2010-04-07 - Hiram)
    #	lookup version number at the Vega WEB site:
    #	http://vega.sanger.ac.uk/index.html
    #	and FTP site:
    #	ftp://ftp.sanger.ac.uk/pub/vega/
    cd /hive/data/genomes/hg19
    #	step wise to verify operation
    doEnsGeneUpdate.pl -vegaGene -ensVersion=38 -stop=download hg19.ensGene.ra
    doEnsGeneUpdate.pl -vegaGene -ensVersion=38 \
	-continue=process -stop=process hg19.ensGene.ra
# genePredCheck -db=hg19 vegaPseudo.gp.gz
# checked: 11590 failed: 0
# genePredCheck -db=hg19 not.vegaPseudo.gp.gz
# checked: 96345 failed: 0
# genePredCheck -db=hg19 hg19.allGenes.gp.gz
# checked: 107935 failed: 0
    doEnsGeneUpdate.pl -vegaGene -ensVersion=38 \
	-continue=load -stop=load hg19.ensGene.ra
# zcat: download/Homo_sapiens.VEGA.38.pep.all.fa.gz: unexpected end of file
    #	they changed their file name convention ...
    doEnsGeneUpdate.pl -vegaGene -ensVersion=38 \
	-continue=cleanup hg19.ensGene.ra
    featureBits hg19 vegaGene
    # 74206453 bases of 2897316137 (2.561%) in intersection
    featureBits hg19 vegaPseudoGene
    # 8494715 bases of 2897316137 (0.293%) in intersection

#####################################################################
# oviAri1 Sheep BLASTZ/CHAIN/NET (DONE - 2010-04-16 - Chin)
    screen # use a screen to manage this multi-day job
    mkdir /hive/data/genomes/hg19/bed/lastzOviAri1.2010-04-16
    cd /hive/data/genomes/hg19/bed/lastzOviAri1.2010-04-16

    cat << '_EOF_' > DEF
# Sheep vs. Human
BLASTZ_M=50

# TARGET: Mouse Mm9
SEQ1_DIR=/scratch/data/hg19/nib
SEQ1_LEN=/scratch/data/hg19/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000

# QUERY: Sheep OviAri1
SEQ2_DIR=/scratch/data/oviAri1/oviAri1.2bit
SEQ2_LEN=/scratch/data/oviAri1/chrom.sizes
SEQ2_CHUNK=10000000
SEQ2_LAP=0

BASE=/hive/data/genomes/hg19/bed/lastzOviAri1.2010-04-16
TMPDIR=/scratch/tmp
'_EOF_'
    # << this line keeps emacs coloring happy

    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
        `pwd`/DEF \
        -noLoadChainSplit -syntenicNet \
        -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=pk \
        -chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 &
    #   real    578m58.918s
    cat fb.hg19.chainOviAri1Link.txt
    #   878545517 bases of 2897316137 (30.323%) in intersection

    #   and the swap
    mkdir /hive/data/genomes/oviAri1/bed/blastz.hg19.swap
    cd /hive/data/genomes/oviAri1/bed/blastz.hg19.swap
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
        /hive/data/genomes/hg19/bed/lastzOviAri1.2010-04-16/DEF \
        -swap -noLoadChainSplit -syntenicNet \
        -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=pk \
        -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 &
    #   real    72m47.780s

    cat fb.oviAri1.chainHg19Link.txt
    #   824310420 bases of 1201271277 (68.620%) in intersection
    

######################################################################## 
# H-Inv 7.0 Gene track (DONE - 2010-04-07 - Hiram)
    mkdir /hive/data/genomes/hg19/bed/hinv
    cd /hive/data/genomes/hg19/bed/hinv
    ./hinvToBed12.pl go > broken.1.exons.txt
    hgLoadBed hg19 hinv70Coding fcdna.coding.bed
    #	Loaded 272257 elements of size 12
    featureBits hg19 hinv70Coding
    #	141717797 bases of 2897316137 (4.891%) in intersection

    hgLoadBed hg19 hinv70NonCoding fcdna.nonCoding.bed
    #	Loaded 22625 elements of size 12
    featureBits hg19 hinv70NonCoding 
    #	1350960 bases of 2897316137 (0.047%) in intersection

    hgLoadBed hg19 hinv70PseudoGene fcdna.pseudoGene.bed
    #	Loaded 1166 elements of size 12
    featureBits hg19 hinv70PseudoGene
    #	1701647 bases of 2897316137 (0.059%) in intersection

    featureBits hg19 hinv70Coding hinv70PseudoGene
    #	619377 bases of 2897316137 (0.021%) in intersection

    featureBits hg19 hinv70Coding hinv70NonCoding
    #	912553 bases of 2897316137 (0.031%) in intersection

    featureBits hg19 hinv70PseudoGene hinv70NonCoding
    #	9642 bases of 2897316137 (0.000%) in intersection

######################################################################## 
# Updating the ucscToEnsembl table (DONE - 2010-04-06 - Hiram)
    #	as of Ensembl V57, their naming scheme changed for the randoms
    cd /hive/data/genomes/hg19/bed/ucscToEnsembl
cat ../../chrom.sizes | while read L
do
    size=`echo $L | awk '{print $2}'`
    ucName=`echo $L | awk '{print $1}'`
    ensName=`echo $ucName | sed -e "s/^chrM/MT/; s/^chr//;"`
    case $ucName in
        chr17_ctg5_hap1) ensName="HSCHR17_1"
                ;;
        chr4_ctg9_hap1) ensName="HSCHR4_1"
                ;;
        chr6_apd_hap1) ensName="HSCHR6_MHC_APD"
                ;;
        chr6_cox_hap2) ensName="HSCHR6_MHC_COX"
                ;;
        chr6_dbb_hap3) ensName="HSCHR6_MHC_DBB"
                ;;
        chr6_mann_hap4) ensName="HSCHR6_MHC_MANN"
                ;;
        chr6_mcf_hap5) ensName="HSCHR6_MHC_MCF"
                ;;
        chr6_qbl_hap6) ensName="HSCHR6_MHC_QBL"
                ;;
        chr6_ssto_hap7) ensName="HSCHR6_MHC_SSTO"
                ;;
        *_gl*)
ensName=`echo $L | awk '{print $1}' | sed -e "s/^chr.*_gl/GL/; s/_random//"`
                ;;
    esac
    echo -e "$ucName\t$ensName"
done > ucscToEnsemblV57.tab

    hgsql hg19 -e 'delete from ucscToEnsembl where ucsc like "%";'
    hgsql hg19 -e \
'LOAD DATA LOCAL INFILE "ucscToEnsemblV57.tab" INTO TABLE ucscToEnsembl'


############################################################################
# dbSNP BUILD 131 (SNP131) (DONE 5/25/10 angie - TWEAKED 8/4/10)
# Originally done 4/15/10 -- updated 5/25 with corrected function codes from
# dbSNP (b131_SNPContigLocusId_37_1.bcp.gz).
    # Set up build directory
    mkdir -p /hive/data/outside/dbSNP/131/{human,shared}

    # Get field encodings -- if there are changes or additions to the
    # encoding of the corresponding fields, you might need to update
    # snpNcbiToUcsc, hgTracks, hgc and hgTrackUi (see also
    # hg/lib/snp125Ui.c).
    cd /hive/data/outside/dbSNP/131/shared
    alias wg wget --timestamping
    set ftpShared = ftp://ftp.ncbi.nih.gov/snp/database/shared_data
    wg $ftpShared/LocTypeCode.bcp.gz
    wg $ftpShared/SnpClassCode.bcp.gz
    wg $ftpShared/SnpFunctionCode.bcp.gz
    wg $ftpShared/SnpValidationCode.bcp.gz
    # Here is another source -- it is not as up-to-date as the above, but
    # our encodings (enums and sets in snp131.sql) are named more similar
    # to those in the 2005 ASN:
    # ftp://ftp.ncbi.nih.gov/snp/specs/docsum_2005.asn

    ########################## DOWNLOAD #############################
    cd /hive/data/outside/dbSNP/131/human
    mkdir data schema rs_fasta
    # Get data from NCBI (anonymous FTP)
    set ftpSnpDb = ftp://ftp.ncbi.nih.gov/snp/organisms/human_9606/database
    wg ftp://ftp.ncbi.nih.gov/snp/00readme.txt
    cd /hive/data/outside/dbSNP/131/human/data
    # ContigLoc table has coords, orientation, loc_type, and refNCBI allele
    wg $ftpSnpDb/organism_data/b131_SNPContigLoc_37_1.bcp.gz
    wg $ftpSnpDb/organism_data/b131_SNPContigLocusId_37_1.bcp.gz
    wg $ftpSnpDb/organism_data/b131_ContigInfo_37_1.bcp.gz
    # MapInfo has alignment weights
    wg $ftpSnpDb/organism_data/b131_SNPMapInfo_37_1.bcp.gz
    # SNP has univar_id, validation status and heterozygosity
    wg $ftpSnpDb/organism_data/SNP.bcp.gz

    # Get schema
    cd /hive/data/outside/dbSNP/131/human/schema
    wg $ftpSnpDb/organism_schema/human_9606_table.sql.gz
    wg $ftpSnpDb/shared_schema/dbSNP_main_table.sql.gz

    # Get fasta files
    # using headers of fasta files for molType, class, observed
    cd /hive/data/outside/dbSNP/131/human/rs_fasta
    wg ftp://ftp.ncbi.nih.gov/snp/organisms/human_9606/rs_fasta/\*.gz

    ########################## LOAD NCBI TABLES #############################
    # Simplify names of data files -- strip version & extras to get
    # local canonical table names.
    cd /hive/data/outside/dbSNP/131/human/data
    foreach f (*.bcp.gz)
      set new = `echo $f \
                 | sed -e 's/^b131_SNP//; s/^b131_//; s/_37_1//; s/.bcp//;'`
      mv $f $new
      echo $new
    end

    cd /hive/data/outside/dbSNP/131/human/schema
    zcat human_9606_table.sql.gz \
    | perl -we '$/ = "\nGO\n\n\n"; \
        while (<>) { \
          next unless /^CREATE TABLE \[(b131_(SNP)?)?(ContigInfo|ContigLoc|ContigLocusId|MapInfo|SNP)(_37_1)?\]/; \
          s/b131_(SNP)?//; s/_37_1//; \
          s/[\[\]]//g;  s/GO\n\n/;/;  s/smalldatetime/datetime/g; \
          s/ON PRIMARY//g;  s/COLLATE//g;  s/Latin1_General_BIN//g; \
          s/IDENTITY (1, 1) NOT NULL /NOT NULL AUTO_INCREMENT, PRIMARY KEY (id)/g; \
          s/nvarchar/varchar/g;  s/set quoted/--set quoted/g; \
          s/(image|varchar\s+\(\d+\))/BLOB/g; \
          print; \
        }' \
      > table.sql

    # load on hgwdev (kolossus disk almost full, no more small cluster mysql5's):
    hgsql -e 'create database hg19snp131'
    cd /hive/data/outside/dbSNP/131/human/schema
    hgsql hg19snp131 < table.sql
    cd ../data

    # Avoid wasting space by excluding mappings to non-reference contigs (ContigInfo.group_label):
    zcat ContigInfo.gz | cut -f 12 | uniq | sort -u
#CRA_TCAGchr7v2
#Celera
#GRCh37
#Homo sapiens MT
#HuRef
    foreach t (ContigInfo MapInfo ContigLocusId)
      zcat $t.gz \
      | egrep -vw '(Celera|HuRef|CRA_TCAGchr7v2)' \
      | perl -wpe 's/(\d\d:\d\d:\d\d)\.0/$1/g;' \
      | hgLoadSqlTab -oldTable hg19snp131 $t placeholder stdin
    end

    # Compare contig list between our ctgPos and reference contigs in ContigInfo.
    # If they are identical, sweet, we probably have a $db/jkStuff/liftContigs.lft
    # or similar file to use below.  If they are not identical, need to make 
    # lift file using available information.
    hgsql hg19 -N -B -e 'select contig from ctgPos;' \
    | sort > /tmp/1
    # (HuRef, Celera, CRA_TCAGchr7v2 grepped out above)
    hgsql hg19snp131 -N -B -e 'select contig_acc from ContigInfo;' | sort > /tmp/2
    diff /tmp/1 /tmp/2
    # Doh!  Completely different: ctgPos has GL*, ContigInfo has NC_* / NT_*
    # We will need to generate own liftUp file for N*_* contig IDs.

    # NC_001807 entrez sez "Record removed.This sequence was removed
    # since the accepted reference sequence for the Homo sapiens
    # mitochondrion is the rCRS/Mitomap sequence, which is now
    # available as the record NC_012920".  
    # They align w/gaps on both q & t, so liftUp won't do, we need liftOver:
    blat -noHead NC_012920.fa /hive/data/genomes/hg19/M/chrM.fa stdout \
    | axtChain -psl -linearGap=medium stdin -faT NC_012920.fa /hive/data/genomes/hg19/hg19.2bit \
        NC_012920ToChrM.over.chain

    # NT_004350: entrez sez:
#COMMENT     REFSEQ INFORMATION: Features on this sequence have been produced
#            for build 37 version 1 of the NCBI's genome annotation [see
#            documentation].   The reference sequence is identical to
#            GL000003.1.

    # Using the contigs named in ContigInfo, screen-scrape genbank to get GL ID for contig ID.
    cp /dev/null contigToGl.txt
    foreach nt (`hgsql hg19snp131 -N -B -e 'select contig_acc from ContigInfo;'`)
      wget --quiet -O - 'http://www.ncbi.nlm.nih.gov/sviewer/viewer.fcgi?tool=portal&amp;db=nuccore&amp;val='$nt'&amp;dopt=genbank&amp;sendto=on' \
      | perl -we 'while (<>) { \
                    if (/^LOCUS/) { \
                      m/^LOCUS\s+'$nt'\s+(\d+) bp/ || die "parse ('$nt'): $_\t"; \
		      $size = $1; \
                    } elsif (/^            (GL\d+)\.\d+\.$/) { \
                      print "'$nt'\t$1\t$size\n"; \
		    } \
                  }' \
        >> contigToGl.txt
    end
    hgsql hg19 -NBe 'select chromStart, SUBSTRING_INDEX(contig, ".", 1), \
                       ctgPos.size, ctgPos.chrom, chromInfo.size \
                       from ctgPos,chromInfo \
                       where ctgPos.chrom=chromInfo.chrom order by contig' \
      > glToLift.txt
    sort -k2,2 contigToGl.txt \
    | join -1 2 -2 2 -t"	" -o 1.1,2.1,1.3,1.4,1.5 -a 2 -e MISSING \
        glToLift.txt - \
      > /hive/data/genomes/hg19/jkStuff/liftContigs.lft

    # Manually add NC_001807 -> chrM just in case:
    echo "0	NC_001807	16571	chrM	16571" \
      >> /hive/data/genomes/hg19/jkStuff/liftContigs.lft
    # Blat NC_012920 to chrM shows gaps, so we'll need to use liftOver chain created above.

    # Make sure there are no orient != 0 contigs among those selected.
    hgsql hg19snp131 -NBe \
      'select count(*) from ContigInfo where orient != 0;'
#0

    # ContigLoc is huge, and we want just the reference contig mappings.
    # So, based on the reference & haplo ctg_id values in ContigInfo,
    # filter to get just the mappings for those contigs:
    zcat ContigLoc.gz \
    | awk '$3 <= 9 || $3 == 6647 || $3 >= 11178' \
    | perl -wpe 's/(\d\d:\d\d:\d\d)\.0/$1/g;' \
    | hgLoadSqlTab -oldTable hg19snp131 ContigLoc placeholder stdin
#Warning 1366 Incorrect integer value: '' for column 'loc_sts_uid' at row 1
#Warning 1366 Incorrect integer value: '' for column 'loc_sts_uid' at row 2
#Warning 1366 Incorrect integer value: '' for column 'loc_sts_uid' at row 3
#Warning 1366 Incorrect integer value: '' for column 'loc_sts_uid' at row 4
#load of ContigLoc did not go as planned: 27500025 record(s), 0 row(s) skipped, 3273 warning(s) loading /dev/stdin
    zcat SNP.gz \
    | perl -wpe 's/(\d\d:\d\d:\d\d)\.0/$1/g;' \
    | hgLoadSqlTab -oldTable hg19snp131 SNP placeholder stdin
#Warning 1366 Incorrect integer value: '' for column 'CpG_code' at row 1
#Warning 1366 Incorrect integer value: '' for column 'map_property' at row 1
#Warning 1264 Out of range value adjusted for column 'last_updated_time' at row 2
#Warning 1366 Incorrect integer value: '' for column 'CpG_code' at row 2
#Warning 1366 Incorrect integer value: '' for column 'map_property' at row 2
    # ... no big deal.
    foreach t (ContigInfo ContigLoc ContigLocusId MapInfo SNP)
     echo -n "${t}:\t"
      hgsql -N -B hg19snp131 -e 'select count(*) from '$t
    end
#ContigInfo:           260
#ContigLoc:       27500025
#ContigLocusId:   55347972
#MapInfo:         23619373
#SNP:    	  23653729


    #################### EXTRACT INFO FROM NCBI TABLES ####################
    # Glom each SNP's function codes together and load up a new hg19Snp131 table.
    # Also extract NCBI's annotations of coding SNPs' effects on translation.
    # We extract ContigLocusId info only for reference assembly mapping.
    # Some SNP's functional annotations are for an alternate assembly, so we will
    # have no NCBI functional annotations to display for those (but our own are 
    # available).
    cd /hive/data/outside/dbSNP/131/human
    # Add indices to tables for a big join (5 or 6 minutes):
    hgsql hg19snp131 -e \
      'alter table ContigInfo add index (ctg_id); \
       alter table ContigLocusId add index (ctg_id);'
    hgsql hg19snp131 -NBe 'select snp_id, ci.contig_acc, asn_from, asn_to, mrna_acc, \
                           fxn_class, reading_frame, allele, residue, codon, cli.ctg_id \
                           from ContigLocusId as cli, ContigInfo as ci \
                           where cli.ctg_id = ci.ctg_id;' \
      > ncbiFuncAnnotations.txt
    wc -l ncbiFuncAnnotations.txt
#16835438 ncbiFuncAnnotations.txt
    # Ignore function code 8 (cds-reference, just means that some allele matches reference)
    # and glom functions for each SNP id:
    cut -f 1-4,6,11 ncbiFuncAnnotations.txt \
    | sort -u -k1n,1n -k6n,6n -k3n,3n -k5n,5n \
    | perl -we 'while (<>) { chomp; \
                  ($id, undef, $s, $e, $f, $c) = split; \
                  if (defined $prevId && $id == $prevId && $c == $prevC && $s == $prevS) { \
                    $prevFunc .= "$f," unless ($f == 8); \
                  } else { \
                    if (defined $prevId) { \
                      print "$prevId\t$prevC\t$prevS\t$prevE\t$prevFunc\n" if ($prevFunc); \
                    } \
                    $prevFunc = ($f == 8) ? "" : "$f,"; \
                  } \
                  ($prevId, $prevC, $prevS, $prevE) = ($id, $c, $s, $e); \
                } \
                print "$prevId\t$prevC\t$prevS\t$prevE\t$prevFunc\n" if ($prevFunc);' \
      > ucscFunc.txt
    wc -l ucscFunc.txt
#10328697 ucscFunc.txt
    cat > ucscFunc.sql <<EOF
CREATE TABLE ucscFunc (
        snp_id int NOT NULL ,
        ctg_id int(10) NOT NULL ,
        asn_from int(10) NOT NULL ,
        asn_to int(10) NOT NULL ,
        fxn_class varchar(255) NOT NULL ,
        INDEX snp_id (snp_id),
        INDEX ctg_id (ctg_id)
);
EOF
    hgLoadSqlTab hg19snp131 ucscFunc{,.sql,.txt}
    # 10/12/10: Those coords are NCBI's 0-based, fully-closed, 2-base-wide insertions.
    # We need to leave the coords alone here so ucscFunc can be joined below.
    # Make a list of SNPs with func anno's that are insertion SNPs, so we can use 
    # the list to determine what type of coord fix to apply to each annotation
    # when making snp130CodingDbSnp below.
    hgsql hg19snp131 -NBe \
      'select ci.contig_acc, cl.asn_from, cl.asn_to, uf.snp_id \
       from ucscFunc as uf, ContigLoc as cl, ContigInfo as ci \
       where uf.snp_id = cl.snp_id and \
             uf.ctg_id = cl.ctg_id and uf.asn_from = cl.asn_from and uf.asn_to = cl.asn_to and \
             cl.loc_type = 3 and \
             cl.ctg_id = ci.ctg_id' \
      > ncbiFuncInsertions.ctg.bed
    wc -l ncbiFuncInsertions.ctg.bed
#1165272 ncbiFuncInsertions.ctg.bed

    # Extract observed allele, molType and snp class from FASTA headers gnl
    # 4/13: found some inconsistent headers in rs_chPAR.fas.gz vs. other rs_ch*,
    # reported to dbSNP, Lon said that rs_chPAR.fas.gz snuck in from build 130!
    rm /hive/data/outside/dbSNP/131/human/rs_fasta/rs_chPAR.fas.gz
    zcat /hive/data/outside/dbSNP/131/human/rs_fasta/rs_ch*.fas.gz \
    | grep '^>gnl' \
    | perl -wpe 's/^\S+rs(\d+) .*mol="(\w+)"\|class=(\d+)\|alleles="([^"]+)"\|build.*/$1\t$4\t$2\t$3/ || die "Parse error line $.:\n$_\n\t";' \
    | sort -nu \
      > ucscGnl.txt
#520.305u 74.766s 7:02.48 140.8% 0+0k 0+0io 0pf+0w
    wc -l ucscGnl.txt
#23653726 ucscGnl.txt
    cut -f 1 ucscGnl.txt | uniq | wc -l
#23653726
    cat > ucscGnl.sql <<EOF
CREATE TABLE ucscGnl (
        snp_id int NOT NULL ,
        observed varchar(255) NOT NULL,
        molType varchar(255) NOT NULL,
        class varchar(255) NULL ,
        INDEX snp_id (snp_id)
);
EOF
    hgLoadSqlTab hg19snp131 ucscGnl{,.sql,.txt}

    # Add indices to tables for a big join (5 or 6 minutes):
    hgsql hg19snp131 -e \
      'alter table ContigLoc  add index (ctg_id); \
       alter table SNP        add index (snp_id); \
       alter table MapInfo    add index (snp_id);'

    # Big leftie join to bring together all of the columns that we want in snp131,
    # using all of the available joining info:
    hgsql hg19snp131 -NBe \
     'SELECT ci.contig_acc, cl.asn_from, cl.asn_to, cl.snp_id, cl.orientation, cl.allele, \
             ug.observed, ug.molType, ug.class, \
             s.validation_status, s.avg_heterozygosity, s.het_se, \
             uf.fxn_class, cl.loc_type, mi.weight, cl.phys_pos_from \
      FROM \
      ((((ContigLoc as cl JOIN ContigInfo as ci \
               ON cl.ctg_id = ci.ctg_id) \
          LEFT JOIN MapInfo as mi ON mi.snp_id = cl.snp_id and mi.assembly = ci.group_label) \
         LEFT JOIN SNP as s ON s.snp_id = cl.snp_id) \
        LEFT JOIN ucscGnl as ug ON ug.snp_id = cl.snp_id) \
       LEFT JOIN ucscFunc as uf ON uf.snp_id = cl.snp_id and uf.ctg_id = cl.ctg_id \
                                and uf.asn_from = cl.asn_from;' \
      > ucscNcbiSnp.ctg.bed
#75.815u 13.622s 32:04.35 4.6%   0+0k 0+0io 0pf+0w
    wc -l ucscNcbiSnp.ctg.bed 
#27500025 ucscNcbiSnp.ctg.bed
    # Use liftUp for everything except mito, then liftOver for mito:
    # There are some weird cases of length=1 but locType=range... in all the cases 
    # that I checked, the length really seems to be 1 so I'm not sure where they got 
    # the locType=range.  Tweak locType in those cases so we can keep those SNPs:
    grep -vw ^NC_012920 ucscNcbiSnp.ctg.bed \
    | awk -F"\t" 'BEGIN{OFS="\t";}  $2 == $3 && $14 == 1 {$14=2; numTweaked++;}  {print;} \
           END{print numTweaked, "single-base, locType=range, tweaked locType" > "/dev/stderr";}' \
    | liftUp ucscNcbiSnp.bed \
      /hive/data/genomes/hg19/jkStuff/liftContigs.lft warn stdin
#2535    single-base, locType=range, tweaked locType
#392.182u 27.358s 7:20.66 95.2%  0+0k 0+0io 0pf+0w
    # For liftOver, convert 0-base fully-closed to 0-based half-open because liftOver
    # doesn't deal with 0-base items.  Fake out phys_pos_from to 0 because many coords
    # will differ, oh well.
    grep -w NC_012920 ucscNcbiSnp.ctg.bed \
    | awk -F"\t" 'BEGIN{OFS="\t";} {$3 += 1; $16 = 0; print;}' \
    | liftOver -bedPlus=3 stdin NC_012920ToChrM.over.chain stdout chrM.unmapped \
    | awk -F"\t" 'BEGIN{OFS="\t";} {$3 -= 1; print;}' \
    | sort -k2n,2n \
      > chrMNcbiSnp.bed
#3.479u 2.428s 0:53.57 10.9%     0+0k 0+0io 4pf+0w
    # Good, got all but 2 SNPS (rs28693675 and rs55749223, partially deleted / deleted in new)
    cat chrMNcbiSnp.bed >> ucscNcbiSnp.bed
    wc -l ucscNcbiSnp.bed
#27500023 ucscNcbiSnp.bed

    # Translate NCBI's encoding into UCSC's, and perform a bunch of checks.
    # This is where developer involvement is most likely as NCBI extends the 
    # encodings used in dbSNP.
    cd /hive/data/outside/dbSNP/131/human/
    snpNcbiToUcsc ucscNcbiSnp.bed /hive/data/genomes/hg19/hg19.2bit snp131
#spaces stripped from observed:
#chr12   6093134 6093134 rs41402545
#count of snps with weight  0 = 67535
#count of snps with weight  1 = 23023681
#count of snps with weight  2 = 472416
#count of snps with weight  3 = 2536961
#count of snps with weight 10 = 1399430
#Skipped 7 snp mappings due to errors -- see snp131Errors.bed
#173.162u 5.982s 7:57.91 37.4%   0+0k 0+0io 3pf+0w
    head snp131Errors.bed
#chr13   32953907        32954033        rs80359736      rs80359736 is 126 bases long but refNCBI is different length: CATCATCAGATTTATATTCTCTGTTAACAGAAGGAAAGAGATACAGAATTTATCATCTTGCAACTTCAAAATCTAAAAGTAAATCTGAAAGAGCTAACAT
#chr17   41223118        41223133        rs80359888      Missing observed value (deleted SNP?).
#chr17   41245687        41245900        rs80359886      rs80359886 is 213 bases long but refNCBI is different length: AATATGCCTGGTAGAAGACTTCCTCCTCAGCCTATTCTTTTTAGGTGCTTTTGAATTGTGGATATTTAATTCGAGTTCCATATTGCTTATACTGCTGCTT
#chr17   41245687        41245900        rs80359886      Missing observed value (deleted SNP?).
#chr17   41276085        41276094        rs80359887      Missing observed value (deleted SNP?).
#chrM    308     310     rs66492218      Unexpected coords for locType "between" (3) -- expected NCBI's chrEnd = chrStart+1.
#chrM    308     310     rs66492218      rs66492218 is 2 bases long but refNCBI is different length: -
    wc -l snp*
#  26033053 snp131.bed
#        22 snp131.sql
#         7 snp131Errors.bed
#        18 snp131ExceptionDesc.tab
#   4281351 snp131Exceptions.bed
    # 8M new snps, lots more exceptions than snp130 (had 2631563)

    # Make one big fasta file.
    # It's a monster: 18G!  Can we split by hashing rsId?
    zcat rs_fasta/rs_ch*.fas.gz \
    | perl -wpe 's/^>gnl\|dbSNP\|(rs\d+) .*/>$1/ || ! /^>/ || die;' \
      > snp131.fa
    # Check for duplicates.
    grep ^\>rs snp131.fa | sort > /data/tmp/seqHeaders
    wc -l /data/tmp/seqHeaders
#23653726 /data/tmp/seqHeaders
    uniq /data/tmp/seqHeaders | wc -l
#23653726
    # Use hgLoadSeq to generate .tab output for sequence file offsets,
    # and keep only the columns that we need: acc and file_offset.
    # Index it and translate to snpSeq table format.
    hgLoadSeq -test placeholder snp131.fa
#23653726 sequences
#128.364u 25.531s 10:52.02 23.6% 0+0k 0+0io 0pf+0w
    cut -f 2,6 seq.tab > snp131Seq.tab
    rm seq.tab

    # Load up main track tables.
    cd /hive/data/outside/dbSNP/131/human
    hgLoadBed -tab -onServer -tmpDir=/data/tmp -allowStartEqualEnd \
      hg19 snp131 -sqlTable=snp131.sql snp131.bed
#Loaded 26033053 elements of size 17
#162.666u 19.611s 8:53.56 34.1%  0+0k 0+0io 0pf+0w
    hgLoadBed -tab -onServer -tmpDir=/data/tmp -allowStartEqualEnd \
      hg19 snp131Exceptions -sqlTable=$HOME/kent/src/hg/lib/snp125Exceptions.sql -renameSqlTable \
      snp131Exceptions.bed
#Loaded 4281351 elements of size 5
#32.020u 2.006s 1:22.87 41.0%    0+0k 0+0io 0pf+0w
    hgLoadSqlTab hg19 snp131ExceptionDesc ~/kent/src/hg/lib/snp125ExceptionDesc.sql \
      snp131ExceptionDesc.tab
    # Load up sequences.
    mkdir -p /gbdb/hg19/snp
    ln -s /hive/data/outside/dbSNP/131/human/snp131.fa /gbdb/hg19/snp/snp131.fa
    hgLoadSqlTab hg19 snp131Seq ~/kent/src/hg/lib/snpSeq.sql snp131Seq.tab

    # Put in a link where one would expect to find the track build dir...
    ln -s /hive/data/outside/dbSNP/131/human /hive/data/genomes/hg19/bed/snp131

#*** NOTE FOR NEXT TIME: ask cluster-admin to pack the snp131 table

    # Look at the breakdown of exception categories:
    cd /hive/data/outside/dbSNP/131/human
    cut -f 5 snp131Exceptions.bed | sort | uniq -c | sort -nr
#3088435 MultipleAlignments
# 886159 ObservedMismatch
#  92341 SingleClassTriAllelic
#  70184 SingleClassZeroSpan
#  43319 ObservedTooLong
#  25745 MixedObserved
#  22606 SingleClassLongerSpan
#  19681 SingleClassQuadAllelic
#  15245 FlankMismatchGenomeShorter
#   9808 DuplicateObserved
#   4463 NamedDeletionZeroSpan
#   2040 FlankMismatchGenomeLonger
#    802 ObservedContainsIupac
#    317 NamedInsertionNonzeroSpan
#    142 FlankMismatchGenomeEqual
#     62 RefAlleleMismatch
#      1 RefAlleleRevComp
#      1 ObservedWrongFormat
    # Compared to snp130, nice to see fewer disfunctional locTypes (FlankMismatch*)
    # and singleClassQuadAllelic -- major increases in most others though.
    # Sent a few bug reports to dbSNP

    # Tweaked 8/4/10 to correct missing-func (loophole fixed in perl that generates
    # ucscFunc.txt above).
    hgsql hg19 -e "update snp131 set func = 'unknown' where name = 'rs75946332' and func = '';"


############################################################################
# SPLIT SNP131 INTO CLINICAL / NON-CLINICAL (DONE 8/19/10 angie)
# http://redmine.soe.ucsc.edu/issues/559
    cd /hive/data/outside/dbSNP/131/human/data
    wget --timestamping ftp://ftp.ncbi.nlm.nih.gov/snp/database/organism_data/human_9606/SNP_bitfield.bcp.gz
    # I did a little analysis of the bitfields -- see file 
    # /hive/data/outside/dbSNP/131/human/data/bitfield_breakdown.txt .
    cd /hive/data/outside/dbSNP/131/human
    zcat data/SNP_bitfield.bcp.gz \
    | perl -wpe '@w = split;  if ($w[3] & 0x40) { $_ = "rs$w[0]\n" } else { $_ = ""; }' \
      > clinicalRsIds.txt
    wc -l clinicalRsIds.txt
#16605 clinicalRsIds.txt
    grep -Fwf clinicalRsIds.txt snp131.bed > snp131Clinical.bed
    wc -l snp131Clinical.bed
#14907 snp131Clinical.bed
    # Wow, just a subset have been mapped to hg19, bummer.
    grep -vFwf clinicalRsIds.txt snp131.bed > snp131NonClinical.bed
    wc -l snp131*Clinical.bed
#     14907 snp131Clinical.bed
#  26018146 snp131NonClinical.bed
#  26033053 total
    # Good, 26033053 is the right total.
    # Edit snp131.sql to use table name "snp131Tmp" so we don't nuke snp131.
    hgLoadBed -tab -onServer -tmpDir=/data/tmp -allowStartEqualEnd \
      hg19 snp131Clinical -sqlTable=snp131.sql -renameSqlTable snp131Clinical.bed
#Loaded 14907 elements of size 17
    hgLoadBed -tab -onServer -tmpDir=/data/tmp -allowStartEqualEnd \
      hg19 snp131NonClinical -sqlTable=snp131.sql -renameSqlTable snp131NonClinical.bed
#Loaded 26018146 elements of size 17
    # 9/1/10: Don't forget to fix the empty-func bug!
    hgsql hg19 -e 'update snp131Clinical set func = "unknown" where func = "";'


############################################################################
# SPLIT SNP131 INTO MULTIMAPPED / HAPMAP+1000GENOMES / MISC (DONE 11/5/10 angie)
# another swipe at http://redmine.soe.ucsc.edu/issues/559
    cd /hive/data/outside/dbSNP/131/human

    # First, separate out the "SNPs" that map to multiple genomic loci:
    grep -Fw MultipleAlignments snp131Exceptions.bed | wc -l
#3088435
    grep -Fw MultipleAlignments snp131Exceptions.bed \
    | cut -f 4 \
    | sort -u > multipleMappingIds.txt
    wc -l multipleMappingIds.txt
#978068 multipleMappingIds.txt
    grep -Fwf multipleMappingIds.txt snp131.bed > snp131NonUnique.bed
    wc -l snp131NonUnique.bed
#3088435 snp131NonUnique.bed
    grep -vFwf multipleMappingIds.txt snp131.bed > snp131Unique.bed
    wc -l snp131Unique.bed
#22944618 snp131Unique.bed

    # Next, separate the uniquely mapped SNPs into HapMap and/or 1000 Genomes
    # (the ones that we are certain have been observed in a large number of 
    # apparently healthy samples) vs. other ones (rarer SNPs, maybe clinical SNPs).

#*** NOTE *** To do this right, we need to get allele freq data from dbSNP and use it
#             as a filter here:

    egrep -w 'by-hapmap|by-1000genomes' snp131Unique.bed > snp131Common.bed
    wc -l snp131Common.bed
#12750453 snp131Common.bed
    egrep -vw 'by-hapmap|by-1000genomes' snp131Unique.bed > snp131Misc.bed
    wc -l snp131Misc.bed
#10194165 snp131Misc.bed

    # How many "clinical" SNPs (i.e. included in a Locus-Specific Database) are in the common set?
    grep -Fwf clinicalRsIds.txt snp131Common.bed > snp131CommonButClinical.bed
    wc -l snp131CommonButClinical.bed
#2373 snp131CommonButClinical.bed
    calc 2373 / 16605
#2373 / 16605 = 0.142909
    # A higher percentage than I expected... are 15% of OMIM / LSDB SNPs common?  
    # Spot-checking, OMIM will mention SNPs from GWAS, or as endpoints of an interval...
    # the SNPs are clearly common but sometimes common variants are associated with
    # a disorder.
    # How many "clinical" SNPs in the NonUnique set?
    grep -Fwf clinicalRsIds.txt snp131NonUnique.bed | wc -l
#901

    # Load tables:
    foreach t (snp131Common snp131Misc snp131NonUnique)
      hgLoadBed -tab -onServer -tmpDir=/data/tmp -allowStartEqualEnd \
        hg19 $t -sqlTable=snp131.sql -renameSqlTable $t.bed
    end
#Reading snp131Common.bed
#Loaded 12750453 elements of size 17
#Reading snp131Misc.bed
#Loaded 10194165 elements of size 17
#Reading snp131NonUnique.bed
#Loaded 3088435 elements of size 17


############################################################################
# ORTHOLOGOUS ALLELES IN CHIMP AND MACAQUE FOR SNP131 (DONE 6/3/10 angie)
# First done 4/15/10.  Then found that SNPs that appeared on both a main chrom
# (like chr6) and on a haplo chrom (like chr6_cox_hap2) were being flagged
# as multiple alignments when they should be, excluding them from this.
# Regenerated exceptions, then regenerated this.
    mkdir /hive/data/genomes/hg19/bed/snp131Ortho
    cd /hive/data/genomes/hg19/bed/snp131Ortho

    # Following Heather's lead in snp126orthos, filter SNPs to to keep
    # only those with class=single, length=1, chrom!~random;
    # Exclude those with exceptions MultipleAlignments,
    # SingleClassTriAllelic or SingleClassQuadAllelic.
    # Unlike snp masking, we do not filter for weight -- don't know why.
    awk '$5 ~ /^MultipleAlignments|SingleClassTriAllelic|SingleClassQuadAllelic/ {print $4;}' \
      /hive/data/outside/dbSNP/131/human/snp131Exceptions.bed \
    | sort -u \
      > snp131ExcludeIds.txt
    awk '$3-$2 == 1 && $1 !~ /_random/ && $11 == "single" {print;}' \
      /hive/data/outside/dbSNP/131/human/snp131.bed \
    | grep -vFwf snp131ExcludeIds.txt \
      > snp131Simple.bed
#333.829u 11.879s 3:57.31 145.6% 0+0k 0+0io 0pf+0w
    wc -l snp131Simple.bed
#17784981 snp131Simple.bed
#with too many SNPs excluded, was: 17337248 snp131Simple.bed

    # Glom all human info that we need for the final table onto the
    # name, to sneak it through liftOver: rsId|chr|start|end|obs|ref|strand
    awk 'BEGIN{OFS="\t";} \
        {print $1, $2, $3, \
               $4 "|" $1 "|" $2 "|" $3 "|" $9 "|" $8 "|" $6, \
               0, $6;}' \
      snp131Simple.bed > snp131ForLiftOver.bed

    # Map coords to chimp using liftOver.
    mkdir run.liftOChimp
    cd run.liftOChimp
    mkdir split out
    splitFile ../snp131ForLiftOver.bed 25000 split/chunk
    cp /dev/null jobList
    foreach f (split/chunk*)
      echo liftOver $f \
        /hive/data/genomes/hg19/bed/liftOver/hg19ToPanTro2.over.chain.gz \
        \{check out exists out/panTro2.$f:t.bed\} out/hg19.$f:t.unmapped \
        >> jobList
    end
    ssh swarm
    cd /hive/data/genomes/hg19/bed/snp131Ortho/run.liftOChimp
    para make jobList
#Completed: 712 of 712 jobs
#CPU time in finished jobs:     127853s    2130.88m    35.51h    1.48d  0.004 y
#IO & Wait Time:                 11528s     192.14m     3.20h    0.13d  0.000 y
#Average job time:                 196s       3.26m     0.05h    0.00d
#Longest finished job:             506s       8.43m     0.14h    0.01d
#Submission to last job:           676s      11.27m     0.19h    0.01d

    # Map coords to orangutan using liftOver.
    mkdir ../run.liftOPon
    cd ../run.liftOPon
    mkdir out
    ln -s ../run.liftOChimp/split .
    cp /dev/null jobList
    foreach f (split/chunk*)
      echo liftOver $f \
        /hive/data/genomes/hg19/bed/liftOver/hg19ToPonAbe2.over.chain.gz \
        \{check out exists out/ponAbe2.$f:t.bed\} out/hg19.$f:t.unmapped \
        >> jobList
    end
    para make jobList
# on pk:
#Completed: 712 of 712 jobs
#CPU time in finished jobs:     230882s    3848.03m    64.13h    2.67d  0.007 y
#IO & Wait Time:                  3660s      61.00m     1.02h    0.04d  0.000 y
#Average job time:                 329s       5.49m     0.09h    0.00d
#Longest finished job:            1019s      16.98m     0.28h    0.01d
#Submission to last job:          1667s      27.78m     0.46h    0.02d

    # Map coords to macaque using liftOver.
    mkdir ../run.liftOMac
    cd ../run.liftOMac
    mkdir out
    ln -s ../run.liftOChimp/split .
    cp /dev/null jobList
    foreach f (split/chunk*)
      echo liftOver $f \
        /hive/data/genomes/hg19/bed/liftOver/hg19ToRheMac2.over.chain.gz \
        \{check out exists out/rheMac2.$f:t.bed\} out/hg19.$f:t.unmapped \
        >> jobList
    end
    para make jobList
#Completed: 712 of 712 jobs
#CPU time in finished jobs:     281168s    4686.14m    78.10h    3.25d  0.009 y
#IO & Wait Time:                 22164s     369.39m     6.16h    0.26d  0.001 y
#Average job time:                 426s       7.10m     0.12h    0.00d
#Longest finished job:             868s      14.47m     0.24h    0.01d
#Submission to last job:           872s      14.53m     0.24h    0.01d

    cd /hive/data/genomes/hg19/bed/snp131Ortho
    # Concatenate the chimp results, sorting by chimp pos in order to
    # efficiently access 2bit sequence in getOrthoSeq.  The output of
    # that is then sorted by the glommed human info field, so that we
    # can use join to combine chimp and macaque results in the next step.
    # Ditto for macaque and orangutan.  Each command pipe takes ~6 minutes:
    sort -k1,1 -k2n,2n run.liftOChimp/out/panTro2.chunk*.bed \
    | ~/kent/src/hg/snp/snpLoad/getOrthoSeq.pl /cluster/data/panTro2/panTro2.2bit \
    | sort > panTro2.orthoGlom.txt
    sort -k1,1 -k2n,2n run.liftOPon/out/ponAbe2.chunk*.bed \
    | ~/kent/src/hg/snp/snpLoad/getOrthoSeq.pl /cluster/data/ponAbe2/ponAbe2.2bit \
    | sort > ponAbe2.orthoGlom.txt
    sort -k1,1 -k2n,2n run.liftOMac/out/rheMac2.chunk*.bed \
    | ~/kent/src/hg/snp/snpLoad/getOrthoSeq.pl /cluster/data/rheMac2/rheMac2.2bit \
    | sort > rheMac2.orthoGlom.txt
    wc -l panTro2.orthoGlom.txt ponAbe2.orthoGlom.txt rheMac2.orthoGlom.txt
#  16641106 panTro2.orthoGlom.txt
#  15796202 ponAbe2.orthoGlom.txt
#  14289736 rheMac2.orthoGlom.txt
#was:  16230258 panTro2.orthoGlom.txt
#was:  15535287 ponAbe2.orthoGlom.txt
#was:  13996256 rheMac2.orthoGlom.txt

    # Use the glommed name field as a key to join up chimp and macaque
    # allele data.  Include glommed name from both files because if only
    # file 2 has a line for the key in 2.1, then 1.1 is empty.  Then plop
    # in the orthoGlom fields from each file, which are in the same order
    # as the chimp and macaque columns of snp131OrthoPanTro2RheMac2.
    join -o '1.1 2.1 1.2 1.3 1.4 1.5 1.6 2.2 2.3 2.4 2.5 2.6' \
      -a 1 -a 2 -e '?' \
      panTro2.orthoGlom.txt ponAbe2.orthoGlom.txt \
    | awk '{if ($1 != "?") { print $1, $3,$4,$5,$6,$7,$8,$9,$10,$11,$12; } \
            else           { print $2, $3,$4,$5,$6,$7,$8,$9,$10,$11,$12; }}' \
      > tmp.txt
    join -o '1.1 2.1 1.2 1.3 1.4 1.5 1.6 1.7 1.8 1.9 1.10 1.11 2.2 2.3 2.4 2.5 2.6' \
      -a 1 -a 2 -e '?' \
      tmp.txt rheMac2.orthoGlom.txt \
    | perl -wpe 'chomp; \
        ($glom12, $glom3, $o1Chr, $o1Start, $o1End, $o1Al, $o1Strand, \
         $o2Chr, $o2Start, $o2End, $o2Al, $o2Strand, \
         $o3Chr, $o3Start, $o3End, $o3Al, $o3Strand) = split; \
        $glomKey = ($glom12 ne "?") ? $glom12 : $glom3; \
        ($rsId, $hChr, $hStart, $hEnd, $hObs, $hAl, $hStrand) = \
          split(/\|/, $glomKey); \
        $o1Start =~ s/^\?$/0/;  $o2Start =~ s/^\?$/0/;  $o3Start =~ s/^\?$/0/; \
        $o1End   =~ s/^\?$/0/;  $o2End   =~ s/^\?$/0/;  $o3End   =~ s/^\?$/0/; \
        print join("\t", $hChr, $hStart, $hEnd, $rsId, $hObs, $hAl, $hStrand, \
                         $o1Chr, $o1Start, $o1End, $o1Al, $o1Strand, \
                         $o2Chr, $o2Start, $o2End, $o2Al, $o2Strand, \
                         $o3Chr, $o3Start, $o3End, $o3Al, $o3Strand) . "\n"; \
        s/^.*$//;' \
    | sort -k1,1 -k2n,2n > snp131OrthoPt2Pa2Rm2.bed
#437.114u 37.309s 6:33.92 120.4% 0+0k 0+0io 0pf+0w
    wc -l snp131OrthoPt2Pa2Rm2.bed
#17276174 snp131OrthoPt2Pa2Rm2.bed
#was: 16842459 snp131OrthoPt2Pa2Rm2.bed

    hgLoadBed -tab -onServer -tmpDir=/data/tmp -renameSqlTable \
      -sqlTable=$HOME/kent/src/hg/lib/snpOrthoPanPonRhe.sql \
      hg19 snp131OrthoPt2Pa2Rm2 snp131OrthoPt2Pa2Rm2.bed
#Loaded 17276174 elements of size 22
#123.287u 13.079s 8:17.88 27.3%  0+0k 0+0io 0pf+0w

    # Cleanup:
    nice gzip snp131Simple.bed snp131ExcludeIds.txt snp131ForLiftOver.bed
    rm -r run*/split tmp.txt *.orthoGlom.txt bed.tab


############################################################################
# DBSNP CODING ANNOTATIONS (DONE 10/12/10 angie)
# Updated 10/12/10 - redone w/corrected genome coords (Redmine Track #1249)
# Updated 5/25/10 with corrected function codes (b131_SNPContigLocusId_37_1.bcp.gz).
# Updated 4/16 - redone w/snp131, using mapping locations of dbSNP's func. annos.
#                found some strange function codes and notified dbSNP.
# originally done 6/2/09
    cd /hive/data/outside/dbSNP/131/human
    # ncbiFuncAnnotations.txt has NCBI coords: 0-based, fully closed.  
    # For anything except an insertion (0 bases between flanks), 
    # we need to add 1 to the end coord.  For an insertion, we need
    # to add 1 to the start coord.  Make a hash of the insertion IDs,
    # then look up each ID in ncbiFuncAnnotations.txt to tell which
    # transform to apply.
    # Note: sort -u with the keys below is too restrictive -- we need full line uniq.
    perl -we 'open($IDS, "ncbiFuncInsertions.ctg.bed") || die "ids: $!"; \
              while (<$IDS>) { chomp; $ids{$_} = 1; } \
              close($IDS); \
              %coding = (2=>1, 3=>1, 4=>1, 8=>1, 9=>1, 41=>1, 42=>1, 44=>1, 45=>1); \
              while (<>) { \
                chomp;  @w = split("\t"); # id, ctg, start, end, ... \
                next unless $coding{$w[5]}; \
                $bed4 = join("\t", $w[1], $w[2], $w[3], $w[0]); \
                if (exists $ids{$bed4} && $w[3] == $w[2]+1) { \
                  $w[2]++; # 2-base insertions: increment start coord \
                } else { \
                  $w[3]++; # increment end coord to get half-open \
                } \
                print join("\t", @w) . "\n"; \
              }' ncbiFuncAnnotations.txt \
    | sort -k1n,1n -k2,2 -k3n,3n -k5,5 -k6n,6n \
    | uniq \
      > ncbiCodingAnnotations.txt
    wc -l ncbiCodingAnnotations.txt
#950616 ncbiCodingAnnotations.txt

    # How many & what kinds of function types?
    cut -f 6 ncbiCodingAnnotations.txt \
    | sort -n | uniq -c
# 168639 3   (coding-synon)
# 443419 8   (cds-reference -- ignored)
#      1 9   (coding-synonymy-unknown: rs80359842)
#   9790 41  (nonsense)
# 261982 42  (missense)
#  65656 44  (frameshift)
#   1129 45  (cds-indel)
    # Gather up multiple annotation lines into one line per {snp, gene, frame}:
    perl -e  'while (<>) { chomp; \
                my ($rsId, $ctg, $s, $e, $txId, $fxn, $frm, $nt, $aa, $codon) = split("\t"); \
                if (defined $lastRs && \
                    ($lastRs != $rsId || $lastCtg ne $ctg || $lastS != $s || \
                     $lastTx ne $txId || $lastFrm ne $frm)) { \
                  if (defined $refRow) { \
                    $fxns = "$refRow->[0],$fxns";  $nts = "$refRow->[1],$nts"; \
                    $aas = "$refRow->[2],$aas";    $codons = "$refRow->[3],$codons"; \
                  } \
                  print "$lastCtg\t$lastS\t$lastE\trs$lastRs\t$lastTx\t$lastFrm\t" . \
                        "$count\t$fxns\t$nts\t$codons\t$aas\n"; \
                  $refRow = undef;  @rows = ();  ($count, $fxns, $nts, $codons, $aas) = (); \
                } \
                ($lastRs, $lastCtg, $lastS, $lastE, $lastTx, $lastFrm) = \
                    ($rsId, $ctg, $s, $e, $txId, $frm); \
                $count++; \
                if ($fxn == 8) { \
                  $refRow = [$fxn, $nt, $aa, $codon]; \
                } else { \
                 $fxns .= "$fxn,";  $nts .= "$nt,";  $aas .= "$aa,";  $codons .= "$codon,"; \
                } \
              } \
              if (defined $refRow) { \
                $fxns = "$refRow->[0],$fxns";  $nts = "$refRow->[1],$nts"; \
                $aas = "$refRow->[2],$aas";    $codons = "$refRow->[3],$codons"; \
              } \
              print "$lastCtg\t$lastS\t$lastE\trs$lastRs\t$lastTx\t$lastFrm\t" . \
                    "$count\t$fxns\t$nts\t$codons\t$aas\n";' \
      ncbiCodingAnnotations.txt \
    | liftUp snp131CodingDbSnp.bed /hive/data/genomes/hg19/jkStuff/liftContigs.lft warn stdin
    hgLoadBed hg19 snp131CodingDbSnp -sqlTable=$HOME/kent/src/hg/lib/snp125Coding.sql \
      -renameSqlTable -tab -notItemRgb -allowStartEqualEnd \
      snp131CodingDbSnp.bed
#Loaded 443454 elements of size 11


############################################################################
# SNPMASKED SEQUENCE FOR SNP131 (DONE 5/27/10 angie)
    mkdir /hive/data/genomes/hg19/snp131Mask
    cd /hive/data/genomes/hg19/snp131Mask

    # Identify rsIds with various problems -- we will exclude those.
    # MultipleAlignments is kinda broad because anything that maps on
    # both chrN and chrN_foo_hap1 will be excluded... similarly, extra
    # matches on chrN_random might disqualify good matches on chrN.
    # Well, erring on the side of caution is good.
    awk '$5 ~ /^MultipleAlignments|ObservedTooLong|ObservedWrongFormat|ObservedMismatch|MixedObserved$/ {print $4;}' \
      /hive/data/outside/dbSNP/131/human/snp131Exceptions.bed \
      | sort -u \
      > snp131ExcludeRsIds.txt
    time grep -vFwf snp131ExcludeRsIds.txt \
      /hive/data/outside/dbSNP/131/human/snp131.bed \
      > snp131Cleaned.bed
#193.507u 5.203s 4:07.62 80.2%   0+0k 0+0io 0pf+0w

    # Substitutions:
    mkdir substitutions
    snpMaskSingle snp131Cleaned.bed /hive/data/genomes/hg19/hg19.2bit stdout \
    | faSplit byname stdin substitutions/
    # 180 warnings about differing observed strings at same base position --
    # saved as diffObserved.txt.
#Masked 17377658 snps in 17377491 out of 3099287100 genomic bases
#/hive/data/genomes/hg19/hg19.2bit has 3137161264 total bases, but the total number of bases in sequences for which we masked snps is 3099287100 (difference is 37874164)
#52.903u 10.964s 4:49.08 22.0%   0+0k 0+0io 3pf+0w
    # Check that 37874164 is the total #bases in sequences with nothing in snp131Cleaned:
    grep -Fw single snp131Cleaned.bed | cut -f 1 | uniq > /tmp/1
    grep -vwf /tmp/1 ../chrom.sizes
    grep -vwf /tmp/1 ../chrom.sizes \
    | awk 'BEGIN {TOTAL = 0;}  {TOTAL += $2;}  END {printf "%d\n", TOTAL;}'
#37874164
#TODO: send list to dbSNP.
    # Make sure that sizes are identical, first diffs are normal -> IUPAC,
    # and first diffs' case is preserved:
    foreach f (substitutions/chr*.fa)
      faCmp -softMask $f ../[1-9UMXY]*/$f:t |& grep -v "that differ"
    end
#chr1 in substitutions/chr1.fa differs from chr1 at ../1/chr1.fa at base 10491 (y != c)
#chr10 in substitutions/chr10.fa differs from chr10 at ../10/chr10.fa at base 61004 (r != a)
#...
#(output OK -- ambiguous bases replacing [agct] at SNP positions)
    foreach f (substitutions/chr*.fa)
      echo $f:t:r
      mv $f $f:r.subst.fa
      gzip $f:r.subst.fa
    end

    # Insertions:
    mkdir insertions
    snpMaskAddInsertions snp131Cleaned.bed /hive/data/genomes/hg19/hg19.2bit stdout \
    | faSplit byname stdin insertions/
#Added 2496221 snps totaling 5939697 bases to 3098816404 genomic bases
#/hive/data/genomes/hg19/hg19.2bit has 3137161264 total bases, but the total number of bases in sequences for which we masked snps is 3098816404 (difference is 38344860)
#52.764u 12.593s 3:55.47 27.7%   0+0k 0+0io 2pf+0w
    # Again, that just means that some chroms didn't have filtered SNPs.
    # Make sure that all sizes have increased relative to original:
    foreach f (insertions/chr*.fa)
      echo -n "${f:t:r}: "
      faCmp -softMask $f ../[1-9UMXY]*/$f:t |& grep -v "that differ" \
      |& perl -we '$_=<>; \
           if (/^\w+ in \S+ has (\d+) bases.  \w+ in \S+ has (\d+) bases/) { \
             if ($1 > $2) {print "OK: ins size $1 > $2\n";} \
             else {die "ERROR: ins size $1 <= $2\n";} \
           } else {die $_;}'
    end
#chr1: OK: ins size 249717078 > 249250621
#chr10: OK: ins size 135805198 > 135534747
#...
#(output OK -- new sizes > old)
    foreach f (insertions/chr*.fa)
      mv $f $f:r.ins.fa
      gzip $f:r.ins.fa
    end

    # Deletions:
    mkdir deletions
    snpMaskCutDeletions snp131Cleaned.bed /hive/data/genomes/hg19/hg19.2bit stdout \
    | faSplit byname stdin deletions/
#Cut 1522178 snps totaling 3455905 bases from 3098701788 genomic bases
#/hive/data/genomes/hg19/hg19.2bit has 3137161264 total bases, but the total number of bases in sequences for which we masked snps is 3098701788 (difference is 38459476)
#114.251u 20.911s 4:24.26 51.1%  0+0k 0+0io 3pf+0w
    # Again, that just means that some chroms didn't have filtered SNPs.
    # Make sure that all sizes have decreased relative to original:
    foreach f (deletions/chr*.fa)
      echo -n "${f:t:r}: "
      faCmp -softMask $f ../[1-9UMXY]*/$f:t |& grep -v "that differ" \
      |& perl -we '$_=<>; \
           if (/^\w+ in \S+ has (\d+) bases.  \w+ in \S+ has (\d+) bases/) { \
             if ($1 < $2) {print "OK: del size $1 < $2\n";} \
             else {die "ERROR: del size $1 >= $2\n";} \
           } else {die $_;}'
    end
#chr1: OK: del size 248968549 < 249250621
#chr10: OK: del size 135378065 < 135534747
#...
#(output OK -- del sizes < old)
    foreach f (deletions/chr*.fa)
      mv $f $f:r.del.fa
      gzip $f:r.del.fa
    end

    # Clean up and prepare for download:
    gzip snp131Cleaned.bed
    foreach d (substitutions insertions deletions)
      pushd $d
        md5sum *.gz > md5sum.txt
        cp /hive/data/genomes/hg18/snp130Mask/$d/README.txt .
      popd
    end
    # Edit the README.txt in each subdir.

    # Create download links on hgwdev.
    # NOTE: Currently we offer only the substitutions.
    # If we get any user requests, then maybe we can put the insertions
    # and deletions out there.
    mkdir /usr/local/apache/htdocs-hgdownload/goldenPath/hg19/snp131Mask
    ln -s /hive/data/genomes/hg19/snp131Mask/substitutions/* \
      /usr/local/apache/htdocs-hgdownload/goldenPath/hg19/snp131Mask/
## If there is user demand for ins & del, then start over with an empty
## goldenPath/snp131Mask and do this:
##    foreach type (substitutions insertions deletions)
##      mkdir /usr/local/apache/htdocs-hgdownload/goldenPath/hg19/snp131Mask/$type
##      ln -s /hive/data/genomes/hg19/snp131Mask/$type/* \
##        /usr/local/apache/htdocs-hgdownload/goldenPath/hg19/snp131Mask/$type/
##    end


##############################################################################
#  RE-BUILD sno/miRNA TRACK (DONE - 04-20-2010 - Chin)

    # The data in this track is out of date so update the track. 
    mkdir -p /hive/data/genomes/hg19/bed/wgRna-2010-04-20
    cd /hive/data/genomes/hg19/bed/wgRna-2010-04-20

    # Download GFF file of latest miRNA annotations from miRBase at the
    # ftp://mirbase.org/pub/mirbase/CURRENT/. This is Release 14.0
    # (September,  2009)
    # 04-27-2010 Get the newest miRNA release 15
    wget --timestamping \
         ftp://mirbase.org/pub/mirbase/CURRENT/genomes/hsa.gff
    # Re-format, need to add "chr" to the beginning of each line.
    sed -e 's/^/chr/' hsa.gff > hsMirBaseFormat.gff
    # Remove extra "chr" in comment lines
    perl -pi.bak -e 's/chr#/#/' hsMirBaseFormat.gff
    # Change chrMT to chrM
    perl -pi.bak -e 's/chrMT/chrM/' hsMirBaseFormat.gff
    # Remove all but ID name in last field
    sed -e 's/\";//g' hsMirBaseFormat.gff | sed -e 's/ID=\"//g' \
       | sed -e 's/ACC=\"MI[0-9]*\s//' > hsMirBaseFormatIdOnly.gff

    # set score to zero, since the color is based on the type of the RNA
    # Starts appear to be 1-based when compared to miRNAs in current
    # track
    # and those in Ensembl.
    # Confirmed with Sam Griffith-Jones (one of the authors of miRBase,
    # sam.griffith-jones@manchester.ac.uk) that these GFF coordinates
    # are 1-based. 
    # Also add thickStart and thickEnd columns and "miRNA" for type.
    awk 'BEGIN {FS="\t"} {OFS="\t"} \
        {if ($0 !~ /#/ && $7 == "+") \
         print $1, $4-1, $5, $9, 0, $7, 0, 0, "miRNA"; \
       else if ($0 !~ /#/ && $7 == "-") \
         print $1, $4-1, $5, $9, 0, $7, 0, 0, "miRNA";}' \
        hsMirBaseFormatIdOnly.gff > hsMirBaseFormatIdOnly.bed

    # 2010-04-21
    # Down load the current snoRNABase coordinates (version 3, based on hg19)
    #  from 
    # http://www-snorna.biotoul.fr/coordinates.php
    #   to
    # /hive/data/genomes/hg19/bed/wgRna-2010-04-20/snoRNABaseVer3Coords.xls

    cd /hive/data/genomes/hg19/bed/wgRna-2010-04-20/
    cp snoRNABaseVer3Coords.xls snoRNABaseVer3Coords.txt
    # remove the header line (column title).
    # remove all the quotes surrounding characters field
    perl -pi.bak -e 's/\"//g' snoRNABaseVer3Coords.txt
    # Reformat to BED format with thickStart and thickEnd set to 0.
    awk 'BEGIN {FS="\t"} {OFS="\t"} \
        {if ($4 == "+") \
         print $1, $2-1, $3, $5, 0, $4, 0, 0,$6; \
       else if ($4 == "-") \
         print $1, $2-1, $3, $5, 0, $4, 0, 0,$6;}' \
       snoRNABaseVer3Coords.txt > snoRNABaseVer3Coords.bed
    # 2010-08-02: snoRnaBase team has not response to hg19 update request.
    # use liftOver to convert the 400 coordinates to hg19 directly.
    liftOver snoRNABaseVer3Coords.bed -bedPlus=3 \ 
      /hive/data/genomes/hg18/bed/liftOver10K/hg18ToHg19.over.chain.gz \
        snoRNABaseHg19Coords.bed unMapped
    # Reading liftover chains
    # Mapping coordinates


    # Merge the miRNA and snoRNA files together
    cat hsMirBaseFormatIdOnly.bed snoRNABaseHg19Coords.bed \
        > wgRna20100420.bed
    # Create and load wgRna
    cp -p /cluster/bin/build/build-kent/src/hg/lib/wgRna.sql wgRna.sql
    hgLoadBed -sqlTable=wgRna.sql hg19 wgRna wgRna20100420.bed
    #  Reading wgRna20100420.bed
    #  Loaded 1341 elements of size 9
    #  Sorted
    #  Creating table definition for wgRna
    #  Saving bed.tab
    #  Loading hg19

    # Clean up
    rm *.bak

    # some details about this track:
    hgsql -e "select count(*) from wgRna;" hg19
    #  1341
    # contain 4 types:
    cat wgRna20100420.bed | awk '{print $9}' | sort | uniq
    # CDBox
    # HAcaBox
    # miRNA
    # scaRna
    hgsql -e "select type, count(*) from wgRna  group by type;" hg19
    #   CDBox     269
    #   HAcaBox   112
    #   miRNA     939
    #   scaRna     21
    featureBits hg19 wgRna
    # 122226 bases of 2899183193 (0.004%) in intersection


#############################################################################
# AFFY U133Plus2 (DONE 2010-10-04 Chin) (Removed prefixes 2010-11-29 galt)
    # Align probes 
    ssh swarm
    cd /hive/data/genomes/hg19/bed
    mkdir -p affyProbes/affyU133Plus2/run
    cd affyProbes/affyU133Plus2/run
    mkdir psl
    ls -1 /scratch/data/hg19/nib/*.nib > genome.lst
    ls -1 /hive/data/outside/affyProbes/U133Plus2_all.fa > mrna.lst

    cat << '_EOF_' > gsub
#LOOP
/cluster/bin/x86_64/blat -fine -ooc=/scratch/data/hg19/11.ooc  $(path1) $(path2) {check out line+ psl/$(root1)_$(root2).psl}
#ENDLOOP
'_EOF_'
    # << this line makes emacs coloring happy

    gensub2 genome.lst mrna.lst gsub jobList
    para create jobList
    para try
    para check
    para push
    para time
    # Completed: 96 of 96 jobs
    # CPU time in finished jobs: 31136s     518.93m     8.65h    0.36d  0.001 y
    # IO & Wait Time:            2218s      36.97m     0.62h    0.03d  0.000 y
    # Average job time:                 347s       5.79m     0.10h    0.00d
    # Longest finished job:            2548s      42.47m     0.71h    0.03d
    # Submission to last job:          4244s      70.73m     1.18h    0.05d

    # Do sort, best in genome filter.
    # to create affyU133Plus2.psl.
    pslSort dirs raw.psl tmp psl
    pslReps -minCover=0.3 -minAli=0.95 -nearTop=0.005 raw.psl ../affyU133Plus2.psl /dev/null
    #   Processing raw.psl to ../affyU133Plus2.psl and /dev/null
    #   .....Processed 693340 alignments
    rm -r raw.psl psl

    # Load probes and alignments into database.
    ssh hgwdev
    cd /hive/data/genomes/hg19/bed/affyProbes/affyU133Plus2

    # remove prefix
    perl -pi.bak -e "s/U133\+2://" affyU133Plus2.psl
    hgLoadPsl hg19 affyU133Plus2.psl
    hgLoadSeq -abbr=U133+2: hg19 /gbdb/hgFixed/affyProbes/U133Plus2_all.fa
         # note: re-ran the -replace was used with hgLoadSeq
    #   Creating seq.tab file
    #   Adding /gbdb/hgFixed/affyProbes/U133Plus2_all.fa
    #   54613 sequences
    #   Updating seq table
    #   All done
    #  hgsql -e "select count(*) from affyU133Plus2;" hg19
    #     58592  


    # Added knownToU133Plus2 table
    hgMapToGene hg19 affyU133Plus2 knownGene knownToU133Plus2


#############################################################################
# AFFY U95 (DONE 2010-10-07 Chin)  (Removed prefixes 2010-11-29 galt)
    # Align probes 
    ssh swarm
    cd /hive/data/genomes/hg19/bed
    mkdir -p affyProbes/affyU95/run
    cd affyProbes/affyU95/run
    mkdir psl
    ls -1 /scratch/data/hg19/nib/*.nib > genome.lst
    ls -1 /hive/data/outside/affyProbes/HG-U95Av2_all.fa > mrna.lst

    cat << '_EOF_' > gsub
#LOOP
/cluster/bin/x86_64/blat -fine -ooc=/scratch/data/hg19/11.ooc  $(path1) $(path2) {check out line+ psl/$(root1)_$(root2).psl}
#ENDLOOP
'_EOF_'
    # << this line makes emacs coloring happy

    gensub2 genome.lst mrna.lst gsub jobList
    para create jobList
    para try
    para check
    para push
    para time
# Completed: 93 of 93 jobs
# CPU time in finished jobs:       2101s      35.01m     0.58h    0.02d  0.000 y
# IO & Wait Time:                   657s      10.95m     0.18h    0.01d  0.000 y
# Average job time:                  30s       0.49m     0.01h    0.00d
# Longest finished job:             165s       2.75m     0.05h    0.00d
# Submission to last job:           619s      10.32m     0.17h    0.01d
# Estimated complete:                 0s       0.00m     0.00h    0.00d
#Submission to last job:          1685s      28.08m     0.47h    0.02d


    # Do sort, best in genome filter.
    # to create affyU95.psl.
    pslSort dirs raw.psl tmp psl
    pslReps -minCover=0.3 -minAli=0.95 -nearTop=0.005 raw.psl ../affyU95.psl /dev/null
    rm -r raw.psl psl

    # Load probes and alignments into database.
    ssh hgwdev
    cd /hive/data/genomes/hg19/bed/affyProbes/affyU95

    # remove prefix
    perl -pi.bak -e "s/U95Av2://" affyU95.psl
    hgLoadPsl hg19 affyU95.psl
    hgLoadSeq -abbr=U95Av2: hg19 /gbdb/hgFixed/affyProbes/HG-U95Av2_all.fa
         # note: re-ran the -replace was used with hgLoadSeq
    #   Creating seq.tab file
    #   Adding /gbdb/hgFixed/affyProbes/HG-U95Av2_all.fa
    #   12386 sequences
    #   Updating seq table
    #   All done

    # Added knownToU95 table
    hgMapToGene hg19 affyU95 knownGene knownToU95

#############################################################################
# ucscRetro track (2010-03-31, baertsch,hartera DONE)
mkdir -p /hive/groups/gencode/pseudogenes/retroFinder/retro/hg19
cd /hive/groups/gencode/pseudogenes/retroFinder/retro/hg19

mkdir -p /hive/data/genomes/hg19/bed/retro/ 

cat << '_EOF_' > DEF

RETRO_OPTIONS="-verbose=4 -minAli=0.98 -nearTop=0.005 -skipBlatMerge "
DB=hg19
SCORETHRESH=550
GENOMENAME='Homo sapiens'
GBDB=hg
MRNABASE=/hive/data/genomes/$DB/bed/mrnaBlastz
TMPMRNA=/hive/groups/gencode/pseudogenes/retroFinder/mrnaBlastz/$DB
TMPEST=/hive/groups/gencode/pseudogenes/retroFinder/est/$DB
EST=all_est
SPLICED_EST=intronEst
SPLIT_EST=0
SPLIT_SPLICED_EST=0
SCRIPT=/cluster/home/baertsch/baertsch/scripts
GENOME=/hive/data/genomes/
RETRODIR=$GENOME/$DB/bed/retro
BASE=/hive/groups/gencode/pseudogenes/retroFinder/retro
OUTDIR=/hive/groups/gencode/pseudogenes/retroFinder/retro/$DB
RESULT=$OUTDIR/result
LOG=$OUTDIR/log
OUT=$OUTDIR/out
OVERLAPDIR=$OUTDIR/run.o
VERSION=2
TABLE=ucscRetroInfo$VERSION
ALIGN=ucscRetroAli$VERSION
LOCAL=/scratch/data/$DB
NIB=$LOCAL/nib
RMSK=rmsk
NET1=netCanFam2 
NET2=netMm9 
NET3=netRheMac2 
GENE1=knownGene
GENE2=refGene
GENE3=mgcGenes
CLUSTER=swarm
SPECIES="hg19"
ROOTDIR="/cluster/home/hartera/public_html"
EXPDIR=exp
GENEPFAM=knownGene
PFAM=knownToPfam
PFAMIDFIELD=name
PFAMDOMAIN=value
ARRAY=gnfAtlas2
ARRAYMEDIAN=hgFixed.gnfHumanAtlas2Median
ARRAYRATIO=hgFixed.gnfHumanAtlas2AllRatio
ARRAYABS=hgFixed.gnfHumanAtlas2All
ARRAYEXP=hgFixed.gnfHumanAtlas2MedianExps
ARRAYEXPALL=hgFixed.gnfHumanAtlas2AllExps
ARRAYLOOKUP=knownToGnfAtlas2
ARRAYPSLS="/hive/data/genomes/hg19/bed/geneAtlas2/affyU133A.psl /hive/data/genomes/hg19/bed/geneAtlas2/affyGnf1h.psl"
ALTSPLICE=
SPLITBYAGE=$SCRIPT/splitRetrosByAge
PDB=proteins090821

'_EOF_'
    # << happy emacs

retroFinder-1.16/scripts/filterMrna.sh DEF
retroFinder-1.16/scripts/filterEst.sh DEF
nohup retroFinder-1.16/scripts/ucscRetroStep1.sh DEF
#check cluster job
nohup retroFinder-1.16/scripts/ucscRetroStep2.sh DEF
nohup retroFinder-1.16/scripts/ucscRetroStep3.sh DEF
#check cluster job
nohup retroFinder-1.16/scripts/ucscRetroStep4.sh DEF
    # Load the track
nohup retroFinder-1.16/scripts/ucscRetroStep5.sh DEF
nohup retroFinder-1.16/scripts/ucscRetroStep6.sh DEF
#add ucscRetroAli to trackDb.ra 
cp  ucscRetroAli.psl /hive/data/genomes/hg19/bed/retro/ 
cp  ucscRetroInfo.bed /hive/data/genomes/hg19/bed/retro/ 
cp  ucscRetroCds.tab /hive/data/genomes/hg19/bed/retro/ 

cd /hive/data/genomes/hg19/bed/retro/ 
#############################################################################
# UPDATE KEGG TABLES (DONE, Fan, 6/18/10)

mkdir -p /hive/data/genomes/hg19/bed/pathways/kegg
cd /hive/data/genomes/hg19/bed/pathways/kegg

wget --timestamping ftp://ftp.genome.jp/pub/kegg/pathway/map_title.tab

cat map_title.tab | sed -e 's/\t/\thsa\t/' > j.tmp
cut -f 2 j.tmp >j.hsa
cut -f 1,3 j.tmp >j.1
paste j.hsa j.1 |sed -e 's/\t//' > keggMapDesc.tab
rm j.hsa j.1
rm j.tmp

hgsql hg19 -e 'drop table keggMapDesc'
hgsql hg19 < ~/kent/src/hg/lib/keggMapDesc.sql
hgsql hg19 -e 'load data local infile "keggMapDesc.tab" into table keggMapDesc'

wget --timestamping ftp://ftp.genome.jp/pub/kegg/genes/organisms/hsa/hsa_pathway.list

cat hsa_pathway.list| sed -e 's/path://'|sed -e 's/:/\t/' > j.tmp
hgsql hg19 -e 'drop table keggPathway'
hgsql hg19 < ~/kent/src/hg/lib/keggPathway.sql
hgsql hg19 -e 'load data local infile "j.tmp" into table keggPathway'

hgsql hg19 -N -e \
'select name, locusID, mapID from keggPathway p, knownToLocusLink l where p.locusID=l.value' \
>keggPathway.tab

hgsql hg19 -e 'delete from keggPathway'

hgsql hg19 -e 'load data local infile "keggPathway.tab" into table keggPathway'

rm j.tmp
#############################################################################
# Add KEGG column to hg19 Gene Sorter (Done, Fan, 6/18/2010)

mkdir -p /hive/data/genomes/hg19/bed/geneSorter
cd /hive/data/genomes/hg19/bed/geneSorter
hgsql hg19 -N -e 'select kgId, mapID, mapID, "+", locusID from keggPathway' |sort -u|sed -e 's/\t+\t/+/' > knownToKeggEntrez.tab

hgsql hg19 -e 'drop table knownToKeggEntrez'

hgsql hg19 < ~/kent/src/hg/lib/knownToKeggEntrez.sql

hgsql hg19 -e 'load data local infile "knownToKeggEntrez.tab" into table knownToKeggEntrez'

#############################################################################
# Haplotype locations (DONE - 2010-06-29 - Hiram)
    mkdir /hive/data/genomes/hg19/bed/haplotypeLocations
    cd /hive/data/genomes/hg19/bed/haplotypeLocations
    for H in 1 2 3 4 5 6 7 8 9
do
    grep -v "^#" ../../download/alternate_loci/ALT_REF_LOCI_${H}/placed_scaffolds/alt_locus_scaf2primary.pos | awk -F'\t' '
{
printf "chr%d\t%d\t%d\t%s\n", $3, $4, $6, $1
}
'
done | sed -e "s/HSCHR6_MHC_APD_CTG1/chr6_apd_hap1/;
s/HSCHR6_MHC_COX_CTG1/chr6_cox_hap2/;
s/HSCHR6_MHC_DBB_CTG1/chr6_dbb_hap3/;
s/HSCHR6_MHC_MANN_CTG1/chr6_mann_hap4/;
s/HSCHR6_MHC_MCF_CTG1/chr6_mcf_hap5/;
s/HSCHR6_MHC_QBL_CTG1/chr6_qbl_hap6/;
s/HSCHR6_MHC_SSTO_CTG1/chr6_ssto_hap7/;
s/HSCHR4_1_CTG9/chr4_ctg9_hap1/;
s/HSCHR17_1_CTG5/chr17_ctg5_hap1/;" > haplotypeLocations.bed

    hgLoadBed hg19 haplotypeLocations haplotypeLocations.bed
    featureBits hg19 haplotypeLocations
# 7207422 bases of 2897316137 (0.249%) in intersection

#############################################################################
# BUILD THE TRACK OF IKMC MAPPED TO HUMAN GENOME. (DONE 5/23/12 angie)
# done 8/2/11 Fan
    ssh hgwdev
    mkdir -p /hive/data/genomes/hg19/bed/ikmc/2012_05
    cd /hive/data/genomes/hg19/bed/ikmc/2012_05
    # Save files emailed from Carol Bult as
    # 20120518_human.gff.gz
    # Make bed12 with itemRgb:
    # watch out for a few items on chrUn|NT_167216.1 which we call chrUn_gl000222.
    zcat 20120518_human.gff.gz \
    | sed -e 's/^chrUn|NT_167216.1/chrUn_gl000222/' \
    | perl -we \
      'while (<>) { \
         s/\r?\n$//; \
         ($chr, undef, $ctr, $s, $e, undef, undef, undef, $id, $col, $n) = split("\t"); \
         if ($s eq "") { warn "$_\n";  s/^.*//; next; } # Some lines have no coords. \
         $col = ($col eq "Yellow") ? "255,215,0" : \
                ($col eq "Green")  ? "0,240,0" : \
                ($col eq "Blue")   ? "0,0,200" : "0,0,0"; \
         $s--; \
         $id =~ s/^MGI:\d+; ([\w ]+); .*/$1/ || die "Cant parse id \"$id\""; \
         $id =~ s/ //g; \
         my $geneId = join("|", $chr, $ctr, "${n}_$id"); \
         push @{$geneBlks{$geneId}}, [$s, $e, $col] unless $e <= 0; \
      } \
      warn "Got " . scalar(keys %geneBlks) . " genes.\n"; \
      foreach my $geneId (keys %geneBlks) { \
         my @blks = @{$geneBlks{$geneId}}; \
         my ($chrom, $center, $name) = split(/\|/, $geneId); \
         my $blkCount = @blks; \
         @blks = sort {$a->[0] <=> $b->[0]} @blks; \
         my $chromStart = $blks[0]->[0]; \
         my $chromEnd = $blks[$blkCount-1]->[1]; \
         my $color = $blks[0]->[2]; \
         my $blkStarts = ""; \
         my $blkSizes = ""; \
         foreach my $blk (@blks) { \
           my ($start, $end, $col) = @{$blk}; \
           $blkStarts .= ($start - $chromStart) . ","; \
           $blkSizes  .= ($end - $start) . ","; \
           if ($col ne $color) { die "Blocks of $geneId of colors $color and $col"; } \
         } \
        print join("\t", $chrom, $chromStart, $chromEnd, $name, 0, ".", $chromStart, \
                   $chromStart, $color, $blkCount, $blkSizes, $blkStarts) . "\n"; \
      }' \
    | sort -k 1,1 -k 2n,2n > hgIkmc.bed
#Got 46392 genes.
    
    # Make an alias-style table with associated info (MGI ID and status):
    zcat 20120518_human.gff.gz \
    | sed -e 's/^chrUn|NT_167216.1/chrUn_gl000222/' \
    | perl -wpe 's/\r?\n$//; @w = split("\t"); \
      if ($w[3] eq "") { s/^.*//; next; } # Some lines have no coords. \
      if ($w[4] <= 0) { s/^.*//; next; } # A few lines have end=0. \
      $w[8] =~ m/^(MGI:\d+); ([\w ]+); (\w.*)/ || die; \
      ($mgi, $designId, $status) = ($1, $2, $3); \
      $designId =~ s/ //g; \
      # NOTE: This line differs from the mouse version: has $designId preceding $w[2]: \
      $_ = "$w[10]_$designId\t$mgi,$designId,$w[2],$status\n";' \
    | sort -u > hgIkmcExtra.tab
    wc -l hgIkmcExtra.tab
#46392 hgIkmcExtra.tab

    # load and check tables
    hgLoadBed hg19 hgIkmc hgIkmc.bed
    checkTableCoords  -verbose=2  hg19 hgIkmc
    hgLoadSqlTab hg19 hgIkmcExtra $HOME/kent/src/hg/lib/genericAlias.sql hgIkmcExtra.tab
    runJoiner.csh mm9 ikmc
# mm9.ikmcExtra.name - hits 51052 of 51052 ok


#############################################################################
# adding patches to the sequence (DONE - 2010-07-23)
    #	fetch the "official" chrM sequence
    mkdir -p /hive/data/genomes/hg19/bed/additionalSequence/chrM
    cd /hive/data/genomes/hg19/bed/additionalSequence/chrM
    wget --timestamping -O NC_012920.1.fa \
"http://www.ncbi.nlm.nih.gov/sviewer/viewer.fcgi?db=nuccore&dopt=fasta&sendto=on&id=NC_012920.1.fa"
    echo ">chrM_NC_012920" > chrM_NC_012920.fa
    grep -v "^>" NC_012920.1.fa | sed -e "/^$/d" >> chrM_NC_012920.fa

    #	fetch the first two patches:
    mkdir -p /hive/data/genomes/hg19/bed/additionalSequence/patches
    cd /hive/data/genomes/hg19/bed/additionalSequence/patches
    wget --cut-dirs=7 --no-parent --timestamping --no-remove-listing -m \
        -nH --ftp-user=anonymous --ftp-password=yourEmail@your.domain \
"ftp://ftp.ncbi.nlm.nih.gov/genbank/genomes/Eukaryotes/vertebrates_mammals/Homo_sapiens/GRCh37/PATCHES/"

    #	take a look through the downloaded files to find the name
    #	correspondence.  The fasta names are in:
    zcat patch_release_1/FASTA/alt.scaf.fa.gz | grep "^>" 
    # the other names are in:
    cat patch_release_1/alt_scaffold_placement.txt
    #	Decide on UCSC chrom names
    #	Create a file with these different names to use later:
# fasta string             alt_scaf_name  parent_name  UCSC chrom name
    cat << '_EOF_' > ucscNames.txt
gi|289436847|gb|GL339449.1 HSCHR5_1_CTG1 CM000667.1 chr5_ctg1_hap1
gi|289436846|gb|GL339450.1 HG79_PATCH CM000671.1 chr9_gl339450
chrM_NC_012920 unknown unknown chrM_NC_012920
'_EOF_'
    # << happy emacs


    #	construct the files for UCSC:
    mkdir /hive/data/genomes/hg19/bed/additionalSequence/patch1
    cd /hive/data/genomes/hg19/bed/additionalSequence/patch1

    #	add these sequences to existing hg19.2bit to make a new one:
    cat ../chrM/chrM_NC_012920.fa > patch1.ucsc.fa
    zcat ../patches/patch_release_1/FASTA/alt.scaf.fa.gz \
        | sed -e "s/^>.*GL339449.1.*/>chr5_ctg1_hap1/;" \
-e "s/^>.*GL339450.1.*/>chr9_gl339450/" >> patch1.ucsc.fa

    twoBitToFa /gbdb/hg19/hg19.2bit hg19.existing.fa
    faToTwoBit hg19.existing.fa patch1.ucsc.fa hg19.patch1.2bit
    rm -f /gbdb/hg19/hg19.unmasked.patch1.2bit
    #	temporarily use this unmasked sequence
    ln -s `pwd`/hg19.unmasked.patch1.2bit /gbdb/hg19/hg19.patch1.2bit
    twoBitInfo hg19.patch1.2bit stdout | sort -k2nr > patch1.chrom.sizes

    cat << '_EOF_' > mkTables.pl
#!/usr/bin/env perl

use strict;
use warnings;

sub usage() {
    printf STDERR "usage: mkTables.pl patches.chrom.sizes \\\n";
    printf STDERR " ../patches/ucscNames.txt ../patches/patch_release_1/AGP/alt.scaf.agp.gz\n";
}

my $argc = scalar(@ARGV);

if ($argc < 3) {
    usage;
    exit 255;
}

my $sizes = shift;      # patches.chrom.sizes
my $names = shift;      # patches/ucscNames.txt
my $agpFile = shift;    # alt.scaf.agp.gz

my %glToChr;
my %chrToCtg;
my %fastaToChr;
my %chrToSize;

open(FH, "<$sizes") or die "can not read $sizes";
while (my $line = <FH>) {
    chomp $line;
    my ($chr, $size) = split('\s+', $line);
    $chrToSize{$chr} = $size;
}
close (FH);

open(CI, ">chromInfo.txt") or die "can not write to chromInfo.txt";
open(CT, ">ctgPos.txt") or die "can not write to ctgPos.txt";
open(FH, "<$names");
while (my $line = <FH>) {
    chomp $line;
    my ($faName, $ctg, $cmName, $chr) = split('\s+', $line);
    $faName =~ s/.*gb.GL/GL/;
    my $size = $chrToSize{$chr};
    if (exists($glToChr{$faName})) {
        if ($glToChr{$faName} ne $chr) {
            printf STDERR "ERROR: contig name: $faName was chr name: $glToChr{$faName}\n";
            printf STDERR " now claiming to be chr name: $chr\n";
            exit 255;
        }
    } else {
        $glToChr{$faName} = $chr;
    }
    printf CT "%s\t%d\t%s\t0\t%d\n", $faName, $size, $chr, $size;
    printf CI "%s\t%d\t/gbdb/hg19/hg19.patches.2bit\n", $chr, $size;
}
close (FH);
close (CT);
close (CI);

my $prevObj = "";
my $newIx = 1;
open (GP,">gap.txt") or die "can not write to gap.txt";
open (GL,">gold.txt") or die "can not write to gold.txt";
open (FH,"zcat $agpFile|") or die "can not read $agpFile";
while (my $line = <FH>) {
    next if ($line =~ m/^\s*#/);
    chomp $line;
    my ($object, $objStart, $objEnd, $ix, $type, $frag, $fragStart, $fragEnd, $strand) = split('\s+', $line);
    die "ERROR: can not find contig $object to chr name"
        if (!exists($glToChr{$object}));
    $newIx = 1 if ($prevObj ne $object);
    my $chr = $glToChr{$object};
    if ($type eq "N") {
        # frag is size, fragStart is type of gap, and fragEnd is bridged y/n
        printf GP "%s\t%d\t%d\t%d\t%s\t%d\t%s\t%s\n",
            $chr, $objStart-1, $objEnd, $newIx, $type, $frag, $fragStart,
            $fragEnd;
    } else {
        printf GL "%s\t%d\t%d\t%d\t%s\t%s\t%d\t%d\t%s\n",
            $chr, $objStart-1, $objEnd, $newIx, $type, $frag, $fragStart-1,
            $fragEnd, $strand;
    }
    ++$newIx;
    $prevObj = $object;
    printf "%s\n", $line;
}
close (FH);
close (GL);
close (GP);
'_EOF_'
    # << happy emacs
    chmod +x mkTables.pl
    ./mkTables.pl  patch1.chrom.sizes ../patches/ucscNames.txt \
	../patches/patch_release_1/AGP/alt.scaf.agp.gz
    echo 'chrM_NC_012920  16569   /gbdb/hg19/hg19.patches.2bit' \
	>> chromInfo.txt

    #	create tab files
    hgLoadBed -noLoad -maxChromNameLength=14 \
        -sqlTable=/cluster/home/hiram/kent/src/hg/lib/agpFrag.sql \
        hg19 tGold gold.txt
    rm -f gold.tab
    mv bed.tab gold.tab
    hgLoadBed -noLoad -maxChromNameLength=14 \
        -sqlTable=/cluster/home/hiram/kent/src/hg/lib/gap.sql \
        hg19 tGap gap.txt
    rm -f gap.tab
    mv bed.tab gap.tab

    # these table inserts are performed carefully to make sure they are
    # sane, for example, count the rows before and after load:
    hgsql -e 'load data local infile "gold.tab" into table gold;' hg19
    hgsql -e 'load data local infile "gap.tab" into table gap;' hg19
    hgsql -e 'load data local infile "ctgPos.txt" into table ctgPos;' hg19
    hgsql -e 'load data local infile "chromInfo.txt" into table chromInfo;' hg19

    hgsql -e 'update chromInfo set fileName="/gbdb/hg19/hg19.patch1.2bit";' hg19

    cat << '_EOF_' > ctgPos2.txt
HSCHR5_1_CTG1   1620324 chr5_ctg1_hap1  0       1620324 F
HG79_PATCH      330164  chrUn_gl339450  0       330164  F
NC_012920.1     16569   chrM_NC_012920  0       16569   F
NC_001807.4     16571   chrM    0       16571   O
'_EOF_'
    # << happy emacs

    hgsql -e 'load data local infile "ctgPos2.txt" into table ctgPos2;' hg19

    #	RepeatMasking and SimpleRepeats
    mkdir /hive/data/genomes/hg19/bed/additionalSequence/patch1/RMRun
    cd /hive/data/genomes/hg19/bed/additionalSequence/patch1/RMRun
    ln -s ../patch1.ucsc.fa .
    time /scratch/data/RepeatMasker/RepeatMasker -align -s \
	-species 'Homo sapiens' patch1.ucsc.fa
    # took about 6 hours.  Probably should have broken up the large chr5 bit
    #	sort it:

    head -3 patch1.ucsc.fa.out | sed -e "s/  *$//" > patch1.ucsc.sort.out
    headRest 3 patch1.ucsc.fa.out | sort -k5,5 -k6,6n \
	| sed -e "s/  *$//" >> patch1.ucsc.sort.out
    #	create a .tab file to load
    hgLoadBed -noLoad -maxChromNameLength=14 \
        -sqlTable=$HOME/kent/src/hg/lib/nestedRepeats.sql \
        hg19 tRmsk patch1.ucsc.sort.out
    mv bed.tab patch1.rmsk.tab
    hgsql -e 'load data local infile "patch1.rmsk.tab" into table rmsk;' hg19
    #	create nestedRepeats
    /cluster/bin/scripts/extractNestedRepeats.pl patch1.ucsc.sort.out
    #	create a .tab file to load
    hgLoadBed -noLoad -maxChromNameLength=14 \
        -sqlTable=$HOME/kent/src/hg/lib/nestedRepeats.sql \
        hg19 tNest patch1.nestedRepeats.bed
    rm -f patch1.nestedRepeats.tab
    mv bed.tab patch1.nestedRepeats.tab
    hgsql -e 'load data local infile "patch1.nestedRepeats.tab" into table nestedRepeats;' hg19

    mkdir /hive/data/genomes/hg19/bed/additionalSequence/patch1/simpleRepeat
    cd /hive/data/genomes/hg19/bed/additionalSequence/patch1/simpleRepeat
    ln -s ../patch1.ucsc.fa .
    /cluster/bin/$MACHTYPE/trfBig -trf=/cluster/bin/$MACHTYPE/trf \
	patch1.ucsc.fa /dev/null -bedAt=patch1.ucsc.bed -tempDir=.
    awk '$5 <= 12' patch1.ucsc.bed > trfMask.bed
    mkdir trfMaskChrom
    splitFileByColumn trfMask.bed trfMaskChrom/
    hgLoadBed -oldTable hg19 simpleRepeat patch1.ucsc.bed \
        -sqlTable=$HOME/kent/src/hg/lib/simpleRepeat.sql

    #	add these masks
    cd /hive/data/genomes/hg19/bed/additionalSequence/patch1
    twoBitMask -add hg19.unmasked.patch1.2bit RMRun/patch1.ucsc.sort.out \
	hg19.rmsk.2bit
    ln -s `pwd`/hg19.unmasked.patch1.2bit /gbdb/hg19/hg19.patch1.2bit
    twoBitMask -add hg19.rmsk.2bit simpleRepeat/trfMask.bed hg19.patch1.t.2bit
    #	safe to ignore errors about >= 13 fields
    twoBitToFa hg19.patch1.t.2bit stdout | faSize stdin \
	> hg19.patch1.2bit.faSize 2>&1
    # 3139128321 bases (239950803 N's 2899177518 real 1431272440 upper
    # 1467905078 lower) in 96 sequences in 1 files
    # %46.76 masked total, %50.63 masked real
    #	update the unmasked sequence from earlier:
    rm -f hg19.patch1.2bit; mv hg19.patch1.t.2bit hg19.patch1.2bit

    time blat hg19.patch1.2bit \
	/dev/null /dev/null -tileSize=11 -makeOoc=hg19.patch1.11.ooc \
	-repMatch=1024
    #	Wrote 30723 overused 11-mers to hg19.patch1.11.ooc
    cp -p hg19.patch1.2bit hg19.patch1.11.ooc /hive/data/staging/data/hg19
    mkdir nib
    twoBitToFa -seq=chrM_NC_012920 hg19.patch1.2bit stdout \
	| faToNib -softMask stdin nib/chrM_NC_012920.nib
    twoBitToFa -seq=chr5_ctg1_hap1 hg19.patch1.2bit stdout \
	| faToNib -softMask stdin nib/chr5_ctg1_hap1.nib
    twoBitToFa -seq=chr9_gl339450 hg19.patch1.2bit stdout \
	| faToNib -softMask stdin nib/chr9_gl339450.nib


    mkdir /hive/data/genomes/hg19/bed/additionalSequence/patch1/linSpecRep
    cd /hive/data/genomes/hg19/bed/additionalSequence/patch1/linSpecRep
for C in chr9_gl339450 chrM_NC_012920 chr5_ctg1_hap1
do
    head -3 ../RMRun/patch1.ucsc.sort.out > ${C}.out
    grep "${C} " ../RMRun/patch1.ucsc.sort.out >> ${C}.out
    rm -f ${C}.out_mus*
    /scratch/data/RepeatMasker/DateRepeats ${C}.out -query human -comp mouse
    /cluster/bin/scripts/extractRepeats 1 ${C}.out_mus* > ${C}.out.spec
    rm -f ${C}.out_mus*
done

    # copy new files to /hive/data/staging/data/hg19/ and request rsync
    #	to kluster nodes
-rw-rw-r-- 1      2036 Jul 21 15:39 patch1.chrom.sizes
-rw-rw-r-- 1 816756572 Jul 23 15:32 hg19.patch1.2bit
-rw-rw-r-- 1    122900 Jul 23 15:56 hg19.patch1.11.ooc
-rw-rw-r-- 1      8293 Jul 23 15:59 nib/chrM_NC_012920.nib
-rw-rw-r-- 1    810170 Jul 23 16:01 nib/chr5_ctg1_hap1.nib
-rw-rw-r-- 1    165090 Jul 23 16:01 nib/chr9_gl339450.nib
-rw-rw-r-- 1   34676 Jul 23 16:27 lineageSpecificRepeats/chr9_gl339450.out.spec
-rw-rw-r-- 1     386 Jul 23 16:27 lineageSpecificRepeats/chrM_NC_012920.out.spec
-rw-rw-r-- 1  221381 Jul 23 16:27 lineageSpecificRepeats/chr5_ctg1_hap1.out.spec 
#############################################################################
# Update BlastTab tables for hg19 (Done, Fan, 8/6/2010)
    ssh hgwdev
    mkdir -p /hive/data/genomes/hg19/bed/ucsc.12/hgNearBlastp
    cd /hive/data/genomes/hg19/bed/ucsc.12/hgNearBlastp
    mkdir 100806
    cd 100806
    # Get the proteins used by the other hgNear organisms:
    pepPredToFa hg19 knownGenePep hg19.known.faa
    pepPredToFa mm9 knownGenePep mm9.known.faa
    pepPredToFa rn4 knownGenePep rn4.known.faa
    pepPredToFa danRer6 ensPep danRer6.ensPep.faa
    pepPredToFa dm3 flyBasePep dm3.flyBasePep.faa
    pepPredToFa ce6 sangerPep ce6.sangerPep.faa
    pepPredToFa sacCer2 sgdPep sacCer2.sgdPep.faa

    cat << _EOF_ > config.ra
# Latest human vs. other Gene Sorter orgs:
# mouse, rat, zebrafish, worm, yeast, fly

targetGenesetPrefix known
targetDb hg19
queryDbs mm9 rn4 danRer6 dm3 ce6 sacCer2

hg19Fa /hive/data/genomes/hg19/bed/ucsc.12/ucscGenes.faa
mm9Fa /hive/data/genomes/mm9/bed/ucsc.12/ucscGenes.faa
rn4Fa /hive/data/genomes/rn4/bed/blastp/known.faa
danRer6Fa /hive/data/genomes/danRer6/bed/blastp/danRer6.ensPep.faa
dm3Fa /hive/data/genomes/dm3/bed/flybase5.3/flyBasePep.fa
ce6Fa /hive/data/genomes/ce6/bed/blastp/wormPep190.faa
sacCer2Fa /hive/data/genomes/sacCer2/bed/hgNearBlastp/090218/sgdPep.faa

buildDir /hive/data/genomes/hg19/bed/ucsc.12/hgNearBlastp/100806
scratchDir /hive/data/genomes/hg19/bed/ucsc.12/hgNearBlastp/100806/tmp
_EOF_
doHgNearBlastp.pl -targetOnly config.ra >& do.log & tail -f do.log
#########################################################################
# LIFTOVER TO Hg18 (RE-DONE - 2010-07-26 - Hiram )
    #	preserving the previous 10K liftOver files
    mkdir /hive/data/genomes/hg19/bed/liftOver10K
    cd /hive/data/genomes/hg19/bed/liftOver10K
    ln -s ../blat.hg18.2009-06-04/hg19ToHg18.over.chain.gz .

    #	this liftOver is a 5000 size chunk
    mkdir /hive/data/genomes/hg19/bed/blat.hg18.2010-07-26
    cd /hive/data/genomes/hg19/bed/blat.hg18.2010-07-26
    # -debug run to create run dir, preview scripts...
    #	verifies files can be found
    doSameSpeciesLiftOver.pl -debug hg19 hg18
    # Real run:
    time nice -n +19 doSameSpeciesLiftOver.pl -verbose=2 \
	-bigClusterHub=swarm -dbHost=hgwdev -workhorse=hgwdev \
	 hg19 hg18 > do.log 2>&1
    #	real    115m26.071s

    #	checking liftOver accuracy
    mkdir /hive/data/genomes/hg19/bed/blat.hg18.2010-07-26/refGene
    cd /hive/data/genomes/hg19/bed/blat.hg18.2010-07-26/refGene
    hgsql -N -e "select * from refGene;" hg19 | cut -f2- > refGene.hg19.gp
    wc -l refGene.hg19.gp
    #	36766
    #	the 5K block size lift over chain
    liftOver -genePred refGene.hg19.gp ../hg19ToHg18.over.chain.gz \
	refGene.hg19ToHg18.5K.lift.gp refGene.hg19ToHg18.5K.unMapped.gp
    wc -l refGene.hg19ToHg18.5K.unMapped.gp
    #   830
    #	the 10K block size lift over chain
    liftOver -genePred refGene.hg19.gp \
	../../liftOver10K/hg19ToHg18.over.chain.gz \
	refGene.hg19ToHg18.10K.lift.gp refGene.hg19ToHg18.10K.unMapped.gp
    wc -l refGene.hg19ToHg18.10K.unMapped.gp
    #	820
    #	construct custom track of chain files.
    #	the 5K block size lift over chain
    chainToPsl ../hg19ToHg18.over.chain.gz \
	/hive/data/genomes/hg19/chrom.sizes \
	/hive/data/genomes/hg18/chrom.sizes \
	/hive/data/genomes/hg19/hg19.2bit \
	/hive/data/genomes/hg18/hg18.2bit stdout \
	| pslToBed stdin hg19ToHg18.5K.bed
    #	the 10K block size lift over chain
    chainToPsl ../../liftOver10K/hg19ToHg18.over.chain.gz \
	/hive/data/genomes/hg19/chrom.sizes \
	/hive/data/genomes/hg18/chrom.sizes \
	/hive/data/genomes/hg19/hg19.2bit \
	/hive/data/genomes/hg18/hg18.2bit stdout \
	| pslToBed stdin hg19ToHg18.10K.bed

#############################################################################
# GENSCAN PREDICTIONS (DONE - 2010-07-30 Fan)
# 		      (PARTIALLY RE-DONE AFTER FIXING .LTF FILES - 2010-08-03 Fan)
    #   After several attempts, the old genscan process could not be
    #   successfully completed, nor the new process used by mm9,
    #   even with the manual steps to do gsBig with
    #   smaller windowSize.
    #
    #	A new process is developed to overcome the challenges.
    #   This new build process for hg19 genscan is substantially different from
    #	hg18.  Due to gsBig errors, the chroms are split into 2,000,000 bp
    #   segments and the -windowSize is set to the default value of 1,200,000.
    #   The results are then collected and lifted to original chrom
    #   coordinates and then loaded into 3 genscan tables. 

    ssh hgwdev
    
    mkdir -p /hive/data/genomes/hg19/bed/genscan
    cd /hive/data/genomes/hg19/bed/genscan

    # Check out hg3rdParty/genscanlinux to get latest genscan.
    cvs co hg3rdParty/genscanlinux

# the latest and correct genscan build subdir is newTry

    mkdir -p /hive/data/genomes/hg19/bed/genscan/newTry
    cd /hive/data/genomes/hg19/bed/genscan/newTry

# collect .fa files for all chroms in one subdir

    mkdir -p faOrig
    for f in `cat /hive/data/genomes/hg19/chr.fasta.list`
	do
	cp -p /hive/data/genomes/hg19/$f faOrig
	done

# construct the chrom.list

    ls -1 faOrig |sed -e 's/\.fa//'>chrom.list

# creaet the file template to be used for cluster runs
    cat << '_EOF_' > template
#LOOP
/cluster/bin/x86_64/gsBig {check in line+ $(path1)} {check out line gtf/$(root1).gtf} -trans={check out line pep/$(root1).pep} -subopt={check out line subopt/$(root1).bed} -exe=../../../hg3rdParty/genscanlinux/genscan -par=../../../hg3rdParty/genscanlinux/HumanIso.smat -tmp=/tmp -window=1200000
#ENDLOOP
'_EOF_'

# create the chunk2, lift, and runBlat subdir structures
    
    mkdir -p chunk2
    mkdir -p lift
    mkdir -p runBlat
    
    for f in `cat chrom.list`
	do
	mkdir -p lift/$f
	mkdir -p chunk2/$f
	mkdir -p runBlat/$f/gtf 
	mkdir -p runBlat/$f/pep 
	mkdir -p runBlat/$f/subopt
        cp ./template runBlat/$f
        done

# split the chrom fa files into chunks of 2,000,000 bases or less 
# and create corresponding lift files

    for f in `cat chrom.list`
	do
	cd chunk2/$f
	cat ../../faOrig/$f.fa\
	|faSplit -lift=../../lift/$f/$f.lft gap stdin 2000000 ./ck_
	cd ../..
	done

# LATER FOUND THAT THE SEQUENCE ID IN THE .fa FILES UNDER 
# /hive/data/genomes/hg19/* ARE NOT ALWAYS THE SAME AS THE CHROM ID.
# THIS CAUSED INCORRECT CHROM IDs GENERATED IN THE .lft FILES.
# PERFORM THE FOLLOWING TO FIX THE .lft FILES.

    mkdir fixLift
    cd fixLift

    cat << '_EOF_' > fixAll
fix1 chr6_apd_hap1 apd.chr6.4622290.0.4622290.1
fix1 chr6_cox_hap2 cox.chr6.4795371.0.4795371.1
fix1 chr6_dbb_hap3 dbb.chr6.4610396.0.4610396.1
fix1 chr6_mann_hap4 mann
fix1 chr6_mcf_hap5 mcf.chr6.4833398.0.4833398.1
fix1 chr6_qbl_hap6 qbl.chr6.4611984.0.4611984.1
fix1 chr6_ssto_hap7 ssto
'_EOF_'

    cat << '_EOF_' > fix1
echo
echo
echo processing $1 $2
cat /hive/data/genomes/hg19/bed/genscan/newTry/lift/$1/$1.lft |grep $2
cat /hive/data/genomes/hg19/bed/genscan/newTry/lift/$1/$1.lft |sed -e
"s/${2}/${1}/g" >new/$1.lft
cat new/$1.lft |grep $1
cp new/$1.lft /hive/data/genomes/hg19/bed/genscan/newTry/lift/$1/$1.lft 
'_EOF_'

    chmod +x fix*
    fixAll
    cd ..

# go to memk to run cluster jobs

    ssh memk
    cd /hive/data/genomes/hg19/bed/genscan/newTry

# create genome.list and jobList files for each chrom

    for f in `cat chrom.list`
	do
	cd runBlat/$f
	ls -1 /hive/data/genomes/hg19/bed/genscan/newTry/chunk2/$f/* > genome.list
	gensub2 genome.list single template jobList
	cd ../..
	done

# create batch files

    for f in `cat chrom.list`
	do
	cd runBlat/$f
	para create jobList
	cd ../..
	done

# Send off cluster runs

    for f in `cat chrom.list`
	do
        cd runBlat/$f
	para try
        cd ../..
       done

    for f in `cat chrom.list`
	do
        cd runBlat/$f
	para push
        cd ../..
       done

# When all the cluster runs are finished,
# go back to hgwdev for final data collection and table loading

    ssh hgwdev
    cd /hive/data/genomes/hg19/bed/genscan/newTry

# collect ptf results and lift them and then load into the genscan table 

    mkdir -p gtf
    for f in `cat chrom.list`
	do
	echo
	echo processing $f
	cat runBlat/$f/gtf/*.gtf| liftUp -type=.gtf stdout \
        lift/$f/$f.lft error stdin \
	| sed -e "s/ck_/${f}_/g" > gtf/$f.gtf
        ldHgGene -oldTable hg19 -gtf genscan gtf/$f.gtf
	done

# collect subopt results and lift them and then load them into the
# genscanSubopt table

    mkdir -p subopt
    for f in `cat chrom.list`
	do
        echo
	echo processing $f
	cat runBlat/$f/subopt/*.bed| liftUp -type=.bed stdout \
        lift/$f/$f.lft error stdin \
	| sed -e "s/ck_/${f}_/g" > subopt/$f.bed
	hgLoadBed -oldTable hg19 genscanSubopt subopt/$f.bed
	done

# collect pep results and load them into the genscanPep table

    mkdir -p pep
    rm pep/genscanPep.pep
    for f in `cat chrom.list`
	do
        echo
	echo processing $f
	cat runBlat/$f/pep/*.pep\
	| sed -e "s/ck_/${f}_/g" >> pep/genscanPep.pep
	done
    hgPepPred hg19 generic genscanPep pep/genscanPep.pep

################################################################
# HUMAN FETAL BRAIN EXON ARRAYS (YALE) (DONE 2010-08-03 - Chin)
# Note from Andy: All primary data files were lost, The table
# has all of the original data. 
# The "kent/src/hg/makeDb/hgCgiData/Human/microarrayGroups.ra" file should 
# still be valid.  The hgFixed table sestanBrainAtlasExps should be ok.
# So it's just a matter of getting the hg18.sestanBrainAtlas table in
#  hg19 coordinates.

    mkdir  /hive/data/genomes/hg19/bed/yaleMicroarrays
    cd  /hive/data/genomes/hg19/bed/yaleMicroarrays
    cp  /hive/data/genomes/hg18/bed/yaleMicroarrays/sestanBrainAtlas.bed \
           sestanBrainAtlasHg18.bed
    # In the hg18, thickStart is off by 1, fix it
    # hgsql -e " select thickStart-chromStart from sestanBrainAtlas;" hg18 |
    #   sort | uniq
    # 1
    cat sestanBrainAtlasHg18.bed | awk '{print  $1,$2,$3,$4,$5,$6,$7-1,$8,$9,$10,$11,$12,$13,$14,$15 }' >  sestanBrainAtlasHg18Fixed.bed


    # use liftOver to convert the hg18 coordinates to hg19 directly.
    liftOver sestanBrainAtlasHg18Fixed.bed -bedPlus=8 \
      /hive/data/genomes/hg18/bed/liftOver10K/hg18ToHg19.over.chain.gz \
        sestanBrainAtlasHg19.bed unMapped

    # Check the result of liftOver:
    wc -l sestanBrainAtlasHg1*.bed
    # 877877 sestanBrainAtlasHg18.bed
    # 877469 sestanBrainAtlasHg19.bed
    cat unMapped | awk ' /^chr/ {print $1}' | wc -l       
    408
    cat unMapped | awk ' /^#/|| /^chr/ {print $1, $2, $3,_}' \
       > summaryUnMapped.txt

    # Fix up the result, so that $11(blockSizes) = $8(thickEnd)  - $7
    # (thickStart) without the fix, the checkTableCoors will complain.
     cat sestanBrainAtlasHg19.bed | awk '{print  $1,$2,$3,$4,$5,$6,$7,$8,$9, \
        $10, $8-$7, $12,$13,$14,$15 }' >  sestanBrainAtlasHg19Fixed.bed


    # load the table
    hgLoadBed hg19 sestanBrainAtlas sestanBrainAtlasHg19Fixed.bed
    # Reading sestanBrainAtlasHg19.bed
    # Loaded 877469 elements of size 15
    # Sorted
    # Creating table definition for sestanBrainAtlas
    # Saving bed.tab
    # Loading hg19

    # track stuff done from hg18 days: 
    # kent/src/hg/makeDb/trackDb/human/sestanBrainAtlas.html
    # kent/src/hg/makeDb/trackDb/human/trackDb.ra

#############################################################################
# Updating to patch2 sequence (DONE - 2010-08-18 - Hiram)
#	Most of this business is encapsulated into .sh or .pl scripts in
#	these directories.  They can be used next time with slight alterations.
    mkdir /hive/data/genomes/hg19/bed/additionalSequence/patches/patch_release_2
    cd /hive/data/genomes/hg19/bed/additionalSequence/patches/patch_release_2
    wget --cut-dirs=6 --no-parent --timestamping --no-remove-listing -m \
        -nH --ftp-user=anonymous --ftp-password=yourEmail@your.domain \
"ftp://ftp.ncbi.nlm.nih.gov/genbank/genomes/Eukaryotes/vertebrates_mammals/Homo_sapiens/GRCh37.p2/"

    cd /hive/data/genomes/hg19/bed/additionalSequence/patches
    #	construct a script that can gather the names from the delivered
    #	files and generate UCSC names.  May be useful next time.
    cat << '_EOF_' > gatherNames.pl
#!/usr/bin/env perl

use strict;
use warnings;

sub usage() {
    print STDERR "usage: ./gatherNames.pl patch_release_2\n";
}

my $argc = scalar(@ARGV);

if ($argc != 1) {
    usage;
    exit 255;
}

my $patchDir = shift;

if ( ! -d $patchDir ) {
    print STDERR "ERROR: given directory $patchDir is not a directory or does not exist";
    usage;
    exit 255;
}

my %ctgToChr;
my %ctgToFastaName;
my $fasta = "$patchDir/PATCHES/alt_scaffolds/FASTA/alt.scaf.fa.gz";
open (FH, "zcat $fasta | grep '^>' |")
	or die "ERROR: can not read $fasta";
while ( my $line = <FH> ) {
    chomp $line;
    my ($gi, $rest) = split('\s+',$line,2);
    my ($x, $acc, $y, $gl, $z) = split('\|', $gi);
    my $chr = $rest;
    $chr =~ s/Homo sapiens chromosome //;
    $chr =~ s/ genomic contig.*//;
    $ctgToChr{$gl} = $chr;
    $ctgToFastaName{$gl} = $gi;
    $ctgToFastaName{$gl} =~ s/\|$//;
    $ctgToFastaName{$gl} =~ s/^>//;
#    printf "%s\t%s\n", $gl, $chr;
}
close (FH);

my $placement = "$patchDir/PATCHES/alt_scaffolds/alt_scaffold_placement.txt";
open (FH, "sort -t'\t' -k6,6n $placement|") or die "ERROR: can not read $placement";
while (my $line = <FH>) {
    chomp $line;
    next if ($line =~ m/^\s*#/);
    my ($altAsmName, $primAsmName, $altScafName, $altScafAcc, $parentType,
	$parentName, $parentAcc, $regionName, $ori, $altScafStart,
	$altScafStop, $parentStart, $parentStop, $altStartTail, $altStopTail) =
	split('\t',$line);
    my $chr = $ctgToChr{$altScafAcc};
    die "ERROR: chrom name here does not match: $chr != $parentName"
	if ($chr ne $parentName);
#    printf "chr%s %s\t%s\t%s\t%s\t%s\n", $chr, $parentName, $altScafAcc,
#	$altScafName, $parentAcc, $regionName;
    my $ucscChrName = lc($altScafName);
    if ($ucscChrName =~ m/_patch$/) {
	$ucscChrName = sprintf("chr%d_%s", $parentName, lc($altScafAcc));
	$ucscChrName =~ s/\.1$//;
    } else {
	$ucscChrName =~ s/^hs//;
	$ucscChrName =~ s/_1$//;
	my ($chrNum, $hapNum, $ctgName) = split('_', $ucscChrName, 3);
	$ucscChrName = sprintf("%s_%s_%s", $chrNum, $ctgName, lc($altScafAcc));
	$ucscChrName =~ s/\.1$//;
    }
    printf "%s %s %s %s\n", $ctgToFastaName{$altScafAcc}, $altScafName,
	$parentAcc, $ucscChrName;
}
close (FH);
'_EOF_'
    # << happy emacs
    chmod +x ./gatherNames.pl
    ./gatherNames.pl patch_release_2 > ucscNames.patch2.txt

    # with that name translation file in place, begin to put the sequences
    #	together:
    mkdir /hive/data/genomes/hg19/bed/additionalSequence/patch2
    cd /hive/data/genomes/hg19/bed/additionalSequence/patch2
    cat << '_EOF_' > addSequence.pl
#!/usr/bin/env perl

use strict;
use warnings;

sub usage() {
    print STDERR "usage: ./addSequence.pl ../patches/ucscNames.patch2.txt \\\n";
    print STDERR "\t../patches/patch_release_2/PATCHES/alt_scaffolds/FASTA/alt.scaf.fa.gz\n";
}

my $argc = scalar(@ARGV);

if ($argc != 2) {
    usage;
    exit 255;
}

my %skipSequence;
$skipSequence{"chr9_gl339450"} = 1;
$skipSequence{"chr5_ctg1_gl339449"} = 1;

my $names = shift;
my $fasta = shift;

my %fastaToChrNames;
open(FH, "<$names") or die "ERROR: can not read $names";
while (my $line = <FH>) {
    chomp $line;
    my ($fa, $ctg, $cm, $chr) = split('\s+', $line);
    $fastaToChrNames{$fa} = $chr;
}
close (FH);

open(PA, ">patch2.ucsc.fa") or die "ERROR: can not write to patch2.ucsc.fa";
open(FH, "zcat $fasta|") or die "ERROR: can not zcat $fasta";
my $skipToNext = 0;
while (my $line = <FH>) {
    if ($line =~ m/^>/) {
	my ($fa, $rest) = split('\s+', $line, 2);
	$fa =~ s/\|$//;
	$fa =~ s/^>//;
	die "can not find $fa" if (!exists($fastaToChrNames{$fa}));
	my $chr = $fastaToChrNames{$fa};
	if (exists($skipSequence{$chr})) {
	    $skipToNext = 1;
	} else {
	    printf PA ">%s\n", $chr;
	    $skipToNext = 0;
	}
    } else {
	next if($skipToNext);
	print PA $line;
    }
}
close (FH);
close (PA);

my $here=`pwd`;
chomp $here;
print `twoBitToFa ../patch1/hg19.patch1.2bit hg19.existing.fa`;
print `faToTwoBit hg19.existing.fa patch2.ucsc.fa hg19.patch2.2bit`;
print `rm -f /gbdb/hg19/hg19.patch2.2bit`;
print `ln -s $here/hg19.patch2.2bit /gbdb/hg19`;
print `twoBitInfo hg19.patch2.2bit stdout | sort -k2nr > patch2.chrom.sizes`;
'_EOF_'
    # << happy emacs
    chmod +x addSequence.pl
    ./addSequence.pl ../patches/ucscNames.patch2.txt \
      ../patches/patch_release_2/PATCHES/alt_scaffolds/FASTA/alt.scaf.fa.gz

    sort patch2.chrom.sizes ../patch1/patch1.chrom.sizes | uniq -c \
        | sort -rn | awk '$1 == 1' | awk '{printf "%s\t%s\n", $2, $3}' \
        | sort -k2,2nr > patches.chrom.sizes

    cat << '_EOF_' > mkTables.pl
#!/usr/bin/env perl

use strict;
use warnings;

sub usage() {
    printf STDERR "usage: ./mkTables.pl patches.chrom.sizes \\\n";
    printf STDERR " ../patches/ucscNames.patch2.txt ../patches/patch_release_2/PATCHES/alt_scaffolds/AGP/alt.scaf.agp.gz\n";
}

my $argc = scalar(@ARGV);

if ($argc < 3) {
    usage;
    exit 255;
}

my %skipSequence;
$skipSequence{"GL339449.1"} = 1;
$skipSequence{"GL339450.1"} = 1;

my $sizes = shift;	# patches.chrom.sizes
my $names = shift;	# patches/ucscNames.txt
my $agpFile = shift;	# alt.scaf.agp.gz

my %glToChr;
my %chrToCtg;
my %fastaToChr;
my %chrToSize;

open(FH, "<$sizes") or die "can not read $sizes";
while (my $line = <FH>) {
    chomp $line;
    my ($chr, $size) = split('\s+', $line);
    $chrToSize{$chr} = $size;
}
close (FH);

open(CI, ">chromInfo.txt") or die "can not write to chromInfo.txt";
open(CT, ">ctgPos.txt") or die "can not write to ctgPos.txt";
open(FH, "<$names");
while (my $line = <FH>) {
    chomp $line;
    my ($faName, $ctg, $cmName, $chr) = split('\s+', $line);
    $faName =~ s/.*gb.GL/GL/;
    next if (exists($skipSequence{$faName}));
    my $size = $chrToSize{$chr};
    if (exists($glToChr{$faName})) {
	if ($glToChr{$faName} ne $chr) {
	    printf STDERR "ERROR: contig name: $faName was chr name: $glToChr{$faName}\n";
	    printf STDERR " now claiming to be chr name: $chr\n";
	    exit 255;
	}
    } else {
	$glToChr{$faName} = $chr;
    }
    die "not defined faName" if (!defined($faName));
    die "not defined $faName $chr size" if (!defined($size));
    printf CT "%s\t%d\t%s\t0\t%d\n", $faName, $size, $chr, $size;
    printf CI "%s\t%d\t/gbdb/hg19/hg19.patch2.2bit\n", $chr, $size;
}
close (FH);
close (CT);
close (CI);

my $prevObj = "";
my $newIx = 1;
open (GP,">gap.txt") or die "can not write to gap.txt";
open (GL,">gold.txt") or die "can not write to gold.txt";
open (FH,"zcat $agpFile|") or die "can not read $agpFile";
while (my $line = <FH>) {
    next if ($line =~ m/^\s*#/);
    chomp $line;
    my ($object, $objStart, $objEnd, $ix, $type, $frag, $fragStart, $fragEnd, $strand) = split('\s+', $line);
    next if (exists($skipSequence{$object}));
    die "ERROR: can not find contig $object to chr name"
	if (!exists($glToChr{$object}));
    $newIx = 1 if ($prevObj ne $object);
    my $chr = $glToChr{$object};
    if ($type eq "N") {
	# frag is size, fragStart is type of gap, and fragEnd is bridged y/n
	printf GP "%s\t%d\t%d\t%d\t%s\t%d\t%s\t%s\n",
	    $chr, $objStart-1, $objEnd, $newIx, $type, $frag, $fragStart,
	    $fragEnd;
    } else {
	printf GL "%s\t%d\t%d\t%d\t%s\t%s\t%d\t%d\t%s\n",
	    $chr, $objStart-1, $objEnd, $newIx, $type, $frag, $fragStart-1,
	    $fragEnd, $strand;
    }
    ++$newIx;
    $prevObj = $object;
    printf "%s\n", $line;
}
close (FH);
close (GL);
close (GP);
'_EOF_'
    # << happy emacs
    chmod +x mkTables.pl
    ./mkTables.pl patches.chrom.sizes \
    	../patches/ucscNames.patch2.txt \
	../patches/patch_release_2/PATCHES/alt_scaffolds/AGP/alt.scaf.agp.gz

    hgLoadBed -noLoad -maxChromNameLength=14 \
        -sqlTable=/cluster/home/hiram/kent/src/hg/lib/agpFrag.sql \
        hg19 tGold gold.txt
    rm -f gold.tab
    mv bed.tab gold.tab
    hgLoadBed -noLoad -maxChromNameLength=14 \
        -sqlTable=/cluster/home/hiram/kent/src/hg/lib/gap.sql \
        hg19 tGap gap.txt
    rm -f gap.tab
    mv bed.tab gap.tab

    hgsql -e 'load data local infile "gap.tab" into table gap;' hg19
    hgsql -e 'load data local infile "gold.tab" into table gold;' hg19
    hgsql -e 'load data local infile "ctgPos.txt" into table ctgPos;' hg19

    cat << '_EOF_' > mkCtgPos2.pl
#!/usr/bin/env perl

use strict;
use warnings;

sub usage() {
    print STDERR "usage: ./mkCtgPos2.pl ../patches/ucscNames.patch2.txt \\\n";
    print STDERR "\tpatch2.chrom.sizes\n";
}

my $argc = scalar(@ARGV);

if ($argc != 2) {
    usage;
    exit 255;
}

my %skipSequence;
$skipSequence{"chr9_gl339450"} = 1;
$skipSequence{"chr5_ctg1_gl339449"} = 1;

my $names = shift;
my $sizes = shift;

my %chrSize;
open(FH, "<$sizes") or die "ERROR: can not read $sizes";
while (my $line = <FH>) {
    chomp $line;
    my ($chr, $size) = split('\s+', $line);
    $chrSize{$chr} = $size;
}
close (FH);

open(FH, "<$names") or die "ERROR: can not read $names";
while (my $line = <FH>) {
    chomp $line;
    my ($fa, $ctg, $cm, $chr) = split('\s+', $line);
    next if (exists($skipSequence{$chr}));
    if (exists($chrSize{$chr})) {
	my $size = $chrSize{$chr};
	printf "%s\t%d\t%s\t0\t%d\tF\n", $ctg, $size, $chr, $size;
    }
}
close (FH);
'_EOF_'
    # << happy emacs
    chmod +x mkCtgPos2.pl
    ./mkCtgPos2.pl ../patches/ucscNames.patch2.txt patch2.chrom.sizes \
	> ctgPos2.txt
    hgsql -e 'load data local infile "ctgPos2.txt" into table ctgPos2;' hg19

    cat << '_EOF_' > mkHapLocate.pl
#!/usr/bin/env perl

use strict;
use warnings;

sub usage() {
    print STDERR "usage: ./mkHapLocate.pl ctgPos.txt \\\n";
    print STDERR "\t../patches/patch_release_2/PATCHES/alt_scaffolds/alt_scaffold_placement.txt\n";
}

my $argc = scalar(@ARGV);

if ($argc != 2) {
    usage;
    exit 255;
}

my %skipSequence;
$skipSequence{"chr9_gl339450"} = 1;
$skipSequence{"chr5_ctg1_gl339449"} = 1;

my $ctgPos = shift;
my $placement = shift;

my %ctgToHap;
open(FH, "<$ctgPos") or die "ERROR: can not read $ctgPos";
while (my $line = <FH>) {
    my ($ctg, $size, $hapName, $rest) = split('\s+', $line, 4);
    $ctgToHap{$ctg} = $hapName;
}
close (FH);

open(FH,"<$placement") or die "ERROR: can not read $placement";
while (my $line = <FH>) {
    next if ($line =~ m/^#/);
    chomp $line;
    my ($altAsmName, $primAsmName, $altScafName, $altScafAcc, $parentType,
	$parentName, $parentAcc, $regionName, $ori, $altScafStart,
	$altScafStop, $parentStart, $parentStop, $altStartTail,
	$altStopTail) = split('\t', $line);
    if (exists($ctgToHap{$altScafAcc})) {
	my $hapName = $ctgToHap{$altScafAcc};
	printf "chr%s\t%d\t%d\t%s\n", $parentName, $parentStart-1,
	    $parentStop, $hapName;
    } else {
	print STDERR "not found: $altScafAcc $altScafName\n";
    }
}
close (FH);
'_EOF_'
    # << happy emacs
    chmod +x mkHapLocate.pl
    ./mkHapLocate.pl ctgPos.txt \
../patches/patch_release_2/PATCHES/alt_scaffolds/alt_scaffold_placement.txt \
    > haplotypeLocations.bed
    hgLoadBed -oldTable hg19 haplotypeLocations haplotypeLocations.bed 

    mkdir simpleRepeat
    cd simpleRepeat
    ln -s ../patch2.ucsc.fa
    /cluster/bin/$MACHTYPE/trfBig -trf=/cluster/bin/$MACHTYPE/trf \
      patch2.ucsc.fa /dev/null -bedAt=patch2.ucsc.bed -tempDir=.
    awk '$5 <= 12' patch2.ucsc.bed > trfMask.bed
    splitFileByColumn trfMask.bed trfMaskChrom/
    hgLoadBed -oldTable hg19 simpleRepeat patch2.ucsc.bed \
        -sqlTable=$HOME/kent/src/hg/lib/simpleRepeat.sql

    mkdir ../RMRun
    cd ../RMRun
    ln -s ../patch2.ucsc.fa
    faSplit byname patch2.ucsc.fa ctgs/

    cat << '_EOF_' > runOne
#!/bin/csh -fe

set ctg = $1

set runDir = /hive/data/genomes/hg19/bed/additionalSequence/patch2/RMRun
set src = ${runDir}/ctgs/${ctg}.fa
mkdir -p ${runDir}/out/${ctg}
cd ${runDir}/out/${ctg}

cp -p ${src} .
/scratch/data/RepeatMasker/RepeatMasker -align -s -species 'Homo sapiens' ${ctg}.fa
rm -f ${ctg}.fa
'_EOF_'
    # << happy emacs
    chmod +x runOne
    
    ls ctgs > ctg.list
    cat << '_EOF_' > template
#LOOP
runOne $(root1) {check out line+ out/$(root1)/$(root1).fa.out}
#ENDLOOP
'_EOF_'
    # << happy emacs
    gensub2 ctg.list single template jobList
    ssh swarm
    cd /hive/data/genomes/hg19/bed/additionalSequence/patch2/RMRun
    para create jobList
    mkdir out
    para try
    para push
# Completed: 68 of 68 jobs
# CPU time in finished jobs:      88730s    1478.83m    24.65h    1.03d  0.003 y
# IO & Wait Time:                  6277s     104.62m     1.74h    0.07d  0.000 y
# Average job time:                1397s      23.29m     0.39h    0.02d
# Longest finished job:            4154s      69.23m     1.15h    0.05d
# Submission to last job:          4291s      71.52m     1.19h    0.05d

    find ./out -type f -name "*.fa.out" | head -1 | xargs head -3 \
	> hg19.patch2.out
    find ./out -type f -name "*.fa.out" | xargs -L 1 headRest 3 \
	| sort -k5,5 -k6,6n | sed -e "s/  *$//" >> hg19.patch2.out

    extractNestedRepeats.pl hg19.patch2.out > hg19.nestedRepeats.patch2.bed
    hgLoadBed -noLoad -maxChromNameLength=14 \
        -sqlTable=$HOME/kent/src/hg/lib/nestedRepeats.sql \
        hg19 tNest hg19.nestedRepeats.patch2.bed
    rm -f hg19.nestedRepeats.patch2.tab
    mv bed.tab hg19.nestedRepeats.patch2.tab

    hgLoadOut -tabFile=hg19.patch2.rmsk.tab -nosplit hg19 hg19.patch2.out

    hgsql -e 'load data local infile "hg19.nestedRepeats.patch2.tab" into table nestedRepeats;' hg19
    hgsql -e 'load data local infile "hg19.patch2.rmsk.tab" into table rmsk;' \
	hg19

    mv hg19.patch2.2bit hg19.patch2.0.2bit
    twoBitMask -add hg19.patch2.0.2bit RMRun/hg19.patch2.out hg19.patch2.1.2bit
    twoBitMask -add hg19.patch2.1.2bit \
        simpleRepeat/trfMask.bed hg19.patch2.2bit
    twoBitToFa hg19.patch2.2bit stdout | faSize stdin \
	> faSize.hg19.patch2.2bit.txt
    
#############################################################################
# new blat server for the hg19.patch2 sequence (DONE - 2010-08-18 - Hiram)
    hgsql -e 'INSERT INTO blatServers (db, host, port, isTrans, canPcr) \
	VALUES ("hg19", "blat4", "17792", "1", "0"); \
	INSERT INTO blatServers (db, host, port, isTrans, canPcr) \
	VALUES ("hg19", "blat4", "17793", "0", "1");' \
	    hgcentraltest

#############################################################################
# establish some liftover chains from reference sequence to new added sequence
#	(WORKING - 2010-08-19 - Hiram)
    mkdir /hive/data/genomes/hg19/bed/additionalSequence/patch2/blat.2010-08-11
    cd /hive/data/genomes/hg19/bed/additionalSequence/patch2/blat.2010-08-11
    twoBitInfo ../hg19.patch2.2bit stdout | sort > patch2.chrom.sizes
    twoBitInfo ../../../../hg19.2bit stdout | sort > hg19.chrom.sizes
    comm -13 hg19.chrom.sizes patch2.chrom.sizes > new.sequence.chrom.sizes
for S in `awk '{print $1}' new.sequence.chrom.sizes`
do
    echo $S
    mkdir -p $S
    twoBitToFa ../hg19.patch2.2bit:${S} ${S}/${S}.fa
done

    faToTwoBit chr*/chr*.fa hg19.patch2.only.2bit
    rm -fr chr*/chr*.fa
    rmdir chr*

    ssh swarm
    cd /hive/data/genomes/hg19/bed/additionalSequence/patch2/blat.2010-08-11
    cat << '_EOF_' > runOne
#!/bin/csh -ef

set hg19 = "/hive/data/genomes/hg19/bed/additionalSequence/patch2/hg19.patch2.2bit"
set runJob = `pwd`/job.csh
set target = $1
set outPsl = $2
set query = `echo $target | sed -e "s/_.*//"`
set qSize = `twoBitInfo ${hg19}:${query} stdout | awk '{print $2}'`
set tSequence = "${hg19}:${target}"
set qSequence = "${hg19}:${query}:0-${qSize}"
mkdir -p psl/${target}
pushd psl/${target}
# echo "${runJob} ${tSequence} ${qSequence} `pwd`/${target}.psl"
${runJob} ${tSequence} ${qSequence} `pwd`/${target}.psl
popd
'_EOF_'
    # << happy emacs
    chmod +x runOne

    cat << '_EOF_' > job.csh
#!/bin/csh -ef

set targetList = $1
set queryListIn = $2
set outPsl = $3

if ($targetList:e == "lst") set targetList = /hive/data/genomes/hg19/bed/blat.hg18.2010-07-26/run.blat/$targetList
if ($queryListIn:e == "lst") set queryListIn = /hive/data/genomes/hg19/bed/blat.hg18.2010-07-26/run.blat/$queryListIn

# Use local disk for output, and move the final result to $outPsl
# when done, to minimize I/O.
set tmpDir = `mktemp -d -p /scratch/tmp doSame.blat.XXXXXX`
pushd $tmpDir

# We might get a .lst or a 2bit spec here -- convert to (list of) 2bit spec:
if ($queryListIn:e == "lst") then
  set specList = `cat $queryListIn`
else
  set specList = $queryListIn
endif

# Further partition the query spec(s) into 5k coord ranges, building up
# a .lst of 2bit specs for blat and a .lft liftUp spec for the results:
cp /dev/null reSplitQuery.lst
cp /dev/null query.lft
foreach spec ($specList)
  set file  = `echo $spec | awk -F: '{print $1;}'`
  set seq   = `echo $spec | awk -F: '{print $2;}'`
  set range = `echo $spec | awk -F: '{print $3;}'`
  set start = `echo $range | awk -F- '{print $1;}'`
  set end   = `echo $range | awk -F- '{print $2;}'`
  if (! -e q.sizes) twoBitInfo $file q.sizes
  set seqSize = `awk '$1 == "'$seq'" {print $2;}' q.sizes`
  set chunkEnd = '0'
  while ($chunkEnd < $end)
    set chunkEnd = `expr $start + 5000`
    if ($chunkEnd > $end) set chunkEnd = $end
    set chunkSize = `expr $chunkEnd - $start`
    echo $file\:$seq\:$start-$chunkEnd >> reSplitQuery.lst
    if (($start == 0) && ($chunkEnd == $seqSize)) then
      echo "$start	$seq	$seqSize	$seq	$seqSize" >> query.lft
    else
      echo "$start	$seq"":$start-$chunkEnd	$chunkSize	$seq	$seqSize" >> query.lft
    endif
    set start = `expr $chunkEnd - 500`
  end
end

# Align unsplit target sequence(s) to .lst of 2bit specs for 5k chunks
# of query:
blat $targetList reSplitQuery.lst tmpUnlifted.psl \
  -tileSize=11 -minScore=100 -minIdentity=98 -fastMap -noHead

# Lift query coords back up:
liftUp -pslQ -nohead tmpOut.psl query.lft warn tmpUnlifted.psl

# Move final result into place:
mv tmpOut.psl $outPsl

popd
rm -rf $tmpDir
'_EOF_'
    # << happy emacs
    chmod +x job.csh

    cat << '_EOF_' > template
#LOOP
runOne $(root1) {check out line+ psl/$(root1)/$(root1).psl}
#ENDLOOP
'_EOF_'
    # << happy emacs

    mkdir psl
    awk '{print $2}' new.sequence.chrom.sizes > target.list
    gensub2 target.list single template jobList
    para create jobList
    para try ... check ... push
    para time
# Completed: 71 of 71 jobs
# CPU time in finished jobs:      29233s     487.22m     8.12h    0.34d  0.001 y
# IO & Wait Time:                  2567s      42.78m     0.71h    0.03d  0.000 y
# Average job time:                 448s       7.46m     0.12h    0.01d
# Longest finished job:            3757s      62.62m     1.04h    0.04d
# Submission to last job:          4070s      67.83m     1.13h    0.05d

    ssh hgwdev
    cd /hive/data/genomes/hg19/bed/additionalSequence/patch2/blat.2010-08-11
for C in `cat target.list`
do
    echo ${C}
    mkdir chain/${C}
    cd chain/${C}
    ln -s ../../psl/${C}/${C}.psl
    axtChain -linearGap=medium -psl ${C}.psl ../../../hg19.patch2.2bit ../../../hg19.patch2.2bit ${C}.chain
    cd ../..
done

    cd chain
    find . -type f | xargs cat | chainSort stdin patch2.sort.chain
    hgLoadChain hg19 patch2Chain patch2.sort.chain
    #	Loading 93897 chains into hg19.patch2Chain

#########################################################################
# Vega gene update (DONE - 2010-08-25 - Hiram)
    #	lookup version number at the Vega WEB site:
    #	http://vega.sanger.ac.uk/index.html
    #	and FTP site:
    #	ftp://ftp.sanger.ac.uk/pub/vega/
    cd /hive/data/genomes/hg19
    #	step wise to verify operation
    doEnsGeneUpdate.pl -vegaGene -ensVersion=39 -stop=download hg19.ensGene.ra
    # they changed their naming convention again.  Look at the FTP site,
    #	fix the download script:
    cd bed/vega.39/download
    ./doDownload.csh
# -rw-rw-r-- 1 12399394 Aug 25 09:15 gtf_file.gz
# -rw-rw-r-- 1  9329992 Aug 25 09:15 Homo_sapiens.VEGA.39.pep.all.fa.gz

    doEnsGeneUpdate.pl -vegaGene -ensVersion=39 \
	-continue=process -stop=process hg19.ensGene.ra
# genePredCheck -db=hg19 vegaPseudo.gp.gz
# checked: 12012 failed: 0
# genePredCheck -db=hg19 not.vegaPseudo.gp.gz
# checked: 103437 failed: 0
# genePredCheck -db=hg19 hg19.allGenes.gp.gz
# checked: 115449 failed: 0

    doEnsGeneUpdate.pl -vegaGene -ensVersion=39 \
	-continue=load -stop=load hg19.ensGene.ra
    featureBits hg19 vegaGene
    # 78097231 bases of 2911519270 (2.682%) in intersection
    featureBits hg19 vegaPseudoGene
    #	8782198 bases of 2911519270 (0.302%) in intersection

#########################################################################
# FOSMID END PAIRS (STARTED 9/1/10 angie, WORKING - 2011-07-07 - Hiram)
    # First I downloaded raw files from NCBI, to see if they are newer
    # than what we have in fosends.3:
    mkdir /hive/data/outside/fosends.4
    cd /hive/data/outside/fosends.4
    wget --timestamping ftp://ftp.ncbi.nih.gov/genomes/FOSMIDS/homo_sapiens/\*
    # The file dates are 2005, but all log files begin with 2002 dates.
    # Good, we can proceed with fosends.3 and won't have to reverse-engineer
    # Terry's lost pipeline for translating the files.
    # take a look at the names
    zcat Hs*.dr.mfa.gz | grep "^>" > sequence.names
    zcat *.trim.log.gz | cut -f6 | sort > clone.names
    # we are going to translate the .T0 .T1 .T2 suffix on the sequence names
    #	to _T0 _T1 _T2 to avoid problems during blat and other parts of
    #	the pipeline
    # using the *.trim.log.gz files, figure out the pairs, write to
    #	files endPairs.txt and singles.txt
    cat << '_EOF_' > pairEnds.pl
#!/bin/env perl

use strict;
use warnings;

my %endIds;	# key is cloneId.subId, value is endId_F,endId_R
open(FH,"zcat Hs.*trim.log.gz|") or die "can not read Hs.*trim.log.gz";
while (my $line = <FH>) {
    next if ($line =~ m/^#/);
    my ($id, $l0, $l1, $endId, $wibr, $cloneId, $fr, $cId, $l2) =
	split('\s+', $line);
    my ($oneEnd, $subId) = split('\.', $endId);
    my $key = "$cloneId.$subId";
    if (exists($endIds{$key})) {
	die "ERROR: third end: $endId $cloneId $endIds{$key}"
		if ($endIds{$key} =~ m/,/);
	if ($fr eq "F") {
	    $endIds{$key} = "$endId,$endIds{$key}";
	} else {
	    $endIds{$key} = "$endIds{$key},$endId";
	}
    } else {
	$endIds{$key} = $endId;
    }
}
close (FH);

open (EP, ">endPairs.txt") or die "can not write to endPairs.txt";
open (SI, ">singles.txt") or die "can not write to singles.txt";
foreach my $key (sort (keys %endIds)) {
    my ($cloneId, $subId) = split('\.', $key);
    if ($endIds{$key} =~ m/,/) {
	my ($fwd, $rev) = split(',', $endIds{$key});
	$fwd =~ s/\./_/;
	$rev =~ s/\./_/;
	printf EP "%s\t%s\t%s\n", $fwd, $rev, $cloneId;
    } else {
	my $fwd = $endIds{$key};
	$fwd =~ s/\./_/;
	printf SI "%s\t%s\n", $fwd, $cloneId;
    }
}
close(EP);
close(SI);
'_EOF_'
    # << happy emacs
    chmod +x pairEnds.pl
    ./pairEnds.pl

    # working in the hg19 build directory
    mkdir /hive/data/genomes/hg19/bed/fosEndPairs
    cd /hive/data/genomes/hg19/bed/fosEndPairs

    mkdir /hive/data/genomes/hg19/bed/fosEndPairs/mapEnds
    cd /hive/data/genomes/hg19/bed/fosEndPairs/mapEnds
    # fixup the names, and upper case all sequence (the names are already uc)
    zcat /hive/data/outside/fosends.4/*.dr.mfa.gz \
	| sed -e "s/^>gnl.* />/; s/\.T/_T/" | tr '[a-z]' '[A-Z]' > fosends.4.fa
    faCount fosends.4.fa > fosends.4.faCount
    tail -1 fosends.4.faCount
# total   1258791003      329912804       274295722       339230145       279976011       35376321        40508700
    # verify nothing broken, nothing lost with the name transition
    faCount /hive/data/outside/fosends.4/*.mfa.gz > /tmp/fs4.faCount
    tail -1 /tmp/fs4.faCount
# total   1258791003      329912804       274295722       339230145       279976011       35376321        40508700

    faSize fosends.4.fa
# 1258791003 bases (35376321 N's 1223414682 real 1223414682 upper 0 lower) in 1615492 sequences in 1 files
# Total size: mean 779.2 sd 190.8 min 12 (G248P84602FB12_T0) max 3784 (G248P89620RG7_T0) median 722

    mkdir splitEnds
    faSplit sequence fosEnds.4.fa 400 splitEnds/fosEnds

    # figure out break points in hg19 around large gaps
    hgsql -N -e "select chrom,chromStart,chromEnd,size from gap;" hg19 \
	| sort -k4nr > hg19.gap.bed
    # script to combine adjacent gaps into single gaps
    cat << '_EOF_' > bedCollapse.pl
#!/bin/env perl

use strict;
use warnings;

my $argc = scalar(@ARGV);

if ($argc < 1) {
    printf STDERR "usage: ./bedCollapse.pl <file.bed>\n";
    printf STDERR "will combine adjacent bed elements into one element\n";
    exit 255
}

my $file = shift;
my $chr = "";
my $prevEnd = 0;
my $start = 0;
my $end = 0;
my $size = 0;
open (FH, "sort -k1,1 -k2,2n $file|") or die "can not read $file";
while (my $line = <FH>) {
    chomp $line;
    my ($c, $s, $e, $rest) = split('\s+', $line, 4);
    $size = $end - $start;
    if (length($chr) > 1) {
        if ($chr ne $c) {
            printf "%s\t%d\t%d\t%d\n", $chr, $start, $end, $size;
            $chr = $c; $start = $s; $end = $e;
        } else {
            if ($s == $end) { 
                $end = $e;
            } else {
                printf "%s\t%d\t%d\t%d\n", $chr, $start, $end, $size;
                $chr = $c; $start = $s; $end = $e;
            }
        }
    } else {
        $chr = $c; $start = $s; $end = $e;
    }
}
printf "%s\t%d\t%d\t%d\n", $chr, $start, $end, $size;
close (FH);
'_EOF_'
    # << happy emacs
    chmod +x bedCollapse.pl
    # filter out for gaps of 50,000 and larger
    ./bedCollapse.pl hg19.gap.bed | awk '$4 > 49999' > hg19.gapBreaks.bed
    cat << '_EOF_' > hg19SplitSpec.pl
#!/bin/env perl

use strict;
use warnings;

my %chromSizes;
open (FH, "<../../../chrom.sizes") or die "can not read ../../../chrom.sizes";
while (my $line = <FH>) {
    chomp $line;
    my ($chr, $size) = split('\s+', $line);
    $chromSizes{$chr} = $size;
}
close (FH);

my %chrDone;	# to measure which chroms have been done
my $curChr = "";
my $start = 0;
my $end = 0;
open (FH, "grep -v '_' hg19.gapBreaks.bed|") or die "can not grep hg19.gapBreaks.bed";
while (my $line = <FH>) {
    chomp $line;
    my ($c, $s, $e, $rest) = split('\s+', $line, 4);
    if (length($curChr) > 0) {
	if ($c eq $curChr) {
	    $end = $s;
	    my $size = $end - $start;
	    die "ERROR: size is zero ? $c $s $e" if ($size < 1);
	    printf "%s\t%d\t%d\t%d\n", $curChr, $start, $end, $size;
	    $chrDone{$curChr} = 1;
	} else {	# finish off previous chrom
	    my $chrSize = $chromSizes{$curChr};
	    if ($start < $chrSize) {
		my $size = $chrSize - $start;
		printf "%s\t%d\t%d\t%d\n", $curChr, $start, $chrSize, $size;
		$chrDone{$curChr} = 1;
	    }
	    $curChr = $c;
	}
    } else {  # first line in the file
	$curChr = $c;
	if ($s > 0) {
	    printf "%s\t0\t%d\t%d\n", $curChr, $s, $s;
	    $chrDone{$curChr} = 1;
	}
    }
    $start = $e;	# next start is this end
    $end = $start;	# next end will be next start or chrSize
}
close(FH);
'_EOF_'
    # << happy emacs
    chmod +x hg19SplitSpec.pl
    ./hg19SplitSpec.pl > hg19.splits.bed

    cd /hive/data/genomes/hg19/bed/fosEndPairs/mapEnds
    # 4M chunks might work a bit better
    ./partitionBed.pl 4000000 10000 hg19.splits.bed > hg19.4M.10K.bed
    mkdir /hive/data/genomes/hg19/bed/fosEndPairs/mapEnds/run4M
    cd /hive/data/genomes/hg19/bed/fosEndPairs/mapEnds/run4M
    awk '{printf "%s:%d-%d\n", $1,$2,$3}' ../hg19.4M.10K.bed > hg19.list
    ls ../../splitEnds | sed -e "s/.fa//" > fosEnds.list

    # XXX this didn't work with -fastMap, too much of the query was knocked
    #	out and the results wouldn't pass the pslReps filter
    #  Need to run blat without any arguments, filter the results
    #	with pslReps
    cat << '_EOF_' > runOne
#!/bin/csh -fe

set t=$1
set q=$2
set c=`echo $t | sed -e 's/:.*//'`
set ctgStart=`echo $t | sed -e 's/.*://; s/-.*//'`
set ctgSize=`echo $t | sed -e 's/:/ /; s/-/ /;' | awk '{printf "%d", $3-$2}'`
set chrSize=`egrep "^$c " /scratch/data/hg19/chrom.sizes | cut -f2`
set result="psl/${t}/${q}.psl"
/bin/mkdir -p "psl/${t}"
set tmpLift="/scratch/tmp/${t}.${q}.lift"
set tResult="/scratch/tmp/${t}.${q}.psl"
echo $ctgStart $t $ctgSize $c $chrSize > ${tmpLift}

blat /scratch/data/hg19/hg19.2bit:${t} ../../splitEnds/${q}.fa stdout \
    | liftUp -type=.psl ${tResult} ${tmpLift} error stdin
pslReps  -nearTop=0.01 -minCover=0.70 -minAli=0.85 -noIntrons ${tResult} \
                        ${result} /dev/null
/bin/rm -f ${tmpLift} ${tResult}
'_EOF_'
    # << happy emacs
    chmod +x runOne

    ssh swarm
    cd /hive/data/genomes/hg19/bed/fosEndPairs/mapEnds/run4M
    gensub2 hg19.list fosEnds.list template jobList
    #	340,470 jobs
    para create jobList
    para try
    para check ... push ... etc
# Completed: 340470 of 340470 jobs
# CPU time in finished jobs:  306516290s 5108604.83m 85143.41h 3547.64d  9.720 y
# IO & Wait Time:              22550855s  375847.59m  6264.13h  261.01d  0.715 y
# Average job time:                 967s      16.11m     0.27h    0.01d
# Longest finished job:           18257s     304.28m     5.07h    0.21d
# Submission to last job:        572370s    9539.50m   158.99h    6.62d

    cd /hive/data/genomes/hg19/bed/fosEndPairs/mapEnds/run4M
    time pslSort dirs raw.psl /scratch/tmp psl/ch* > sort.log 2>&1 &
    # 340470 files in 873 dirs
    # Got 340470 files 583 files per mid file
    #	real    291m28.005s
# -rw-rw-r--   1 11527396213 Jul 13 14:53 raw.psl


    # and now that all those individual results are together, filter all
    # of them
    cd /hive/data/genomes/hg19/bed/fosEndPairs/mapEnds
    time pslReps  -nearTop=0.01 -minCover=0.70 -minAli=0.85 -noIntrons \
	run4M/raw.psl hg19.fosEnds.psl /dev/null
    #	Processed 64042919 alignments
    #	real    11m9.402s
    #	-rw-rw-r-- 1   299504257 Jul 13 15:18 hg19.fosEnds.psl

XXX - running - Wed Jul 13 15:08:08 PDT 2011

    # FYI: to see all minimum covers:
grep "^[0-9]" raw.psl \
	| awk '{printf "%.2f\n", 100*($1+$3)/$11}' | ave stdin
# Q1 7.320000
# median 8.040000
# Q3 11.690000
# average 13.519178
# min 0.980000
# max 100.000000
# count 627281022
# total 8480323865.619440
# standard deviation 13.540966

    cd /hive/data/genomes/hg19/bed/fosEndPairs
    time /cluster/home/hiram/bin/x86_64/pslPairs \
    -tInsert=5000 -minId=0.94 -noBin -min=30000 -max=50000 -slop -short \
	-long -orphan -mismatch -verbose mapEnds/hg19.fosEnds.psl \
	/hive/data/outside/fosends.4/endPairs.txt all_fosends hg19.fosEnds
    #	real    0m10.042s
    # filter for score over 300
    awk '$5 >= 300' hg19.fosEnds.pairs | sort -k1,1 -k2,2n \
	> hg19.fosEndPairs.bed
    wc -l hg19.fosEnds.pairs hg19.fosEndPairs.bed
    #	230791 hg19.fosEnds.pairs
    #	230583 hg19.fosEndPairs.bed

    awk '$5 >= 300' hg19.fosEnds.slop hg19.fosEnds.short hg19.fosEnds.long \
	hg19.fosEnds.mismatch hg19.fosEnds.orphan | sort -k1,1 -k2,2n \
	> hg19.fosEndPairsBad.bed

    # for all names in the hg19.fosEndPairs.bed and hg19.fosEndPairsBad.bed
    # files, extract those names from the mapEnds/hg19.fosEnds.psl file
    # to construct a psl file to load up to represent all ends
    awk '{print $11}' hg19.fosEndPairs.bed hg19.fosEndPairsBad.bed \
	| tr '[,]' '[\n]' | sort -u > hg19.allEnds.names
    # the '\t' here actually needs to be the literal: Ctrl-v i
    #	to get a "real" tab there as the character
    headRest 5 mapEnds/hg19.fosEnds.psl | sort -k10 \
	| join -t '\t' -1 10 -2 1 \
-o 1.1,1.2,1.3,1.4,1.5,1.6,1.7,1.8,1.9,1.10,1.11,1.12,1.13,1.14,1.15,1.16,1.17,1.18,1.19,1.20,1.21 - hg19.allEnds.names | sort -k 14,14 -k 15,15n \
	> hg19.fosEnds.load.psl

    sed -e "s/fosEndPairs/hg19FosEndPairs/" \
	$HOME/kent/src/hg/lib/fosEndPairs.sql > hg19FosEndPairs.sql
    sed -e "s/all_fosends/hg19AllFosEnds/" hg19.fosEndPairs.bed \
	| sort -u | hgLoadBed -notItemRgb hg19 hg19FosEndPairs stdin \
	    -sqlTable=hg19FosEndPairs.sql 
    # Loaded 229708 elements
    # why so few compared to before:
    hgsql -e "select count(*) from fosEndPairs" hg19
    #	384442
    wc -l /hive/data/genomes/hg17/bed/fosends/fosEndPairs.bed
    #	384558 /hive/data/genomes/hg17/bed/fosends/fosEndPairs.bed
    # looking at one of those pairs that did not map:
    # G248P800001RA7 G248P800001FA7
    # compared to /hive/data/outside/fosends.2/fosEnds.fa
    # and /hive/data/outside/fosends.3/fosEnds.fa
    # the end sequences in those old files is shorter than the new ones
    # those shorter sequences more easily pass the minCover filter

    # note - this track isn't pushed to RR, just used for assembly QA
    sed -e "s/fosEndPairsBad/hg19FosEndPairsBad/" \
                 ~/kent/src/hg/lib/fosEndPairsBad.sql > hg19FosEndPairsBad.sql

    sed -e "s/all_fosends/hg19AllFosEnds/" hg19.fosEndPairsBad.bed \
	| sort -u | hgLoadBed -notItemRgb hg19 hg19FosEndPairsBad \
	    hg19.fosEndPairsBad.bed stdin -sqlTable=hg19FosEndPairsBad.sql
    #	Loaded 198665 elements of size 11
    # why do we have so many more ?
    wc -l /hive/data/genomes/hg17/bed/fosends/fosEndPairsBad.bed
    #	30830 /hive/data/genomes/hg17/bed/fosends/fosEndPairsBad.bed

    time hgLoadPsl hg19 -table=hg19AllFosEnds hg19.fosEnds.load.psl
    #	load of hg19AllFosEnds did not go as planned: 1160639 record(s),
    #	0 row(s) skipped, 229 warning(s) loading psl.tab
    #	real    0m35.096s
    #	with some warnings such as:
# Warning 1264 Out of range value adjusted for column 'qBaseInsert' at row 3053
# Warning 1264 Out of range value adjusted for column 'qBaseInsert' at row 10569
# Warning 1264 Out of range value adjusted for column 'qBaseInsert' at row 13642
# ...

# the sequences are already loaded from the hg18 lift:
    grep "^>" /gbdb/hg19/fosends/fosEnds.fromHg18.fa | sed -e 's/>//' \
	| sort > hg18.fosEnds.names
    # but they are not the same names:
    grep "^>" mapEnds/fosends.4.fa | sed -e 's/>//' \
	| sort > hg19.fosEnds.names
    sed -e 's/_T.//' hg19.fosEnds.names | sort -u > hg19.unique.fosEnd.names
    wc -l hg18.fosEnds.names hg19.unique.fosEnd.names
    #	1087670 hg18.fosEnds.names
    #	1567737 hg19.unique.fosEnd.names
    comm -12 hg18.fosEnds.names hg19.unique.fosEnd.names | wc -l
    #	1087670

    grep "^>" /hive/data/outside/fosends.3/fosEnds.fa \
	| sed -e 's/>//' | sort > fosends.3.names
    zcat /hive/data/outside/fosends.4/Hs*.dr.mfa.gz | grep "^>" \
	| awk '{print $2}' | sort > fosends.4.names

    zcat /hive/data/outside/fosends.4/Hs.WGS*.dr.mfa.gz \
       | sed -e "s/^>gnl.* />/; s/\.T/_T/" > fosEnds.4.fa

    mkdir /gbdb/hg19/fosends
    ln -s /hive/data/genomes/hg19/bed/fosEndPairs/mapEnds/fosends.4.fa \
    ln -s /hive/data/genomes/hg19/bed/fosEndPairs/fosEnds.4.fa \
	/gbdb/hg19/fosends/hg19FosEndPairs.fa
    time hgLoadSeq hg19 /gbdb/hg19/fosends/hg19FosEndPairs.fa
    #	1615492 sequences
    #	real    1m26.084s

##############################################################################
# hg18 <-> hg19 difference tracks (DONE - 2010-09-03 - Hiram)
    # this is documented in hg18.txt, same heading as above
    mkdir /hive/data/genomes/hg18/bed/liftOverHg19
    cd /hive/data/genomes/hg18/bed/liftOverHg19

#############################################################################
# HGDP GEOGRAPHIC SNP MAPS (DONE 9/15/10 angie)
    # Project data downloaded and parsed in /hive/data/outside/hgdpGeo,
    # see makeDb/doc/hgdpGeo.txt.
    mkdir /hive/data/genomes/hg19/bed/hgdpGeo
    cd /hive/data/genomes/hg19/bed/hgdpGeo
    # Make an rsId-sorted snp coords file for joining with the hgdpGeo data.
    grep -Fwf /hive/data/outside/hgdpGeo/rsIDs.lst \
      ../snp131/snp131.bed \
    | awk 'BEGIN{OFS="\t";} {print $4, $1, $2, $3, $8;}' \
    | sort > snp131CoordsAndRef.txt
    # How many distinct SNPs in there?  (compare to 657000 from HGDP):
    cut -f 1 snp131CoordsAndRef.txt | uniq | wc -l
#656332
    # Join files to make a track table -- well, first we'll need to 
    # normalize alleles to the + strand:
    join -e ERROR -t'	' -o 1.2,1.3,1.4,1.1,2.2,2.3,2.4,1.5 \
      snp131CoordsAndRef.txt /hive/data/outside/hgdpGeo/hgdpGeoCoordless.txt \
    | sed -re 's/([AGTC])\*/\1/' \
    | sort -k1,1 -k2n,2n \
      > hgdpGeo.fixme
    wc -l hgdpGeo.fixme
#667392 hgdpGeo.fixme
    # Use the snp131 reference allele to detect when we need to rev-comp
    # the alleles to match the + strand.  Also, throw out SNPs for which 
    # the ref allele is multi-base -- it's questionable whether we're giving
    # the right coords (some funny things happen with dbSNP's clustering...):
    cat > fixAlleles.pl <<'_EOF_'
#!/usr/bin/env perl
use warnings;
use strict;
my %rc = ('A' => 'T', 'C' => 'G', 'G' => 'C', 'T' => 'A');
while (<>) {
  chomp;  my ($c, $s, $e, $rs, $ancAl, $derAl, $freqs, $ref) = split;
  next unless ($ref =~ /^[ACGT]$/);
  if ($ancAl ne $ref && $derAl ne $ref) {
    $ancAl = $rc{$ancAl};
    $derAl = $rc{$derAl};
  }
  print join("\t", $c, $s, $e, $rs, $ancAl, $derAl, $freqs) . "\n";
}
'_EOF_'
    # << emacs
    chmod a+x fixAlleles.pl
    ./fixAlleles.pl hgdpGeo.fixme > hgdpGeo.tab
    wc -l hgdpGeo.tab
#667349 hgdpGeo.tab
    hgLoadBed hg19 hgdpGeo hgdpGeo.tab \
      -sqlTable=$HOME/kent/src/hg/lib/hgdpGeo.sql
#Loaded 667349 elements of size 7


###########################################################################
# RECOMBINATION RATES (DONE 2010-08-26 - Chin)

# The STS MArkers track must be completed prior to creating this track

    ssh kkstore02
    cd /hive/data/genomes/hg19/bed
    mkdir -p recombRate
    cd recombRate

# Copy other necessary files here (in future, can take from previous
# version)
# NOTE: these are stable, and could be saved in a permanent spot

    cp -p /projects/hg2/booch/psl/info/decode_all .
    cp -p /projects/hg2/booch/psl/info/marshfield_all .
    cp -p /projects/hg2/booch/psl/info/genethon_all .

# Compared these 3 files with the 3 files of hg17, they are identical.

# Determine maximum concordant set of markers for each of the maps
    /cluster/bin/scripts/assignGPsts -full -maxcon \
        /hive/data/outside/ncbi/sts.11/stsAlias.bed \
        /hive/data/genomes/hg19/bed/sts/stsMarkers_pos.rdb \
        decode_all > decode.marker.rdb
    /cluster/bin/scripts/assignGPsts -full -maxcon \
        /hive/data/outside/ncbi/sts.11/stsAlias.bed \
        /hive/data/genomes/hg19/bed/sts/stsMarkers_pos.rdb \
        marshfield_all > marshfield.marker.rdb
    /cluster/bin/scripts/assignGPsts -full -maxcon \
        /hive/data/outside/ncbi/sts.11/stsAlias.bed \
        /hive/data/genomes/hg19/bed/sts/stsMarkers_pos.rdb \
        genethon_all > genethon.marker.rdb

# Determine the rates for each of the maps
    /cluster/bin/scripts/markers_to_recomb_rate.terry.pl decode.marker.rdb \
            hive/data/genomes/hg19/chrom.sizes 1000000 1000000 \
                > decode_1mb_slide_1mb
    /cluster/bin/scripts/markers_to_recomb_rate.terry.pl genethon.marker.rdb \
            /hive/data/genomes/hg19/chrom.sizes 1000000 1000000 \
                >  genethon_1mb_slide_1mb
# got many "... out of genetic distance order. DISCARDING" messages.

    /cluster/bin/scripts/markers_to_recomb_rate.terry.pl marshfield.marker.rdb \
            /hive/data/genomes/hg19/chrom.sizes 1000000 1000000 \
              > marshfield_1mb_slide_1mb
# Got many "... out of genetic distance order. DISCARDING" messages.

# Convert files to proper format
# which requires the "inserts" file
    # get the size of true chrom:
    cd  /hive/data/genomes/hg19/bed/recombRate  
    cat /hive/data/genomes/hg19/chrom.sizes | awk '$1 !~/_/ && !/^chrM/ \
    {print $1, $2}'  > chr.sizes

    # order contigs on each chrom:
    mkdir /hive/data/genomes/hg19/bed/recombRate/orderedCtg

    cat << '_EOF_' > orderCtg.pl
#!/usr/bin/perl
# create ordered contig lists for each chrom
my $db = hg19;
my $ordDir = "/hive/data/genomes/hg19/bed/recombRate/orderedCtg";
@chroms = (1..22, X, Y);

foreach $chr (@chroms) 
{
my $chrName = "chr$chr";
my $chrFile = "chr$chr.ol";
my $ordFile = "$ordDir/$chrFile";
my $sqlstmt="hgsql $db -s -e \'SELECT c.chromStart, c.contig, c.size, c.chrom, i.size FROM ctgPos c, chromInfo i where c.chrom = \"$chrName\" AND c.chrom=i.chrom order by chromStart ASC\' ";
system(" $sqlstmt  > $ordFile");
}
'_EOF_'
    # << happy emacs


    chmod +x orderCtg.pl
    ./orderCtg.pl

    # create the inserts file 
    cat << '_EOF_' > createInserts.pl
#!/usr/local/bin/perl
# FILE: createInserts
# Author: Terry Furey
# Date: 7/13/2004
# Modified by Chin 2010-10-14
# Description: Create inserts file used in creation of some tracks.
# 	This used to be created when lift files being created

$dir = "/hive/data/genomes/hg19";

@chroms = (1..22, X, Y);

print "#Chrom\tBefore_Contig\tAfter_Contig\tNum_bases\tType\n";
$i = 0;
# Look at certain gaps in chroms
foreach $chr (@chroms) {
    open(AGP, "$dir/$chr/chr$chr.agp");
    while ($line = <AGP>) {
	chomp($line);
	@fields = split("\t", $line);
	$lastctg = "";
	$laststart = 1;
	# Want centromeres, large heterochromatin, short_arms, and large gaps
	if (($fields[6] eq "centromere") ||
	    (($fields[6] eq "heterochromatin") && ($fields[2] - $fields[1] > 1000000)) ||
	    ($fields[6] eq "short_arm") ||
	    (($fields[2] - $fields[1]) > 1000000)) {
	    # Record info about gap
	    $chr[$i] = $fields[0];
	    $start[$i] = $fields[1];
	    $end[$i] = $fields[2];
	    $size[$i] = $end[$i] - $start[$i] + 1;
	    $type[$i] = $fields[6];
	    # Find contigs surrounding gap
            open(ORDCTG, "$dir/bed/recombRate/orderedCtg/chr$chr.ol") || die("Could not open $dir/bed/recombRate/orderedCtg/chr$chr.ol\n") ;
	    # short_arm gaps have no previous contig
	    if ($type[$i] eq "short_arm") {
		$ctg1[$i] = "-";
		$start[$i] = 1;
		# Nest record has next contig
		$line1 = <ORDCTG>;
		@fields1 = split("\t",$line1);
		$ctg2[$i] = $fields1[1];
		# Reset end and recalculate size
		$end[$i] = $fields1[0];
		$size[$i] = $end[$i] - $start[$i] + 1;
	    # non-short_arm gaps
	    } else {
		# Find gap immediately before gap
		while ($line1 = <ORDCTG>) {
		    chomp($line1);
		    @fields1 = split("\t", $line1);
		    # This contig ends where gap begins
		    if (($fields1[0] + $fields1[2] + 1) == $start[$i]) {
			$ctg1[$i] = $fields1[1]; 
			# Succeeding contig is in next record
			if ($line1 = <ORDCTG>) {
			    @fields1 = split("\t", $line1);
			    $ctg2[$i] = $fields1[1];
			    # Reset end coordinate and re-calculate size
			    $end[$i] = $fields1[0];
			    $size[$i] = $end[$i] - $start[$i] + 1;
			} else {
			    $ctg2[$i] = "-";
			}
		    # Keep track of possible previous contigs and starts
		    } elsif (($fields1[0] + $fields1[2] + 1) < $start[$i]) {
			$lastctg = $fields1[1];
			$laststart = $fields1[0] + $fields1[2] + 1;
		    # Another gap separated this gap from previous contig,
		    # so didn't find match and passed it up
		    } elsif (($ctg1[$i] eq "") && ($laststart > 1)) {
			# Set start coordinate to end of last contig
			$ctg1[$i] = $lastctg;
			$start[$i] = $laststart;
			# Reset end coordinate and re-calculate size
			@fields1 = split("\t", $line1);
			$ctg2[$i] = $fields1[1];
			$end[$i] = $fields1[0];
			$size[$i] = $end[$i] - $start[$i] + 1;
		    }
		}
	    }
	    close(ORDCTG); 
	    $i++;
	}
    }
    close(AGP);	
}
$num = $i;

# Print them out
for ($i = 0; $i < $num; $i++) {
    # Don't print out duplicate lines for same gap (i.e. centromere and heterochromatin
    # in same gap
    if (($chr[$i] ne $chr[$i-1]) || (($ctg1[$i] ne $ctg1[$i-1]) && ($start[$i] > $end[$i-1]))) {
	# Large gaps must be heterochromatin
	if ($size[$i] > 3100000) {
	    $type[$i] = "heterochromatin";
	}
	# gaps at beginning must be short_arm
	if ($start[$i] <= 1) {
	    $type[$i] = "short_arm";
	}
	# Only want large heterochromatic regions, not telomeres
	if (($type[$i] ne "heterochromatin") || ($size[$i] > 1000000)) { 
	    print "$chr[$i]\t$ctg1[$i]\t$ctg2[$i]\t$size[$i]\t$type[$i]\n";
	}
    }
}
'_EOF_'
    # << happy emacs
 
    chmod +x createInserts.pl
    ./createInserts.pl > inserts

# Convert files to proper format
    cat << '_EOF_' > convRecombRate.pl
#!/usr/local/bin/perl
# File: convRecombRate
# Author: Terry Furey
# Date: 9/2002
# Modified by Chin 2010-10-14
# Project: Human
# Description: Changes recomb rates in large gaps to nan
# Usage message
if ($#ARGV != 3) {
  print stderr "USAGE: createSetBands <recomb file> <insert file> <ctg dir> <window (kb)>\n";
  exit(1);
}

# Read parameters
$rrfile = shift(@ARGV);
$insfile = shift(@ARGV);
$ctgdir = shift(@ARGV);
$basewind = shift(@ARGV);
$window = $basewind * 1000;

# Determine the golden path positions for each of the inserts
open(INSERT, "<$insfile") || die("Could not open $insfile\n");
$line = <INSERT>; #header
while ($line = <INSERT>) {
  next if (substr($line, 0, 1) eq "#"); 
  chomp($line);
  ($chr, $first, $second, $length, $type) = split(' ',$line);

  $thischr = substr($chr,3);
  open(ORDCTG, "<$ctgdir/orderedCtg/chr$thischr.ol") || die("Could not open $ctgdir/orderedCtg/chr$thischr.ol\n");

  # Determine first window for the insert
  if ($first eq "-") {
    $begin = 0;
    $end = int(($length)/$window)*$window;
  } else {
    $found = 0;
    print stderr "Finding $chr $first\n";
    while(!$found) {
      $line = <ORDCTG>;
      chomp($line);
      ($ctgstart, $ctg, $ctglen, $ctgchr, $chrlen) = split(' ',$line);
      if ($ctg eq $first) {
	$begin = int(($ctgstart + $ctglen)/$window)*$window;
	$end = int(($ctgstart + $ctglen + $length)/$window)*$window + $window;
	$found = 1;
      }
    }
  }
  close(ORDCTG);
  print stderr "$chr $begin - $end $type\n";
  for ($i = $begin; $i < $end; $i=$i+$window) {
    $gap{$chr}{$i} = 1;
  }
}
close(INSERT);

# Now, match up with PCT
open(RR, "<$rrfile") || die("Could not open $rrfile\n");
while ($line = <RR>) {
  chomp($line);
  ($chr, $start, $end, $ave, $female, $male) = split("\t", $line);
  if ($gap{$chr}{$start}) {
    print "$chr\t$start\t$end\tNaN\tNaN\tNaN\n";
  } else {
    print "$line\n";
  }
}
close(RR);
'_EOF_'
    # << happy emacs
    
    chmod +x convRecombRate.pl


./convRecombRate.pl  decode_1mb_slide_1mb inserts \
        . 1000 > decode_1mb_slide_1mb_conv

./convRecombRate.pl  marshfield_1mb_slide_1mb inserts \
        . 1000 > marshfield_1mb_slide_1mb_conv

./convRecombRate.pl  genethon_1mb_slide_1mb inserts \
        . 1000 > genethon_1mb_slide_1mb_conv

# Create bed file and load
    /cluster/bin/scripts/createRRbed decode_1mb_slide_1mb_conv \
        marshfield_1mb_slide_1mb_conv genethon_1mb_slide_1mb_conv \
                > recombRate.bed

    ssh hgwdev
    # reuse the recombRate.sql and recombRate.as from before
    hgLoadBed -noBin -tab \
        -sqlTable=$HOME/kent/src/hg/lib/recombRate.sql \
            hg19 recombRate recombRate.bed


############################################################################
# GENE BOUNDS (RNACLUSTER) (DONE 2010-10-20 - Chin)
# Create rnaCluster table (depends on {est,mrna}OrientInfo)

cd /hive/data/genomes/hg19/bed
mkdir rnaCluster
cd rnaCluster/
mkdir chrom

# Create a list of accessions that come from RAGE libraries and need to
# be excluded.
~/kent/src/hg/geneBounds/clusterRna/generateRageAccList.csh hg19 rage.libs

    cat << '_EOF_' > runClusterRna
#!/bin/csh -fe
foreach f (/hive/data/genomes/hg19/nib/chr*.nib) 
    set c = $f:t:r 
    set out = chrom/$c.bed 
    # Exclude accesions in the RAGE file 
    echo clusterRna -mrnaExclude=hg19.rage.libs hg19 /dev/null $out  -chrom=$c 
    clusterRna -mrnaExclude=hg19.rage.libs -verbose=2 \
    -rna=all_mrna -est=intronEst \ 
    hg19 /dev/null $out -chrom=$c 
 end
'_EOF_'
    # << happy emacs
    chmod +x ./runClusterRna

    ./runClusterRna

hgLoadBed hg19 rnaCluster chrom/*.bed

#############################################################################
# dbSNP BUILD 132 (SNP132) BASIC TABLES (DONE 11/17/10 angie)
# Initially loaded 11/15/10; I found some missing or improperly located rs_fasta
# sequences and dbSNP re-dumped rs_fasta, so I rebuilt everything dependent on
# rs_fasta 11/17.
    # Set up build directory
    mkdir -p /hive/data/outside/dbSNP/132/{human,shared}

    # Get field encodings -- if there are changes or additions to the
    # encoding of the corresponding fields, you might need to update
    # snpNcbiToUcsc, hgTracks, hgc and hgTrackUi (see also
    # hg/lib/snp125Ui.c).
    cd /hive/data/outside/dbSNP/132/shared
    alias wg wget --timestamping
    set ftpShared = ftp://ftp.ncbi.nih.gov/snp/database/shared_data
    wg $ftpShared/LocTypeCode.bcp.gz
    wg $ftpShared/SnpClassCode.bcp.gz
    wg $ftpShared/SnpFunctionCode.bcp.gz
    wg $ftpShared/SnpValidationCode.bcp.gz
    wg $ftpShared/Allele.bcp.gz

    ########################## DOWNLOAD #############################
    cd /hive/data/outside/dbSNP/132/human
    mkdir data schema rs_fasta
    # Get data from NCBI (anonymous FTP)
    set ftpSnpDb = ftp://ftp.ncbi.nih.gov/snp/organisms/human_9606/database
    wg ftp://ftp.ncbi.nih.gov/snp/00readme.txt
    cd /hive/data/outside/dbSNP/132/human/data
    # ContigLoc table has coords, orientation, loc_type, and refNCBI allele
    wg $ftpSnpDb/organism_data/b132_SNPContigLoc_37_1.bcp.gz
    # ContigLocusId table has functional annotations
    wg $ftpSnpDb/organism_data/b132_SNPContigLocusId_37_1.bcp.gz
    wg $ftpSnpDb/organism_data/b132_ContigInfo_37_1.bcp.gz
    # MapInfo has alignment weights
    wg $ftpSnpDb/organism_data/b132_SNPMapInfo_37_1.bcp.gz
    # SNP has univar_id, validation status and heterozygosity
    wg $ftpSnpDb/organism_data/SNP.bcp.gz
    # New info as of 132: allele freq, 'clinical' bit, SNP submitter handles
    wg $ftpSnpDb/organism_data/SNPAlleleFreq.bcp.gz
    wg $ftpSnpDb/organism_data/SNP_bitfield.bcp.gz
    wg $ftpSnpDb/organism_data/Batch.bcp.gz
    wg $ftpSnpDb/organism_data/SubSNP.bcp.gz
    wg $ftpSnpDb/organism_data/SNPSubSNPLink.bcp.gz

    # Get schema
    cd /hive/data/outside/dbSNP/132/human/schema
    wg $ftpSnpDb/organism_schema/human_9606_table.sql.gz
    wg $ftpSnpDb/shared_schema/dbSNP_main_table.sql.gz

    # Get fasta files
    # using headers of fasta files for molType, class, observed
    cd /hive/data/outside/dbSNP/132/human/rs_fasta
    # Re-downloaded 11/17/10 (dbSNP re-dump):
    wg ftp://ftp.ncbi.nih.gov/snp/organisms/human_9606/rs_fasta/\*.gz

    ########################## LOAD NCBI TABLES #############################
    # Simplify names of data files -- strip version & extras to get
    # local canonical table names.
    cd /hive/data/outside/dbSNP/132/human/data
    foreach f (*.bcp.gz)
      set new = `echo $f \
                 | sed -e 's/^b132_SNP//; s/^b132_//; s/_37_1//; s/.bcp//;'`
      mv $f $new
      echo $new
    end

    cd /hive/data/outside/dbSNP/132/human/schema
    zcat human_9606_table.sql.gz \
    | perl -we '$/ = "\nGO\n\n\n"; \
        while (<>) { \
          next unless /^CREATE TABLE \[(b132_(SNP)?)?(ContigInfo|ContigLoc|ContigLocusId|MapInfo|SNP|SNPAlleleFreq)(_37_1)?\]/; \
          s/b132_(SNP)?//; s/_37_1//; \
          s/[\[\]]//g;  s/GO\n\n/;/;  s/smalldatetime/datetime/g; \
          s/ON PRIMARY//g;  s/COLLATE//g;  s/Latin1_General_BIN//g; \
          s/IDENTITY (1, 1) NOT NULL /NOT NULL AUTO_INCREMENT, PRIMARY KEY (id)/g; \
          s/nvarchar/varchar/g;  s/set quoted/--set quoted/g; \
          s/(image|varchar\s+\(\d+\))/BLOB/g; \
          print; \
        }' \
      > table.sql
    zcat dbSNP_main_table.sql.gz \
    | sed -re 's/\r//g;' \
    | perl -we '$/ = "\nGO\n\n\n";  \
        while (<>) { \
          next unless /^CREATE TABLE \[Allele\]/; \
          s/[\[\]]//g;  s/GO\n\n\n/;\n/;  s/smalldatetime/datetime/g; \
          print; \
        }' \
      >> table.sql

    # load on hgwdev
    hgsql -e 'create database hg19snp132'
    cd /hive/data/outside/dbSNP/132/human/schema
    hgsql hg19snp132 < table.sql
    cd ../data

    # Avoid wasting space by excluding mappings to non-reference contigs (ContigInfo.group_label):
    zcat ContigInfo.gz | cut -f 12 | uniq | sort -u
#CRA_TCAGchr7v2
#Celera
#GRCh37
#HuRef
    foreach t (ContigInfo MapInfo ContigLocusId)
      zcat $t.gz \
      | egrep -vw '(Celera|HuRef|CRA_TCAGchr7v2)' \
      | perl -wpe 's/(\d\d:\d\d:\d\d)\.0/$1/g;' \
      | hgLoadSqlTab -oldTable hg19snp132 $t placeholder stdin
    end

    # Compare contig list between liftContigs.lft and reference contigs in ContigInfo.
    cut -f 2 /hive/data/genomes/hg19/jkStuff/liftContigs.lft | sort > /data/tmp/1
    # (HuRef, Celera, CRA_TCAGchr7v2 grepped out above)
    hgsql hg19snp132 -N -B -e 'select contig_acc from ContigInfo;' | sort > /data/tmp/2
    diff /data/tmp/[12]
#1c1
#< NC_001807
#---
#> NC_012920
    # darn mitochondria version oops.  Use NC_012920ToChrM.over.chain generated for snp131.

    # Make sure there are no orient != 0 contigs among those selected.
    hgsql hg19snp132 -NBe \
      'select count(*) from ContigInfo where orient != 0;'
#0

    # ContigLoc is huge, and we want just the reference contig mappings.
    # Keep lines only if they have a word match to some reference contig ID.
    # That probably will allow some false positives from coord matches,
    # but we can clean those up afterward.
    zcat ContigInfo.gz | g -w GRCh37 | cut -f 1 | sort -n > GRCh37ContigInfo.ctg_id.txt
    wc -l GRCh37ContigInfo.ctg_id.txt 
#259 GRCh37ContigInfo.ctg_id.txt
    zcat ContigLoc.gz \
    | grep -Fwf GRCh37ContigInfo.ctg_id.txt \
    | perl -wpe 's/(\d\d:\d\d:\d\d)\.0/$1/g;' \
    | hgLoadSqlTab -oldTable hg19snp132 ContigLoc placeholder stdin
#Warning 1366 Incorrect integer value: '' for column 'loc_sts_uid' at row 1
#Warning 1366 Incorrect integer value: '' for column 'loc_sts_uid' at row 2
#Warning 1366 Incorrect integer value: '' for column 'loc_sts_uid' at row 3
#Warning 1366 Incorrect integer value: '' for column 'loc_sts_uid' at row 4
#...
#load of ContigLoc did not go as planned: 34610577 record(s), 0 row(s) skipped, 4143419 warning(s) loading /dev/stdin
    # Get rid of those false positives (crazy slow, create indices first next time):
    hgsql hg19snp132 -e 'create table ContigLocFix select cl.* from ContigLoc as cl, ContigInfo as ci where cl.ctg_id = ci.ctg_id;'
    hgsql hg19snp132 -e 'drop table ContigLoc; \
                         rename table ContigLocFix to ContigLoc;'

    zcat SNP.gz \
    | perl -wpe 's/(\d\d:\d\d:\d\d)\.0/$1/g;' \
    | hgLoadSqlTab -oldTable hg19snp132 SNP placeholder stdin
#Warning 1366 Incorrect integer value: '' for column 'CpG_code' at row 1
#Warning 1366 Incorrect integer value: '' for column 'map_property' at row 1
#Warning 1366 Incorrect integer value: '' for column 'CpG_code' at row 2
#Warning 1366 Incorrect integer value: '' for column 'map_property' at row 2
#Warning 1265 Data truncated for column 'avg_heterozygosity' at row 3
#Warning 1265 Data truncated for column 'het_se' at row 3
#...
#load of SNP did not go as planned: 30443714 record(s), 0 row(s) skipped, 21366755 warning(s) loading /dev/stdin
    # ... no big deal.
    foreach t (ContigInfo ContigLoc ContigLocusId MapInfo SNP)
     echo -n "${t}:\t"
      hgsql -N -B hg19snp132 -e 'select count(*) from '$t
    end
#ContigInfo:     259
#ContigLoc:      34610479
#ContigLocusId:  19829016
#MapInfo:        30399417
#SNP:    	 30443714

    #################### EXTRACT INFO FROM NCBI TABLES ####################
    # Glom each SNP's function codes together and load up a new hg19Snp132 table.
    # Also extract NCBI's annotations of coding SNPs' effects on translation.
    # We extract ContigLocusId info only for reference assembly mapping.
    # Some SNP's functional annotations are for an alternate assembly, so we will
    # have no NCBI functional annotations to display for those (but our own are 
    # available).
    cd /hive/data/outside/dbSNP/132/human
    # Add indices to tables for a big join (5 or 6 minutes):
    hgsql hg19snp132 -e \
      'alter table ContigInfo add index (ctg_id); \
       alter table ContigLocusId add index (ctg_id);'
    hgsql hg19snp132 -NBe 'select snp_id, ci.contig_acc, asn_from, asn_to, mrna_acc, \
                           fxn_class, reading_frame, allele, residue, codon, cli.ctg_id \
                           from ContigLocusId as cli, ContigInfo as ci \
                           where cli.ctg_id = ci.ctg_id;' \
      > ncbiFuncAnnotations.txt
    wc -l ncbiFuncAnnotations.txt
#19828052 ncbiFuncAnnotations.txt
    # Ignore function code 8 (cds-reference, just means that some allele matches reference)
    # and glom functions for each SNP id:
    cut -f 1-4,6,11 ncbiFuncAnnotations.txt \
    | sort -u -k1n,1n -k6n,6n -k3n,3n -k5n,5n \
    | perl -we 'while (<>) { chomp; \
                  ($id, undef, $s, $e, $f, $c) = split; \
                  if (defined $prevId && $id == $prevId && $c == $prevC && $s == $prevS) { \
                    $prevFunc .= "$f," unless ($f == 8); \
                  } else { \
                    if (defined $prevId) { \
                      print "$prevId\t$prevC\t$prevS\t$prevE\t$prevFunc\n" if ($prevFunc); \
                    } \
                    $prevFunc = ($f == 8) ? "" : "$f,"; \
                  } \
                  ($prevId, $prevC, $prevS, $prevE) = ($id, $c, $s, $e); \
                } \
                print "$prevId\t$prevC\t$prevS\t$prevE\t$prevFunc\n" if ($prevFunc);' \
      > ucscFunc.txt
    wc -l ucscFunc.txt
#11673179 ucscFunc.txt
    cat > ucscFunc.sql <<EOF
CREATE TABLE ucscFunc (
        snp_id int NOT NULL ,
        ctg_id int(10) NOT NULL ,
        asn_from int(10) NOT NULL ,
        asn_to int(10) NOT NULL ,
        fxn_class varchar(255) NOT NULL ,
        INDEX snp_id (snp_id),
        INDEX ctg_id (ctg_id)
);
EOF
    hgLoadSqlTab hg19snp132 ucscFunc{,.sql,.txt}
    # ucscFunc coords are NCBI's 0-based, fully-closed, 2-base-wide insertions.
    # We need to leave the coords alone here so ucscFunc can be joined below.
    # Make a list of SNPs with func anno's that are insertion SNPs, so we can use 
    # the list to determine what type of coord fix to apply to each annotation
    # when making snp130CodingDbSnp below.
    hgsql hg19snp132 -NBe \
      'select ci.contig_acc, cl.asn_from, cl.asn_to, uf.snp_id \
       from ucscFunc as uf, ContigLoc as cl, ContigInfo as ci \
       where uf.snp_id = cl.snp_id and \
             uf.ctg_id = cl.ctg_id and uf.asn_from = cl.asn_from and uf.asn_to = cl.asn_to and \
             cl.loc_type = 3 and \
             cl.ctg_id = ci.ctg_id' \
      > ncbiFuncInsertions.ctg.bed
    wc -l ncbiFuncInsertions.ctg.bed
#1099530 ncbiFuncInsertions.ctg.bed

    # Extract observed allele, molType and snp class from FASTA headers gnl
    mkdir rs_fasta/rejects
    mv rs_fasta/rs_ch{AltOnly,NotOn}.fas.gz rs_fasta/rejects/
    zcat rs_fasta/rs_ch*.fas.gz \
    | grep '^>gnl' \
    | perl -wpe 's/^\S+rs(\d+) .*mol="(\w+)"\|class=(\d+)\|alleles="([^"]+)"\|build.*/$1\t$4\t$2\t$3/ || die "Parse error line $.:\n$_\n\t";' \
    | sort -nu \
      > ucscGnl.txt
#547.545u 92.641s 6:08.04 173.9% 0+0k 0+0io 0pf+0w
    wc -l ucscGnl.txt
#30144822 ucscGnl.txt
# weird -- it shrunk from the original 30152349
    cut -f 1 ucscGnl.txt | uniq | wc -l
#30144822
    cat > ucscGnl.sql <<EOF
CREATE TABLE ucscGnl (
        snp_id int NOT NULL ,
        observed varchar(255) NOT NULL,
        molType varchar(255) NOT NULL,
        class varchar(255) NULL ,
        INDEX snp_id (snp_id)
);
EOF
    hgLoadSqlTab hg19snp132 ucscGnl{,.sql,.txt}

    # Add indices to tables for a big join:
    hgsql hg19snp132 -e \
      'alter table ContigLoc  add index (ctg_id); \
       alter table SNP        add index (snp_id); \
       alter table MapInfo    add index (snp_id);'

#TODO: add SNPAlleleFreq to this query and snpNcbiToUcsc (bitfield/"clinical" too)

    # Big leftie join to bring together all of the columns that we want in snp132,
    # using all of the available joining info:
    hgsql hg19snp132 -NBe \
     'SELECT ci.contig_acc, cl.asn_from, cl.asn_to, cl.snp_id, cl.orientation, cl.allele, \
             ug.observed, ug.molType, ug.class, \
             s.validation_status, s.avg_heterozygosity, s.het_se, \
             uf.fxn_class, cl.loc_type, mi.weight, cl.phys_pos_from \
      FROM \
      ((((ContigLoc as cl JOIN ContigInfo as ci \
               ON cl.ctg_id = ci.ctg_id) \
          LEFT JOIN MapInfo as mi ON mi.snp_id = cl.snp_id and mi.assembly = ci.group_label) \
         LEFT JOIN SNP as s ON s.snp_id = cl.snp_id) \
        LEFT JOIN ucscGnl as ug ON ug.snp_id = cl.snp_id) \
       LEFT JOIN ucscFunc as uf ON uf.snp_id = cl.snp_id and uf.ctg_id = cl.ctg_id \
                                and uf.asn_from = cl.asn_from;' \
      > ucscNcbiSnp.ctg.bed
#73.036u 12.668s 25:32.42 5.5%   0+0k 0+0io 0pf+0w
    wc -l ucscNcbiSnp.ctg.bed 
#34610479 ucscNcbiSnp.ctg.bed

    # Use liftUp for everything except mito, then liftOver for mito.
    # There are some weird cases of length=1 but locType=range... in all the cases 
    # that I checked, the length really seems to be 1 so I'm not sure where they got 
    # the locType=range.  Tweak locType in those cases so we can keep those SNPs:
    grep -vw ^NC_012920 ucscNcbiSnp.ctg.bed \
    | awk -F"\t" 'BEGIN{OFS="\t";} \
           $2 == $3 && $14 == 1 {$14=2; \
                                 if (numTweaked < 10) {print $4 > "/dev/stderr";} \
                                 numTweaked++;}  {print;} \
           END{print numTweaked, "single-base, locType=range, tweaked locType" > "/dev/stderr";}' \
    | liftUp ucscNcbiSnp.bed \
      /hive/data/genomes/hg19/jkStuff/liftContigs.lft warn stdin
#TODO: examine these again, report to dbSNP:
#118203330
#118203339
#118203340
#118203367
#118203389
#118203401
#118203405
#118203425
#118203428
#118203433
#588     single-base, locType=range, tweaked locType
#217.470u 28.776s 2:54.46 141.1% 0+0k 0+0io 0pf+0w
    # For liftOver, convert 0-base fully-closed to 0-based half-open because liftOver
    # doesn't deal with 0-base items.  Fake out phys_pos_from to 0 because many coords
    # will differ, oh well.
    grep -w NC_012920 ucscNcbiSnp.ctg.bed \
    | awk -F"\t" 'BEGIN{OFS="\t";} {$3 += 1; $16 = 0; print;}' \
    | liftOver -bedPlus=3 stdin \
        /hive/data/outside/dbSNP/131/human/NC_012920ToChrM.over.chain stdout chrM.unmapped \
    | awk -F"\t" 'BEGIN{OFS="\t";} {$3 -= 1; print;}' \
    | sort -k2n,2n \
      > chrMNcbiSnp.bed
#2.827u 1.576s 0:48.15 9.1%      0+0k 0+0io 0pf+0w
#2.805u 1.392s 0:51.90 8.0%      0+0k 0+0io 7pf+0w
    cat chrM.unmapped
    # Good, got all but 3 SNPS (rs28693675, rs55749223 and rs112781979, partially deleted/deleted)
    cat chrMNcbiSnp.bed >> ucscNcbiSnp.bed
    wc -l ucscNcbiSnp.bed
#34610476 ucscNcbiSnp.bed

    # Translate NCBI's encoding into UCSC's, and perform a bunch of checks.
    cd /hive/data/outside/dbSNP/132/human/
    # Updated snpNcbiToUCSC for new MAX_SNPID (80M -> 120M), 
    # new named alleles oddball formats: CHLC.GGAA2D04, GDB:190880, SHGC-35515, =D22S272
    # new MAX_SNPSIZE (1k -> 16k)
    snpNcbiToUcsc ucscNcbiSnp.bed /hive/data/genomes/hg19/hg19.2bit snp132
#spaces stripped from observed:
#chr12   6093134 6093134 rs41402545
#count of snps with weight  0 = 69071
#count of snps with weight  1 = 29465124
#count of snps with weight  2 = 523595
#count of snps with weight  3 = 3037908
#count of snps with weight 10 = 1514756
#Skipped 976 snp mappings due to errors -- see snp132Errors.bed
#214.507u 7.542s 4:38.56 79.7%   0+0k 0+0io 0pf+0w
    head snp132Errors.bed
#chr1    11082586        11082587        rs80356737      Unexpected refNCBI "AT" for locType "between" (3) -- expected "-"
#chr1    11082586        11082587        rs80356737      rs80356737 is 1 bases long but refNCBI is different length: AT
#chr1    43392806        43392807        rs80359840      Unexpected refNCBI "CA" for locType "between" (3) -- expected "-"
#chr1    43392806        43392807        rs80359840      rs80359840 is 1 bases long but refNCBI is different length: CA
#chr1    43395420        43395421        rs80359834      Unexpected refNCBI "AC" for locType "between" (3) -- expected "-"
#chr1    43395420        43395421        rs80359834      rs80359834 is 1 bases long but refNCBI is different length: AC
#chr1    43395659        43395660        rs80359833      Unexpected refNCBI "AG" for locType "between" (3) -- expected "-"
#chr1    43395659        43395660        rs80359833      rs80359833 is 1 bases long but refNCBI is different length: AG
#chr1    43396801        43396802        rs80359831      Unexpected refNCBI "GC" for locType "between" (3) -- expected "-"
#chr1    43396801        43396802        rs80359831      rs80359831 is 1 bases long but refNCBI is different length: GC
    wc -l snp*
#  33026121 snp132.bed
#        22 snp132.sql
#       976 snp132Errors.bed
#        18 snp132ExceptionDesc.tab
#   4945948 snp132Exceptions.bed
    # 7M new snps, not a big increase in exceptions (snp131 had 4281351)

    # Make one big fasta file.
    # It's a monster: 18G!  Can we split by hashing rsId?
    zcat rs_fasta/rs_ch*.fas.gz \
    | perl -wpe 's/^>gnl\|dbSNP\|(rs\d+) .*/>$1/ || ! /^>/ || die;' \
      > snp132.fa
#611.912u 114.850s 7:40.41 157.8%        0+0k 0+0io 0pf+0w
    # Check for duplicates.
    grep ^\>rs snp132.fa | sort > /data/tmp/seqHeaders
    wc -l /data/tmp/seqHeaders
#30144822 /data/tmp/seqHeaders
    uniq /data/tmp/seqHeaders | wc -l
#30144822
    # Use hgLoadSeq to generate .tab output for sequence file offsets,
    # and keep only the columns that we need: acc and file_offset.
    # Index it and translate to snpSeq table format.
    hgLoadSeq -test placeholder snp132.fa
#30144822 sequences
#52.698u 14.570s 9:18.88 12.0%   0+0k 0+0io 0pf+0w
    cut -f 2,6 seq.tab > snp132Seq.tab
    rm seq.tab

    # Load up main track tables.
    cd /hive/data/outside/dbSNP/132/human
    hgLoadBed -tab -onServer -tmpDir=/data/tmp -allowStartEqualEnd \
      hg19 snp132 -sqlTable=snp132.sql snp132.bed
#Loaded 33026121 elements of size 17
#124.626u 10.983s 8:20.73 27.0%  0+0k 0+0io 0pf+0w
    hgLoadBed -tab -onServer -tmpDir=/data/tmp -allowStartEqualEnd \
      hg19 snp132Exceptions -sqlTable=$HOME/kent/src/hg/lib/snp125Exceptions.sql -renameSqlTable \
      snp132Exceptions.bed
#Loaded 4945948 elements of size 5
#14.816u 0.930s 0:52.42 30.0%    0+0k 0+0io 0pf+0w
    hgLoadSqlTab hg19 snp132ExceptionDesc ~/kent/src/hg/lib/snp125ExceptionDesc.sql \
      snp132ExceptionDesc.tab
    # Load up sequences.
    mkdir -p /gbdb/hg19/snp
    ln -s /hive/data/outside/dbSNP/132/human/snp132.fa /gbdb/hg19/snp/snp132.fa
    hgLoadSqlTab hg19 snp132Seq ~/kent/src/hg/lib/snpSeq.sql snp132Seq.tab

    # Put in a link where one would expect to find the track build dir...
    ln -s /hive/data/outside/dbSNP/132/human /hive/data/genomes/hg19/bed/snp132

#*** TODO: ask cluster-admin to pack the snp132 table (or whatever tables we'll push)

    # Look at the breakdown of exception categories:
    cd /hive/data/outside/dbSNP/132/human
    cut -f 5 snp132Exceptions.bed | sort | uniq -c | sort -nr
#3644435 MultipleAlignments
# 964493 ObservedMismatch
#  90035 SingleClassTriAllelic
#  77552 SingleClassZeroSpan
#  43631 ObservedTooLong
#  33650 MixedObserved
#  26701 FlankMismatchGenomeShorter
#  25574 SingleClassLongerSpan
#  12222 RefAlleleMismatch
#  11525 DuplicateObserved
#   8340 SingleClassQuadAllelic
#   4463 NamedDeletionZeroSpan
#   2052 FlankMismatchGenomeLonger
#    806 ObservedContainsIupac
#    317 NamedInsertionNonzeroSpan
#    150 FlankMismatchGenomeEqual
#      1 RefAlleleRevComp
#      1 ObservedWrongFormat
#TODO: Sent a few bug reports to dbSNP


############################################################################
# SNP132 ORTHOLOGOUS ALLELES IN CHIMP, ORANG, MACAQUE (DONE 11/22/10 angie)
    mkdir /hive/data/genomes/hg19/bed/snp132Ortho
    cd /hive/data/genomes/hg19/bed/snp132Ortho

    # Following Heather's lead in snp126orthos, filter SNPs to to keep
    # only those with class=single, length=1, chrom!~random;
    # Exclude those with exceptions MultipleAlignments,
    # SingleClassTriAllelic or SingleClassQuadAllelic.
    awk '$5 ~ /^MultipleAlignments|SingleClassTriAllelic|SingleClassQuadAllelic/ {print $4;}' \
      /hive/data/outside/dbSNP/132/human/snp132Exceptions.bed \
    | sort -u \
      > snp132ExcludeIds.txt
    awk '$3-$2 == 1 && $1 !~ /_random/ && $1 !~ /^chrUn/ && $11 == "single" {print;}' \
      /hive/data/outside/dbSNP/132/human/snp132.bed \
    | grep -vFwf snp132ExcludeIds.txt \
      > snp132Simple.bed
#264.984u 13.702s 3:57.29 117.4% 0+0k 0+0io 0pf+0w
    wc -l snp132Simple.bed
#23908516 snp132Simple.bed

    # Glom all human info that we need for the final table onto the
    # name, to sneak it through liftOver: rsId|chr|start|end|obs|ref|strand
    awk 'BEGIN{OFS="\t";} \
        {print $1, $2, $3, \
               $4 "|" $1 "|" $2 "|" $3 "|" $9 "|" $8 "|" $6, \
               0, $6;}' \
      snp132Simple.bed > snp132ForLiftOver.bed
#62.518u 2.141s 1:09.79 92.6%    0+0k 0+0io 0pf+0w

    # Map coords to chimp using liftOver.
    mkdir run.liftOChimp
    cd run.liftOChimp
    mkdir split out
#*** NOTE FOR NEXT TIME: make this 10000 not 50000:
    splitFile ../snp132ForLiftOver.bed 50000 split/chunk
    cp /dev/null jobList
    foreach f (split/chunk*)
      echo liftOver $f \
        /hive/data/genomes/hg19/bed/liftOver/hg19ToPanTro2.over.chain.gz \
        \{check out exists out/panTro2.$f:t.bed\} out/hg19.$f:t.unmapped \
        >> jobList
    end
    ssh swarm
    cd /hive/data/genomes/hg19/bed/snp132Ortho/run.liftOChimp
    para make jobList
#Completed: 479 of 479 jobs
#CPU time in finished jobs:     168182s    2803.03m    46.72h    1.95d  0.005 y
#IO & Wait Time:                 12873s     214.55m     3.58h    0.15d  0.000 y
#Average job time:                 378s       6.30m     0.10h    0.00d
#Longest finished job:            1152s      19.20m     0.32h    0.01d
#Submission to last job:          1165s      19.42m     0.32h    0.01d

    # Map coords to orangutan using liftOver.
    mkdir ../run.liftOPon
    cd ../run.liftOPon
    mkdir out
    ln -s ../run.liftOChimp/split .
    cp /dev/null jobList
    foreach f (split/chunk*)
      echo liftOver $f \
        /hive/data/genomes/hg19/bed/liftOver/hg19ToPonAbe2.over.chain.gz \
        \{check out exists out/ponAbe2.$f:t.bed\} out/hg19.$f:t.unmapped \
        >> jobList
    end
    para make jobList
#Completed: 479 of 479 jobs
#CPU time in finished jobs:     413536s    6892.27m   114.87h    4.79d  0.013 y
#IO & Wait Time:                 29174s     486.23m     8.10h    0.34d  0.001 y
#Average job time:                 924s      15.40m     0.26h    0.01d
#Longest finished job:            2299s      38.32m     0.64h    0.03d
#Submission to last job:          2309s      38.48m     0.64h    0.03d

    # Map coords to macaque using liftOver.
    mkdir ../run.liftOMac
    cd ../run.liftOMac
    mkdir out
    ln -s ../run.liftOChimp/split .
    cp /dev/null jobList
    foreach f (split/chunk*)
      echo liftOver $f \
        /hive/data/genomes/hg19/bed/liftOver/hg19ToRheMac2.over.chain.gz \
        \{check out exists out/rheMac2.$f:t.bed\} out/hg19.$f:t.unmapped \
        >> jobList
    end
    para make jobList
#Completed: 479 of 479 jobs
#CPU time in finished jobs:     463852s    7730.87m   128.85h    5.37d  0.015 y
#IO & Wait Time:                 32857s     547.62m     9.13h    0.38d  0.001 y
#Average job time:                1037s      17.28m     0.29h    0.01d
#Longest finished job:            2354s      39.23m     0.65h    0.03d
#Submission to last job:          2444s      40.73m     0.68h    0.03d

    cd /hive/data/genomes/hg19/bed/snp132Ortho
    # Concatenate the chimp results, sorting by chimp pos in order to
    # efficiently access 2bit sequence in getOrthoSeq.  The output of
    # that is then sorted by the glommed human info field, so that we
    # can use join to combine chimp and macaque results in the next step.
    # Ditto for macaque and orangutan.  Each command pipe takes ~6 minutes:
    sort -k1,1 -k2n,2n run.liftOChimp/out/panTro2.chunk*.bed \
    | ~/kent/src/hg/snp/snpLoad/getOrthoSeq.pl /cluster/data/panTro2/panTro2.2bit \
    | sort > panTro2.orthoGlom.txt
    sort -k1,1 -k2n,2n run.liftOPon/out/ponAbe2.chunk*.bed \
    | ~/kent/src/hg/snp/snpLoad/getOrthoSeq.pl /cluster/data/ponAbe2/ponAbe2.2bit \
    | sort > ponAbe2.orthoGlom.txt
    sort -k1,1 -k2n,2n run.liftOMac/out/rheMac2.chunk*.bed \
    | ~/kent/src/hg/snp/snpLoad/getOrthoSeq.pl /cluster/data/rheMac2/rheMac2.2bit \
    | sort > rheMac2.orthoGlom.txt
    wc -l panTro2.orthoGlom.txt ponAbe2.orthoGlom.txt rheMac2.orthoGlom.txt
#22382184 panTro2.orthoGlom.txt
#21280289 ponAbe2.orthoGlom.txt
#19249238 rheMac2.orthoGlom.txt

    # Use the glommed name field as a key to join up chimp and macaque
    # allele data.  Include glommed name from both files because if only
    # file 2 has a line for the key in 2.1, then 1.1 is empty.  Then plop
    # in the orthoGlom fields from each file, which are in the same order
    # as the chimp and macaque columns of snp132OrthoPanTro2RheMac2.
    join -o '1.1 2.1 1.2 1.3 1.4 1.5 1.6 2.2 2.3 2.4 2.5 2.6' \
      -a 1 -a 2 -e '?' \
      panTro2.orthoGlom.txt ponAbe2.orthoGlom.txt \
    | awk '{if ($1 != "?") { print $1, $3,$4,$5,$6,$7,$8,$9,$10,$11,$12; } \
            else           { print $2, $3,$4,$5,$6,$7,$8,$9,$10,$11,$12; }}' \
      > tmp.txt
#113.229u 24.575s 1:26.02 160.1% 0+0k 0+0io 0pf+0w
    join -o '1.1 2.1 1.2 1.3 1.4 1.5 1.6 1.7 1.8 1.9 1.10 1.11 2.2 2.3 2.4 2.5 2.6' \
      -a 1 -a 2 -e '?' \
      tmp.txt rheMac2.orthoGlom.txt \
    | perl -wpe 'chomp; \
        ($glom12, $glom3, $o1Chr, $o1Start, $o1End, $o1Al, $o1Strand, \
         $o2Chr, $o2Start, $o2End, $o2Al, $o2Strand, \
         $o3Chr, $o3Start, $o3End, $o3Al, $o3Strand) = split; \
        $glomKey = ($glom12 ne "?") ? $glom12 : $glom3; \
        ($rsId, $hChr, $hStart, $hEnd, $hObs, $hAl, $hStrand) = \
          split(/\|/, $glomKey); \
        $o1Start =~ s/^\?$/0/;  $o2Start =~ s/^\?$/0/;  $o3Start =~ s/^\?$/0/; \
        $o1End   =~ s/^\?$/0/;  $o2End   =~ s/^\?$/0/;  $o3End   =~ s/^\?$/0/; \
        print join("\t", $hChr, $hStart, $hEnd, $rsId, $hObs, $hAl, $hStrand, \
                         $o1Chr, $o1Start, $o1End, $o1Al, $o1Strand, \
                         $o2Chr, $o2Start, $o2End, $o2Al, $o2Strand, \
                         $o3Chr, $o3Start, $o3End, $o3Al, $o3Strand) . "\n"; \
        s/^.*$//;' \
    | sort -k1,1 -k2n,2n > snp132OrthoPt2Pa2Rm2.bed
#516.332u 101.651s 7:59.63 128.8%        0+0k 0+0io 0pf+0w
    wc -l snp132OrthoPt2Pa2Rm2.bed
#23235355 snp132OrthoPt2Pa2Rm2.bed

    hgLoadBed -tab -onServer -tmpDir=/data/tmp -renameSqlTable \
      -sqlTable=$HOME/kent/src/hg/lib/snpOrthoPanPonRhe.sql \
      hg19 snp132OrthoPt2Pa2Rm2 snp132OrthoPt2Pa2Rm2.bed
#Loaded 23235355 elements of size 22
#87.826u 8.471s 8:22.46 19.1%    0+0k 0+0io 0pf+0w

    # Cleanup:
    rm -r run*/split tmp.txt *.orthoGlom.txt bed.tab
    nice gzip snp132Simple.bed snp132ExcludeIds.txt snp132ForLiftOver.bed &


############################################################################
# DBSNP CODING ANNOTATIONS (132) (DONE 11/17/10 angie)
# These annotations are not restricted to the ones that we display,
# so it wasn't necessary to rebuild this after rebuilding snp132 to 
# include SNPs missing from the first rs_fasta dump.
    cd /hive/data/outside/dbSNP/132/human
    # ncbiFuncAnnotations.txt has NCBI coords: 0-based, fully closed.  
    # For anything except an insertion (0 bases between flanks), 
    # we need to add 1 to the end coord.  For an insertion, we need
    # to add 1 to the start coord.  Make a hash of the insertion IDs,
    # then look up each ID in ncbiFuncAnnotations.txt to tell which
    # transform to apply.
    # Note: sort -u with the keys below is too restrictive -- we need full line uniq.
    perl -we 'open($IDS, "ncbiFuncInsertions.ctg.bed") || die "ids: $!"; \
              while (<$IDS>) { chomp; $ids{$_} = 1; } \
              close($IDS); \
              %coding = (2=>1, 3=>1, 4=>1, 8=>1, 9=>1, 41=>1, 42=>1, 44=>1, 45=>1); \
              while (<>) { \
                chomp;  @w = split("\t"); # id, ctg, start, end, ... \
                next unless $coding{$w[5]}; \
                $bed4 = join("\t", $w[1], $w[2], $w[3], $w[0]); \
                if (exists $ids{$bed4} && $w[3] == $w[2]+1) { \
                  $w[2]++; # 2-base insertions: increment start coord \
                } else { \
                  $w[3]++; # increment end coord to get half-open \
                } \
                print join("\t", @w) . "\n"; \
              }' ncbiFuncAnnotations.txt \
    | sort -k1n,1n -k2,2 -k3n,3n -k5,5 -k6n,6n \
    | uniq \
      > ncbiCodingAnnotations.txt
    wc -l ncbiCodingAnnotations.txt
#1015611 ncbiCodingAnnotations.txt
    # How many & what kinds of function types?
    cut -f 6 ncbiCodingAnnotations.txt \
    | sort -n | uniq -c
# 179089 3   (coding-synon)
# 493143 8   (cds-reference -- ignored)
#  10575 41  (nonsense)
# 272848 42  (missense)
#  57934 44  (frameshift)
#   2022 45  (cds-indel)
    # Gather up multiple annotation lines into one line per {snp, gene, frame}:
    perl -e  'while (<>) { chomp; \
                my ($rsId, $ctg, $s, $e, $txId, $fxn, $frm, $nt, $aa, $codon) = split("\t"); \
                if (defined $lastRs && \
                    ($lastRs != $rsId || $lastCtg ne $ctg || $lastS != $s || \
                     $lastTx ne $txId || $lastFrm ne $frm)) { \
                  if (defined $refRow) { \
                    $fxns = "$refRow->[0],$fxns";  $nts = "$refRow->[1],$nts"; \
                    $aas = "$refRow->[2],$aas";    $codons = "$refRow->[3],$codons"; \
                  } \
                  print "$lastCtg\t$lastS\t$lastE\trs$lastRs\t$lastTx\t$lastFrm\t" . \
                        "$count\t$fxns\t$nts\t$codons\t$aas\n"; \
                  $refRow = undef;  @rows = ();  ($count, $fxns, $nts, $codons, $aas) = (); \
                } \
                ($lastRs, $lastCtg, $lastS, $lastE, $lastTx, $lastFrm) = \
                    ($rsId, $ctg, $s, $e, $txId, $frm); \
                $count++; \
                if ($fxn == 8) { \
                  $refRow = [$fxn, $nt, $aa, $codon]; \
                } else { \
                 $fxns .= "$fxn,";  $nts .= "$nt,";  $aas .= "$aa,";  $codons .= "$codon,"; \
                } \
              } \
              if (defined $refRow) { \
                $fxns = "$refRow->[0],$fxns";  $nts = "$refRow->[1],$nts"; \
                $aas = "$refRow->[2],$aas";    $codons = "$refRow->[3],$codons"; \
              } \
              print "$lastCtg\t$lastS\t$lastE\trs$lastRs\t$lastTx\t$lastFrm\t" . \
                    "$count\t$fxns\t$nts\t$codons\t$aas\n";' \
      ncbiCodingAnnotations.txt \
    | liftUp snp132CodingDbSnp.bed /hive/data/genomes/hg19/jkStuff/liftContigs.lft warn stdin
    hgLoadBed hg19 snp132CodingDbSnp -sqlTable=$HOME/kent/src/hg/lib/snp125Coding.sql \
      -renameSqlTable -tab -notItemRgb -allowStartEqualEnd \
      snp132CodingDbSnp.bed
#Loaded 492412 elements of size 11


############################################################################
# SNPMASKED SEQUENCE FOR SNP132 (DONE 11/22/10 angie)
    mkdir /hive/data/genomes/hg19/snp132Mask
    cd /hive/data/genomes/hg19/snp132Mask

    # Identify rsIds with various problems -- we will exclude those.
    awk '$5 ~ /^MultipleAlignments|ObservedTooLong|ObservedWrongFormat|ObservedMismatch|MixedObserved$/ {print $4;}' \
      /hive/data/outside/dbSNP/132/human/snp132Exceptions.bed \
      | sort -u \
      > snp132ExcludeRsIds.txt
    grep -vFwf snp132ExcludeRsIds.txt \
      /hive/data/outside/dbSNP/132/human/snp132.bed \
      > snp132Cleaned.bed
#262.134u 4.612s 4:39.15 95.5%   0+0k 0+0io 0pf+0w

    # Substitutions:
    mkdir substitutions
    snpMaskSingle snp132Cleaned.bed /hive/data/genomes/hg19/hg19.2bit stdout \
    | faSplit byname stdin substitutions/
#Masked 23848747 snps in 23846674 out of 3134643623 genomic bases
    # 2,361 warnings about differing observed strings at same base position --
    # saved as diffObserved.txt.
#/hive/data/genomes/hg19/hg19.2bit has 3137161264 total bases, but the total number of bases in sequences for which we masked snps is 3134643623 (difference is 2517641)
#53.892u 15.183s 3:24.90 33.7%   0+0k 0+0io 0pf+0w
    # Check that 2517641 is the total #bases in sequences with nothing in snp132Cleaned:
    grep -Fw single snp132Cleaned.bed | cut -f 1 | uniq > /data/tmp/1
    grep -vwf /data/tmp/1 ../chrom.sizes
    grep -vwf /data/tmp/1 ../chrom.sizes \
    | awk 'BEGIN {TOTAL = 0;}  {TOTAL += $2;}  END {printf "%d\n", TOTAL;}'
#2517641
    # Make sure that sizes are identical, first diffs are normal -> IUPAC,
    # and first diffs' case is preserved:
    foreach f (substitutions/chr*.fa)
      faCmp -softMask $f ../[1-9UMXY]*/$f:t |& grep -v "that differ"
    end
#chr1 in substitutions/chr1.fa differs from chr1 at ../1/chr1.fa at base 10326 (y != t)
#chr10 in substitutions/chr10.fa differs from chr10 at ../10/chr10.fa at base 61004 (r != a)
#...
#(output OK -- ambiguous bases replacing [agct] at SNP positions)
    foreach f (substitutions/chr*.fa)
      echo $f:t:r
      mv $f $f:r.subst.fa
      gzip $f:r.subst.fa
    end

    # Insertions & deletions not done.  To date we have only offered substs for download.
    # If there is user demand, use template from snp131 above.

    # Clean up and prepare for download:
    gzip snp132Cleaned.bed &
    foreach d (substitutions)
      pushd $d
        md5sum *.gz > md5sum.txt
        cp /hive/data/genomes/hg19/snp131Mask/$d/README.txt .
      popd
    end
    # Edit the README.txt.

    # Create download links on hgwdev.
    mkdir /usr/local/apache/htdocs-hgdownload/goldenPath/hg19/snp132Mask
    ln -s /hive/data/genomes/hg19/snp132Mask/substitutions/* \
      /usr/local/apache/htdocs-hgdownload/goldenPath/hg19/snp132Mask/

#############################################################################
#CREATE MICROSAT TRACK (DONE 2001-11-13 - Chin)
    ssh hgwdev
    cd /cluster/data/hg19/bed
    mkdir microsat
    cd microsat
    awk '($5==2 || $5==3) && $6 >= 15 && $8 100 && $9 0
    {printf("%s\t%s\t%s\t%dx%s\n", $1, $2, $3, $6, $16);}'
	../simpleRepeat/simpleRepeat.bed > microsat.bed
    hgLoadBed hg19 microsat microsat.bed

#############################################################################
# Add Human RNA-editing track hg18 (Done, galt, 7/12/2010)

# DARNED=DAtabase of RNa EDiting
#http://darned.ucc.ie/
#University College Cork

mkdir -p /hive/data/genomes/hg19/bed/darned
cd /hive/data/genomes/hg19/bed/darned
# create go.csh to download and compose allChroms.bed
./go.csh
hgLoadBed hg19 darned allChroms.bed
# at human, level
# added darned.html
# added trackDb.ra entry

# Bug #6417 duplicate records (6) in Human RNA editing (DARNED) track
# (DONE 2012-12-22 Chin)
    cat allChroms.bed | sort  > allSort.bed
    cat allChroms.bed | sort -u > allSort.bed
    wc -l *.bed
    #  42045 allChroms.bed
    #  42045 allSort.bed
    #  42039 allUniq.bed
    hgLoadBed hg19 darned allUniq.bed
    # Loaded 42039 elements of size 9

#############################################################################
# lsSnpPdb: import of LS-SNP/PDB data for SNP 131 (2010-12-03 markd)
    # down load from JHU
    ssh genbank
    sudo su - genbank
    cd /cluster/data/genbank
    ./bin/lsSnpPdbDownloadStep hg19
    # load into hgwdev database
    ssh hgwdev
    cd /cluster/data/genbank
    ./bin/lsSnpPdbDbLoadStep hg19
    # once this has been QAed, will auto-update from genbank scripts

#############################################################################
# NEW SNP132 (DONE 3/8/11 angie)
# 3/8/11: Re-ran snpNcbiToUcsc & reloaded to not count PAR SNPs as multiply mapped
# Reloaded 1/24/11 to get rid of a couple exceptions that were derived from dbSNP bitfields
# Previously loaded 1/5/11
    # New table type snp132Ext, with same columns as snp125 plus exceptions,
    # allele freqs, and submitter handles, using new script doDbSnp.pl.
    mkdir -p /hive/data/outside/dbSNP/132/human
    cd /hive/data/outside/dbSNP/132/human
    # Look at the directory listing of ftp://ftp.ncbi.nih.gov/snp/database/organism_data/
    # to find the subdir name to use as orgDir below (human_9606 in this case).
    # Then click into that directory and look for file names like 
    #    b(1[0-9][0-9])_*_([0-9]+_[0-9])
    # -- use the first num for build and the second num_num for buildAssembly.
    # jkStuff/liftContigs.lft maps NCBI contig names to chroms; use that for liftUp.
    cat > config.ra <<EOF
db hg19
orgDir human_9606
build 132
buildAssembly 37_1
liftUp /hive/data/genomes/hg19/jkStuff/liftContigs.lft
EOF
    ~/kent/src/hg/utils/automation/doDbSnp.pl config.ra >& do.log & tail -f do.log
# *** This release contains more than one assembly label.
# *** Please examine this list in case we need to exclude any of these:
#
#CRA_TCAGchr7v2
#Celera
#GRCh37
#HuRef
# *** Add refAssemblyLabel to config.ra.  If keeping all labels, it will
# *** look like this:
#
#refAssemblyLabel CRA_TCAGchr7v2,Celera,GRCh37,HuRef
#
# *** Edit out any of those that are not included in hg19 (e.g. Celera).
# *** Then restart this script with -continue=loadDnSnp .
    # GRCh37 is the only one that corresponds to hg19, so add it to config.ra:
    echo "refAssemblyLabel GRCh37" >> config.ra

    # Try again with updated config:
    ~/kent/src/hg/utils/automation/doDbSnp.pl config.ra \
      -continue=loadDbSnp >>& do.log &
    tail -f do.log

    # 3/8/11:
    cd /hive/data/outside/dbSNP/132/human
    mkdir -p `cat workingDir`
    cp -p ucscNcbiSnp.bed.gz `cat workingDir`
    cd `cat workingDir`
    hgsql hg19 -NBe 'select chrom,chromStart,chromEnd,name from par' > par.bed
    snpNcbiToUcsc -par=par.bed \
      -snp132Ext ucscNcbiSnp.bed.gz /hive/data/genomes/hg19/hg19.2bit snp132
    # No change to snp132Errors.bed; only change to snp132ExceptionDesc.tab was
    # the MultipleAlignments count; snp132.bed items that lost their MultipleAlignments
    # exceptions were X & Y PAR matches.
    gzip snp132.bed snp132ExceptionDesc.tab snp132Errors.bed
    mv snp132* /hive/data/outside/dbSNP/132/human/
    cd /hive/data/outside/dbSNP/132/human/
    # Doh, dbSNP also assigned a weight of 3 to the PAR SNPs, and that is triggering
    # our snp132NonUnique filter (below).  I sent dbSNP an email about that, and will
    # tweak weight to 1 where I find a PAR SNP without MultipleAlignments, since I see
    # it as a bug fix.
    zcat snp132.bed.gz \
    | awk -F"\t" 'BEGIN{OFS="\t";} \
           (($1 == "chrX" && \
             (($3 > 60000 && $2 < 2699520) || ($3 > 154931043 && $2 < 155260560))) || \
            ($1 == "chrY" && \
             (($3 > 10000 && $2 < 2649520) || ($3 > 59034049 && $2 < 59363566)))) && \
           $18 !~/MultipleAlignments/ {$17 = 1;} \
           {print;}' > snp132.parWeightTweak.bed
      wc -l snp132.parWeightTweak.bed
#33026121 snp132.parWeightTweak.bed
    # Make sure only the weight has changed:
    zcat snp132.bed.gz | cut -f 1-16,18-25 > /data/tmp/snp132.weightless.bed
    cut -f 1-16,18-25 snp132.parWeightTweak.bed > /data/tmp/snp132.parTweak.weightless.bed
    cmp /data/tmp/snp132*.weightless.bed
    # No output, good.
    # Reload snp132 with the tweaked weights:
    hgLoadBed -tab -onServer -tmpDir=$TMPDIR -allowStartEqualEnd \
      hg19 snp132 -sqlTable=snp132.sql snp132.parWeightTweak.bed
    zcat snp132ExceptionDesc.tab.gz \
    | hgLoadSqlTab hg19 snp132ExceptionDesc $HOME/kent/src/hg/lib/snp125ExceptionDesc.sql stdin
    gzip snp132.parWeightTweak.bed


#############################################################################
# Agilent arrays (2010-12-01 Andy)
cd /hive/data/genomes/hg19/bed/agilentProbes/
# first move all the lifted versions out of the way
mkdir lifted.2009-07-28/
mv * lifted.2009-07-28/
# FTP download from ftp.agilent.com using given user/pass from Anniek De-witte
# (anniek_de-witte@agilent.com)
# downloaded files are gzipped beds. The files are typically located in a 
# directory called "FOR_UCSC" or something like that.  The user/pass and the
# directory are deleted after it's confirmed they're received, so it's not
# too helpful to mention specifics here.
ftp -u user -p password ftp.agilent.com
> cd directory
> get 014693_D_BED_20100501.bed.gz
> get 014698_D_BED_20100501.bed.gz
> get 014950_D_BED_20100501.bed.gz
> get 021529_D_BED_20100501.bed.gz
> get 021850_D_BED_20100430.bed.gz
> get 021924_D_BED_20100501.bed.gz
> get 022060_D_BED_20100501.bed.gz
> get 023642_D_BED_20100430.bed.gz
> get 028081_D_BED_20101014.bed.gz
> get 029830_D_BED_20100922.bed.gz
# unzip everything
gunzip *
ln -s 022060_D_BED_20100501.bed agilent4x180k.bed
ln -s 021529_D_BED_20100501.bed agilentCgh1x1m.bed
ln -s 014693_D_BED_20100501.bed agilentCgh1x244k.bed
ln -s 014698_D_BED_20100501.bed agilentCgh2x105k.bed
ln -s 021850_D_BED_20100430.bed agilentCgh2x400k.bed
ln -s 014950_D_BED_20100501.bed agilentCgh4x44k.bed
ln -s 021924_D_BED_20100501.bed agilentCgh8x60k.bed
ln -s 028081_D_BED_20101014.bed agilentCghSnp2x400k.bed
ln -s 029830_D_BED_20100922.bed agilentCghSnp4x180k.bed
ln -s 023642_D_BED_20100430.bed agilentHrd1x1m.bed
for bed in agilent*.bed; do
    tail -n +2 $bed | hgLoadBed hg19 ${bed%.bed} stdin
done
rm bed.tab
### Update (2011-11-01 Andy Pohl
# (acquired bed file in e-mail attachment)
cd /hive/data/genomes/hg19/bed/agilentProbes
ln -s 030587_D_BED_20101006_Colored.bed agilentCghSnpCancer4x180k.bed
tail +3 agilentCghSnpCancer4x180k.bed | hgLoadBed hg19 agilentCghSnpCancer4x180k stdin


#################################################################################
# Rfam (2011-11-30 Melissa Cline)
#
# This contains genomic alignments of Rfam sequences, from the Rfam group.
#
# This data is used in building UCSC Genes.
#
cd /hive/data/outside/Rfam
mkdir 111130
cd 111130
wget ftp://ftp.sanger.ac.uk/pub/databases/Rfam/CURRENT/genome.gff3.tar.gz
tar xzvf genome.gff3.tar.gz
mkdir hg19
cat /hive/data/genomes/hg19/chrom.aliases \
 |awk '{ print("cat /hive/data/outside/Rfam/111130/genome_gff/" $1 ".gff3",  
               "|sed", sprintf("%c", 39) "s/" $1 "/" $2 "/" sprintf("%c", 39))}' |bash \
 |grep -v -e "^#" \
 |awk '{ print($1 "\t" $4 - 1 "\t" $5 "\t" $9 "\t1\t" $7 "\t" 
               $4 - 1 "\t" $5 "\t0\t1\t" $5 - $4 + 1 "\t0"  }' \
> hg19/Rfam.bed      


#####################################################
# Vista Enhancers (galt 2010-12-09 done)
#
# Vista from Lawrence-Berkeley has assayed
# 301 human conserved non-coding intra- and inter-
# genic elements for their ability to promote
# lacZ in mouse embryos.  A positive looks like
# a mouse with a purple spine.
#

mkdir /hive/data/genomes/hg19/bed/vistaEnhancers
cd /hive/data/genomes/hg19/bed/vistaEnhancers

# download data file from the vista browser (coordinates are for hg19)
wget -O enhancerbrowser.datadownload.txt 'http://enhancer.lbl.gov/cgi-bin/imagedb3.pl?page_size=100;show=1;search.result=yes;form=search;search.form=no;action=search;search.sequence=1'

# give elements with positive label a score of 900, 
# give elements with negative label a score of 200.
# print to 5-field bed file
cat enhancerbrowser.datadownload.txt \
        | grep ">" \
        | sed -e 's#^<pre>##' \
        | sed -e 's#</pre>$##' \
        | grep "^>Human" \
        | sed -e 's#^>Human|##' \
        | tr :- ' ' \
        | sed -e 's/positive/900/'\
        | sed -e 's/negative/200/' \
        | awk '{print $1"\t"$2"\t"$3"\telement_"$6"\t"$8}' \
        | grep -P -v "^chr\t" \  
        > vistaEnhancers.bed
hgLoadBed hg19 vistaEnhancers vistaEnhancers.bed
#Loaded 1339 elements of size 5


# add to hg19/trackDb.ra
track vistaEnhancers override
url http://enhancer.lbl.gov/cgi-bin/imagedb3.pl?form=presentation&show=1&experiment_id=$$&organism_id=1


#####################################################
# UNIGENE/SAGE TRACK (RE-BUILT - 2010-12-10 Fan)

# Create the uniGene alignments

    # Download of the latest UniGene version is now automated by a
    # cron job -- see /cluster/home/angie/crontab ,
    # /cluster/home/angie/unigeneVers/unigene.csh .

    ssh hgwdev
    mkdir -p /hive/data/genomes/hg19/bed/uniGene/101210
    cd /hive/data/genomes/hg19/bed/uniGene/101210

    set Version = 228

    zcat /hive/data/outside/uniGene/uniGene.$Version/Hs.seq.uniq.gz|\
    sed -e "s#>.*/ug=#>#; s# /len.*##;" > Hs.seq.uniq.simpleHeader.fa

    ssh swarm
    set Version = 228
    mkdir -p /hive/data/genomes/hg19/bed/uniGene/101210/run.blat
    cd /hive/data/genomes/hg19/bed/uniGene/101210/run.blat

    ls -1 /hive/data/genomes/hg19/nib/*.nib > genome.lst
    ls -1S \
    /hive/data/genomes/hg19/bed/uniGene/101210/Hs.seq.uniq.simpleHeader.fa \
      > uniGene.lst
    
    cat << '_EOF_' > template.sub
#LOOP
/cluster/bin/x86_64/blat -repeats=lower -minIdentity=95 ooc=/hive/data/genomes/hg19/11.ooc $(path1) $(path2)  {check out line+ psl/$(root1)_$(root2).psl}
#ENDLOOP
'_EOF_'

    gensub2 genome.lst uniGene.lst template.sub para.spec
    para create para.spec
    mkdir psl
    para try
    para check
    para push

#Completed: 93 of 93 jobs
#CPU time in finished jobs:      68896s    1148.26m    19.14h    0.80d  0.002 y
#IO & Wait Time:                  4789s      79.82m     1.33h    0.06d  0.000 y
#Average job time:                 792s      13.21m     0.22h    0.01d
#Longest finished job:            5274s      87.90m     1.47h    0.06d
#Submission to last job:          5840s      97.33m     1.62h    0.07d
#Estimated complete:                 0s       0.00m     0.00h    0.00d

    pslSort dirs raw.psl tmp psl >& pslSort.log
    cat raw.psl|\
    pslReps -minCover=0.2 -sizeMatters -minAli=0.965 -nearTop=0.002 \
      stdin hg19.uniGene.pslReps.psl /dev/null
    
    gzip raw.psl

    ssh hgwdev
    cd /hive/data/genomes/hg19/bed/uniGene/101210/run.blat

    hgLoadPsl -table=uniGene_3 hg19 hg19.uniGene.pslReps.psl

    mkdir -p /gbdb/hg19/uniGene
    cd /gbdb/hg19/uniGene

    rm Hs.seq.uniq.simpleHeader.fa
    ln -s \
    /hive/data/genomes/hg19/bed/uniGene/101210/Hs.seq.uniq.simpleHeader.fa \
    Hs.seq.uniq.simpleHeader.fa

# load the sequence

    hgLoadSeq -replace hg19 /gbdb/hg19/uniGene/Hs.seq.uniq.simpleHeader.fa

##############################################################################

##############################################################################
# GAD View Lift (DONE, Andy 2010-12-12)

cd /hive/data/genomes/hg19/bed/hg18MassiveLift
mkdir gad
cd gad/
echo "select * from gad" | hgsql hg18 | tail -n +2 > hg18.bed
liftOver hg18.bed /gbdb/hg18/liftOver/hg18ToHg19.over.chain.gz hg19.{bed,unmapped}
hgLoadBed -noBin hg19 gad hg19.bed
rm bed.tab
wc -l *.bed
#  1883 hg18.bed
#  1860 hg19.bed
grep -i split hg19.unmapped | wc -l
#18
grep -i part hg19.unmapped | wc -l
#5
for table in gadAll gadList; do 
    hgsqldump hg18 $table | hgsql hg19
done

#############################################################################
# EVOFOLD (Done, 2010-12-13) - Galt using Jakob's procedure from hg18.txt.

# RNA secondary structure predictions lifted from hg17 and filtered
  ssh -C hgwdev
  mkdir -p /cluster/data/hg19/bed/evofold
  cd /cluster/data/hg19/bed/evofold
  echo "select chrom, chromStart, chromEnd, name, score, strand, size, secStr, conf from evofold;" | hgsql hg17 | sed -e 1d > foldsHg17.bed
  liftOver -bedPlus=6 -minMatch=1.0 foldsHg17.bed /cluster/data/hg17/bed/liftOver/hg17ToHg19.over.chain.gz tmp.bed unmapped.bed
  # remove elements which are wrong size after lifting
  awk '$3-$2 == $7' tmp.bed | sort -k4,4 > rawFoldsHg19.bed

  # structure filters
  # first, remove pairs that can't form in human
  cut -f 1-6 rawFoldsHg19.bed > tmp.bed
  # sequenceForBed can be found and compiled from here: $HOME/kent/src/hg/altSplice/altSplice/
  nice sequenceForBed -db=hg19 -bedIn=tmp.bed -fastaOut=tmp.fa
  cat tmp.fa | sed -e 's/\.[+-]\.chr.*$//' \
             | sed -e '/^>/s/$/\t/' | tr -d '\n' | sed -e 's/>/\n/g' | sed -e '1d' -e '$s/$/\n/' | sort -k1,1 > foldsHg19Seq.tab
  # Several python scripts were originally in /cluster/home/jsp/scripts/
  # I copied them to this directory and
  # I removed the optional "psyco" speedup library which does not work with our 64-bit python
  join -1 4 -2 1 -o "1.4 1.8 2.2" rawFoldsHg19.bed foldsHg19Seq.tab | sed -e 's/  */\t/g' | sort -k1,1 \
	     | ./tabFoldFilter.py > cleanFolds.tab
  join -1 4 -2 1 -o "1.1 1.2 1.3 1.4 1.5 1.6 1.7 2.2 1.9" rawFoldsHg19.bed cleanFolds.tab | sed -e 's/  */\t/g' > tmp1.bed
  # second, remove poor predictions
  # scripts can be found in cvs tree at: cvsroot/jsp/scripts/. They use a few modules which can be found at: cvsroot/jsp/py_modules
  cat tmp1.bed | ./bedRnassFilter.py --dangling --minAvrStemSize=3 | ./bedRnassFilter.sh 1 3 \
	       | ./roundListFloats.py -c9 > foldsHg19.bed
  # clean up
  rm tmp.bed tmp1.bed foldsHg17.bed foldsHg19Seq.tab rawFoldsHg19.bed tmp.fa cleanFolds.tab

  # upload
  hgLoadBed -notItemRgb -sqlTable=$HOME/kent/src/hg/lib/evofold.sql hg19 evofold foldsHg19.bed
#############################################################################
# CREATE .PNG PICTURE FILES OF EVOFOLD RNA STRUCTURES. (DONE, 4/29/2011, Fan)

ssh hgwdev
mkdir /hive/data/genomes/hg19/bed/evofold/doEvoFold
cd /hive/data/genomes/hg19/bed/evofold/doEvoFold

# Creaet sub-directories to store .png files (total of about 47.5 K of them) sparated by chromosomes.

mkdir -p evoFold/chr1
mkdir -p evoFold/chr10
mkdir -p evoFold/chr11
mkdir -p evoFold/chr12
mkdir -p evoFold/chr13
mkdir -p evoFold/chr14
mkdir -p evoFold/chr15
mkdir -p evoFold/chr16
mkdir -p evoFold/chr17
mkdir -p evoFold/chr18
mkdir -p evoFold/chr19
mkdir -p evoFold/chr2
mkdir -p evoFold/chr20
mkdir -p evoFold/chr21
mkdir -p evoFold/chr22
mkdir -p evoFold/chr3
mkdir -p evoFold/chr4
mkdir -p evoFold/chr5
mkdir -p evoFold/chr6
mkdir -p evoFold/chr7
mkdir -p evoFold/chr8
mkdir -p evoFold/chr9
mkdir -p evoFold/chrM
mkdir -p evoFold/chrX
mkdir -p evoFold/chrY

# get latest verion of the .jar file of VARNA

wget --timestamping http://varna.lri.fr/bin/VARNAv3-7.jar

# Create Java command line files

echo 'doEvoFold hg19 do$1 $1' >do1Chrom
chmod +x do1Chrom

do1Chrom chr1
do1Chrom chr10
do1Chrom chr11
do1Chrom chr12
do1Chrom chr13
do1Chrom chr14
do1Chrom chr15
do1Chrom chr16
do1Chrom chr17
do1Chrom chr18
do1Chrom chr19
do1Chrom chr2
do1Chrom chr20
do1Chrom chr21
do1Chrom chr22
do1Chrom chr3
do1Chrom chr4
do1Chrom chr5
do1Chrom chr6
do1Chrom chr7
do1Chrom chr8
do1Chrom chr9
do1Chrom chrM
do1Chrom chrX
do1Chrom chrY

# run the dochrXX command files in small batches with '&' to exploit multiple CPU
# wait an hour for each batch to finish so that we don't suck in too much computational resources.

dochr1 &
dochr2 &
dochr3 &
dochr4 &
dochr5 &

sleep 3600

dochr6 &
dochr7 &
dochr8 &
dochr9 &
dochr10 &

sleep 3600

dochr11 &
dochr12 &
dochr13 &
dochr14 &
dochr15 &

sleep 3600

dochr16 &
dochr17 &
dochr18 &
dochr19 &
dochr20 &

sleep 3600

dochr21 &
dochr22 &
dochrX &
dochrY &
dochrM &

# check the resulting .png files

# create a simple script file, check1, with the following 3 lines:

echo $1
hgsql hg19 -N -e "select count(*) from evofold where chrom='${1}'"
ls evoFold/$1/*.png|wc

chmod +x check1

# create another script file, checkAll, with the following lines:

check1 chr1
check1 chr10
check1 chr11
check1 chr12
check1 chr13
check1 chr14
check1 chr15
check1 chr16
check1 chr17
check1 chr18
check1 chr19
check1 chr2
check1 chr20
check1 chr21
check1 chr22
check1 chr3
check1 chr4
check1 chr5
check1 chr6
check1 chr7
check1 chr8
check1 chr9
check1 chrM
check1 chrX
check1 chrY

chmod +x checkAll
checkAll >j.check

# examing the resuls in j.check to make sure things are OK.

# create symbolic links

ln -s /hive/data/genomes/hg19/bed/evofold/doEvoFold/evoFold  /gbdb/hg19/evoFold 
ln -s /gbdb/hg19/evoFold /usr/local/apache/htdocs/evoFold/hg19

##########################################################################
# Build targetScanS track - (DONE - 2010-12-13 galt)
#       requested by: George Bell gbell at wi.mit.edu
    ssh hgwdev
    mkdir -p /cluster/data/hg19/bed/targetScanS
    cd /cluster/data/hg19/bed/targetScanS

    wget --timestamping http://www.targetscan.org/vert_50/ucsc/hg19/hg19Cons_ALL_CHRS.BED

    hgLoadBed hg19 targetScanS hg19Cons_ALL_CHRS.BED
    #	Loaded 54199 elements of size 6
    featureBits hg19 targetScanS
    #   354163 bases of 2897316137 (0.012%) in intersection

    # Create/edit/check in targetScans.html and trackDb.ra under
    # kent/src/hg/makeDb/trackDb/human/hg19

##########################################################################
# Neandertal tracks for hg19 (DONE - 2010-12-14 - Hiram)
    # data supplied by Ed Green into /hive/data/outside/homNea/hg19
    # add Neandertal group to hg19 grp
    hgsql hg19 -e \
      "insert into grp values ('neandertal', 'Neandertal Assembly and Analysis', 6.5, 1);"

    mkdir -p /hive/data/genomes/hg19/bed/homNea/seqAlis
    cd /hive/data/genomes/hg19/bed/homNea/seqAlis

for T in Feld1 Mez1 Sid1253 Vi33.16 Vi33.25 Vi33.26
do
    ln -s /hive/data/outside/homNea/hg19/${T}.hg18.bam \
	./SL${T}.hg19.bam
done

    ln -s /hive/data/outside/homNea/hg19/*.bam .
    for F in *.bam
do
    samtools index $F
done

    mkdir -p /gbdb/hg19/neandertal/seqAlis
    ln -s `pwd`/SL*.b* /gbdb/hg19/neandertal/seqAlis/

for T in Feld1 Mez1 Sid1253
do
    hgBbiDbLink hg19 bamSL${T} \
	/gbdb/hg19/neandertal/seqAlis/SL${T}.hg19.bam
done
for T in 16 25 26
do
    hgBbiDbLink hg19 bamSLVi33dot${T} \
	/gbdb/hg19/neandertal/seqAlis/SLVi33.${T}.hg19.bam
done

##########################################################################
# DECIPHER, RGD QTL, RGD RAT QTL (MAYBE DONE, Andy 2010-12-13)

cd /hive/data/genomes/hg19/bed/hg18MassiveLift
mkdir decipher rgdQtl rgdRatQtl
for tab in decipher rgdQtl rgdRatQtl; do 
  echo "select * from "$tab | hgsql hg18 | tail -n +2 | cut -f2- > ${tab}/hg18.bed
  liftOver ${tab}/hg18.bed /gbdb/hg18/liftOver/hg18ToHg19.over.chain.gz ${tab}/hg19.{bed,unmapped}
  hgLoadBed hg19 $tab ${tab}/hg19.bed
done
rm bed.tab
wc -l {decipher,rgdQtl,rgdRatQtl}/hg1{8,9}.bed
#  4227 decipher/hg18.bed
#  4048 decipher/hg19.bed
#   254 rgdQtl/hg18.bed
#   225 rgdQtl/hg19.bed
#  6033 rgdRatQtl/hg18.bed
#  5804 rgdRatQtl/hg19.bed
## This isn't very good.  In each case, the unmapped % is over 2%.
## DECIPHER: 95.8%, RGD QTL: 88.6%, RGD RAT QTL: 96.2%
## update for rgdQtl:
hgsqldump hg18 rgdQtlLink | hgsql hg19 
hgsqldump hg18 rgdRatQtlLink | hgsql hg19

#############################################################################
# FISH CLONES LIFT (DONE, Andy 2010-12-14)

cd /hive/data/genomes/hg19/bed/hg18MassiveLift
mkdir fishClones
cd fishClones/
echo "select * from fishClones" | hgsql hg18 | tail -n +2 > hg18.bed5p
liftOver -bedPlus=5 hg18.bed5p /gbdb/hg18/liftOver/hg18ToHg19.over.chain.gz hg19.bed5p hg19.unmapped 
wc -l *.bed5p
#   9788 hg18.bed5p
#   9758 hg19.bed5p
grep -i split hg19.unmapped  | wc -l
# 17
grep -i partially  hg19.unmapped  | wc -l
# 13
cp ~/kent/src/hg/lib/fishClones.sql .
hgLoadBed -tab -sqlTable=fishClones.sql -notItemRgb hg19 fishClones hg19.bed5p

#############################################################################
# CGAP SAGE LIFT (DONE, Galt 2010-12-16)

cd /hive/data/genomes/hg19/bed/hg18MassiveLift
mkdir cgapSage
cd cgapSage
echo "select * from cgapSage" | hgsql hg18 -N > hg18.bed8p
liftOver -tab -bedPlus=8 -hasBin hg18.bed8p /gbdb/hg18/liftOver/hg18ToHg19.over.chain.gz hg19.bed8p hg19.unmapped 
wc -l *.bed8p
#  276905 hg18.bed8p
#  276865 hg19.bed8p
grep -i split hg19.unmapped  | wc -l
# 0
grep -i partially  hg19.unmapped  | wc -l
# 3
cp ~/kent/src/hg/lib/cgapSage/cgapSage.sql .
hgLoadBed -tab -hasBin -sqlTable=cgapSage.sql -notItemRgb hg19 cgapSage hg19.bed8p

# no lift needed for the lib table
echo "select * from cgapSageLib" | hgsql hg18 -N > cgapSageLib.tab
cp ~/kent/src/hg/lib/cgapSage/cgapSageLib.sql .
hgLoadSqlTab hg19 cgapSageLib cgapSageLib.sql cgapSageLib.tab

#############################################################################
# LASTZ Zebrafish DanRer7 (DONE - 2010-12-17 - Hiram)
    mkdir /hive/data/genomes/hg19/bed/lastzDanRer7.2010-12-17
    cd /hive/data/genomes/hg19/bed/lastzDanRer7.2010-12-17

    cat << '_EOF_' > DEF
# human vs X. zebrafish
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_Q=/scratch/data/blastz/HoxD55.q

# TARGET: Human hg19
SEQ1_DIR=/scratch/data/hg19/nib
SEQ1_LEN=/scratch/data/hg19/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000

# QUERY: Zebrafish danRer7
SEQ2_DIR=/scratch/data/danRer7/danRer7.2bit
SEQ2_LEN=/scratch/data/danRer7/chrom.sizes
SEQ2_CHUNK=20000000
SEQ2_LAP=0
SEQ2_LIMIT=40

BASE=/hive/data/genomes/hg19/bed/lastzDanRer7.2010-12-17
TMPDIR=/scratch/tmp
'_EOF_'
    # << happy emacs

    #	establish a screen to control this job
    screen
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
	`pwd`/DEF \
	-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \
	-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
	> do.log 2>&1 &
    #	Elapsed time: 1698m29s
    cat fb.hg19.chainDanRer7Link.txt 
    #	80849592 bases of 2897316137 (2.790%) in intersection

    #	running the swap
    mkdir /hive/data/genomes/danRer7/bed/blastz.hg19.swap
    cd /hive/data/genomes/danRer7/bed/blastz.hg19.swap
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
	/hive/data/genomes/hg19/bed/lastzDanRer7.2010-12-17/DEF \
	-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \
	-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
	-swap > swap.log 2>&1 &
    #	real    42m52.920s
    cat fb.danRer7.chainHg19Link.txt 
    #	86716552 bases of 1409770109 (6.151%) in intersection

##############################################################################
# UMass Med School brain histone ChIP-seq 
# (DONE - 2010-12-17 - Kate)
#
# From Troy Whitfield, submitting for Zhiping Weng, collab with Akbarian
# Published in PNAS, April 2010
# Variables: cell, sample, sex, age
# Tissue:  Prefrontal cortex (PFC)
#
# 11 individuals, age .5 to 69 years
# 13 bigWigs of H3K4me3 enrichment: 11 neuronal, 2 non-neuronal 
# 3 peaks files (bed5FloatScore)
# Neuronal cells selected by FACS sorting, based on NeuN marker (NeuN+)
# Non-neuronal (NeuN-) cells are largely glia, microglia, and endothelium
# 3 peak files

# Note: used publicly available blood cell (lymphocyte)
# ChIP-seq as controls:
# K562, GM12878 (from Bernstein ENCODE group), CD4+ (from Barski, HLB)

    cd /hive/data/genomes/hg19/bed

    mkdir uMassBrainHistone
    cd uMassBrainHistone

    wget http://zlab.umassmed.edu/~whitfiet/hg19/uMassBrainHist.tar.gz
    tar xvfz uMassBrainHist.tar.gz
    cd data

    set t = "uMassBrainHistone"

# Load peak tracks
# peaks in neuron not in  blood
    hgLoadBed hg19 -noNameIx -renameSqlTable \
      -sqlTable=/cluster/bin/sqlCreate/bed5FloatScore.sql \
      ${t}PeaksNeuron 11Neuronal_vs_3Blood_hg19.bed
# Loaded 7947 elements of size 6

# peaks in infants (<1 year), not in seniors (>60)
    hgLoadBed hg19 -noNameIx -renameSqlTable \
      -sqlTable=/cluster/bin/sqlCreate/bed5FloatScore.sql \
      ${t}PeaksInfant 3Young_vs3Old_hg19.bed
# Loaded 1292 elements of size 6

# peaks specific to individuals
    hgLoadBed hg19 -noNameIx -renameSqlTable \
      -sqlTable=/cluster/bin/sqlCreate/bed5FloatScore.sql \
      ${t}PeaksSample SampleSpecific_hg19.bed
# Loaded 3214 elements of size 6

# Load signal tracks
# Get metadata from .ddf file

    cd ..
    cat uMassMedBrainHist.ddf

#files   view    sample  cell    sex     age
#data/s6.bw      Signal  6       neuron  male    4.7
#data/s6n.bw     Signal  6       non-neuron      male    4.7
#data/s2.bw      Signal  2       neuron  male    0.58
#data/s3.bw      Signal  3       neuron  female  0.75
#data/s1.bw      Signal  1       neuron  male    0.5
#data/s9.bw      Signal  9       neuron  female  68
#data/s5.bw      Signal  5       neuron  female  2.8
#data/s11.bw     Signal  11      neuron  female  69
#data/s11n.bw    Signal  11      non-neuron      female  69
#data/s7.bw      Signal  7       neuron  male    8.8
#data/s8.bw      Signal  8       neuron  male    14
#data/s10.bw     Signal  10      neuron  female  69
#data/s4.bw      Signal  4       neuron  male    1.3
#data/11Neuronal_vs_3Blood_hg19.bed      Peaks   N/A     N/A     N/A     N/A
#data/3Young_vs3Old_hg19.bed     Peaks   N/A     N/A     N/A     N/
#data/SampleSpecific_hg19.bed    Peaks   N/A     N/A     N/A     N/A


# Generate table names from DDF
# Format:  uMassBrainHistoneSignalS<sample>Neu<P|M><age>yrs<M|F>
# e.g. uMassBrainHistoneSignalS1185Neu4pt7yrsM

    grep Signal uMassMedBrainHist.ddf | \

    cat << 'EOF' > list.pl
while (<>) {
    ($file, $view, $sample, $cell, $sex, $age) = split;
    next unless $view eq 'Signal';
    $cell = ($cell eq 'neuron' ? 'P' : 'M');
    $age =~ s/\./pt/;
    $sex = ($sex eq 'male' ? 'M' : 'F');
    $table = "uMassBrainHistoneSignalS" . $sample . "Neu" . $cell . $age . "yrs" . $sex;
    print $file . "." . $table . "\n";
}
'EOF'

    cat << 'EOF' > load.csh
set gbdb = "/gbdb/hg19/bbi/uMassBrainHistone"
foreach x (`perl list.pl < uMassMedBrainHist.ddf`)
    set f = $x:r
    set t = $x:e
    echo "Loading $f into $t"
    bigWigInfo $f
    ln -s `pwd`/$f $gbdb/$t.bw
    hgBbiDbLink hg19 $t $gbdb/$t.bw
end
'EOF'

    csh load.csh >&! load.out &

#############################################################################
# SwitchDB TSS Track (DONE 2010-12-17 galt)
#
# This liftover is tricky because of the gmStart and gmEnd, which
# are not lifted automatically.  
# The gm coordinates have to be lifted separately.

ssh hgwdev
mkdir /cluster/data/hg19/bed/switchDbTss
cd /cluster/data/hg19/bed/switchDbTss
ln -s /cluster/data/hg17/bed/switchDbTss/hg17.bed hg17.bed
liftOver -tab -bedPlus=6 hg17.bed /gbdb/hg17/liftOver/hg17ToHg19.over.chain.gz hg19.bed unMapped
cat unMapped | grep '^#' | sort | uniq -c
#     61 #Deleted in new

ln -s ~/kent/src/hg/lib/switchDbTss.sql
hgLoadBed -renameSqlTable -sqlTable=switchDbTss.sql hg19 switchDbTssTemp hg19.bed

mysql> select count(*) from switchDbTssTemp;
+----------+
| count(*) |
+----------+
|   132332 | 
+----------+

hgsql -N hg19 -e "select distinct chrom, gmChromStart, gmChromEnd, gmName from switchDbTssTemp" > gmLoc.hg17.bed4
liftOver -tab -bedPlus=4 gmLoc.hg17.bed4 /gbdb/hg17/liftOver/hg17ToHg19.over.chain.gz gmLoc.hg19.bed4 gmUnMapped

cat gmUnMapped | grep '^#' | sort | uniq -c
#      1 #Deleted in new
#     58 #Partially deleted in new
#     57 #Split in new

hgLoadBed hg19 switchDbTssGmLocTemp gmLoc.hg19.bed4


hgsql hg19 < switchDbTss.sql

hgsql hg19 -e "insert into switchDbTss select a.bin, a.chrom, a.chromStart, a.chromEnd, a.name, a.score, a.strand, a.confScore, a.gmName, b.chromStart as gmChromStart, b.chromEnd as gmChromEnd, a.isPseudo from switchDbTssTemp a, switchDbTssGmLocTemp b where a.gmName = b.name"

mysql> select count(*) from switchDbTss;
+----------+
| count(*) |
+----------+
|   131780 | 
+----------+

hgsql hg19 -e "drop table switchDbTssTemp"
hgsql hg19 -e "drop table switchDbTssGmLocTemp"

#############################################################################

#############################################################################
# FOSMID END PAIRS LIFT FROM HG18 (DONE 2010-12-28, Andy)

mkdir /hive/data/genomes/hg19/bed/hg18MassiveLift/fosEndPairs
cd /hive/data/genomes/hg19/bed/hg18MassiveLift/fosEndPairs/
echo "select * from fosEndPairs" | hgsql hg18 | tail -n +2 | cut -f2- > hg18.fosEndPairs.fep.bed
echo "select * from all_fosends" | hgsql hg18 | tail -n +2 | cut -f2- > hg18.all_fosends.psl
# Converting to bed 12 because of the positional info in nonstandard fields.
# the awk script is pretty simple and is in the directory.
awk -f toBed12.awk hg18.fosEndPairs.fep.bed > hg18.fosEndPairs.bed12
liftOver -bedPlus=12 hg18.fosEndPairs.bed12 /gbdb/hg18/liftOver/hg18ToHg19.over.chain.gz hg19.fosEndPairs.{bed12,unmapped12}
liftOver -pslT hg18.all_fosends.psl /gbdb/hg18/liftOver/hg18ToHg19.over.chain.gz hg19.{psl,unmapped}
# remove pairs which have one (or both) ends not in the lifted set of all ends
cut -f14 hg19.fosEndPairs.bed12 | tr ',' '\n' | sort > hg19.fosEndPairs.names
cut -f10 hg19.all_fosends.psl | sort > hg19.all_fosends.names
grep -Fxv -f hg19.all_fosends.names hg19.fosEndPairs.names > bad.names
grep -Fv -f bad.names hg19.fosEndPairs.bed12 > hg19.fosEndPairs.good.bed12
wc -l hg19*.bed12
#  384635 hg19.fosEndPairs.bed12
#  384442 hg19.fosEndPairs.good.bed12
## so... 193 bad ones were removed.  These would have caused join errors if they were left in.

## convert back to fosEndPairs bed6+
awk -f toFosEndPairs.awk hg19.fosEndPairs.good.bed12 > hg19.fosEndPairs.fep.bed

cp ~/kent/src/hg/lib/fosEndPairs.sql .
hgLoadBed -sqlTable=fosEndPairs.sql -notItemRgb hg19 fosEndPairs hg19.fosEndPairs.fep.bed 
hgLoadPsl -table=all_fosends hg19 hg19.all_fosends.psl

wc -l *.fep.bed
#  386129 hg18.fosEndPairs.fep.bed
#  384442 hg19.fosEndPairs.fep.bed
## 99.6% lifted

## Now we need the sequences from all_fosends to be loaded into the seq table in hg19

mkdir /gbdb/hg19/fosends
ln -s /gbdb/hg18/fosends/fosEnds.fa /gbdb/hg19/fosEnds.fromHg18.fa 
hgLoadSeq hg19 /gbdb/hg19/fosends/fosEnds.fromHg18.fa

#############################################################################
# DECIPHER LIFT FROM HG18 (DONE 2010-12-27, Andy)

cd /hive/data/genomes/hg19/bed/hg18MassiveLift/
mkdir decipher
cd decipher/
hgsql -e "select * from decipherRaw" hg18 | tail -n +2 > hg18.decipherRaw.txt
cat hg18.decipherRaw.txt | awk 'BEGIN{FS="\t";OFS="\t"}{ chr="chr"$4; $4=$1; $1=chr; $2 = $2 - 1; print;}' | liftOver -bedPlus=4 -tab stdin /gbdb/hg18/liftOver/hg18ToHg19.over.chain.gz stdout hg19.decipherRaw.unmapped | sed 's/^chr//' | awk 'BEGIN{FS="\t";OFS="\t"}{ t=$1; $1=$4; $4=t; print;}' > hg19.decipherRaw.txt
cp ~/kent/src/hg/lib/decipherRaw.sql .
hgLoadSqlTab hg19 decipherRaw decipherRaw.sql hg19.decipherRaw.txt
hgsql -e "select * from decipher" hg18 | tail -n +2 | cut -f2- > hg18.decipher.bed
liftOver hg18.decipher.bed /gbdb/hg18/liftOver/hg18ToHg19.over.chain.gz hg19.decipher.{bed,unmapped}
cut -f1 hg19.decipherRaw.txt > hg19.decipherRaw.names
cut -f4 hg19.decipher.bed > hg19.decipher.names
# how many of the lifted deciphers are not in the lifted decipherRaws?
grep -Fvx -f hg19.decipher.names hg19.decipherRaw.names | wc -l
#0
# none. ok then, we are done.
rm *.names
wc -l *.bed
#  4227 hg18.decipher.bed
#  4048 hg19.decipher.bed
hgLoadBed hg19 decipher hg19.decipher.bed

#############################################################################
# CLONE COVERAGE LIFT FROM HG18 (DONE 2010-12-29, Andy)

cd /hive/data/genomes/hg19/bed/hg18MassiveLift/
mkdir clonePos
cd clonePos/
hgsql --skip-column-names -e "select * from clonePos" hg18 | awk 'BEGIN{FS="\t"; OFS="\t";}{print $4, $5, $6, $1, $2, $3, $7, $8;}' > hg18.clonePos.bed4p
liftOver -tab -bedPlus=4 hg18.clonePos.bed4p /gbdb/hg18/liftOver/hg18ToHg19.over.chain.gz hg19.clonePos.{bed4p,unmapped}
awk 'BEGIN{FS="\t"; OFS="\t";}{print $4, $5, $6, $1, $2, $3, $7, $8;}' hg19.clonePos.bed4p > hg19.clonePos.txt
hgLoadSqlTab hg19 clonePos clonePos.sql hg19.clonePos.txt
## it loaded but there seems to be a dependency on the "chr*_gl" tables.  sigh..
mkdir gl
cd gl/
echo "show tables like 'chr%_gl'" | hgsql hg18 | tail -n +2 | while read table; do echo "select * from "$table | hgsql hg18 | tail -n +2 | cut -f2- | awk -v chr=${table%_gl} 'BEGIN{OFS="\t"}{print chr, $2, $3, $1, "1000", $4;}'; done > hg18.gl.bed
liftOver hg18.gl.bed /gbdb/hg18/liftOver/hg18ToHg19.over.chain.gz hg19.gl.{bed,unmapped}
hgLoadBed -noLoad hg19 gl hg19.gl.bed
mv bed.tab hg19.gl.withBin.bed
awk 'BEGIN{OFS="\t";}{fname=$2"_gl.txt"; print $1, $5, $3, $4, $7 >> fname;}' hg19.gl.withBin.bed
cd ../
for tab in `echo show tables like "'chr%_gl'" | hgsql hg19 | tail -n +2`; do 
  echo select frag from $tab | hgsql hg19 | tail -n +2  >> gl.names; 
done
sed 's/_.*//' gl.names | sort | uniq > uniq.gl.names
cut -f4 hg19.clonePos.bed4p > hg19.clonePos.names
diff uniq.gl.names hg19.clonePos.names | grep '<' | sed 's/< //' > bad_gl.names
for f in chr*.txt; do 
   tab=${f%.txt}
   grep -v -f ../bad_gl.names $f > ${tab}.update.txt
   hgLoadSqlTab hg19 ${tab} hg18.chr1_gl.sql ${tab}.update.txt; 
done

#############################################################################
# MGI MOUSE QTL LIFT (DONE 2010-12-30, Andy)

cd /hive/data/genomes/hg19/bed/hg18MassiveLift
mkdir mgiMouseQtl
cd mgiMouseQtl/
## There are two subtracks to deal with but it's not a big deal.
## Both are bed4.
for tab in jaxQtlAsIs jaxQtlPadded; do
   hgsql hg18 --skip-column-names -e "select chrom,chromStart,chromEnd,name from "$tab > hg18.${tab}.bed
   liftOver hg18.${tab}.bed /gbdb/hg18/liftOver/hg18ToHg19.over.chain.gz hg19.${tab}.{bed,unmapped}
   hgLoadBed hg19 $tab hg19.${tab}.bed
done
wc -l *.bed
#    398 hg18.jaxQtlAsIs.bed
#   1463 hg18.jaxQtlPadded.bed
#    383 hg19.jaxQtlAsIs.bed
#   1462 hg19.jaxQtlPadded.bed
## 96.2% for jaxQtlAsIs, 99.9% for jaxQtlPadded.

#############################################################################
# H-INV LIFT (DONE 2010-12-30, Andy)

cd /hive/data/genomes/hg19/bed/hg18MassiveLift
mkdir HInv
cd HInv/
hgsql hg18 --skip-column-names -e "select * from HInvGeneMrna" | cut -f2- > hg18.HInvGeneMrna.psl
liftOver -pslT hg18.HInvGeneMrna.psl /gbdb/hg18/liftOver/hg18ToHg19.over.chain.gz hg19.HInvGeneMrna.{psl,unmapped}
hgLoadPsl -table=HInvGeneMrna hg19 hg19.HInvGeneMrna.psl
## A couple non-positional tables too:
hgsqldump hg18 knownToHInv | hgsql hg19
hgsqldump hg18 HInv | hgsql hg19

#############################################################################
# SIB ALT-SPLICING LIFT (DONE 2010-12-30, Andy)
# Note: 
# Obsolete. See the section "SIB Transcriptome (DONE 2011-12-02 Chin)" 
# down below and redmine track #5630 for more details

cd /hive/data/genomes/hg19/bed/hg18MassiveLift
mkdir sibTxGraph
cd sibTxGraph/
## there is positional data in the 9th column, so I'll convert it to a bed12+
## and back to the native format later.  Again, the awk scripts to do that
## are aptly named, and reside in the hg19 directory with the data. 
hgsql hg18 --skip-column-names -e "select * from sibTxGraph" | cut -f2- | awk -f toBed12Plus.awk > hg18.sibTxGraph.bed12p
liftOver -tab -bedPlus=12 hg18.sibTxGraph.bed12p /gbdb/hg18/liftOver/hg18ToHg19.over.chain.gz hg19.sibTxGraph.{bed12p,unmapped}
awk -f toSib.awk hg19.sibTxGraph.bed12p > hg19.sibTxGraph.txt
cut -f1-3 hg19.sibTxGraph.bed12p > hg19.sibTxGraph.bed3
hgLoadBed -noLoad hg19 sibTxGraph hg19.sibTxGraph.bed3
cut -f1 bed.tab > hg19.sibTxGraph.bins
paste hg19.sibTxGraph.bins hg19.sibTxGraph.txt > hg19.sibTxGraph.withBin.txt
## Oddly, there's no .sql file for this in the kent source-tree, so I'll 
## make one directly from hg18 that's suitable for hgLoadSqlTab
hgsqldump --no-data --compact hg18 sibTxGraph | sed '/^SET/d;s/ENGINE.*//'  > sibTxGraph.sql
hgLoadSqlTab hg19 sibTxGraph sibTxGraph.sql hg19.sibTxGraph.withBin.txt

wc -l *.bed12p
#    47094 hg18.sibTxGraph.bed12p
#    47008 hg19.sibTxGraph.bed12p
## 99.8% lifted, not bad.

#############################################################################
# ILLUMINA WG-6 LIFT TO HG19 (DONE 2010-12-30, Andy)

cd /hive/data/genomes/hg19/bed/hg18MassiveLift
mkdir illuminaProbes
cd illuminaProbes/
## Just copy the seq table to hg19
hgsqldump hg18 illuminaProbesSeq | hgsql hg19
## Two tables: a PSL and a BED:
hgsql hg18 --skip-column-names -e "select * from illuminaProbes" | cut -f2- > hg18.illuminaProbes.bed
hgsql hg18 --skip-column-names -e "select * from illuminaProbesAlign" | cut -f2- > hg18.illuminaProbesAlign.psl
liftOver hg18.illuminaProbes.bed /gbdb/hg18/liftOver/hg18ToHg19.over.chain.gz hg19.illuminaProbes.{bed,unmapped}
liftOver -pslT hg18.illuminaProbesAlign.psl /gbdb/hg18/liftOver/hg18ToHg19.over.chain.gz hg19.illuminaProbesAlign.{psl,unmapped}
hgLoadBed hg19 illuminaProbes hg19.illuminaProbes.bed
hgLoadPsl -table=illuminaProbesAlign hg19 hg19.illuminaProbesAlign.psl
## Just to check the probes and align tables are essentially the same
cut -f4 hg19.illuminaProbes.bed | sort > hg19.illuminaProbes.names
cut -f10 hg19.illuminaProbesAlign.psl | sort > hg19.illuminaProbesAlign.names
diff *.names
#(no output)
wc -l *.bed *.psl
#   44163 hg18.illuminaProbes.bed
#   44088 hg19.illuminaProbes.bed
#   44163 hg18.illuminaProbesAlign.psl
#   44088 hg19.illuminaProbesAlign.psl
## 99.8% lifted

#############################################################################
# EIO/JCVI NAS LIFT TO HG19 (DONE 2010-12-30, Andy)

cd /hive/data/genomes/hg19/bed/hg18MassiveLift
mkdir eioJcviNAS
cd eioJcviNAS/
for tab in eioJcviNASNeg eioJcviNASPos; do
   hgsql hg18 --skip-column-names -e "select * from "$tab > hg18.${tab}.bed 
   liftOver hg18.${tab}.bed /gbdb/hg18/liftOver/hg18ToHg19.over.chain.gz hg19.${tab}.{bed,unmapped}
   hgLoadBed hg19 ${tab} hg19.${tab}.bed
done
wc -l *.bed 
#  338278 hg18.eioJcviNASNeg.bed
#  130535 hg18.eioJcviNASPos.bed
#  338238 hg19.eioJcviNASNeg.bed
#  130504 hg19.eioJcviNASPos.bed
## > 99.9% of items lifted in both tables: pretty good.
## One strange thing about this one is that the hg18 tables don't have a bin
## field.  I doubt it's important to keep it that way.

#############################################################################
# ORegAnno LIFT TO HG19 (DONE 2010-12-31, Andy)

cd /hive/data/genomes/hg19/bed/hg18MassiveLift
mkdir oreganno
cd oreganno
hgsql hg18 --skip-column-names -e "select * from oreganno" | cut -f2- > hg18.oreganno.bed3p
liftOver -bedPlus=3 -tab hg18.oreganno.bed3p /gbdb/hg18/liftOver/hg18ToHg19.over.chain.gz hg19.oreganno.{bed3p,unmapped}
cp ~/kent/src/hg/lib/oreganno.sql .
hgLoadBed -tab -sqlTable=oreganno.sql hg19 oreganno hg19.oreganno.bed3p
hgsqldump hg18 oregannoLink | hgsql hg19
hgsqldump hg18 oregannoAttr | hgsql hg19
wc -l *.bed3p
#  23130 hg18.oreganno.bed3p
#  23118 hg19.oreganno.bed3p
## 99.9% lifted.

#############################################################################
# NK NUC LAMINA LIFT TO HG19 (DONE 2010-12-31, Andy)

cd /hive/data/genomes/hg19/bed/hg18MassiveLift
mkdir laminB1
cd laminB1/
hgsql hg18 --skip-column-names -e "select * from laminB1Lads" | cut -f2- > hg18.laminB1Lads.bed
liftOver hg18.laminB1Lads.bed /gbdb/hg18/liftOver/hg18ToHg19.over.chain.gz hg19.laminB1Lads.{bed,unmapped}
hgLoadBed hg19 laminB1Lads hg19.laminB1Lads.bed
wc -l *.bed
# 1344 hg18.laminB1Lads.bed
# 1302 hg19.laminB1Lads.bed
## 96.9% lifted... ok, not bad I guess.

ln -s /hive/data/genomes/hg18/bed/nuclearLamina/LaminB1_080513.wig hg18.laminB1.customWigVarStep
awk -f toBedGraph.awk hg18.laminB1.customWigVarStep > hg18.laminB1.bg
liftOver -bedPlus=3 -tab hg18.laminB1.bg /gbdb/hg18/liftOver/hg18ToHg19.over.chain.gz hg19.laminB1.{bg,unmapped}
awk '{if ($3 - $2 == 60) print;}' hg19.laminB1.bg | sort -k1,1 -k2,2n | awk 'BEGIN{prev=-100;chrom="";FS="\t";OFS="\t";}{ if ((chrom != $1) || ($2 - prev > 60)) {print; chrom = $1; prev = $2;}}'> hg19.laminB1.span60.bg
wigBedToStep hg19.laminB1.span60.bg hg19.laminB1.span60.wigVarStep
ln -s hg19.laminB1.span60.wigVarStep laminB1.wig
wigEncode laminB1.{wig,wiggle,wib}
#Converted laminB1.wig, upper limit 5.68, lower limit -6.60
ln -s `pwd`/laminB1.wib /gbdb/hg19/wib/laminB1.wib
hgLoadWiggle hg19 laminB1 laminB1.wiggle
wc -l hg18.laminB1.bg hg19.laminB1.span60.bg
#  2909178 hg18.laminB1.bg
#  2908692 hg19.laminB1.span60.bg
## In total, 99.98% of the datapoints lifted cleanly.

#############################################################################
# UCSF BRAIN METHYLATION (DONE 2010-12-31, Andy)

cd /hive/data/genomes/hg19/bed/hg18MassiveLift
mkdir ucsfBrainMethyl
cd ucsfBrainMethyl/

## 10 tables:
##
## ucsfChipSeqH3K4me3BrainCoverage    (bedGraph/bed3+)
## ucsfMreSeqBrainReads               (bed9)
## ucsfMreSeqBrainCpG                 (bedGraph/bed3+)
## ucsfMedipSeqBrainReads             (bed9)
## ucsfMedipSeqBrainCpG               (bedGraph/bed3+)
## ucsfMedipSeqBrainCoverage          (bedGraph/bed3+)
## ucsfRnaSeqBrainAllReads            (bed9)
## ucsfRnaSeqBrainAllCoverage         (bedGraph/bed3+)
## ucsfRnaSeqBrainSmartReads          (bed9)
## ucsfRnaSeqBrainSmartCoverage       (bedGraph/bed3+)

## Do the bed9s first:

for tab in ucsfMreSeqBrainReads ucsfMedipSeqBrainReads ucsfRnaSeqBrainAllReads ucsfRnaSeqBrainSmartReads; do
   hgsql hg18 --skip-column-names -e "select * from "$tab | cut -f2- > hg18.${tab}.bed
   liftOver hg18.${tab}.bed /gbdb/hg18/liftOver/hg18ToHg19.over.chain.gz hg19.${tab}.{bed,unmapped}
   hgLoadBed hg19 $tab hg19.${tab}.bed
   wc -l *.${tab}.bed
done

#   10110644 hg18.ucsfMreSeqBrainReads.bed
#   10109979 hg19.ucsfMreSeqBrainReads.bed
## 99.99% lifted
#   44130143 hg18.ucsfMedipSeqBrainReads.bed
#   44120612 hg19.ucsfMedipSeqBrainReads.bed
## 99.98% lifted
#   63033692 hg18.ucsfRnaSeqBrainAllReads.bed
#   63031432 hg19.ucsfRnaSeqBrainAllReads.bed
#   26767318 hg18.ucsfRnaSeqBrainSmartReads.bed
#   26766288 hg19.ucsfRnaSeqBrainSmartReads.bed
## getting old now, we get it... it lifts.
    
for tab in ucsfChipSeqH3K4me3BrainCoverage ucsfMreSeqBrainCpG ucsfMedipSeqBrainCpG ucsfMedipSeqBrainCoverage ucsfRnaSeqBrainAllCoverage ucsfRnaSeqBrainSmartCoverage; do
   hgsql hg18 --skip-column-names -e "select * from "$tab | cut -f2- > hg18.${tab}.bg
   liftOver -bedPlus=3 hg18.${tab}.bg /gbdb/hg18/liftOver/hg18ToHg19.over.chain.gz hg19.${tab}.{bg,unmapped}
   hgLoadBed -bedGraph=4 hg19 $tab hg19.${tab}.bg
   wc -l *.${tab}.bg
done

#  2598517 hg18.ucsfChipSeqH3K4me3BrainCoverage.bg
#  2598085 hg19.ucsfChipSeqH3K4me3BrainCoverage.bg
#  1165599 hg18.ucsfMreSeqBrainCpG.bg
#  1165521 hg19.ucsfMreSeqBrainCpG.bg
# 20862283 hg18.ucsfMedipSeqBrainCpG.bg
# 20859033 hg19.ucsfMedipSeqBrainCpG.bg
# 80960101 hg18.ucsfMedipSeqBrainCoverage.bg
# 80943454 hg19.ucsfMedipSeqBrainCoverage.bg
# 17019268 hg18.ucsfRnaSeqBrainAllCoverage.bg
# 17017461 hg19.ucsfRnaSeqBrainAllCoverage.bg
#  6141663 hg18.ucsfRnaSeqBrainSmartCoverage.bg
#  6140890 hg19.ucsfRnaSeqBrainSmartCoverage.bg
## again in each case, almost all the data in the table lifts.

for f in *; do gzip $f; echo $f zipped; done

## One more thing: remove overlapping items in lifted bedGraphs:

for f in hg19*.bg.gz; do 
    pre=${f%.bg.gz};
    tab=${pre#hg19.};
    echo $tab;
    gunzip -c $f | sort -k1,1 -k2,2n | bedGraphLegalize -report=${pre}.bad.txt stdin stdout | gzip -c > ${pre}.legal.bg.gz
    hgLoadBed -bedGraph=4 hg19 $tab ${pre}.legal.bg.gz
done

#############################################################################
# SNP ARRAYS LIFT TO HG19 (DONE 2010-12-31, Andy)

cd /hive/data/genomes/hg19/bed/hg18MassiveLift
mkdir snpArray
cd snpArray/

## 12 arrays:
## 
## snpArrayAffy6                   (bed6+, 8 fields)
## snpArrayAffy6SV                 (bed6)
## snpArrayAffy5                   (bed6+, 8 fields)
## snpArrayAffy250Nsp              (bed6+, 8 fields)
## snpArrayAffy250Sty              (bed6+, 8 fields)
## snpArrayIllumina650             (bed6+, 7 fields)
## snpArrayIllumina550             (bed6+, 7 fields)
## snpArrayIllumina300             (bed6+, 7 fields)
## snpArrayIllumina1M              (bed6+, 7 fields)
## snpArrayIlluminaHumanCytoSNP_12 (bed6+, 7 fields)
## snpArrayIlluminaHuman660W_Quad  (bed6+, 7 fields)
## snpArrayIlluminaHumanOmni1_Quad (bed6+, 7 fields)

## Get the bed6 one out of the way first:

hgsql hg18 --skip-column-names -e "select * from snpArrayAffy6SV" | cut -f2- > hg18.snpArrayAffy6SV.bed
liftOver hg18.snpArrayAffy6SV.bed /gbdb/hg18/liftOver/hg18ToHg19.over.chain.gz hg19.snpArrayAffy6SV.{bed,unmapped}
hgLoadBed hg19 snpArrayAffy6SV hg19.snpArrayAffy6SV.bed
wc -l *.bed
#  945805 hg18.snpArrayAffy6SV.bed
#  945615 hg19.snpArrayAffy6SV.bed

## The rest each may or may not have their own module in lib
## For simplicity sake, I'll just dump the CREATEs straight from
## hg18 into their own .sql file.

for table in snpArrayAffy6 snpArrayAffy5 snpArrayAffy250Nsp snpArrayAffy250Sty snpArrayIllumina650 snpArrayIllumina550 snpArrayIllumina300 snpArrayIllumina1M snpArrayIlluminaHumanCytoSNP_12 snpArrayIlluminaHuman660W_Quad snpArrayIlluminaHumanOmni1_Quad; do
   hgsql hg18 --skip-column-names -e "select * from "$table | cut -f2- > hg18.${table}.bed6p
   hgsqldump --no-data --compact hg18 $table | sed '/^SET/d;s/ENGINE.*//' > hg18.${table}.sql
   liftOver -bedPlus=6 hg18.${table}.bed6p /gbdb/hg18/liftOver/hg18ToHg19.over.chain.gz hg19.${table}.{bed6p,unmapped}
   hgLoadBed -sqlTable=hg18.${table}.sql -renameSqlTable hg19 $table hg19.${table}.bed6p
done
for table in snpArrayAffy6 snpArrayAffy5 snpArrayAffy250Nsp snpArrayAffy250Sty snpArrayIllumina650 snpArrayIllumina550 snpArrayIllumina300 snpArrayIllumina1M snpArrayIlluminaHumanCytoSNP_12 snpArrayIlluminaHuman660W_Quad snpArrayIlluminaHumanOmni1_Quad; do
   hg18=`wc -l hg18.${table}.bed6p | awk '{print $1}'`
   hg19=`wc -l hg19.${table}.bed6p | awk '{print $1}'`
   perc=`echo ${hg19}"/"${hg18}" * 100" | R --vanilla | grep "\[1\]" | awk '{print $2}'`
   printf "%s: %d/%d items lifted (%.3f%%)\n" $table $hg19 $hg18 $perc
done
# snpArrayAffy6: 909297/909508 items lifted (99.977%)
# snpArrayAffy5: 440638/440734 items lifted (99.978%)
# snpArrayAffy250Nsp: 257159/257213 items lifted (99.979%)
# snpArrayAffy250Sty: 233887/233941 items lifted (99.977%)
# snpArrayIllumina650: 660388/660557 items lifted (99.974%)
# snpArrayIllumina550: 560972/561122 items lifted (99.973%)
# snpArrayIllumina300: 318046/318117 items lifted (99.978%)
# snpArrayIllumina1M: 1217520/1219961 items lifted (99.800%)
# snpArrayIlluminaHumanCytoSNP_12: 302127/302402 items lifted (99.909%)
# snpArrayIlluminaHuman660W_Quad: 664655/665901 items lifted (99.813%)
# snpArrayIlluminaHumanOmni1_Quad: 1169872/1175447 items lifted (99.526%)

## Now there's a few "Raw" tables to lift. Convert them to bed3+ first:

for tab in `echo show tables like "'snpArray%Raw'" | hgsql hg18 | tail -n +2`; do 
  hgsql hg18 --skip-column-names -e "select * from "$table | awk 'BEGIN{FS="\t";OFS="\t"}{print "chr"$6, $7 - 1, $7, $1, $2, $3, $4, $5}' > hg18.${table}.bed3p
  liftOver -bedPlus=3 -tab hg18.${table}.bed3p /gbdb/hg18/liftOver/hg18ToHg19.over.chain.gz hg19.${table}.{bed3p,unmapped}
  awk 'BEGIN{FS="\t";OFS="\t"}{print $4, $5, $6, $7, $8, substr($1, 4), $3;}' hg19.${table}.bed3p > hg19.${table}.txt
  hgsqldump --no-data --compact hg18 $table | sed '/^SET/d;s/ENGINE.*//' > hg18.${table}.sql
  hgLoadSqlTab hg19 $table hg18.${table}.sql hg19.${table}.txt
done

############################################################################
# HAPMAP SNPS AND HAPMAP LD PHASED LIFTS FROM HG18 (Andy)

mkdir /hive/data/genomes/hg19/bed/hg18MassiveLift/hapmapSnps
cd /hive/data/genomes/hg19/bed/hg18MassiveLift/hapmapSnps

## All the tables in the trackDb entry seem to be bed 6 +
for table in `grep -B1 "parent hapmapSnps" ~/kent/src/hg/makeDb/trackDb/human/trackDb.ra | grep track | sed 's/.*track\ //'`; do
   echo $table >> tables.txt;
done
for table in `cat tables.txt`; do 
   hgsql hg18 --skip-column-names -e "select * from "$table | cut -f2- > hg18.${table}.bed6p;
   liftOver -bedPlus=6 -tab hg18.${table}.bed6p /gbdb/hg18/liftOver/hg18ToHg19.over.chain.gz hg19.${table}.{bed6p,unmapped};
   hgsqldump --no-data --compact hg18 $table | sed '/^SET/d;s/ENGINE.*//' > ${table}.sql
   hgLoadBed -sqlTable=${table}.sql -tab hg19 $table hg19.${table}.bed6p;
   wc -l hg1{8,9}.${table}.bed6p >> lifts.txt
done
## Also need hapmapLd% and hapmapAllelesSummary
for table in `hgsql hg18 --skip-column-names -e "show tables like 'hapmapLd%'"` hapmapAllelesSummary; do
   hgsql hg18 --skip-column-names -e "select * from "$table | cut -f2- > hg18.${table}.bed6p;
   liftOver -bedPlus=6 -tab hg18.${table}.bed6p /gbdb/hg18/liftOver/hg18ToHg19.over.chain.gz hg19.${table}.{bed6p,unmapped};
   hgsqldump --no-data --compact hg18 $table | sed '/^SET/d;s/ENGINE.*//' > ${table}.sql
   hgLoadBed -sqlTable=${table}.sql -tab hg19 $table hg19.${table}.bed6p;
   wc -l hg1{8,9}.${table}.bed6p >> lifts.txt
done
## Also need hapmapPhaseIIISummary
hgsql hg18 --skip-column-names -e "select * from hapmapPhaseIIISummary" | cut -f2- > hg18.hapmapPhaseIIISummary.bed5p
liftOver -bedPlus=5 hg18.hapmapPhaseIIISummary.bed5p /gbdb/hg18/liftOver/hg18ToHg19.over.chain.gz hg19.hapmapPhaseIIISummary.{bed5p,unmapped}
hgsqldump --no-data --compact hg18 hapmapPhaseIIISummary | sed '/^SET/d;s/ENGINE.*//' > hapmapPhaseIIISummary.sql
hgLoadBed -sqlTable=hapmapPhaseIIISummary.sql hg19 hapmapPhaseIIISummary hg19.hapmapPhaseIIISummary.bed5p


############################################################################
# INDEL-BASED CONSERVATION TRACK liftOver to hg19 (DONE - 2010-12-21 - Chin )
# Data from the Gerton Lunter (gerton.lunter at anat.ox.ac.uk), MRC
# Functional Genetics Unit, University of Oxford, United Kingdom.
# Data is from the paper:
# Lunter G, Ponting CP and Hein J Genome-wide identification of human
# functional DNA using a neutral indel model. PLoS Comput Biol. 2006
# Jan;2(1):e5.
    mkdir -p /hive/data/genomes/hg19/bed/consIndels/data
    cd /hive/data/genomes/hg19/bed/consIndels
    cp /hive/data/genomes/hg18/bed/consIndels/README.indels .
    cp /hive/data/genomes/hg18/bed/consIndels/igs-hg18mm8cf2.zip .
    # 38 Mb zip file in GFF format. This contains data for hg18
    # comparing it to mm8 and cf2 (canFam2).
    unzip igs-hg18mm8cf2.zip
    mv *.gff ./data/
    cd /hive/data/genomes/hg19/bed/consIndels/data    

    for f in *.gff
    do
       echo processing $f
       grep -v "track" $f >> ../allNoHeader.tmp
    done

    cd /hive/data/genomes/hg19/bed/consIndels/	
    cat allNoHeader.tmp | \
    awk '{print $1,$4,$5,$6,$9,$10,$11}' > consIndels.bed7p       

    liftOver -bedPlus=3 consIndels.bed7p \
      /hive/data/genomes/hg18/bed/liftOver/hg18ToHg19.over.chain.gz \
      consIndelsHg19Mm8CanFam2.bed7p unMapped

    wc -l *.bed7p
    #  2603017 consIndels.bed7p
    #  2602701 consIndelsHg19Mm8CanFam2.bed7p
    grep -i split unMapped | wc -l
    # 41
    grep -i partially unMapped | wc -l
    # 66
    rm allNoHeader.tmp
    rm consIndels.bed7p
   
    # strip off the end of the name e.g. IGS0001.1:p=.74; FDR 0.27
    # so that the name displayed is short - IGS0001.1. The score field
    # is used to determine colouring and this is calculated from FDR
    cd /cluster/data/hg19/bed/consIndels
    cat  consIndelsHg19Mm8CanFam2.bed7p | 
      awk '{print $1,$2, $3, $5, $4}' | 
      sed -e 's/:p[=<]\.[0-9][0-9]*;//' \
       > consIndelsHg19Mm8CanFam2.bed

    # load data
    cd /hive/data/genomes/hg19/bed/consIndels
    hgLoadBed hg19 consIndelsHgMmCanFam consIndelsHg19Mm8CanFam2.bed
    # Reading consIndelsHg19Mm8CanFam2.bed
    # Loaded 2602701 elements of size 5
    # Sorted
    # Creating table definition for consIndelsHgMmCanFam
    # Saving bed.tab
    # Loading hg19

    # Get the IDs, posterior probabilities (p) for the segment being
    # neutral,
    # and the FDR from the original GFFs for a separate table. Some
    # items
    # have p<.001. Can not do Table Browser queries restricting
    # p to <, =, or > a specified value unless all values are floats.
    # Contacted the data contributor, Gerton Lunter, and he said it
    # would be
    # ok to change all p<.001 to p=0.0005
    cd /hive/data/genomes/hg19/bed/consIndels/
    cat consIndelsHg19Mm8CanFam2.bed7p \
      | awk 'BEGIN {FS="\t"} {print $5, $6, $7}'  \
      | sed -e 's/:/\t/' \
      | sed -e 's/p=\./0\./' | sed -e 's/p<\.001/0\.0005/' \
      | sed -e 's/;\sFDR/\t/' > consIndelsHg19Mm8CanFam2Conf.txt

    # there are no GFF files for the haplotype chroms
    # Reuse $HOME/kent/src/hg/lib/itemConf.* for the table of identifier,
    # posterior probability and false discovery rate (FDR).
    
    cd /hive/data/genomes/hg19/bed/consIndels
    hgLoadSqlTab hg19 consIndelsHgMmCanFamConf \
         $HOME/kent/src/hg/lib/itemConf.sql \
         consIndelsHg19Mm8CanFam2Conf.txt

    # check that all itesm are in this table.
    hgsql -N -e 'select distinct(name) from consIndelsHgMmCanFam;' hg19 \
         | sort > consIndels.names.sort
    hgsql -N -e 'select distinct(id) from consIndelsHgMmCanFamConf;' hg19 \
         | sort > consIndels.idsfromConf.sort
    wc -l *.sort
    # 2602701 consIndels.idsfromConf.sort
    # 2602701 consIndels.names.sort

    comm -12 consIndels.names.sort consIndels.idsfromConf.sort | wc -l
    # 2602701
    # so all element IDs are in both tables.
    # cleanup
    rm ./data/*.bak *.sort

    # add trackDb/human/hg19/trackDb.ra entry and add description that
    # was written by the data contributor. Add code to hgc.c to display
    # the posterior probability and the FDR on the details page for
    # track elements. Gerton Lunter provided a description for the data
    # on 2007-09-12.
    # Add hg19 to the "identifier consIndelsId" in all.joiner.


############################################################################
# POLYA_DB TRACK (DONE 2011-01-04 - Chin)
#

    # Data files and program:
    # "Bin Tian" <btian@umdnj.edu> provided the following two 
    # data files at /hive/data/outside/polyA:
    #   hg18.polyadb.bed  hg19.polyadb.bed
    # Andy found the SVM program he used before, and in here:
    #  /hive/data/genomes/hg18/bed/polyaDB/polya_svm_2.2.tar.gz
    # Copy it to /hive/data/outside/polyA. Unzip the program to 
    # polya_svm_2.2

    mkdir /hive/data/genomes/hg19/bed/polyaDB
    cd /hive/data/genomes/hg19/bed/polyaDB
    cp /hive/data/outside/polyA/hg19.polyadb.bed .
    hgLoadBed hg19 polyaDb hg19.polyadb.bed
    # add trackDb entry in human/hg19
    # polyA.html is at top human level 

    # since hg19.polyadb.bed provided is lifted over from hg18.polyadb.bed,
    # it it safe to lift polyaPredict table from hg18 to hg19 without re-run
    # the svm.
    hgsql -N -e "select * from polyaPredict;" hg18 | \
      cut -f2-9 > hg18.polyaPredict.bed
    wc -l hg18.polyaPredict.bed
    # 52182 hg18.polyaPredict.bed

    liftOver -bedPlus=8 hg18.polyaPredict.bed \
      /hive/data/genomes/hg18/bed/liftOver/hg18ToHg19.over.chain.gz \
      hg19.polyaPredict.bed unMapped.polyaPrdict
    wc -l hg19.polyaPredict.bed
    # 52169 hg19.polyaPredict.bed
    hgLoadBed hg19 polyaPredict hg19.polyaPredict.bed
    # Reading hg19.polyaPredict.bed
    # Loaded 52169 elements of size 8
    # Sorted
    # Creating table definition for polyaPredict
    # Saving bed.tab
    # Loading hg19


#############################################################################
# FILTER SNP132 (DONE 3/8/11 angie)
# 4/8/11: changing table names to be consistent with shortLabel:
#         snp132Patient -> snp132Flagged
#         snp132NonUnique -> snp132Mult
# 3/8/11: redone after snp132 with tweaked weights in PARs, see above
# Previously done 1/24/11 after snp132
    # Redmine: Track #1684 (SNPs 132 (dbSNP))
    # Make several tracks that are filtered subsets of snp132:
    # First, filter out the multiply-aligned and/or weight >1 SNPs [any other exceptions?]
    cd /hive/data/outside/dbSNP/132/human
    zcat snp132.parWeightTweak.bed.gz \
    | perl -we \
      'open($mult, "| gzip -c > snp132Mult.bed.gz") || die; \
       open($common,    "| gzip -c > snp132Common.bed.gz") || die; \
       open($flagged,   "| gzip -c > snp132Flagged.bed.gz") || die; \
       open($misc,      "| gzip -c > snp132Misc.bed.gz") || die; \
       while (<>) { \
         @w = split("\t"); \
         if ($w[16] > 1 || $w[17] =~ /MultipleAlignments/) { \
           print $mult $_; \
         } else { \
           my ($alleleFreqCount, $nStr, $freqStr) = ($w[20], $w[22], $w[23]); \
           my @alNs = split(",", $nStr);      die unless scalar(@alNs) == $alleleFreqCount; \
           my @freqs = split(",", $freqStr);  die unless scalar(@freqs) == $alleleFreqCount; \
           my ($total2N, $maxAlleleFreq) = (0, 0); \
           for (my $i = 0;  $i < $alleleFreqCount;  $i++) { \
             $total2N += $alNs[$i]; \
             $maxAlleleFreq = $freqs[$i] if ($freqs[$i] > $maxAlleleFreq); \
           } \
           if ($alleleFreqCount >= 2 && $total2N >= 4 && $maxAlleleFreq <= 0.99) { \
             print $common $_; \
           } elsif($w[24] =~ /clinically-assoc/)  { \
             print $flagged $_; \
           } else { \
             print $misc $_; \
           } \
         } \
       } \
       close($mult);  close($common); close($flagged);  close($misc);'
    zcat snp132Mult.bed.gz | wc -l
#3568988
    zcat snp132Common.bed.gz | wc -l
#14024295
    zcat snp132Flagged.bed.gz | wc -l
#18084
    zcat snp132Misc.bed.gz | wc -l
#15414754

    # Load tables
    foreach subset (Mult Common Flagged Misc)
      hgLoadBed -tab -onServer -tmpDir=/data/tmp -allowStartEqualEnd -renameSqlTable \
        hg19 snp132$subset -sqlTable=snp132.sql snp132$subset.bed.gz
    end


#############################################################################
# BUILD DECIPHER WITH NEW HG19 RELEASE (Done Fan, 2/8/11)

# The decipher track is built by an automated process.  The following two scripts:

#   kent/src/utils/decipher/checkDecipher.sh
#   kent/src/utils/decipher/buildDecipher 

#are used to automatically detect update on DECIPHER ftp sites and then
#download and build the decipher track.

# checkDecipher.sh is invoked by a cron job, it will call buildDecipher to build 
# the decipher track after the new data are downloaded.

#############################################################################
# GRC Incident database (DONE - 2011-02-10 - Hiram)
    # used to be NCBI Incident - changed to GRC Incident 2012-04-12
    # this procedure is run as a cron job in Hiram's account:

    #	43 09 * * * /hive/data/outside/grc/incidentDb/runUpdate.sh makeItSo

    # using the two scrips there: runUpdate.sh and update.sh
    # which are checked into the source tree as files:
    #	src/hg/utils/automation/grcIncidentUpdate.sh
    #	src/hg/utils/automation/grcRunIncidentUpdate.sh

    # they fetch the XML files from NCBI, convert them to SQL text
    # files, construct a bigBed file, and pushes it to genomewiki if
    # it is an update from previous

    # the table in the dataBase is: grcIncidentDb
    # which is the URL to the bb file, a single row:
    # http://genomewiki.ucsc.edu/images/6/67/Hg19.grcIncidentDb.bb

#############################################################################
# UNIGENE/SAGE TRACK (RE-BUILT - 2011-02-22 Fan)

# Create the uniGene alignments

    # Download of the latest UniGene version is now automated by a
    # cron job -- see /cluster/home/angie/crontab ,
    # /cluster/home/angie/unigeneVers/unigene.csh .

    ssh hgwdev
    mkdir -p /hive/data/genomes/hg19/bed/uniGene/022211
    cd /hive/data/genomes/hg19/bed/uniGene/022211

    set Version = 229

    zcat /hive/data/outside/uniGene/uniGene.$Version/Hs.seq.uniq.gz|\
    sed -e "s#>.*/ug=#>#; s# /len.*##;" > Hs.seq.uniq.simpleHeader.fa

    ssh swarm
    set Version = 229
    mkdir -p /hive/data/genomes/hg19/bed/uniGene/022211/run.blat
    cd /hive/data/genomes/hg19/bed/uniGene/022211/run.blat

    ls -1 /hive/data/genomes/hg19/nib/*.nib > genome.lst
    ls -1S \
    /hive/data/genomes/hg19/bed/uniGene/022211/Hs.seq.uniq.simpleHeader.fa \
      > uniGene.lst
    
    cat << '_EOF_' > template.sub
#LOOP
/cluster/bin/x86_64/blat -repeats=lower -minIdentity=95 ooc=/hive/data/genomes/hg19/11.ooc $(path1) $(path2)  {check out line+ psl/$(root1)_$(root2).psl}
#ENDLOOP
'_EOF_'

    gensub2 genome.lst uniGene.lst template.sub para.spec
    para create para.spec
    mkdir psl
    para try
    para check
    para push

#Completed: 93 of 93 jobs
#CPU time in finished jobs:      67404s    1123.41m    18.72h    0.78d  0.002 y
#IO & Wait Time:                  5838s      97.29m     1.62h    0.07d  0.000 y
#Average job time:                 788s      13.13m     0.22h    0.01d
#Longest finished job:            5228s      87.13m     1.45h    0.06d
#Submission to last job:          5320s      88.67m     1.48h    0.06d
#Estimated complete:                 0s       0.00m     0.00h    0.00d

    pslSort dirs raw.psl tmp psl >& pslSort.log
    cat raw.psl|\
    pslReps -minCover=0.2 -sizeMatters -minAli=0.965 -nearTop=0.002 \
      stdin hg19.uniGene.pslReps.psl /dev/null
    
    gzip raw.psl

    ssh hgwdev
    cd /hive/data/genomes/hg19/bed/uniGene/022211/run.blat

    hgLoadPsl -table=uniGene_3 hg19 hg19.uniGene.pslReps.psl

    mkdir -p /gbdb/hg19/uniGene
    cd /gbdb/hg19/uniGene

    rm Hs.seq.uniq.simpleHeader.fa
    ln -s \
    /hive/data/genomes/hg19/bed/uniGene/022211/Hs.seq.uniq.simpleHeader.fa \
    Hs.seq.uniq.simpleHeader.fa

# load the sequence

    hgLoadSeq -replace hg19 /gbdb/hg19/uniGene/Hs.seq.uniq.simpleHeader.fa

##############################################################################
# Chimp Lastz run (DONE - 2011-02-23 - Hiram)
    mkdir /hive/data/genomes/hg19/bed/lastzPanTro3.2011-02-22
    cd /hive/data/genomes/hg19/bed/lastzPanTro3.2011-02-22
    cat << '_EOF_' > DEF
# human vs chimp
BLASTZ=lastz
# maximum M allowed with lastz is only 254
BLASTZ_M=254
BLASTZ_Q=/scratch/data/blastz/human_chimp.v2.q
BLASTZ_O=600
BLASTZ_E=150
BLASTZ_K=4500
BLASTZ_Y=15000
BLASTZ_T=2

# TARGET: Human Hg19
SEQ1_DIR=/scratch/data/hg19/hg19.2bit
SEQ1_LEN=/scratch/data/hg19/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
SEQ1_IN_CONTIGS=0

# QUERY: Chimp PanTro3
SEQ2_DIR=/scratch/data/panTro3/panTro3.2bit
SEQ2_LEN=/scratch/data/panTro3/chrom.sizes
SEQ2_CHUNK=10000000
SEQ2_LAP=0
SEQ2_IN_CONTIGS=0

BASE=/hive/data/genomes/hg19/bed/lastzPanTro3.2011-02-22
TMPDIR=/scratch/tmp
'_EOF_'
    # << happy emacs

    screen # use screen to manage this long-running job
    time nice -n +19 doBlastzChainNet.pl `pwd`/DEF -verbose=2 \
	-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=medium \
	-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
	-syntenicNet > do.log 2>&1 &
    # problems with memk, after recovery, continue chainMerge:
    time nice -n +19 doBlastzChainNet.pl `pwd`/DEF -verbose=2 \
	-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=medium \
	-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
	-continue=chainMerge -syntenicNet > chainMerge.log 2>&1 &
    #	real    103m34.088s

    cat fb.hg19.chainPanTro3Link.txt 
    #	2760939621 bases of 2897316137 (95.293%) in intersection

    # filter with doRecipBest.pl 
    time doRecipBest.pl -workhorse=hgwdev -buildDir=`pwd` \
        hg19 panTro3 > rbest.log 2>&1
    # real    50m49.740s


#	running the swap
    mkdir /hive/data/genomes/panTro3/bed/blastz.hg19.swap
    cd /hive/data/genomes/panTro3/bed/blastz.hg19.swap
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
	-swap /hive/data/genomes/hg19/bed/lastzPanTro3.2011-02-22/DEF \
	-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=medium \
	-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
	-syntenicNet > swap.log 2>&1 &
#	real    86m49.706s
    cat fb.panTro3.chainHg19Link.txt 
    #	2772816267 bases of 2900529764 (95.597%) in intersection

############################################################################
# MAKE tfbsConsSites and tfbsConsFactors for TFBS conserved track (DONE weirauch braney 03/07/11)
# Questions?  braney at soe.ucsc.edu

ssh hgwdev
mkdir /cluster/data/hg19/bed/tfbsCons
cd /cluster/data/hg19/bed/tfbsCons

# Define all parameters in 'PARAMS.txt'
# Define all chromosomes in 'CHROMS.txt'
# Get tfbsConsUtils.tar.gz from Matt Weirauch with Perl scripts  weirauch@soe.ucsc.edu
set tarfile=/cluster/data/hg19/bed/tfbsCons/tfbsConsUtils.tar.gz
tar zxf $tarfile

nice ./getRefseqStats.pl &
nice ./getBatchQueries.pl &

ssh swarm
mkdir /cluster/bluearc/braney/tfloc
mkdir /hive/users/weirauch/tfloc_hg18
# Copy ./tmp/ctfbs_batch_list.txt to this dir
# Copy ./scripts/doit to this dir
para create ctfbs_batch_list.txt
para try
para push

# When the run is done (within a day or so), the results will be in individual dirs, one for each chromosome.

nice ./getBedFile.pl &

hgLoadBed -noSort hg19  tfbsConsSites -sqlTable=$HOME/kent/src/hg/lib/tfbsConsSites.sql tfbsConsSites.bed -tab
hgLoadSqlTab hg19 tfbsConsFactors $HOME/kent/src/hg/lib/tfbsConsFactors.sql tfbsConsFactors.bed
#########################################################################
# BUILD THE TRACK OF IKMC MAPPED TO HUMAN GENOME. (DONE, Fan, 3/15/11)

    ssh hgwdev
    mkdir -p /hive/data/genomes/hg19/bed/ikmc/110314
    cd /hive/data/genomes/hg19/bed/ikmc/110314

# recieve  20110301_human.gff.gz from Carol Bult [Carol.Bult@jax.org] and place it under this subdirectory.
    gzip -d 20110301_human.gff.gz

# build hgIkmc table from raw data file ucschuman.gff (substitue some
# troublesome chroms in raw data file and remove some records mapped to 'chrUn')

    cat 20110301_human.gff |sort -u \
    |sed -e 's/chr9|NT_113911.1/chr9/' \
    |grep -v 'chrUn' \
    | perl -we \
      'while (<>) { \
         s/\r?\n$//; \
         ($chr, undef, $ctr, $s, $e, undef, undef, undef, $id, $col, $n) = split("\t"); \
         if ($s eq "") { warn "$_\n";  s/^.*//; next; } # Some lines have no coords. \
         $col = ($col eq "Yellow") ? "255,215,0" : \
                ($col eq "Green")  ? "0,240,0" : \
                ($col eq "Blue")   ? "0,0,200" : "0,0,0"; \
         $s--; \
         $id =~ s/^MGI:\d+; (\w+); .*/$1/ || die "Cant parse id \"$id\""; \
         my $geneId = join("|", $chr, $ctr, "${n}_$id"); \
         push @{$geneBlks{$geneId}}, [$s, $e, $col] unless $e <= 0; \
      } \
      warn "Got " . scalar(keys %geneBlks) . " genes.\n"; \
      foreach my $geneId (keys %geneBlks) { \
         my @blks = @{$geneBlks{$geneId}}; \
         my ($chrom, $center, $name) = split(/\|/, $geneId); \
         my $blkCount = @blks; \
         @blks = sort {$a->[0] <=> $b->[0]} @blks; \
         my $chromStart = $blks[0]->[0]; \
         my $chromEnd = $blks[$blkCount-1]->[1]; \
         my $color = $blks[0]->[2]; \
         my $blkStarts = ""; \
         my $blkSizes = ""; \
         foreach my $blk (@blks) { \
           my ($start, $end, $col) = @{$blk}; \
           $blkStarts .= ($start - $chromStart) . ","; \
           $blkSizes  .= ($end - $start) . ","; \
           if ($col ne $color) { die "Blocks of $geneId of colors $color and $col"; } \
         } \
        print join("\t", $chrom, $chromStart, $chromEnd, $name, 0, ".", $chromStart, \
                   $chromStart, $color, $blkCount, $blkSizes, $blkStarts) . "\n"; \
      }' \
    | sort -k 1,1 -k 2n,2n > hgIkmc.bed
# Got 41936 genes.

# build hgIkmcExtra table

    cat 20110301_human.gff \
    | grep -v 'chrUn' \
    | perl -wpe 's/\r?\n$//; @w = split("\t"); \
      if ($w[3] eq "") { s/^.*//; next; } # Some lines have no coords. \
      if ($w[4] <= 0) { s/^.*//; next; } # A few lines have end=0. \
      $w[8] =~ m/^(MGI:\d+); (\w+); (\w.*)/ || die; \
      ($mgi, $designId, $status) = ($1, $2, $3); \
      $_ = "$w[10]_$designId\t$mgi,$designId,$w[2],$status\n";' \
    | sort -u > hgIkmcExtra.tab
    wc -l hgIkmcExtra.tab
# 41936 hgIkmcExtra.tab

# load tables
    hgLoadBed hg19 hgIkmc hgIkmc.bed
    checkTableCoords -verbose=2 hg19 hgIkmc

    hgLoadSqlTab hg19 hgIkmcExtra $HOME/kent/src/hg/lib/genericAlias.sql hgIkmcExtra.tab

#########################################################################
# LOAD ACEMBLY (DONE 2011-03-14 - Chin)
    mkdir /hive/data/outside/acembly
    cd /hive/data/outside/acembly
    wget --timestamping \
ftp://ftp.ncbi.nih.gov/repository/acedb/ncbi_37_Aug10.human.genes/AceView.ncbi_37.genes_gff.gff.gz
    wget --timestamping \
ftp://ftp.ncbi.nih.gov/repository/acedb/ncbi_37_Aug10.human.genes/AceView.ncbi_37.good_proteins_peptide.fasta.gz

    mkdir /cluster/data/hg19/bed/acembly
    cd /cluster/data/hg19/bed/acembly
    cp -p /hive/data/outside/acembly/AceView.ncbi_37.genes_gff.gff.gz .
    cp -p /hive/data/outside/acembly/AceView.ncbi_37.good_proteins_peptide.fasta.gz .
    gzip -d AceView.ncbi_37.genes_gff.gff.gz
    gzip -d AceView.ncbi_37.good_proteins_peptide.fasta.gz

    # If the result of this command is > 0, then some lines have end <
    # start
    # and need to be fixed:
    awk '$5 < $4 {print;}' AceView.ncbi_37.genes_gff.gff | wc -l
    # 0

    # Filter out empty lines, lines where the product_id has a stray
    # newline before it, and $chr|Hs# IDs that don't appear liftable.
    # (Note: the new gff does not have these two cases anymore.) 
    # Add 'chr' prefix to chrom number at field 1
    egrep -h -v '^(| ?product_id.*|..?\|Hs.*)$' AceView.ncbi_37.genes_gff.gff \
    | sed -e 's/^/chr/;' \
      > acembly.gff
    # fixed the chrmito prefix to chrM
    mv acembly.gff acembly.tmp
    cat acembly.tmp | sed -e 's/^chrmito/chrM/;' > acembly.gff 
    

    # Extract annotation classes from original gff:
    cat AceView.ncbi_37.genes_gff.gff | awk  '{print $12}' | sort | uniq
    # cDNA_supported;
    # Note: version 37 gff have only one gene type - cDNA_supported;
    # Per Danielle and Jean request, use pink to display them. 	
    # the following replace become no-op.
    egrep -h -v '^(| ?product_id.*|..?\|Hs.*)$' AceView.ncbi_37.genes_gff.gff \
    | perl -wpe 's/^.*Gene_type (\w+); transcript_id (\S+);.*/$2\t$1/; \
                 s/Cloud$/cloud/ || s/Spliced_gene$/spliced_gene/ || \
                   die "Unrecognized class/Gene_type:\n$_\n";' \
    | sort -u \
      > acemblyClass.tab
    # Useless use of single ref constructor in void context at -e line 2.
    # warning was issued, however it is harmless.


    # Some gff transcript_id's end in -unspliced (no intron), but the
    # corresponding protein fasta IDs to not have that suffix.  We need
    # them to match, so add where necessary.
    # Use perl to make a perl script to add -unspliced to protein IDs
    # where necessary:
    grep unspliced acemblyClass.tab | wc -l
    # 54180
    egrep -h -v '^(| ?product_id.*|..?\|Hs.*)$'  AceView.ncbi_37.genes_gff.gff \
    | perl -wpe 's@^.*transcript_id (\S+)-unspliced;.*$@\$unsp{"$1"} = 1;@ || s/^.*\n$//;' \
    | sort -u \
      > addUnspliced.pl
    wc -l addUnspliced.pl
    # 54180 addUnspliced.pl
    cat >> addUnspliced.pl <<'_EOF_'
while (<>) {
  if (/^>(\S+)$/) {
    if ($unsp{$1}) {
      s/^>(\S+)/>$1-unspliced/;
    }
  }
  print;
}
'_EOF_'
    # << emacs

    # Add -unspliced suffix to protein IDs where necessary, and pare
    # down
    # proteins to just the ones that we have transcripts for:
    awk '{print $1;}' acemblyClass.tab   > transcriptNames.txt
    perl addUnspliced.pl AceView.ncbi_37.good_proteins_peptide.fasta \
    | faSomeRecords stdin transcriptNames.txt acemblyPep.fa
    grep unspliced acemblyPep.fa | wc -l
    # 31956
    # Danielle Thierry-Mieg explained that noncoding genes are included
    # so
    # the number of proteins can be smaller than the number of
    # transcripts.

    # Load tables
    ssh hgwdev
    cd /cluster/data/hg19/bed/acembly
    ldHgGene -gtf hg19 acembly acembly.gff
    # Reading acembly.gff
    # Read 259440 transcripts in 3870073 lines in 1 files
    #   259440 groups 25 seqs 1 sources 5 feature types
    # 259440 gene predictions

    hgLoadSqlTab hg19 acemblyClass ~/kent/src/hg/lib/acemblyClass.sql \
      acemblyClass.tab
    # Scanning through 1 files
    hgPepPred hg19 generic acemblyPep acemblyPep.fa
    # rm acemblyPep.tab
    runJoiner.csh hg19 acembly
    # found identifiers:
    # acemblyName
    # Checking keys on database hg19
    #  hg19.acemblyPep.name - hits 187692 of 187692 ok
    #  hg19.acemblyClass.name - hits 259440 of 259440 ok
    # running -times flag


#############################################################################
# Affy Exon track (DONE 2011-03-08 - Melissa Cline)

# scripts/splitAffyExonBeds.py (below)
#!/usr/bin/env python

import fileinput
import re
import string

filehandle = None
for line in fileinput.input():
    if re.search("track\tname", line):
        if filehandle != None:
            filehandle.close()
        filename = fileinput.filename()
        filename = string.replace(filename, "hg19-bed", "hg19-split-bed")
        if re.search("gene level exon", line):
            filename = string.replace(filename, ".bed", ".exon.bed")
        elif re.search("gene probeset", line):
            filename = string.replace(filename, ".bed", ".geneProbeset.bed")
        elif re.search("exon probeset", line):
            filename = string.replace(filename, ".bed", ".probeset.bed")
        elif re.search("probe", line):
            filename = string.replace(filename, ".bed", ".probe.bed")
        filehandle = open(filename, 'w')
    filehandle.write(line)
        

# scripts/mergeAcrossChromosomes.bash (below)
#!/usr/bin/env bash


PATHNAME="/hive/users/cline/Affy/"
tail -n +2 $PATHNAME/Affy-HuEx-hg19-split-bed/HuEx-1_0-st-v2.hg19.*.$1.bed \
  |grep -v "==>" > $PATHNAME/mergedBeds/AffyHuEx.$1.bed 


# scripts/splitByProbesetType.py (below)
#!/usr/bin/env python
from optparse import OptionParser
import re

parser = OptionParser()
parser.add_option("-s", "--supplementalData", dest="supplementalData",
                  default="supportingAnnotations/HuEx-1_0-st-v2.na31.hg19.probeset.csv")
parser.add_option("-b", "--bedData", dest="bedData", default="noOverlaps/AffyHuEx.probeset.overlapsMerged.bed")
(parameters, args) = parser.parse_args()
coreProbesets = dict()
extendedProbesets = dict()
fullProbesets = dict()
ambiguousProbesets = dict()
freeProbesets = dict()
supplementalFile = open(parameters.supplementalData)
for line in supplementalFile:
    tokens = line.split(",")
    probesetId = re.sub("\"", "", tokens[0])
    if re.search("core", line):
        coreProbesets[probesetId] = 1
    elif re.search("extended", line):
        extendedProbesets[probesetId] = 1
    elif re.search("full", line):
        fullProbesets[probesetId] = 1
    elif re.search("ambiguous", line):
        ambiguousProbesets[probesetId] = 1
    elif re.search("free", line):
        freeProbesets[probesetId] = 1
supplementalFile.close()
coreProbesetsOutfile = re.sub("noOverlaps", "partitioned2",
                              re.sub("overlapsMerged", "core.overlapsMerged",
                                     parameters.bedData))
coreProbesetsOut = open(coreProbesetsOutfile, 'w')
extendedProbesetsOutfile = re.sub("noOverlaps", "partitioned2",
                                  re.sub("overlapsMerged",
                                         "extended.overlapsMerged",
                                         parameters.bedData))
extendedProbesetsOut = open(extendedProbesetsOutfile, 'w')
fullProbesetsOutfile = re.sub("noOverlaps", "partitioned2",
                              re.sub("overlapsMerged", "full.overlapsMerged",
                                     parameters.bedData))
fullProbesetsOut = open(fullProbesetsOutfile, 'w')
ambiguousProbesetsOutfile = re.sub("noOverlaps", "partitioned2",
                                   re.sub("overlapsMerged", "ambiguous.overlapsMerged",
                                          parameters.bedData))
ambiguousProbesetsOut = open(ambiguousProbesetsOutfile, 'w')
freeProbesetsOutfile = re.sub("noOverlaps", "partitioned2",
                                   re.sub("overlapsMerged", "free.overlapsMerged",
                                          parameters.bedData))
freeProbesetsOut = open(freeProbesetsOutfile, 'w')
bedInput = open(parameters.bedData)
for line in bedInput:
    tokens = line.split()
    if len(tokens) < 3:
        print "error: misformed line", line
    else:
        name = line.split()[3]
        (gene,probeset) = name.split("_")
        if coreProbesets.has_key(probeset):
            coreProbesetsOut.write(line)
        elif extendedProbesets.has_key(probeset):
            extendedProbesetsOut.write(line)
        elif fullProbesets.has_key(probeset):
            fullProbesetsOut.write(line)
        elif ambiguousProbesets.has_key(probeset):
            ambiguousProbesetsOut.write(line)
        elif freeProbesets.has_key(probeset):
            freeProbesetsOut.write(line)
        else:
            print "warning: orphan line", line


# 1. Given data from the vendor, one file per chromosome with four types of
# bed entries per file (one file per chromosome), split them into four bed 
# files (yielding four bed files per chromosome).
ls Affy-HuEx-hg19-bed/*bed \
    | awk '{ print "scripts/splitAffyExonBeds.py", $1 }' |bash
 
# 2. Given a subdirectory with four bed files per chromosome for probeset
# and probe bed files, merge them into one bed file for probeset data
# and one bed file per probe data.  From here, we are ignoring the exon
# and gene probeset data.
scripts/mergeAcrossChromosomes.bash probeset
scripts/mergeAcrossChromosomes.bash probe

# 3. There is an issue that the probeset data contains overlapping blocks.
# Fix this with bedMergeOverlappingBlocks, written by Andy Pohl.  Note that
# the probe data does not contain overlapping blocks.
bedMergeOverlappingBlocks mergedBeds/affyHuEx.probeset.bed \
    noOverlaps/AffyHuEx.probeset.overlapsMerged.bed
cp mergedBeds/AffyHuEx.probe.bed noOverlaps/AffyHuEx.probe.overlapsMerged.bed

# 4. There are five different types of probesets, each of which has a different
# significance to the user.  These should best be represented as five different
# subtracks of a parent track (one parent track for probesets, and one for probes).
# Split the data by probeset type.  Note that the same process can be applied to
# both the probeset and probe data, because the probe data is represented by the
# probeset ID rather than by a distinct probe ID.
scripts/splitByProbesetType.py \
    -s supportingAnnotations/HuEx-1_0-st-v2.na31.hg19.probeset.csv \
    -p noOverlaps/AffyHuEx.probe.overlapsMerged.bed
hgLoadBed hg19 affyExonProbeAmbiguous \
    partitioned2/AffyHuEx.probe.ambiguous.overlapsMerged.bed
hgLoadBed hg19 affyExonProbeCore \
    partitioned2/AffyHuEx.probe.core.overlapsMerged.bed
hgLoadBed hg19 affyExonProbeExtended \
    partitioned2/AffyHuEx.probe.extended.overlapsMerged.bed
hgLoadBed hg19 affyExonProbeFree \
    partitioned2/AffyHuEx.probe.free.overlapsMerged.bed
hgLoadBed hg19 affyExonProbeFull \
    partitioned2/AffyHuEx.probe.full.overlapsMerged.bed

scripts/splitByProbesetType.py \
    -s supportingAnnotations/HuEx-1_0-st-v2.na31.hg19.probeset.csv \
    -p noOverlaps/AffyHuEx.probeset.overlapsMerged.bed
hgLoadBed hg19 affyExonProbesetAmbiguous \
    partitioned2/AffyHuEx.probeset.ambiguous.overlapsMerged.bed
hgLoadBed hg19 affyExonProbesetCore \
    partitioned2/AffyHuEx.probeset.core.overlapsMerged.bed
hgLoadBed hg19 affyExonProbesetExtended \
    partitioned2/AffyHuEx.probeset.extended.overlapsMerged.bed
hgLoadBed hg19 affyExonProbesetFree \
    partitioned2/AffyHuEx.probeset.free.overlapsMerged.bed
hgLoadBed hg19 affyExonProbesetFull \
    partitioned2/AffyHuEx.probeset.full.overlapsMerged.bed


#########################################################################
# LASTZ Turkey MelGal1 (DONE - 2011-03-28 - Chin)
    mkdir /hive/data/genomes/hg19/bed/lastzMelGal1.2011-03-28
    cd /hive/data/genomes/hg19/bed/lastzMelGal1.2011-03-28

    cat << '_EOF_' > DEF
# Turkey vs Human

# TARGET: Human hg19
SEQ1_DIR=/scratch/data/hg19/nib
SEQ1_LEN=/scratch/data/hg19/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000

# QUERY: Turkey melGal1 - single chunk big enough to run entire chrom
SEQ2_DIR=/scratch/data/melGal1/melGal1.2bit
SEQ2_LEN=/scratch/data/melGal1/chrom.sizes
SEQ2_CHUNK=10000000
SEQ2_LAP=0

BASE=/hive/data/genomes/hg19/bed/lastzMelGal1.2011-03-28
TMPDIR=/scratch/tmp
'_EOF_'
    # << happy emacs

    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
        `pwd`/DEF \
        -syntenicNet \
        -noLoadChainSplit \
        -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
        -chainMinScore=5000 -chainLinearGap=loose > do.log 2>&1
    # real    100m33.646s
    cat fb.hg19.chainMelGal1Link.txt
    #   76647912 bases of 2897316137 (2.645%) in intersection
    # Create link
    cd /hive/data/genomes/hg19/bed
    ln -s  lastzMelGal1.2011-03-28 lastz.melGal1


    #   running the swap 
    mkdir /hive/data/genomes/melGal1/bed/blastz.hg19.swap
    cd /hive/data/genomes/melGal1/bed/blastz.hg19.swap
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
        /hive/data/genomes/hg19/bed/lastzMelGal1.2011-03-28/DEF \
        -swap \
        -noLoadChainSplit \
        -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
        -chainMinScore=5000 -chainLinearGap=loose > swap.log 2>&1
    #   real    6m51.280s
    cat fb.melGal1.chainHg19Link.txt
    #   62120143 bases of 935922386 (6.637%) in intersection
    cd /hive/data/genomes/melGal1/bed
    ln -s blastz.hg19.swap lastz.hg19

#############################################################################
# BUILD seudoYale60 TRACK 
# Rachel built this for Gencode first, Fan adopted for UCSC GB, DONE, 3/31/2011.
#
# First, how Rachel built it:
# YALE PSEUDOPIPE PSEUDOGENE PREDICTIONS BASED ON ENSEMBL 60 
# (hartera, 2011-01-10, DONE)
# FTP site e-mailed by Suganthi Balasubramanian (suganthi.bala@yale.edu) from 
# the Gerstein lab. Data is from their PseudoPipe pipeline and it is based on 
# proteins from Ensembl Build 60 (pseudogene data from December 2010?). 

mkdir -p /hive/groups/gencode/browser/hg19/gencodeYalePseudoBuild60
cd /hive/groups/gencode/browser/hg19/gencodeYalePseudoBuild60

wget --timestamping \
     "http://tables.pseudogene.org/dump.cgi?table=Human60"

# Then re-name the file:
mv dump.cgi\?table=Human60 Human60YalePseudo.txt
# Header from data file.
ID      Chromosome      Start Coordinate        Stop Coordinate Strand  Parent
Protein  Protein Start   Protein Stop    Parent Gene     Fraction        Num
Insertions  Num Deletions   Num Shifts      Num Stops       E Value Identity
PolyA   Disablements    Exons   Introns Class   Sequence        Link

# urls are of type:
# http://tables.pseudogene.org/human60/<ID> so this can be added to the
# trackDb as for the previous track. 

# Get list of haplotype chroms:
grep _ Human60YalePseudo.txt | awk '{print $2}' | sort | uniq
HSCHR17_1
HSCHR6_MHC_APD
HSCHR6_MHC_COX
HSCHR6_MHC_DBB
HSCHR6_MHC_MANN
HSCHR6_MHC_MCF
HSCHR6_MHC_QBL
HSCHR6_MHC_SSTO

# These correspond to the haplotype chroms in GRCh37 (hg19).
# Convert data to genePred:
# chromsomomes are 1-22, X, Y, chr17_ctg5_hap1 (HSCHR17_1) and the chr6 
# haplotypes e.g. chr6_cox_hap1 (HSCHR6_MHC_COX) 

cat << '_EOF_' > formatPseudogenesToGenePred
#!/usr/bin/awk -f
# Parse Yale pseudogene data file.
# Exon coordinates are in this format: [[28688544, 28688864], [28689678, 2869117# 4], [28694308, 28694460], [28701327, 28701749]]
# Ignore header line
/^ID/ {
  next;
}
# Parse the data lines
BEGIN {FS="\t"} {OFS="\t"} {
  gsub(/\[/, "", $19);
  gsub(/\]/, "", $19); 
  split($19, exons, ",");
  # Count the number of start and end coordinates for exons and 
  # calculate the number of exons.
  count=(length(exons))/2;
  # Write out genePred. Add chr in front of chrom only if not haplotype.
  if ($2 !~ /HSCHR/) {
     printf "%s\tchr%s\t%c\t%d\t%d\t0\t0\t%d\t", $1, $2, $5, $3-1, $4, count; 
  }
  else {
     printf "%s\t%s\t%c\t%d\t%d\t0\t0\t%d\t", $1, $2, $5, $3-1, $4, count; 
  }
  # get list of exon starts, convert from 1-based to 0-based
  for (i=1; i <= length(exons); i+=2) {
     printf "%d,", exons[i]-1","; 
  }
  printf "\t";
  # get list of exon ends
  for (i=2; i <= length(exons); i+=2) {
     printf "%d,", exons[i]","; 
  }
  printf "\n"; 
}
'_EOF_'

chmod +x formatPseudogenesToGenePred
# format the Yale pseudogenes data to genePred.
./formatPseudogenesToGenePred Human60YalePseudo.txt \
    > gencodeYalePseudoBuild60.gp 

# The Genome Browser represents just the haplotype region as a separate 
# "chromosome" whereas the coordinates represent the haplotype region embedded
# into chr6.
cp -p /hive/data/genomes/hg19/jkStuff/ensGene.haplotype.lift .
# The lift file assumes the following chrom names 
-69170076       HSCHR4_1        191154276       chr4_ctg9_hap1  590426
-43384863       HSCHR17_1       81195210        chr17_ctg5_hap1 1680828
-28696603       HSCHR6_MHC_APD  171115067       chr6_apd_hap1   4622290
-28477796       HSCHR6_MHC_COX  171115067       chr6_cox_hap2   4795371
-28696603       HSCHR6_MHC_DBB  171115067       chr6_dbb_hap3   4610396
-28696603       HSCHR6_MHC_MANN 171115067       chr6_mann_hap4  4683263
-28696603       HSCHR6_MHC_MCF  171115067       chr6_mcf_hap5   4833398
-28696603       HSCHR6_MHC_QBL  171115067       chr6_qbl_hap6   4611984
-28659142       HSCHR6_MHC_SSTO 171115067       chr6_ssto_hap7  4928567

liftUp -type=.gp gencodeYalePseudoBuild60HapsLifted.gp \
       ensGene.haplotype.lift carry gencodeYalePseudoBuild60.gp
# Got 68 lifts in ensGene.haplotype.lift
# Lifting gencodeYalePseudoBuild55.gp

wc -l gencode*.gp
17888 gencodeYalePseudoBuild60.gp
17888 gencodeYalePseudoBuild60HapsLifted.gp
 
# Load table and then check some haplotype regions. See if look plausible. 
# Load the genePred file into hg19
hgLoadGenePred hg19 gencodeYalePseudoBuild60 \
gencodeYalePseudoBuild60HapsLifted.gp
# Didn't load. There are 12 invalid genePreds (10 in Ensembl 55, 12 for 59):
Error: invalid genePred: PGOHUM00000244617 exon 2 overlaps previous exon
Error: invalid genePred: PGOHUM00000244796 exon 1 overlaps previous exon
Error: invalid genePred: PGOHUM00000248470 exon 1 overlaps previous exon
Error: invalid genePred: PGOHUM00000251325 exon 1 overlaps previous exon
Error: invalid genePred: PGOHUM00000250199 exon 1 overlaps previous exon
Error: invalid genePred: PGOHUM00000243651 exon 1 overlaps previous exon
Error: invalid genePred: PGOHUM00000232858 exon 1 overlaps previous exon
Error: invalid genePred: PGOHUM00000232933 exon 1 overlaps previous exon
Error: invalid genePred: PGOHUM00000233065 exon 1 overlaps previous exon
Error: invalid genePred: PGOHUM00000236237 exon 1 overlaps previous exon
Error: invalid genePred: PGOHUM00000241760 exon 1 overlaps previous exon
Error: invalid genePred: PGOHUM00000233784 exon 8 overlaps previous exon
Error: 12 invalid genePreds, database unchanged

# These are on chroms 1, 6, 7, 9, X, Y.
# File didn't load into database.
# Make a file of these ids - invalidIds
grep -f invalidIds -vw gencodeYalePseudoBuild60HapsLifted.gp \
    > gencodeYalePseudoBuild60HapsLiftedNoInvalidGps.gp
wc -l gencode*gp
# 17888 gencodeYalePseudoBuild60.gp
# 17888 gencodeYalePseudoBuild60HapsLifted.gp
# 17876 gencodeYalePseudoBuild60HapsLiftedNoInvalidGps.gp

# Then re-load database
hgLoadGenePred hg19 gencodeYalePseudoBuild60 \
      gencodeYalePseudoBuild60HapsLiftedNoInvalidGps.gp

# Add trackDb.ra entry for track, add a search and make sure
# there is a description page, copy over from the gencodeYalePseudoBuild59
# html. Commit these to SVN. 
# Add to the html description, the list of 12 IDs of genes that were removed 
# due to overlapping exon coordinates. This was also a problem for the Yale
# pseudogenes based on Ensembl Build 53 and 55 but there were 10 problem IDs
# for those builds. 

# Build class table for colouring pseudogenes by type. 
# copy over class table definition from a previous set of Yale pseudogenes.
cp -p ../gencodeYalePseudoBuild55/gencodeYalePseudoBuild55Class.sql \
 gencodeYalePseudoBuild60Class.sql

# Make the class table file:
tail -n +2 Human60YalePseudo.txt \
  | tawk '{print $1, $21, "Yale"}' | sort > yalePseudoBuild60Class.txt

# load table
hgLoadSqlTab hg19 gencodeYalePseudoBuild60Class \
    gencodeYalePseudoBuild60Class.sql yalePseudoBuild60Class.txt

hgsql -e 'select distinct(class) from gencodeYalePseudoBuild60Class;' hg19
+------------+
| class      |
+------------+
| Ambiguous  | 
| Processed  | 
| Duplicated | 
+------------+

# Add these classes to the trackDb.ra entry for the geneClasses field and 
# to the list of classes with colours.

# Next, how Fan adopted it:

ssh hgwdev
mkdir -p /hive/data/genomes/hg19/bed/pseudoYale60
cd /hive/data/genomes/hg19/bed/pseudoYale60

hgsql hg19 < ~/src/hg/lib/pseudoYale60.sql
hgsql hg19 < ~/src/hg/lib/pseudoYale60Class.sql

hgsql hg19 -e "insert into pseudoYale60 select * from gencodeYalePseudoBuild60"
hgsql hg19 -e "insert into pseudoYale60Class select * from gencodeYalePseudoBuild60Class"

##############################################################################
# LASTZ Lizard AnoCar2 (DONE - 2011-04-19 - Hiram)
    mkdir /hive/data/genomes/hg19/bed/lastzAnoCar2.2011-04-19
    cd /hive/data/genomes/hg19/bed/lastzAnoCar2.2011-04-19

    cat << '_EOF_' > DEF
# human vs lizard
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_Q=/scratch/data/blastz/HoxD55.q

# TARGET: Human hg19
SEQ1_DIR=/scratch/data/hg19/nib
SEQ1_LEN=/scratch/data/hg19/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000

# QUERY: Lizard anoCar2
SEQ2_DIR=/scratch/data/anoCar2/anoCar2.2bit
SEQ2_LEN=/scratch/data/anoCar2/chrom.sizes
SEQ2_CHUNK=10000000
SEQ2_LAP=0
SEQ2_LIMIT=40

BASE=/hive/data/genomes/hg19/bed/lastzAnoCar2.2011-04-19
TMPDIR=/scratch/tmp
'_EOF_'
    # << happy emacs

    #	establish a screen to control this job
    screen
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
	`pwd`/DEF \
	-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \
	-syntenicNet -workhorse=hgwdev -smallClusterHub=encodek \
	-bigClusterHub=swarm -qRepeats=windowmaskerSdust > do.log 2>&1 &
    #	real    195m52.809s
    cat fb.hg19.chainAnoCar2Link.txt 
    #	102917023 bases of 2897316137 (3.552%) in intersection

    #	running the swap - DONE - 2011-04-19
    mkdir /hive/data/genomes/anoCar2/bed/blastz.hg19.swap
    cd /hive/data/genomes/anoCar2/bed/blastz.hg19.swap
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
	/hive/data/genomes/hg19/bed/lastzAnoCar2.2011-04-19/DEF \
	-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \
	-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
	-syntenicNet -swap -qRepeats=windowmaskerSdust > swap.log 2>&1 &
    #	real    20m45.683s
    cat fb.anoCar2.chainHg19Link.txt 
    #	88296392 bases of 1701353770 (5.190%) in intersection

##############################################################################
# NCBI patch 3 (NOT COMPLETE - 2011-04-21 - Hiram)
    mkdir /hive/data/genomes/hg19/bed/additionalSequence/patch3
    cd /hive/data/genomes/hg19/bed/additionalSequence/patch3
    # these scripts were altered slightly for improvements and corrections
    # to this patch3 business
    cp -p ../patches/gatherNames.pl .
    # business added to gatherNames.pl to construct patches.chrom.sizes file
    ./gatherNames.pl . > ucscNames.patch3.txt
    cp -p ../patch2/mkTables.pl .
    ./mkTables.pl  patches.chrom.sizes ucscNames.patch3.txt PATCHES/alt_scaffolds/AGP/alt.scaf.agp.gz
    # output to stdout is the contents of alt.scaf.agp.gz
    # constructs files: ctgPos.txt chromInfo.txt gold.txt and gap.txt
    cp -p ../patch2/mkCtgPos2.pl .
    ./mkCtgPos2.pl ucscNames.patch3.txt patches.chrom.sizes > ctgPos2.txt
    cp -p ../patch2/mkHapLocate.pl .
    ./mkHapLocate.pl ctgPos.txt \
	PATCHES/alt_scaffolds/alt_scaffold_placement.txt \
	> haplotypeLocations.bed
# not found: GL339449.1 HSCHR5_1_CTG1
# not found: GL339450.1 HG79_PATCH

##############################################################################
# NCBI patch 5 (DONE - 2011-07-01,13 - Hiram)
    mkdir /hive/data/genomes/hg19/bed/additionalSequence/patch5
    cd /hive/data/genomes/hg19/bed/additionalSequence/patch5
    wget --cut-dirs=6 --no-parent --timestamping --no-remove-listing -m \
        -nH --ftp-user=anonymous --ftp-password=yourName@domain.edu \
ftp://ftp.ncbi.nlm.nih.gov/genbank/genomes/Eukaryotes/vertebrates_mammals/Homo_sapiens/GRCh37.p5/
    # the scripts from patch4 were modified slightly to update and fix some
    #	of the new names in this patch5
    cp ../patch4/gatherNames.pl .
    ./gatherNames.pl . > ucscNames.patch5.txt
    cp -p ../patch4/mkTables.pl .
    ./mkTables.pl  patches.chrom.sizes ucscNames.patch5.txt PATCHES/alt_scaffolds/AGP/alt.scaf.agp.gz
    # output to stdout is the contents of alt.scaf.agp.gz
    # constructs files: ctgPos.txt chromInfo.txt gold.txt and gap.txt
    cp -p ../patch4/mkCtgPos2.pl .
    ./mkCtgPos2.pl ucscNames.patch5.txt patches.chrom.sizes > ctgPos2.txt
    cp -p ../patch4/mkHapLocate.pl .
    ./mkHapLocate.pl ctgPos.txt \
	PATCHES/alt_scaffolds/alt_scaffold_placement.txt \
	> haplotypeLocations.bed
    cp haplotypeLocations.bed altSequence.bed
    ln -s ../patch2/before.patch2.hapLoc.bed hg19.hapLoc.bed
    awk '{printf "%s\t%d\t%d\t%s\t500\t+\t%d\t%d\t32,32,190\n", $2,$3,$4,$5,$3,$4}' \
hg19.hapLoc.bed >> altSequence.bed

    # a new script for patch5
    ./mkFasta.pl ucscNames.patch5.txt > hg19.patch5.fa
    # the build of hg19Patch5 can be seen in hg19Patch5.txt

    egrep -v "32,32,190" altSequence.bed  \
	| awk '{printf "%s\t%d\t%d\t%s\t%d\t%s\n", $1,$2,$3,$4,$5,$6}' \
	    > altSeqPatchesP5.tab
    egrep "32,32,190" altSequence.bed  \
	| awk '{printf "%s\t%d\t%d\t%s\t%d\t%s\n", $1,$2,$3,$4,$5,$6}' \
	    > altSeqHaplotypesP5.tab
    # verify none lost
    wc -l altSeqPatchesP5.tab altSeqHaplotypesP5.tab
    #	41 altSeqPatchesP5.tab
    #	75 altSeqHaplotypesP5.tab
    #	116 total
    wc -l altSequence.bed
    #	116 altSequence.bed
    hgLoadBed hg19 altSeqHaplotypesP5 altSeqHaplotypesP5.tab
    #	Loaded 75 elements of size 6
    hgLoadBed hg19 altSeqPatchesP5 altSeqPatchesP5.tab
    #	Loaded 41 elements of size 6

    # to replace the existing track:
    hgLoadBed hg19 altSeqHaplotypes altSeqHaplotypesP5.tab
    #	Loaded 75 elements of size 6
    hgLoadBed hg19 altSeqPatches altSeqPatchesP5.tab
    #	Loaded 41 elements of size 6

##############################################################################
# NCBI patch 9 (DONE - 2012-07-16 - Hiram)
    mkdir /hive/data/genomes/hg19/bed/additionalSequence/patch9
    cd /hive/data/genomes/hg19/bed/additionalSequence/patch9
    wget --cut-dirs=6 --no-parent --timestamping --no-remove-listing -m \
        -nH --ftp-user=anonymous --ftp-password=yourName@domain.edu \
ftp://ftp.ncbi.nlm.nih.gov/genbank/genomes/Eukaryotes/vertebrates_mammals/Homo_sapiens/GRCh37.p9/
    # the scripts from patch5 were modified slightly to update and fix some
    #	of the new names in this patch9
    cp ../patch5/gatherNames.pl .
    ./gatherNames.pl . > ucscNames.patch9.txt
    # examine the names for sanity:
    awk '{print $NF}' ucscNames.patch9.txt | sort
    # and they should not be longer than 31 characters:
    awk '{print $NF}' ucscNames.patch9.txt | sort | awk '{print length($0)}' \
        | sort -n | tail
    cp -p ../patch5/mkTables.pl .
    ./mkTables.pl  patches.chrom.sizes ucscNames.patch9.txt PATCHES/alt_scaffolds/AGP/alt.scaf.agp.gz
    # output to stdout is the contents of alt.scaf.agp.gz
    # constructs files: ctgPos.txt chromInfo.txt gold.txt and gap.txt
    cp -p ../patch5/mkCtgPos2.pl .
    ./mkCtgPos2.pl ucscNames.patch9.txt patches.chrom.sizes > ctgPos2.txt
    cp -p ../patch5/mkHapLocate.pl .
    ./mkHapLocate.pl ctgPos.txt \
	PATCHES/alt_scaffolds/alt_scaffold_placement.txt \
	> haplotypeLocations.bed
    cp -p haplotypeLocations.bed altSequence.bed
    ln -s ../patch2/before.patch2.hapLoc.bed hg19.hapLoc.bed
    awk '{printf "%s\t%d\t%d\t%s\t500\t+\t%d\t%d\t32,32,190\n", $2,$3,$4,$5,$3,$4}' \
hg19.hapLoc.bed >> altSequence.bed

    # a new script for patch9
    cp -p ../patch5/mkFasta.pl .
    ./mkFasta.pl ucscNames.patch9.txt > hg19.patch9.fa
    # the build of hg19Patch9 can be seen in hg19Patch9.txt

    egrep -v "32,32,190" altSequence.bed  \
	| awk '{printf "%s\t%d\t%d\t%s\t%d\t%s\n", $1,$2,$3,$4,$5,$6}' \
	    > altSeqPatchesP9.tab
    egrep "32,32,190" altSequence.bed  \
	| awk '{printf "%s\t%d\t%d\t%s\t%d\t%s\n", $1,$2,$3,$4,$5,$6}' \
	    > altSeqHaplotypesP9.tab
    # verify none lost
    wc -l altSeqPatchesP9.tab altSeqHaplotypesP9.tab
    #	82 altSeqPatchesP9.tab
    #	81 altSeqHaplotypesP9.tab
    #	163 total
    wc -l altSequence.bed
    #	163 altSequence.bed
    hgLoadBed hg19 altSeqHaplotypesP9 altSeqHaplotypesP9.tab
    #	Loaded 75 elements of size 6
    # do not need the chrM_rCRS item:
    hgsql -e 'delete from altSeqHaplotypesP9 where chrom="chrM_rCRS";' hg19
    hgLoadBed hg19 altSeqPatchesP9 altSeqPatchesP9.tab
    #	Loaded 41 elements of size 6

    #    these tables are part of human/hg19/altSeqComposite9.ra
    # to replace the existing track:
    grep -v "^chrM_rCRS" altSeqHaplotypesP9.tab \
        | hgLoadBed hg19 altSeqHaplotypes stdin
    #   Read 80 elements of size 6 from stdin
    hgLoadBed hg19 altSeqPatches altSeqPatchesP9.tab
    #   Read 82 elements of size 6 from altSeqPatchesP9.tab
 
##############################################################################
#  hg19 - Human - Ensembl Genes version 62  (DONE - 2011-04-22 - hiram)
    # This human gene set need a lot of work to get the name translation
    #	to work again.  The contig names have changed in Ensembl for this
    #	version and they defined genes on patch sequence that UCSC does not
    #	include
    ssh hgwdev
    cd /hive/data/genomes/hg19
    cat << '_EOF_' > hg19.ensGene.ra
# required db variable
db hg19
# optional nameTranslation, the sed command that will transform
#       Ensemble names to UCSC names.  With quotes just to make sure.
# delete commands take out genes that are only in patch sequence
nameTranslation 's/^\([0-9XY][0-9]*\)/chr\1/; s/^MT/chrM/; /^GL3.*/d; /^HSCHR[1-5]/d; /^HSCHR[7-9]/d; /^HG/d'
# optionally update the knownToEnsembl table after ensGene updated
knownToEnsembl yes
# optional haplotype lift-down from Ensembl full chrom coordinates
#       to UCSC simple haplotype coordinates
haplotypeLift /hive/data/genomes/hg19/jkStuff/ensGene.haplotype.lift
# Ensembl 62 has new sequence names for some of the random bits
liftUp /hive/data/genomes/hg19/jkStuff/ens.62.lft
'_EOF_'
#  << happy emacs

    doEnsGeneUpdate.pl  -ensVersion=62 hg19.ensGene.ra
    ssh hgwdev
    cd /hive/data/genomes/hg19/bed/ensGene.62
    featureBits hg19 ensGene
    # 109947258 bases of 2897316137 (3.795%) in intersection

    hgsql -e \
'update trackVersion set dateReference="current" where db="hg19" AND version=62;' hgFixed

############################################################################
# BUILD hg19 GERP TRACK (DONE 4/25/11, Fan)

ssh hgwdev
mkdir /hive/data/genomes/hg19/bed/gerp
cd /hive/data/genomes/hg19/bed/gerp

# place the wig data file, All_hg19_RS.wig, here.

ulimit -d 180000000
ulimit -v 180000000

wigToBigWig All_hg19_RS.wig /hive/data/genomes/hg19/chrom.sizes All_hg19_RS.bw

ln -s `pwd`/All_hg19_RS.bw /gbdb/hg19/bbi/All_hg19_RS.bw

hgsql hg19 -e 'drop table if exists allHg19RS_BW; \
               create table allHg19RS_BW (fileName varchar(255) not null); \
	       insert into allHg19RS_BW values ("/gbdb/hg19/bbi/All_hg19_RS.bw");'

# create corresponding trackDb.ra section and html description page.


#########################################################################
# LASTZ Cow BosTau6 (DONE - 2011-05-16 - Chin)
    mkdir /hive/data/genomes/hg19/bed/lastzBosTau6.2011-05-16
    cd /hive/data/genomes/hg19/bed/lastzBosTau6.2011-05-16

    cat << '_EOF_' > DEF
# human vs cow
# maximum M allowed with lastz is only 254
BLASTZ_M=254

# TARGET: Human hg19
SEQ1_DIR=/scratch/data/hg19/nib
SEQ1_LEN=/scratch/data/hg19/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000

# QUERY: Cow bosTau6 
SEQ2_DIR=/scratch/data/bosTau6/bosTau6.2bit
SEQ2_LEN=/scratch/data/bosTau6/chrom.sizes
SEQ2_CHUNK=10000000
SEQ2_LAP=0

BASE=/hive/data/genomes/hg19/bed/lastzBosTau6.2011-05-16
TMPDIR=/scratch/tmp
'_EOF_'
    # << happy emacs
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
        `pwd`/DEF \
        -syntenicNet \
        -noLoadChainSplit \
        -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
        -chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1
    # real    481m23.263s 
    cat fb.hg19.chainBosTau6Link.txt
    # 1370696434 bases of 2897316137 (47.309%) in intersection
    # Create link
    cd /hive/data/genomes/hg19/bed
    ln -s  lastzBosTau6.2011-05-16 lastz.bosTau6


    #   running the swap 
    mkdir /hive/data/genomes/bosTau6/bed/blastz.hg19.swap
    cd /hive/data/genomes/bosTau6/bed/blastz.hg19.swap
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
        /hive/data/genomes/hg19/bed/lastzBosTau6.2011-05-16/DEF \
        -swap  -syntenicNet \
        -noLoadChainSplit \
        -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
        -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1
    #   real     98m22.477s
    cat fb.bosTau6.chainHg19Link.txt
    #   1336966333 bases of 2649682029 (50.458%) in intersection
    cd /hive/data/genomes/bosTau6/bed
    ln -s blastz.hg19.swap lastz.hg19


###################################################################
# BUILD OMIM RELATED TRACKS (REBUILT 5/17/11, Fan)

ssh hgwdev
cd /hive/data/genomes/hg19/bed
mkdir -p omim/05172011
cd omim/05172011

# obtain the following files from OMIM and place them at this subdirectory

       genemap.txt  
       mim2gene.txt  
       mimAV.txt
       script1.pl  
       script2.pl

cat genemap.txt|sed -e 's/|/\t/g' > genemap.tab

hgLoadSqlTab -warn hg19 omimGeneMap ~/kent/src/hg/lib/omimGeneMap.sql genemap.tab

# Load mim2gene table

hgsql hg19 -e 'drop table mim2gene'
hgsql hg19 < ~/kent/src/hg/lib/mim2gene.sql
hgsql hg19 -e 'load data local infile "mim2gene.txt" into table mim2gene ignore 1 lines'

doOmimDisorders hg19 omimDisorderMap.tab
hgsql hg19 -e "drop table omimDisorderMap"
hgsql hg19 < ~/kent/src/hg/lib/omimDisorderMap.sql
hgLoadSqlTab -warn hg19 omimDisorderMap ~/kent/src/hg/lib/omimDisorderMap.sql omimDisorderMap.tab 

# build omimGeneSymbol table

doOmimGeneSymbols hg19 j.out
cat j.out |sort -u >omimGeneSymbol.tab
hgLoadSqlTab -warn hg19 omimGeneSymbol ~/kent/src/hg/lib/omimGeneSymbol.sql omimGeneSymbol.tab 

perl ./script1.pl --gene-map-file=genemap.txt >omimPhenotype.tab
hgLoadSqlTab -warn hg19 omimPhenotype ~/kent/src/hg/lib/omimPhenotype.sql omimPhenotype.tab 

hgsql hg19 -e 'update omimPhenotype set phenotypeClass = -1 where
phenotypeClass=0'
hgsql hg19 -e 'update omimPhenotype set phenotypeId = -1 where phenotypeId=0'

doOmimGene2 hg19 j.tmp
cat j.tmp |sort -u > omimGene2.tab

hgLoadBed hg19 omimGene2 omimGene2.tab

rm j.tmp
##############################################################
# build the omimAvSnp track

cd /hive/data/genomes/hg19/bed/omim/05172011
mkdir av
cd av

# get the mimAV.txt data file from OMIM

cut -f 1 mimAV.txt >j1
cut -f 2 mimAV.txt >j2
cut -f 3  mimAV.txt >j3
cut -f 4  mimAV.txt >j4
cut -f 5  mimAV.txt >j5

cat j1 |sed -e 's/\./\t/' >j1.2

cat j4 |sed -e 's/,/\t/' >j4-2
cut -f 1 j4-2 >j4.1
cut -f 2 j4-2 >j4.2

paste j1 j1.2 j3 j4 j4.1 j4.2 j5 j2 >omimAv.tab

hgsql hg19 -e 'drop table omimAv'
hgsql hg19 < ~/src/hg/lib/omimAv.sql
hgsql hg19 -e \
'load data local infile "omimAv.tab" into table omimAv ignore 1 lines'
hgsql hg19 -e 'update omimAv set repl2 = rtrim(ltrim(repl2))'

doOmimAv hg19 omimAvRepl.tab  2>j.err

hgsql hg19 -e "drop table omimAvRepl"
hgsql hg19 < ~/kent/src/hg/lib/omimAvRepl.sql
hgsql hg19 -e 'load data local infile "omimAvRepl.tab" into table omimAvRepl'

rm j1.2  j1 j2 j3  j4  j4-2  j4.1  j4.2  j5

hgsql hg19 -N -e 'select chrom, chromStart, chromEnd, avId from omimAvRepl r,
snp132 s where s.name = dbSnpId order by avId' >omimAvSnp.tab

hgLoadBed -allowStartEqualEnd  hg19 omimAvSnp omimAvSnp.tab
##############################################################
# build the omimLocation track

cd /hive/data/genomes/hg19/bed/omim/05172011
mkdir location
cd location

doOmimLocation hg19 omimLocation.bed 2>j.err
hgLoadBed hg19 omimLocation omimLocation.bed

# Remove all gene entries in omimGene2 from omimLocation table

hgsql hg19 -N -e \
'delete from omimLocation where name  in (select name from omimGene2) '

# Per OMIM request, delete all the gray entries in omimLocation table.

mkdir cleanUpOmimLocation
cd cleanUpOmimLocation

hgsql hg19 -N -e \
'select distinct name from omimLocation' |sort -u >j.all

hgsql hg19 -N -e \
'select distinct name from omimLocation, omimPhenotype where name=omimId and phenotypeClass=1' >j.1
hgsql hg19 -N -e \
'select distinct name from omimLocation, omimPhenotype where name=omimId and phenotypeClass=2' >j.2
hgsql hg19 -N -e \
'select distinct name from omimLocation, omimPhenotype where name=omimId and phenotypeClass=3' >j.3
hgsql hg19 -N -e \
'select distinct name from omimLocation, omimPhenotype where name=omimId and phenotypeClass=4' >j.4

cat j.1 j.2 j.3 j.4 |sort -u >j.1234

diff j.all j.1234 |grep "<" |sed -e 's/</do1/' >doall

cat << '_EOF_' > do1
hgsql hg19 -e "delete from omimLocation where name='${1}'"
'_EOF_'
# << emacs

./doall

############################################################################
# NUMTS TRACK (DONE 2011-06-03 - Chin)

    mkdir -p /hive/data/outside/Numts/hg19
    cd /hive/data/outside/Numts/hg19
    wget http://193.204.182.50/files/hg19/all_hg19_NumtS_tracks.txt
    wget http://193.204.182.50/files/hg19/HSA_NumtS_hg19_details.html
    wget http://193.204.182.50/files/bam/allNumtS.sorted.bam
    wget http://193.204.182.50/files/bam/allNumtS.sorted.bam.bai

    mkdir /cluster/data/hg19/bed/NumtS
    cd  /cluster/data/hg19/bed/NumtS
    cp /hive/data/outside/Numts/hg19/*.* .


    # split the all_hg19_NumtS_tracks.txt into 3 bed files 
    # numtSAssembled.bed, numtS.bed, an numtSMitochondrion.bed

    cat all_hg19_NumtS_tracks.txt | awk ' /^track name/ {print $_}'  > tracks.list
    cat all_hg19_NumtS_tracks.txt | awk ' /^track type/ {print $_}'  >> tracks.list

    # load the 3 bed files to hg19
    hgLoadBed hg19  numtSAssembled  numtSAssembled.bed
    hgLoadBed hg19 numtS numtS.bed
    hgLoadBed hg19 numtSMitochondrion numtSMitochondrion.bed
    # Make /gbdb/ links and load bam
    mkdir /gbdb/hg19/NumtS
    ln -s `pwd`/allNumtS.sorted.bam{,.bai} /gbdb/hg19/NumtS/
    hgBbiDbLink hg19 bamAllNumtSSorted /gbdb/hg19/NumtS/allNumtS.sorted.bam 

    # setup trackDb for hg19

############################################################################
# Add Gene name search to Ensembl gene track (DONE - 2011-07-22 - Hiram)
    cd /hive/data/genomes/hg19/bed/ensGene.62/process
    cut -f1,9 infoOut.txt | grep -v "^#" | sort > ensemblToGeneName.tab
    NL=`awk '{print length($1)}' ensemblToGeneName.tab | sort -rn | head -1`
    VL=`awk '{print length($2)}' ensemblToGeneName.tab | sort -rn | head -1`
    sed -e "s/ knownTo / ensemblToGeneName /; s/known gene/ensGen/; s/INDEX(name(12)/PRIMARY KEY(name($NL)/; s/value(12)/value($VL)/" \
	$HOME/kent/src/hg/lib/knownTo.sql > ensemblToGeneName.sql

    hgLoadSqlTab hg19 ensemblToGeneName ensemblToGeneName.sql ensemblToGeneName.tab

    # add this search specification to trackDb.ra
searchName ensGeneName
searchTable ensGene
searchType genePred
searchMethod prefix
xrefTable ensemblToGeneName
xrefQuery select name,value from %s where value like '%%%s%%'
searchPriority 50

############################################################################
# COSMIC TRACK (DONE 2011-07-15 Fan)

mkdir /hive/data/outside/cosmic/20110711
# put raw data file, EnsMutExp_v54_080711.csv, received by email to there.

mkdir /hive/data/genomes/hg19/bed/cosmic/20110711
cd /hive/data/genomes/hg19/bed/cosmic/20110711

cp -p /hive/data/outside/cosmic/20110711/EnsMutExp_v54_080711.csv . 

cat EnsMutExp_v54_080711.csv|sed -e 's/\t//g' |sed -e 's/,/\t/g' |\
grep -v COSMIC_MUTATION_ID |grep -v 'selected'|grep COSM
>EnsMutExp_v54_080711.tab

hgsql hg19 -e 'drop table cosmicRaw'
hgsql hg19 < ~/kent/src/hg/lib/cosmicRaw.sql

hgLoadSqlTab hg19 cosmicRaw ~/kent/src/hg/lib/cosmicRaw.sql
EnsMutExp_v54_080711.tab

# use  grch37_start-1 for our zero based chromStart and 
# conver their chr23 and chr24 to chrX and chrY.

hgsql hg19 -N -e 'select "chr", chromosome, grch37_start-1, grch37_stop,
cosmic_mutation_id from cosmicRaw' \
|grep -v NULL |sed -e 's/chr\t/chr/'|sort -u|sed -e 's/chr23/chrX/' |sed -e
's/chr24/chrY/' >cosmic.bed

hgLoadBed -allowStartEqualEnd  hg19 cosmic cosmic.bed

#############################################################################
# HI SEQ DEPTH (DONE 7/15/11 angie)
    mkdir /hive/data/genomes/hg19/bed/hiSeqDepth
    cd /hive/data/genomes/hg19/bed/hiSeqDepth
    foreach cov (001 005 01 05 1)
      wget --timestamp http://eqtl.uchicago.edu/Masking/seq.cov$cov.ONHG19.bed.gz
      gunzip -N seq.cov$cov.ONHG19.bed.gz
    end
    wc -l seq.cov*
#    522 seq.cov001.ONHG19.bed
#   1224 seq.cov005.ONHG19.bed
#   2060 seq.cov01.ONHG19.bed
#  16119 seq.cov05.ONHG19.bed
#  30671 seq.cov1.ONHG19.bed
    foreach cov (001 005 01 05 1)
      echo seq.cov$cov.ONHG19.bed
      featureBits -countGaps hg19 seq.cov$cov.ONHG19.bed
    end
#seq.cov001.ONHG19.bed
#55092 bases of 3137161264 (0.002%) in intersection
#seq.cov005.ONHG19.bed
#175379 bases of 3137161264 (0.006%) in intersection
#seq.cov01.ONHG19.bed
#344425 bases of 3137161264 (0.011%) in intersection
#seq.cov05.ONHG19.bed
#3073270 bases of 3137161264 (0.098%) in intersection
#seq.cov1.ONHG19.bed
#5736695 bases of 3137161264 (0.183%) in intersection
    # Compare hg19 coverage to hg18:
    calc 55092 / 57409
#55092 / 57409 = 0.959640
    calc 175379 / 183848
#175379 / 183848 = 0.953935
    calc 344425 / 362423
#344425 / 362423 = 0.950340
    calc 3073270 / 3462959
#3073270 / 3462959 = 0.887469
    calc 5736695 / 6466376
#5736695 / 6466376 = 0.887158

    # Not all small ones are strict subsets of larger ones.
    featureBits hg19 -countGaps seq.cov001.ONHG19.bed \!seq.cov005.ONHG19.bed
#128 bases of 3137161264 (0.000%) in intersection
    featureBits hg19 -countGaps seq.cov005.ONHG19.bed \!seq.cov01.ONHG19.bed
#222 bases of 3137161264 (0.000%) in intersection
    featureBits hg19 -countGaps seq.cov01.ONHG19.bed \!seq.cov05.ONHG19.bed
#4185 bases of 3137161264 (0.000%) in intersection
    featureBits hg19 -countGaps seq.cov05.ONHG19.bed \!seq.cov1.ONHG19.bed
#41831 bases of 3137161264 (0.001%) in intersection
    # No overlap w/gap track:
    featureBits hg19 -countGaps seq.cov1.ONHG19.bed gap -bed=gapOverlaps.ONHG19.bed
#0 bases of 3137161264 (0.000%) in intersection

    # Load tables:
    hgLoadBed hg19 hiSeqDepthTopPt1Pct seq.cov001.ONHG19.bed
#Loaded 522 elements of size 3
    hgLoadBed hg19 hiSeqDepthTopPt5Pct seq.cov005.ONHG19.bed
#Loaded 1224 elements of size 3
    hgLoadBed hg19 hiSeqDepthTop1Pct seq.cov01.ONHG19.bed
#Loaded 2060 elements of size 3
    hgLoadBed hg19 hiSeqDepthTop5Pct seq.cov05.ONHG19.bed
#Loaded 16119 elements of size 3
    hgLoadBed hg19 hiSeqDepthTop10Pct seq.cov1.ONHG19.bed
#Loaded 30671 elements of size 3

############################################################################
# adding new decode data (DONE - 2011-08-18 - Hiram)
# liftOver from hg18 tracks:
    mkdir /hive/data/outside/decode/hg19
    cd /hive/data/outside/decode/hg19
# some of the items end up overlapping as result of liftOver,
# this filters them out:
export OVERLAPS="9202536|9235404|9225403|9215395|9192536|9182536|9172536|110561813|110582154|110572154|110552191|110542194|110532192|110522233|110512223|110502226|110492220|110482221|110472216|36283195|36273203|36252392|81325902|36262322"

for T in female female_carrier female_noncarrier male male_carrier \
        male_noncarrier sex-averaged sex-averaged_carrier \
        sex-averaged_noncarrier maleFemale
do
liftOver ../hg18/${T}.bedGraph \
        /hive/data/genomes/hg18/bed/liftOver/hg18ToHg19.over.chain.gz \
        ${T}.hg19.txt ${T}.unMapped.bedGraph
wc -l ${T}.hg19.txt ${T}.unMapped.bedGraph
awk '$3-$2 > 8000 && $3-$2 < 12000' ${T}.hg19.txt | sort -k1,1 -k2,2n \
	| egrep -v "${OVERLAPS}" > ${T}.hg19.bedGraph
awk '$3-$2 < 8001 || $3-$2 > 11999' ${T}.hg19.txt >> ${T}.unMapped.bedGraph
awk '$3-$2 > 8000 && $3-$2 < 12000' ${T}.hg19.txt | sort -k1,1 -k2,2n \
    | egrep "${OVERLAPS}" >> ${T}.unMapped.bedGraph
bedGraphToBigWig ${T}.hg19.bedGraph /hive/data/genomes/hg19/chrom.sizes ${T}.bw
wc -l ${T}.hg19.txt ${T}.hg19.bedGraph ${T}.unMapped.bedGraph
done

    # load the bigWig files into SQL table name friendly tables:
    mkdir /gbdb/hg19/decode
for C in female female_carrier female_noncarrier \
	male male_carrier male_noncarrier \
	sex-averaged sex-averaged_carrier sex-averaged_noncarrier
do
    N=${C}
    case ${C} in
        female) N="Female" ;;
        female_carrier) N="FemaleCarrier" ;;
        female_noncarrier) N="FemaleNonCarrier" ;;
        male) N="Male" ;;
        male_carrier) N="MaleCarrier" ;;
        male_noncarrier) N="MaleNonCarrier" ;;
        sex-averaged) N="SexAveraged" ;;
        sex-averaged_carrier) N="SexAveragedCarrier" ;;
        sex-averaged_noncarrier) N="SexAveragedNonCarrier" ;;
    esac
    echo $C $N
    rm -f /gbdb/hg19/decode/${C}.bw /gbdb/hg19/decode/${N}.bw
    ln -s `pwd`/${C}.bw /gbdb/hg19/decode/${N}.bw
    hgsql -e "drop table decode${N};" hg19
    hgBbiDbLink hg19 decode${N} /gbdb/hg19/decode/${N}.bw
done

    # compute male - female difference
    awk '{printf "%s_%d_%d\t%s\n", $1, $2, $3, $4}' male.hg19.bedGraph \
	| sort > ordered.male.txt
    awk '{printf "%s_%d_%d\t%s\n", $1, $2, $3, $4}' female.hg19.bedGraph \
	| sort > ordered.female.txt
    join ordered.male.txt ordered.female.txt > maleFemale.txt

    awk '{printf "%s\t%.6f\n", $1, $2-$3}' maleFemale.txt \
	| sed -e "s/_/\t/g" | sort -k1,1 -k2,2n > maleFemale.bedGraph
    # same result as what was lifted:
    sum maleFemale.hg19.bedGraph maleFemale.bedGraph
    #	14015  7950 maleFemale.hg19.bedGraph
    #	14015  7950 maleFemale.bedGraph


    # and hot spots
    awk '$4 > 9.99' female.hg19.bedGraph > hotSpotFemale.bed
    awk '$4 > 9.99' male.hg19.bedGraph > hotSpotMale.bed

    hgLoadBed hg19 decodeHotSpotFemale hotSpotFemale.bed
    #	Loaded 4135 elements of size 4
    hgLoadBed hg19 decodeHotSpotMale hotSpotMale.bed
    #	Loaded 4771 elements of size 4

    bedGraphToBigWig maleFemale.bedGraph /hive/data/genomes/hg19/chrom.sizes \
        MaleFemaleDifference.bw
    ln -s `pwd`/MaleFemaleDifference.bw /gbdb/hg19/decode/
    hgsql -e "drop table decodeMaleFemaleDifference;" hg19
    hgBbiDbLink hg19 decodeMaleFemaleDifference /gbdb/hg19/decode/MaleFemaleDifference.bw


#############################################################################
# DBSNP B134 / SNP134 (DONE 9/1/11)
# Redmine #5133
# Originally run 8/30/11; re-run 9/1/11 to incorporate 1000 Genomes frequency data
# that dbSNP had moved out to a new database table, SNPAlleleFreq_TGP.
    mkdir -p /hive/data/outside/dbSNP/134/human
    cd /hive/data/outside/dbSNP/134/human
    # Look at the directory listing of ftp://ftp.ncbi.nih.gov/snp/database/organism_data/
    # to find the subdir name to use as orgDir below (human_9606 in this case).
    # Then click into that directory and look for file names like
    #    b(1[0-9][0-9])_*_([0-9]+_[0-9])
    # -- use the first num for build and the second num_num for buildAssembly.
    # jkStuff/liftContigs.lft maps NCBI contig names to chroms; use that for liftUp.
    #
    # Some trial and error was required to get the config.ra just right -- assembly
    # label now has ".p2" at end, and GRCh37 patch contigs needed to be filtered out:
    cat > config.ra <<EOF
db hg19
orgDir human_9606
build 134
buildAssembly 37_2
liftUp /hive/data/genomes/hg19/jkStuff/liftContigs.lft
refAssemblyLabel GRCh37.p2
ignoreDbSnpContigs NW_0033159[0-9][0-9]
EOF
    # << emacs
    ~/kent/src/hg/utils/automation/doDbSnp.pl config.ra >& do.log & tail -f do.log


#############################################################################
# FILTER SNP134 (DONE 9/2/11 angie)
    # Redmine #5133
    # Make several tracks that are filtered subsets of snp134:
    # First, filter out the multiply-aligned and/or weight >1 SNPs -> snp134Mult
    # Second, siphon off the common variants -> snp134Common
    # Third, take the (uniquely mapped, not known to be common) variants
    # w/dbSNP's "clinically-assoc" flag -> snp134Flagged
    cd /hive/data/outside/dbSNP/134/human
    zcat snp134.bed.gz \
    | perl -we \
      '$minTotal2N = 10; \
       ($multCount, $comCount, $flagCount, $miscCount) = (0,0,0,0); \
       open($mult, "| gzip -c > snp134Mult.bed.gz") || die; \
       open($common,    "| gzip -c > snp134Common.bed.gz") || die; \
       open($flagged,   "| gzip -c > snp134Flagged.bed.gz") || die; \
       open($misc,      "| gzip -c > snp134Misc.bed.gz") || die; \
       while (<>) { \
         @w = split("\t"); \
         if ($w[16] > 1 || $w[17] =~ /MultipleAlignments/) { \
           print $mult $_; \
           $multCount++; \
         } else { \
           my ($alleleFreqCount, $nStr, $freqStr) = ($w[20], $w[22], $w[23]); \
           my @alNs = split(",", $nStr);      die unless scalar(@alNs) == $alleleFreqCount; \
           my @freqs = split(",", $freqStr);  die unless scalar(@freqs) == $alleleFreqCount; \
           my ($total2N, $maxAlleleFreq) = (0, 0); \
           for (my $i = 0;  $i < $alleleFreqCount;  $i++) { \
             $total2N += $alNs[$i]; \
             $maxAlleleFreq = $freqs[$i] if ($freqs[$i] > $maxAlleleFreq); \
           } \
           if ($alleleFreqCount >= 2 && $total2N >= $minTotal2N && $maxAlleleFreq <= 0.99) { \
             print $common $_; \
             $comCount++; \
           } elsif($w[24] =~ /clinically-assoc/)  { \
             print $flagged $_; \
             $flagCount++; \
           } else { \
             print $misc $_; \
             $miscCount++; \
           } \
         } \
       } \
       close($mult);  close($common); close($flagged);  close($misc); \
       print "snp134Mult:    $multCount\nsnp134Common:  $comCount\nsnp134Flagged: $flagCount\n" . \
             "leftover:      $miscCount\n";'
#snp134Mult:    3603177
#snp134Common:  13413905
#snp134Flagged: 26496
#leftover:      26910747

    # Load tables
    foreach subset (Mult Common Flagged)
      hgLoadBed -tab -onServer -tmpDir=/data/tmp -allowStartEqualEnd -renameSqlTable \
        hg19 snp134$subset -sqlTable=snp134.sql snp134$subset.bed.gz
    end


#############################################################################
# SNP134 ORTHOLOGOUS ALLELES IN CHIMP, ORANG, MACAQUE (DONE 9/2/11 angie)
    mkdir /hive/data/genomes/hg19/bed/snp134Ortho
    cd /hive/data/genomes/hg19/bed/snp134Ortho
    # Filter snp134 to to keep only uniquely mapped biallelic SNVs (class=single, length=1);
    zcat /hive/data/outside/dbSNP/134/human/snp134.bed.gz \
    | awk '$18 ~ /^MultipleAlignments|SingleClassTriAllelic|SingleClassQuadAllelic/ {print $4;}' \
    | sort -u \
      > snp134ExcludeIds.txt
    wc -l snp134ExcludeIds.txt
#1178007 snp134ExcludeIds.txt
    zcat /hive/data/outside/dbSNP/134/human/snp134.bed.gz \
    | awk '$3-$2 == 1 && $11 == "single" {print;}' \
    | grep -vFwf snp134ExcludeIds.txt \
      > snp134Simple.bed
    wc -l snp134Simple.bed
#32818637 snp134Simple.bed

    # Glom all human info that we need for the final table onto the
    # name, to sneak it through liftOver: rsId|chr|start|end|obs|ref|strand
    awk 'BEGIN{OFS="\t";} \
        {print $1, $2, $3, \
               $4 "|" $1 "|" $2 "|" $3 "|" $9 "|" $8 "|" $6, \
               0, $6;}' \
      snp134Simple.bed > snp134ForLiftOver.bed

    # Map coords to chimp using liftOver.
    mkdir run.liftOChimp
    cd run.liftOChimp
    mkdir split out
    splitFile ../snp134ForLiftOver.bed 10000 split/chunk
    cp /dev/null jobList
    foreach f (split/chunk*)
      echo liftOver $f \
        /hive/data/genomes/hg19/bed/liftOver/hg19ToPanTro3.over.chain.gz \
        \{check out exists out/panTro3.$f:t.bed\} out/hg19.$f:t.unmapped \
        >> jobList
    end
    ssh swarm
    cd /hive/data/genomes/hg19/bed/snp134Ortho/run.liftOChimp
    para make jobList
#Completed: 3282 of 3282 jobs
#CPU time in finished jobs:     314951s    5249.18m    87.49h    3.65d  0.010 y
#IO & Wait Time:                 32669s     544.49m     9.07h    0.38d  0.001 y
#Average job time:                 106s       1.77m     0.03h    0.00d
#Longest finished job:             268s       4.47m     0.07h    0.00d
#Submission to last job:           444s       7.40m     0.12h    0.01d

    # Map coords to orangutan using liftOver.
    mkdir ../run.liftOPon
    cd ../run.liftOPon
    mkdir out
    ln -s ../run.liftOChimp/split .
    cp /dev/null jobList
    foreach f (split/chunk*)
      echo liftOver $f \
        /hive/data/genomes/hg19/bed/liftOver/hg19ToPonAbe2.over.chain.gz \
        \{check out exists out/ponAbe2.$f:t.bed\} out/hg19.$f:t.unmapped \
        >> jobList
    end
    para make jobList
#Completed: 3282 of 3282 jobs
#CPU time in finished jobs:     681601s   11360.02m   189.33h    7.89d  0.022 y
#IO & Wait Time:                 57733s     962.21m    16.04h    0.67d  0.002 y
#Average job time:                 225s       3.75m     0.06h    0.00d
#Longest finished job:             586s       9.77m     0.16h    0.01d
#Submission to last job:          1598s      26.63m     0.44h    0.02d

    # Map coords to macaque using liftOver.
    mkdir ../run.liftOMac
    cd ../run.liftOMac
    mkdir out
    ln -s ../run.liftOChimp/split .
    cp /dev/null jobList
    foreach f (split/chunk*)
      echo liftOver $f \
        /hive/data/genomes/hg19/bed/liftOver/hg19ToRheMac2.over.chain.gz \
        \{check out exists out/rheMac2.$f:t.bed\} out/hg19.$f:t.unmapped \
        >> jobList
    end
    para make jobList
#Completed: 3282 of 3282 jobs
#CPU time in finished jobs:     826108s   13768.47m   229.47h    9.56d  0.026 y
#IO & Wait Time:                 68165s    1136.08m    18.93h    0.79d  0.002 y
#Average job time:                 272s       4.54m     0.08h    0.00d
#Longest finished job:             679s      11.32m     0.19h    0.01d
#Submission to last job:          1775s      29.58m     0.49h    0.02d

    cd /hive/data/genomes/hg19/bed/snp134Ortho
    # Concatenate the chimp results, sorting by chimp pos in order to
    # efficiently access 2bit sequence in getOrthoSeq.  The output of
    # that is then sorted by the glommed human info field, so that we
    # can use join to combine chimp and macaque results in the next step.
    # Ditto for macaque and orangutan.  Each command pipe takes ~6 minutes:
    sort -k1,1 -k2n,2n run.liftOChimp/out/panTro3.chunk*.bed \
    | ~/kent/src/hg/snp/snpLoad/getOrthoSeq.pl /cluster/data/panTro3/panTro3.2bit \
    | sort > panTro3.orthoGlom.txt
    sort -k1,1 -k2n,2n run.liftOPon/out/ponAbe2.chunk*.bed \
    | ~/kent/src/hg/snp/snpLoad/getOrthoSeq.pl /cluster/data/ponAbe2/ponAbe2.2bit \
    | sort > ponAbe2.orthoGlom.txt
    sort -k1,1 -k2n,2n run.liftOMac/out/rheMac2.chunk*.bed \
    | ~/kent/src/hg/snp/snpLoad/getOrthoSeq.pl /cluster/data/rheMac2/rheMac2.2bit \
    | sort > rheMac2.orthoGlom.txt
    wc -l panTro3.orthoGlom.txt ponAbe2.orthoGlom.txt rheMac2.orthoGlom.txt
#30880910 panTro3.orthoGlom.txt
#29376791 ponAbe2.orthoGlom.txt
#26505681 rheMac2.orthoGlom.txt

    # Use the glommed name field as a key to join up chimp and macaque
    # allele data.  Include glommed name from both files because if only
    # file 2 has a line for the key in 2.1, then 1.1 is empty.  Then plop
    # in the orthoGlom fields from each file, which are in the same order
    # as the chimp and macaque columns of snp134OrthoPanTro2RheMac2.
    join -o '1.1 2.1 1.2 1.3 1.4 1.5 1.6 2.2 2.3 2.4 2.5 2.6' \
      -a 1 -a 2 -e '?' \
      panTro3.orthoGlom.txt ponAbe2.orthoGlom.txt \
    | awk '{if ($1 != "?") { print $1, $3,$4,$5,$6,$7,$8,$9,$10,$11,$12; } \
            else           { print $2, $3,$4,$5,$6,$7,$8,$9,$10,$11,$12; }}' \
      > tmp.txt
    join -o '1.1 2.1 1.2 1.3 1.4 1.5 1.6 1.7 1.8 1.9 1.10 1.11 2.2 2.3 2.4 2.5 2.6' \
      -a 1 -a 2 -e '?' \
      tmp.txt rheMac2.orthoGlom.txt \
    | perl -wpe 'chomp; \
        ($glom12, $glom3, $o1Chr, $o1Start, $o1End, $o1Al, $o1Strand, \
         $o2Chr, $o2Start, $o2End, $o2Al, $o2Strand, \
         $o3Chr, $o3Start, $o3End, $o3Al, $o3Strand) = split; \
        $glomKey = ($glom12 ne "?") ? $glom12 : $glom3; \
        ($rsId, $hChr, $hStart, $hEnd, $hObs, $hAl, $hStrand) = \
          split(/\|/, $glomKey); \
        $o1Start =~ s/^\?$/0/;  $o2Start =~ s/^\?$/0/;  $o3Start =~ s/^\?$/0/; \
        $o1End   =~ s/^\?$/0/;  $o2End   =~ s/^\?$/0/;  $o3End   =~ s/^\?$/0/; \
        print join("\t", $hChr, $hStart, $hEnd, $rsId, $hObs, $hAl, $hStrand, \
                         $o1Chr, $o1Start, $o1End, $o1Al, $o1Strand, \
                         $o2Chr, $o2Start, $o2End, $o2Al, $o2Strand, \
                         $o3Chr, $o3Start, $o3End, $o3Al, $o3Strand) . "\n"; \
        s/^.*$//;' \
    | sort -k1,1 -k2n,2n > snp134OrthoPt3Pa2Rm2.bed
    hgLoadBed -tab -onServer -tmpDir=/data/tmp -renameSqlTable \
      -sqlTable=$HOME/kent/src/hg/lib/snpOrthoPanPonRhe.sql \
      hg19 snp134OrthoPt3Pa2Rm2 snp134OrthoPt3Pa2Rm2.bed
#Loaded 31924973 elements of size 22
    # Cleanup:
    rm -r run*/split tmp.txt *.orthoGlom.txt bed.tab
    gzip snp134Simple.bed snp134ExcludeIds.txt snp134ForLiftOver.bed &


############################################################################
# DBSNP CODING ANNOTATIONS (134) (DONE 8/30/11 angie)
# It wasn't necessary to redo this following the 9/1 re-run of doDbSnp.pl because
# that simply picked up new allele frequency info, no change to exceptions etc.
    cd /hive/data/outside/dbSNP/134/human
    # ncbiFuncAnnotations.txt has NCBI coords: 0-based, fully closed.
    # For anything except an insertion (0 bases between flanks),
    # we need to add 1 to the end coord.  For an insertion, we need
    # to add 1 to the start coord.  Make a hash of the insertion IDs,
    # then look up each ID in ncbiFuncAnnotations.txt to tell which
    # transform to apply.
    # Note: sort -u with the keys below is too restrictive -- we need full line uniq.
    zcat ncbiFuncAnnotations.txt.gz \
    | perl -we 'open($IDS, "zcat ncbiFuncInsertions.ctg.bed.gz |") || die "ids: $!"; \
              while (<$IDS>) { chomp; $ids{$_} = 1; } \
              close($IDS); \
              %coding = (2=>1, 3=>1, 4=>1, 8=>1, 9=>1, 41=>1, 42=>1, 44=>1, 45=>1); \
              while (<>) { \
                chomp;  @w = split("\t"); # id, ctg, start, end, ... \
                next unless $coding{$w[5]}; \
                $bed4 = join("\t", $w[1], $w[2], $w[3], $w[0]); \
                if (exists $ids{$bed4} && $w[3] == $w[2]+1) { \
                  $w[2]++; # 2-base insertions: increment start coord \
                } else { \
                  $w[3]++; # increment end coord to get half-open \
                } \
                print join("\t", @w) . "\n"; \
              }' \
    | sort -k1n,1n -k2,2 -k3n,3n -k5,5 -k6n,6n \
    | uniq \
      > ncbiCodingAnnotations.txt
    wc -l ncbiCodingAnnotations.txt
#2510704 ncbiCodingAnnotations.txt
    # How many & what kinds of function types?
    cut -f 6 ncbiCodingAnnotations.txt \
    | sort -n | uniq -c
# 461567 3   (coding-synon)
#1244159 8   (cds-reference -- ignored)
#  21296 41  (nonsense)
# 729942 42  (missense)
#  52778 44  (frameshift)
#    962 45  (cds-indel)
    # Gather up multiple annotation lines into one line per {snp, gene, frame}:
    perl -e  'while (<>) { chomp; \
                my ($rsId, $ctg, $s, $e, $txId, $fxn, $frm, $nt, $aa, $codon) = split("\t"); \
                if (defined $lastRs && \
                    ($lastRs != $rsId || $lastCtg ne $ctg || $lastS != $s || \
                     $lastTx ne $txId || $lastFrm ne $frm)) { \
                  if (defined $refRow) { \
                    $fxns = "$refRow->[0],$fxns";  $nts = "$refRow->[1],$nts"; \
                    $aas = "$refRow->[2],$aas";    $codons = "$refRow->[3],$codons"; \
                  } \
                  print "$lastCtg\t$lastS\t$lastE\trs$lastRs\t$lastTx\t$lastFrm\t" . \
                        "$count\t$fxns\t$nts\t$codons\t$aas\n"; \
                  $refRow = undef;  @rows = ();  ($count, $fxns, $nts, $codons, $aas) = (); \
                } \
                ($lastRs, $lastCtg, $lastS, $lastE, $lastTx, $lastFrm) = \
                    ($rsId, $ctg, $s, $e, $txId, $frm); \
                $count++; \
                if ($fxn == 8) { \
                  $refRow = [$fxn, $nt, $aa, $codon]; \
                } else { \
                 $fxns .= "$fxn,";  $nts .= "$nt,";  $aas .= "$aa,";  $codons .= "$codon,"; \
                } \
              } \
              if (defined $refRow) { \
                $fxns = "$refRow->[0],$fxns";  $nts = "$refRow->[1],$nts"; \
                $aas = "$refRow->[2],$aas";    $codons = "$refRow->[3],$codons"; \
              } \
              print "$lastCtg\t$lastS\t$lastE\trs$lastRs\t$lastTx\t$lastFrm\t" . \
                    "$count\t$fxns\t$nts\t$codons\t$aas\n";' \
      ncbiCodingAnnotations.txt \
    | liftUp snp134CodingDbSnp.bed /hive/data/genomes/hg19/jkStuff/liftContigs.lft warn stdin
    hgLoadBed hg19 snp134CodingDbSnp -sqlTable=$HOME/kent/src/hg/lib/snp125Coding.sql \
      -renameSqlTable -tab -notItemRgb -allowStartEqualEnd \
      snp134CodingDbSnp.bed
#Loaded 1244179 elements of size 11


############################################################################
# SNPMASKED SEQUENCE FOR SNP134 (DONE 8/30/11 angie)
# It wasn't necessary to redo this following the 9/1 re-run of doDbSnp.pl because
# that simply picked up new allele frequency info, no change to exceptions etc.
    mkdir /hive/data/genomes/hg19/snp134Mask
    cd /hive/data/genomes/hg19/snp134Mask
    # Identify rsIds with various problems -- we will exclude those.
    zcat /hive/data/outside/dbSNP/134/human/snp134.bed.gz \
    | awk '$18 ~ /MultipleAlignments|ObservedTooLong|ObservedWrongFormat|ObservedMismatch|MixedObserved/ {print $4;}' \
      | sort -u \
      > snp134ExcludeRsIds.txt
    zcat /hive/data/outside/dbSNP/134/human/snp134.bed.gz \
    | grep -vFwf snp134ExcludeRsIds.txt \
      > snp134Cleaned.bed
    wc -l snp134Cleaned.bed
#37853186 snp134Cleaned.bed

    # Substitutions:
    mkdir substitutions
    snpMaskSingle snp134Cleaned.bed /hive/data/genomes/hg19/hg19.2bit stdout diffObserved.txt \
    | faSplit byname stdin substitutions/
#Masked 32668329 snps in 32666100 out of 3131050506 genomic bases
#/hive/data/genomes/hg19/hg19.2bit has 3137161264 total bases, but the total number of bases in sequences for which we masked snps is 3131050506 (difference is 6110758)
    # warnings about differing observed strings at same base position:
    wc -l diffObserved.txt
#2545 diffObserved.txt
    # Check that 6110758 is the total #bases in sequences with nothing in snp134Cleaned:
    grep -Fw single snp134Cleaned.bed | cut -f 1 | uniq > /data/tmp/1
    grep -vwf /data/tmp/1 ../chrom.sizes \
    | awk 'BEGIN {TOTAL = 0;}  {TOTAL += $2;}  END {printf "%d\n", TOTAL;}'
#6110758
    # Make sure that sizes are identical, first diffs are normal -> IUPAC,
    # and first diffs' case is preserved:
    foreach f (substitutions/chr*.fa)
      faCmp -softMask $f ../[1-9UMXY]*/$f:t |& grep -v "that differ"
    end
#chr1 in substitutions/chr1.fa differs from chr1 at ../1/chr1.fa at base 10233 (y != c)
#chr10 in substitutions/chr10.fa differs from chr10 at ../10/chr10.fa at base 60522 (K != T)
#...
#(output OK -- ambiguous bases replacing [agct] at SNP positions)
    foreach f (substitutions/chr*.fa)
      echo $f:t:r
      mv $f $f:r.subst.fa
      gzip $f:r.subst.fa &
    end

    # Insertions & deletions not done.  To date we have only offered substs for download.
    # If there is user demand, use template from snp131 above.

    # Clean up and prepare for download:
    gzip snp134Cleaned.bed &
    foreach d (substitutions)
      pushd $d
        md5sum *.gz > md5sum.txt
        cp /hive/data/genomes/hg19/snp132Mask/$d/README.txt .
      popd
    end
    # Edit the README.txt.

    # Create download links on hgwdev.
    mkdir /usr/local/apache/htdocs-hgdownload/goldenPath/hg19/snp134Mask
    ln -s /hive/data/genomes/hg19/snp134Mask/substitutions/* \
      /usr/local/apache/htdocs-hgdownload/goldenPath/hg19/snp134Mask/


#############################################################################
# LASTZ X. tropicalis XenTro3 (DONE - 2011-09-20 - Hiram)
    mkdir /hive/data/genomes/hg19/bed/lastzXenTro3.2011-09-20
    cd /hive/data/genomes/hg19/bed/lastzXenTro3.2011-09-20

    cat << '_EOF_' > DEF
# human vs X. tropicalis
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_Q=/scratch/data/blastz/HoxD55.q

# TARGET: Human hg19
SEQ1_DIR=/scratch/data/hg19/nib
SEQ1_LEN=/scratch/data/hg19/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000

# QUERY: Frog xenTro3
SEQ2_DIR=/scratch/data/xenTro3/xenTro3.2bit
SEQ2_LEN=/scratch/data/xenTro3/chrom.sizes
SEQ2_CHUNK=20000000
SEQ2_LAP=0
SEQ2_LIMIT=100

BASE=/hive/data/genomes/hg19/bed/lastzXenTro3.2011-09-20
TMPDIR=/scratch/tmp
'_EOF_'
    # << happy emacs

    #	establish a screen to control this job
    screen
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
	`pwd`/DEF \
	-chainMinScore=5000 -chainLinearGap=loose \
	-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
	> do.log 2>&1 &
    #	real    395m51.626s
    cat fb.hg19.chainXenTro3Link.txt  
    #	87928753 bases of 2897316137 (3.035%) in intersection
    cd /hive/data/genomes/hg19/bed
    ln -s lastzXenTro3.2011-09-20 lastz.xenTro3

    #	running the swap - DONE - 2011-09-21
    mkdir /hive/data/genomes/xenTro3/bed/blastz.hg19.swap
    cd /hive/data/genomes/xenTro3/bed/blastz.hg19.swap
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
	/hive/data/genomes/hg19/bed/lastzXenTro3.2011-09-20/DEF \
	-chainMinScore=5000 -chainLinearGap=loose \
	-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
	-swap > swap.log 2>&1 &
    #	real    37m7.001s
    cat fb.xenTro3.chainHg19Link.txt 
    #	90929066 bases of 1358334882 (6.694%) in intersection

############################################################################
# GENEREVIEWS TRACK (DONE 2011-09-22 - Chin)
    mkdir /hive/data/genomes/hg19/bed/geneReviews
    mkdir -p /hive/data/outside/ncbi/geneReviews/current
    cd /hive/data/outside/ncbi/geneReviews/current
    wget --timestamping ftp://ftp.ncbi.nih.gov/pub/GeneTests/README.html
    wget --timestamping ftp://ftp.ncbi.nih.gov/pub/GeneTests/*.txt
    # Note: the report *.txt are updated daily
    wc -l *.txt
    219 README.html
    #   3043 data_to_build_custom_reports.txt
    #   7245 disease_OMIM.txt
    #   3729 disease_gene_GR.txt
    #   1415 disease_hierarchy.txt

    cp disease_gene_GR.txt /hive/data/genomes/hg19/bed/geneReviews/.
    cd /hive/data/genomes/hg19/bed/geneReviews
    cat disease_gene_GR.txt | grep -v "^#" \
     | awk  -F'|' '{if ($3!="-" && $3!="Not applicable" && $4!="-") \
      printf  "%s\t%s\t%s\t%s\n", $3, $4, $1, $2}' | sort -k1 > grRefGeneData.tab
    # Create geneReviewsRefGene table
    cat << '_EOF_' > $HOME/kent/src/hg/lib/geneReviewsRefGene.sql
CREATE TABLE geneReviewsRefGene (
    geneSymbol  varchar(255) not null,   # refSeq gene symbol         
    grShort     varchar(255) not null,   # short name for GeneReviews article
    diseaseID   int unsigned not null,   # Disease ID of the review article
    diseaseName varchar(255) not null,   # Disease name of the review article
    index (geneSymbol)
);
'_EOF_'
    # << happy emacs

    # load RefSeg Gene to geneReview mapping list to hg19
    hgLoadSqlTab -warn hg19 geneReviewsRefGene \
      $HOME/kent/src/hg/lib/geneReviewsRefGene.sql grRefGeneData.tab
    # Scanning through 1 files

    # Generate a list of refSeq genes thah have geneReview assoicate with it.
    cat  grRefGeneData.tab | awk -F'\t' '{printf  "%s\n", $1}' \
       | sort  | uniq  > grRefGene.lst
    wc -l *.*
    #   3729 disease_gene_GR.txt
    #   1111 grData.tab
    #    946 grRefGene.lst
  
    # for each refGen in grRefGene.lst, create a non-overlapping bed row.
    cat grRefGene.lst | while read G
      do
        echo ${G}
        hgsql hg19 -N -e \
          "SELECT e.chrom,e.txStart,e.txEnd,j.geneSymbol \
          FROM knownGene e, kgXref j WHERE e.alignID = j.kgID AND \
          j.geneSymbol ='${G}' ORDER BY e.chrom,e.txStart;" > temp.in
        bedRemoveOverlap temp.in temp.out
        cat temp.out >> geneReviews.tab
      done
    rm temp.*

    # load the collapsed bed4 file to hg19,
    hgLoadBed hg19 geneReviews geneReviews.tab

    # addGeneReviewToBed.pl will add the geneReviews detail in html format to 
    # the bed 4 file
    cat << '_EOF_' > $HOME/kent/src/utils/geneReviews/addGeneReviewToBed.pl
#!/usr/bin/perl
use warnings;
use strict;
sub usage() {
    print "usage: ./addGRtoBed.pl dbName > outputFile\n";
}
my $argc=scalar(@ARGV);
if ($argc != 1) {
    usage; die "ERROR: Please supply a database name for results.\n";
}
#get the list of (unique) of gene symbols form geneReviews table
my @geneReviews = split('\n',
 `hgsql -N -e "select chrom, chromStart, chromEnd, name from geneReviews;" $ARGV[0]`);
my $grBed;
my $clickMsg = " (Click links below to search GeneReviews or GeneTests)";
my $firstTime = 1;
my $field;
my $details;
foreach $grBed(@geneReviews) {
   $details = "";
   my @col = split(/\t/, $grBed);
   #print "Processing name: ", $col[3], " <BR>";
   my @grShort = split('\n',
    `hgsql -N -e 'select  grShort, diseaseID, diseaseName from geneReviewsRefGene where geneSymbol="$col[3]"' $ARGV[0]`);
   $firstTime = 1;
   my $count = scalar(@grShort);
   my $i;
   my $j;
   for ($i=0; $i < $count; $i++) {
       my @f5 = split(/\t/, $grShort[$i]);
   if ($firstTime == 1) {
      $firstTime=0;
      $details = "";
      $details = "<BR><B>GeneReview available for " . $col[3] .  ": </B> " . $clickMsg . "<BR>";
      $details .= "<PRE><TT>";
      $details .= "Short name    Disease ID     GeneTests disease name<BR>";
      $details .= "-----------------------------------------------------------";
      $details .= "-----------------------------------------------------------";
      $details .= "----------------------------------<BR>";
      } 
      $details .= "<A HREF=\"http://www.ncbi.nlm.nih.gov/books/n/gene/" . $f5[0] . "\" TARGET=_blank><B>" . $f5[0] . "</B></A>";
      if (length($f5[0]) <= 15) {
        for ($j = 0; $j <  15-length($f5[0]); $j ++ )
           {
              $details .= " ";
           }
         }
       $details .=  $f5[1] . "       ";
       $details .= "<A HREF=\"http://www.ncbi.nlm.nih.gov/sites/GeneTests/review/disease/" . $f5[2] . "?db=genetests&search_param==begins_with\" TARGET=_blank>" . $f5[2]  ."<BR>";
     }
       $details .= "</TT></PRE><BR>";
       print $col[0], "\t", $col[1], "\t", $col[2], "\t", $col[3], "\t", $details, "\n";
  }
'_EOF_'
    # << happy emacs
    chmod +x $HOME/kent/src/utils/geneReviews/addGeneReviewToBed.pl
    # Add geneReview item in html format format as field 5
    $HOME/kent/src/utils/geneReviews//addGeneReviewToBed.pl hg19 > hg19.geneReviews.bed5

    # Convert to bigBed format 
    cat << '_EOF_' > $HOME/kent/src/hg/lib/geneReviewsBed5.as
table geneReviewsBed5
"GeneReviews  bigBed 4 + with extra field for detail page"
    (
    string chrom;      "Reference sequence chromosome or scaffold"
    uint   chromStart; "Start position in chromosome"
    uint   chromEnd;   "End position in chromosome"
    string name;       "Short Name of item"
    lstring description; "geneReviews item details in html"
    )
'_EOF_'
    # << happy emacs

    /cluster/bin/x86_64/bedToBigBed -bedFields=4 -tabs \
     -as=$HOME/kent/src/hg/lib/geneReviewsBed5.as  hg19.geneReviews.bed5 \
     /hive/data/genomes/hg19/chrom.sizes hg19.geneReviews.bb

    # upload the bigBed file to genomewiki
    /cluster/bin/scripts/gwUploadFile hg19.geneReviews.bb hg19.geneReviews.bb
    #  UploadFile hg19.geneReviews.bb hg19.geneReviews.bb
    #  # loading file: hg19.geneReviews.bb
    #  # into Image name: Hg19.geneReviews.bb
    #  # login name: chinhli
    #  # siteUrl: genomewiki.ucsc.edu
    #  # traceBackLimit: 0
    #  # traceBackLimit: 0 past site.Images
    #  Image info: {u'comment': u'gwUploadFile upload', u'sha1': u'643966466503bd6770f67d49397f4b94174beed2', u'url': u'http://genomewiki.ucsc.edu/images/b/b9/Hg19.geneReviews.bb', u'timestamp': u'2011-09-22T17:40:56Z', u'metadata': None, u'height': 0, u'width': 0, u'user': u'Chinhli', u'descriptionurl': u'http://genomewiki.ucsc.edu/index.php/File:Hg19.geneReviews.bb', u'size': 174667}
    #  Image File:Hg19.geneReviews.bb usage:

    cat << '_EOF_' > $HOME/kent/src/hg/lib/geneReviewsBB.sql
# sql to create geneReviewsBB table 
DROP TABLE IF EXISTS geneReviewsBB;
CREATE TABLE geneReviewsBB (
    fileName varchar(255) not null      # geneReviews.bb location (url)
);
'_EOF_' 
    # << happy emacs
    hgsql hg19 -e "source $HOME/kent/src/hg/lib/geneReviewsBB.sql;"
    hgsql hg19 -e 'insert into geneReviewsBB values ("http://genomewiki.ucsc.edu/images/b/b9/Hg19.geneReviews.bb");'


##############################################################################
# hgPal downloads redone for new knownGene (re-DONE 2012-01-06 braney)
#   FASTA from 46way for  knownGene, knownCanonical 

    ssh hgwdev
    screen
    bash
#    rm -rf /cluster/data/hg19/bed/multiz46way/pal
#    mkdir /cluster/data/hg19/bed/multiz46way/pal
    cd /cluster/data/hg19/bed/multiz46way/pal
    for i in `cat ../species.list`; do echo $i; done > order.lst

    mz=multiz46way
    gp=knownGene
    db=hg19
    mkdir exonAA exonNuc ppredAA ppredNuc
    for j in `sort -nk 2 /cluster/data/$db/chrom.sizes | awk '{print $1}'`
    do
	echo "date"
	echo "mafGene -chrom=$j  $db $mz $gp order.lst stdout | \
	    gzip -c > ppredAA/$j.ppredAA.fa.gz"
	echo "mafGene -chrom=$j -noTrans $db $mz $gp order.lst stdout | \
	    gzip -c > ppredNuc/$j.ppredNuc.fa.gz"
	echo "mafGene -chrom=$j -exons -noTrans $db $mz $gp order.lst stdout | \
	    gzip -c > exonNuc/$j.exonNuc.fa.gz"
	echo "mafGene -chrom=$j -exons $db $mz $gp order.lst stdout | \
	    gzip -c > exonAA/$j.exonAA.fa.gz"
    done > $gp.$mz.jobs

    time sh -x $gp.$mz.jobs > $gp.$mz.job.log 2>&1 &
    sleep 1
    tail -f $gp.$mz.job.log

# real    205m16.105s
# user    36m32.438s
# sys     6m12.046s

    mz=multiz46way
    gp=knownGene
    db=hg19

    zcat exonAA/c*.gz | gzip -c > $gp.$mz.exonAA.fa.gz
    zcat exonNuc/c*.gz | gzip -c > $gp.$mz.exonNuc.fa.gz
    zcat ppredAA/c*.gz | gzip -c > $gp.$mz.ppredAA.fa.gz
    zcat ppredNuc/c*.gz | gzip -c > $gp.$mz.ppredNuc.fa.gz

    rm -rf exonAA exonNuc ppredAA ppredNuc

    mz=multiz46way
    gp=knownGene
    db=hg19
    pd=/usr/local/apache/htdocs-hgdownload/goldenPath/$db/$mz/alignments
    mkdir -p $pd
    ln -s `pwd`/$gp.$mz.exonAA.fa.gz $pd/$gp.exonAA.fa.gz
    ln -s `pwd`/$gp.$mz.exonNuc.fa.gz $pd/$gp.exonNuc.fa.gz

    # now do the canonical set
    cd /cluster/data/hg19/bed/multiz46way/pal
    mz=multiz46way
    gp=knownCanonical
    db=hg19
    for j in `awk '{print $1}' /cluster/data/hg19/chrom.sizes`
    do
	echo "select chrom, chromStart, chromEnd, transcript from knownCanonical where chrom='$j'" | hgsql $db | tail -n +2 > $j.known.bed
    done

    mkdir exonAA exonNuc ppredAA ppredNuc
    for j in `sort -nk 2 /cluster/data/$db/chrom.sizes | awk '{print $1}'`
    do
	echo "date"
	echo "mafGene -geneBeds=$j.known.bed  $db $mz knownGene order.lst stdout | \
	    gzip -c > ppredAA/$j.ppredAA.fa.gz"
	echo "mafGene -geneBeds=$j.known.bed -noTrans $db $mz knownGene order.lst stdout | \
	    gzip -c > ppredNuc/$j.ppredNuc.fa.gz"
	echo "mafGene -geneBeds=$j.known.bed -exons -noTrans $db $mz knownGene order.lst stdout | \
	    gzip -c > exonNuc/$j.exonNuc.fa.gz"
	echo "mafGene -geneBeds=$j.known.bed -exons $db $mz knownGene order.lst stdout | \
	    gzip -c > exonAA/$j.exonAA.fa.gz"
    done > $gp.$mz.jobs

    time sh -x $gp.$mz.jobs > $gp.$mz.job.log 2>&1 &
    sleep 1
    tail -f $gp.$mz.job.log

# real    166m1.220s
# user    13m35.246s
# sys     2m50.683s

    rm *.known.bed
    mz=multiz46way
    gp=knownCanonical
    db=hg19
    zcat exonAA/c*.gz | gzip -c > $gp.$mz.exonAA.fa.gz
    zcat exonNuc/c*.gz | gzip -c > $gp.$mz.exonNuc.fa.gz
    zcat ppredAA/c*.gz | gzip -c > $gp.$mz.ppredAA.fa.gz
    zcat ppredNuc/c*.gz | gzip -c > $gp.$mz.ppredNuc.fa.gz

    rm -rf exonAA exonNuc ppredAA ppredNuc

    mz=multiz46way
    gp=knownCanonical
    db=hg19
    pd=/usr/local/apache/htdocs-hgdownload/goldenPath/$db/$mz/alignments
    mkdir -p $pd
    ln -s `pwd`/$gp.$mz.exonAA.fa.gz $pd/$gp.exonAA.fa.gz
    ln -s `pwd`/$gp.$mz.exonNuc.fa.gz $pd/$gp.exonNuc.fa.gz
############################################################################
# UPDATE COSMIC TRACK (DONE 2011-10-11 Fan)

mkdir /hive/data/outside/cosmic/20111011
# put raw data file, UCSCMutExp_v55_090911.csv, received by email to there.

mkdir /hive/data/genomes/hg19/bed/cosmic/20111011
cd /hive/data/genomes/hg19/bed/cosmic/20111011

cp -p /hive/data/outside/cosmic/20111011/UCSCMutExp_v55_090911.csv .

cat UCSCMutExp_v55_090911.csv|sed -e 's/\t//g' |sed -e 's/,/\t/g' |\
grep -v COSMIC_MUTATION_ID |grep -v 'selected'|grep COSM >UCSCMutExp_v55_090911.tab

hgsql hg19 -e 'drop table cosmicRaw'
hgsql hg19 < ~/kent/src/hg/lib/cosmicRaw.sql

hgLoadSqlTab hg19 cosmicRaw ~/kent/src/hg/lib/cosmicRaw.sql UCSCMutExp_v55_090911.tab

# use  grch37_start-1 for our zero based chromStart and 
# conver their chr23 and chr24 to chrX and chrY.

hgsql hg19 -N -e 'select "chr", chromosome, grch37_start-1, grch37_stop, cosmic_mutation_id from cosmicRaw' \
|grep -v NULL |sed -e 's/chr\t/chr/'|sort -u|sed -e 's/chr23/chrX/' |sed -e 's/chr24/chrY/' >cosmic.bed

hgLoadBed -allowStartEqualEnd  hg19 cosmic cosmic.bed

#############################################################################
# LASTZ Gorilla GorGor3 (DONE - 2011-10-17 - Hiram)
    mkdir /hive/data/genomes/hg19/bed/lastzGorGor3.2011-10-17
    cd /hive/data/genomes/hg19/bed/lastzGorGor3.2011-10-17

    cat << '_EOF_' > DEF
# human vs gorilla
# maximum M allowed with lastz is only 254
BLASTZ_M=254
BLASTZ_Q=/scratch/data/blastz/human_chimp.v2.q
BLASTZ_O=600
BLASTZ_E=150
# other parameters on advice from Webb
BLASTZ_K=4500
BLASTZ_Y=15000
BLASTZ_T=2

# TARGET: Human hg19
SEQ1_DIR=/scratch/data/hg19/nib
SEQ1_LEN=/scratch/data/hg19/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000

# QUERY: Frog gorGor3
SEQ2_DIR=/scratch/data/gorGor3/gorGor3.2bit
SEQ2_LEN=/scratch/data/gorGor3/chrom.sizes
SEQ2_CHUNK=20000000
SEQ2_LAP=0
SEQ2_LIMIT=100

BASE=/hive/data/genomes/hg19/bed/lastzGorGor3.2011-10-17
TMPDIR=/scratch/tmp
'_EOF_'
    # << happy emacs

    #	establish a screen to control this job
    screen
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
	`pwd`/DEF \
	-syntenicNet \
	-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=medium \
	-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
	> do.log 2>&1 &
    #	real    159m46.598s
    cat fb.hg19.chainGorGor3Link.txt  
    #	2603997992 bases of 2897316137 (89.876%) in intersection

    cd /hive/data/genomes/hg19/bed
    ln -s lastzGorGor3.2011-10-17 lastz.gorGor3

    #	running the swap - DONE - 2011-09-21
    mkdir /hive/data/genomes/gorGor3/bed/blastz.hg19.swap
    cd /hive/data/genomes/gorGor3/bed/blastz.hg19.swap
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
	/hive/data/genomes/hg19/bed/lastzGorGor3.2011-10-17/DEF \
	-swap -syntenicNet \
	-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=medium \
	-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
	> swap.log 2>&1 &
    #	real    69m39.685s
    cat fb.gorGor3.chainHg19Link.txt 
    #	2571797450 bases of 2822760080 (91.109%) in intersection

############################################################################
# ISCA FROM DBVAR (DONE 5/21/12 angie)
# Updated 3/02/12 angie
    # Redmine: Track #34 (dbVar for human)
    set today = `date +%Y_%m_%d`
    mkdir /hive/data/genomes/hg19/bed/isca/$today
    cd /hive/data/genomes/hg19/bed/isca/$today
    # Get variants submitted on this assembly, and variants remapped from other assemblies.
    wget ftp://ftp.ncbi.nlm.nih.gov/pub/dbVar/data/Homo_sapiens/by_study/nstd37_ISCA/gvf/nstd37_ISCA.GRCh37.submitted.all.germline.ucsc.gvf.gz
    wget ftp://ftp.ncbi.nlm.nih.gov/pub/dbVar/data/Homo_sapiens/by_study/nstd37_ISCA/gvf/nstd37_ISCA.GRCh37.remap.all.germline.ucsc.gvf.gz
    # New 5/21/12: ISCA Curated
    wget ftp://ftp.ncbi.nlm.nih.gov/pub/dbVar/data/Homo_sapiens/by_study/nstd45_ISCA_curated_dataset/gvf/nstd45_ISCA_curated_dataset.GRCh37.remap.all.germline.ucsc.gvf.gz
    zcat nstd37_ISCA*.gvf.gz \
    | ~/kent/src/hg/utils/automation/gvfToBed8Attrs.pl \
      > isca.bed
    zcat nstd45_ISCA*.gvf.gz \
    | ~/kent/src/hg/utils/automation/gvfToBed8Attrs.pl \
      > iscaCurated.bed
    wc -l isca*.bed
#   12943 isca.bed
#      84 iscaCurated.bed
    # Split into subtracks by clinical_int value.
    zcat nstd37_ISCA*.gvf.gz \
    | grep ssv | sed -e 's/.*clinical_int=//; s/;.*//;' | sort | uniq -c
#   4307 Benign
#   4600 Pathogenic
#   3406 Uncertain significance
#    466 Uncertain significance: likely benign
#    164 Uncertain significance: likely pathogenic
    zcat nstd45_ISCA*.gvf.gz \
    | grep ssv | sed -e 's/.*clinical_int=//; s/;.*//;' | sort | uniq -c
#     29 Benign
#     55 Pathogenic
    foreach subtrack (Benign Pathogenic)
      grep -w  $subtrack isca.bed > isca$subtrack.bed
      hgLoadBed -tab -renameSqlTable -sqlTable=$HOME/kent/src/hg/lib/bed8Attrs.sql \
        -allowStartEqualEnd hg19 isca$subtrack isca$subtrack.bed
      grep -w  $subtrack iscaCurated.bed > iscaCurated$subtrack.bed
      hgLoadBed -tab -renameSqlTable -sqlTable=$HOME/kent/src/hg/lib/bed8Attrs.sql \
        -allowStartEqualEnd hg19 iscaCurated$subtrack iscaCurated$subtrack.bed
    end
#Read 4307 elements of size 11 from iscaBenign.bed
#Read 29 elements of size 11 from iscaCuratedBenign.bed
#Read 4600 elements of size 11 from iscaPathogenic.bed
#Read 55 elements of size 11 from iscaCuratedPathogenic.bed

    # The subcategories of Uncertain need a bit more sophisticated treatment:
    set subtrack = Uncertain
    grep -w $subtrack isca.bed \
    | grep -vi 'Uncertain Significance: likely' \
      > isca$subtrack.bed
    hgLoadBed -tab -renameSqlTable -sqlTable=$HOME/kent/src/hg/lib/bed8Attrs.sql \
      -allowStartEqualEnd hg19 isca$subtrack isca$subtrack.bed
#Read 3406 elements of size 11 from iscaUncertain.bed

    foreach unc (benign pathogenic)
      set subtrack = Likely`perl -we 'print ucfirst("'$unc'");'`
      grep -wi "Uncertain Significance: likely $unc" isca.bed \
        > isca$subtrack.bed
      hgLoadBed -tab -renameSqlTable -sqlTable=$HOME/kent/src/hg/lib/bed8Attrs.sql \
        -allowStartEqualEnd hg19 isca$subtrack isca$subtrack.bed
    end
#Read 466 elements of size 11 from iscaLikelyBenign.bed
#Read 164 elements of size 11 from iscaLikelyPathogenic.bed

## more for this track below v ##

############################################################################
# ISCA AGGREGATE PATHOGENIC TRACKS (DONE 2012-05-21 angie)
# First done 2012-02-08 by b0b; updated 2012-03-03 by b0b.

    # files of ISCA Pathogenic Gain and Loss were fetched in the previous section --
    # use same dir.
    set today = `date +%Y_%m_%d`
    cd /hive/data/genomes/hg19/bed/isca/$today

    # make bedGraphs
    hgsql -N -e "SELECT chrom, chromStart, chromEnd FROM iscaPathogenic \
        WHERE attrVals LIKE '%number_gain%'" hg19 | sort \
    | bedItemOverlapCount hg19 stdin > iscaPathGain.bedGraph

    hgsql -N -e "SELECT chrom, chromStart, chromEnd FROM iscaPathogenic \
        WHERE attrVals LIKE '%number_loss%'" hg19 | sort \
    | bedItemOverlapCount hg19 stdin > iscaPathLoss.bedGraph

    # load tables
    hgLoadBed -bedGraph=4 hg19 iscaPathGainCum iscaPathGain.bedGraph
#Read 2001 elements of size 4 from iscaPathGain.bedGraph

    hgLoadBed -bedGraph=4 hg19 iscaPathLossCum iscaPathLoss.bedGraph
#Read 3567 elements of size 4 from iscaPathLoss.bedGraph
 
    # End of track build instructions; historical notes follow.

  # trackDb (these values from original load, not update)
  # get 2 stdDev value to set default maxHeightPixels 
    # use average of: median + 2 SD - would mean be better?
    # use same viewLimit for both:  average the two
    # note that chr21 (Down's) is overrepresented in the dataset
    ave  hg19.iscaPathGain.bedGraph -col=4
#    median 9.000000
#    average 20.970921
#    max 105.000000
#    standard deviation 25.652712
    median + 2SD = 60 

    ave  hg19.iscaPathLoss.bedGraph -col=4
#    median 6.000000
#    average 15.998146
#    max 171.000000
#    standard deviation 23.526095
    median + 2SD = 53 
 
  # move some settings down to existing subtracks:  
       type gvf 
       noScoreFilter . 
  # add settings to parent track:
      type bed
      noInherit on
  # set these two new tracks:
      release alpha
      type bedGraph 4
      maxHeightPixels 100:57:16
      viewLimits 0:60 (halfway betw 2 SD and max)
      alwaysZero on
      color 0,0,200 (Gain)
      color 200,0,0 (Loss)
  #  new html page using override in trackDb/human/hg19/trackDb.ra


############################################################################
# LINCRNAS FROM BROAD (DONE 2011-10-10 Chin)
# Human lincRNA Catalog

    # unzip data from Board to /hive/data/outside/lincRnaFromCabili
    mkdir /hive/data/genomes/hg19/bed/lincRnaFromCabili
    cd /hive/data/genomes/hg19/bed/lincRnaFromCabili
    cp /hive/data/outside/lincRnaFromCabili/Cabili_etal_BodyMaplincRNAs.key.txt .
    cp /hive/data/outside/lincRnaFromCabili/Cabili_etal_description.doc .
    cp /hive/data/outside/lincRnaFromCabili/Cabili_etal_BodyMapLincRNAs.bed .
    cp /hive/data/outside/lincRnaFromCabili/lincRNAs_transcripts.gtf .

    # Load data for lincRNAsTranscripts track	
    cd /hive/data/genomes/hg19/bed/lincRnaFromCabili
    ldHgGene -gtf hg19 lincRNAsTranscripts lincRNAs_transcripts.gtf
    # Read 21630 transcripts in 67096 lines in 1 files
    #   21630 groups 43 seqs 7 sources 1 feature types
    # 21630 gene predictions

    cat << '_EOF_' > createExpDataByCellType.pl
#!/usr/bin/perl
# Create expData table form a microarray bed15 file
# for further analysis
# usage: ./createExpDataByCellType.pl <bed15File> 
use strict;
use warnings;
my $line;
my @bF;
my $i;
my $outFname;
my $name;
my $expCount;
my $expIds;
my $expScores;
my @expId;
my @expScore;
my $score;
my $tScore;
my $fScore;
my $log2 = log(2);
my $sLog2;
my $tLog2;
# Define the 22 cell type array
my @cellType = ("Adipose","Adrenal","Brain","Breast","Colon","Heart","Kidney",
              "Liver","Lung","LymphNode","Ovary","Prostate","SkeletalMuscle",
              "WhiteBloodCell","Testes","Thyroid","Testes_R","Brain_R",
              "Placenta_R","Foreskin_R","hLF_r2","hLF_r1");
#Read in the microarray (bed15) files
# Assume number of exp and score agreed
my $argc=scalar(@ARGV);
if ($argc < 1)
   {
      print "usage: ./createExpDataByCellType.pl <bed15File>";
    }
my $fName = $ARGV[0];
my $outFName;
# Loop thru each cell type
for($i = 0; $i < scalar(@cellType); $i++) {
  open(FHIN, $fName) or die "Can not open $fName";
  $outFName= "lincRNAsCT" . $cellType[$i] . "\.tab";
  open(FHOUT, ">$outFName") or die "Can not open $outFName";
  while ($line = <FHIN>) {
     chomp($line);
     @bF = split('\t', $line);
     printf(FHOUT "%s\t%s\t%s\t%s\t",
                  $bF[0],$bF[1],$bF[2],$bF[3]); 
     $bF[14] =~ s/,$//;
     $expScores = $bF[14];
     @expScore = split(",",$expScores);
     # Process the expRatio
     $tLog2 = log($expScore[$i] + 0.5)/$log2;
     $sLog2 = sprintf("%.3f",$tLog2);
     # scale sLog2 using 0 (-1) .. 1000 (4)
     if ($sLog2 <= 4.0) {
         $tScore = ($sLog2 + 1) * (1000/5);
         $fScore = sprintf("%3d",$tScore);
       } else {
         $tScore = 1000;
       }
     if ($tScore >= 1000) {
         $fScore = 1000;
       } else {
         $fScore = sprintf("%3d",$tScore);
       }
     printf(FHOUT "%s\t%s\t%s\n",$fScore, $expScore[$i],$sLog2);     
  } # end while
  close FHIN;
  close FHOUT;
} # for loop
'_EOF_'
    # << happy emacs
    chmod +x createExpDataByCellType.pl
    ./createExpDataByCellType.pl Cabili_etal_BodyMapLincRNAs.bed 

    cat << '_EOF_' > deleteCTTables.sh
#!/bin/sh
#
cellType=(Adipose Adrenal Brain Breast Colon Heart Kidney
              Liver Lung LymphNode Ovary Prostate SkeletalMuscle
              WhiteBloodCell Testes Thyroid Testes_R Brain_R
              Placenta_R Foreskin_R hLF_r2 hLF_r1)
for c in "${cellType[@]}"
do
    echo Processing lincRNAs$c
    hgsql  hg19 -e "DROP TABLE IF EXISTS  lincRNAsCT$c;"
done
'_EOF_'
    # << happy emacs
    chmod +x deleteCTTables.sh
    ./deleteCTTables.sh
   
    cat << '_EOF_' > lincRNAsCTTemp.sql
CREATE TABLE lincRNAsCTTemp (
    chrom varchar(255) not null,        # Human chromosome or FPC contig
    chromStart int unsigned not null,   # Start position in chromosome
    chromEnd int unsigned not null,     # End position in chromosome
    name varchar(255) not null,         # Name of item
    score int unsigned not null,        # Score from 0-1000
    rawScore float not null,            # Raw Signal Score
    log2RawScore float not null         # log2 of raw score
);
'_EOF_'
    # << happy emacs

    cat << '_EOF_' > loadLincRNAsAllCellType.sh
#!/bin/sh
#
cellType=(Adipose Adrenal Brain Breast Colon Heart Kidney
              Liver Lung LymphNode Ovary Prostate SkeletalMuscle
              WhiteBloodCell Testes Thyroid Testes_R Brain_R
              Placenta_R Foreskin_R hLF_r2 hLF_r1)
for c in "${cellType[@]}"
do
    echo Processing lincRNAsCT$c.tab
    hgLoadBed -tab -sqlTable=lincRNAsCTTemp.sql hg19 lincRNAsCTTemp lincRNAsCT$c.tab
    hgsql hg19 -e "RENAME TABLE lincRNAsCTTemp to lincRNAsCT$c"
done
'_EOF_'
    # << happy emacs
    chmod +x loadLincRNAsAllCellType.sh
    ./loadLincRNAsAllCellType.sh

#############################################################################
# LASTZ Gibbon NomLeu1 (DONE - 2011-11-04 - Chin)

    mkdir /hive/data/genomes/hg19/bed/lastzNomLeu1.2011-11-04
    cd /hive/data/genomes/hg19/bed/lastzNomLeu1.2011-11-04

    cat << '_EOF_' > DEF
# human vs gibbon
# maximum M allowed with lastz is only 254
BLASTZ_M=254
BLASTZ_Q=/scratch/data/blastz/human_chimp.v2.q
BLASTZ_O=600
BLASTZ_E=150
# other parameters on advice from Webb
BLASTZ_K=4500
BLASTZ_Y=15000
BLASTZ_T=2

# TARGET: Human hg19
SEQ1_DIR=/scratch/data/hg19/nib
SEQ1_LEN=/scratch/data/hg19/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000

# QUERY: Gibbon nomLeu1
SEQ2_DIR=/scratch/data/nomLeu1/nomLeu1.2bit
SEQ2_LEN=/scratch/data/nomLeu1/chrom.sizes
SEQ2_CHUNK=20000000
SEQ2_LAP=0
SEQ2_LIMIT=100

BASE=/hive/data/genomes/hg19/bed/lastzNomLeu1.2011-11-04
TMPDIR=/scratch/tmp
'_EOF_'
    # << happy emacs

    #   establish a screen to control this job
    screen
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
        `pwd`/DEF \
        -syntenicNet \
        -noLoadChainSplit -chainMinScore=5000 -chainLinearGap=medium \
        -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
        > do.log 2>&1 &
    #   real    724m15s
    cat fb.hg19.chainNomLeu1Link.txt
    #   2543943556 bases of 2897316137 (87.803%) in intersection

    cd /hive/data/genomes/hg19/bed
    ln -s lastzNomLeu1.2011-11-04 lastz.nomLeu1

    #   running the swap - DONE - 2011-11-08
    mkdir /hive/data/genomes/nomLeu1/bed/blastz.hg19.swap
    cd /hive/data/genomes/nomLeu1/bed/blastz.hg19.swap
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
        /hive/data/genomes/hg19/bed/lastzNomLeu1.2011-11-04/DEF \
        -swap -syntenicNet \
        -noLoadChainSplit -chainMinScore=5000 -chainLinearGap=medium \
        -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
        > swap.log 2>&1 &
    #   real     69m27s
    cat fb.nomLeu1.chainHg19Link.txt
    #   2480558770 bases of 2756591777 (89.986%) in intersection


#############################################################################
# DBSNP B135 / SNP135 (DONE 11/9/11)
# Redmine #5170
    mkdir -p /hive/data/outside/dbSNP/135/human
    cd /hive/data/outside/dbSNP/135/human
    # Look at the directory listing of ftp://ftp.ncbi.nih.gov/snp/database/organism_data/
    # to find the subdir name to use as orgDir below (human_9606 in this case).
    # Then click into that directory and look for file names like
    #    b(1[0-9][0-9])_*_([0-9]+_[0-9])
    # -- use the first num for build and the second num_num for buildAssembly.
    # jkStuff/liftContigs.lft maps NCBI contig names to chroms; use that for liftUp.
    #
    # Some trial and error was required to get the config.ra just right -- assembly
    # label now has ".p5" at end despite buildAssembly being 37_3, and more GRCh37
    # patch contigs needed to be filtered out:
    cat > config.ra <<EOF
db hg19
orgDir human_9606
build 135
buildAssembly 37_3
liftUp /hive/data/genomes/hg19/jkStuff/liftContigs.lft
refAssemblyLabel GRCh37.p5
ignoreDbSnpContigs NW_003(3159[0-9][0-9]|5710[3-6][0-9])
EOF
    ~/kent/src/hg/utils/automation/doDbSnp.pl config.ra >& do.log & tail -f do.log
    # Sent dbSNP some emails about conditions causing lots of SNPs in snp135Errors.bed.gz
    # due to inconsistent locType vs. coords.  2.3M SNPs have new exception SingleAlleleFreq,
    # and 474k SNPs have new exception InconsistentAlleles (allele freqs vs observed).


#############################################################################
# FILTER SNP135 (DONE 11/14/11 angie)
    # Redmine #5170
    # Make several tracks that are filtered subsets of snp135:
    # First, filter out the multiply-aligned and/or weight >1 SNPs -> snp135Mult
    # Second, siphon off the common variants -> snp135Common
    # Third, take the (uniquely mapped, not known to be common) variants
    # w/dbSNP's "clinically-assoc" flag -> snp135Flagged
    cd /hive/data/outside/dbSNP/135/human
    zcat snp135.bed.gz \
    | perl -we \
      '$minTotal2N = 10; \
       ($multCount, $comCount, $flagCount, $miscCount) = (0,0,0,0); \
       open($mult, "| gzip -c > snp135Mult.bed.gz") || die; \
       open($common,    "| gzip -c > snp135Common.bed.gz") || die; \
       open($flagged,   "| gzip -c > snp135Flagged.bed.gz") || die; \
       open($misc,      "| gzip -c > snp135Misc.bed.gz") || die; \
       while (<>) { \
         @w = split("\t"); \
         if ($w[16] > 1 || $w[17] =~ /MultipleAlignments/) { \
           print $mult $_; \
           $multCount++; \
         } else { \
           my ($alleleFreqCount, $nStr, $freqStr) = ($w[20], $w[22], $w[23]); \
           my @alNs = split(",", $nStr);      die unless scalar(@alNs) == $alleleFreqCount; \
           my @freqs = split(",", $freqStr);  die unless scalar(@freqs) == $alleleFreqCount; \
           my ($total2N, $maxAlleleFreq) = (0, 0); \
           for (my $i = 0;  $i < $alleleFreqCount;  $i++) { \
             $total2N += $alNs[$i]; \
             $maxAlleleFreq = $freqs[$i] if ($freqs[$i] > $maxAlleleFreq); \
           } \
           if ($alleleFreqCount >= 2 && $total2N >= $minTotal2N && $maxAlleleFreq <= 0.99) { \
             print $common $_; \
             $comCount++; \
           } elsif($w[24] =~ /clinically-assoc/)  { \
             print $flagged $_; \
             $flagCount++; \
           } else { \
             print $misc $_; \
             $miscCount++; \
           } \
         } \
       } \
       close($mult);  close($common); close($flagged);  close($misc); \
       print "snp135Mult:    $multCount\nsnp135Common:  $comCount\nsnp135Flagged: $flagCount\n" . \
             "leftover:      $miscCount\n";'
#snp135Mult:    3538479
#snp135Common:  11525489
#snp135Flagged: 32077
#leftover:      39116035

    # Load tables
    foreach subset (Mult Common Flagged)
      hgLoadBed -tab -onServer -tmpDir=/data/tmp -allowStartEqualEnd -renameSqlTable \
        hg19 snp135$subset -sqlTable=snp135.sql snp135$subset.bed.gz
    end


#############################################################################
# SNP135 ORTHOLOGOUS ALLELES IN CHIMP, ORANG, MACAQUE (DONE 11/14/11 angie)
    mkdir /hive/data/genomes/hg19/bed/snp135Ortho
    cd /hive/data/genomes/hg19/bed/snp135Ortho
    # Filter snp135 to to keep only uniquely mapped biallelic SNVs (class=single, length=1);
    zcat /hive/data/outside/dbSNP/135/human/snp135.bed.gz \
    | awk '$18 ~ /^MultipleAlignments|SingleClassTriAllelic|SingleClassQuadAllelic/ {print $4;}' \
    | sort -u \
      > snp135ExcludeIds.txt
    wc -l snp135ExcludeIds.txt
#1297409 snp135ExcludeIds.txt
    zcat /hive/data/outside/dbSNP/135/human/snp135.bed.gz \
    | awk '$3-$2 == 1 && $11 == "single" {print;}' \
    | grep -vFwf snp135ExcludeIds.txt \
#NOTE FOR NEXT TIME: pipe output straight to awk command below... don't need this 7G intermediate:
      > snp135Simple.bed
    wc -l snp135Simple.bed
#44228667 snp135Simple.bed

    # Glom all human info that we need for the final table onto the
    # name, to sneak it through liftOver: rsId|chr|start|end|obs|ref|strand
    awk 'BEGIN{OFS="\t";} \
        {print $1, $2, $3, \
               $4 "|" $1 "|" $2 "|" $3 "|" $9 "|" $8 "|" $6, \
               0, $6;}' \
      snp135Simple.bed > snp135ForLiftOver.bed

    # Map coords to chimp using liftOver.
    mkdir run.liftOChimp
    cd run.liftOChimp
    mkdir split out
    splitFile ../snp135ForLiftOver.bed 10000 split/chunk
    cp /dev/null jobList
    foreach f (split/chunk*)
      echo liftOver $f \
        /hive/data/genomes/hg19/bed/liftOver/hg19ToPanTro3.over.chain.gz \
        \{check out exists out/panTro3.$f:t.bed\} out/hg19.$f:t.unmapped \
        >> jobList
    end
    ssh swarm
    cd /hive/data/genomes/hg19/bed/snp135Ortho/run.liftOChimp
    para make jobList
#Completed: 4423 of 4423 jobs
#CPU time in finished jobs:     430555s    7175.92m   119.60h    4.98d  0.014 y
#IO & Wait Time:                 45877s     764.61m    12.74h    0.53d  0.001 y
#Average job time:                 108s       1.80m     0.03h    0.00d
#Longest finished job:             262s       4.37m     0.07h    0.00d
#Submission to last job:           542s       9.03m     0.15h    0.01d

    # Map coords to orangutan using liftOver.
    mkdir ../run.liftOPon
    cd ../run.liftOPon
    mkdir out
    ln -s ../run.liftOChimp/split .
    cp /dev/null jobList
    foreach f (split/chunk*)
      echo liftOver $f \
        /hive/data/genomes/hg19/bed/liftOver/hg19ToPonAbe2.over.chain.gz \
        \{check out exists out/ponAbe2.$f:t.bed\} out/hg19.$f:t.unmapped \
        >> jobList
    end
    para make jobList
#Completed: 4423 of 4423 jobs
#CPU time in finished jobs:     591884s    9864.74m   164.41h    6.85d  0.019 y
#IO & Wait Time:                 55485s     924.74m    15.41h    0.64d  0.002 y
#Average job time:                 146s       2.44m     0.04h    0.00d
#Longest finished job:             380s       6.33m     0.11h    0.00d
#Submission to last job:          1403s      23.38m     0.39h    0.02d

    # Map coords to macaque using liftOver.
    mkdir ../run.liftOMac
    cd ../run.liftOMac
    mkdir out
    ln -s ../run.liftOChimp/split .
    cp /dev/null jobList
    foreach f (split/chunk*)
      echo liftOver $f \
        /hive/data/genomes/hg19/bed/liftOver/hg19ToRheMac2.over.chain.gz \
        \{check out exists out/rheMac2.$f:t.bed\} out/hg19.$f:t.unmapped \
        >> jobList
    end
    para make jobList
#Completed: 4423 of 4423 jobs
#CPU time in finished jobs:    1097552s   18292.53m   304.88h   12.70d  0.035 y
#IO & Wait Time:                 91301s    1521.69m    25.36h    1.06d  0.003 y
#Average job time:                 269s       4.48m     0.07h    0.00d
#Longest finished job:             697s      11.62m     0.19h    0.01d
#Submission to last job:          1555s      25.92m     0.43h    0.02d

    cd /hive/data/genomes/hg19/bed/snp135Ortho
    # Concatenate the chimp results, sorting by chimp pos in order to
    # efficiently access 2bit sequence in getOrthoSeq.  The output of
    # that is then sorted by the glommed human info field, so that we
    # can use join to combine chimp and macaque results in the next step.
    # Ditto for macaque and orangutan.  Each command pipe takes ~6 minutes:
    sort -k1,1 -k2n,2n run.liftOChimp/out/panTro3.chunk*.bed \
    | ~/kent/src/hg/snp/snpLoad/getOrthoSeq.pl /cluster/data/panTro3/panTro3.2bit \
    | sort > panTro3.orthoGlom.txt
    sort -k1,1 -k2n,2n run.liftOPon/out/ponAbe2.chunk*.bed \
    | ~/kent/src/hg/snp/snpLoad/getOrthoSeq.pl /cluster/data/ponAbe2/ponAbe2.2bit \
    | sort > ponAbe2.orthoGlom.txt
    sort -k1,1 -k2n,2n run.liftOMac/out/rheMac2.chunk*.bed \
    | ~/kent/src/hg/snp/snpLoad/getOrthoSeq.pl /cluster/data/rheMac2/rheMac2.2bit \
    | sort > rheMac2.orthoGlom.txt
    wc -l panTro3.orthoGlom.txt ponAbe2.orthoGlom.txt rheMac2.orthoGlom.txt
#41804363 panTro3.orthoGlom.txt
#39856046 ponAbe2.orthoGlom.txt
#35918623 rheMac2.orthoGlom.txt

    # Use the glommed name field as a key to join up chimp and macaque
    # allele data.  Include glommed name from both files because if only
    # file 2 has a line for the key in 2.1, then 1.1 is empty.  Then plop
    # in the orthoGlom fields from each file, which are in the same order
    # as the chimp and macaque columns of snp135OrthoPanTro2RheMac2.
    join -o '1.1 2.1 1.2 1.3 1.4 1.5 1.6 2.2 2.3 2.4 2.5 2.6' \
      -a 1 -a 2 -e '?' \
      panTro3.orthoGlom.txt ponAbe2.orthoGlom.txt \
    | awk '{if ($1 != "?") { print $1, $3,$4,$5,$6,$7,$8,$9,$10,$11,$12; } \
            else           { print $2, $3,$4,$5,$6,$7,$8,$9,$10,$11,$12; }}' \
      > tmp.txt
    join -o '1.1 2.1 1.2 1.3 1.4 1.5 1.6 1.7 1.8 1.9 1.10 1.11 2.2 2.3 2.4 2.5 2.6' \
      -a 1 -a 2 -e '?' \
      tmp.txt rheMac2.orthoGlom.txt \
    | perl -wpe 'chomp; \
        ($glom12, $glom3, $o1Chr, $o1Start, $o1End, $o1Al, $o1Strand, \
         $o2Chr, $o2Start, $o2End, $o2Al, $o2Strand, \
         $o3Chr, $o3Start, $o3End, $o3Al, $o3Strand) = split; \
        $glomKey = ($glom12 ne "?") ? $glom12 : $glom3; \
        ($rsId, $hChr, $hStart, $hEnd, $hObs, $hAl, $hStrand) = \
          split(/\|/, $glomKey); \
        $o1Start =~ s/^\?$/0/;  $o2Start =~ s/^\?$/0/;  $o3Start =~ s/^\?$/0/; \
        $o1End   =~ s/^\?$/0/;  $o2End   =~ s/^\?$/0/;  $o3End   =~ s/^\?$/0/; \
        print join("\t", $hChr, $hStart, $hEnd, $rsId, $hObs, $hAl, $hStrand, \
                         $o1Chr, $o1Start, $o1End, $o1Al, $o1Strand, \
                         $o2Chr, $o2Start, $o2End, $o2Al, $o2Strand, \
                         $o3Chr, $o3Start, $o3End, $o3Al, $o3Strand) . "\n"; \
        s/^.*$//;' \
    | sort -k1,1 -k2n,2n > snp135OrthoPt3Pa2Rm2.bed
    hgLoadBed -tab -onServer -tmpDir=/data/tmp -renameSqlTable \
      -sqlTable=$HOME/kent/src/hg/lib/snpOrthoPanPonRhe.sql \
      hg19 snp135OrthoPt3Pa2Rm2 snp135OrthoPt3Pa2Rm2.bed
#Loaded 43184090 elements of size 22
    # Cleanup:
    rm -r run*/split tmp.txt *.orthoGlom.txt snp135Simple.bed 
    gzip snp135ExcludeIds.txt snp135ForLiftOver.bed &


############################################################################
# DBSNP CODING ANNOTATIONS (135) (DONE 11/14/11 angie)
# It wasn't necessary to redo this following the 9/1 re-run of doDbSnp.pl because
# that simply picked up new allele frequency info, no change to exceptions etc.
    cd /hive/data/outside/dbSNP/135/human
    # ncbiFuncAnnotations.txt has NCBI coords: 0-based, fully closed.
    # For anything except an insertion (0 bases between flanks),
    # we need to add 1 to the end coord.  For an insertion, we need
    # to add 1 to the start coord.  Make a hash of the insertion IDs,
    # then look up each ID in ncbiFuncAnnotations.txt to tell which
    # transform to apply.
    # Note: sort -u with the keys below is too restrictive -- we need full line uniq.
    zcat ncbiFuncAnnotations.txt.gz \
    | perl -we 'open($IDS, "zcat ncbiFuncInsertions.ctg.bed.gz |") || die "ids: $!"; \
              while (<$IDS>) { chomp; $ids{$_} = 1; } \
              close($IDS); \
              %coding = (2=>1, 3=>1, 4=>1, 8=>1, 9=>1, 41=>1, 42=>1, 44=>1, 45=>1); \
              while (<>) { \
                chomp;  @w = split("\t"); # id, ctg, start, end, ... \
                next unless $coding{$w[5]}; \
                $bed4 = join("\t", $w[1], $w[2], $w[3], $w[0]); \
                if (exists $ids{$bed4} && $w[3] == $w[2]+1) { \
                  $w[2]++; # 2-base insertions: increment start coord \
                } else { \
                  $w[3]++; # increment end coord to get half-open \
                } \
                print join("\t", @w) . "\n"; \
              }' \
    | sort -k1n,1n -k2,2 -k3n,3n -k5,5 -k6n,6n \
    | uniq \
      > ncbiCodingAnnotations.txt
    wc -l ncbiCodingAnnotations.txt
#2803490 ncbiCodingAnnotations.txt
    # How many & what kinds of function types?
    cut -f 6 ncbiCodingAnnotations.txt \
    | sort -n | uniq -c
# 512390 3   (coding-synon)
#1385793 8   (cds-reference -- ignored)
#  23909 41  (nonsense)
# 827675 42  (missense)
#  53703 44  (frameshift)
#     20 45  (cds-indel)
    # Gather up multiple annotation lines into one line per {snp, gene, frame}:
    perl -e  'while (<>) { chomp; \
                my ($rsId, $ctg, $s, $e, $txId, $fxn, $frm, $nt, $aa, $codon) = split("\t"); \
                if (defined $lastRs && \
                    ($lastRs != $rsId || $lastCtg ne $ctg || $lastS != $s || \
                     $lastTx ne $txId || $lastFrm ne $frm)) { \
                  if (defined $refRow) { \
                    $fxns = "$refRow->[0],$fxns";  $nts = "$refRow->[1],$nts"; \
                    $aas = "$refRow->[2],$aas";    $codons = "$refRow->[3],$codons"; \
                  } \
                  print "$lastCtg\t$lastS\t$lastE\trs$lastRs\t$lastTx\t$lastFrm\t" . \
                        "$count\t$fxns\t$nts\t$codons\t$aas\n"; \
                  $refRow = undef;  @rows = ();  ($count, $fxns, $nts, $codons, $aas) = (); \
                } \
                ($lastRs, $lastCtg, $lastS, $lastE, $lastTx, $lastFrm) = \
                    ($rsId, $ctg, $s, $e, $txId, $frm); \
                $count++; \
                if ($fxn == 8) { \
                  $refRow = [$fxn, $nt, $aa, $codon]; \
                } else { \
                 $fxns .= "$fxn,";  $nts .= "$nt,";  $aas .= "$aa,";  $codons .= "$codon,"; \
                } \
              } \
              if (defined $refRow) { \
                $fxns = "$refRow->[0],$fxns";  $nts = "$refRow->[1],$nts"; \
                $aas = "$refRow->[2],$aas";    $codons = "$refRow->[3],$codons"; \
              } \
              print "$lastCtg\t$lastS\t$lastE\trs$lastRs\t$lastTx\t$lastFrm\t" . \
                    "$count\t$fxns\t$nts\t$codons\t$aas\n";' \
      ncbiCodingAnnotations.txt \
    | liftUp snp135CodingDbSnp.bed /hive/data/genomes/hg19/jkStuff/liftContigs.lft warn stdin
    hgLoadBed hg19 snp135CodingDbSnp -sqlTable=$HOME/kent/src/hg/lib/snp125Coding.sql \
      -renameSqlTable -tab -notItemRgb -allowStartEqualEnd \
      snp135CodingDbSnp.bed
#Loaded 1385812 elements of size 11


############################################################################
# SNPMASKED SEQUENCE FOR SNP135 (DONE 11/14/11 angie)
    mkdir /hive/data/genomes/hg19/snp135Mask
    cd /hive/data/genomes/hg19/snp135Mask
    # Identify rsIds with various problems -- we will exclude those.
    zcat /hive/data/outside/dbSNP/135/human/snp135.bed.gz \
    | awk '$18 ~ /MultipleAlignments|ObservedTooLong|ObservedWrongFormat|ObservedMismatch|MixedObserved/ {print $4;}' \
      | sort -u \
      > snp135ExcludeRsIds.txt
    zcat /hive/data/outside/dbSNP/135/human/snp135.bed.gz \
    | grep -vFwf snp135ExcludeRsIds.txt \
      > snp135Cleaned.bed
    wc -l snp135Cleaned.bed
#49922101 snp135Cleaned.bed

    # Substitutions:
    mkdir substitutions
    snpMaskSingle snp135Cleaned.bed /hive/data/genomes/hg19/hg19.2bit stdout diffObserved.txt \
    | faSplit byname stdin substitutions/
#Masked 44283699 snps in 44281659 out of 3131050506 genomic bases
#/hive/data/genomes/hg19/hg19.2bit has 3137161264 total bases, but the total number of bases in sequences for which we masked snps is 3131050506 (difference is 6110758)
    # warnings about differing observed strings at same base position:
    wc -l diffObserved.txt
#3661 diffObserved.txt
#TODO: send list to dbSNP.
    # Check that 6110758 is the total #bases in sequences with nothing in snp135Cleaned:
    grep -Fw single snp135Cleaned.bed | cut -f 1 | uniq > /data/tmp/1
    grep -vwf /data/tmp/1 ../chrom.sizes \
    | awk 'BEGIN {TOTAL = 0;}  {TOTAL += $2;}  END {printf "%d\n", TOTAL;}'
#6110758
    # Make sure that sizes are identical, first diffs are normal -> IUPAC,
    # and first diffs' case is preserved:
    foreach f (substitutions/chr*.fa)
      faCmp -softMask $f ../[1-9UMXY]*/$f:t |& grep -v "that differ"
    end
#chr1 in substitutions/chr1.fa differs from chr1 at ../1/chr1.fa at base 10233 (y != c)
#chr10 in substitutions/chr10.fa differs from chr10 at ../10/chr10.fa at base 60522 (K != T)
#...
#(output OK -- ambiguous bases replacing [agct] at SNP positions)
    foreach f (substitutions/chr*.fa)
      echo $f:t:r
      mv $f $f:r.subst.fa
      gzip $f:r.subst.fa &
    end

    # Insertions & deletions not done.  To date we have only offered substs for download.
    # If there is user demand, use template from snp131 above.

    # Clean up and prepare for download:
    gzip snp135Cleaned.bed &
    foreach d (substitutions)
      pushd $d
        md5sum *.gz > md5sum.txt
        cp /hive/data/genomes/hg19/snp132Mask/$d/README.txt .
      popd
    end
    # Edit the README.txt.

    # Create download links on hgwdev.
    mkdir /usr/local/apache/htdocs-hgdownload/goldenPath/hg19/snp135Mask
    ln -s /hive/data/genomes/hg19/snp135Mask/substitutions/* \
      /usr/local/apache/htdocs-hgdownload/goldenPath/hg19/snp135Mask/


#############################################################################
# SIB Transcriptome (DONE 2011-12-02 Chin)

    # Create working directory and download data from where Christian
    # Iseli (Christian.Iseli at licr.org) put it, and unpack. 
    mkdir -p /hive/data/outside/lirc
    cd /hive/data/outside/lirc
    wget --timestamping ftp://ftp.licr.org/pub/hg19/HTr.gtf.gz 
    wget --timestamping ftp://ftp.licr.org/pub/hg19/txg.tar.gz 
 
    cd /hive/data/genomes/hg19/bed/
    mkdir sibTranscriptome
    cd sibTranscriptome
    tar -zxvf /hive/data/outside/lirc/txg.tar.gz
    cp /hive/data/outside/lirc/HTr.gtf.gz .

    zcat HTr.gtf.gz | ldHgGene hg19 sibGene stdin
    # Reading stdin
    # Read 195300 transcripts in 2564421 lines in 1 files
    # 195300 groups 25 seqs 1 sources 2 feature types
    # 195300 gene predictions

    # Do a little data cleanup and transformation and load splice graphs
    # into database.
    sed 's/altGraphX/sibTxGraph/' ~/kent/src/hg/lib/altGraphX.sql > sibTxGraph.sql
    cat txg/*.txg | txgToAgx stdin stdout | hgLoadBed -notItemRgb \
      -sqlTable=sibTxGraph.sql hg19 sibTxGraph stdin
    # Reading stdin
    # Loaded 46973 elements of size 18
    # Sorted
    # Creating table definition for sibTxGraph
    # Saving bed.tab
    # Loading hg19

    # Create sibAltEvents track for analysed alt-splices.
    # Not on RR for hg18 and hg19, so do not push it out
   cat txg/*.txg | txgAnalyze stdin /cluster/data/hg19/hg19.2bit sibAltEvents.bed
   awk '$2 >= 0' sibAltEvents.bed | sort | uniq > foo.bed
   hgLoadBed hg19 sibAltEvents foo.bed
    # Reading foo.bed
    # Loaded 431590 elements of size 6
    # Sorted
    # Creating table definition for sibAltEvents
    # Saving bed.tab
    # Loading hg19


    # push sibGene and sibTxGraph for hg19


############################################################################
# HGNC: Hugo Gene Nomenclature Committee (DONE 2012-05-20 cline)

mkdir /hive/data/outside/hgnc
cd /hive/data/outside/hgnc
mkdir 052012
cd 052012
wget -O hgnc.txt "http://www.genenames.org/cgi-bin/hgnc_downloads.cgi?title=HGNC+output+data&hgnc_dbtag=on&preset=all&status=Approved&status=Entry+Withdrawn&status_opt=2&level=pri&=on&where=&order_by=gd_app_sym_sort&limit=&format=text&submit=submit&.cgifields=&.cgifields=level&.cgifields=chr&.cgifields=status&.cgifields=hgnc_dbtag"

tail -n +2 hgnc.txt |grep -v withdrawn \
 | hgLoadSqlTab hg19 hgnc ~/kent/src/hg/lib/hgnc.sql stdin       
#########################################################################
# LASTZ Cow BosTau7 (DONE - 2012-01-23 - Chin)
    mkdir /hive/data/genomes/hg19/bed/lastzBosTau7.2012-01-23
    cd /hive/data/genomes/hg19/bed/lastzBosTau7.2012-01-23

    cat << '_EOF_' > DEF
# human vs cow
# maximum M allowed with lastz is only 254
BLASTZ_M=254

# TARGET: Human hg19
SEQ1_DIR=/scratch/data/hg19/nib
SEQ1_LEN=/scratch/data/hg19/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000

# QUERY: Cow bosTau7 
SEQ2_DIR=/scratch/data/bosTau7/bosTau7.2bit
SEQ2_LEN=/scratch/data/bosTau7/chrom.sizes
SEQ2_CHUNK=10000000
SEQ2_LAP=0    

    
BASE=/hive/data/genomes/hg19/bed/lastzBosTau7.2012-01-23
TMPDIR=/scratch/tmp
'_EOF_'
    # << happy emacs
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
        `pwd`/DEF \
        -syntenicNet \
        -noLoadChainSplit \
        -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
        -chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 &
    # real    433m45.814s
    cat fb.hg19.chainBosTau7Link.txt
    # 1360887008 bases of 2897316137 (46.971%) in intersection
    # Create link
    cd /hive/data/genomes/hg19/bed
    ln -s  lastzBosTau7.2012-01-23 lastz.bosTau7

    #   running the swap
    mkdir /hive/data/genomes/bosTau7/bed/blastz.hg19.swap
    cd /hive/data/genomes/bosTau7/bed/blastz.hg19.swap
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
        /hive/data/genomes/hg19/bed/lastzBosTau7.2012-01-23/DEF \
        -swap  -syntenicNet \
        -noLoadChainSplit \
        -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
        -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 &
    #   real     95m9.611s
    cat fb.bosTau7.chainHg19Link.txt
    #   1388551419 bases of 2804673174 (49.508%) in intersection
    cd /hive/data/genomes/bosTau7/bed
    ln -s blastz.hg19.swap lastz.hg19
############################################################################
# UPDATE COSMIC TRACK - v57 (DONE 2012-01-26 larrym)

# Table stats before

hgsql hg19 -s -e 'select count(*) from cosmicRaw'
55579
hgsql hg19 -s -e 'select count(*) from cosmic'
49087

~/kent/src/hg/utils/automation/loadCosmic.pl hg19 ftp://ftp.sanger.ac.uk/pub/CGP/cosmic/data_export/Request_based_exports/UCSCMutExp_v57_180112.csv

# Table stats after

hgsql hg19 -s -e 'select count(*) from cosmicRaw'
78152
hgsql hg19 -s -e 'select count(*) from cosmic'
71437

Here's some ID's that were added in this release (and not present on RR):

COSM143850
COSM143824
COSM143823
COSM143802
COSM143801

############################################################################
# POLYA-SEQ TRACK (from Adnan Derti, Merck) (LOADED, Andy 2012-01-30)

# Fan unpacked the .zip a while back
cd /hive/data/genomes/hg19/bed/polyA
# make links with more UCSCish naming
mkdir /hive/data/genomes/{hg18,hg19,canFam2,mm9,rn4,rheMac2}/bed/polyASeq
mkdir -p /gbdb/{hg18,hg19,canFam2,mm9,rn4,rheMac2}/bbi
for set in `ls -1 | grep -v orig | grep -v table`; do
  suff=`echo $set | sed 's/^[a-z]\+_//'`; 
  db=`echo $suff | sed 's/\_.*//'`;
  tiss=`echo $suff | sed 's/.\+\_//; s/^[a-z]/\U&/; s/-[a-z]/\U&/; s/-//'`;
  Db=`echo $db | sed 's/^[a-z]/\U&/'`; 
  printf "%s\t%s\t%s\t%s\t%s\n" $set $db $Db $tiss $suff
done > table.info
for set in `ls -1 | grep -v orig | grep -v table`; do
  suff=`echo $set | sed 's/^[a-z]\+_//'`; 
  db=`echo $suff | sed 's/\_.*//'`;
  tiss=`echo $suff | sed 's/.\+\_//; s/^[a-z]/\U&/; s/-[a-z]/\U&/; s/-//'`;
  Db=`echo $db | sed 's/^[a-z]/\U&/'`; 
  fwdTable=polyASeqSites${tiss}Fwd
  revTable=polyASeqSites${tiss}Rev
  fwdBg=/hive/data/genomes/${db}/bed/polyASeq/${fwdTable}.bedGraph
  revBg=/hive/data/genomes/${db}/bed/polyASeq/${revTable}.bedGraph
  fwdBw=${fwdBg%.bedGraph}.bw
  revBw=${revBg%.bedGraph}.bw
  tail -n +2 ${set}/polyaseq_sites_fwd_strand.bedgraph | sort -k1,1 -k2,2n > $fwdBg;
  tail -n +2 ${set}/polyaseq_sites_rev_strand.bedgraph | sort -k1,1 -k2,2n > $revBg;
  bedGraphToBigWig $fwdBg /hive/data/genomes/${db}/chrom.sizes $fwdBw
  bedGraphToBigWig $revBg /hive/data/genomes/${db}/chrom.sizes $revBw
  ln -s $fwdBw /gbdb/${db}/bbi/
  ln -s $revBw /gbdb/${db}/bbi/
  hgBbiDbLink $db $fwdTable /gbdb/${db}/bbi/${fwdTable}.bw
  hgBbiDbLink $db $revTable /gbdb/${db}/bbi/${revTable}.bw
done
# silly loop to take care of the majority of the trackDb.
# the rest copy/paste
cat table.info | while read -a line; do 
  db=${line[1]}
  for Strand in Fwd Rev; do 
     strand=`echo $Strand | tr [:upper:] [:lower:]`
     bg=${line[0]}/polyaseq_sites_${strand}_strand.bedgraph
     tiss=${line[3]}
     table=polyASeqSites${tiss}${Strand}
     bw=/gbdb/${db}/bbi/${table}.bw
     min=`bigWigInfo $bw | grep "^min" | sed 's/min: //'`
     max=`bigWigInfo $bw | grep "^max" | sed 's/max: //'`
     echo "        track "$table
     echo "        parent polyASeqSitesSignalView"
     echo "        subGroups view=Signal tissType="$tiss" strand="$strand
     echo "        shortLabel PolyA-Seq "$tiss
     echo "        longLabel Poly(A)-tail sequencing of "$tiss" from Merck ("$Strand" strand)"
     if [ $strand = "fwd" ]; then
     echo "        color 153,51,51"
     else
     echo "        color 0,0,0"
     fi 
     echo "        type bigWig "$min" "$max
     echo
  done >> ${db}.ra 
done

##############################################################################
# LASTZ MOUSE Mm10 (DONE - 2012-03-08 - Hiram)
    mkdir /hive/data/genomes/hg19/bed/lastzMm10.2012-03-07
    cd /hive/data/genomes/hg19/bed/lastzMm10.2012-03-07

    cat << '_EOF_' > DEF
# human vs mouse
BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz
BLASTZ_ABRIDGE_REPEATS=1

# TARGET: Human Hg19
SEQ1_DIR=/scratch/data/hg19/nib
SEQ1_SMSK=/scratch/data/hg19/lineageSpecificRepeats
SEQ1_LEN=/scratch/data/hg19/chrom.sizes
SEQ1_CHUNK=20000000
SEQ1_LAP=10000

# QUERY: Mouse Mm9
SEQ2_DIR=/scratch/data/mm10/nib
SEQ2_SMSK=/scratch/data/mm10/notInOthers
SEQ2_LEN=/scratch/data/mm10/chrom.sizes
SEQ2_CHUNK=10000000
SEQ2_LAP=0

BASE=/hive/data/genomes/hg19/bed/lastzMm10.2012-03-07
TMPDIR=/scratch/tmp
'_EOF_'
    # << happy emacs

    #	establish a screen to control this job
    screen
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
        `pwd`/DEF \
        -noLoadChainSplit -syntenicNet \
        -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
        -chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 &
    #	real    197m23.436s
    cat fb.hg19.chainMm10Link.txt
    #	1021265143 bases of 2897316137 (35.249%) in intersection
    # set sym link to indicate this is the lastz for this genome:
    cd /hive/data/genomes/hg19/bed
    ln -s lastzMm10.2012-03-07 lastz.mm10

    #	and the swap
    mkdir /hive/data/genomes/mm10/bed/blastz.hg19.swap
    cd /hive/data/genomes/mm10/bed/blastz.hg19.swap
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
	/hive/data/genomes/hg19/bed/lastzMm10.2012-03-07/DEF \
	-swap -noLoadChainSplit -syntenicNet \
	-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
	-chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 &
    #	real    72m32.794s
    cat fb.mm10.chainHg19Link.txt
    #	1014045890 bases of 2652783500 (38.226%) in intersection
    # set sym link to indicate this is the lastz for this genome:
    cd /hive/data/genomes/mm10/bed
    ln -s blastz.hg19.swap lastz.hg19

#########################################################################
# LIFT ENCODE REGIONS FROM HG19 (DONE, Andy)

echo "select * from encodeRegions" | hgsql hg18 | tail -n +2 \
  | liftOver /dev/stdin /gbdb/hg18/liftOver/hg18ToHg19.over.chain.gz /dev/stdout encodeRegions.unmapped \
  | hgLoadBed -noBin hg19 encodeRegions /dev/stdin
# (all mapped cleanly)

#########################################################################
## WINDOWMASKER (DONE - 2012-04-19 - Hiram)
    mkdir /hive/data/genomes/hg19/bed/windowMasker
    cd /hive/data/genomes/hg19/bed/windowMasker
    time nice -n +19 doWindowMasker.pl -buildDir=`pwd` -workhorse=hgwdev \
	-dbHost=hgwdev hg19 > do.log 2>&1 &
    #   real    225m45.489s

    # Masking statistics
    twoBitToFa hg19.wmsk.2bit stdout | faSize stdin
    #   3137161264 bases (239850802 N's 2897310462 real 1828487268 upper
    #   1068823194 lower) in 93 sequences in 1 files
    #   Total size: mean 33732916.8 sd 63483709.4
    #   min 4262 (chr18_gl000207_random) max 249250621 (chr1) median 172294
    #   %34.07 masked total, %36.89 masked real

    twoBitToFa hg19.wmsk.sdust.2bit stdout | faSize stdin
    #   3137161264 bases (239850802 N's 2897310462 real 1811306328 upper
    #   1086004134 lower) in 93 sequences in 1 files
    #   Total size: mean 33732916.8 sd 63483709.4
    #   min 4262 (chr18_gl000207_random) max 249250621 (chr1) median 172294
    #   %34.62 masked total, %37.48 masked real

    hgLoadBed hg19 windowmaskerSdust windowmasker.sdust.bed.gz
    #   Read 16318719 elements of size 3 from windowmasker.sdust.bed.gz

    featureBits -countGaps hg19 windowmaskerSdust
    #	1325854876 bases of 3137161264 (42.263%) in intersection

    #	eliminate the gaps from the masking
    featureBits hg19 -not gap -bed=notGap.bed
    #	2897316137 bases of 2897316137 (100.000%) in intersection
    time nice -n +19 featureBits hg19 windowmaskerSdust notGap.bed \
        -bed=stdout | gzip -c > cleanWMask.bed.gz
    #   1086009749 bases of 2897316137 (37.483%) in intersection
    #   real    2m11.261s

    #	reload track to get it clean
    hgLoadBed hg19 windowmaskerSdust cleanWMask.bed.gz
    #	Read 16318560 elements of size 4 from cleanWMask.bed.gz
    time featureBits -countGaps hg19 windowmaskerSdust
    #   1086009749 bases of 3137161264 (34.618%) in intersection
    #   real    1m34.044s

    #	do *not* need to mask with this clean result since RepeatMasker
    #	does a very good job here.  Using RM masking instead.
#    zcat cleanWMask.bed.gz \
#	| twoBitMask ../../hg19.unmasked.2bit stdin \
#	    -type=.bed hg19.cleanWMSdust.2bit
#    twoBitToFa hg19.cleanWMSdust.2bit stdout | faSize stdin \
#        > hg19.cleanWMSdust.faSize.txt
#    cat hg19.cleanWMSdust.faSize.txt

    # how much does this window masker and repeat masker overlap:
    time featureBits -countGaps hg19 rmsk windowmaskerSdust
    #   849334688 bases of 3137161264 (27.073%) in intersection
    #   real    2m4.634s

    # RM by itself:
    time featureBits -countGaps hg19 rmsk
    #   1465724774 bases of 3137161264 (46.721%) in intersection
    #   real    0m33.408s

##########################################################################pubStart
# Publications track (DONE - 04-27-12 - Max)

# article download and conversion is run every night on hgwdev:
# 22 22 * * * /hive/data/inside/literature/pubtools/pubCronDailyUpdate.sh
# the script downloads files into /hive/data/outside/literature/{PubMedCentral,ElsevierConsyn}/
# then converts them to text into /hive/data/outside/literature/{pmc,elsevier}

# all configuration of the pipeline is in /hive/data/inside/literature/pubtools/lib/pubConf.py

# data processing was run manually like this
export PATH=/cluster/home/max/bin/x86_64:/cluster/bin/x86_64:/cluster/home/max/software/bin/:/cluster/software/bin:/cluster/home/max/projects/pubtools:/cluster/home/max/bin/x86_64:/hive/groups/recon/local/bin:/usr/local/bin:/usr/bin:/bin:/usr/bin/X11:/cluster/home/max/usr/src/scripts:/cluster/home/max/usr/src/oneshot:/cluster/home/max/bin:/cluster/bin/scripts:.:/cluster/home/max/usr/bin:/usr/lib64/qt-3.3/bin:/usr/kerberos/bin:/usr/local/bin:/bin:/usr/bin:/usr/lpp/mmfs/bin/:/opt/dell/srvadmin/bin:/cluster/bin/scripts:/hive/users/hiram/cloud/ec2-api-tools-1.3-51254/bin:/cluster/home/max/bin:/usr/bin/X11:/usr/java/jdk1.6.0_20/bin:/cluster/home/max/bin:/hive/data/inside/literature/pubtools/
# pmc
cd /hive/data/inside/literature/pubtools/runs/pmcBlat/
pubBlat init /hive/data/inside/literature/blat/pmc/ /hive/data/inside/literature/text/pmc
ssh swarm 
cd /hive/data/inside/literature/pubtools/runs/pmcBlat/
pubBlat steps:annot-tables
exit
pubBlat load

# elsevier
cd /hive/data/inside/literature/pubtools/runs/elsBlat/
pubBlat init /hive/data/inside/literature/blat/elsevier/ /hive/data/inside/literature/text/elsevier
ssh swarm
cd /hive/data/inside/literature/pubtools/runs/elsBlat/
pubBlat steps:annot-tables
exit
pubBlat load
#--pubEnd

#############################################################################
# lifting HapMap recombination maps from hg18 (DONE - 2012-05-09 - Hiram)
    mkdir -p /hive/data/genomes/hg19/bed/hapmap/release24FromHg18
    cd /hive/data/genomes/hg19/bed/hapmap/release24FromHg18
    ln -s /hive/data/genomes/hg18/bed/hapmap/release24/hapMapRelease24CEURecombMap.bedGraph hg18.hapMapRelease24CEURecombMap.bedGraph
    ln -s /hive/data/genomes/hg18/bed/hapmap/release24/hapMapRelease24YRIRecombMap.bedGraph hg18.hapMapRelease24YRIRecombMap.bedGraph
    ln -s /hive/data/genomes/hg18/bed/hapmap/release24/hapMapRelease24CombinedRecombMap.bedGraph hg18.hapMapRelease24CombinedRecombMap.bedGraph

    liftOver hg18.hapMapRelease24CEURecombMap.bedGraph \
        /hive/data/genomes/hg18/bed/liftOver/hg18ToHg19.over.chain.gz \
        stdout hapMapRelease24CEURecombMap.unmapped | sort -k1,1 -k2,2n \
        > hapMapRelease24CEURecombMap.bedGraph

    liftOver hg18.hapMapRelease24YRIRecombMap.bedGraph \
        /hive/data/genomes/hg18/bed/liftOver/hg18ToHg19.over.chain.gz \
        stdout hapMapRelease24YRIRecombMap.unmapped | sort -k1,1 -k2,2n \
        > hapMapRelease24YRIRecombMap.bedGraph

    liftOver hg18.hapMapRelease24CombinedRecombMap.bedGraph \
        /hive/data/genomes/hg18/bed/liftOver/hg18ToHg19.over.chain.gz \
        stdout hapMapRelease24CombinedRecombMap.unmapped | sort -k1,1 -k2,2n \
        > hapMapRelease24CombinedRecombMap.bedGraph


for F in hapMapRelease24CEURecombMap hapMapRelease24CombinedRecombMap \
        hapMapRelease24YRIRecombMap
do
bedGraphToBigWig -verbose=2 ${F}.bedGraph \
        /hive/data/genomes/hg19/chrom.sizes ${F}.bw > ${F}.log 2>&1
done

for T in hapMapRelease24CEURecombMap hapMapRelease24CombinedRecombMap \
        hapMapRelease24YRIRecombMap
do
    rm -f /gbdb/hg19/decode/${T}.bw
    ln -s `pwd`/${T}.bw /gbdb/hg19/decode/${T}.bw
    hgsql -e "drop table ${T};" hg19
    hgBbiDbLink hg19 ${T} /gbdb/hg19/decode/${T}.bw
done

#############################################################################
# 1000 GENOMES PHASE 1 VARIANT CALLS (UPDATE DONE 10/9/12 angie)
# Autosomes and chrX loaded 5/21/12; chrY (and chrM but it's rCRS of course)
# became available in July '12.  Existing released files were quietly updated
# 10/1/12 with some new variant IDs.
    # This is a lot of data.  Use aspera (ascp) instead of ftp, run in a screen.
    screen -S phase1
    mkdir -p /hive/data/genomes/hg19/bed/1000Genomes/phase1
    cd /hive/data/genomes/hg19/bed/1000Genomes/phase1
    set ascpCmd = /opt/aspera/connect/bin/ascp
    set ascpArgs = '-i /opt/aspera/connect/etc/asperaweb_id_dsa.putty -QTr -l150M'
    set phase1Path = anonftp@ftp-private.ncbi.nlm.nih.gov:/1000genomes/ftp/phase1/analysis_results/integrated_call_sets
    $ascpCmd $ascpArgs \
      $phase1Path/README.ALL.BI_genome_strip_hq_chrY.20101123 \
      $phase1Path/README_phase1_integrated_call_set_20120621 \
      $phase1Path/integrated_call_samples.20101123.ALL.panel \
      $phase1Path/integrated_call_samples.20101123.ped \
      $phase1Path/uniq.chrY.human.ncbi37.txt \
      .
    # BTW if you see "Error 51 [Destination: Permission denied]" when reloading,
    # it's because the files are read-only -- move aside or rm, then try again.
    foreach c (1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 X)
      set file = $phase1Path/ALL.chr$c.integrated_phase1_v3.20101123.snps_indels_svs.genotypes.vcf.gz
      set cmd = "$ascpCmd $ascpArgs $file $file.tbi ."
      echo $cmd
      $cmd
      if ($status != 0) then
        echo ================ ERROR chrom $c ======================
      endif
    end
    du -sh --apparent .
#142G    .
    $ascpCmd $ascpArgs $phase1Path/ALL.chrY.genome_strip_hq.20101123.svs.low_coverage.genotypes.vcf.gz{,.tbi} .
    $ascpCmd $ascpArgs $phase1Path/ALL.chrY.phase1_samtools_si.20101123.snps.low_coverage.genotypes.vcf.gz{,.tbi} .
    # I wondered why they didn't merge the two chrY call sets (SNPs and SVs) --
    # the SVS file has 456 individs and the SNPs file has 526.  At least the
    # 456 are a subset of the 526... but for now I will just use the SNPs file.

    # Grab chrMT even though we don't have liftOver for VCF at this point:
    $ascpCmd $ascpArgs $phase1Path/ALL.chrMT.phase1_samtools_si.20101123.snps.low_coverage.genotypes.vcf.gz{,.tbi} .

    # Hmmmm, how much space do we have on /gbdb?  Well, link it on hgwdev anyway:
    mkdir /gbdb/hg19/1000Genomes
    ln -s `pwd`/*.vcf.gz* /gbdb/hg19/1000Genomes/
    cp /dev/null tgpPhase1.txt
    foreach c (1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 X)
      set file = ALL.chr$c.integrated_phase1_v3.20101123.snps_indels_svs.genotypes.vcf.gz
      echo "/gbdb/hg19/1000Genomes/$file\tchr$c" >> tgpPhase1.txt
    end
    echo "/gbdb/hg19/1000Genomes/ALL.chrY.phase1_samtools_si.20101123.snps.low_coverage.genotypes.vcf.gz\tchrY" \
      >> tgpPhase1.txt
    hgLoadSqlTab hg19 tgpPhase1 ~/kent/src/hg/lib/bbiChroms.sql tgpPhase1.txt
    # Make a chromosomes line for trackDb:
    use hg19 -NBe 'select seqName from tgpPhase1' | xargs echo | sed -e 's/ /,/g'
#chr1,chr10,chr11,chr12,chr13,chr14,chr15,chr16,chr17,chr18,chr19,chr2,chr20,chr21,chr22,chr3,chr4,chr5,chr6,chr7,chr8,chr9,chrX,chrY


############################################################################
# 1000 GENOMES PAIRED-END ACCESSIBLE REGIONS (#1079) (DONE 8/22/12 angie)
    # Data provided by Tom Blackwell at University of Michigan
    mkdir /hive/data/genomes/hg19/bed/1000Genomes/phase1Mapability
    cd /hive/data/genomes/hg19/bed/1000Genomes/phase1Mapability
    wget -r ftp://share.sph.umich.edu/public
    ln -s `pwd`/share.sph.umich.edu/public/paired.end.mapping.1000G..pilot.bb \
      /gbdb/hg19/1000Genomes/
    ln -s `pwd`/share.sph.umich.edu/public/paired.end.mapping.1000G.strict.bb \
      /gbdb/hg19/1000Genomes/
    hgBbiLink hg19 tgpPhase1AccessibilityPilotCriteria \
      /gbdb/hg19/1000Genomes/paired.end.mapping.1000G..pilot.bb
    hgBbiLink hg19 tgpPhase1AccessibilityStrictCriteria \
      /gbdb/hg19/1000Genomes/paired.end.mapping.1000G.strict.bb


############################################################################
# UPDATE COSMIC TRACK - v59 (DONE 2012-05-23 larrym)

~/kent/src/hg/utils/automation/loadCosmic.pl -oldVer=55 hg19 ftp://ftp.sanger.ac.uk/pub/CGP/cosmic/data_export/Request_based_exports/UCSCMutExp_v59_230512.csv
Loading COSMIC v59
New length: 136638
Old length: 49087
Percent bed overlap with previous version: 99.95%
Number of deleted IDs: 20
Number of added IDs: 87571

#########################################################################
# LASTZ Rat Rn5 (DONE - 2009-05-13 - Hiram)
    mkdir /hive/data/genomes/hg19/bed/lastzRn5.2012-06-27
    cd /hive/data/genomes/hg19/bed/lastzRn5.2012-06-27

    cat << '_EOF_' > DEF
# human vs rat

# TARGET: Human Hg19
SEQ1_DIR=/scratch/data/hg19/nib
SEQ1_LEN=/scratch/data/hg19/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000

# QUERY: Rat Rn5
SEQ2_DIR=/hive/data/genomes/rn5/rn5.2bit
SEQ2_LEN=/hive/data/genomes/rn5/chrom.sizes
SEQ2_CHUNK=10000000
SEQ2_LIMIT=100
SEQ2_LAP=0

BASE=/hive/data/genomes/hg19/bed/lastzRn5.2012-06-27
TMPDIR=/scratch/tmp
'_EOF_'
    # << happy emacs

    #	establish a screen to control this job
    screen
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
	`pwd`/DEF \
	-syntenicNet -noLoadChainSplit \
	-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
	-chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 &
    #   real    658m53.984s
    cat fb.hg19.chainRn5Link.txt
    #   917356917 bases of 2897316137 (31.662%) in intersection

    #	running the swap - DONE - 2012-06-27
    mkdir /hive/data/genomes/rn5/bed/blastz.hg19.swap
    cd /hive/data/genomes/rn5/bed/blastz.hg19.swap
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
	/hive/data/genomes/hg19/bed/lastzRn5.2012-06-27/DEF \
	-swap -noLoadChainSplit \
	-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
	-chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 &
    #   real    66m53.095s
    cat fb.rn5.chainHg19Link.txt
    #	933922552 bases of 2572853723 (36.299%) in intersection

##############################################################################
# LASTZ tenrec echTel1 (DONE - 2012-06-29 - Hiram)
    # establish a screen to control this job with a name to indicate what it is
    screen -S hg19EchTel1
    mkdir /hive/data/genomes/hg19/bed/lastzEchTel1.2012-06-29
    cd /hive/data/genomes/hg19/bed/lastzEchTel1.2012-06-29

    # adjust the SEQ2_LIMIT with -stop=partition to get a reasonable
    #	number of jobs, 50,000 to something under 100,000
    cat << '_EOF_' > DEF
# tenrec vs human
BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz
# TARGET: Human Hg19
SEQ1_DIR=/scratch/data/hg19/hg19.2bit
SEQ1_LEN=/scratch/data/hg19/chrom.sizes
SEQ1_CHUNK=20000000
SEQ1_LAP=10000

# QUERY: tenrec EchTel1
SEQ2_DIR=/scratch/data/echTel1/echTel1.2bit
SEQ2_LEN=/scratch/data/echTel1/chrom.sizes
SEQ2_CHUNK=10000000
SEQ2_LAP=0
SEQ2_LIMIT=700

BASE=/hive/data/genomes/hg19/bed/lastzEchTel1.2012-06-29
TMPDIR=/scratch/tmp
'_EOF_'
    # << happy emacs

    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
	`pwd`/DEF \
	-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
	-chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 &
    #   real    411m54.452s
    cat fb.hg19.chainEchTel1Link.txt
    #   670299345 bases of 2897316137 (23.135%) in intersection

    # set sym link to indicate this is the lastz for this genome:
    cd /hive/data/genomes/hg19/bed
    ln -s lastzEchTel1.2012-06-29 lastz.echTel1

    # better to have reciprocal best for this one since it is low coverage:
    cd /hive/data/genomes/hg19/bed/lastzEchTel1.2012-06-29
    time doRecipBest.pl hg19 echTel1 -buildDir=`pwd` -workhorse=hgwdev \
	> best.log 2>&1 &
    #   real    48m11.157s

    mkdir /hive/data/genomes/echTel1/bed/blastz.hg19.swap
    cd /hive/data/genomes/echTel1/bed/blastz.hg19.swap
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
	/hive/data/genomes/hg19/bed/lastzEchTel1.2012-06-29/DEF \
	-swap -syntenicNet \
	-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
	-chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 &
    #   real    405m49.935s
    cat fb.echTel1.chainMm10Link.txt
    #   659524096 bases of 2111581369 (31.234%) in intersection

    # set sym link to indicate this is the lastz for this genome:
    cd /hive/data/genomes/echTel1/bed
    ln -s blastz.hg19.swap lastz.hg19

##############################################################################
# LASTZ dog canFam3 (DONE - 2012-07-03 - Hiram)
    # establish a screen to control this job with a name to indicate what it is
    screen -S hg19CanFam3
    mkdir /hive/data/genomes/hg19/bed/lastzCanFam3.2012-07-03
    cd /hive/data/genomes/hg19/bed/lastzCanFam3.2012-07-03

    # adjust the SEQ2_LIMIT with -stop=partition to get a reasonable
    #	number of jobs, 50,000 to something under 100,000
    cat << '_EOF_' > DEF
# human vs dog
BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz
# TARGET: Human Hg19
SEQ1_DIR=/scratch/data/hg19/hg19.2bit
SEQ1_LEN=/scratch/data/hg19/chrom.sizes
SEQ1_CHUNK=20000000
SEQ1_LAP=10000

# QUERY: dog CanFam3
SEQ2_DIR=/hive/data/genomes/canFam3/canFam3.2bit
SEQ2_LEN=/hive/data/genomes/canFam3/chrom.sizes
SEQ2_CHUNK=10000000
SEQ2_LAP=0
SEQ2_LIMIT=20

BASE=/hive/data/genomes/hg19/bed/lastzCanFam3.2012-07-03
TMPDIR=/scratch/tmp
'_EOF_'
    # << happy emacs

    # forgot to copy to the log
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
	`pwd`/DEF \
	-syntenicNet \
	-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
	-chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 &
    #   real    1019m39.790s

    cat fb.hg19.chainCanFam3Link.txt
    #   1502192631 bases of 2897316137 (51.848%) in intersection

    # set sym link to indicate this is the lastz for this genome:
    cd /hive/data/genomes/hg19/bed
    ln -s lastzCanFam3.2012-07-03 lastz.canFam3

    mkdir /hive/data/genomes/canFam3/bed/blastz.hg19.swap
    cd /hive/data/genomes/canFam3/bed/blastz.hg19.swap
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
	/hive/data/genomes/hg19/bed/lastzCanFam3.2012-07-03/DEF \
	-swap -syntenicNet \
	-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
	-chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 &
    #   real    103m14.464s
    cat fb.canFam3.chainHg19Link.txt
    #   1455183825 bases of 2392715236 (60.817%) in intersection
    # set sym link to indicate this is the lastz for this genome:
    cd /hive/data/genomes/canFam3/bed
    ln -s blastz.hg19.swap lastz.hg19


##############################################################################
# DBSNP B137 / SNP137 (DONE 11/9/12)
# Originally done 7/11/12; updated w/corrections from dbSNP 9/10/12, 10/10, 11/9/12
# -- see comments below and #8360 note 36, 42, 45
# Redmine #8360
    mkdir -p /hive/data/outside/dbSNP/137/human
    cd /hive/data/outside/dbSNP/137/human
    # Look at the directory listing of ftp://ftp.ncbi.nih.gov/snp/database/organism_data/
    # to find the subdir name to use as orgDir below (human_9606 in this case).
    # Then click into that directory and look for file names like
    #    b(1[0-9][0-9])_*_([0-9]+_[0-9])
    # -- use the first num for build and the second num_num for buildAssembly.
    # jkStuff/liftContigs.lft maps NCBI contig names to chroms; use that for liftUp.
    #
    # Some trial and error was required to get the config.ra just right --
    # the b* filenames don't include buildAssembly!
    # patch contigs needed to be filtered out:
    cat > config.ra <<EOF
db hg19
orgDir human_9606
build 137
buildAssembly
liftUp /hive/data/genomes/hg19/jkStuff/liftContigs.lft
refAssemblyLabel GRCh37.p5
ignoreDbSnpContigs NW_003(3159[0-9][0-9]|5710[3-6][0-9])
EOF
    ~/kent/src/hg/utils/automation/doDbSnp.pl config.ra >& do.log & tail -f do.log
    # Script failed with mysql warnings because b137_ContigInfo.bcp.gz had an extra
    # column in the middle, relative to the CREATE TABLE def in human_9606_table.sql.gz.
    # I emailed dbsnp-collab-all@ncbi and manually spliced out the unexplained column:
    cd data
    mv b137_ContigInfo.bcp.gz b137_ContigInfo_extraColumn.bcp.gz
    zcat b137_ContigInfo_extraColumn.bcp.gz | cut -f 1-13,15-27 | gzip -c > b137_ContigInfo.bcp.gz
    ~/kent/src/hg/utils/automation/doDbSnp.pl config.ra -continue loadDbSnp \
      >>& do.log & tail -f do.log
    # mysql server crashed when loading the last table. Added "if (0)" around the
    # parts of loadDbSnp.csh that succeeded and ran it again to catch the last table:
    ./loadDbSnp.csh >>& do.log & tail -f do.log
    ~/kent/src/hg/utils/automation/doDbSnp.pl config.ra -continue addToDbSnp \
      >>& do.log & tail -f do.log
    # Next error:
#/cluster/home/angie/kent/src/hg/utils/automation/snpAddTGPAlleleFreq.pl hg19snp137
#SNPAlleleFreq_TGP data are not sorted on snp_id (183304030 follows 191299099) at /cluster/home/angie/kent/src/hg/utils/automation/snpAddTGPAlleleFreq.pl line 75, <$tgpAlF> line 4.
    # I modified the mysql queries in snpAddTGPAlleleFreq.pl to order by snp_id,
    # added "if (0)" around the successful portion addToDbSnp.csh, ran again:
    ./addToDbSnp.csh >>& do.log & tail -f do.log
    # Had to do the above a few more times to deal with other unexpected conditions
    # e.g. an allele called "+".
    ~/kent/src/hg/utils/automation/doDbSnp.pl config.ra -continue bigJoin \
      >>& do.log & tail -f do.log
    # Some tweaks to snpNcbiToUcsc.c required (larger MAX_SNPID, new locType 7, new func 30=ncRNA):
    ~/kent/src/hg/utils/automation/doDbSnp.pl config.ra -continue translate \
      >>& do.log & tail -f do.log
    # After final snpNcbiToUcsc tweaking:
    ~/kent/src/hg/utils/automation/doDbSnp.pl config.ra -continue load \
      >>& do.log & tail -f do.log

    # 9/10/12, 10/10/12, 11/9/12: updates w/corrections from dbSNP.
    # The 9/10 update included SNPContigLoc with some corrected mappings -- and some
    # dropped mappings!
    # All 3 updates included new SNPContigLocusId (func predictions).
    # So, last time I'm doing this for snp137 -- new SNPContigLocusId, and
    # new SNPContigLoc with dropped lines added back in from original SNPContigLoc.
    cd /hive/data/outside/dbSNP/137/human/data
    # 9/10:
    mv b137_SNPContigLoc.bcp.gz b137_SNPContigLoc.orig.bcp.gz
    mv b137_SNPContigLocusId.bcp.gz b137_SNPContigLocusId.orig.bcp.gz
    wget ftp://ftp.ncbi.nih.gov/snp/temp/post_b137_fix_preview/b137_SNPContigLoc.bcp.gz
    wget ftp://ftp.ncbi.nih.gov/snp/temp/post_b137_fix_preview/b137_SNPContigLocusId.bcp.gz
    # 10/10:
    wget ftp://ftp.ncbi.nih.gov/snp/temp/post_b137_fix_preview/b137_SNPContigLocusId_before_QA.bcp
    mv b137_SNPContigLocusId.bcp.gz b137_SNPContigLocusId.120910.bcp.gz
    gzip b137_SNPContigLocusId_before_QA.bcp
    ln -s b137_SNPContigLocusId.bcp.gz b137_SNPContigLocusId_before_QA.bcp.gz
    # 11/9:
    wget ftp://ftp.ncbi.nih.gov/snp/temp/post_b137_fix_preview/2012_nov_08_preview/b137_SNPContigLocusId.bcp.gz
    mv b137_SNPContigLocusId.bcp.gz b137_SNPContigLocusId.121108.bcp.gz
    ln -s b137_SNPContigLocusId.121108.bcp.gz b137_SNPContigLocusId.bcp.gz
    # No update to SNPContigLoc, which in the Sep. update dropped some mappings
    # from the original.  Add the dropped mappings back:
    cd /hive/data/outside/dbSNP/137/human/data
    cat > combineContigLoc.pl <<EOF
#!/usr/bin/env perl
# The latest b137_SNPContigLoc.bcp.gz contains some corrected mappings -
# but it is also missing some mappings that it should have!
# So step through the numerically sorted (by {snp_id, ctg_id, asn_from})
# new and old files; if the old file has something that the new file does not,
# add it back in.

use warnings;
use strict;

open(my $fNew, "zcat b137_SNPContigLoc.bcp.gz |") || die "$!";
open(my $fOld, "zcat b137_SNPContigLoc.orig.bcp.gz |") || die "$!";
open(my $fOut, "| gzip -c > b137_SNPContigLoc.merged.bcp.gz") || die "$!";

sub cmpLocs {
  my ($newRef, $oldRef) = @_;
  if (!defined $newRef->[1] || !defined $oldRef->[1]) {
    die;
  }
  my $diff = $newRef->[1] <=> $oldRef->[1];
  return $diff unless ($diff == 0);
  $diff = $newRef->[2] <=> $oldRef->[2];
  return $diff unless ($diff == 0);
  $diff = $newRef->[3] <=> $oldRef->[3];
  return $diff;
}

my @old = split("\t", <$fOld>);

while (<$fNew>) {
  my @new = split("\t");
  if (defined $old[1]) {
    my $diff = &cmpLocs(\@new, \@old);
    while ($diff > 0) {
      # old file's line is missing from the new file -- print old line & get next old line
      print $fOut join("\t", @old);
      my $nextOld = <$fOld>;
      @old = split("\t", $nextOld);
      last if (! defined $old[1]);
      $diff = &cmpLocs(\@new, \@old);
    }
    if ($diff == 0) {
      # same line in new and old -- advance to next line from old file
      @old = split("\t", <$fOld>);
    }
  }
  # always print line from new file.
  print $fOut join("\t", @new);
}
if (defined $old[1]) {
  print $fOut join("\t", @old);
  while (<$fOld>) {
    print $fOut $_;
  }
}
EOF
    chmod a+x combineContigLoc.pl
    ./combineContigLoc.pl

    # Redo the parts of the doDbSnp.pl process that depend on SNPContigLoc and SNPContigLocusId:
    hgsql hg19snp137 -e 'drop table b137_SNPContigLoc; drop table b137_SNPContigLocusId;'
    hgsql hg19snp137 < schema/SNPContigLocs.sql
    # Relevant subset of loadDbSnp.csh:
    cd /hive/data/outside/dbSNP/137/human
    setenv TMPDIR /data/tmp
    set tmpDir = `mktemp -d $TMPDIR/doDbSnp.pl.translate.XXXXXX`
    chmod 775 $tmpDir
    pushd $tmpDir
    echo $tmpDir > /hive/data/outside/dbSNP/137/human/workingDir
    set t = b137_SNPContigLocusId
    zcat /hive/data/outside/dbSNP/137/human/data/$t.bcp.gz | egrep -vw '(HuRef|CRA_TCAGchr7v2)'  | egrep -vw 'NW_003(3159[0-9][0-9]|5710[3-6][0-9])'\
    | perl -wpe 's/(\d\d:\d\d:\d\d)\.\d+/$1/g; s/\t(\t|\n)/\t\\N$1/g; s/\t(\t|\n)/\t\\N$1/g;' \
    | hgLoadSqlTab -oldTable hg19snp137 $t placeholder stdin
    hgsql hg19snp137 -e 'alter table b137_SNPContigLocusId add index (ctg_id);'
    zcat /hive/data/outside/dbSNP/137/human/data/b137_ContigInfo.bcp.gz | egrep -vw '(HuRef|CRA_TCAGchr7v2)' \
    | cut -f 1 | sort -n > b137_ContigInfo.ctg_id.txt
    zcat /hive/data/outside/dbSNP/137/human/data/b137_SNPContigLoc.merged.bcp.gz \
    | grep -Fwf b137_ContigInfo.ctg_id.txt \
    | perl -wpe 's/(\d\d:\d\d:\d\d)\.\d+/$1/g; s/\t(\t|\n)/\t\\N$1/g; s/\t(\t|\n)/\t\\N$1/g;' \
      > tmp.tab
    hgLoadSqlTab -oldTable hg19snp137 b137_SNPContigLoc placeholder tmp.tab
    hgsql hg19snp137 -e 'alter table b137_SNPContigLoc add index (ctg_id);'
    hgsql hg19snp137 -e 'create table ContigLocFix select cl.* from b137_SNPContigLoc as cl, b137_ContigInfo as ci where cl.ctg_id = ci.ctg_id;'
    hgsql hg19snp137 -e 'alter table ContigLocFix add index (ctg_id);'
    hgsql hg19snp137 -e 'drop table b137_SNPContigLoc; \
                         rename table ContigLocFix to b137_SNPContigLoc;'
    hgsql hg19snp137 -e 'alter table b137_SNPContigLoc add index (snp_id);'
    popd
    # Run the first parts of addToDbSnp (if(0)'d out the rest):
    ./addToDbSnp.csh >>& do.log & tail -f do.log
    # Redo from bigJoin onward...
    ~/kent/src/hg/utils/automation/doDbSnp.pl config.ra -continue bigJoin \
      >>& do.log & tail -f do.log


#############################################################################
# FILTER SNP137 (DONE 11/10/12 angie)
# Originally done 7/11/12; updated 9/11/12, 10/15/12, 11/9/12 -- see SNP137 above
    # Redmine #8360
    # Make several tracks that are filtered subsets of snp137:
    # First, filter out the multiply-aligned and/or weight >1 SNPs -> snp137Mult
    # Second, siphon off the common variants -> snp137Common
    # Third, take the (uniquely mapped, not known to be common) variants
    # w/dbSNP's "clinically-assoc" flag -> snp137Flagged
    cd /hive/data/outside/dbSNP/137/human
    zcat snp137.bed.gz \
    | perl -we \
      '$minTotal2N = 10; \
       ($multCount, $comCount, $flagCount, $miscCount) = (0,0,0,0); \
       open($mult, "| gzip -c > snp137Mult.bed.gz") || die; \
       open($common,    "| gzip -c > snp137Common.bed.gz") || die; \
       open($flagged,   "| gzip -c > snp137Flagged.bed.gz") || die; \
       open($misc,      "| gzip -c > snp137Misc.bed.gz") || die; \
       while (<>) { \
         @w = split("\t"); \
         if ($w[16] > 1 || $w[17] =~ /MultipleAlignments/) { \
           print $mult $_; \
           $multCount++; \
         } else { \
           my ($alleleFreqCount, $nStr, $freqStr) = ($w[20], $w[22], $w[23]); \
           my @alNs = split(",", $nStr);      die unless scalar(@alNs) == $alleleFreqCount; \
           my @freqs = split(",", $freqStr);  die unless scalar(@freqs) == $alleleFreqCount; \
           my ($total2N, $maxAlleleFreq) = (0, 0); \
           for (my $i = 0;  $i < $alleleFreqCount;  $i++) { \
             $total2N += $alNs[$i]; \
             $maxAlleleFreq = $freqs[$i] if ($freqs[$i] > $maxAlleleFreq); \
           } \
           if ($alleleFreqCount >= 2 && $total2N >= $minTotal2N && $maxAlleleFreq <= 0.99) { \
             print $common $_; \
             $comCount++; \
           } elsif($w[24] =~ /clinically-assoc/)  { \
             print $flagged $_; \
             $flagCount++; \
           } else { \
             print $misc $_; \
             $miscCount++; \
           } \
         } \
       } \
       close($mult);  close($common); close($flagged);  close($misc); \
       print "snp137Mult:    $multCount\nsnp137Common:  $comCount\nsnp137Flagged: $flagCount\n" . \
             "leftover:      $miscCount\n";'
#snp137Mult:    3633662
#snp137Common:  13894623
#snp137Flagged: 42733
#leftover:      38677681

    # Load tables
    foreach subset (Mult Common Flagged)
      hgLoadBed -tab -onServer -tmpDir=/data/tmp -allowStartEqualEnd -renameSqlTable \
        hg19 snp137$subset -sqlTable=snp137.sql snp137$subset.bed.gz
    end


#############################################################################
# SNP137 ORTHOLOGOUS ALLELES IN CHIMP, ORANG, MACAQUE (DONE 7/12/12 angie)
# Chose not to redo this 9/11/12 (see SNP137 above) because this is only for SNVs
# and only indel locations were changed.
    mkdir /hive/data/genomes/hg19/bed/snp137Ortho
    cd /hive/data/genomes/hg19/bed/snp137Ortho
    # Filter snp137 to to keep only uniquely mapped biallelic SNVs (class=single, length=1);
    zcat /hive/data/outside/dbSNP/137/human/snp137.bed.gz \
    | awk '$18 ~ /^MultipleAlignments|SingleClassTriAllelic|SingleClassQuadAllelic/ {print $4;}' \
    | sort -u \
      > snp137ExcludeIds.txt
    wc -l snp137ExcludeIds.txt
#1267059 snp137ExcludeIds.txt
    # Glom all human info that we need for the final table onto the
    # name, to sneak it through liftOver: rsId|chr|start|end|obs|ref|strand
    zcat /hive/data/outside/dbSNP/137/human/snp137.bed.gz \
    | awk '$3-$2 == 1 && $11 == "single" {print;}' \
    | grep -vFwf snp137ExcludeIds.txt \
    | awk 'BEGIN{OFS="\t";} \
        {print $1, $2, $3, \
               $4 "|" $1 "|" $2 "|" $3 "|" $9 "|" $8 "|" $6, \
               0, $6;}' \
      > snp137ForLiftOver.bed

    # Map coords to chimp using liftOver.
    mkdir run.liftOChimp
    cd run.liftOChimp
    mkdir split out
    splitFile ../snp137ForLiftOver.bed 10000 split/chunk
    cp /dev/null jobList
    foreach f (split/chunk*)
      echo liftOver $f \
        /hive/data/genomes/hg19/bed/liftOver/hg19ToPanTro3.over.chain.gz \
        \{check out exists out/panTro3.$f:t.bed\} out/hg19.$f:t.unmapped \
        >> jobList
    end
    ssh swarm
    cd /hive/data/genomes/hg19/bed/snp137Ortho/run.liftOChimp
    para make jobList
#Completed: 4597 of 4597 jobs
#CPU time in finished jobs:     443120s    7385.34m   123.09h    5.13d  0.014 y
#IO & Wait Time:                 46429s     773.81m    12.90h    0.54d  0.001 y
#Average job time:                 106s       1.77m     0.03h    0.00d
#Longest finished job:             261s       4.35m     0.07h    0.00d
#Submission to last job:           558s       9.30m     0.15h    0.01d

    # Map coords to orangutan using liftOver.
    mkdir ../run.liftOPon
    cd ../run.liftOPon
    mkdir out
    ln -s ../run.liftOChimp/split .
    cp /dev/null jobList
    foreach f (split/chunk*)
      echo liftOver $f \
        /hive/data/genomes/hg19/bed/liftOver/hg19ToPonAbe2.over.chain.gz \
        \{check out exists out/ponAbe2.$f:t.bed\} out/hg19.$f:t.unmapped \
        >> jobList
    end
    para make jobList
#Completed: 4597 of 4597 jobs
#CPU time in finished jobs:     924695s   15411.59m   256.86h   10.70d  0.029 y
#IO & Wait Time:                 90764s    1512.73m    25.21h    1.05d  0.003 y
#Average job time:                 221s       3.68m     0.06h    0.00d
#Longest finished job:             580s       9.67m     0.16h    0.01d
#Submission to last job:          1201s      20.02m     0.33h    0.01d

    # Map coords to macaque using liftOver.
    mkdir ../run.liftOMac
    cd ../run.liftOMac
    mkdir out
    ln -s ../run.liftOChimp/split .
    cp /dev/null jobList
    foreach f (split/chunk*)
      echo liftOver $f \
        /hive/data/genomes/hg19/bed/liftOver/hg19ToRheMac2.over.chain.gz \
        \{check out exists out/rheMac2.$f:t.bed\} out/hg19.$f:t.unmapped \
        >> jobList
    end
    para make jobList
#Completed: 4597 of 4597 jobs
#CPU time in finished jobs:    1189764s   19829.40m   330.49h   13.77d  0.038 y
#IO & Wait Time:                107365s    1789.42m    29.82h    1.24d  0.003 y
#Average job time:                 282s       4.70m     0.08h    0.00d
#Longest finished job:             694s      11.57m     0.19h    0.01d
#Submission to last job:          1565s      26.08m     0.43h    0.02d

    cd /hive/data/genomes/hg19/bed/snp137Ortho
    # Concatenate the chimp results, sorting by chimp pos in order to
    # efficiently access 2bit sequence in getOrthoSeq.  The output of
    # that is then sorted by the glommed human info field, so that we
    # can use join to combine chimp and macaque results in the next step.
    # Ditto for macaque and orangutan.  Each command pipe takes ~6 minutes:
    sort -k1,1 -k2n,2n run.liftOChimp/out/panTro3.chunk*.bed \
    | ~/kent/src/hg/snp/snpLoad/getOrthoSeq.pl /cluster/data/panTro3/panTro3.2bit \
    | sort > panTro3.orthoGlom.txt
    sort -k1,1 -k2n,2n run.liftOPon/out/ponAbe2.chunk*.bed \
    | ~/kent/src/hg/snp/snpLoad/getOrthoSeq.pl /cluster/data/ponAbe2/ponAbe2.2bit \
    | sort > ponAbe2.orthoGlom.txt
    sort -k1,1 -k2n,2n run.liftOMac/out/rheMac2.chunk*.bed \
    | ~/kent/src/hg/snp/snpLoad/getOrthoSeq.pl /cluster/data/rheMac2/rheMac2.2bit \
    | sort > rheMac2.orthoGlom.txt
    wc -l panTro3.orthoGlom.txt ponAbe2.orthoGlom.txt rheMac2.orthoGlom.txt
#  43311070 panTro3.orthoGlom.txt
#  41254270 ponAbe2.orthoGlom.txt
#  37148915 rheMac2.orthoGlom.txt

    # Use the glommed name field as a key to join up chimp and macaque
    # allele data.  Include glommed name from both files because if only
    # file 2 has a line for the key in 2.1, then 1.1 is empty.  Then plop
    # in the orthoGlom fields from each file, which are in the same order
    # as the chimp and macaque columns of snp137OrthoPanTro2RheMac2.
    join -o '1.1 2.1 1.2 1.3 1.4 1.5 1.6 2.2 2.3 2.4 2.5 2.6' \
      -a 1 -a 2 -e '?' \
      panTro3.orthoGlom.txt ponAbe2.orthoGlom.txt \
    | awk '{if ($1 != "?") { print $1, $3,$4,$5,$6,$7,$8,$9,$10,$11,$12; } \
            else           { print $2, $3,$4,$5,$6,$7,$8,$9,$10,$11,$12; }}' \
      > tmp.txt
    join -o '1.1 2.1 1.2 1.3 1.4 1.5 1.6 1.7 1.8 1.9 1.10 1.11 2.2 2.3 2.4 2.5 2.6' \
      -a 1 -a 2 -e '?' \
      tmp.txt rheMac2.orthoGlom.txt \
    | perl -wpe 'chomp; \
        ($glom12, $glom3, $o1Chr, $o1Start, $o1End, $o1Al, $o1Strand, \
         $o2Chr, $o2Start, $o2End, $o2Al, $o2Strand, \
         $o3Chr, $o3Start, $o3End, $o3Al, $o3Strand) = split; \
        $glomKey = ($glom12 ne "?") ? $glom12 : $glom3; \
        ($rsId, $hChr, $hStart, $hEnd, $hObs, $hAl, $hStrand) = \
          split(/\|/, $glomKey); \
        $o1Start =~ s/^\?$/0/;  $o2Start =~ s/^\?$/0/;  $o3Start =~ s/^\?$/0/; \
        $o1End   =~ s/^\?$/0/;  $o2End   =~ s/^\?$/0/;  $o3End   =~ s/^\?$/0/; \
        print join("\t", $hChr, $hStart, $hEnd, $rsId, $hObs, $hAl, $hStrand, \
                         $o1Chr, $o1Start, $o1End, $o1Al, $o1Strand, \
                         $o2Chr, $o2Start, $o2End, $o2Al, $o2Strand, \
                         $o3Chr, $o3Start, $o3End, $o3Al, $o3Strand) . "\n"; \
        s/^.*$//;' \
    | sort -k1,1 -k2n,2n > snp137OrthoPt3Pa2Rm2.bed
    hgLoadBed -tab -onServer -tmpDir=/data/tmp -renameSqlTable \
      -sqlTable=$HOME/kent/src/hg/lib/snpOrthoPanPonRhe.sql \
      hg19 snp137OrthoPt3Pa2Rm2 snp137OrthoPt3Pa2Rm2.bed
#Read 44774198 elements of size 22 from snp137OrthoPt3Pa2Rm2.bed
    # Cleanup:
    rm -r run*/split tmp.txt *.orthoGlom.txt snp137Simple.bed
    gzip snp137ExcludeIds.txt snp137ForLiftOver.bed &


############################################################################
# DBSNP CODING ANNOTATIONS (137) (DONE 11/10/12 angie)
# Originally done 7/11/12 but code 43 (stop-loss) was omitted, and filtering out
# NULL frame caused us to lose 45 (cds-indel) too.
# Updated 7/30/12 with corrections for that issue.
# Updated 9/11/12 with extensive corrections from dbSNP (see SNP137 above, #8360 note 36)
# Updated 10/15/12 w/more corrections (see SNP137 above, #8360 note 42)
# Updated 11/9/12 w/more corrections (see SNP137 above, #8360 note 45)
    cd /hive/data/outside/dbSNP/137/human
    # ncbiFuncAnnotations.txt has NCBI coords: 0-based, fully closed.
    # For anything except an insertion (0 bases between flanks),
    # we need to add 1 to the end coord.  For an insertion, we need
    # to add 1 to the start coord.  Make a hash of the insertion IDs,
    # then look up each ID in ncbiFuncAnnotations.txt to tell which
    # transform to apply.
    # Note: sort -u with the keys below is too restrictive -- we need full line uniq.
    zcat ncbiFuncAnnotations.txt.gz \
    | perl -we 'open($IDS, "zcat ncbiFuncInsertions.ctg.bed.gz |") || die "ids: $!"; \
              while (<$IDS>) { chomp; $ids{$_} = 1; } \
              close($IDS); \
              %coding = (2=>1, 3=>1, 4=>1, 8=>1, 9=>1, 41=>1, 42=>1, 43=>1, 44=>1, 45=>1); \
              while (<>) { \
                chomp;  @w = split("\t"); # id, ctg, start, end, ... \
                next unless $coding{$w[5]}; \
                $bed4 = join("\t", $w[1], $w[2], $w[3], $w[0]); \
                if (exists $ids{$bed4} && $w[3] == $w[2]+1) { \
                  $w[2]++; # 2-base insertions: increment start coord \
                } else { \
                  $w[3]++; # increment end coord to get half-open \
                } \
                print join("\t", @w) . "\n"; \
              }' \
    | sort -k1n,1n -k2,2 -k3n,3n -k5,5 -k6n,6n \
    | uniq \
      > ncbiCodingAnnotations.txt
    wc -l ncbiCodingAnnotations.txt
#3873975 ncbiCodingAnnotations.txt
    # How many & what kinds of function types?
    cut -f 6 ncbiCodingAnnotations.txt \
    | sort -n | uniq -c
# 681327 3   (coding-synon)
#1917237 8   (cds-reference -- ignored)
#  35591 41  (nonsense)
#1190533 42  (missense)
#   1153 43  (stop-loss)
#  41695 44  (frameshift)
#   6439 45  (cds-indel)
    # In b137, the functional annotations include non-coding (frame = NULL),
    # which we'll exclude here because this is supposed to be just coding stuff...
    # probably need to update how we show dbSNP's func annos anyway, e.g.
    # it is a shame that we toss out codon number and transcript offset.
    # Gather up multiple annotation lines into one line per {snp, gene, frame}:
    perl -e  'while (<>) { chomp; \
                my ($rsId, $ctg, $s, $e, $txId, $fxn, $frm, $nt, $aa, $codon) = split("\t"); \
                next if ($fxn == 8 && ($frm eq "NULL" && $aa eq "NULL" && $codon eq "NULL")); \
                if (defined $lastRs && \
                    ($lastRs != $rsId || $lastCtg ne $ctg || $lastS != $s || \
                     $lastTx ne $txId || $lastFrm ne $frm)) { \
                  if (defined $refRow) { \
                    $fxns = "$refRow->[0],$fxns";  $nts = "$refRow->[1],$nts"; \
                    $aas = "$refRow->[2],$aas";    $codons = "$refRow->[3],$codons"; \
                  } \
                  $lineOut = "$lastCtg\t$lastS\t$lastE\trs$lastRs\t$lastTx\t$lastFrm\t" . \
                        "$count\t$fxns\t$nts\t$codons\t$aas\n"; \
                  $lineOut =~ s@NULL@n/a@g; \
                  print $lineOut; \
                  $refRow = undef;  @rows = ();  ($count, $fxns, $nts, $codons, $aas) = (); \
                } \
                ($lastRs, $lastCtg, $lastS, $lastE, $lastTx, $lastFrm) = \
                    ($rsId, $ctg, $s, $e, $txId, $frm); \
                $count++; \
                if ($fxn == 8) { \
                  $refRow = [$fxn, $nt, $aa, $codon]; \
                } else { \
                 $fxns .= "$fxn,";  $nts .= "$nt,";  $aas .= "$aa,";  $codons .= "$codon,"; \
                } \
              } \
              if (defined $refRow) { \
                $fxns = "$refRow->[0],$fxns";  $nts = "$refRow->[1],$nts"; \
                $aas = "$refRow->[2],$aas";    $codons = "$refRow->[3],$codons"; \
              } \
              $lineOut = "$lastCtg\t$lastS\t$lastE\trs$lastRs\t$lastTx\t$lastFrm\t" . \
                    "$count\t$fxns\t$nts\t$codons\t$aas\n"; \
              $lineOut =~ s@NULL@n/a@g; \
              print $lineOut;' \
      ncbiCodingAnnotations.txt \
    | liftUp snp137CodingDbSnp.bed /hive/data/genomes/hg19/jkStuff/liftContigs.lft warn stdin
    hgLoadBed hg19 snp137CodingDbSnp -sqlTable=$HOME/kent/src/hg/lib/snp125Coding.sql \
      -renameSqlTable -tab -notItemRgb -allowStartEqualEnd \
      snp137CodingDbSnp.bed
#Read 1922594 elements of size 11 from snp137CodingDbSnp.bed


############################################################################
# SNPMASKED SEQUENCE FOR SNP137 (DONE 7/12/12 angie)
# Chose not to redo this 9/11/12 (see SNP137 above) because this is only for SNVs
# and only indel locations were changed.
    mkdir /hive/data/genomes/hg19/snp137Mask
    cd /hive/data/genomes/hg19/snp137Mask
    # Identify rsIds with various problems -- we will exclude those.
    zcat /hive/data/outside/dbSNP/137/human/snp137.bed.gz \
    | awk '$18 ~ /MultipleAlignments|ObservedTooLong|ObservedWrongFormat|ObservedMismatch|MixedObserved/ {print $4;}' \
      | sort -u \
      > snp137ExcludeRsIds.txt
    zcat /hive/data/outside/dbSNP/137/human/snp137.bed.gz \
    | grep -vFwf snp137ExcludeRsIds.txt \
      > snp137Cleaned.bed
    wc -l snp137Cleaned.bed
#52047160 snp137Cleaned.bed

    # Substitutions:
    mkdir substitutions
    snpMaskSingle snp137Cleaned.bed /hive/data/genomes/hg19/hg19.2bit stdout diffObserved.txt \
    | faSplit byname stdin substitutions/
#Masked 46091199 snps in 46090845 out of 3131225094 genomic bases
#/hive/data/genomes/hg19/hg19.2bit has 3137161264 total bases, but the total number of bases in sequences for which we masked snps is 3131225094 (difference is 5936170)
    # Check that 5936170 is the total #bases in sequences with nothing in snp137Cleaned:
    grep -Fw single snp137Cleaned.bed | cut -f 1 | uniq > /data/tmp/1
    grep -vwf /data/tmp/1 ../chrom.sizes \
    | awk 'BEGIN {TOTAL = 0;}  {TOTAL += $2;}  END {printf "%d\n", TOTAL;}'
#5936170
    # warnings about differing observed strings at same base position:
    wc -l diffObserved.txt
#448 diffObserved.txt
#TODO: send list to dbSNP.
    # Make sure that sizes are identical, first diffs are normal -> IUPAC,
    # and first diffs' case is preserved:
    foreach f (substitutions/chr*.fa)
      faCmp -softMask $f ../[1-9UMXY]*/$f:t |& grep -v "that differ"
    end
#chr1 in substitutions/chr1.fa differs from chr1 at ../1/chr1.fa at base 10176 (m != a)
#chr10 in substitutions/chr10.fa differs from chr10 at ../10/chr10.fa at base 60522 (K != T)
#...
#(output OK -- ambiguous bases replacing [agct] at SNP positions)
    foreach f (substitutions/chr*.fa)
      echo $f:t:r
      mv $f $f:r.subst.fa
      gzip $f:r.subst.fa &
    end

    # Insertions & deletions not done.  To date we have only offered substs for download.
    # If there is user demand, use template from snp131 above.

    # Clean up and prepare for download:
    gzip snp137Cleaned.bed &
    foreach d (substitutions)
      pushd $d
        md5sum *.gz > md5sum.txt
        cp /hive/data/genomes/hg19/snp135Mask/$d/README.txt .
      popd
    end
    # Edit the README.txt.

    # Create download links on hgwdev.
    mkdir /usr/local/apache/htdocs-hgdownload/goldenPath/hg19/snp137Mask
    ln -s /hive/data/genomes/hg19/snp137Mask/substitutions/* \
      /usr/local/apache/htdocs-hgdownload/goldenPath/hg19/snp137Mask/

#########################################################################
# LASTZ Macaca Mulatta RheMac3 (DONE - 2012-03-15 - Chin)
    mkdir /hive/data/genomes/hg19/bed/lastzRheMac3.2012-03-15
    cd /hive/data/genomes/hg19/bed/lastzRheMac3.2012-03-15

    cat << '_EOF_' > DEF
# human vs macaca mulatta
BLASTZ=lastz
# maximum M allowed with lastz is only 254
BLASTZ_M=254
BLASTZ_Q=/scratch/data/blastz/human_chimp.v2.q
# and place those items here
BLASTZ_O=600
BLASTZ_E=150
# other parameters from panTro2 vs hg18 lastz on advice from Webb
BLASTZ_K=4500
BLASTZ_Y=15000
BLASTZ_T=2

# TARGET: Human Hg19
SEQ1_DIR=/scratch/data/hg19/hg19.2bit
SEQ1_LEN=/scratch/data/hg19/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
SEQ1_IN_CONTIGS=0

# QUERY: Macaca Mulatta RheMac3
SEQ2_DIR=/scratch/data/rheMac3/rheMac3.2bit
SEQ2_LEN=/scratch/data/rheMac3/chrom.sizes
SEQ2_CHUNK=20000000
SEQ2_LAP=0
SEQ2_IN_CONTIGS=0

BASE=/hive/data/genomes/hg19/bed/lastzRheMac3.2012-03-15
TMPDIR=/scratch/tmp
'_EOF_'
    # << happy emacs
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
        `pwd`/DEF \
        -syntenicNet \
        -noLoadChainSplit -chainMinScore=5000 -chainLinearGap=medium \
        -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
        > do.log 2>&1 &
    #   real     322m50.822s
    cat fb.hg19.chainRheMac3Link.txt
    #   2400694407 bases of 2897316137 (82.859%) in intersection
    cd /hive/data/genomes/hg19/bed
    ln -s lastzRheMac3.2012-03-15 lastz.rheMac3


    #   running the swap - DONE - 20i12-03-16
    mkdir /hive/data/genomes/rheMac3/bed/blastz.hg19.swap
    cd /hive/data/genomes/rheMac3/bed/blastz.hg19.swap
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
        /hive/data/genomes/hg19/bed/lastzRheMac3.2012-03-15/DEF \
        -swap \
        -noLoadChainSplit -chainMinScore=5000 -chainLinearGap=medium \
        -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
        > swap.log 2>&1 &
    #    58m38.594s
    cat fb.rheMac3.chainHg19Link.txt
    #   2313806886 bases of 2646704109 (87.422%) in intersection
    cd /hive/data/genomes/rheMac3/bed
    ln -s blastz.hg19.swap lastz.hg19


############################################################################
# UPDATE COSMIC TRACK - v60 (DONE 2012-07-23 larrym)

~/kent/src/hg/utils/automation/loadCosmic.pl -dryRun hg19 ftp://ftp.sanger.ac.uk/pub/CGP/cosmic/data_export/Request_based_exports/UCSCMutExp_v60_190712.csv.gz
New length: 166164
Old length: 136638
Percent bed overlap with previous version: 100.00%
Number of deleted IDs: 4
Number of added IDs: 29530

############################################################################
# UPDATE COSMIC TRACK - v61 (DONE - 2012-11-09 - Hiram)

time ~/kent/src/hg/utils/automation/loadCosmic.pl -dryRun hg19 ftp://ftp.sanger.ac.uk/pub/CGP/cosmic/data_export/Request_based_exports/UCSCMutExp_v61_260912.csv.gz
#       real    1m10.070s
New length: 220318
Old length: 166164
Percent bed overlap with previous version: 100.00%
Number of deleted IDs: 28
Number of added IDs: 54182
time ~/kent/src/hg/utils/automation/loadCosmic.pl hg19 ftp://ftp.sanger.ac.uk/pub/CGP/cosmic/data_export/Request_based_exports/UCSCMutExp_v61_260912.csv.gz
#       real    0m8.251so
# Loading COSMIC v61
# New length: 220318
# Old length: 166164
# Percent bed overlap with previous version: 100.00%
# Number of deleted IDs: 28
# Number of added IDs: 54182
# Scanning through 1 files
# Reading cosmic.bed
# Read 220318 elements of size 4 from cosmic.bed
# Sorted
# Creating table definition for cosmic
# Saving bed.tab
# Loading hg19

############################################################################
# UPDATE COSMIC TRACK - v62 (DONE - 2012-12-18 - Hiram)
    # take a look at:
    # ftp://ftp.sanger.ac.uk/pub/CGP/cosmic/data_export/Request_based_exports/
    # to see what the new version file name is, then:


time ~/kent/src/hg/utils/automation/loadCosmic.pl -dryRun hg19 ftp://ftp.sanger.ac.uk/pub/CGP/cosmic/data_export/Request_based_exports/UCSCMutExp_v62_291112.csv.gz
#       New length: 536287
#       Old length: 220318
#       Percent bed overlap with previous version: 100.00%
#       Number of deleted IDs: 77
#       Number of added IDs: 316046

#       real    0m23.191s

    # that created files in: /hive/data/genomes/hg19/bed/cosmic/v62/

    # then:
    cd /hive/data/genomes/hg19/bed/cosmic/v62/
    time ~/kent/src/hg/utils/automation/loadCosmic.pl hg19 \
ftp://ftp.sanger.ac.uk/pub/CGP/cosmic/data_export/Request_based_exports/UCSCMutExp_v62_291112.csv.gz \
        > do.log 2>&1
    #   real    0m19.404s
    #   Read 536287 elements of size 4 from cosmic.bed

############################################################################
# UPDATE COSMIC TRACK - v63 (DONE - 2013-02-19 - Hiram)
    # take a look at:
    # ftp://ftp.sanger.ac.uk/pub/CGP/cosmic/data_export/Request_based_exports/
    # to see what the new version file name is, then:

    cd /hive/data/genomes/hg19/bed/cosmic

time ~/kent/src/hg/utils/automation/loadCosmic.pl -dryRun hg19 ftp://ftp.sanger.ac.uk/pub/CGP/cosmic/data_export/Request_based_exports/UCSCMutExp_v63_300113.csv.gz
#       New length: 616299
#       Old length: 536287
#       Percent bed overlap with previous version: 100.00%
#       Number of deleted IDs: 643
#       Number of added IDs: 80655

#       real    0m32.084s

    # that created files in: /hive/data/genomes/hg19/bed/cosmic/v63/

    # then:
    cd /hive/data/genomes/hg19/bed/cosmic/v63/
    time ~/kent/src/hg/utils/automation/loadCosmic.pl hg19 \
ftp://ftp.sanger.ac.uk/pub/CGP/cosmic/data_export/Request_based_exports/UCSCMutExp_v63_300113.csv.gz \
        > do.log 2>&1
    #   real    0m24.619s

    #   Read 616299 elements of size 4 from cosmic.bed

############################################################################
# lastz Medium Ground Finch geoFor1 (DONE - 2012-07-29 - Hiram)
    # establish a screen to control this job with a name to indicate what it is
    screen -S hg19
    mkdir /hive/data/genomes/hg19/bed/lastzGeoFor1.2012-07-29
    cd /hive/data/genomes/hg19/bed/lastzGeoFor1.2012-07-29

    cat << '_EOF_' > DEF
# Human vs. medium ground finch
BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz
BLASTZ_Y=3400
BLASTZ_L=10000
BLASTZ_K=2200
BLASTZ_Q=/scratch/data/blastz/HoxD55.q

# TARGET: Human Hg19
SEQ1_DIR=/scratch/data/hg19/hg19.2bit
SEQ1_LEN=/scratch/data/hg19/chrom.sizes
SEQ1_CHUNK=20000000
SEQ1_LAP=10000

# QUERY: Medium Ground Finch GeoFor1
SEQ2_DIR=/hive/data/genomes/geoFor1/geoFor1.2bit
SEQ2_LEN=/hive/data/genomes/geoFor1/chrom.sizes
SEQ2_CHUNK=10000000
SEQ2_LAP=0
SEQ2_LIMIT=60

BASE=/hive/data/genomes/hg19/bed/lastzGeoFor1.2012-07-29
TMPDIR=/scratch/tmp
'_EOF_'
    # << happy emacs

    # adjust the SEQ2_LIMIT with -stop=partition to get a reasonable
    #	number of jobs, 50,000 to something under 100,000
    # when not present, SEQ2_LIMIT is a default 100
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
        `pwd`/DEF \
        -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
        -chainMinScore=5000 -chainLinearGap=loose > do.log 2>&1 &
    #   real    238m6.827s
    cat fb.hg19.chainGeoFor1Link.txt
    #   101503916 bases of 2897316137 (3.503%) in intersection
    # set sym link to indicate this is the lastz for this genome:
    cd /hive/data/genomes/hg19/bed
    ln -s lastzGeoFor1.2012-07-29 lastz.geoFor1

    #	and for the swap
    mkdir /hive/data/genomes/geoFor1/bed/blastz.hg19.swap
    cd /hive/data/genomes/geoFor1/bed/blastz.hg19.swap
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
	/hive/data/genomes/hg19/bed/lastzGeoFor1.2012-07-29/DEF \
        -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
        -swap -chainMinScore=5000 -chainLinearGap=loose > swap.log 2>&1 &
    #   real    9m10.240s
    cat  fb.geoFor1.chainHg19Link.txt
    #	88547518 bases of 1041286029 (8.504%) in intersection

    # set sym link to indicate this is the lastz for this genome:
    cd /hive/data/genomes/geoFor1/bed
    ln -s blastz.hg19.swap lastz.hg19


#######################################################################
# DENISOVA HIGH-COVERAGE VARIANTS #8886 (DONE 9/6/12 angie)
    mkdir /hive/data/genomes/hg19/bed/denisova
    cd /hive/data/genomes/hg19/bed/denisova
    # Get tabix-compressed+indexed VCF files for Denisova and 11 modern humans:
    wget ftp://ucsc_paper_data:PHZuezz7@cdna.eva.mpg.de/hg19_1000g/\*
    # Make /gbdb links and bbi tables, prefix dhcVcf for Denisova High-Coverage VCF:
    foreach f (`pwd`/*.vcf.gz{,.tbi})
      ln -s $f /gbdb/hg19/bbi/
    end
    foreach f (*.vcf.gz)
      set track = dhcVcf$f:r:r
      echo $track
      hgBbiDbLink hg19 $track /gbdb/hg19/bbi/$f
    end
#dhcVcfDNK02
#dhcVcfDenisovaPinky
#dhcVcfHGDP00456
#dhcVcfHGDP00521
#dhcVcfHGDP00542
#dhcVcfHGDP00665
#dhcVcfHGDP00778
#dhcVcfHGDP00927
#dhcVcfHGDP00998
#dhcVcfHGDP01029
#dhcVcfHGDP01284
#dhcVcfHGDP01307
    # Add Denisova track group section:
    hgsql hg19 -e "insert into grp values('denisova', 'Denisova Assembly and Analysis', 6.6, 1)"


#########################################################################
# DENISOVA HIGH-COVERAGE SEQUENCE READS #8886 (DONE 9/10/12)
    cd /hive/data/genomes/hg19/bed/denisova
    wget http://cdna.eva.mpg.de/denisova/alignments/T_hg19_1000g.bam
    wget http://cdna.eva.mpg.de/denisova/alignments/T_hg19_1000g.bam.bai
    # Tweak sequence names?  e.g. SN:GL000193.1 -> chr4_gl000193_random ?
    ln -s `pwd`/T_hg19_1000g.bam{,.bai} /gbdb/hg19/bbi/
    hgBbiDbLink hg19 dhcBamDenisova /gbdb/hg19/bbi/T_hg19_1000g.bam


#########################################################################
# DENISOVA HIGH-COVERAGE ANALYSIS #8886 (DONE 10/2/12 angie)
    mkdir /hive/data/genomes/hg19/bed/denisova
    cd /hive/data/genomes/hg19/bed/denisova
    # Fetched original .zip file on 9/6/12:
    wget --no-check-certificate https://bioinf.eva.mpg.de/download/HighCoverageDenisovaGenome/Denisova_catalog.zip
    unzip Denisova_catalog.zip
#    cd /hive/data/genomes/hg19/bed/denisova/Denisova_zip/Denisova-derived_Human-ancestral/
#    cd /hive/data/genomes/hg19/bed/denisova/Denisova_zip/GAD_Denisova-state/
#    cd /hive/data/genomes/hg19/bed/denisova/Denisova_zip/GWAS_Denisova-state/
#    cd /hive/data/genomes/hg19/bed/denisova/Denisova_zip/Human-derived_Denisova-ancestral/
#    cd /hive/data/genomes/hg19/bed/denisova/Denisova_zip/Human-derived_Denisova-state/

    # 9/17/12 update: no zip, dir name change, and wget -r apparently doesn't work with
    # https --no-check-cert, so just fetch updates alongside original files:
    mv Denisova_zip Denisova_zip_orig
    cp -p Denisiva_zip_orig DenHC_catalog
    cd DenHC_catalog/Human-derived_Denisova-state
    wget --no-check-certificate https://bioinf.eva.mpg.de/download/HighCoverageDenisovaGenome/DenHC_catalog/Human-derived_Denisova-state/Genome_CAT.tsv.gz
    # Good, no incomplete lines now.  But what we really want to make a track for is
    # Human-derived_Denisova-ancestral...
    cd /hive/data/genomes/hg19/bed/denisova/DenHC_catalog/Human-derived_Denisova-ancestral
    # There are two subdirs, InDels and SNCs, and each subdir has a whole bunch of files
    # which we can make into BED4+ subtracks of a composite with views.
    cat > trackify.pl <<'_EOF_'
#!/usr/bin/env perl
use warnings;
use strict;
my $isSNC = ($ARGV[0] =~ /SNC/);
while (<>) {
  next if (/^#/);
  chomp;
  my @w = split("\t");
  my $chromEnd = $w[2];
  $w[2]--; # chromStart -> 0-based
  # Delete extra column (Grantham score) that appears only in nonsynon files:
  splice(@w, 31, 1) if (scalar(@w) > 32);
  # Measure allele lengths to determine whether this is an indel, and whether it's
  # necessary to trim an identical first base from each allele:
  my $humanFirstBase = substr($w[15], 0, 1);
  my $humanLen = length($w[15]);
  my $diffLengths = 0;
  my $sameFirstBase = 1;
  foreach my $i (16..19) {
    next if ($w[$i] eq "N/A");
    my $firstBase = substr($w[$i], 0, 1);
    my $len = length($w[$i]);
    $diffLengths = 1 if ($len != $humanLen);
    $sameFirstBase = 0 if ($firstBase ne $humanFirstBase);
  }
  if ($sameFirstBase) {
    foreach my $i (15..19) {
      next if ($w[$i] eq "N/A");
      $w[$i] =~ s/^$humanFirstBase// || die;
      $w[$i] =~ s/^$/-/; # Some alleles have trailing "-", some don't; use "-" for deletion
    }
    $w[2]++; # adjust chromStart
    $humanLen--;
  }
  if ($diffLengths) {
    $chromEnd = $w[2] + $humanLen;
  }
  if ($isSNC) {
    # "-" is used as N/A in SNCs/* -- tweak to N/A for consistency w/InDels files
    foreach my $i (16..19) {
      $w[$i] = "N/A" if ($w[$i] eq "-");
    }
  }
  my $name = "$w[15]/$w[16]";
  $name .= ":$w[21]" if (length($w[21]) > 1);
  # Add spaces to Extra for readability:
  $w[14] =~ s/;/; /g;
  print join("\t", "chr$w[1]", $w[2], $chromEnd, $name,      # BED 4
             $w[5], $w[4], $w[14],                           # Feature, Gene, Extra
             $w[7], $w[8], $w[9], $w[10], $w[11], $w[12],    # Consequence, coding effect
             $w[15], $w[16], $w[17], $w[18], $w[19],         # Human, Den, etc. alleles
             $w[20], $w[21], $w[22], $w[23], $w[31]) . "\n"; # D zyg, dbSNP, 1000g freq, flag,strnd
}
'_EOF_'
    # << emacs
    chmod a+x trackify.pl

    cat > tsvToBedAndTrackDb.pl << '_EOF_'
#!/usr/bin/env perl
# Parse a .tsv file path into descriptive components to use for track name and trackDb subGroups;
# Call trackify.pl and if the resulting bed file is non-empty, print out subtrack .ra entry.
use warnings;
use strict;
my $tsvPath = $ARGV[0];
chomp $tsvPath;
$tsvPath =~ m/^(InDels|SNCs)\/Genome_VEP(|(_genic(_ccds)?_(3utr|5utr|nonsyn_grantham|frameshift_coding|inframe_nonsyn|splice|syn))|(_regul(_motif)?(_highinfo)?))_formatted_(fixed|highfreq)/ || die;
my ($vType, $ccds, $gType, $reg, $regMo, $regHi, $fType) = ($1, $4 || "", $5 || "", $6 || "",
                                                            $7 || "", $8 || "", $9 || ""); 
# Reformat file name components: _ to inital uppercase, etc.
$vType =~ s/s$//;
$ccds =~ s/_(\w)/\u$1/g;
$gType =~ s/_(\w)/\u$1/g;  $gType =~ s/^(\w)/\u$1/;
$gType =~ s/Grantham//;  $gType =~ s/^(\d)utr/Utr$1/; $gType =~ s/nframe/nFrame/;
$fType =~ s/^(\w)/\u$1/;
$fType =~ s/ighfreq/ighFreq/;
my $fShort = ($fType eq "Fixed") ? "Fxd" : "HiF";
my $shortLabel = "";
my $longLabel = "Modern Human Derived ($fType), Denisova Ancestral: ";
my $color = "0,0,0";
my ($subset, $view);
if ($gType) {
  $subset = "$ccds$gType";
  if ($gType eq "Nonsyn" || $gType eq "Splice" || $gType eq "FrameshiftCoding" ||
      $gType eq "InFrameNonsyn") {
    $color = "200,0,0";
  } elsif ($gType =~ /^Utr/) {
    $color = "0,0,200";
  } elsif ($gType eq "Syn") {
    $color = "0,200,0";
  }
  my ($gShort, $gLong) = ($gType, $gType);
  $gShort =~ s/FrameshiftCoding/FrShft/; $gShort =~ s/InFrameNonsyn/InFrNS/;
  $gLong =~ s/([a-z])([A-Z])/$1 $2/g;  $gLong =~ s/Utr(\d)/$1\' UTR/;
  $gLong =~ s/In Frame/In-frame/;  $gLong =~ s/Nonsyn/Non-synonymous/;
  if ($ccds) {
    $gShort = "CC $gShort";
    $gLong = "CCDS $gLong";
  }
  $shortLabel .= "$gShort $fShort";
  $longLabel .= "$gLong";
  $view = $ccds ? $ccds : "Ens";
} elsif ($reg) {
  $color = "230,130,0";
  my ($rShort, $rLong);
  if ($regHi) {
    $subset = "RegMotifHighInfo";
    ($rShort, $rLong) = ("RgMoHiInf", "Reg. Motif at High Inf Pos in TFBP");
  } elsif ($regMo) {
    $subset = "RegMotif";
    ($rShort, $rLong) = ("RegMotif", "Regulatory Motif");
  } else {
    $subset = "Reg";
    ($rShort, $rLong) = ("RegRegion", "Regulatory Region");
  }
  $shortLabel .= "$rShort $fShort";
  $longLabel .= "$rLong";
  $view = "Reg";
} else {
  $subset = "All";
  $shortLabel .= "$fShort";
  $longLabel .= "All ";
  $view = "All";
}
my $track = "dhcHumDerDenAnc$vType$subset$fType";
my $cmd = "./trackify.pl $tsvPath > $track.bed";
system($cmd) == 0 || die "ERROR from \"$cmd\"\n\n";
if (-s "$track.bed") {
  my $subsetTweaked = $subset;  $subsetTweaked =~ s/Syn/ZLast_Syn/;
  $subsetTweaked =~ s/^RegMotifH/DA_RegMotifH/;  $subsetTweaked =~ s/^RegM/DB_RegM/;
  $subsetTweaked =~ s/^Reg/DC_Reg/;
  my $isOff = "";
  $isOff = " off" if ($subset =~ /Syn/ || $subset eq "Reg");
  # We will combine SNCs and InDels later; print only one set of trackDb descriptions:
  if ($vType eq "InDel" || $subset =~ /Nonsyn/ || $subset =~ /Syn/) {
    print "        track dhcHumDerDenAnc$subset$fType\n";
    print "        parent dhcHumDerDenAnc$view$isOff\n";
    print "        subGroups view=$view subset=$subsetTweaked freq=$fType\n";
    print "        shortLabel $shortLabel\n";
    print "        longLabel $longLabel\n";
    print "        color $color\n\n";
  }
  if ($fType eq "Fixed") {
    # Fernando Racimo's request: separate out Fixed (in 1000Genomes) locations that are in dbSNP.
    $cmd = "egrep -vw 'rs[0-9]+' $track.bed > tmp$track.bed";
    system($cmd) == 0 || die "ERROR from \"$cmd\"\n\n";
    $cmd = "egrep -w 'rs[0-9]+' $track.bed > ${track}DbSnp.bed";
    system($cmd); # grep returns nonzero if it can't find anything, but that's OK here.
    $cmd = "mv tmp$track.bed $track.bed";
    system($cmd) == 0 || die "ERROR from \"$cmd\"\n\n";
    if (-s "${track}DbSnp.bed" &&
        ($vType eq "InDel" || $subset =~ /Nonsyn/ || $subset =~ /Syn/ ||
         $subset eq "RegMotifHighInfo")) {
      $shortLabel =~ s/Fxd/FxS/;
      $longLabel =~ s/Fixed/Fixed+dbSNP/;
      print "        track dhcHumDerDenAnc$subset${fType}DbSnp\n";
      print "        parent dhcHumDerDenAnc$view$isOff\n";
      print "        subGroups view=$view subset=$subsetTweaked freq=${fType}DbSnp\n";
      print "        shortLabel $shortLabel\n";
      print "        longLabel $longLabel\n";
      print "        color $color\n\n";
    }
  }
}
'_EOF_'
     # << emacs
    chmod a+x tsvToBedAndTrackDb.pl
    foreach f (*/Genome_VEP_*.tsv)
        ./tsvToBedAndTrackDb.pl $f
        if ($status != 0) break
    end
    # Check input and output file counts:
    ls -1 */G*_highfreq.tsv | wc -l
#27
    ls -1 *HighFreq.bed | wc -l
#27
    ls -1 */G*_fixed.tsv | wc -l
#27
    ls -1 *Fixed.bed | wc -l
#27
    ls -1 *FixedDbSnp.bed | wc -l
#27
    # 54 inputs, 81 outputs because Fixed was split into Fixed and FixedDbSnp

    # Combine SNCs and InDels:
    foreach indel (dhcHumDerDenAncInDel*.bed)
      set snc = `echo $indel | sed -e 's/InDel/SNC/;'`
      set both = `echo $indel | sed -e 's/InDel//;'`
      if (! -e $snc) then
        mv $indel $both
      endif
    end
    foreach snc (dhcHumDerDenAncSNC*.bed)
      set indel = `echo $snc | sed -e 's/SNC/InDel/;'`
      set both = `echo $snc | sed -e 's/SNC//;'`
      if (-e $indel) then
        sort -k1,1 -k2n,2n $snc $indel > $both
        rm $snc $indel
      else
        mv $snc $both
      endif
    end

    # bedToBigBed:
    foreach f (dhcHumDerDenAnc*.bed)
      if (-s $f) then
        echo $f
        bedToBigBed -verbose=0 -tab -type=bed4+19 -as=$HOME/kent/src/hg/lib/dhcHumDerDenAnc.as \
          $f /hive/data/genomes/hg19/chrom.sizes $f:r.bb
        if ($status != 0) break
      else
        echo "Skipping $f (zero size)"
      endif
    end
#Skipping dhcHumDerDenAncCcdsInFrameNonsynFixedDbSnp.bed (zero size)
#Skipping dhcHumDerDenAncCcdsInFrameNonsynHighFreq.bed (zero size)
#Skipping dhcHumDerDenAncInFrameNonsynFixedDbSnp.bed (zero size)
#Skipping dhcHumDerDenAncInFrameNonsynHighFreq.bed (zero size)
    # Check bigBed file count:
    ls -1 *.bed | wc -l
#54
    ls -1 *.bb | wc -l
#50
    # We skipped 4 empty bed files, so that's OK.

    # Install in /gbdb and load up.
    mkdir /gbdb/hg19/dhcHumDerDenAnc
    ln -s `pwd`/dhcHumDerDenAnc*.bb /gbdb/hg19/dhcHumDerDenAnc/
    foreach f (dhcHumDerDenAnc*.bb)
      hgBbiDbLink hg19 $f:r /gbdb/hg19/dhcHumDerDenAnc/$f
    end


#########################################################################
# recip best for mm10 (DONE - 2012-09-14 - Hiram)
    # see also: redmine issue 9089
    cd /hive/data/genomes/hg19/bed/lastzMm10.2012-03-07
    time doRecipBest.pl -buildDir=`pwd` hg19 mm10 > rbest.log 2>&1
    #   real    157m16.369s

#########################################################################
# NCBI patch 10 (DONE - 2012-09-26 - Hiram)
    mkdir /hive/data/genomes/hg19/bed/additionalSequence/patch10
    cd /hive/data/genomes/hg19/bed/additionalSequence/patch10
    rsync -a -P rsync://ftp.ncbi.nlm.nih.gov/genbank/genomes/Eukaryotes/vertebrates_mammals/Homo_sapiens/GRCh37.p10/ ./
    # the scripts from patch9 were modified slightly to update and fix some
    #	of the new names in this patch10
    cp ../patch9/gatherNames.pl .
    ./gatherNames.pl . > ucscNames.patch10.txt
    # examine the names for sanity:
    awk '{print $NF}' ucscNames.patch10.txt | sort
    # and they should not be longer than 31 characters:
    awk '{print $NF}' ucscNames.patch10.txt | sort | awk '{print length($0)}' \
        | sort -n | tail
    cp -p ../patch9/mkTables.pl .
    ./mkTables.pl  patches.chrom.sizes ucscNames.patch10.txt PATCHES/alt_scaffolds/AGP/alt.scaf.agp.gz
    # output to stdout is the contents of alt.scaf.agp.gz
    # constructs files: ctgPos.txt chromInfo.txt gold.txt and gap.txt
    cp -p ../patch9/mkCtgPos2.pl .
    ./mkCtgPos2.pl ucscNames.patch10.txt patches.chrom.sizes > ctgPos2.txt
    cp -p ../patch9/mkHapLocate.pl .
    ./mkHapLocate.pl ctgPos.txt \
	PATCHES/alt_scaffolds/alt_scaffold_placement.txt \
	> haplotypeLocations.bed
    cp -p haplotypeLocations.bed altSequence.bed
    ln -s ../patch2/before.patch2.hapLoc.bed hg19.hapLoc.bed
    awk '{printf "%s\t%d\t%d\t%s\t500\t+\t%d\t%d\t32,32,190\n", $2,$3,$4,$5,$3,$4}' \
hg19.hapLoc.bed >> altSequence.bed

    # a new script for patch10
    cp -p ../patch9/mkFasta.pl .
    ./mkFasta.pl ucscNames.patch10.txt > hg19.patch10.fa
    # the build of hg19Patch10 can be seen in hg19Patch10.txt

    egrep -v "32,32,190" altSequence.bed  \
	| awk '{printf "%s\t%d\t%d\t%s\t%d\t%s\n", $1,$2,$3,$4,$5,$6}' \
	    > altSeqPatchesP10.tab
    egrep "32,32,190" altSequence.bed  \
	| awk '{printf "%s\t%d\t%d\t%s\t%d\t%s\n", $1,$2,$3,$4,$5,$6}' \
	    | grep -v "^chrM_rCRS" > altSeqHaplotypesP10.tab
    # verify only one lost
    wc -l altSeqPatchesP10.tab altSeqHaplotypesP10.tab
    #   112 altSeqPatchesP10.tab
    #   80 altSeqHaplotypesP10.tab
    #   192 total

    wc -l altSequence.bed
    #	193 altSequence.bed
    hgLoadBed hg19 altSeqHaplotypesP10 altSeqHaplotypesP10.tab
    #	Read 80 elements of size 6 from altSeqHaplotypesP10.tab
    hgLoadBed hg19 altSeqPatchesP10 altSeqPatchesP10.tab
    #	Read 112 elements of size 6 from altSeqPatchesP10.tab

    #    these tables are part of human/hg19/altSeqComposite10.ra
    #  Check the chrom coverage for the altSeqComposite10.ra listing:
    cut -f1 altSequence.bed | sort -u | xargs echo
# chr1 chr10 chr11 chr12 chr13 chr15 chr16 chr17 chr18 chr19 chr2 chr20 chr21 chr22 chr3 chr4 chr5 chr6 chr7 chr8 chr9 chrM chrM_rCRS chrX

##############################################################################
# lastz Lamprey petMar2 (DONE - 2012-10-17 - Hiram)
    # establish a screen to control this job with a name to indicate what it is
    screen -S petMar2
    mkdir /hive/data/genomes/hg19/bed/lastzPetMar2.2012-10-17
    cd /hive/data/genomes/hg19/bed/lastzPetMar2.2012-10-17

    cat << '_EOF_' > DEF
# Human vs. Lamprey
BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_M=50
BLASTZ_Q=/scratch/data/blastz/HoxD55.q

# TARGET: Human Hg19
SEQ1_DIR=/scratch/data/hg19/hg19.2bit
SEQ1_LEN=/scratch/data/hg19/chrom.sizes
SEQ1_CHUNK=20000000
SEQ1_LAP=10000

# QUERY: Lamprey PetMar2
SEQ2_DIR=/hive/data/genomes/petMar2/petMar2.2bit
SEQ2_LEN=/hive/data/genomes/petMar2/chrom.sizes
SEQ2_CHUNK=10000000
SEQ2_LAP=0
SEQ2_LIMIT=60

BASE=/hive/data/genomes/hg19/bed/lastzPetMar2.2012-10-17
TMPDIR=/scratch/tmp
'_EOF_'
    # << happy emacs

    # adjust the SEQ2_LIMIT with -stop=partition to get a reasonable
    #	number of jobs, 50,000 to something under 100,000
    # when not present, SEQ2_LIMIT is a default 100
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
        `pwd`/DEF \
        -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
        -chainMinScore=5000 -chainLinearGap=loose > do.log 2>&1 &
    #   real    76m34.446s

    cat fb.hg19.chainPetMar2Link.txt
    #   30305028 bases of 2897316137 (1.046%) in intersection
    # set sym link to indicate this is the lastz for this genome:
    cd /hive/data/genomes/hg19/bed
    ln -s lastzPetMar2.2012-10-17 lastz.petMar2

    #	and for the swap
    mkdir /hive/data/genomes/petMar2/bed/blastz.hg19.swap
    cd /hive/data/genomes/petMar2/bed/blastz.hg19.swap
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
	/hive/data/genomes/hg19/bed/lastzPetMar2.2012-10-17/DEF \
        -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
        -swap -chainMinScore=5000 -chainLinearGap=loose > swap.log 2>&1 &
    #   real    15m22.099s
    cat  fb.petMar2.chainHg19Link.txt
    #	21515660 bases of 647368134 (3.324%) in intersection

    # set sym link to indicate this is the lastz for this genome:
    cd /hive/data/genomes/petMar2/bed
    ln -s blastz.hg19.swap lastz.hg19

#########################################################################
# lastz White Rhino cerSim1 (DONE - 2012-10-17 - Hiram)
    # establish a screen to control this job with a name to indicate what it is
    screen -S cerSim1
    mkdir /hive/data/genomes/hg19/bed/lastzCerSim1.2012-10-17
    cd /hive/data/genomes/hg19/bed/lastzCerSim1.2012-10-17

    cat << '_EOF_' > DEF
# Human vs. White Rhino
BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz

# TARGET: Human Hg19
SEQ1_DIR=/scratch/data/hg19/hg19.2bit
SEQ1_LEN=/scratch/data/hg19/chrom.sizes
SEQ1_CHUNK=20000000
SEQ1_LAP=10000

# QUERY: White Rhino CerSim1
SEQ2_DIR=/hive/data/genomes/cerSim1/cerSim1.2bit
SEQ2_LEN=/hive/data/genomes/cerSim1/chrom.sizes
SEQ2_CHUNK=10000000
SEQ2_LAP=0
SEQ2_LIMIT=60

BASE=/hive/data/genomes/hg19/bed/lastzCerSim1.2012-10-17
TMPDIR=/scratch/tmp
'_EOF_'
    # << happy emacs

    # adjust the SEQ2_LIMIT with -stop=partition to get a reasonable
    #	number of jobs, 50,000 to something under 100,000
    # when not present, SEQ2_LIMIT is a default 100
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
        `pwd`/DEF \
        -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
        -chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 &
    #   real    1272m58.952s
    #   problem in chaining chr19, running it manually on hgwdev:
    cd /hive/data/genomes/hg19/bed/lastzCerSim1.2012-10-17/axtChain/run
export maxMem=83886080
ulimit -S -m $maxMem -v $maxMem
ulimit -a
time ./chain.csh hg19.2bit:chr19: chain/hg19.2bit:chr19:.chain
    #   real    147m46.959s
    # very impressive:
    #   -rw-rw-r-- 1 707886253 Oct 18 13:03 hg19.2bit:chr19:.chain
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
        -continue=chainMerge `pwd`/DEF \
        -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
        -chainMinScore=3000 -chainLinearGap=medium > chainMerge.log 2>&1 &
    #   real    99m4.624s

    cat fb.hg19.chainCerSim1Link.txt
    #   1683424317 bases of 2897316137 (58.103%) in intersection
    # set sym link to indicate this is the lastz for this genome:
    cd /hive/data/genomes/hg19/bed
    ln -s lastzCerSim1.2012-10-17 lastz.cerSim1

    #	and for the swap
    mkdir /hive/data/genomes/cerSim1/bed/blastz.hg19.swap
    cd /hive/data/genomes/cerSim1/bed/blastz.hg19.swap
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
	/hive/data/genomes/hg19/bed/lastzCerSim1.2012-10-17/DEF \
        -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
        -swap -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 &
    #   real     100m36s
    cat  fb.cerSim1.chainHg19Link.txt
    #	1637961407 bases of 2366858012 (69.204%) in intersection

    # set sym link to indicate this is the lastz for this genome:
    cd /hive/data/genomes/cerSim1/bed
    ln -s blastz.hg19.swap lastz.hg19

#########################################################################
# construct liftOver to hg17 (DONE - 2012-11-08 - Hiram)
    screen -S hg17	# manage this longish running job in a screen
    mkdir /hive/data/genomes/hg19/bed/blat.hg17.2012-11-08
    cd /hive/data/genomes/hg19/bed/blat.hg17.2012-11-08
    # check it with -debug first to see if it is going to work:
    time doSameSpeciesLiftOver.pl -buildDir=`pwd` -bigClusterHub=swarm \
	-ooc=/hive/data/genomes/hg19/11.ooc \
	-debug -dbHost=hgwdev -workhorse=hgwdev hg19 hg17 > do.log 2>&1
    # if that is OK, then run it:
    time doSameSpeciesLiftOver.pl -buildDir=`pwd` -bigClusterHub=swarm \
	-ooc=/hive/data/genomes/hg19/11.ooc \
	-dbHost=hgwdev -workhorse=hgwdev hg19 hg17 > do.log 2>&1
    #	real    333m16.756s

    # verify this file exists:
    #	/gbdb/hg19/liftOver/hg19ToHg17.over.chain.gz
    # and try out the conversion on genome-test from hg19 to hg17

############################################################################
2012-11-11: import and UCSC GENCODE group process of GENCODE V14 (markd)
    # Due to UCSC Genome Browser using the NC_001807 mitochondrial genome sequence
    # (chrM) and GENCODE annotating the NC_012920 mitochondrial sequence, the
    # GENCODE mitochondrial sequences are lifted to UCSC chrM.

    # download files
    mkdir -p /hive/data/genomes/hg19/bed/gencodeV14/release
    cd /hive/data/genomes/hg19/bed/gencodeV14/

    # download gencode release
    wget -nv -r -np ftp://ftp.sanger.ac.uk/pub/gencode/release_14
    mv ftp.sanger.ac.uk/pub/gencode/release_14 .
    rm -rf ftp.sanger.ac.uk/

    # silly sanity check:
    cd release_14
    for f in *.gz *.tgz ; do zcat $f >/dev/null ; done

    # untar main distribution
    tar -zxf gencode14_GRCh37.tgz

    cd /hive/data/genomes/hg19/bed/gencodeV14

    # obtain transcription support level analysis from UCSC GENCODE group (markd/rachel)
    mkdir -p data
    cp /cluster/home/markd/compbio/ccds/branches/transSupV14.1/modules/gencodeTransSupport/exprs/classDev/runs/2012-11-11/results/gencode.v14.transcriptionSupportLevel.{tab,tsv} data/

    # create Makefile from previous one.  This time, we need to get
    # if from the ENCODE DCC area.
    cp /hive/groups/encode/dcc/data/gencodeV13/Makefile .
    # edit to set version:
    ver = 14
   
    # on code in the CCDS subversion tree:
    #   svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/svnroot/hausslerlab/ccds/trunk
    # and markd's python library (it will be moved to the hausslerlab
    # repository soon)
    # may need to update
        ccds2/modules/gencode/src/lib/gencode/data/gencodeGenes.py
    # to add new biotypes, use this command to verify and update as needed
    # be sure to do a make in ccds2/modules/gencode
    make checkAttrs

    # build and load tables
    (time nice make -j 10) >&build.out&

    # compare tables from previous release to see if number chnaged made
    # sense.
        make cmpRelease

    ## Copy and update trackDb files from previous release.
    ## Change version and use lower priority so it sorts to top of
    ## super track page.
    ## Important to make sure filter attrs.transcriptType matches current set
    ## figured out with
    select distinct transcriptType from wgEncodeGencodeAttrsV14 order by transcriptType;
    cd kent/src/hg/makeDb/trackDb
    cp human/hg19/wgEncodeGencodeV13.ra human/hg19/wgEncodeGencodeV14.ra
    cp human/hg19/wgEncodeGencodeV13.html human/hg19/wgEncodeGencodeV14.html
    # edit these plus human/hg19/trackDb.wgEncode.ra

    ### IMPORTANT: make sure that hgTracks/gencodeTracks.c registers
    ### track handler for this version of gencode:
    registerTrackHandlerOnFamily("wgEncodeGencodeV14", gencodeGeneMethods);

#########################################################################
# QPCR PRIMERS (DONE - 2012-12-10 - Chin)
# The track name is changed to "qPCR Primers"
# Reload table with new track_mouse.BED (2013-01-28)
    # Download
    mkdir /hive/data/outside/Weizmann/qPcrPrimers
    cd /hive/data/outside/Weizmann/qPcrPrimers
    wget http://www.weizmann.ac.il/complex/compphys/software/Amit/primers/human/track_human.BED
    mkdir -p /hive/data/genomes/hg19/bed/qPcrPrimers
    cat track_human.BED | grep -v track \
       > /hive/data/genomes/hg19/bed/qPcrPrimers/qPcrPrimers_hg19.bed
    cd /hive/data/genomes/hg19/bed/qPcrPrimers
    hgLoadBed -bedDetail -tab -renameSqlTable \
      -sqlTable=$HOME/kent/src/hg/lib/bedDetail.sql \
      hg19 qPcrPrimers qPcrPrimers_hg19.bed
    # Read 534301 elements of size 14 from qPcrPrimers_hg19.bed
    # Sorted
    # Creating table definition for qPcrPrimers
    # Saving bed.tab
    # Loading hg19

    # NULL descrition column
    hgsql hg19 -ne "UPDATE qPcrPrimers SET description = NULL;"

############################################################################
# coriellDelDup track (DONE - 2013-01-03 - Hiram)
    # data came in via email, files deposited to RedMine issue 6530
    mkdir /hive/data/genomes/hg19/bed/coriell
    cd /hive/data/genomes/hg19/bed/coriell
    # output the XLS files as tab delimited files:
# -rw-rw-r-- 1   4544 Dec  4 10:04 coriellReanalyzed.tab
# -rw-rw-r-- 1 119331 Dec  4 10:05 coriellDetailsHg19.tab
    # convert that .tab file to a bed 9 + file:
grep -v "^name" coriellDetailsHg19.tab \
| sed -e 's/B-Lymphocyte/B_Lymphocyte/; s/-derived//;' \
    | awk -F'\t' '{
rgb="200,0,0";
if ($5 == 0) { rgb="255,0,0"; }
if ($5 == 1) { rgb="170,68,0"; }
if ($5 == 2) { rgb="0,0,0"; }
if ($5 == 3) { rgb="0,68,170"; }
if ($5 == 4) { rgb="0,0,255"; }
gsub(" ","_",$6);
gsub("\"","",$7);
gsub("\"","",$8);
printf "%s\t%d\t%d\t%s\t%d\t+\t%d\t%d\t%s\t%d\t%s\t%s\t%s\n", $2,$3,$4,$1,$5*100,$3,$4,rgb,$5,$6,$7,$8}' \
        | sort -k1,1 -k2,2n > coriellDetailsHg19.bed9
    # added the coriellDelDup.as  coriellDelDup.sql files to the source tree
    # in src/hg/lib/
    # loading the table:
    hgLoadBed -tab -type=bed9+ \
        -sqlTable=$HOME/kent/src/hg/lib/coriellDelDup.sql -bedDetail hg19 \
        coriellDelDup coriellDetailsHg19.bed9
    # add the description of the table to tableDescriptions in order to
    # see the functioning of the hgc clicks, the nightly tableDescriptions
    # build will pick this up from the source tree:
/bin/echo -n -e 'DELETE FROM tableDescriptions where tableName="coriellDelDup"; ' > tableDescriptions.entry.sql
/bin/echo -n -e "INSERT INTO tableDescriptions (tableName, autoSqlDef, gbdAnchor) values ('coriellDelDup', '" >> tableDescriptions.entry.sql
cat $HOME/kent/src/hg/lib/coriellDelDup.as >> tableDescriptions.entry.sql
/bin/echo -e "', '');" >> tableDescriptions.entry.sql
    hgsql hg19 < tableDescriptions.entry.sql

############################################################################
# ENCODE Regulation track -- make doc has been moved to encodeRegHg19.txt

############################################################################
# affyCytoScan track (DONE - 2013-01-15 - kuhn )

# for affyCytoScanHD chipset
# aamp left no record of what he did.
# This reconstructs it, as best I can figure
# followed by what I did to finish it up.

cd /hive/data/genomes/hg19/bed
mkdir affyCytoScan
cd affyCytoScan

# files from Carl Dowds at affy:
# CytoScanHD_ProbeList_CN.zip
# CytoScanHD_ProbeList_SNP.zip

# It looks like the files were catenated into a file called both.bed
# Columns were added to get to coloring field
#   score=1000, strand=+ thickStart=chromStart thickEnd=chromEnd
#   reserved=<colors>
# Colors are 204 for blue (C- probes) and 3368499 for green (S- probes)

# Then he loaded into db, picking up a bin column.
# File bed.tab has one color for each source file, but some
#   probes are in both sets

commTrio.csh CytoScanHD_ProbeList_CN.bed CytoScanHD_ProbeList_SNP.bed rm
# 1953247 CytoScanHD_ProbeList_CN.bed.Only
#   53144 CytoScanHD_ProbeList_SNP.bed.Only
#  743304 CytoScanHD_ProbeList_CN.bed.CytoScanHD_ProbeList_SNP.bed.Only  #
#  both sets

#   (SNP-file probes are all named "S-")
#   (CN -file probes are named "C-" and "S-", but the S- are all in the SNP
#   file, too)

###### kuhn hereafter
# The SNP probes in the CN file are redundant, so I dropped them
#   by making everything one color, then uniq the whole file.

hgsql -N -e "SELECT * FROM affyCytoScan" hg19 > hg19.affyCytoScan
cat hg19.affyCytoScan | sed 's/204$/3368499/' > hg19.cytoScan.oneColor
cat hg19.cytoScan.oneColor | sort -u > hg19.cytoScan.oneColor.uniq

wc -l *oneColor*
# 3492997 hg19.cytoScan.oneColor
# 2749693 hg19.cytoScan.oneColor.uniq

# Lost exactly the number of probes that were "S-" type,
#   but were colored the same as the others in the "C-" file.
#       743304  < # of dupes

# remove bin for loading
# (don't know if this is necessary, maybe it'd load with bin in place)
cat hg19.cytoScan.oneColor.uniq | awk '{print $2, $3, $4, $5, $6, $7, $8, $9,
$10}' \
   > hg19.cytoScan.oneColor.noBin
# load
hgLoadBed -type=bed9 hg19 affyCytoScanNew hg19.cytoScan.oneColor.noBin

# Set colors per Carl Dowds:
mysql> UPDATE affyCytoScanNew SET reserved = 3308830 WHERE name LIKE "C-%";
mysql> UPDATE affyCytoScanNew SET reserved = 8913032 WHERE name LIKE "S-%";

# Checked new track in Browser by making temporary block for it in trackDb.ra
# Moved into place:

mysql> RENAME TABLE affyCytoScan    TO affyCytoScanAndy;
mysql> RENAME TABLE affyCytoScanNew TO affyCytoScan;

############################################################################
# Chimp Lastz run (DONE 1/29/13 angie)
    mkdir /hive/data/genomes/hg19/bed/lastzPanTro4.2013-01-25
    cd /hive/data/genomes/hg19/bed/lastzPanTro4.2013-01-25
    cat << '_EOF_' > DEF
# human vs chimp
BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz
# maximum M allowed with lastz is only 254
BLASTZ_M=254
BLASTZ_Q=/scratch/data/blastz/human_chimp.v2.q
BLASTZ_O=600
BLASTZ_E=150
BLASTZ_K=4500
BLASTZ_Y=15000
BLASTZ_T=2

# TARGET: Human Hg19
SEQ1_DIR=/scratch/data/hg19/hg19.2bit
SEQ1_LEN=/scratch/data/hg19/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
SEQ1_IN_CONTIGS=0

# QUERY: Chimp PanTro4
SEQ2_DIR=/hive/data/genomes/panTro4/panTro4.2bit
SEQ2_LEN=/hive/data/genomes/panTro4/chrom.sizes
SEQ2_CHUNK=10000000
SEQ2_LAP=0
SEQ2_LIMIT=200
SEQ2_IN_CONTIGS=0

BASE=/hive/data/genomes/hg19/bed/lastzPanTro4.2013-01-25
TMPDIR=/scratch/tmp
'_EOF_'
    # << emacs

    screen # use screen to manage this long-running job
    ~/kent/src/hg/utils/automation/doBlastzChainNet.pl `pwd`/DEF -verbose=2 \
        -noLoadChainSplit -chainMinScore=5000 -chainLinearGap=medium \
        -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
        -syntenicNet >& do.log & tail -f do.log

    cat fb.hg19.chainPanTro4Link.txt
#2760526412 bases of 2897316137 (95.279%) in intersection

    # filter with doRecipBest.pl
    doRecipBest.pl -workhorse=hgwdev -buildDir=`pwd` \
        hg19 panTro4 >& rbest.log & tail -f rbest.log

    # running the swap
    mkdir /hive/data/genomes/panTro4/bed/blastz.hg19.swap
    cd /hive/data/genomes/panTro4/bed/blastz.hg19.swap
    ~/kent/src/hg/utils/automation/doBlastzChainNet.pl -verbose=2 \
        -swap /hive/data/genomes/hg19/bed/lastzPanTro4.2013-01-25/DEF \
        -noLoadChainSplit -chainMinScore=5000 -chainLinearGap=medium \
        -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
        -syntenicNet >& swap.log & tail -f swap.log
    cat fb.panTro4.chainHg19Link.txt
#2773561724 bases of 2902338967 (95.563%) in intersection


#############################################################################
# LASTZ Gibbon NomLeu3 (DONE - Wed Mar  6 12:20:39 PST 2013 - Pauline)

    mkdir /hive/data/genomes/hg19/bed/lastzNomLeu3.2013-03-06
    cd /hive/data/genomes/hg19/bed/lastzNomLeu3.2013-03-06

    cat << '_EOF_' > DEF
# human vs gibbon
# maximum M allowed with lastz is only 254
BLASTZ_M=254
BLASTZ_Q=/scratch/data/blastz/human_chimp.v2.q
BLASTZ_O=600
BLASTZ_E=150
# other parameters on advice from Webb
BLASTZ_K=4500
BLASTZ_Y=15000
BLASTZ_T=2

# TARGET: Human hg19
SEQ1_DIR=/scratch/data/hg19/nib
SEQ1_LEN=/scratch/data/hg19/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000

# QUERY: Gibbon nomLeu3
SEQ2_DIR=/hive/data/genomes/nomLeu3/nomLeu3.2bit
SEQ2_LEN=/hive/data/genomes/nomLeu3/chrom.sizes
SEQ2_CHUNK=20000000
SEQ2_LAP=0
SEQ2_LIMIT=100

BASE=/hive/data/genomes/hg19/bed/lastzNomLeu3.2013-03-06
TMPDIR=/scratch/tmp
'_EOF_'
    # << happy emacs

    #   establish a screen to control this job
    screen -S lastz

XXX Wed Mar  6 12:28:53 PST 2013
    /usr/bin/time -p nice doBlastzChainNet.pl -verbose=2 \
        `pwd`/DEF \
        -syntenicNet \
        -noLoadChainSplit -chainMinScore=5000 -chainLinearGap=medium \
        -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
        >& do.log &
    #   real    724m15s
    cat fb.hg19.chainNomLeu3Link.txt
    #   2543943556 bases of 2897316137 (87.803%) in intersection

    cd /hive/data/genomes/hg19/bed
    ln -s lastzNomLeu3.2013-03-06 lastz.nomLeu3

    #   running the swap - DONE - 2013-03-06
    mkdir /hive/data/genomes/nomLeu3/bed/blastz.hg19.swap
    cd /hive/data/genomes/nomLeu3/bed/blastz.hg19.swap
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
        /hive/data/genomes/hg19/bed/lastzNomLeu3.2013-03-06/DEF \
        -swap -syntenicNet \
        -noLoadChainSplit -chainMinScore=5000 -chainLinearGap=medium \
        -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
        > swap.log 2>&1 &
    #   real     69m27s
    cat fb.nomLeu3.chainHg19Link.txt
    #   2480558770 bases of 2756591777 (89.986%) in intersection


##############################################################################
# DBNSFP (DONE 3/22/13 angie)
    # The database of non-synonymous functional predictions (dbNSFP) contains
    # precomputed scores from a wide variety of tools on all non-synon variants
    # of all genomic positions in the CDS of Gencode transcripts.  Pick out
    # some interesting subsets of its 52 columns and translate into bigBed and
    # bigWig files that can be joined with users' variants by the Variant
    # Annotation Integrator (#6152).
    screen -S dbNSFP -t dbNSFP
    mkdir /hive/data/genomes/hg19/bed/dbNSFP2.0
    cd /hive/data/genomes/hg19/bed/dbNSFP2.0
    wget http://dbnsfp.houstonbioinformatics.org/dbNSFPzip/dbNSFP2.0.zip
    unzip dbNSFP2.0.zip

    # Run a perl script that digests the 52-column input files into several independent
    # bed3+ files:
    ~/kent/src/hg/utils/dbNsfpToBed.pl dbNSFP*_variant.chr*
    # There are a bunch of mild warnings like this:
#FYI: >3 rows (5) for {chr1, 1221327, ENST00000379110}; removing less-informative duplicates.
    # The script has a workaround and follow-up error check, but it would be
    # good to report the cases to dbNSFP (see script for details).

    wc -l *.bed
#    2466469 dbNsfpGerpNr.bed
#   22275488 dbNsfpGerpRs.bed
#     231348 dbNsfpInterPro.bed
#   24810552 dbNsfpLrt.bed
#   28654727 dbNsfpMutationAssessor.bed
#   26188935 dbNsfpMutationTaster.bed
#   27629124 dbNsfpPolyPhen2.bed
#   31826285 dbNsfpSeqChange.bed
#   28302771 dbNsfpSift.bed
#     474262 dbNsfpUniProt.bed

    # Are all subsets present on all chroms?
    foreach f (*.bed)
      echo $f
      cut -f 1 $f | uniq -c > $f:r.chromHist
    end
    wc -l *.chromHist
#  24 dbNsfpGerpNr.chromHist
#  24 dbNsfpGerpRs.chromHist
#  24 dbNsfpInterPro.chromHist
#  24 dbNsfpLrt.chromHist
#  24 dbNsfpMutationAssessor.chromHist
#  24 dbNsfpMutationTaster.chromHist
#  24 dbNsfpPolyPhen2.chromHist
#  24 dbNsfpSeqChange.chromHist
#  23 dbNsfpSift.chromHist
#  24 dbNsfpUniProt.chromHist
    # -- nope, dbNsfpSift has no chrY.  SIFT limitation??

    # Convert Gerp scores to bigWig
    bedGraphToBigWig dbNsfpGerpNr.bed /hive/data/genomes/hg19/chrom.sizes dbNsfpGerpNr.bw
    bedGraphToBigWig dbNsfpGerpRs.bed /hive/data/genomes/hg19/chrom.sizes dbNsfpGerpRs.bw

    # Convert remaining files to bigBed
    foreach f (dbNsfp[^G]*.bed)
      set track = $f:r
      echo $track
      set autoSql = ~/kent/src/hg/lib/$track.as
      bedToBigBed -type=bed3+ -as=$autoSql -tab \
        $f /hive/data/genomes/hg19/chrom.sizes $track.bb
    end

    # Clean up: remove large files that can be re-unzipped or regenerated:
    rm -f search_db* dbNS*_variant.chr* dbNs*.bed

##############################################################################