# for emacs: -*- mode: sh; -*-

# This file describes how we made the browser database on
# NCBI build 36 (October 2005 freeze)

# NOTE: this doc may have genePred loads that fail to include
# the bin column.  Please correct that for the next build by adding
# a bin column when you make any of these tables:
#
#	mysql> SELECT tableName, type FROM trackDb WHERE type LIKE "%Pred%";
#	+---------------+-------------------------------------+
#	| tableName     | type                                |
#	+---------------+-------------------------------------+
#	| knownGene     | genePred knownGenePep knownGeneMrna |
#	| refGene       | genePred refPep refMrna             |
#	| xenoRefGene   | genePred xenoRefPep xenoRefMrna     |
#	| mgcGenes      | genePred                            |
#	| ensGene       | genePred ensPep                     |
#	| nscanGene     | genePred nscanPep                   |
#	| sgpGene       | genePred sgpPep                     |
#	| geneid        | genePred geneidPep                  |
#	| genscan       | genePred genscanPep                 |
#	| exonWalk      | genePred                            |
#	| ecoresTetNig1 | genePred                            |
#	+---------------+-------------------------------------+

# HOW TO BUILD AN ASSEMBLY FROM NCBI FILES
# ---------------------------------------
# 10/06/2005
# Make gs.19 directory, gs.19/build36 directory, and gs.19/ffa directory.
    ssh kkstore02
    mkdir /cluster/store11/gs.19
    mkdir /cluster/store11/gs.19/build36
    mkdir /cluster/store11/gs.19/agp
    mkdir /cluster/store11/gs.19/ffa

# Make a symbolic link from /cluster/store1 to this location
# (I assume there is some use for this later?)
    cd /cluster/store1
    ln -s /cluster/store11/gs.19 ./gs.19
    ln -s /cluster/store11/gs.19/build36 /cluster/data/hg18

# Make a symbolic link from your home directory to the build dir:
# (Investigate what this is used for, may no longer be necessary)
    cd
    ln -s /cluster/store11/gs.19/build36 ~/oo

# NCBI download site, fetch everything into this one directory:
# with the machine and password in your $HOME/.netrc file, this
# wget command will require no login.  Your $HOME/.netrc file
# should be mode 600 ('chmod 600 .netrc') so that no one else can
# read the login information.  (There were some early files that
# later moved into an OLD subdirectory.  They were broken.)
# 11/16/2005
# Received answer from Greg to go ahead with the new build.
    ssh kkstore02
    mkdir /cluster/store11/gs.19/ncbi
    cd /cluster/store11/gs.19/ncbi
    bash
    wget --timestamp ftp://ftp-private.ncbi.nih.gov/build_36/*

# New to this build is the sequence NC_001807, which is the
# mitochondrial sequence.  This prefix NC_ is new to the process
# and will have to be accounted for below.  The other two special
# prefixes are similar to what was seen before:
# from DR52.agp  NG_002392
#	Homo sapiens major histocompatibility complex, class II,
#	DR52 haplotype (DR52) on chromosome 6
# and from DR53.agp  NG_002433
#	Homo sapiens major histocompatibility complex, class II,
#	DR53 haplotype (DR53) on chromosome 6

# Fixup seq_contig.md
#
# It has a bunch of stuff belonging to the Celera
# genome assembly.  Filter those out.  I don't know what the
# NT_07959[0-7] items are, but there are no definitions for them
# in the agp files and no sequence in any fa.gz file.
# Fixup the names for the NG_ items, and change chrom MT to be M
# get the seq_contig.md file Craig just made for us on 11/28/05.
cd /cluster/store11/gs.19/ncbi wget --timestamp ftp://ftp-private.ncbi.nih.gov/build_36/seq_contig.md # remove Celera and Toronto entries # and replace chrom number for those haplotypes ssh hgwdev cd /cluster/store11/gs.19/build36 egrep -v "Celera|NT_07959[0-7]" ../ncbi/seq_contig.md |grep -v CRA_TCA >seq_contig0.tab hgsql hg18 -e 'drop table seq_contig0' hgsql hg18 <~/src/hg/lib/seq_contig0.sql hgsql hg18 -e 'load data local infile "seq_contig0.tab" into table seq_contig0' # fix seq_contig and # get the randoms sorted in proper order. The createNcbiLifts # does not work correctly if the randoms are not grouped together # by chromosome fixMd0 hg18 |sed -e "s/6_qbl_hap1/6_qbl_hap2/"| sed -e "s/MT/M/" | grep -v "|" >seq_contig1.tab hgsql hg18 -e 'drop table seq_contig1' hgsql hg18 <~/src/hg/lib/seq_contig1.sql hgsql hg18 -e 'load data local infile "seq_contig1.tab" into table seq_contig1' fixMd hg18 seq_contig1 >seq_contig.md # This pulls out all the randoms and groups them within the # same chrom but leaving them in the same order as they orginally # were (warning this is BASH code ...) bash grep "|" seq_contig0.tab | awk -F"|" '{print $1}' | \ awk '{print $2}' | sort -n -u | while read CHR do grep "[^0-9]${CHR}|" seq_contig0.tab done >> seq_contig.md exit hgsql hg18 -e 'drop table seq_contig' hgsql hg18 <~/src/hg/lib/seq_contig.sql hgsql hg18 -e 'load data local infile "seq_contig.md" into table seq_contig' # FYI: agp file format documented at: # http://www.ncbi.nlm.nih.gov/Genbank/WGS.agpformat.html# fixup a couple of names for our own purposes here cd /cluster/store11/gs.19/agp ln -s ../ncbi/chr*.agp ../ncbi/chr*.fa.gz . sed -e "s#MT/NC_001807#NC_001807#" ../ncbi/chrMT.agp > chrM.agp cat ../ncbi/c22_H2.agp > chr22_h2_hap1.agp cat ../ncbi/c5_H2.agp > chr5_h2_hap1.agp cat ../ncbi/c6_COX.agp > chr6_cox_hap1.agp cat ../ncbi/c6_QBL.agp > chr6_qbl_hap2.agp cp -p ../ncbi/c22_H2.fa.gz chr22_h2_hap1.fa.gz cp -p ../ncbi/c5_H2.fa.gz chr5_h2_hap1.fa.gz cp -p ../ncbi/c6_COX.fa.gz chr6_cox_hap1.fa.gz cp -p ../ncbi/c6_QBL.fa.gz chr6_qbl_hap2.fa.gz mkdir sav cp -p *hap*.agp sav # fix hap type agp files that have multiple contigs. fixAgp hg18 sav/chr5_h2_hap1.agp chr5_h2_hap1.agp fixAgp hg18 sav/chr6_qbl_hap2.agp chr6_qbl_hap2.agp # PLEASE NOTE THAT THESE TWO CORRECTED .agp FILES ABOVE ARE USED LATER, # NOT BY THE NEXT STEP IMMEDIATELY. # Put all the agp files together into one. # The chrM sequence now has its own agp, remove it from # ref_placed.agp # sed -e "/^NC_001807/d" ../ncbi/ref_placed.agp > ref_placed.agp # PLEASE NOTE THAT THE ORIGINAL NCBI .agp FILES FOR THOSE # SPECIAL HAP TYPE SEQUENCES ARE USED, NOT THE CORRECTED ONES. cd /cluster/store11/gs.19/build36 cat ../ncbi/ref_placed.agp ../agp/chrM.agp ../ncbi/ref_unplaced.agp \ ../ncbi/c22_H2.agp \ ../ncbi/c5_H2.agp \ ../ncbi/c6_COX.agp \ ../ncbi/c6_QBL.agp \ ../ncbi/PAR.agp > ncbi_build36.agp # cat ../ncbi/ref_placed.agp ../agp/chrM.agp ../ncbi/ref_unplaced.agp \ # ../agp/chr22_h2_hap1.agp ../agp/chr5_h2_hap1.agp \ # ../agp/chr6_cox_hap1.agp ../agp/chr6_qbl_hap2.agp \ # ../ncbi/PAR.agp > ncbi_build36.agp zcat ../ncbi/chrMT.fa.gz | \ sed -e "s/gi|17981852|ref|NC_001807.4/ref|NC_001807/" | \ gzip > chrM.fa.gz # and into ffa cd /cluster/store11/gs.19/ffa # NO LONGER TRUE FOR GS19! 
# There is a single bogus line at the end of ref_placed.fa.gz # declaring the NC_001807 MT sequence, this was later replaced by # chrMT.fa.gz, so remove that one line: zcat ../ncbi/ref_placed.fa.gz | sed -e "/^>ref|NC_001807/d" | \ gzip > ref_placed.fa.gz # (That's a 40 minute job) # sequence.inf is usually here, symlink it #ln -s ../ncbi/sequence.inf ln -s ../ncbi/chromosome_extents.inf # put all the fa.gz files together in one big fa.gz # time zcat ref_placed.fa.gz ../agp/chrM.fa.gz ../ncbi/ref_unplaced.fa.gz \ time zcat ../ncbi/ref_placed.fa.gz ../ncbi/ref_unplaced.fa.gz \ ../agp/*hap?.fa.gz ../ncbi/PAR.fa.gz | gzip \ > ncbi_build36.fa.gz # Make a listing of all the fasta record headers, just FYI: cd /cluster/store11/gs.19 zcat ffa/ncbi_build36.fa.gz | grep "^>" > ncbi.fa.headers # Sanity check, checkYbr was updated to handle the NC_ identifier cd /cluster/store11/gs.19/build36 zcat ../ffa/ncbi_build36.fa.gz | $HOME/bin/i386/checkYbr ncbi_build36.agp stdin seq_contig.md >check.seq_contig # result should be clean: cat check.seq_contig # Read 378 contigs from ncbi_build36.agp # Verifying sequence sizes in stdin # 0 problems detected # Convert fa files into UCSC style fa files and place in "contigs" # directory inside the gs.19/build36 directory # (a check that can be done here is make a list of the contigs # in this ./contigs directory before and compare it with the # list of distributed contigs created after they have been # disbursed.) # faNcbiToUcsc was fixed to handle the NC_ identifier cd /cluster/store11/gs.19/build36 # We've been through this often # mv contigs contigs.0 zcat ../ffa/ncbi_build36.fa.gz | $HOME/bin/i386/faNcbiToUcsc \ -split -ntLast stdin contigs # If you want to compare anything to previous work, check now, then: # rm -fr contigs.0 # Determine the chromosome sizes from agps # Watch carefully how chrY gets constructed. I'm not sure # this chrom_sizes represents the whole length of chrY with # the PAR added. We will see about that. # Script updated to handle new chrom names: # my @chroms = (1 .. 22, 'X', 'Y', 'M', '6_hla_hap1', '6_hla_hap2'); cd /cluster/store11/gs.19/build36 /cluster/bin/scripts/getChromSizes ../agp # Create chrom.lst list for use in foreach() loops awk '{print $1}' chrom_sizes | sed -e "s/chr//" > chrom.lst # Create lift files (this will create chromosome directory structure) and # inserts file /cluster/bin/scripts/createNcbiLifts -s chrom_sizes seq_contig.md . # Create contig agp files (will create contig directory structure) /cluster/bin/scripts/createNcbiCtgAgp seq_contig.md ncbi_build36.agp . # Create chromsome random agp files. /cluster/bin/scripts/createNcbiChrAgp -randomonly . # Copy the original chrN.agp files from the gs.19/agp directory # into each of the chromosome directories since they contain better # gap information. Delete the comments at top from these. cd /cluster/store11/gs.19/build36 foreach c ( `cat chrom.lst` ) sed -e "/^#.*/d" ../agp/chr${c}.agp > ./${c}/chr${c}.agp end # chrM needs a name fixup sed -e "s#NC_001807#chrM#" ../agp/chrM.agp > M/chrM.agp # Distribute contig .fa to appropriate directory (assumes all files # are in "contigs" directory). # Create inserts file from agp and lift files (new - added by Terry, 2004-07-12) /cluster/bin/scripts/createInserts /cluster/data/hg18 > /cluster/data/hg18/inserts # create global data link for everyone. No more home directory # links required. ln -s /cluster/store11/gs.19/build36 /cluster/data/hg18 cd /cluster/data/hg18 /cluster/bin/scripts/distNcbiCtgFa contigs . 
# Verify that everything was moved properly, the contigs directory # should be empty: ls contigs # Nothing there, then remove it rmdir contigs # Make a list of the contigs for use later rm contig.lst touch contig.lst foreach chrom ( `cat chrom.lst` ) foreach c ( $chrom/N{C,G,T}_?????? ) set contig = $c:t echo "${chrom}/${contig}/${contig}.fa" >> contig.lst end end # For later comparisons, this is how many contigs we have: wc -l contig.lst # 378 contig.lst # Note 2004-06-30 - there are some clone numbers left in some of # the NCBI files that are incorrect. Due to version number # changes, more than one version is listed. Namely for accession # numbers: AC004491 AC004921 AC004983 AC005088 AC006014 AC099654 # The AGP files are correct, the sequence.inf file lists these # twice: AC004491.1 AC004491.2 # AC004921.1 AC004921.2 AC004983.2 AC004983.3 # AC005088.2 AC005088.3 AC006014.2 AC006014.3 # AC099654.4 AC099654.5 # for hg18, NCBI did not provide the seq.inf file. # FILES ARE NOW READY FOR REPEAT MASKING - start that process as # other steps here can proceed in parallel. # Previous practice used to copy everything over for jkStuff from a # previous build. Rather than do that, pick up whatever is needed # at the time it is needed and verify that it is going to do what # you expect. cd /cluster/data/hg18 mkdir jkStuff # Create the contig.gl files - XXX - NCBI doesn't deliver # contig_overlaps.agp - 2004-06-18 - this is beginning to come # together and there is now a contig_overlaps.agp file # This is properly done below with a combination of psLayout # alignments to create the contig_overlaps.agp file # /cluster/bin/i386/agpToGl contig_overlaps.agp . -md=seq_contig.md # Create chromosome gl files # jkStuff/liftGl.csh contig.gl # CREATING DATABASE (DONE - 2005-11-30 - Fan) ssh hgwdev # Make sure there is at least 5 gig free on hgwdev:/var/lib/mysql df -h /var/lib/mysql # Filesystem Size Used Avail Use% Mounted on # /dev/sdc1 1.8T 1.3T 356G 79% /var/lib/mysql # Create the database. hgsql -e 'create database hg18' mysql # Copy over grp table (for track grouping) from another database: hgsql -e "create table grp (PRIMARY KEY(NAME)) select * from hg17.grp" hg18 # The DB updates to grp below are not needed since we copied from hg17. # ENCODE groups # Added 2005-08016 kate echo 'UPDATE grp SET priority=7 WHERE name="varRep"'| hgsql hg18 echo 'UPDATE grp SET priority=8 WHERE name="encode"'| hgsql hg18 echo 'INSERT INTO grp (name, label, priority) VALUES ("encodeGenes", "ENCODE Regions and Genes", 8.1)' | hgsql hg18 echo 'INSERT INTO grp (name, label, priority) VALUES ("encodeTxLevels", "ENCODE Transcript Levels", 8.2)' | hgsql hg18 echo 'INSERT INTO grp (name, label, priority) VALUES ("encodeChip", "ENCODE Chromatin Immunoprecipitation", 8.3)' | hgsql hg18 echo 'INSERT INTO grp (name, label, priority) VALUES ("encodeChrom", "ENCODE Chromosome, Chromatin and DNA Structure", 8.4)' | hgsql hg18 echo 'INSERT INTO grp (name, label, priority) VALUES ("encodeCompGeno", "ENCODE Comparative Genomics", 8.5)' | hgsql hg18 echo 'INSERT INTO grp (name, label, priority) VALUES ("encodeVariation", "ENCODE Variation", 8.6)' | hgsql hg18 echo 'INSERT INTO grp (name, label, priority) VALUES ("encodeAnalysis", "ENCODE Analysis", 8.9)' | hgsql hg18 # MAKE CHROMINFO TABLE WITH (TEMPORARILY UNMASKED) NIBS # (DONE - 2005-12-02 - Fan) # Make nib/, unmasked until RepeatMasker and TRF steps are done. # Do this now so that the chromInfo table will exist and thus the # trackDb tables can be built in the next step. 
# These unmasked nibs will be replaced by the masked nibs after # repeat mask and trf are done. ssh kkstore02 cd /cluster/data/hg18 cp /cluster/data/hg17/jkStuff/chrFa.csh jkStuff -p # Make chr*.fa from contig .fa # Copied chrFa.sh from hg17/jkStuff, renamed it to chrFa.csh bash time ./jkStuff/chrFa.csh # real 2m34.406s # user 1m17.405s # sys 0m16.730s exit mkdir nib foreach c (`cat chrom.lst`) foreach f ($c/chr${c}{,_random}.fa) if (-e $f) then echo "nibbing $f" /cluster/bin/i386/faToNib $f nib/$f:t:r.nib endif end end # Make symbolic links from /gbdb/hg18/nib to the real nibs. ssh hgwdev mkdir -p /gbdb/hg18/nib ln -s /cluster/data/hg18/nib/chr*.nib /gbdb/hg18/nib # Load /gbdb/hg18/nib paths into database and save size info. cd /cluster/data/hg18 hgsql hg18 < $HOME/kent/src/hg/lib/chromInfo.sql hgNibSeq -preMadeNib hg18 /gbdb/hg18/nib */chr*.fa hgsql -N -e "select chrom,size from chromInfo order by chrom" hg18 \ > chrom.sizes # You can compare this chrom.sizes with the previously created # chrom_sizes. Should be no difference sort chrom_sizes > s0 sort chrom.sizes | grep -v random > s1 diff s0 s1 rm s0 s1 # MAKE HGCENTRALTEST ENTRY AND TRACKDB TABLE (DONE - 2005-12-06 - Fan) # dbDb orderKey updated 2005-12-06 - Fan ssh hgwdev # reset dbDb orderKey - these have never been ordered properly # before, this will get them on the program. hgsql -e 'update dbDb set orderKey=11 where name = "hg17";' \ -h genome-testdb hgcentraltest hgsql -e 'update dbDb set orderKey=12 where name = "hg16";' \ -h genome-testdb hgcentraltest hgsql -e 'update dbDb set orderKey=13 where name = "hg15";' \ -h genome-testdb hgcentraltest hgsql -e 'update dbDb set orderKey=14 where name = "hg13";' \ -h genome-testdb hgcentraltest # Enter hg18 into hgcentraltest.dbDb so test browser knows about it: hgsql -e 'INSERT INTO dbDb (name, description, nibPath, organism, \ defaultPos, active, orderKey, genome, scientificName, \ htmlPath, hgNearOk, hgPbOk, sourceName) \ VALUES("hg18", "Feb. 2006", "/gbdb/hg18/nib", "Human", \ "chr7:127,664,479-127,689,005", 1, 10, "Human", "Homo sapiens", \ "/gbdb/hg18/html/description.html", 0, 0, "NCBI Build 36.1");' \ -h genome-testdb hgcentraltest # Make trackDb table so browser knows what tracks to expect: cd ~/kent/src/hg/makeDb/trackDb cvs up -d -P . # Edit the makefile to add hg18 in all the right places and do make update make alpha cvs commit makefile # MAKE LIFTALL.LFT, NCBI.LFT (DONE - 2005-12-07 Fan) cd /cluster/data/hg18 mkdir -p jkStuff cat */lift/{ordered,random}.lft > jkStuff/liftAll.lft # Create jkStuff/ncbi.lft for lifting stuff built with the NCBI assembly. # Note: this ncbi.lift will not lift floating contigs to chr_random coords, # but it will show the strand orientation of the floating contigs # (grep for '|'). # mdToNcbiLift seq_contig.md jkStuff/ncbi.lft # XXXX - appears to be unused, not done - Hiram # REPEAT MASKING (DONE - 2005-12-09 - Fan) # Record the RM version here: # as this changes over time and there is no record in the results ls -l /cluster/bluearc/RepeatMasker # lrwxrwxrwx 1 angie protein 18 Nov 3 10:40 # /cluster/bluearc/RepeatMasker -> RepeatMasker051101 # beware that you can not actually include the precise single line output # by this command since it is a CVS ident line and it will get # changed as this file is checked into CVS. Remove the Id and # dollar sign business to allow it to stay as it is here. 
/cluster/bluearc/RepeatMasker/RepeatMasker | head -1 # RepeatMasker version development-: # RepeatMasker,v 1.10 2005/11/03 18:39:27 angie Exp cat /cluster/bluearc/RepeatMasker051101/Libraries/version # RepBase Update 9.11, RM database version 20050112 # Split contigs, run RepeatMasker, lift results # This split takes a few minutes ssh kkstore02 cd /cluster/data/hg18 foreach chrom ( `cat chrom.lst` ) foreach c ( $chrom/N{C,G,T}_?????? ) set contig = $c:t echo "splitting ${chrom}/${contig}/${contig}.fa" faSplit size ${chrom}/${contig}/$contig.fa 500000 \ ${chrom}/${contig}/${contig}_ \ -lift=${chrom}/${contig}/$contig.lft -maxN=500000 end end #- Make the run directory and job list: cd /cluster/data/hg18 mkdir -p jkStuff # According to RepeatMasker help file, no arguments are required to # specify species because its default is set for primate (human) # This run script saves the .tbl file to be sent to Arian. He uses # those for his analysis. Sometimes he needs the .cat and .align files for # checking problems. Krish needs the .align files, they are large. cat << '_EOF_' > jkStuff/RMHuman #!/bin/csh -fe cd $1 pushd . /bin/mkdir -p /tmp/hg18/$2 /bin/cp $2 /tmp/hg18/$2/ cd /tmp/hg18/$2 /cluster/bluearc/RepeatMasker/RepeatMasker -ali -s $2 popd /bin/cp /tmp/hg18/$2/$2.out ./ if (-e /tmp/hg18/$2/$2.align) /bin/cp /tmp/hg18/$2/$2.align ./ if (-e /tmp/hg18/$2/$2.tbl) /bin/cp /tmp/hg18/$2/$2.tbl ./ # if (-e /tmp/hg18/$2/$2.cat) /bin/cp /tmp/hg18/$2/$2.cat ./ /bin/rm -fr /tmp/hg18/$2/* /bin/rmdir --ignore-fail-on-non-empty /tmp/hg18/$2 /bin/rmdir --ignore-fail-on-non-empty /tmp/hg18 '_EOF_' # << this line makes emacs coloring happy chmod +x jkStuff/RMHuman ssh kkstore02 cd /cluster/data/hg18 mkdir RMRun rm -f RMRun/RMJobs touch RMRun/RMJobs foreach d ( `cat chrom.lst` ) foreach c ( ${d}/N{C,G,T}_*/N{C,G,T}_*_*.fa ) set f = $c:t set cc = $c:h set contig = $cc:t echo /cluster/store11/gs.19/build36/jkStuff/RMHuman \ /cluster/store11/gs.19/build36/${d}/${contig} $f \ '{'check out line+ /cluster/store11/gs.19/build36/${d}/${contig}/$f.out'}' \ >> RMRun/RMJobs end end # We have 5990 jobs in RMJobs: wc RMRun/RMJobs # 5990 41930 1127992 RMRun/RMJobs #- Do the run ssh pk cd /cluster/data/hg18/RMRun para create RMJobs para try, para check, para check, para push, para check,... #- While that is running, you can run TRF (simpleRepeat) on the small # cluster. See SIMPLE REPEAT section below # Completed: 5990 of 5990 jobs # CPU time in finished jobs: 30661460s 511024.34m 8517.07h 354.88d 0.972 y # IO & Wait Time: 38038s 633.96m 10.57h 0.44d 0.001 y # Average job time: 5125s 85.42m 1.42h 0.06d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 6693s 111.55m 1.86h 0.08d # Submission to last job: 86532s 1442.20m 24.04h 1.00d # Lift up the split-contig .out's to contig-level .out's # # If a mistake is made in the following it would be possible to # destroy all the RM output. So, just to be paranoid, save all # the RM output in bluearc for the time being: ssh kkstore02 cd /cluster/data/hg18 mkdir /cluster/bluearc/hg18/RMOutput foreach c ( `cat chrom.lst` ) foreach d ( ${c}/N{C,G,T}_* ) set T = /cluster/bluearc/hg18/RMOutput/${d} mkdir -p ${T} cd ${d} set contig = $d:t cp -p ${contig}_?{,?,??}.fa.out ${T} cd ../.. echo "${d} done" end end # Make sure we got them all: # (this doesn't work later since there are more *.fa.out files # after the lifting. More explicitly to find just these: # find . -name "N?_*_*.fa.out" -print | wc -l find . 
-name "*.fa.out" -print | wc -l # 5990 find /cluster/bluearc/hg18/RMOutput -type f | wc -l # 5990 # same count # OK, now you can try this operation, do it in a script like this # and save the output of the script for a record of what happened. cat << '_EOF_' > jkStuff/liftRM.csh #!/bin/csh -fe foreach c ( `cat chrom.lst` ) foreach d ( ${c}/N{C,G,T}_* ) cd $d set contig = $d:t liftUp $contig.fa.out $contig.lft warn ${contig}_?{,?,??}.fa.out cd ../.. end end '_EOF_' chmod +x jkStuff/liftRM.csh mkdir scriptsOutput script lift.log bash time jkStuff/liftRM.csh > scriptsOutput/liftRM.1 2>&1 exit exit # Check that they all were done: grep "fa.out" scriptsOutput/liftRM.1 | wc -l # 5990 # same count as above #- Lift up RepeatMask .out files to chromosome coordinates via # picked up jkStuff/liftOut2.sh from the hg17 build. Renamed to # liftOut2.csh, changed the line that does the chrom listing bash time ./jkStuff/liftOut2.csh > scriptsOutput/liftOut2 2>&1 # real 0m30.488s # user 0m24.670s # sys 0m2.797s # seems much faster than hg17 ??? # hg17 numbers: # real 9m46.780s # user 1m18.900s # sys 7m33.990s #- By this point, the database should have been created (above): ssh hgwdev cd /cluster/data/hg18 bash time hgLoadOut hg18 ?/*.fa.out ??/*.fa.out *hap*/*.fa.out > \ scriptsOutput/hgLoadOut 2>&1 # real 9m9.045s # user 2m19.500s # sys 0m24.440s # errors during this load: (there are always a couple of these) # Strange perc. field -1.2 line 153851 of 2/chr2.fa.out # Strange perc. field -10423.3 line 174747 of 3/chr3.fa.out # Strange perc. field -5635.9 line 174747 of 3/chr3.fa.out # Strange perc. field -259.3 line 174747 of 3/chr3.fa.out # Strange perc. field -1.4 line 205545 of 4/chr4.fa.out # Strange perc. field -0.1 line 167690 of 7/chr7.fa.out # Strange perc. field -1331.2 line 198656 of 7/chr7.fa.out # Strange perc. field -1460.4 line 198656 of 7/chr7.fa.out # Strange perc. field -4.2 line 223183 of 7/chr7.fa.out # Strange perc. field -3192.0 line 60424 of 8/chr8.fa.out # Strange perc. field -423.4 line 60424 of 8/chr8.fa.out # Strange perc. field -784.0 line 60424 of 8/chr8.fa.out # Strange perc. field -0.1 line 52020 of X/chrX.fa.out # Strange perc. field -4526.7 line 190254 of X/chrX.fa.out # Strange perc. field -3757.2 line 190254 of X/chrX.fa.out # Strange perc. field -597.2 line 190254 of X/chrX.fa.out # Strange perc. field -13030.4 line 137624 of 16/chr16.fa.out # Strange perc. field -1359.8 line 137624 of 16/chr16.fa.out # Strange perc. field -2223.5 line 137624 of 16/chr16.fa.out # Strange perc. field -1.3 line 11573 of 22/chr22.fa.out # Strange perc. field -12.7 line 69873 of 22/chr22.fa.out # Verify we have similar results to previous assembly: # featureBits hg18 rmsk # 1406290513 bases of 3107677273 (45.252%) in intersection # featureBits -countGaps hg17 rmsk # 1390952984 bases of 3095016460 (44.942%) in intersection # featureBits hg17 rmsk # 1391378842 bases of 2867328468 (48.525%) in intersection # featureBits hg16 rmsk # 1388770568 bases of 2865248791 (48.469%) in intersection # Now proceed to MASK SEQUENCE BOTH REPEATMASKER AND SIMPLE REPEAT/TRF # following the SIMPLE REPEAT sections below # let Rachel know that RepeatMask is done. 
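# (Optional follow-up, not part of the original build: the "Strange perc.
# field" warnings from hgLoadOut above correspond to RepeatMasker .out lines
# whose percentage fields went negative.  A sketch to pull those lines out for
# review, assuming the standard .out layout with %div/%del/%ins in columns
# 2-4; strangePerc.txt is just an illustrative name.)
    cd /cluster/data/hg18
    awk 'FNR > 3 && ($2 < 0 || $3 < 0 || $4 < 0) {print FILENAME, FNR, $0}' \
        ?/*.fa.out ??/*.fa.out *hap*/*.fa.out > scriptsOutput/strangePerc.txt
    wc -l scriptsOutput/strangePerc.txt
    # review these lines (some of them produced more than one warning above)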
# SIMPLE REPEAT [TRF] TRACK (DONE - 2005-12-07 - Fan) # Copy the contigs, first to the bluearc, then to /iscratch/i ssh kkstore02 mkdir /cluster/bluearc/hg18 mkdir /cluster/bluearc/hg18/contigs cd /cluster/data/hg18 foreach ctg ( `cat contig.lst` ) set c = $ctg:t echo "$ctg > /cluster/bluearc/hg18/contigs/$c" cp -p $ctg /cluster/bluearc/hg18/contigs/$c end # Check how much is there: # du -hsc /cluster/bluearc/hg18/contigs # 2.8G /cluster/bluearc/hg18/contigs exit # Distribute contigs to /iscratch/i ssh pk mkdir -p /san/sanvol1/scratch/hg18/unmaskedContigs cd /san/sanvol1/scratch/hg18/unmaskedContigs cp -p /cluster/bluearc/hg18/contigs/* . ls . # Verify same amount made it there: # du -hsc /san/sanvol1/scratch/hg18/unmaskedContigs # 2.9G /san/sanvol1/scratch/hg18/unmaskedContigs # Then send them to the other 7 Iservers # /cluster/bin/iSync # Go to the small cluster for this business: ssh pk mkdir -p /cluster/data/hg18/bed/simpleRepeat cd /cluster/data/hg18/bed/simpleRepeat mkdir trf cat << '_EOF_' > runTrf #!/bin/csh -fe # set path1 = $1 set inputFN = $1:t set outpath = $2 set outputFN = $2:t mkdir -p /tmp/$outputFN cp $path1 /tmp/$outputFN pushd . cd /tmp/$outputFN /cluster/bin/i386/trfBig -trf=/cluster/bin/i386/trf $inputFN /dev/null -bedAt=$outputFN -tempDir=/tmp popd rm -f $outpath cp -p /tmp/$outputFN/$outputFN $outpath rm -fr /tmp/$outputFN/* rmdir --ignore-fail-on-non-empty /tmp/$outputFN '_EOF_' # << this line makes emacs coloring happy chmod +x runTrf cat << '_EOF_' > gsub #LOOP ./runTrf {check in line+ $(path1)} {check out line trf/$(root1).bed} #ENDLOOP '_EOF_' # << this line makes emacs coloring happy ls -1S /san/sanvol1/scratch/hg18/unmaskedContigs/*.fa > genome.lst gensub2 genome.lst single gsub jobList para create jobList para try para check para push para check # Completed: 378 of 378 jobs # CPU time in finished jobs: 18956s 315.93m 5.27h 0.22d 0.001 y # IO & Wait Time: 2519s 41.98m 0.70h 0.03d 0.000 y # Average job time: 57s 0.95m 0.02h 0.00d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 2345s 39.08m 0.65h 0.03d # Submission to last job: 2427s 40.45m 0.67h 0.03d bash liftUp simpleRepeat.bed /cluster/data/hg18/jkStuff/liftAll.lft \ warn trf/*.bed > lu.out 2>&1 # Load into the database: ssh hgwdev cd /cluster/data/hg18/bed/simpleRepeat /cluster/bin/i386/hgLoadBed hg18 simpleRepeat simpleRepeat.bed \ -sqlTable=$HOME/kent/src/hg/lib/simpleRepeat.sql # Loaded 629076 elements of size 16 # Compare with previous assembly featureBits hg18 simpleRepeat # 56164158 bases of 3107677273 (1.807%) in intersection # featureBits hg17 simpleRepeat # 54952425 bases of 2866216770 (1.917%) in intersection # featureBits hg16 simpleRepeat # 54320136 bases of 2865248791 (1.896%) in intersection # GAPS weren't in hg18 yet at this point, after gaps added: # featureBits hg18 simpleRepeat # 54964044 bases of 2867328468 (1.917%) in intersection # featureBits -countGaps hg18 simpleRepeat # 54964044 bases of 3096628158 (1.775%) in intersection # CREATE MICROSAT TRACK (done 2006-7-5 JK) ssh hgwdev cd /cluster/data/hg18/bed mkdir microsat cd microsat awk '($5==2 || $5==3) && $6 >= 15 && $8 == 100 && $9 == 0 {printf("%s\t%s\t%s\t%dx%s\n", $1, $2, $3, $6, $16);}' ../simpleRepeat/simpleRepeat.bed > microsat.bed /cluster/bin/i386/hgLoadBed hg18 microsat microsat.bed # PROCESS SIMPLE REPEATS INTO MASK (DONE - 2005-12-09 - Fan) # After the simpleRepeats track has been built, make a filtered version # of the trf output: keep trf's with period <= 12: ssh kkstore02 mkdir -p cd 
/cluster/data/hg18/bed/simpleRepeat cd /cluster/data/hg18/bed/simpleRepeat mkdir -p trfMask foreach f (trf/*.bed) awk '{if ($5 <= 12) print;}' $f > trfMask/$f:t end # The 4 lines below were left over from makeHg17.doc. # EXPERIMENT, at a filter of <= 12, we have coverage: # 20904399 bases of 2867328468 (0.729%) in intersection # at a filter of <= 9, we have coverage: # 19271270 bases of 2867328468 (0.672%) in intersection # Lift up filtered trf output to chrom coords as well: cd /cluster/data/hg18 mkdir bed/simpleRepeat/trfMaskChrom foreach c ( `cat chrom.lst` ) if (-e $c/lift/ordered.lst) then perl -wpe 's@(\S+)@bed/simpleRepeat/trfMask/$1.bed@' \ $c/lift/ordered.lst > $c/lift/oTrf.lst liftUp bed/simpleRepeat/trfMaskChrom/chr$c.bed \ jkStuff/liftAll.lft warn `cat $c/lift/oTrf.lst` endif if (-e $c/lift/random.lst) then perl -wpe 's@(\S+)@bed/simpleRepeat/trfMask/$1.bed@' \ $c/lift/random.lst > $c/lift/rTrf.lst liftUp bed/simpleRepeat/trfMaskChrom/chr${c}_random.bed \ jkStuff/liftAll.lft warn `cat $c/lift/rTrf.lst` endif end # MASK SEQUENCE BOTH REPEATMASKER AND SIMPLE REPEAT/TRF (DONE - 2005-12-09, Fan) # This used to be done right after RepeatMasking. Now, we mask with # TRF as well, so do this after the "PROCESS SIMPLE REPEATS" step above, # and after Repeat Masker is complete. ssh kkstore02 cd /cluster/data/hg18 # Make chr*.fa from contig .fa # chrFa.csh was already copied from hg17/jkStuff bash time ./jkStuff/chrFa.csh > scriptsOutput/chrFa.out 2>&1 & # real 2m35.734s # user 1m18.351s # sys 0m16.596s # much faster than hg17 numbers as shown below. ??? # old hg17 numbers: # real 13m18.512s # user 9m1.670s # sys 1m7.290s #- Soft-mask (lower-case) the contig and chr .fa's time ./jkStuff/makeFaMasked.csh > scriptsOutput/maFaMasked.out 2>&1 # real 8m47.289s # user 3m45.698s # sys 1m44.416s # old hg17 numbers: # real 29m31.623s # user 13m49.700s # sys 5m58.750s #- Make hard-masked .fa.masked files as well: time ./jkStuff/makeHardMasked.csh > scriptsOutput/maHardMasked.out 2>&1 # real 5m48.833s # user 1m41.926s # sys 0m52.084s #- Create the bothMasksNib/ directory time ./jkStuff/makeNib.csh > scriptsOutput/maNib.out 2>&1 # real 2m23.280s # user 1m6.462s # sys 0m19.795s # old hg17 numbers: # real 14m41.694s # user 6m28.000s # sys 1m42.500s # Make symbolic links from /gbdb/hg18/nib to the real nibs. ssh hgwdev cd /cluster/store11/gs.19/build36 mv nib nib.raw mv bothMasksNib nib rm /gbdb/hg18/nib/*.nib ln -s `pwd`/nib/* /gbdb/hg18/nib # Load /gbdb/hg18/nib paths into database and save size info. cd /cluster/data/hg18 hgNibSeq -preMadeNib hg18 /gbdb/hg18/nib */chr*.fa # 3107677273 total bases # Should be the same size as before hgsql -N -e "select chrom,size from chromInfo order by chrom" hg18 \ > chrom.sizes.masked diff chrom.sizes chrom.sizes.masked # should be no output at all, thus: rm chrom.sizes.masked # Copy the masked contig fa to /scratch and /iscratch # And everything else we will need for blastz runs, etc ... # Best to do this sequence first to /cluster/bluearc/scratch, # which is going to be the source for the /scratch copy. 
# And then from there to the /iscratch # Make sure you are on the fileserver for the original source: ssh kkstore02 mkdir -p /cluster/bluearc/scratch/hg/gs.19/build36 cd /cluster/bluearc/scratch/hg/gs.19/build36 # these copies take less than 2 minutes each mkdir bothMaskedNibs cp -p /cluster/data/hg18/nib/*.nib ./bothMaskedNibs mkdir maskedContigs foreach chrom ( `cat /cluster/data/hg18/chrom.lst` ) cp -p /cluster/data/hg18/${chrom}/N{C,G,T}_*/N{C,G,T}_??????.fa \ ./maskedContigs echo "done ${chrom}" end # make sure you have them all: ls maskedContigs | wc -l # 378 wc -l /cluster/data/hg18/contig.lst # 378 mkdir rmsk foreach chrom ( `cat /cluster/data/hg18/chrom.lst` ) cp -p /cluster/data/hg18/${chrom}/*.out ./rmsk echo "done ${chrom}" end # Now, go to the destination for /iscratch and copy from the # bluearc ssh kkr1u00 mkdir -p /iscratch/i/gs.19/build36 cd /iscratch/i/gs.19/build36 # This takes about 5 minutes rsync -arlv /cluster/bluearc/scratch/hg/gs.19/build36/ . bash time /cluster/bin/iSync # real 7m27.649s # request rsync of /cluster/bluearc/scratch to the KiloKluster /scratch # Ask sysadmin to bring up BLAT server. # update central dbDb table to add the new blat server entry echo 'INSERT INTO blatServers (db, host, port, isTrans) \ VALUES ("hg18", "blat19", "17778", "1"); \ INSERT INTO blatServers (db, host, port, isTrans) \ VALUES ("hg18", "blat19", "17779", "0");' \ | hgsql -h genome-testdb hgcentraltest # LOAD ctgPos table - Contig position track # After fixing up hgCtgPos to accept the -chromLst argument, simply: cd /cluster/data/hg18 hgCtgPos -chromLst=chrom.lst hg18 . # GOLD AND GAP TRACKS (DONE - 2005-12-10 - Fan) (RE-DONE - 2006-04-06 - Fan) ssh hgwdev cd /cluster/data/hg18 # manually edit the 4 haplotype .agp files to change the first col from # contig IDs into chrom name. hgGoldGapGl -noGl -chromLst=chrom.lst hg18 /cluster/data/hg18 . # Disappointing to see this create so many tables ... # _gap and _gold for each chrom # contig.gl ... section skipped for the time being. (Fan 2005-12-13). ############################################################################# # GC5BASE (DONE - 2005-12-13 - Fan) ssh kkstore02 mkdir -p /cluster/data/hg18/bed/gc5Base cd /cluster/data/hg18/bed/gc5Base hgGcPercent -wigOut -doGaps -file=stdout -win=5 hg18 \ /cluster/data/hg18/nib | wigEncode stdin gc5Base.wig gc5Base.wib # runs for about 17 minutes # load database ssh hgwdev cd /cluster/data/hg18/bed/gc5Base mkdir /gbdb/hg18/wib ln -s `pwd`/gc5Base.wib /gbdb/hg18/wib hgLoadWiggle -pathPrefix=/gbdb/hg18/wib hg18 gc5Base gc5Base.wig # verify index is correct: hgsql hg18 -e "show index from gc5Base;" # should see good numbers in Cardinality column ######################################################################### # GENBANK auto update (DONE 2005-12-13 Fan) # align with revised genbank process. drop xeno ESTs. 
cd ~/kent/src/hg/makeDb/genbank cvs update -d etc # edit etc/genbank.conf to add hg18 # hg18 hg18.serverGenome = /cluster/data/hg18/nib/chr*.nib hg18.clusterGenome = /scratch/hg/gs.18/build36/bothMaskedNibs/chr*.nib hg18.ooc = /scratch/hg/h/11.ooc hg18.lift = /cluster/store11/gs.19/build36/jkStuff/liftAll.lft hg18.refseq.mrna.native.pslCDnaFilter = ${finished.refseq.mrna.native.pslCDnaFilter} hg18.refseq.mrna.xeno.pslCDnaFilter = ${finished.refseq.mrna.xeno.pslCDnaFilter} hg18.genbank.mrna.native.pslCDnaFilter = ${finished.genbank.mrna.native.pslCDnaFilter} hg18.genbank.mrna.xeno.pslCDnaFilter = ${finished.genbank.mrna.xeno.pslCDnaFilter} #hg18.genbank.est.native.pslCDnaFilter = ${finished.genbank.est.native.pslCDnaFilter} #hg18.genbank.est.xeno.pslCDnaFilter = ${finished.genbank.est.xeno.pslCDnaFilter} #hg18.genbank.est.xeno.load = yes hg18.refseq.mrna.xeno.load = yes hg18.refseq.mrna.xeno.loadDesc = yes hg18.mgcTables.default = full hg18.mgcTables.mgc = all hg18.downloadDir = hg18 ### NOTE: in the future, enable orfeome tracks as part of this (markd) # update /cluster/data/genbank/ make etc-update ssh kkstore02 cd /cluster/data/genbank nice bin/gbAlignStep -initial hg18 & # load database when finished ssh hgwdev cd /cluster/data/genbank nice ./bin/gbDbLoadStep -drop -initialLoad hg18& # CPGISLANDS (DONE - 2005-12-14 - Fan) ssh hgwdev mkdir -p /cluster/data/hg18/bed/cpgIsland cd /cluster/data/hg18/bed/cpgIsland # Build software from Asif Chinwalla (achinwal at watson.wustl.edu) cvs co hg3rdParty/cpgIslands cd hg3rdParty/cpgIslands make # gcc readseq.c cpg_lh.c -o cpglh.exe mv cpglh.exe /cluster/data/hg18/bed/cpgIsland/ # cpglh.exe requires hard-masked (N) .fa's. # There may be warnings about "bad character" for IUPAC ambiguous # characters like R, S, etc. Ignore the warnings. 
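# (Optional, a sketch rather than a step from the original build: cpglh only
# warns on IUPAC ambiguity characters, so a quick way to see ahead of time
# which chromosomes will trigger the "Bad char" warnings is to count the
# non-ACGT/N characters in each hard-masked file.  Assumes tcsh, as in the
# rest of this doc.)
    ssh kkstore02
    cd /cluster/data/hg18/bed/cpgIsland
    foreach f (../../*/chr*.fa.masked)
        echo -n "$f:t "
        grep -v '^>' $f | tr -d 'ACGTNacgtn\n' | wc -c
    end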
ssh kkstore02 cd /cluster/data/hg18/bed/cpgIsland foreach f (../../*/chr*.fa.masked) set fout=$f:t:r:r.cpg echo running cpglh on $f to $fout ./cpglh.exe $f > $fout end # the warnings: # Bad char 0x52 = 'R' at line 2046, base 102229, sequence chr16_random # Bad char 0x4d = 'M' at line 1216113, base 60805573, sequence chr3 # Bad char 0x52 = 'R' at line 1216118, base 60805801, sequence chr3 # Bad char 0x52 = 'R' at line 1216118, base 60805801, sequence chr3 # Transform cpglh output to bed + cat << '_EOF_' > filter.awk /* Input columns: */ /* chrom, start, end, len, CpG: cpgNum, perGc, cpg:gpc, observed:expected */ /* chr1\t 41776\t 42129\t 259\t CpG: 34\t 65.8\t 0.92\t 0.94 */ /* Output columns: */ /* chrom, start, end, name, length, cpgNum, gcNum, perCpg, perGc, obsExp */ /* chr1\t41775\t42129\tCpG: 34\t354\t34\t233\t19.2\t65.8\to0.94 */ { $2 = $2 - 1; width = $3 - $2; printf("%s\t%d\t%s\t%s %s\t%s\t%s\t%0.0f\t%0.1f\t%s\t%s\n", $1, $2, $3, $5,$6, width, $6, width*$7*0.01, 100.0*2*$6/width, $7, $9); } '_EOF_' # << this line makes emacs coloring happy awk -f filter.awk chr*.cpg > cpgIsland.bed ssh hgwdev cd /cluster/data/hg18/bed/cpgIsland hgLoadBed hg18 cpgIslandExt -tab -noBin \ -sqlTable=$HOME/kent/src/hg/lib/cpgIslandExt.sql cpgIsland.bed # Reading cpgIsland.bed # Loaded 28226 elements of size 10 # Sorted # Saving bed.tab # Loading hg18 ######################################################################## # PRODUCING GENSCAN PREDICTIONS (DONE - 2005-12-16 - Fan) # RELOADED PEPTIDE TABLE, GENSCANPEP (DONE, 2006-07-11, hartera) ssh hgwdev mkdir /cluster/data/hg18/bed/genscan cd /cluster/data/hg18/bed/genscan cvs co hg3rdParty/genscanlinux ssh kkstore02 cd /cluster/data/hg18/bed/genscan # Make 3 subdirectories for genscan to put their output files in mkdir gtf pep subopt # Generate a list file, genome.list, of all the contigs # *that do not have pure Ns* (due to heterochromatin, unsequencable # stuff) which would cause genscan to run forever. rm -f genome.list bash for f in `cat /cluster/data/hg18/contig.lst` do egrep '[ACGT]' /cluster/data/hg18/$f.masked > /dev/null if [ $? = 0 ]; then echo /cluster/data/hg18/$f.masked >> genome.list fi done # exit your bash shell if you are [t]csh ... # This egrep matched all the contigs in hg18. I guess none of # them are complete Ns* at this point. # Log into kki (not kk !). kki is the driver node for the small # cluster (kkr2u00 -kkr8u00. Genscan has problem running on the # big cluster, due to limitation of memory and swap space on each # processing node). ssh kki cd /cluster/data/hg18/bed/genscan # Create template file, gsub, for gensub2. For example (3-line file): cat << '_EOF_' > gsub #LOOP /cluster/bin/x86_64/gsBig {check in line+ $(path1)} {check out line gtf/$(root1).gtf} -trans={check out line pep/$(root1).pep} -subopt={check out line subopt/$(root1).bed} -exe=hg3rdParty/genscanlinux/genscan -par=hg3rdParty/genscanlinux/HumanIso.smat -tmp=/tmp -window=2400000 #ENDLOOP '_EOF_' # << this line makes emacs coloring happy gensub2 genome.list single gsub jobList para create jobList para try para check para push ... etc ... 
# Completed: 377 of 378 jobs # Crashed: 1 jobs # CPU time in finished jobs: 78976s 1316.27m 21.94h 0.91d 0.003 y # IO & Wait Time: 4961s 82.68m 1.38h 0.06d 0.000 y # Average job time: 223s 3.71m 0.06h 0.00d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 3491s 58.18m 0.97h 0.04d # Submission to last job: 7541s 125.68m 2.09h 0.09d # Running the single failed job on kolossus with a smaller window: ssh kkr7u00.kilokluster.ucsc.edu /cluster/bin/x86_64/gsBig /cluster/data/hg18/5/NT_006576/NT_006576.fa.masked \ gtf/NT_006576.fa.gtf -trans=pep/NT_006576.fa.pep \ -subopt=subopt/NT_006576.fa.bed -exe=hg3rdParty/genscanlinux/genscan \ -par=hg3rdParty/genscanlinux/HumanIso.smat -tmp=/tmp -window=2000000 # If there were out-of-memory problems (run "para problems"), then # re-run those jobs by hand but change the -window arg from 2400000 # something lower. In build33, this was 22/NT_011519 # In build34 there were NO failures ! # Convert these to chromosome level files as so: ssh kkstore02 cd /cluster/data/hg18/bed/genscan $HOME/bin/i386/liftUp genscan.gtf ../../jkStuff/liftAll.lft warn gtf/N*.gtf $HOME/bin/i386/liftUp genscanSubopt.bed ../../jkStuff/liftAll.lft \ warn subopt/N*.bed cat pep/*.pep > genscan.pep # Load into the database as so: ssh hgwdev cd /cluster/data/hg18/bed/genscan ldHgGene hg18 genscan genscan.gtf # Reading genscan.gtf # Read 43122 transcripts in 329799 lines in 1 files # 43122 groups 49 seqs 1 sources 1 feature types # 43122 gene predictions hgPepPred hg18 generic genscanPep genscan.pep # Processing genscan.pep hgLoadBed hg18 genscanSubopt genscanSubopt.bed # Reading genscanSubopt.bed # Loaded 514065 elements of size 6 # Sorted # Creating table definition for # Saving bed.tab # Loading hg18 # featureBits hg18 genscan # 56039161 bases of 2881515245 (1.945%) in intersection # featureBits hg17 genscan # 55323340 bases of 2866216770 (1.930%) in intersection # featureBits hg16 genscan # 55333689 bases of 2865248791 (1.931%) in intersection # featureBits hg18 genscanSubopt # 55685959 bases of 2881515245 (1.933%) in intersection # featureBits hg17 genscanSubopt # 55986178 bases of 2866216770 (1.953%) in intersection # featureBits hg16 genscanSubopt # 56082952 bases of 2865248791 (1.957%) in intersection # Should be zero intersection with rmsk # featureBits -chrom=chr1 hg18 genscan rmsk # Reload genscanPep table - requested by a user. It has been dropped # from hgwdev. # (hartera, 2006-07-11) ssh hgwdev cd /cluster/data/hg18/bed/genscan hgPepPred hg18 generic genscanPep genscan.pep ############################################################################ # CREATE 2 BIT FILE (DONE 12/20/05, Fan) ssh kkstore02 cd /cluster/data/hg18 faToTwoBit */chr*.fa hg18.2bit # BLASTZ, CHAIN, NET, MAFNET, AXTNET AND ALIGNMENT DOWNLOADS FOR # ZEBRAFISH (danRer3) (DONE, 2005-12-23, hartera) ssh pk # Blastz uses lineage-specific repeats. There are none for mouse # and fish so use all repeats for each species as lineage-specific. 
mkdir -p /san/sanvol1/scratch/hg18/linSpecRep.notInOthers foreach f (/cluster/bluearc/hg18/linSpecRep/notInOthers/chr*.out.spec) cp -p $f /san/sanvol1/scratch/hg18/linSpecRep.notInOthers/ end # get only lineage specific repeats for chr1-25 and chrM mkdir -p /san/sanvol1/scratch/danRer3/linSpecRep.notInOthers foreach f (/cluster/data/danRer3/*/chr[0-9M]*.fa.out) cp -p $f \ /san/sanvol1/scratch/danRer3/linSpecRep.notInOthers/$f:t:r:r.out.spec end # make a nib dir without random chroms mkdir -p /san/sanvol1/scratch/hg18/chromNib cp -p /cluster/data/hg18/nib/chr*.nib \ /san/sanvol1/scratch/hg18/chromNib rm -r chr*_random.nib # make a nib dir that is also just chr1-25 and chrM mkdir -p /san/sanvol1/scratch/danRer3/chromNib cp /cluster/data/danRer3/nib/chr[0-9M]*.nib \ /san/sanvol1/scratch/danRer3/chromNib ssh kkstore02 mkdir /cluster/data/hg18/bed/blastz.danRer3.2005-12-17 cd /cluster/data/hg18/bed ln -s blastz.danRer3.2005-12-17 blastz.danRer3 # Three separate runs done to create chains. Runs 1 and 3 could be # combined into one. # RUN 1: hg18 chroms (no randoms) vs danRer3 chr1-25 and chrM using # lineage-specific repeats. ssh hgwdev cd /cluster/data/hg18/bed/blastz.danRer3 # make run dir mkdir -p /san/sanvol1/scratch/hg18/blastzDanRer3/chromsRun ln -s /san/sanvol1/scratch/hg18/blastzDanRer3/chromsRun # make out dir mkdir -p /san/sanvol1/scratch/hg18/blastzDanRer3/chromsOut ln -s /san/sanvol1/scratch/hg18/blastzDanRer3/chromsOut cd chromsRun # use parameters as for hg17 vs danRer2 - see makeHg17.doc cat << '_EOF_' > DEF # human (hg18) vs zebrafish (danRer3) export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/cluster/home/angie/schwartzbin ALIGN=blastz-run BLASTZ=blastz.v7.x86_64 # Reuse parameters from hg16-fr1, danRer-hg17 and mm5-danRer BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=6000 BLASTZ_K=2200 BLASTZ_Q=/cluster/data/blastz/HoxD55.q BLASTZ_ABRIDGE_REPEATS=1 # TARGET: Human (hg18) SEQ1_DIR=/san/sanvol1/scratch/hg18/chromNib SEQ1_RMSK= SEQ1_FLAG= SEQ1_SMSK=/san/sanvol1/scratch/hg18/linSpecRep.notInOthers SEQ1_LIMIT=30 SEQ1_IN_CONTIGS=0 SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY: Zebrafish (danRer3) # just chroms 1-25 and chrM SEQ2_DIR=/san/sanvol1/scratch/danRer3/chromNib SEQ2_RMSK= SEQ2_FLAG= SEQ2_SMSK=/san/sanvol1/scratch/danRer3/linSpecRep.notInOthers SEQ2_LIMIT=30 SEQ2_IN_CONTIGS=0 SEQ2_CHUNK=10000000 SEQ2_LAP=0 BASE=/san/sanvol1/scratch/hg18/blastzDanRer3/chromsRun DEF=$BASE/DEF RAW=$BASE/raw CDBDIR=$BASE SEQ1_LEN=$BASE/S1chroms.len SEQ2_LEN=$BASE/S2chroms.len TMPDIR=/scratch/tmp '_EOF_' # << this line keeps emacs coloring happy chmod +x DEF grep -v random /cluster/data/hg18/chrom.sizes > S1chroms.len grep -v chrUn /cluster/data/danRer3/chrom.sizes \ | grep -v chrNA > S2chroms.len # do blastz and create chains for danRer3 chr1-25 and chrM using # all repeats as lineage-specific repeats. # chickenHumanTuned.gap scoring matrix is now used by axtChain if the # linearGap parameter is set to "loose". nohup nice /cluster/bin/scripts/doBlastzChainNet.pl \ -bigClusterHub=pk \ -smallClusterHub=pk \ -workhorse=pk \ -blastzOutRoot /san/sanvol1/scratch/hg18/blastzDanRer3/chromsOut \ -chainMinScore=5000 \ -chainLinearGap loose \ -stop chainRun `pwd`/DEF >& doChains.log & # Took 2 hours 45 minutes to run. # Then run the human hg18 chroms and randoms vs danRer3 chrUn and chrNA ssh hgwdev # get file of scaffolds for hg18 randoms. 
# Use the Table Browser to
# select sequence from the whole genome for the ctgPos table of contigs,
# restricting to chrom like "%_random" in the Free-form query box of
# the filter.  Save the output as hg18RandomContigs.fa in
# /cluster/data/hg18/bed/blastz.danRer3.
    cd /cluster/data/hg18/bed/blastz.danRer3
    # get the position and contig name from the ctgPos table
    hgsql -N -e 'select chrom, chromStart, chromEnd, contig from ctgPos \
        where chrom like "%_random";' hg18 > contigPosAndNames.txt

    ssh kkstore02
    cd /cluster/data/hg18/bed/blastz.danRer3
    # change header to just the position
    perl -pi.bak -e 's/>.+range=(chr[0-9XY]+_random:[0-9]+\-[0-9]+).+/>$1/' \
        hg18RandomContigs.fa
    awk '{print "perl -pi.bak -e s/"$1":"$2+1"-"$3"/"$4"/ hg18RandomContigs.fa"}' \
        contigPosAndNames.txt > addContigNames.csh
    chmod +x addContigNames.csh
    # run script
    addContigNames.csh

    ssh hgwdev
    # make a 2 bit file of the chroms and random scaffolds
    cd /cluster/data/hg18
    set dir=/san/sanvol1/scratch/hg18
    faToTwoBit [1-9]/chr[1-9].fa [12][0-9]/chr[12][0-9].fa M/chrM.fa \
        X/chrX.fa Y/chrY.fa *hap[12]/chr*.fa \
        /cluster/data/hg18/bed/blastz.danRer3/hg18RandomContigs.fa \
        $dir/chromsAndRandoms.2bit
    twoBitInfo $dir/chromsAndRandoms.2bit $dir/chromsAndRandoms.len
    # make a 2 bit file for just the random scaffolds
    faToTwoBit /cluster/data/hg18/bed/blastz.danRer3/hg18RandomContigs.fa \
        $dir/randoms.2bit
    twoBitInfo $dir/randoms.2bit $dir/randoms.len
    # make sure all the random chrom contigs are included - should be 88.
    # make a 2 bit file for all the chroms and random chroms, make sure to
    # get the haplotype chrom sequences.
    faToTwoBit [1-9MXY]/chr*.fa [12][0-9]/chr*.fa *hap[12]/chr*.fa \
        $dir/hg18.2bit
    twoBitInfo $dir/hg18.2bit $dir/hg18Chroms.len
    twoBitInfo /san/sanvol1/scratch/danRer3/danRer3.2bit \
        /san/sanvol1/scratch/danRer3/danRer3Chroms.len
    # make file of scaffold lengths for NA and Un scaffolds
    twoBitInfo \
        /san/sanvol1/scratch/danRer3/scaffoldsNAandUn/danRer3NAandUnScaf.2bit \
        /san/sanvol1/scratch/danRer3/scaffoldsNAandUn/NAandUnScafs.len
    cd /cluster/data/hg18/bed/blastz.danRer3
    # make a lift file for the hg18 random contigs
    cat /cluster/data/hg18/*/lift/random.lft >> $dir/randomContigs.lft

# RUN 2: hg18 chroms and random chrom contigs vs danRer3 chrNA and
# chrUn scaffolds with no lineage-specific repeats, as there are too
# many scaffolds in chrNA and chrUn.  Use the dynamic masking function
# of blastz instead.
    # make run dir
    mkdir -p /san/sanvol1/scratch/hg18/blastzDanRer3/chromsAndRandomsRun
    ln -s /san/sanvol1/scratch/hg18/blastzDanRer3/chromsAndRandomsRun
    # make out dir
    mkdir -p /san/sanvol1/scratch/hg18/blastzDanRer3/chromsAndRandomsOut
    ln -s /san/sanvol1/scratch/hg18/blastzDanRer3/chromsAndRandomsOut
    cd chromsAndRandomsRun
    # use parameters similar to hg17 vs danRer2 - see makeHg17.doc
    # As lineage-specific repeats cannot be used with the chrUn and chrNA
    # scaffolds, use blastz's dynamic masking instead (M=50).
cat << '_EOF_' > DEF # human (hg18) vs zebrafish (danRer3) # human chroms and random chrom contigs vs zebrafish chrNA and chrUn scaffolds export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/cluster/home/angie/schwartzbin ALIGN=blastz-run BLASTZ=blastz.v7.x86_64 # Reuse some parameters from hg16-fr1, danRer-hg17 and mm5-danRer BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=6000 BLASTZ_K=2200 BLASTZ_M=50 BLASTZ_Q=/cluster/data/blastz/HoxD55.q BLASTZ_ABRIDGE_REPEATS=0 # TARGET: Human (hg18) SEQ1_DIR=/san/sanvol1/scratch/hg18/hg18.2bit SEQ1_CTGDIR=/san/sanvol1/scratch/hg18/chromsAndRandoms.2bit SEQ1_LIFT=/san/sanvol1/scratch/hg18/randomContigs.lft SEQ1_RMSK= SEQ1_FLAG= SEQ1_SMSK= SEQ1_LIMIT=30 SEQ1_IN_CONTIGS=0 # 500 kb target with 5 kb overlap SEQ1_CHUNK=500000 SEQ1_LAP=5000 # QUERY: Zebrafish (danRer3) # just scaffolds for chrUn and chrNA SEQ2_DIR=/san/sanvol1/scratch/danRer3/danRer3.2bit SEQ2_CTGDIR=/san/sanvol1/scratch/danRer3/scaffoldsNAandUn/danRer3NAandUnScaf.2bit SEQ2_LIFT=/san/sanvol1/scratch/danRer3/liftNAandUnScaffoldsToChrom.lft SEQ2_RMSK= SEQ2_FLAG= SEQ2_SMSK= SEQ2_IN_CONTIGS=0 SEQ2_CHUNK=1000000000 SEQ2_LAP=0 BASE=/san/sanvol1/scratch/hg18/blastzDanRer3/chromsAndRandomsRun DEF=$BASE/DEF RAW=$BASE/raw CDBDIR=$BASE SEQ1_LEN=/san/sanvol1/scratch/hg18/hg18Chroms.len SEQ1_CTGLEN=/san/sanvol1/scratch/hg18/chromsAndRandoms.len SEQ2_LEN=/san/sanvol1/scratch/danRer3/danRer3Chroms.len SEQ2_CTGLEN=/san/sanvol1/scratch/danRer3/scaffoldsNAandUn/NAandUnScafs.len TMPDIR=/scratch/tmp '_EOF_' # << this line keeps emacs coloring happy chmod +x DEF # do blastz and create chains for human chroms and random chroms in contigs # vs zebrafish danRer3 chrNA and chrUn in scaffolds without # lineage-specific repeats but using blastz's dynamic masking. # chickenHumanTuned.gap scoring matrix is now used by axtChain if the # linearGap parameter is set to "loose". nohup nice /cluster/bin/scripts/doBlastzChainNet.pl \ -bigClusterHub=pk \ -smallClusterHub=pk \ -workhorse=pk \ -blastzOutRoot /san/sanvol1/scratch/hg18/blastzDanRer3/chromsAndRandomsOut \ -chainMinScore=5000 \ -chainLinearGap loose \ -stop chainRun `pwd`/DEF >& doChains.log & # Took about 15 hours to finish. ssh hgwdev # Try running hg18 random chroms in contigs vs danRer3 chroms 1-25 and chrM # with lineage-specific repeats. # make directory of human contigs repeats to serve as lineage-specific # repeats for the random chroms contigs. mkdir -p /san/sanvol1/scratch/hg18/linSpecRepRandoms.notInOthers cd /cluster/data/hg18/bed/blastz.danRer3 awk '{print $4}' contigPosAndNames.txt > contigNames.txt foreach c (`cat contigNames.txt`) foreach f (/cluster/data/hg18/*/${c}/${c}.fa.out) cp -p $f \ /san/sanvol1/scratch/hg18/linSpecRepRandoms.notInOthers/$f:t:r:r.out.spec end end # RUN 3: hg18 random chroms contigs vs danRer3 chr1-25 and chrM using # lineage-specific repeats. 
# make run dir mkdir -p /san/sanvol1/scratch/hg18/blastzDanRer3/randomsRun ln -s /san/sanvol1/scratch/hg18/blastzDanRer3/randomsRun # make out dir mkdir -p /san/sanvol1/scratch/hg18/blastzDanRer3/randomsOut ln -s /san/sanvol1/scratch/hg18/blastzDanRer3/randomsOut set dir=/san/sanvol1/scratch cp $dir/hg18/blastzDanRer3/chromsRun/S2chroms.len \ $dir/danRer3/chr1to25andM.len # make nib dir for random contigs for hg18 mkdir -p $dir/hg18/randomContigsNib foreach c (`cat contigNames.txt`) foreach f (/cluster/data/hg18/*/${c}/${c}.fa) faToNib -softMask $f $dir/hg18/randomContigsNib/$f:t:r.nib end end cd randomsRun cat << '_EOF_' > DEF # human (hg18) vs zebrafish (danRer3) # human random chrom contigs vs zebrafish chr1-15 and chrM export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/cluster/home/angie/schwartzbin ALIGN=blastz-run BLASTZ=blastz.v7.x86_64 # Reuse parameters from hg16-fr1, danRer-hg17 and mm5-danRer BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=6000 BLASTZ_K=2200 BLASTZ_Q=/cluster/data/blastz/HoxD55.q BLASTZ_ABRIDGE_REPEATS=1 # TARGET: Human (hg18) SEQ1_DIR=/san/sanvol1/scratch/hg18/hg18.2bit SEQ1_CTGDIR=/san/sanvol1/scratch/hg18/randomContigsNib SEQ1_LIFT=/san/sanvol1/scratch/hg18/randomContigs.lft SEQ1_RMSK= SEQ1_FLAG= SEQ1_SMSK=/san/sanvol1/scratch/hg18/linSpecRepRandoms.notInOthers SEQ1_LIMIT=30 SEQ1_IN_CONTIGS=0 SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY: Zebrafish (danRer3) # just chr1-25 and chrM SEQ2_DIR=/san/sanvol1/scratch/danRer3/chromNib SEQ2_RMSK= SEQ2_FLAG= SEQ2_LIMIT=30 SEQ2_SMSK=/san/sanvol1/scratch/danRer3/linSpecRep.notInOthers SEQ2_IN_CONTIGS=0 SEQ2_CHUNK=10000000 SEQ2_LAP=0 BASE=/san/sanvol1/scratch/hg18/blastzDanRer3/randomsRun DEF=$BASE/DEF RAW=$BASE/raw CDBDIR=$BASE SEQ1_LEN=/san/sanvol1/scratch/hg18/hg18Chroms.len SEQ1_CTGLEN=/san/sanvol1/scratch/hg18/randoms.len SEQ2_LEN=/san/sanvol1/scratch/danRer3/chr1to25andM.len TMPDIR=/scratch/tmp '_EOF_' # << this line keeps emacs coloring happy chmod +x DEF # do blastz and create chains for human random chroms in contigs # vs zebrafish danRer3 chroms 1 to 25 and chrM using all repeats # as lineage-specific repeats. # chickenHumanTuned.gap scoring matrix is now used by axtChain if the # linearGap parameter is set to "loose". nohup nice /cluster/bin/scripts/doBlastzChainNet.pl \ -bigClusterHub=pk \ -smallClusterHub=pk \ -workhorse=pk \ -blastzOutRoot /san/sanvol1/scratch/hg18/blastzDanRer3/randomsOut \ -chainMinScore=5000 \ -chainLinearGap loose \ -stop chainRun `pwd`/DEF >& doChains.log & # Took 15 minutes. # chains are sorted by score so move into one directory and use # chainMergeSort ssh kolossus set blastzDir=/cluster/data/hg18/bed/blastz.danRer3 cd $blastzDir/chromsRun/axtChain mkdir -p chainsNotMerged foreach r (chromsRun chromsAndRandomsRun randomsRun) nice cp -p ${blastzDir}/${r}/axtChain/run/chain/*.chain \ ${blastzDir}/chromsRun/axtChain/chainsNotMerged/ end nice chainMergeSort ./chainsNotMerged/*.chain | nice gzip -c \ > hg18.danRer3.all.chain.gz # split into chains by chrom nice zcat hg18.danRer3.all.chain.gz | chainSplit chain stdin # check chains, there are 48 should be 49. Chains for chr11_random # are missing. These sequences have a lot of repeats in the regions that # hits danRer3 with BLAT. # carry on with doBlastzChainNet.pl starting from net step ssh hgwdev cd /cluster/data/hg18/bed/blastz.danRer3/chromsRun mv DEF DEF.chroms # edit DEF to give hg18.2bit as the SEQ1_DIR and danRer3.2bit as SEQ2_DIR # and remove lineage-specfic repeats. 
nohup nice /cluster/bin/scripts/doBlastzChainNet.pl \ -bigClusterHub=pk \ -smallClusterHub=pk \ -workhorse=pk \ -blastzOutRoot /san/sanvol1/scratch/hg18/blastzDanRer3/chromsOut \ -chainMinScore=5000 \ -chainLinearGap loose \ -continue net `pwd`/DEF >& doNetAndDownloads.log & # Took about 25 minutes. # crashed on ssh -X sanhead1 for cleanup so re-run script cleanUp.csh # copy chainDanRer3.html and netDanRer3.html to # kent/src/hg/makeDb/trackDb/human/hg18/ and edit to describe method used. # Add tracks to trackDb.ra there. Edit README.txt in the downloads # directory to describe method used for alignments. # featureBits -chrom=chr1 hg18 refGene:cds chainDanRer3Link -enrichment # refGene:cds 1.378%, chainDanRer3Link 2.601%, both 0.927%, cover 67.26%, # enrich 25.86x # featureBits -chrom=chr1 hg17 refGene:cds chainDanRer2Link -enrichment # refGene:cds 1.386%, chainDanRer3Link 2.742%, both 0.909%, cover 65.58%, # enrich 23.91x # So similar coverage and enrichment to hg17 vs danRer2 chains. ######################################################################### # BLASTZ MOUSE Mm7 second time (DONE - 2005-12-24 - 2005-12-25 Fan) # After fixing a bug in the lineage specific repeat snip business # in blastz-run-ucsc script ssh pk mkdir /cluster/data/hg18/bed/blastzMm7.2005-12-24 cd /cluster/data/hg18/bed rm blastz.mm7 ln -s blastzMm7.2005-12-24 blastz.mm7 cd blastzMm7.2005-12-24 cat << '_EOF_' > DEF # human vs mouse export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/cluster/home/angie/schwartzbin:/parasol/bin BLASTZ=blastz.v7.x86_64 BLASTZ_ABRIDGE_REPEATS=1 # TARGET: Human Hg18 SEQ1_DIR=/scratch/hg/hg18/nib SEQ1_SMSK=/scratch/hg/hg18/linSpecRep/notInMouse SEQ1_CHUNK=10000000 SEQ1_LAP=10000 SEQ1_LEN=/scratch/hg/hg18/chrom.sizes # QUERY: Mouse Mm7 - single chunk big enough to run entire genome SEQ2_DIR=/scratch/hg/mm7/nib SEQ2_SMSK=/scratch/hg/mm7/linSpecRep/notInHumanDogCow SEQ2_LEN=/cluster/bluearc/mm7/chrom.sizes SEQ2_CHUNK=3000000000 SEQ2_LAP=0 BASE=/cluster/data/hg18/bed/blastzMm7.2005-12-24 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs # establish a screen to control this job screen bash time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \ -stop=load \ `pwd`/DEF > to-load.out 2>&1 & # Started 2005-12-24 06:15 mv to-load.out to-load.out.1 time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \ -continue=chainMerge -stop=load \ `pwd`/DEF > to-load.out 2>&1 & time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \ -swap -stop=load \ `pwd`/DEF > swap-load.out 2>&1 & time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \ -continue=download \ `pwd`/DEF > download.out 2>&1 & time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \ -swap -continue=download \ `pwd`/DEF > swap-download.out 2>&1 & # PLEASE NOTE THAT SOME .OUT FILES MIGHT HAVE BEEN OVERWRITTEN # DUE TO RETRIES AND/OR NEXT STEP COMMAND NOT FULLY EDITED CORRECTLY. # Measurements: ssh hgwdev featureBits mm7 chainHg18Link # 990285408 bases of 2583394090 (38.333%) in intersection featureBits hg18 chainMm7Link # 991769039 bases of 2881515245 (34.418%) in intersection # each of above took about half hour. 
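# (Optional check, a sketch prompted by the note above that some .out log
# files may have been overwritten by the retries: look at the tail of each
# surviving doBlastzChainNet.pl log to confirm the run reached its final
# step before moving on.)
    ssh pk
    cd /cluster/data/hg18/bed/blastzMm7.2005-12-24
    foreach f (*.out)
        echo "==== $f"
        tail -2 $f
    end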
######################################################################### # BLASTZ CHICKEN GalGal2 second time (DONE - 2005-12-28 Fan) ssh pk mkdir /cluster/data/hg18/bed/blastzGalGal2.2005-12-28 cd /cluster/data/hg18/bed rm blastz.galGal2 ln -s blastzGalGal2.2005-12-28 blastz.galGal2 cd blastzGalGal2.2005-12-28 cat << '_EOF_' > DEF # human vs chicken export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/cluster/home/angie/schwartzbin:/parasol/bin BLASTZ=blastz.v7.x86_64 # Specific settings for chicken (per Webb email to Brian Raney) BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=10000 BLASTZ_K=2200 BLASTZ_Q=/cluster/data/blastz/HoxD55.q BLASTZ_ABRIDGE_REPEATS=1 # TARGET: Human Hg18 SEQ1_DIR=/scratch/hg/hg18/nib SEQ1_SMSK=/cluster/bluearc/hg18/linSpecRep/notInOthers SEQ1_LEN=/scratch/hg/hg18/chrom.sizes SEQ1_IN_CONTIGS=0 SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY: Chicken GalGal2 - single chunk big enough to run entire genome SEQ2_DIR=/scratch/hg/galGal2/nib SEQ2_LEN=/cluster/bluearc/galGal2/chrom.sizes SEQ2_SMSK=/scratch/hg/galGal2/linSpecRep SEQ2_IN_CONTIGS=0 SEQ2_CHUNK=200000000 SEQ2_LAP=0 BASE=/cluster/data/hg18/bed/blastzGalGal2.2005-12-28 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs # establish a screen to control this job screen bash time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \ -stop=load \ `pwd`/DEF > load.out 2>&1 & # Started 2005-12-28 10:35 # Two jobs stuck in the same node. Did manual para stop and para push. # Both finished within a few minutes. # Done! On Wed Dec 28 15:32:45 PST 2005. time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \ -swap -stop=load \ `pwd`/DEF > swap-load.out 2>&1 & # Had an error at the net step time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \ -swap -continue=net -stop=load \ `pwd`/DEF > swap-load.out 2>&1 & time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \ -continue=download \ `pwd`/DEF > download.out 2>&1 & # the gzip job on kolossus seems not moving at all. # killed it manually. Try again. # Seemed not moving, kill it again. Now use pk instead of kolossus. time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -workhorse=pk \ -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \ -continue=download \ `pwd`/DEF > download.out 2>&1 & time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -workhorse=pk \ -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \ -swap -continue=download \ `pwd`/DEF > swap-download.out 2>&1 & # Done! 
Wed Dec 28 20:39:44 PST 2005 # Measurements: ssh hgwdev nice featureBits galGal2 chainHg18Link # 91564024 bases of 1054197620 (8.686%) in intersection nice featureBits hg18 chainGalGal2Link # 102417858 bases of 2881515245 (3.554%) in intersection nice featureBits galGal2 chainHg17Link # 93277286 bases of 1054197620 (8.848%) in intersection nice featureBits hg17 chainGalGal2Link # 103882699 bases of 2866216770 (3.624%) in intersection ######################################################################### # BLASTZ DOG CanFam2 time (DONE - 2005-12-28 - 2005-12-29 Fan) ssh pk mkdir /cluster/data/hg18/bed/blastzCanFam2.2005-12-28 cd /cluster/data/hg18/bed rm blastz.canFam2 ln -s blastzCanFam2.2005-12-28 blastz.canFam2 cd blastzCanFam2.2005-12-28 cat << '_EOF_' > DEF # human vs dog export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/cluster/home/angie/schwartzbin:/parasol/bin BLASTZ=blastz.v7.x86_64 # Specific settings for dog (per Webb email to Brian Raney) BLASTZ_ABRIDGE_REPEATS=1 # TARGET: Human Hg18 SEQ1_DIR=/scratch/hg/hg18/nib SEQ1_SMSK=/cluster/bluearc/hg18/linSpecRep/notInOthers SEQ1_LEN=/scratch/hg/hg18/chrom.sizes SEQ1_IN_CONTIGS=0 SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY: Dog CanFam2 - single chunk big enough to run entire genome SEQ2_DIR=/scratch/hg/canFam2/nib SEQ2_LEN=/cluster/bluearc/canFam2/chrom.sizes SEQ2_SMSK=/san/sanvol1/scratch/canFam2/linSpecRep.notInHuman SEQ2_IN_CONTIGS=0 SEQ2_CHUNK=200000000 SEQ2_LAP=0 BASE=/cluster/data/hg18/bed/blastzCanFam2.2005-12-28 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs # establish a screen to control this job screen bash time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \ -stop=load \ `pwd`/DEF > load.out 2>&1 & # Started 2005-12-28 21:33 # Two jobs stuck in the same node. Did manual para stop and para push. # Both finished within a few minutes. # Done! On Thu Dec 29 05:27:31 PST 2005. # system seems hang on kolossus (3 processes of [tcsh -c nice chainMergeSort], not moving) # manually killed the jobs. # now use pk as the workhorse. time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \ -workhorse=pk \ -continue=chainMerge \ -stop=load \ `pwd`/DEF > load2.out 2>&1 & # Done! Thu Dec 29 09:10:02 PST 2005. time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \ -workhorse=pk \ -swap -stop=load \ `pwd`/DEF > swap-load.out 2>&1 & # Had an error at the load step, # mySQL error 2013: Lost connection to MySQL server during query, # probably due to sys admin working on network connections, # continue at the load step time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \ -workhorse=pk \ -swap -continue=load -stop=load \ `pwd`/DEF > swap-load2.out 2>&1 & time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \ -workhorse=pk \ -continue=download \ `pwd`/DEF > download.out 2>&1 & time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -workhorse=pk \ -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \ -swap -continue=download \ `pwd`/DEF > swap-download.out 2>&1 & # Done! 
Dec 29 13:21 # Measurements: ssh hgwdev nice featureBits canFam2 chainHg18Link # 1477551526 bases of 2384996543 (61.952%) in intersection nice featureBits hg18 chainCanFam2Link # 1524764349 bases of 2881515245 (52.915%) in intersection nice featureBits canFam2 chainHg17Link # 1487483112 bases of 2384996543 (62.368%) in intersection nice featureBits hg17 chainCanFam2Link # 1530197469 bases of 2866216770 (53.387%) in intersection # ENABLE GENBANK UPDATE (1/3/06 Fan) # add hg18 to the following two files and check them in. src/hg/makeDb/genbank/etc/align.dbs src/hg/makeDb/genbank/etc/hgwdev.dbs # then go to /cluster/data/genbank/etc and do cvs update on these two files. ######################################################################### # BLASTZ RAT Rn3 (STARTED - 2005-12-22, DONE 2006-01-05 Fan) ssh pk mkdir /cluster/data/hg18/bed/blastzRn3.2005-12-22 cd /cluster/data/hg18/bed rm blastz.rn3 ln -s blastzRn3.2005-12-22 blastz.rn3 cd blastzRn3.2005-12-22 cat << '_EOF_' > DEF # human vs rat export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/cluster/home/angie/schwartzbin:/parasol/bin BLASTZ=blastz.v7.x86_64 BLASTZ_ABRIDGE_REPEATS=1 # TARGET: Muman Hg18 SEQ1_DIR=/scratch/hg/hg18/nib SEQ1_SMSK=/scratch/hg/hg18/linSpecRep/notInRat SEQ1_LEN=/scratch/hg/hg18/chrom.sizes SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY: Rat Rn3 - chunk big enough to do all chroms in single whole pieces SEQ2_DIR=/scratch/rat/rn3/softNib SEQ2_SMSK=/cluster/bluearc/rat/rn3/linSpecRep.notInHuman SEQ2_LEN=/cluster/bluearc/rat/rn3/chrom.sizes SEQ2_CHUNK=300000000 SEQ2_LAP=0 BASE=/cluster/data/hg18/bed/blastzRn3.2005-12-22 TMPDIR=/scratch/tmp '_EOF_' # happy emacs # establish a screen to control this job screen bash time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \ -stop=load \ `pwd`/DEF > to-load.out 2>&1 & # start processing again on 12/31/05. time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \ -workhorse=pk \ -swap \ -stop=load \ `pwd`/DEF > swap.out 2>&1 & # Either UCSC RR and hgwdev systems or network went down around 11 AM 12/31/05. # After holidays, start again on 1/3/06 and again on 1/5/06. ssh pk cd /cluster/data/hg18/bed cd blastzRn3.2005-12-22 screen bash time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \ -workhorse=pk \ -swap \ -continue=net \ -stop=load \ `pwd`/DEF > swap6.out 2>&1 & # DONE! Jan 5 13:39 # Measurements: nice featureBits rn3 chainHg18Link # 962630574 bases of 2571104688 (37.440%) in intersection nice featureBits hg18 chainRn3Link # 964251210 bases of 2881515245 (33.463%) in intersection ######################################################################### # BLASTZ FUGU fr1 (STARTED - 2005-12-20, DONE 2006-01-05 Fan) ssh pk mkdir /cluster/data/hg18/bed/blastzFr1.2005-12-20 cd /cluster/data/hg18/bed ln -s blastzFr1.2005-12-20 blastz.fr1 cd blastzFr1.2005-12-20 cat << '_EOF_' > DEF # human vs. 
fugu export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/cluster/home/angie/schwartzbin:/parasol/bin BLASTZ=blastz.v7.x86_64 # Reuse parameters from human-chicken, except L=6000 (more relaxed) BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=6000 BLASTZ_K=2200 BLASTZ_Q=/cluster/data/blastz/HoxD55.q # TARGET: Human Hg18 - testing 100,000,000 sized chunk on pk kluster SEQ1_DIR=/scratch/hg/hg18/nib SEQ1_LEN=/scratch/hg/hg18/chrom.sizes SEQ1_CHUNK=100000000 SEQ1_LAP=10000 # QUERY: Fugu Fr1 - chunk big enough to run the whole chrom at once SEQ2_DIR=/san/sanvol1/scratch/fr1/nib SEQ2_LEN=/san/sanvol1/scratch/fr1/chrom.sizes SEQ2_CHUNK=400000000 SEQ2_LAP=0 BASE=/cluster/data/hg18/bed/blastzFr1.2005-12-20 '_EOF_' # << happy emacs # establish a screen to control this job ssh pk cd /cluster/data/hg18/bed/blastzFr1.2005-12-20 screen bash time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=5000 -stop=load \ `pwd`/DEF > thruLoad.out 2>&1 & ssh pk cd /cluster/data/hg18/bed/blastzFr1.2005-12-20 screen bash time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=5000 -continue=chainMerge -stop=load \ `pwd`/DEF > thruLoad.out 2>&1 & time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=5000 -continue=download \ `pwd`/DEF > download.clean.out 2>&1 & time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=5000 -swap \ `pwd`/DEF > swap.out 2>&1 & # Finish the remaining step, 1/4/05. ssh pk cd /cluster/data/hg18/bed/blastzFr1.2005-12-20 screen bash time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=5000 \ -swap -continue=download \ `pwd`/DEF > DownloadSwap.out 2>&1 & # First try found the DEF was some how altered for rn3. # Re-generated DEF and try again. time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=5000 \ -swap -continue=download \ `pwd`/DEF > DownloadSwap2.out 2>&1 & # Done. Jan 4 09:48. 
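# The rn3/fr1 DEF mix-up above suggests a quick guard before any -continue
# run: confirm the DEF still points at the intended BASE (sketch):
#   grep "^BASE=" `pwd`/DEF
#   # expect BASE=/cluster/data/hg18/bed/blastzFr1.2005-12-20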
# measurements nice featureBits hg18 chainFr1Link # 51795958 bases of 2881515245 (1.798%) in intersection nice featureBits hg17 chainFr1Link #50831650 bases of 2866216770 (1.773%) in intersection nice featureBits hg18 netFr1 # 691148929 bases of 2881515245 (23.986%) in intersection nice featureBits hg17 netFr1 # 714234935 bases of 2866216770 (24.919%) in intersection nice featureBits fr1 chainHg18Link # 43267869 bases of 315518167 (13.713%) in intersection # nice featureBits fr1 chainHg17Link 0 bases of 315518167 (0.000%) in intersection nice featureBits fr1 netHg18 # 140843080 bases of 315518167 (44.639%) in intersection nice featureBits fr1 netHg17 # 0 bases of 315518167 (0.000%) in intersection # BLASTZ TETRAODON TetNig1 second time (DONE - 2006-01-07 Fan) ssh pk mkdir /cluster/data/hg18/bed/blastzTetNig1.2006-01-07 cd /cluster/data/hg18/bed rm blastz.tetNig1 ln -s blastzTetNig1.2006-01-07 blastz.tetNig1 cd blastzTetNig1.2006-01-07 cat << '_EOF_' > DEF # human vs tetraodon export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/cluster/home/angie/schwartzbin:/parasol/bin BLASTZ=blastz.v7.x86_64 BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=6000 BLASTZ_K=2200 BLASTZ_Q=/cluster/data/blastz/HoxD55.q # TARGET: Human Hg18 SEQ1_DIR=/scratch/hg/hg18/nib SEQ1_LEN=/scratch/hg/hg18/chrom.sizes SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY: Tetraodon TetNig1 - single chunk big enough to run entire genome SEQ2_DIR=/san/sanvol1/scratch/tetNig1/tetNig1.2bit SEQ2_LEN=/san/sanvol1/scratch/tetNig1/chrom.sizes SEQ2_CHUNK=410000000 SEQ2_LAP=0 BASE=/cluster/data/hg18/bed/blastzTetNig1.2006-01-07 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs # establish a screen to control this job screen bash time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \ -stop=load \ `pwd`/DEF > load.out 2>&1 & # Started Sat Jan 7 05:40:51 PST 2006 # Encountered an error: startStep: 0, at step 5 net to stopStep 6 netChains: looks like previous stage was not successful (can't find [hg18.tetNig1.]all.chain[.gz]). # Try it with pk as the workhorse. time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \ -workhorse=pk \ -continue=net \ -stop=load \ `pwd`/DEF > load2.out 2>&1 & # Load done. Sat Jan 7 07:34:56 PST 2006 time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \ -workhorse=pk \ -swap -stop=load \ `pwd`/DEF > swap-load.out 2>&1 & time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \ -workhorse=pk \ -continue=download \ `pwd`/DEF > download.out 2>&1 & time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \ -workhorse=pk \ -swap -continue=download \ `pwd`/DEF > swap-download.out 2>&1 & # Done! Sat Jan 7 08:02:14 PST 2006 # The download and swap-download took less than 10 seconds each. ??? 
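# To settle the "???" above, one could simply list the download area and make
# sure files actually landed (sketch; assumes the usual goldenPath layout on
# hgwdev, not verified here):
#   ls -l /usr/local/apache/htdocs/goldenPath/hg18/vsTetNig1/
#   ls -l /usr/local/apache/htdocs/goldenPath/tetNig1/vsHg18/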
# Measurements: ssh hgwdev nice featureBits tetNig1 chainHg18Link # 50026847 bases of 342403326 (14.611%) in intersection nice featureBits hg18 chainTetNig1Link # 57654754 bases of 2881515245 (2.001%) in intersection nice featureBits tetNig1 chainHg17Link # 34379509 bases of 342403326 (10.041%) in intersection nice featureBits hg17 chainTetNig1Link # 35910128 bases of 2866216770 (1.253%) in intersection # BLASTZ FROG XenTro1 second time (STARTED - 2006-01-06 Fan) ssh pk mkdir /cluster/data/hg18/bed/blastzXenTro1.2006-01-06 cd /cluster/data/hg18/bed rm blastz.xenTro1 ln -s blastzXenTro1.2006-01-06 blastz.xenTro1 cd blastzXenTro1.2006-01-06 cat << '_EOF_' > DEF # human vs frog export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/cluster/home/angie/schwartzbin:/parasol/bin BLASTZ=blastz.v7.x86_64 BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=8000 BLASTZ_K=2200 BLASTZ_Q=/cluster/data/blastz/HoxD55.q # TARGET: Human Hg18 SEQ1_DIR=/scratch/hg/hg18/nib SEQ1_LEN=/scratch/hg/hg18/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: Frog XenTro1 - single chunk big enough to run entire genome SEQ2_DIR=/scratch/hg/xenTro1/xenTro1.2bit SEQ2_LEN=/scratch/hg/xenTro1/chrom.sizes SEQ2_LIMIT=400 SEQ2_CHUNK=30000000 SEQ2_LAP=0 BASE=/cluster/data/hg18/bed/blastzXenTro1.2006-01-06 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs # establish a screen to control this job screen bash time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \ -stop=load \ `pwd`/DEF > load.out 2>&1 & # Started Fri Jan 6 20:19:30 PST 2006 # Blastz run done. Jan 7 02:07 load.out time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \ -swap -stop=load \ `pwd`/DEF > swap-load.out 2>&1 & # got the following error: startStep: 4, at step 5 net to stopStep 6 netChains: looks like previous stage was not successful (can't find [xenTro1.hg18.]all.chain[.gz]). # Try it with pk instead of kolossus: time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \ -workhorse=pk \ -swap -stop=load \ `pwd`/DEF > swap-load2.out 2>&1 & # It worked, swap-load done. Jan 7 06:05 time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \ -workhorse=pk \ -continue=download \ `pwd`/DEF > download.out 2>&1 & time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -workhorse=pk \ -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \ -swap -continue=download \ `pwd`/DEF > swap-download.out 2>&1 & # Done! 
Jan 7 06:18 # Measurements: ssh hgwdev nice featureBits xenTro1 chainHg18Link # 61197900 bases of 1381238994 (4.431%) in intersection nice featureBits hg18 chainXenTro1Link # 67810866 bases of 2881515245 (2.353%) in intersection nice featureBits xenTro1 chainHg17Link # 81777842 bases of 1381238994 (5.921%) in intersection nice featureBits hg17 chainXenTro1Link # 85701475 bases of 2866216770 (2.990%) in intersection ############################################################################ # BLASTZ COW BosTau2 second time (STARTED - 2006-01-07, DONE 2006-01-08 Fan) ssh pk mkdir /cluster/data/hg18/bed/blastzBosTau2.2006-01-07 cd /cluster/data/hg18/bed rm blastz.bosTau2 ln -s blastzBosTau2.2006-01-07 blastz.bosTau2 cd blastzBosTau2.2006-01-07 cat << '_EOF_' > DEF # human vs cow export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/cluster/home/angie/schwartzbin:/parasol/bin BLASTZ=blastz.v7.x86_64 BLASTZ_M=50 # TARGET: Human Hg18 SEQ1_DIR=/scratch/hg/hg18/nib SEQ1_LEN=/scratch/hg/hg18/chrom.sizes SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY: Cow BosTau2 - single chunk big enough to run entire genome SEQ2_DIR=/san/sanvol1/scratch/bosTau2/bosTau2.2bit SEQ2_LEN=/san/sanvol1/scratch/bosTau2/chrom.sizes SEQ2_CHUNK=3200000000 SEQ2_LAP=0 BASE=/cluster/data/hg18/bed/blastzBosTau2.2006-01-07 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs # establish a screen to control this job screen bash time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \ -stop=load \ -workhorse=pk \ `pwd`/DEF > load.out 2>&1 & # Started Sat Jan 7 07:57:22 PST 2006 # blastz run (and load) done Jan 8 00:13 time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \ -swap -stop=load \ `pwd`/DEF > swap-load.out 2>&1 & # took a long time to finish. time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \ -continue=download \ `pwd`/DEF > download.out 2>&1 & time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \ -swap -continue=download \ `pwd`/DEF > swap-download.out 2>&1 & # Done! 
Jan 8 21:10
# Measurements:
ssh hgwdev
nice featureBits bosTau2 chainHg18Link
# 1357027317 bases of 2812203870 (48.255%) in intersection
nice featureBits hg18 chainBosTau2Link
# 1357291762 bases of 2881515245 (47.103%) in intersection
nice featureBits bosTau2 chainHg17Link
# 0 bases of 2812203870 (0.000%) in intersection
nice featureBits hg17 chainBosTau2Link
# 1350076765 bases of 2866216770 (47.103%) in intersection
#######################################################################
# MAKE 11.OOC FILE FOR BLAT (DONE - 2006-01-11 - Fan)
ssh kkstore02
cd /cluster/data/hg18
blat hg18.2bit \
    /dev/null /dev/null -tileSize=11 -makeOoc=11.ooc -repMatch=1024
# Wrote 30378 overused 11-mers to 11.ooc
# Copy over to the bluearc
cp -p 11.ooc /cluster/bluearc/hg18
#######################################################################
# PLACE ASSEMBLY CLONES ON CONTIGS AND SEQUENCE
# (DONE - 2006-01-12 - 2006-04-04 - Hiram)
# (RE-DONE 2006-10-31 - Hiram - see section:)
# REWORK PLACE ASSEMBLY CLONES ON CONTIGS AND SEQUENCE
ssh kkstore02
mkdir /cluster/data/hg18/bed/coverage
cd /cluster/data/hg18/bed/coverage
# find all the clones that were used in the assembly
sed -e "/^#.*/d" ../../ncbi_build36.agp | \
    awk '{if (!match($5,"N")) {print $6}}' | \
    sort -u > placed_in_assembly.list
wc -l placed_in_assembly.list
# 27093 placed_in_assembly.list
# And all possible clones considered for assembly.
# The AADB clones are the Celera assembly, don't want them.
sed -e "/^#.*/d" /cluster/store11/gs.19/ncbi/sequence.inf | \
    grep for_assembly | grep -v AADB | awk '{print $1}' | sort -u \
    > allButOneClonesConsidered.list
(grep AADB01066164.1 \
    /cluster/store11/gs.19/ncbi/sequence.inf | awk '{print $1}'; \
    cat allButOneClonesConsidered.list) | sort -u \
    > allClonesConsidered.list
# The grep for AADB eliminates a single clone: AADB01066164.1
# Which actually should be in the list since it is in the
# ncbi_build36.agp file.  Back in Hg17, this was the only AADB
# clone in the sequence.inf file, now there are 400,673 of them in
# this Hg18 sequence.inf file marked "for_assembly"
# Later after a lot of this was done, it was discovered that some
# of the clones on this allConsidered list are actually obsolete
# and have newer versions in use.  They were identified by the
# following perl script:
cat << '_EOF_' > ckMultipleVersions.pl
#!/usr/bin/env perl
use warnings;
use strict;
sub usage() {
    print "usage: ./ckMultipleVersions.pl allClonesConsidered.list\n";
    exit 255;
}
my $argc = scalar(@ARGV);
if ($argc != 1) { usage; }
my $fileName = shift;
open (FH,"<$fileName") or die "Can not open $fileName";
my %cloneAcc;  # key is clone accession major number, value is version
while (my $clone = <FH>) {
    chomp $clone;
    my ($major, $version) = split('\.', $clone);
    if (exists($cloneAcc{$major})) {
        my $previousVersion = $cloneAcc{$major};
        if ($previousVersion >= $version) {
            printf STDERR "$major.$version - obsolete\n";
        } else {
            printf STDERR "$major.$previousVersion - obsolete\n";
            $cloneAcc{$major} = $version;
        }
    } else {
        $cloneAcc{$major} = $version;
    }
}
close (FH);
foreach my $major (sort keys %cloneAcc) {
    printf "$major.$cloneAcc{$major}\n";
}
'_EOF_'
# happy emacs
chmod +x ckMultipleVersions.pl
./ckMultipleVersions.pl allClonesConsidered.list \
    2> obsoleteClone.list > allClones.notObsolete.list
# After this obsolete list was made, those clone results were
# removed from the kluster run hierarchies of results.
# And when we finally got to loading up the coverage track
# 2006-04-04, a few additional ones had crept into the mix.
# These were added to this list at that loading time. comm -12 allClonesConsidered.list \ /cluster/data/hg17/bed/contig_overlaps/sequence.list \ > allClones.InHg17AndHg18.list comm -23 allClonesConsidered.list \ /cluster/data/hg17/bed/contig_overlaps/sequence.list \ > allClones.InHg18NotHg17.list comm -13 allClonesConsidered.list \ /cluster/data/hg17/bed/contig_overlaps/sequence.list \ > allClones.InHg17NotHg18.list # how many are the same as previous build: comm -12 /cluster/data/hg17/bed/contig_overlaps/placed_in_assembly.list \ placed_in_assembly.list > sameAsHg17.list wc sameAsHg17.list # 26775 26775 300641 sameAsHg17.list # There is one clone: AADB01066164.1 # Which is listed in allClones.InHg17NotHg18.list # But it is on the Hg18 placed_in_assembly.list # And it is on the Hg17 placed_in_assembly.list but it isn't # actually found in Hg17 ? Perhaps it didn't align good enough. comm -23 /cluster/data/hg17/bed/contig_overlaps/placed_in_assembly.list \ placed_in_assembly.list > uniqueToHg17.list wc uniqueToHg17.list # 97 97 1080 uniqueToHg17.list # and unique to hg18, not in hg17: comm -13 /cluster/data/hg17/bed/contig_overlaps/placed_in_assembly.list \ placed_in_assembly.list > newToHg18.list wc newToHg18.list # 318 318 3547 newToHg18.list # make a list of these new contigs: # using the previous perl scripts: cp -p /cluster/data/hg17/bed/contig_overlaps/*.pl . # Now, we need to distribute the clone sequence files in a # directory hierarchy by chrom name. Using the contigAcc.pl file # from the previous release: cp /cluster/data/hg17/bed/contig_overlaps/contigAcc.pl . # This newer version is generalized a bit better to take command # line arguments for the two files it is to read instead of having # them explicitly in the code, then: ./contigAcc.pl /cluster/data/hg18/ncbi_build36.agp \ /cluster/data/hg18/seq_contig.md > cloneToChrom.list 2>&1 # And now, since most of the clone sequence already exists in the # Hg17 work directory, we only need to make symlinks to the # existing ones, and move only the new ones. The following script # will find an existing copy and symlink it correctly. cat << '_EOF_' > createPlacedHierarchy.sh #!/bin/sh mkdir -p placedClones sed -e "/^#.*/d" cloneToChrom.list | while read L do CHROM=`echo "${L}" | awk '{print $1}'` CLONE=`echo "${L}" | awk '{print $2}'` if [ ! 
-d "placedClones/${CHROM}" ]; then mkdir placedClones/${CHROM} fi HG17_version="/cluster/data/hg17/bed/contig_overlaps/${CHROM}/${CLONE}" HG18_version_0="/cluster/data/hg18/bed/coverage/newToHg18/${CLONE}" HG18_version_1="/cluster/data/hg18/bed/coverage/allClones.newToHg18/${CLONE}" if [ -f "${HG17_version}" ]; then if [ -f "${HG18_version_0}" -o -f "${HG18_version_1}" ]; then echo "ERROR: Why is there both an Hg17 and Hg18 version for ${CLONE}" exit 255 fi ln -s "/cluster/data/hg17/bed/contig_overlaps/${CHROM}/${CLONE}" \ "./placedClones/${CHROM}/${CLONE}" else if [ -f "${HG18_version_0}" -a -f "${HG18_version_1}" ]; then echo "ERROR: Why are there two Hg18 copies for ${CLONE}" exit 255 fi if [ -f "${HG18_version_0}" ]; then ln -s "${HG18_version_0}" "./placedClones/${CHROM}/${CLONE}" else if [ -f "${HG18_version_1}" ]; then ln -s "${HG18_version_1}" "./placedClones/${CHROM}/${CLONE}" else # must be on a different chrom in hg17 HG17_chrom=`grep -v "^#" \ /cluster/data/hg17/bed/contig_overlaps/disburseEm.list \ | grep "^${L}$" | awk '{print $1}'` HG17_version="/cluster/data/hg17/bed/contig_overlaps/${HG17_chrom}/${CLONE}" if [ -f "${HG17_version}" ]; then echo "ERROR: Why is there no version for ${CLONE}" exit 255 fi ln -s "${HG17_version}" "./placedClones/${CHROM}/${CLONE}" fi fi fi done '_EOF_' # happy emacs chmod +x createPlacedHierarchy.sh ./createPlacedHierarchy.sh # There should be no errors # We need masked contigs for the psLayout alignments ssh hgwdev mkdir /cluster/data/hg18/bed/coverage/maskedContigs cd /cluster/data/hg18/bed/coverage/maskedContigs hgsql -N \ -e "select chrom,chromStart,chromEnd,contig,size from ctgPos;" hg18 \ > ctgPos.txt ssh kkstore02 cd /cluster/data/hg18/bed/coverage/maskedContigs # verify each contig only listed once: awk '{print $4}' ctgPos.txt | sort | uniq -c | sort -n | less # should all have a count of one # verify all chrom sizes match the contig sizes: awk '{print $3-$2}' ctgPos.txt > chrSize.list awk '{print $5}' ctgPos.txt > ctgSize.list diff ctgSize.list chrSize.list # should be no difference # OK, now fetch the contigs from the twoBit file: cat << '_EOF_' > 2bitToFa.pl #!/usr/bin/env perl use warnings; use strict; while (my $line=<>) { chomp $line; my ($chrom, $start, $end, $contig, $size) = split('\s',$line); $chrom =~ s/chr//; printf "echo -n 'working $contig ...'; mkdir -p $chrom; twoBitToFa /cluster/data/hg18/hg18.2bit:chr$chrom:$start-$end stdout | sed -e 's/^>.*/>$contig/' > $chrom/$contig.fa; gzip $chrom/$contig.fa; echo 'done'\n"; } '_EOF_' # happy emacs chmod +x 2bitToFa.pl cat ctgPos.txt | ./2bitToFa.pl > 2bitToFa.sh chmod +x 2bitToFa.sh time ./2bitToFa.sh # and create a lift file for these contigs cat << '_EOF_' > mkCtgLift.pl #!/usr/bin/env perl use warnings; use strict; while (my $line=<>) { chomp $line; my ($start, $chrCtg, $size, $chrom, $chrLen) = split('\s',$line); $chrCtg =~ s#.*/##; printf "%s\t%s\t%s\t%s\t%s\n", $start, $chrCtg, $size, $chrom, $chrLen; } '_EOF_' # happy emacs chmod +x mkCtgLift.pl cat /cluster/data/hg18/jkStuff/liftAll.lft \ | ./mkCtgLift.pl > liftContigs.lft # Create individual ooc files for each contig mkdir ooc for C in `ls */*.fa.gz | sed -e "s/.fa.gz//"` do CONTIG=`basename ${C}` CHR=`dirname ${C}` mkdir -p ooc/${CHR} zcat ${C}.fa.gz | blat -repMatch=256 \ -makeOoc=ooc/${CHR}/${CONTIG}.10.ooc -tileSize=10 \ stdin /dev/null /dev/null echo "done: ${CONTIG}" done # Copy everything to san filesystem for kluster run: ssh pk mkdir /san/sanvol1/scratch/hg18/coverage cd /san/sanvol1/scratch/hg18/coverage rsync 
-a --progress --copy-links \ /cluster/data/hg18/bed/coverage/placedClones/ ./placedClones/ rsync -a --progress --copy-links \ /cluster/data/hg18/bed/coverage/maskedContigs/ ./maskedContigs/ mkdir /san/sanvol1/scratch/hg18/coverage/runPlaced cd /san/sanvol1/scratch/hg18/coverage/runPlaced cat << '_EOF_' > runPsLayout.sh #!/bin/sh # runPsLayout.sh # where is the chrom this contig is on # is one of the .fa.gz files in # /san/sanvol1/scratch/hg18/coverage/placedClones//.fa.gz # is one of the contigs found in: # /san/sanvol1/scratch/hg18/coverage/maskedContigs//.fa.gz # HERE=`pwd` CHROM=$1 CLONE=$2 CONTIG=$3 TARGET=/san/sanvol1/scratch/hg18/coverage/maskedContigs/$CHROM/$CONTIG.fa.gz CLONESRC=/san/sanvol1/scratch/hg18/coverage/placedClones/$CHROM/$CLONE.fa.gz OOC=/san/sanvol1/scratch/hg18/coverage/maskedContigs/ooc/$CHROM/$CONTIG.10.ooc RESULT="${HERE}/psl/${CHROM}/${CONTIG}/${CLONE}.psl" mkdir -p psl/${CHROM}/${CONTIG} if [ ! -s ${CLONESRC} ]; then echo "Can not find: ${CLONESRC}" 1>/dev/stderr exit 255 fi if [ ! -s ${TARGET} ]; then echo "Can not find: ${TARGET}" 1>/dev/stderr exit 255 fi if [ ! -s ${OOC} ]; then echo "Can not find: ${OOC}" 1>/dev/stderr exit 255 fi WRKDIR="/scratch/tmp/hg18_${CHROM}/${CONTIG}/${CLONE}" mkdir -p "${WRKDIR}" cd ${WRKDIR} zcat ${CLONESRC} > ${CLONE}.fa zcat ${TARGET} > ${CONTIG}.fa cp -p ${OOC} ./10.ooc /cluster/bin/x86_64/psLayout ${CONTIG}.fa ${CLONE}.fa genomic 10.ooc ${RESULT} RET=$? cd ${HERE} rm -fr ${WRKDIR} rmdir --ignore-fail-on-non-empty "/scratch/tmp/hg18_${CHROM}/${CONTIG}" rmdir --ignore-fail-on-non-empty "/scratch/tmp/hg18_${CHROM}" exit ${RET} '_EOF_' # happy emacs chmod +x runPsLayout.sh # create jobList from cloneToChrom.list: grep -v "^#" /cluster/data/hg18/bed/coverage/cloneToChrom.list \ | sed -e "s/.fa.gz//" \ | awk '{ printf "./runPsLayout.sh %s %s %s {check out line+ psl/%s/%s/%s.psl}\n", $1, $2, $3, $1, $3, $2 }' > masterJobList # To do a quick test, run just chrM: grep " M " masterJobList > jobList s para create jobList para try ... check ... etc ... # Then, the whole run: rm -fr psl err para create masterJobList para try ... check ... push ... etc ... # running 2006-01-17 16:41 # We need the phase information from the sequence.inf file: ssh hgwdev cd /cluster/data/hg18/bed/coverage cp /cluster/data/hg17/phase.pl . # this script was fixed up for hg18 to take an argument to the # sequence.inf file: ./phase.pl /cluster/data/hg18/ncbi/sequence.inf > phase.txt # what kind of phases do we have: awk '{print $2}' phase.txt | sort | uniq -c # 1134 D # 562513 F # 17270 P # Compared to hg17 we had: awk '{print $2}' /cluster/data/hg17/phase.txt | sort | uniq -c # 1088 D # 146900 F # 17300 P # Back in the kluster runPlaced directory, we put together the # kluster run results with: ssh pk mkdir /san/sanvol1/scratch/hg18/coverage/runPlaced/filteredLifted cd /san/sanvol1/scratch/hg18/coverage/runPlaced/filteredLifted cat << '_EOF_' > filterLift.sh #!/bin/sh for C in 22 do echo -n "chr${C} working ... 
" mkdir -p ${C} OUT="${C}/filterLift.out" pslSort dirs ${C}/raw.psl tmp ../psl/${C}/N* > ${OUT} 2>&1 pslReps -singleHit -nearTop=0.001 ${C}/raw.psl ${C}/repsSingle.psl \ /dev/null >> ${OUT} 2>&1 liftUp ${C}/chr${C}.psl ../../maskedContigs/liftContigs.lft warn \ ${C}/repsSingle.psl >> ${OUT} 2>&1 clusterClone -agp -minCover=80 -maxGap=60000 ${C}/repsSingle.psl \ > ${C}/single.agp 2>> ${OUT} 2>&1 liftUp ${C}/rawLifted.psl ../../maskedContigs/liftContigs.lft warn \ ${C}/raw.psl >> ${OUT} 2>&1 clusterClone -agp -minCover=80 -maxGap=60000 ${C}/chr${C}.psl \ > ${C}/chr${C}.bed 2>> ${OUT} echo "done" done '_EOF_' # happy emacs chmod +x filterLift.sh time ./filterLift.sh cp /cluster/data/hg17/fixPhase.pl . # fixed up the script to take an argument pointing to the phase.txt file ssh kkstore02 cd /cluster/data/hg18 grep "for_assembly" ncbi/sequence.inf \ | sed -e "s/\tW\t/\t3\t/;" > sequence.inf cd /cluster/store11/gs.19/ffa ln -s ../build36/sequence.inf . ssh hgwdev cd /cluster/data/hg18 # currently working only on chr22 echo "22" > clonePos.list # need to reload gold gap *and* gl at this time. gl wasn't loaded # before this. It is required for the clonePos track. hgGoldGapGl -chrom=chr22 hg18 /cluster/store11/gs.19 build36 hgClonePos -maxErr=3 -maxWarn=2000 -chromLst=clonePos.list \ hg18 /cluster/data/hg18 ./sequence.inf /cluster/store11/gs.19 \ 2> clone.pos.errors # OK, now for the hard part. The unplaced clones. # First we will make an attempt to determine which clones they # belong to by using information from the previous build, the # sequence.inf file, the seq_contig.md file, and the # ncbi_build36.agp file. ssh kkstore02 cd /cluster/data/hg18/bed/coverage comm -13 placed_in_assembly.list allClonesConsidered.list \ > unplaced.clone.list comm -12 unplaced.clone.list allClones.InHg17AndHg18.list \ > common.to.hg17.unplaced.list comm -23 unplaced.clone.list allClones.InHg17AndHg18.list \ > unique.to.hg18.unplaced.list awk '{print $1,$6}' /cluster/data/hg17/contig_overlaps.agp \ | sed -e "s/_[0-9]*$//" | sort -u > hg17.contig.clone.list awk '{print $1,$6}' ../../sequence.inf | sed -e "s/(//; s/)//" \ > cloneToChrom.from.seq.inf.txt # using the contig to clone information from Hg17, attempt to # locate the common.to.hg17.unplaced.list in terms of chrom and # contig. Along with the ncbi_build36.agp, seq_contig.md and # cloneToChrom.from.seq.inf.txt infomation, we can attempt to # place clones that have perhaps moved, or don't have entries in # one file or another. 
The relationships obtained from the # various files: # ncbi_build36.agp - gives clone to contig name and clone to chr name # but for placed clones only, not useful here # unless they moved from hg17 (try this with the # placed list) # seq_contig.md - gives contig to chrom relationship ./chrCloneContig.pl /cluster/data/hg18/ncbi_build36.agp \ hg17.contig.clone.list /cluster/data/hg18/seq_contig.md \ common.to.hg17.unplaced.list cloneToChrom.from.seq.inf.txt \ > chrCloneContigCommonToHg17.list \ 2> common.to.hg17.unplaced.stderr # With this chrCloneContigCommonToHg17.list list in hand, can now # create a hierarchy of ./unPlacedClones/ ./createUnplacedHierarchy.sh # Then, copy them to the san for kluster run ssh pk cd /san/sanvol1/scratch/hg18/coverage rsync -a --progress --copy-links \ /cluster/data/hg18/bed/coverage/unPlacedClones/ ./unPlacedClones/ mkdir runUnPlaced cd runUnPlaced # create jobList from the chrCloneContigCommonToHg17.list egrep -v "^#|XX_000" \ /cluster/data/hg18/bed/coverage/chrCloneContigCommonToHg17.list \ | sed -e "s/.fa.gz//" \ | awk '{ printf "./runPsLayout.sh %s %s %s {check out line+ psl/%s/%s/%s.psl}\n", $1, $2, $3, $1, $3, $2 }' > masterJobList # Test a subset: grep " Y " masterJobList > jobListY para create jobListY para try ... check ... etc ... # ... some time later ... 2006-04-04 # All the clones were eventually run through the placement kluster # runs. Ending up with five different directory results: [hiram@hgwdev64 /san/sanvol1/scratch/hg18/coverage] # -rw-rw-r-- 1 3144245541 Mar 15 09:24 runFishClones/raw.psl # -rw-rw-r-- 1 91182723 Mar 15 10:44 runUnPlaced/raw.psl # -rw-rw-r-- 1 102642706 Mar 15 10:49 runPlaced/raw.psl # -rw-rw-r-- 1 15839733941 Mar 15 14:56 runLastRecover/raw.psl # -rw-rw-r-- 1 14338192704 Mar 15 18:25 runLastOnes/raw.psl # Combining those results together required a large memory # machine and a couple of days processing time: ssh hgwdev64 cd /san/sanvol1/scratch/hg18/coverage pslSort dirs raw.psl tmp runPlaced runUnPlaced runFishClones \ runLastRecover runLastOnes > raw.psl.out 2>&1 # resulting in a 33 Gb result file: -rw-rw-r-- 1 33515995907 Apr 2 10:54 raw.psl # trimming that down with pslReps: time pslReps -nohead -nearTop=0.001 -singleHit \ raw.psl repsSingle.psl /dev/null # real 14m58.371s # -rw-rw-r-- 1 42333543 Apr 4 10:22 repsSingle.psl # wc -l repsSingle.psl # 48005 repsSingle.psl # Now, clustering those alignments together: clusterClone -allowDuplicates -agp -minCover=80 -maxGap=60000 \ repsSingle.psl > single.agp 2> single.out wc -l single.agp # 45714 single.agp # Sort them, and set their phase correctly: sort -k1,1 -k2,2n single.agp \ | ./fixPhase.pl /cluster/data/hg18/bed/coverage/phase.txt \ > contig_overlaps.agp # some of them are not in the phase.txt file, these are # set to draft status: # WARN: can not find contig AC024654.2 in phase.txt # WARN: can not find contig AL133291.12 in phase.txt # WARN: can not find contig AC055712.12 in phase.txt # WARN: can not find contig AC024480.2 in phase.txt # WARN: can not find contig AC068738.2 in phase.txt # WARN: can not find contig AL354703.14 in phase.txt # WARN: can not find contig AL354756.17 in phase.txt # WARN: can not find contig AL157825.11 in phase.txt # WARN: can not find contig AC073306.1 in phase.txt # WARN: can not find contig AL138892.13 in phase.txt # WARN: can not find contig AL590104.7 in phase.txt # WARN: can not find contig AC079146.4 in phase.txt # WARN: can not find contig AC024497.3 in phase.txt # WARN: can not find contig AC021295.3 in phase.txt # WARN: can 
not find contig AC040906.3 in phase.txt # WARN: can not find contig AC008372.5 in phase.txt # WARN: can not find contig AC026054.3 in phase.txt # WARN: can not find contig AC053504.4 in phase.txt # create the gl files from that overlaps.agp file: ssh hgwdev cd /cluster/data/hg18 cp -p /san/sanvol1/scratch/hg18/coverage/contig_overlaps.agp . # after going through this sequence and loading everything, # a few clones were discovered to have crept into the list that # were obsolete. So, add them to the list used by the # removeObsoleteClones.sh script: awk '{print $6}' contig_overlaps.agp > clone.coverage.list bed/coverage/ckMultipleVersions.pl clone.coverage.list \ > /dev/null 2> /tmp/clone.transitions awk '{if (! match($1,$3)){ print }}' /tmp/clone.transitions \ >> bed/coverage/obsoleteClone.list time ./removeObsoleteClones.sh wc -l /san/sanvol1/scratch/hg18/coverage/contig_overlaps.agp \ ./contig_overlaps.agp # 45714 /san/sanvol1/scratch/hg18/coverage/contig_overlaps.agp # 45597 ./contig_overlaps.agp # after adding ten new ones the second time around: # 45587 ./contig_overlaps.agp time agpToGl contig_overlaps.agp . -md=seq_contig.md # this liftGl.csh finds all the contig.gl files under each # contig directory and creates chromsome coordinate chr*.gl # files in each chrom directory jkStuff/liftGl.csh contig.gl # Then hgGoldGapGl uses those chrom level chr*.gl files to add # the gl tables (as well as gold and gap hgGoldGapGl -chromLst=chrom.lst hg18 /cluster/store11/gs.19 build36 # strip some business from the sequence.inf file that is not needed # The sed here has to be done in a shell script, those tabs are # actual tabs and not the explicit ^I mkdir -p /scratch/tmp grep -v AADB /cluster/store11/gs.19/ncbi/sequence.inf \ > /scratch/tmp/seq0.inf (cat /scratch/tmp/seq0.inf; \ grep AADB01066164.1 /cluster/store11/gs.19/ncbi/sequence.inf) \ | grep "for_assembly" \ | sed -e "s/^IW^I/^I3^I/" > cleanedSequence.inf # Then hgClonePos uses those tables to create the Coverage track hgClonePos -maxErr=600 -maxWarn=50000 -chromLst=clonePos.list \ hg18 /cluster/data/hg18 ./cleanedSequence.inf /cluster/store11/gs.19 \ > clone.pos.errors 2>&1 ########################################################################### # RECOMBINATION RATES (DONE 2006-02-15 Fan) # The STS MArkers track must be completed prior to creating this track ssh kkstore02 cd /cluster/data/hg18/bed mkdir -p recombRate cd recombRate # Copy other necessary files here (in future, can take from previous version) # NOTE: these are stable, and could be saved in a permanent spot cp -p /projects/hg2/booch/psl/info/decode_all . cp -p /projects/hg2/booch/psl/info/marshfield_all . cp -p /projects/hg2/booch/psl/info/genethon_all . # Compared these 3 files with the 3 files of hg17, they are identical. 
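# The comparison can be done with cmp against the hg17 copies, assuming they
# were kept under /cluster/data/hg17/bed/recombRate (sketch):
#   for F in decode_all marshfield_all genethon_all
#   do
#       cmp $F /cluster/data/hg17/bed/recombRate/$F && echo "$F identical"
#   done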
# Determine maximum concordant set of markers for each of the maps
/cluster/bin/scripts/assignGPsts -full -maxcon \
    /cluster/data/ncbi/sts.10/stsAlias.bed \
    /cluster/data/hg18/bed/sts/stsMarkers_pos.rdb \
    decode_all > decode.marker.rdb
/cluster/bin/scripts/assignGPsts -full -maxcon \
    /cluster/data/ncbi/sts.10/stsAlias.bed \
    /cluster/data/hg18/bed/sts/stsMarkers_pos.rdb \
    marshfield_all > marshfield.marker.rdb
/cluster/bin/scripts/assignGPsts -full -maxcon \
    /cluster/data/ncbi/sts.10/stsAlias.bed \
    /cluster/data/hg18/bed/sts/stsMarkers_pos.rdb \
    genethon_all > genethon.marker.rdb
# Determine the rates for each of the maps
/cluster/bin/scripts/markers_to_recomb_rate.terry.pl decode.marker.rdb \
    /cluster/data/hg18/chrom.sizes 1000000 1000000 \
    > decode_1mb_slide_1mb
/cluster/bin/scripts/markers_to_recomb_rate.terry.pl genethon.marker.rdb \
    /cluster/data/hg18/chrom.sizes 1000000 1000000 \
    > genethon_1mb_slide_1mb
# Got 338 "... DISCARDING" messages.
/cluster/bin/scripts/markers_to_recomb_rate.terry.pl marshfield.marker.rdb \
    /cluster/data/hg18/chrom.sizes 1000000 1000000 \
    > marshfield_1mb_slide_1mb
# Got 424 "... DISCARDING" messages.
# Convert files to proper format
/cluster/bin/scripts/convertRecombRate decode_1mb_slide_1mb \
    /cluster/data/hg18/inserts \
    /cluster/data/hg18 1000 > decode_1mb_slide_1mb_conv
/cluster/bin/scripts/convertRecombRate marshfield_1mb_slide_1mb \
    /cluster/data/hg18/inserts \
    /cluster/data/hg18 1000 > marshfield_1mb_slide_1mb_conv
/cluster/bin/scripts/convertRecombRate genethon_1mb_slide_1mb \
    /cluster/data/hg18/inserts \
    /cluster/data/hg18 1000 > genethon_1mb_slide_1mb_conv
# Create bed file and load
/cluster/bin/scripts/createRRbed decode_1mb_slide_1mb_conv \
    marshfield_1mb_slide_1mb_conv genethon_1mb_slide_1mb_conv \
    > recombRate.bed
ssh hgwdev
cd /cluster/store11/gs.19/build36/bed/recombRate
hgLoadBed -noBin -tab \
    -sqlTable=/cluster/home/kent/src/hg/lib/recombRate.sql \
    hg18 recombRate recombRate.bed
###########################################################################
# FISH CLONES (DONE - 2006-01-13 - 2006-02-07 - Hiram)
# **** RE-LOAD fishClones after bacEnds update - see below 2007-09-04 ****
# The STS Marker, Coverage, and BAC End Pairs tracks must be completed prior to
# creating this track (and why is this ?)
ssh kkstore01
mkdir /cluster/data/ncbi/fishClones/fishClones.2006-01/
cd /cluster/data/ncbi/fishClones/fishClones.2006-01/
# Download information from NCBI
# point browser at:
# http://www.ncbi.nlm.nih.gov/genome/cyto/cytobac.cgi?CHR=all&VERBOSE=ctg
# change "Sequence tag:" to "placed on contig"
# change "Show details on sequence-tag" to "yes"
# change "Download or Display" to "Download table for UNIX"
# press Submit - save as
# /cluster/data/ncbi/fishClones/fishClones.2006-01/hbrc.txt
chmod 664 /cluster/data/ncbi/fishClones/fishClones.2006-01/hbrc.txt
# Unfortunately the format of this hbrc file has changed since
# last time.  The columns have been rearranged, and one important
# column is missing, the contig information.  So, let's see if we
# can recover the original format by putting this together with
# some other things we have here.
$HOME/kent/src/hg/fishClones/fixup.hbrc.pl hbrc.txt \
    /cluster/data/hg18/bed/fishClones/seq_clone.pmd > fixed.hbrc.txt \
    2> dbg
# the seq_clone.pmd file was obtained via email from Wonhee Jang
# jang at ncbi.nlm.nih.gov - I have asked for clarification where
# such a file can be fetched without resorting to email.
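# A quick sanity check on the repaired file is to confirm every row has the
# same number of tab-separated columns before feeding it to fishClones below
# (sketch; the expected count is whatever the old hbrc format used):
#   awk -F'\t' '{print NF}' fixed.hbrc.txt | sort | uniq -c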
# Get current clone/accession information wget --timestamping http://www.ncbi.nlm.nih.gov/genome/clone/DATA/clac.out # Create initial Fish Clones bed file ssh kkstore02 mkdir /cluster/data/hg18/bed/fishClones cd /cluster/data/hg18/bed/fishClones # Copy previous sts info from fhcrc (take from previous build in future) cp -p /cluster/data/ncbi/fishClones/fishClones.2004-07/fhcrc.sts . # This fhcrc.sts listing doesn't change. It is merely a listing # of aliases that remain in effect. # Create cl_acc_gi_len file form cloneend information: grep -v "^#" /cluster/data/hg18/bed/cloneend/all.txt \ | awk '{gsub("\.[0-9]*$", "", $2); printf "%s\t%s\t%s\t%s\t%s\t%s\n", $1,$2,$3,$4,$5,$8}' > cl_acc_gi_len ssh hgwdev # have to be on hgwdev for this since it is going to read from the # database. Had to work on this program to get it past what is # evidently a bad entry in hbrc.fixed where columns of information # are missing for one clone in particular time fishClones -verbose=2 -fhcrc=fhcrc.sts -noBin hg18 \ /cluster/data/ncbi/fishClones/fishClones.2006-01/fixed.hbrc.txt \ /cluster/data/ncbi/fishClones/fishClones.2006-01/clac.out \ ./cl_acc_gi_len \ /cluster/data/hg18/bed/bacends/bacEnds.lifted.psl \ fishClones # real 2m4.708s # Reading Fish Clones file /cluster/data/ncbi/fishClones/fishClones.2006-01/hbrc.fixed # reading fishInfo file /cluster/data/ncbi/fishClones/fishClones.2006-01/fixed.hbrc.txt # Reading Clone/Acc (clac.out) file /cluster/data/ncbi/fishClones/fishClones.2006-01/clac.out # Reading BAC Ends file ./cl_acc_gi_len # Reading BAC Ends psl file /cluster/data/hg18/bed/bacends/bacEnds.lifted.psl # Reading additional STS Marker links fhcrc.sts # Determining good positions # findClonePos: determining positions of fish clones # Writing output file # ERROR: at line # 170, no cytoband info for chrX:104048913-104206974 # RP11-79L11 # ERROR: at line # 171, no cytoband info for chrX:104048913-104206974 # RP11-79L11 # Load the track ssh hgwdev cd /cluster/data/hg18/bed/fishClones hgLoadBed -notItemRgb -noBin -tab \ -sqlTable=$HOME/kent/src/hg/lib/fishClones.sql \ hg18 fishClones fishClones.bed # Loaded 9461 elements of size 16 ########################################################################### # CHROMOSOME BANDS TRACK (DONE - 2006-01-20 - 2006-02-07 - Hiram) # This must wait until the Fish Clones tracks is done # This was loaded in place of the previously loaded ideoband data # created from NCBI information, see below for "ideogram" ssh hgwdev mkdir /cluster/data/hg18/bed/cytoband cd /cluster/data/hg18/bed/cytoband # Copy in some necessary files (usually from previous version) cp -p /cluster/data/hg17/bed/cytoband/pctSetBands.txt . cp -p /cluster/data/hg17/bed/cytoband/ISCN800.txt . 
# Create some preliminary information files /cluster/bin/scripts/createSetBands pctSetBands.txt \ /cluster/data/hg18/inserts /cluster/data/hg18 100 > setBands.txt /cluster/bin/scripts/makeBands ISCN800.txt \ /cluster/data/hg18 > cytobands.pct.bed /cluster/bin/scripts/makeBandRanges cytobands.pct.bed \ > cytobands.pct.ranges # Reformat fishClones file /cluster/bin/scripts/createBanderMarkers \ /cluster/data/hg18/bed/fishClones/fishClones.bed > fishClones.txt /cluster/bin/scripts/runBander fishClones.txt \ ISCN800.txt setBands.txt /cluster/data/hg18 # Should be 862 bands wc -l cytobands.bed # 862 cytobands.bed hgLoadBed -noBin -tab -sqlTable=${HOME}/src/hg/lib/cytoBand.sql \ hg18 cytoBand cytobands.bed hgLoadBed -noBin -tab -sqlTable=${HOME}/src/hg/lib/cytoBandIdeo.sql \ hg18 cytoBandIdeo cytobands.bed ########################################################################### # BLASTZ SELF (DONE - 2006-01-17 - 2006-01-20 - Hiram) ssh pk mkdir /cluster/data/hg18/bed/blastzSelf.2006-01-17 cd /cluster/data/hg18/bed/blastzSelf.2006-01-17 cat << '_EOF_' > DEF # human vs human export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/cluster/home/angie/schwartzbin:/parasol/bin BLASTZ=blastz.v7.x86_64 BLASTZ_M=400 # TARGET: Human Hg18 SEQ1_DIR=/san/sanvol1/scratch/hg18/selfNib SEQ1_LEN=/san/sanvol1/scratch/hg18/self.sizes SEQ1_CHUNK=10000000 SEQ1_LAP=10000 SEQ1_IN_CONTIGS=0 # QUERY: Human Hg18 SEQ2_DIR=/san/sanvol1/scratch/hg18/selfNib SEQ2_LEN=/san/sanvol1/scratch/hg18/self.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 SEQ2_IN_CONTIGS=0 BASE=/cluster/data/hg18/bed/blastzSelf.2006-01-17 TMPDIR=/scratch/tmp '_EOF_' # happy emacs cd /cluster/data/hg18/bed/blastzSelf.2006-01-17 time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -chainMinScore=10000 -chainLinearGap=medium -bigClusterHub=pk \ `pwd`/DEF > blastz.out 2>&1 & # real 640m37.637s ssh kolossus cd /cluster/data/hg18/bed/blastzSelf.2006-01-17 time HGDB_CONF=~/.hg.conf.read-only featureBits \ -noRandom -noHap hg18 chainSelfLink > fb.chainSelfLink 2>&1 & # real 21m52.697s # 324067552 bases of 2858034764 (11.339%) in intersection # compared to Hg17: cd /cluster/data/hg17/bed/blastzSelf.2004-07-01 time HGDB_CONF=~/.hg.conf.read-only featureBits \ -noRandom -noHap hg17 chainSelfLink > fb.chainSelfLink 2>&1 & # real 56m34.802s # 240976607 bases of 2851352871 (8.451%) in intersection # reloaded these chains to add normalized score column ssh hgwdev cd /cluster/data/hg18/bed/blastzSelf.2006-01-17/axtChain chainSplit chain hg18.hg18.all.chain.gz cd /cluster/data/hg18/bed/blastzSelf.2006-01-17/axtChain/chain foreach f (*.chain) set c = $f:r hgLoadChain -normScore hg18 ${c}_chainSelf $f end cd .. rm -fr chain ############################################################################## # CLONE ENDS - BACEND TRACK (DONE - 2006-01-11 - Fan) ssh kkstore02 cd /cluster/data/hg18 # check disk space: 73Gb free df -h . # Filesystem Size Used Avail Use% Mounted on # /export/cluster/store11 1.8T 1.4T 323G 82% /cluster/store11 mkdir -p bed/cloneend/ncbi cd bed/cloneend/ncbi wget --timestamping ftp://ftp.ncbi.nih.gov/genomes/CLONEEND/homo_sapiens/* # Somehow the wget did not work. Did it by hand. 
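# Since the fetch was done by hand, verify each archive is intact before
# concatenating below (sketch):
#   for F in /cluster/data/hg18/bed/cloneend/ncbi/*.gz
#   do
#       gzip -t $F || echo "BAD: $F"
#   done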
cd /cluster/data/hg18/bed/cloneend # seems like the *.mfa files were split just for convenience # concatenate bash for F in ncbi/*.mfa.gz do zcat ${F} done | gzip > all.mfa.gz exit # Convert the title line of the all.mfa file cat << '_EOF_' > convert.pl #!/usr/bin/env perl use strict; use warnings; while (my $line = <>) { if ($line !~ m/^>/) { print $line } else { my @fields = split('\|', $line); my $fieldCount = scalar(@fields); my $printed = 0; for (my $i = 0; $i < $fieldCount; $i++) { if ($fields[$i] eq "gb" || $fields[$i] eq "dbj" || $fields[$i] eq "emb") { (my $name, my $vers) = split(/\./,$fields[$i+1]); print ">$name\n"; $i= $fieldCount; $printed = 1; } } if (!$printed) { die("Failed for $line\n"); } } } '_EOF_' # < happy emacs chmod +x convert.pl zcat all.mfa | ./convert.pl | gzip > cloneEnds.fa.gz # make sure nothing got broken: faSize all.mfa.gz # 400704107 bases (5941742 N's 394762365 real 255711893 upper 139050472 lower) in 832860 sequences in 1 files faSize cloneEnds.fa.gz # 400704107 bases (5941742 N's 394762365 real 255711893 upper 139050472 lower) in 832860 sequences in 1 files # identical numbers # concatenate the text files, too bash for F in ncbi/*.txt.gz do zcat ${F} done | gzip > all.txt.gz # generate cloneEndPairs.txt and cloneEndSingles.txt cp -p /cluster/data/mm6/bed/cloneend/ncbi/convertTxt.pl . zcat all.txt.gz >all.txt ./convertTxt.pl all.txt # Reading in end info # Writing out pair info # Writing out singleton info # 249619 pairs and 318500 singles # faSplit does not function correctly if given a .gz source file # AND, we need the unzipped file for sequence loading below gunzip cloneEnds.fa.gz # split mkdir splitdir cd splitdir faSplit sequence ../cloneEnds.fa 100 cloneEnds # Check to ensure no breakage: cat *.fa | faSize stdin # 400704107 bases (5941742 N's 394762365 real 255711893 upper 139050472 lower) in 832860 sequences in 1 files # same numbers as before # Copy to san for cluster runs ssh pk cd /cluster/data/hg18/bed/cloneend/splitdir mkdir /san/sanvol1/scratch/hg18/cloneEnds cp -p *.fa /san/sanvol1/scratch/hg18/cloneEnds rm * cd .. rmdir splitdir # load sequences ssh hgwdev mkdir /gbdb/hg18/cloneend cd /gbdb/hg18/cloneend ln -s /cluster/data/hg18/bed/cloneend/cloneEnds.fa . cd /tmp hgLoadSeq hg18 /gbdb/hg18/cloneend/cloneEnds.fa # Advisory lock created # Creating .tab file # Adding /gbdb/hg18/cloneend/cloneEnds.fa # 832860 sequences # Updating seq table # Advisory lock has been released # All done ############################################################################ # BACEND SEQUENCE ALIGNMENTS (STARTED - 2006-01-11, DONE 2006-01-18 - Fan) # REDONE 2006-02-02 - Hiram ssh pk # The ooc file was created earlier into /cluster/bluearc/hg18/11.ooc cp -p /cluster/bluearc/hg18/11.ooc /san/sanvol1/scratch/hg18/11.ooc mkdir /san/sanvol1/scratch/hg18/bacends cd /san/sanvol1/scratch/hg18/bacends ls -1S /san/sanvol1/scratch/hg18/maskedContigs/*.fa > contigs.lst ls -1S /san/sanvol1/scratch/hg18/cloneEnds/cloneEnds???.fa > bacends.lst # 378 contigs vs 98 bacends files -> 37,044 jobs mkdir out cat > template << '_EOF_' #LOOP /cluster/bin/x86_64/blat $(path1) $(path2) -ooc=/san/sanvol1/scratch/hg18/11.ooc {check out line+ out/$(root2)/$(root1).$(root2).psl} #ENDLOOP '_EOF_' # << emacs gensub2 contigs.lst bacends.lst template jobList foreach f (`cat bacends.lst`) set d = $f:r:t echo $d mkdir out/$d end para create jobList # 37044 jobs in batch para try, check, push, etc ... 
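# Before the lift below, a simple completeness check is to count result files
# against the 37,044 jobs in the batch (sketch):
#   find /san/sanvol1/scratch/hg18/bacends/out -name "*.psl" | wc -l
#   # expect 37044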
# lift alignments ssh pk cd /san/sanvol1/scratch/hg18/bacends pslSort dirs raw.psl temp out/cloneEnds* # 37044 files in 98 dirs # Got 37044 files 192 files per mid file # real 32m24.804s # -rw-rw-r-- 1 6487445210 Feb 2 21:08 raw.psl time pslReps -nearTop=0.02 -minCover=0.60 -minAli=0.85 -noIntrons \ raw.psl bacEnds.psl /dev/null > pslReps.out 2>&1 & # real 6m33.218s # Processed 51898639 alignments mkdir lifted time liftUp lifted/bacEnds.lifted.psl ./liftContigs.lft warn bacEnds.psl # real 0m30.067s pslSort dirs bacEnds.sorted.psl temp lifted # cleanup rmdir temp rm -fr out /cluster/store7/kate/hg17/bacends wc -l *.sorted.psl # 2490892 bacEnds.sorted.psl time pslPairs -tInsert=10000 -minId=0.91 -noBin -min=25000 -max=350000 \ -slopval=10000 -hardMax=500000 -slop -short -long -orphan \ -mismatch -verbose bacEnds.sorted.psl \ /cluster/data/hg18/bed/cloneend/cloneEndPairs.txt \ all_bacends bacEnds # Reading pair file # Reading psl file # Creating Pairs # Writing to files # real 0m11.221s # this creates the files: # -rw-rw-r-- 1 16224182 Feb 2 21:36 bacEnds.pairs # -rw-rw-r-- 1 4655633 Feb 2 21:36 bacEnds.orphan # -rw-rw-r-- 1 399525 Feb 2 21:36 bacEnds.slop # -rw-rw-r-- 1 106252 Feb 2 21:36 bacEnds.mismatch # -rw-rw-r-- 1 634909 Feb 2 21:36 bacEnds.short # -rw-rw-r-- 1 4023 Feb 2 21:36 bacEnds.long # create header required by "rdb" tools # TODO: replace w/ awk & sort echo -e \ 'chr\tstart\tend\tclone\tscore\tstrand\tall\tfeatures\tstarts\tsizes' > header echo -e '10\t10N\t10N\t10\t10N\t10\t10\t10N\t10\t10' >> header cat header bacEnds.pairs | row score ge 300 | sorttbl chr start \ | headchg -del > bacEndPairs.bed cat header bacEnds.slop bacEnds.short bacEnds.long \ bacEnds.mismatch bacEnds.orphan \ | row score ge 300 | sorttbl chr start | headchg -del \ > bacEndPairsBad.bed extractPslLoad -noBin bacEnds.sorted.psl bacEndPairs.bed \ bacEndPairsBad.bed | \ sorttbl tname tstart | headchg -del > bacEnds.load.psl # Move the previous build out of the way and copy these # results over to the primary hg18 bed location: mv /cluster/data/hg18/bed/bacends /cluster/data/hg18/bed/bacends.2006-01-18 mkdir /cluster/data/hg18/bed/bacends cp -p bacEnd* /cluster/data/hg18/bed/bacends cp -p lifted/bacEnds.lifted.psl /cluster/data/hg18/bed/bacends # load them into the database ssh hgwdev cd /cluster/data/hg18/bed/bacends # CHECK bacEndPairs.bed ID's to make sure they have no blanks in them awk '{print $5}' bacEndPairs.bed | sort | uniq -c # result should be the scores, no extraneous strings: # 156984 1000 # 195 300 # 316 375 # 297 500 # 1476 750 # edit the file and fix it if it has a bad name. hgLoadBed -notItemRgb hg18 bacEndPairs bacEndPairs.bed \ -sqlTable=$HOME/kent/src/hg/lib/bacEndPairs.sql # Loaded 159268 # note - this track isn't pushed to RR, just used for assembly QA hgLoadBed -notItemRgb hg18 bacEndPairsBad bacEndPairsBad.bed \ -sqlTable=$HOME/kent/src/hg/lib/bacEndPairsBad.sql # Loaded 69788 #hgLoadPsl hg18 -nobin -table=all_bacends bacEnds.load.psl # NOTE: truncates file to 0 if -nobin is used # NOTE: truncates file to 0 if -nobin is used hgLoadPsl hg18 -table=all_bacends bacEnds.load.psl # no complaints ! 
Usually there are, this loaded: hgsql -N -e "select count(*) from all_bacends;" hg18 # 1249956 nice featureBits hg18 all_bacends # 191078854 bases of 2881515245 (6.631%) in intersection nice featureBits hg17 all_bacends # 225763317 bases of 2866216770 (7.877%) in intersection nice featureBits hg18 bacEndPairs # 2842800422 bases of 2881515245 (98.656%) in intersection nice featureBits hg17 bacEndPairs # 2846568377 bases of 2866216770 (99.314%) in intersection nice featureBits hg18 bacEndPairsBad # 729313572 bases of 2881515245 (25.310%) in intersection nice featureBits hg17 bacEndPairsBad # 797412909 bases of 2866216770 (27.821%) in intersection ############################################################################ # BACEND PAIRS TRACK (OBSOLETE - DONE ABOVE) (DONE - 2006-01-18 - Fan) ssh kolossus cd /cluster/data/hg18/bacends bash time /cluster/bin/x86_64/pslPairs -tInsert=10000 -minId=0.91 -noBin -min=25000 \ -max=350000 -slopval=10000 -hardMax=500000 -slop -short -long -orphan \ -mismatch -verbose bacEnds.psl \ ../bed/cloneend/cloneEndPairs.txt all_bacends bacEnds # create header required by "rdb" tools echo -e \ "chr\tstart\tend\tclone\tscore\tstrand\tall\tfeatures\tstarts\tsizes" > header echo -e "10\t10N\t10N\t10\t10N\t10\t10\t10N\t10\t10" >> header cat header bacEnds.pairs | \ /cluster/bin/scripts/row score ge 300 | \ /cluster/bin/scripts/sorttbl chr start | \ /cluster/bin/scripts/headchg -del > bacEndPairs.bed cat header bacEnds.slop bacEnds.short bacEnds.long bacEnds.mismatch \ bacEnds.orphan | /cluster/bin/scripts/row score ge 300 | \ /cluster/bin/scripts/sorttbl chr start | \ /cluster/bin/scripts/headchg -del > bacEndPairsBad.bed /cluster/bin/scripts/extractPslLoad -noBin bacEnds.psl bacEndPairs.bed \ bacEndPairsBad.bed >j1.out cat j1.out| /cluster/bin/scripts/sorttbl tname tstart >j2.out cat j2.out | /cluster/bin/scripts/headchg -del > bacEnds.load.psl rm j1.out j2.out # CHECK bacEndPairs.bed ID's to make sure they have no blanks in them awk '{print $5}' bacEndPairs.bed | sort | uniq -c # result should be the scores, no extraneous strings: # 156984 1000 # 195 300 # 316 375 # 297 500 # 1476 750 # edit the file and fix it if it has a bad name. 
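# That check can be automated; any output from this means a malformed clone
# name leaked into the score column (sketch):
#   awk '$5 !~ /^[0-9]+$/' bacEndPairs.bed | head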
# load into database ssh hgwdev cd /cluster/data/hg18/bacends hgLoadBed -strict -notItemRgb hg18 bacEndPairs bacEndPairs.bed \ -sqlTable=$HOME/kent/src/hg/lib/bacEndPairs.sql # Loaded 146284 elements of size 11 # note - this track isn't pushed to RR, just used for assembly QA hgLoadBed -strict -notItemRgb hg18 bacEndPairsBad bacEndPairsBad.bed \ -sqlTable=$HOME/kent/src/hg/lib/bacEndPairsBad.sql # Loaded 75995 elements of size 11 # NOTE: truncates file to 0 if -nobin is used hgLoadPsl hg18 -table=all_bacends bacEnds.load.psl nice featureBits hg18 all_bacends # 162081172 bases of 2881515245 (5.625%) in intersection nice featureBits hg17 all_bacends # 225763317 bases of 2866216770 (7.877%) in intersection nice featureBits hg18 bacEndPairs # 2835522069 bases of 2881515245 (98.404%) in intersection nice featureBits hg17 bacEndPairs # 2846568377 bases of 2866216770 (99.314%) in intersection nice featureBits hg18 bacEndPairsBad # 781697678 bases of 2881515245 (27.128%) in intersection nice featureBits hg17 bacEndPairsBad # 797412909 bases of 2866216770 (27.821%) in intersection ########################################################################## # BLASTZ OPOSSUM monDom2 second time (DONE - 2006-02-13 - Hiram) ssh kk mkdir /cluster/data/hg18/bed/blastzMonDom2.2006-02-13 cd /cluster/data/hg18/bed ln -s blastzMonDom2.2006-02-13 blastz.monDom4 cd blastzMonDom2.2006-02-13 cat << '_EOF_' > DEF # human vs. opossum export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/i386:/parasol/bin BLASTZ=blastz.v7 # settings for more distant organism alignments BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=10000 BLASTZ_K=2200 BLASTZ_M=50 BLASTZ_Q=/cluster/data/blastz/HoxD55.q BLASTZ_ABRIDGE_REPEATS=0 # TARGET: Human (hg18) SEQ1_DIR=/scratch/hg/hg18/nib SEQ1_LEN=/scratch/hg/hg18/chrom.sizes SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY: Opossum monDom4 SEQ2_DIR=/iscratch/i/monDom4/monDom4RMExtra.2bit SEQ2_LEN=/iscratch/i/monDom4/chrom.sizes SEQ2_CHUNK=30000000 SEQ2_LAP=0 BASE=/cluster/data/hg18/bed/blastzMonDom4.2006-02-13 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs cd /cluster/data/hg18/bed/blastzMonDom2.2006-02-13 time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=kk -chainMinScore=5000 -chainLinearGap=loose \ `pwd`/DEF > blastz.out 2>&1 & ssh kolossus cd /cluster/data/hg18/bed/blastzMonDom2.2006-02-13 time nice -n +19 featureBits hg18 chainMonDom4Link \ > fb.hg18.chainMonDom4Link 2>&1 & cat fb.hg18.chainMonDom4Link # 356865888 bases of 2881515245 (12.385%) in intersection # for the swap, see makeMonDom4.doc 2006-04-28 # Creating download directory (DONE - 2006-07-18 - Hiram) ssh hgwdev cd /cluster/data/hg18/bed/blastzMonDom2.2006-02-13 time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=kk -chainMinScore=5000 -chainLinearGap=loose \ -continue=download -stop=download `pwd`/DEF > download.out 2>&1 ########################################################################## # BLASTZ OPOSSUM monDom2 first time (EXPERIMENT - 2006-01-23 - Hiram) ssh pk mkdir /cluster/data/hg18/bed/blastzMonDom2.2006-01-23 cd /cluster/data/hg18/bed/blastzMonDom2.2006-01-23 cat << '_EOF_' > DEF # human vs. 
opossum export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/parasol/bin BLASTZ=blastz.v7.x86_64 # Specific settings for chicken (per Webb email to Brian Raney) BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=10000 BLASTZ_K=2200 BLASTZ_Q=/cluster/data/blastz/HoxD55.q BLASTZ_ABRIDGE_REPEATS=0 # TARGET: Human (hg18) SEQ1_DIR=/scratch/hg/hg18/nib SEQ1_LEN=/scratch/hg/hg18/chrom.sizes SEQ1_IN_CONTIGS=0 SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY: Opossum monDom2 SEQ2_DIR=/san/sanvol1/scratch/monDom2/monDom2.2bit SEQ2_LEN=/san/sanvol1/scratch/monDom2/chrom.sizes SEQ2_IN_CONTIGS=0 SEQ2_CHUNK=30000000 SEQ2_LIMIT=100 SEQ2_LAP=0 BASE=/cluster/data/hg18/bed/blastzMonDom2.2006-01-23 TMPDIR=/scratch/tmp '_EOF_' # happy emacs cd /cluster/data/hg18/bed/blastzMonDom2.2006-01-23 time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \ `pwd`/DEF > blastz.out 2>&1 & # real 912m22.818s # This failed during the load of the chains due to the size of # chr19.chain. So, go to kolossus: ssh kolossus # There isn't any hg18 db here yet, get it established with a # chromInfo and a 2bit sequence: hgsql -e "create database hg18;" mysql cd /cluster/data/hg18 twoBitInfo hg18.2bit stdout | awk '{printf "%s\t%s\t/gbdb/hg18/hg18.2bit\n", $1,$2}' \ > chromInfo.kolossus.tab hgsql hg18 < $HOME/kent/src/hg/lib/chromInfo.sql hgsql hg18 \ -e 'load data local infile "chromInfo.kolossus.tab" into table chromInfo;' mkdir /gbdb/hg18 ln -s /cluster/data/hg18/hg18.2bit /gbdb/hg18/hg18.2bit # now, loading only chr19: cd /cluster/data/hg18/bed/blastzMonDom2.2006-01-23/axtChain hgLoadChain hg18 chr19_chainMonDom2 chain/chr19.chain # while that is running, back on hgwdev, get the other chains loaded ssh hgwdev cd /cluster/data/hg18/bed/blastzMonDom2.2006-01-23/axtChain cp loadUp.csh loadUp.noChr19.csh # change the foreach line to eliminate the chr19.chain: diff loadUp.csh loadUp.noChr19.csh < foreach f (*.chain) --- > foreach f (`ls *.chain | grep -v chr19.chain`) # And then run that script time ./loadUp.noChr19.csh > load.noChr19.out 2>&1 # When the kolossus load finishes, email to push-request and ask # for the two tables to be pushed from kolossus to hgwdev: # chr19_chainMonDom2 # chr19_chainMonDom2Link # then, continuing: time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -continue=download -bigClusterHub=pk -chainMinScore=5000 \ -chainLinearGap=loose `pwd`/DEF > download.out 2>&1 & # real 2m42.505s ssh kolossus cd /cluster/data/hg18/bed/blastz.monDom2 time HGDB_CONF=~/.hg.conf.read-only featureBits \ hg18 chainMonDom2Link > fb.hg18.chainMonDom2Link 2>&1 # real 124m34.435s cat fb.hg18.chainMonDom2Link # 357258631 bases of 2881515245 (12.398%) in intersection # then, to swap ssh pk cd /cluster/data/hg18/bed/blastzMonDom2.2006-01-23 time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -swap -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \ `pwd`/DEF > swap.out 2>&1 & # running 2006-01-25 17:28 # real 51m27.447s # this swap failed at: # startStep: 4, at step 5 net to stopStep 9 # netChains: looks like previous stage was not successful # (can't find [monDom2.hg18.]all.chain[.gz]). # This failure does not make any sense. The end of swapChains # does an nfsNoodge on this file to verify it exists. # I don't understand why it wouldn't be in existence # as netChains starts up. 
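    # Before re-running with -continue=net it is worth checking by hand
    # whether the swapped chain file is really missing or just not yet
    # visible over NFS (illustrative check, not part of the original run;
    # the path is the one hgLoadChain uses below):
    ls -l /cluster/data/monDom2/bed/blastz.hg18.swap/axtChain/monDom2.hg18.all.chain.gz
    # present and non-empty suggests a stale NFS attribute cache, in which
    # case continuing from the net step is safe; absent or zero-length means
    # the swapChains step truly did not finish.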
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -swap -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \ -continue=net `pwd`/DEF > net-swap.out 2>&1 & # running 2006-01-26 09:28 # real 27m57.077s # This swap failed at the load chain: # startStep: 5, at step 6 load to stopStep 9 # # chmod a+x # # /cluster/data/monDom2/bed/blastz.hg18.swap/axtChain/loadUp.csh # # ssh -x hgwdev nice # # /cluster/data/monDom2/bed/blastz.hg18.swap/axtChain/loadUp.csh # cd /cluster/data/monDom2/bed/blastz.hg18.swap/axtChain # hgLoadChain -tIndex monDom2 chainHg18 monDom2.hg18.all.chain.gz # Out of memory needMem - request size 56 bytes # So, over to kolossus to give it a try: # There isn't any monDom2 db here yet, get it established with a # chromInfo and a 2bit sequence: hgsql -e "create database monDom2;" mysql cd /cluster/data/monDom2 hgsql monDom2 < $HOME/kent/src/hg/lib/chromInfo.sql hgsql monDom2 \ -e 'load data local infile "chromInfo.tab" into table chromInfo;' mkdir /gbdb/monDom2 ln -s /cluster/data/monDom2/monDom2.2bit /gbdb/monDom2/monDom2.2bit # now, loading into monDom2 cd /cluster/data/monDom2/bed/blastz.hg18.swap/axtChain time hgLoadChain -tIndex monDom2 chainHg18 monDom2.hg18.all.chain.gz \ > kolossus.load # running - 2006-01-26 ########################################################################## # test BLASTZ Opossum MonDom1 (DONE - 2006-01-30 - Hiram) # to see what happened with the blow up of data in monDom2 # ssh kk mkdir /cluster/data/hg18/bed/blastzMonDom1.2006-01-30 cd /cluster/data/hg18/bed/blastzMonDom1.2006-01-30 cat << '_EOF_' > DEF # human vs. opossum export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/i386:/parasol/bin BLASTZ=blastz.v7 # Specific settings for chicken (per Webb email to Brian Raney) BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=10000 BLASTZ_K=2200 BLASTZ_Q=/cluster/data/blastz/HoxD55.q BLASTZ_ABRIDGE_REPEATS=0 # TARGET: Human (hg18) SEQ1_DIR=/scratch/hg/hg18/nib SEQ1_LEN=/scratch/hg/hg18/chrom.sizes SEQ1_IN_CONTIGS=0 SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY: Opossum monDom1 SEQ2_DIR=/iscratch/i/monDom1/chunks SEQ2_LEN=/iscratch/i/monDom1/chrom.sizes SEQ2_IN_CONTIGS=1 SEQ2_CHUNK=10000000 SEQ2_LIMIT=100 SEQ2_LAP=0 BASE=/cluster/data/hg18/bed/blastzMonDom1.2006-01-30 TMPDIR=/scratch/tmp '_EOF_' # happy emacs time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=kk -chainMinScore=5000 -chainLinearGap=loose \ `pwd`/DEF > blastz.out 2>&1 & # started 2006-01-30 - 15:40 time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=kk -chainMinScore=5000 -chainLinearGap=loose \ -continue=cat -stop=load `pwd`/DEF > cat_load.out 2>&1 & time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \ -stop=net `pwd`/DEF > blastz.out 2>&1 & ############################################################################ ############################################################################ # STS MARKERS (STARTED 2006-01-27 Fan - DONE 2006-02-06 - Hiram) # FOR NEXT TIME - a lot of the perl scripts used in this process # need to be placed into the source tree and cleaned up to modern # perl warnings and strict standards. 
In particular, one script # was placed into the source tree this time: src/utils/findAccession.pl # update from NCBI ssh kkstore02 # use store11 for space mkdir -p /cluster/store11/sts.2006-01 ln -s /cluster/store11/sts.2006-01 /cluster/data/ncbi ln -s /cluster/data/ncbi/sts.2006-01 sts.10 cd /cluster/data/ncbi/sts.2006-01 wget --timestamping ftp://ftp.ncbi.nih.gov/repository/UniSTS/UniSTS.sts wget --timestamping ftp://ftp.ncbi.nih.gov/repository/UniSTS/UniSTS.aliases # old # wget --timestamping ftp://ftp.ncbi.nih.gov/repository/dbSTS/dbSTS.sts # wget --timestamping ftp://ftp.ncbi.nih.gov/repository/dbSTS/dbSTS.aliases wget --timestamping ftp://ftp.ncbi.nih.gov/blast/db/FASTA/sts.gz gunzip sts.gz mv sts dbSTS.fa # these items are copied in from the previous builds cp -p /cluster/data/ncbi/sts.9/all.STS.fa ./all.STS.fa.prev cp -p /cluster/data/ncbi/sts.9/stsInfo2.bed ./stsInfo2.bed.prev # Convert dbSTS.fa file to easier reading format, and get accessions /cluster/bin/scripts/convertGbFaFile dbSTS.fa > UniSTS.convert.fa grep ">" UniSTS.convert.fa | cut -f 2 -d ">" > UniSTS.acc # NOTE: updateStsInfo creates new stsInfo2.bed, all.primers, # all.STS.fa, stsAlias.bed files #### XXX - FOR NEXT TIME: need to fix something here for the #### XXX - broken symbol AFM067XA9 which has over 6,000 aliases. #### XXX - This isn't right #### hand-editted the record for AFM067XA9. KUHN/ARCHANA 10-08-2007 #### preserving the list of otherNames that showed up stsInfo2.otherNames for #### trueName=AFM067XA9 #### cp hg18.AFM067XA9.otherNames /cluster/data/hg18/bed/sts #### preserving the list of stsMarkers that showed up in stsAlias.alias #### in excess of those in the above file (10 k total) #### cp hg18.AFM067XA9.dropped.aliases /cluster/data/hg18/bed/sts updateStsInfo -verbose=1 -gb=UniSTS.acc stsInfo2.bed.prev all.STS.fa.prev \ UniSTS.sts UniSTS.aliases UniSTS.convert.fa new # 5610 MFD330 1000006 (0) not in dbSTS anymore # 5667 D3S4560 1000008 (0) not in dbSTS anymore # 5686 ATA92F01 1000007 (0) not in dbSTS anymore # 5945 MFD206 1000009 (0) not in dbSTS anymore # 6591 MFD311 1000011 (0) not in dbSTS anymore # 6841 MFD306 1000013 (0) not in dbSTS anymore # 6842 MFD310 1000012 (0) not in dbSTS anymore # 6844 MFD349 1000026 (0) not in dbSTS anymore # 7024 D12S2343 1000015 (0) not in dbSTS anymore # 7042 ATA73C05 1000014 (0) not in dbSTS anymore # 7226 MFD341 1000016 (0) not in dbSTS anymore # 7500 D17S2200 1000018 (0) not in dbSTS anymore # 7628 ATA92E03 1000020 (0) not in dbSTS anymore # 7642 GATA178F11 1000019 (0) not in dbSTS anymore # 7910 MFD338 1000022 (0) not in dbSTS anymore # 97723 GATA172D05 1000023 (0) not in dbSTS anymore # 205088 CPLA3610 1000000 (0) not in dbSTS anymore # 205089 COX_1935 1000001 (0) not in dbSTS anymore # 205090 24534CA2 1000002 (0) not in dbSTS anymore # 205091 D5S811 1000003 (0) not in dbSTS anymore # 205092 AC016604-5 1000004 (0) not in dbSTS anymore # 205093 CA-JAP-180 1000005 (0) not in dbSTS anymore # 205094 D10S1120 1000025 (0) not in dbSTS anymore # 205095 D21S2039 1000024 (0) not in dbSTS anymore # 205102 D12S1013 1000028 (0) not in dbSTS anymore mv new.info stsInfo2.bed mv new.primers all.primers mv new.alias stsAlias.bed mv new.fa all.STS.fa # get list of all STS id's in the fasta file sed -n 's/^>\([0-9][0-9]*\) .*/\1/p' all.STS.fa | sort -n > all.STS.id wc -l all.STS.id # 93698 total sequences /cluster/bin/scripts/convertPrimerToFA all.primers > all.primers.fa # Copy stsInfo2.bed and stsAlias.bed to data directory becuase # these will be loaded into the database later 
mkdir -p /cluster/data/hg18/bed/sts cp -p stsInfo2.bed /cluster/data/hg18/bed/sts/ cp -p stsAlias.bed /cluster/data/hg18/bed/sts/ # Create sts sequence alignments mkdir /san/sanvol1/scratch/hg18/sts mkdir /san/sanvol1/scratch/hg18/sts/split faSplit sequence all.STS.fa 200 /san/sanvol1/scratch/hg18/sts/split/sts cp -p all.STS.fa /san/sanvol1/scratch/hg18/sts ssh pk cd /cluster/data/hg18/bed/sts mkdir run cd run ls -1S /san/sanvol1/scratch/hg18/maskedContigs/*.fa > contigs.lst ls -1S /san/sanvol1/scratch/hg18/sts/split/sts*.fa > sts.lst mkdir /san/sanvol1/scratch/hg18/sts/out foreach f (`cat sts.lst`) set d = $f:t:r mkdir /san/sanvol1/scratch/hg18/sts/out/$d end # create alignments cat > template << '_EOF_' #LOOP /cluster/bin/x86_64/blat $(path1) $(path2) -ooc=/san/sanvol1/scratch/hg18/11.ooc -stepSize=5 {check out line+ /san/sanvol1/scratch/hg18/sts/out/$(root2)/$(root1).$(root2).psl} #ENDLOOP '_EOF_' # happy emacs gensub2 contigs.lst sts.lst template jobList para create jobList # 70686 jobs para try ... check ... push ... etc # Completed: 70686 of 70686 jobs # CPU time in finished jobs: 117490s 1958.16m 32.64h 1.36d 0.004 y # IO & Wait Time: 195274s 3254.57m 54.24h 2.26d 0.006 y # Average job time: 4s 0.07m 0.00h 0.00d # Longest finished job: 97s 1.62m 0.03h 0.00d # Submission to last job: 8085s 134.75m 2.25h 0.09d # Compile sts sequence results ssh pk cd /san/sanvol1/scratch/hg18/sts time pslSort dirs raw.psl temp out/sts* # real 8m50.714s # -rw-rw-r-- 1 810548945 Feb 3 14:19 raw.psl # 70686 files in 187 dirs # Got 70686 files 266 files per mid file rm -rf temp time pslReps -nearTop=0.0001 -minCover=0.6 -minAli=0.8 -noIntrons raw.psl \ stsMarkers.psl /dev/null # Processed 7252745 alignments # real 0m28.102s # -rw-rw-r-- 1 10981952 Feb 3 14:26 stsMarkers.psl cp -p stsMarkers.psl /cluster/data/hg18/bed/sts/run # Lift them and get them ready to combine with primer alignments liftUp -nohead stsMarkers.lifted.psl \ /cluster/data/hg18/jkStuff/liftContigs.lft \ warn stsMarkers.psl /cluster/bin/scripts/extractPslInfo stsMarkers.lifted.psl # creates stsMarkers.lifted.psl.initial wc stsMarkers.lifted.psl.initial # 93236 559416 4111801 stsMarkers.lifted.psl.initial $HOME/kent/src/utils/findAccession.pl -agp stsMarkers.lifted.psl.initial \ /cluster/data/hg18 wc stsMarkers.lifted.psl.initial.acc # 93236 652652 4947261 stsMarkers.lifted.psl.initial.acc sort -k4,4n stsMarkers.lifted.psl.initial.acc > stsMarkers.final # determine found markers (4th field in file) cut -f 4 stsMarkers.final | sort -n -u > stsMarkers.found wc -l stsMarkers.found # 90676 stsMarkers.found # out of 93698 total sequences # from wc /cluster/data/ncbi/sts.2006-01/all.STS.id) # extract sequences for markers not yet found, and # blat w/o ooc to try to place more comm -1 -3 stsMarkers.found /cluster/data/ncbi/sts.2006-01/all.STS.id \ > stsMarkers.notFound wc -l stsMarkers.notFound # 3022 stsMarkers.notFound faSomeRecords /san/sanvol1/scratch/hg18/sts/all.STS.fa stsMarkers.notFound \ notFound.STS.fa mkdir /san/sanvol1/scratch/hg18/sts/splitNotFound faSplit sequence notFound.STS.fa 20 \ /san/sanvol1/scratch/hg18/sts/splitNotFound/sts # blat with 11.ooc misses alignments, so reblat w/o the # sequences that aren't found # NOTE: filtering produces yield of only 101 markers placed (out of 3022). 
# not enough to justify this step next time ssh pk mkdir /cluster/data/hg18/bed/sts/run.noOoc cd /cluster/data/hg18/bed/sts/run.noOoc ls -1S /san/sanvol1/scratch/hg18/maskedContigs/*.fa > contigs.lst ls -1S /san/sanvol1/scratch/hg18/sts/splitNotFound/sts*.fa > sts.lst mkdir /san/sanvol1/scratch/hg18/sts/out.noOoc foreach f (`cat sts.lst`) set d = $f:t:r mkdir /san/sanvol1/scratch/hg18/sts/out.noOoc/$d end cat > template << '_EOF_' #LOOP /cluster/bin/x86_64/blat $(path1) $(path2) -stepSize=5 {check out line+ /san/sanvol1/scratch/hg18/sts/out.noOoc/$(root2)/$(root1).$(root2).psl} #ENDLOOP '_EOF_' # happy emacs gensub2 contigs.lst sts.lst template jobList para create jobList # 7182 jobs written to batch para try para check # process this set of alignments cd /san/sanvol1/scratch/hg18/sts pslSort dirs raw.noOoc.psl temp out.noOoc/* # -rw-rw-r-- 1 459858612 Feb 3 15:56 raw.noOoc.psl # Wow, that is almost half the size of the original raw.psl with # everything in it. rm -rf temp pslReps -nearTop=0.0001 -minCover=0.6 -minAli=0.8 -noIntrons \ raw.noOoc.psl stsMarkers.noOoc.psl /dev/null # Processed 4027664 alignments # Lift them and get them ready to combine with primer alignments liftUp -nohead stsMarkers.noOoc.lifted.psl \ /cluster/data/hg18/jkStuff/liftContigs.lft \ warn stsMarkers.noOoc.psl /cluster/bin/scripts/extractPslInfo stsMarkers.noOoc.lifted.psl # creates .initial $HOME/kent/src/utils/findAccession.pl -agp \ stsMarkers.noOoc.lifted.psl.initial /cluster/data/hg18 #rm stsMarkers.lifted.psl.initial mv stsMarkers.final stsMarkers.ooc.final sort -k4,4n stsMarkers.noOoc.lifted.psl.initial.acc > stsMarkers.extra sort -k4,4n stsMarkers.lifted.psl.initial.acc \ stsMarkers.noOoc.lifted.psl.initial.acc > stsMarkers.final # determine found markers (4th field in file) cut -f 4 stsMarkers.final | sort -n -u > stsMarkers.more.found wc -l stsMarkers.more.found # 90777 stsMarkers.found cut -f 4 stsMarkers.extra | sort -n -u > stsMarkers.extra.found wc -l stsMarkers.extra.found # 101 out of 3022 attempted # out of 93698 total sequences cp -p stsMarkers.final stsMarkers.lifted.psl \ stsMarkers.*lifted.psl.initial* stsMarkers.found \ /cluster/data/hg18/bed/sts # Alignments from noOoc set were not added to all_sts_seq but info for the # markers is in stsMap and stsInfo2. Some of the alignments are bad so # filter by removing all alignments from noOoc psl file where # tBaseInsert >=1000. Add the remaining alignments to the set of final # alignments for stsMarkers. The information for the removed markers # from the filtered set was also removed from stsMap and stsInfo2. ssh pk mkdir /cluster/data/hg18/bed/sts/fix cd /cluster/data/hg18/bed/sts/fix cp /san/sanvol1/scratch/hg18/sts/stsMarkers.noOoc.lifted.psl . 
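    # For reference, field 8 of a headerless PSL line is tBaseInsert, the
    # total number of target (genomic) bases inserted between aligned blocks,
    # which is what the awk filters below test as $8.  A one-pass tally of how
    # the 1000 base threshold splits the alignments (illustrative, not part of
    # the original run; the two counts should match the filt1000 and
    # filtToRemove line counts below):
    awk '{if ($8 < 1000) keep++; else drop++}
        END {printf "keep %d drop %d\n", keep+0, drop+0}' \
        stsMarkers.noOoc.lifted.psl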
    awk '{if ($8 < 1000) print}' stsMarkers.noOoc.lifted.psl \
        > stsMarkers.noOoc.lifted.filt1000.psl
    wc -l *.filt*.psl
    # 23 483 4206 stsMarkers.noOoc.lifted.filt1000.psl
    sort -k4,4n \
        /san/sanvol1/scratch/hg18/sts/stsMarkers.noOoc.lifted.psl.initial.acc \
        > stsMarkers.extra
    awk '{print $4}' stsMarkers.extra | sort -n | uniq > extra.ids
    # in psl file, the ids are the 10th field
    awk '{print $10}' stsMarkers.noOoc.lifted.psl | sort -n | uniq \
        > noOoc.ids
    diff extra.ids noOoc.ids
    # there is no difference as expected
    # get list of IDs from filtered file, filter < 1000
    awk '{print $10}' stsMarkers.noOoc.lifted.filt1000.psl \
        | sort -n | uniq > filt1000.ids
    for i in `cat filt1000.ids`
    do
        awk 'BEGIN {OFS="\t"} \
            {if ($4 == "'$i'") print $1, $2, $3, $4, $5, $6, $7}' \
            stsMarkers.extra >> stsMarkers.extra.filt1000
    done
    cp -p ../stsMarkers.final stsMarkers.final
    # need to filter stsMarkers.final not just cat this on the end
    # get list of alignments with tBaseInsert >= 1000 and remove these
    cd /cluster/data/hg18/bed/sts/fix
    awk '{if ($8 >= 1000) print;}' stsMarkers.noOoc.lifted.psl \
        > stsMarkers.noOoc.lifted.filtToRemove.psl
    wc -l *.filt*.psl
    # 23 stsMarkers.noOoc.lifted.filt1000.psl
    # 175 stsMarkers.noOoc.lifted.filtToRemove.psl
    # get list of IDs that need to be removed
    awk '{print $10;}' stsMarkers.noOoc.lifted.filtToRemove.psl | sort -n \
        | uniq > noOoc.IdsToRemove.txt
    # get chrom and co-ordinates for IDs to be removed
    awk 'BEGIN {OFS = "\t"} {print $14,$16,$17,$10}' \
        stsMarkers.noOoc.lifted.filtToRemove.psl | sort | uniq \
        > sts.noOoc.filtToRemove.coords
    # checked that the stsMarkers.final contain the noOoc alignments
    # use this perl script to remove lines with these IDs from stsMarkers.final
    cat << '_EOF_' > removeIds.pl
#!/usr/bin/env perl
use warnings;
use strict;

my $ids = $ARGV[0];
my $file = $ARGV[1];

# list of IDs with chrom and coords to remove
open(IDS, $ids) || die "Can not open $ids: $!\n";
# file for removal of IDs
open(FILE, $file) || die "Can not open $file: $!\n";
open(OUT, ">removed.txt") || die "Can not create removed.txt: $!\n";

my %idsHash;
while (<IDS>) {
    chomp;
    my @a = split(/\t/);
    my $chr = $a[0];
    my $st = $a[1];
    my $end = $a[2];
    my $id = $a[3];
    my $key = $id."_".$chr . "_" . $st . "_" . $end;
    $idsHash{$key}->{chrom} = $chr;
    $idsHash{$key}->{start} = $st;
    $idsHash{$key}->{end} = $end;
}
close IDS;

while (<FILE>) {
    chomp;
    my $l = $_;
    my $found = "FALSE";
    my @f = split(/\t/, $l);
    foreach my $k (keys(%idsHash)) {
        # if the id is contained in the key
        if ($k =~ /^$f[3]/) {
            my $c = $idsHash{$k}->{chrom};
            my $s = $idsHash{$k}->{start};
            my $e = $idsHash{$k}->{end};
            if ($f[0] eq $c && $f[1] == $s && $f[2] == $e) {
                print OUT "$c\t$s\t$e\t$f[3]\n";
                $found = "TRUE";
            }
        }
    }
    if ($found eq "FALSE") {
        print "$l\n";
    }
}
'_EOF_'
    chmod +x removeIds.pl
    ./removeIds.pl sts.noOoc.filtToRemove.coords stsMarkers.final \
        > stsMarkers.final.new
    wc -l stsMarkers.final*
    wc stsMarkers.final*
    # 93434 654038 4957784 stsMarkers.final
    # 93259 652813 4948484 stsMarkers.final.new
    # There are 175 ids and sets of co-ordinates in list of Ids to remove
    # 175 stsMarkers.noOoc.lifted.filtToRemove.psl
    # check that stsMarkers.final.new contains all the alignments that
    # are in filtered set: stsMarkers.noOoc.lifted.filt1000.psl
    awk 'BEGIN {OFS = "\t"} {print $14,$16,$17,$10}' \
        stsMarkers.noOoc.lifted.filt1000.psl | sort | uniq \
        > sts.noOoc.filt1000.coords
    awk 'BEGIN {OFS = "\t"} {print $1,$2,$3,$4}' \
        stsMarkers.final.new | sort | uniq \
        > sts.finalnew.coords
    diff sts.finalnew.coords sts.noOoc.filt1000.coords > finalnewvsfilt1000
    grep '>' finalnewvsfilt1000
    # there is nothing in sts.noOoc.filt1000.coords not found in the
    # sts.finalnew.coords file therefore this contains all the alignments
    # from the filtered noOoc file.
    cp ../primers/primers.final .
    awk '{print $4}' stsMarkers.final.new | sort | uniq > stsfinal.new.ids

    # primers
    ssh eieio
    cd /cluster/data/ncbi/sts.10
    # strip out N's and wobbles (KS) from primers, as isPcr
    # can't currently handle them
    # strip out primers < 10 as isPcr can't handle them
    awk '$0 !~ /[^ACGT0-9\-\t]/ && (length($2) > 10) && (length($3) > 10) {printf "dbSTS_%s\t%s\t%s\n", $1,$2,$3}' \
        all.primers > all.primers.ispcr
    mkdir -p /san/sanvol1/scratch/hg18/sts.10/primers
    cd /san/sanvol1/scratch/hg18/sts.10/primers
    split -l 4000 /cluster/data/ncbi/sts.10/all.primers.ispcr primers_
    ssh pk
    mkdir /cluster/data/hg18/bed/sts/primers
    cd /cluster/data/hg18/bed/sts/primers
    mkdir run
    cd run
    ls -1S /san/sanvol1/scratch/hg18/maskedContigs/*.fa > contigs.lst
    ls -1S /san/sanvol1/scratch/hg18/sts.10/primers/primers_* > primers.lst
    mkdir /san/sanvol1/scratch/hg18/sts.10/primers/out
    cat > template << '_EOF_'
#LOOP
/cluster/bin/x86_64/isPcr -out=psl -minPerfect=2 -maxSize=5000 -tileSize=10 -ooc=/san/sanvol1/scratch/hg18/10ooc/$(root1).10.ooc -stepSize=5 $(path1) $(path2) {check out line /san/sanvol1/scratch/hg18/sts.10/primers/out/$(root1)_$(root2).psl}
#ENDLOOP
'_EOF_'
    # happy emacs
    gensub2 contigs.lst primers.lst template jobList
    para create jobList
    # 29106 jobs
    para try ... check ... push ... etc ...
# Completed: 29106 of 29106 jobs # CPU time in finished jobs: 658245s 10970.76m 182.85h 7.62d 0.021 y # IO & Wait Time: 82764s 1379.39m 22.99h 0.96d 0.003 y # Average job time: 25s 0.42m 0.01h 0.00d # Longest finished job: 534s 8.90m 0.15h 0.01d # Submission to last job: 2282s 38.03m 0.63h 0.03d # Filter output file quickly based on simple parameters ssh pk cd /san/sanvol1/scratch/hg18/sts.10/primers mkdir filter pslQuickFilter -minMatch=26 -maxMismatch=5 \ -maxTinsert=5000 -verbose out/ filter/ # Note: there will be many messages saying files are empty - this is OK time pslSort dirs ../primers.psl.unlifted temp filter # Got 29106 files 171 files per mid file # real 3m31.401s # filter primer alignments and create not found primer file for ePCR run cd /san/sanvol1/scratch/hg18/sts.10 pslFilterPrimers primers.psl.unlifted \ /cluster/data/ncbi/sts.10/all.primers primers.filter.unlifted.psl # creates primers.filter.unlifted.psl.notfound.primers wc -l primers.filter.unlifted.psl.notfound.primers # 22943 primers.filter.unlifted.psl.notfound.primers # use Greg Schuler's ePCR to attempt alignment of primers missed # by isPcr ssh pk mkdir /san/sanvol1/scratch/hg18/sts.10/epcr mkdir /san/sanvol1/scratch/hg18/sts.10/epcr/out cd /san/sanvol1/scratch/hg18/sts.10/epcr split -l 3000 ../primers.filter.unlifted.psl.notfound.primers primers_ mkdir /cluster/data/hg18/bed/sts/primers/run.epcr cd /cluster/data/hg18/bed/sts/primers/run.epcr ls -1S /san/sanvol1/scratch/hg18/sts.10/epcr/primers_* > primers.lst # These jobs are going to go quickly, make sure all I/O comes and # goes from something that can handle it. ls -1S /san/sanvol1/scratch/hg18/maskedContigs/*.fa > contig.lst # This runEpcr64 script was made from the existing runEpcr script # and from the looks of it, I doubt the original script works in # the way this was set up here. It appears to be reading the # second argument $(path2) line by line and sending that as # arguments to e-PCR. That wouldn't be right here. cat > template << '_EOF_' #LOOP /cluster/bin/scripts/runEpcr64 $(path1) $(path2) {check out line /san/sanvol1/scratch/hg18/sts.10/epcr/out/$(root1).$(root2).epcr} #ENDLOOP '_EOF_' # << emacs gensub2 primers.lst contig.lst template jobList para create jobList # 3420 jobs para try ... check ... push ... etc ... # Completed: 3024 of 3024 jobs # CPU time in finished jobs: 31802s 530.04m 8.83h 0.37d 0.001 y # IO & Wait Time: 12804s 213.40m 3.56h 0.15d 0.000 y # Average job time: 15s 0.25m 0.00h 0.00d # Longest finished job: 193s 3.22m 0.05h 0.00d # Submission to last job: 372s 6.20m 0.10h 0.00d # merge output ssh pk cd /cluster/bluearc/hg17/sts/primers/epcr cd /san/sanvol1/scratch/hg18/sts.10/epcr cat out/*.epcr > all.epcr wc -l all.epcr # 3792 # should be on the fileserver (kkstore02) for the following heavy # I/O operations. Didn't do that here, was on pk instead. 
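    # To see how the 3792 lines of all.epcr are spread across the per-job
    # outputs (illustrative only, not part of the original run):
    ls out/*.epcr | wc -l                        # output files produced by the batch
    find out -name '*.epcr' -size +0c | wc -l    # outputs that actually contain hits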
    # use all.epcr file to re-filter alignments and determine which
    # ePCR records to keep
    cp all.epcr /cluster/data/hg18/bed/sts/primers
    cd /cluster/data/hg18/bed/sts/primers
    pslFilterPrimers -epcr=all.epcr -verbose=1 \
        /san/sanvol1/scratch/hg18/sts.10/primers.psl.unlifted \
        /cluster/data/ncbi/sts.10/all.primers primers.unlifted.epcr.psl
    # creates three files:
    # -rw-rw-r-- 1 148528 Feb 6 10:39 epcr.not.found
    # -rw-rw-r-- 1 51632003 Feb 6 10:39 primers.unlifted.epcr.psl
    # -rw-rw-r-- 1 1189756 Feb 6 10:39 primers.unlifted.epcr.psl.notfound.primers
    # convert to PSL and combine with other psl file
    time /cluster/bin/scripts/epcrToHgPsl epcr.not.found \
        /cluster/data/ncbi/sts.10/all.primers /cluster/data/hg18
    # real 81m24.041s (on pk, may have been better on kkstore02
    # where all of the data is)
    cat primers.unlifted.epcr.psl epcr.not.found.psl \
        | sort -k 10n > primers.final.unlifted.psl
    wc -l primers.final.unlifted.psl
    # 454869 primers.final.unlifted.psl
    # should have been on kkstore02 already
    ssh kkstore02
    cd /cluster/data/hg18/bed/sts/primers
    # Fix the query gap lengths so that they match the all.primers.fa
    # file lengths
    time /cluster/bin/scripts/fixPrimersQueryGaps \
        /cluster/data/ncbi/sts.10/all.primers primers.final.unlifted.psl \
        > primers.final.unlifted.fix.psl
    # real 0m19.814s
    wc -l primers.final.unlifted.fix.psl
    # 454869 primers.final.unlifted.fix.psl
    # lift results from contigs to chrom coordinates, and create final file
    time liftUp -nohead primers.psl \
        /cluster/data/hg18/jkStuff/liftContigs.lft warn \
        primers.final.unlifted.fix.psl
    # real 0m2.897s
    wc -l primers.psl
    # 454869 primers.psl
    # Extract relevant info, make alignments unique, and create final file to
    # be merged with full sequence alignments
    time /cluster/bin/scripts/extractPslInfo primers.psl
    # real 0m15.303s
    wc -l primers.psl.initial
    # 451023 primers.psl.initial
    $HOME/kent/src/utils/findAccession.pl -agp primers.psl.initial \
        /cluster/data/hg18
    wc -l primers.psl.initial.acc
    # 451023 primers.psl.initial.acc
    /cluster/bin/scripts/getStsId /cluster/data/hg18/bed/sts/stsInfo2.bed \
        primers.psl.initial.acc \
        | sort -k 4n > primers.final
    #rm primers.psl.initial.acc
    wc -l primers.final
    # 451023 primers.final
    # There doesn't appear to be any use for this primers.ids list
    # except for curiosity. Check the head and tail of this list to
    # verify no garbage is in here. There should just be numbers.
awk '{print $4}' primers.final | sort -n | uniq > primers.ids wc -l primers.ids # 287465 primers.ids # Merge primer and sequence files to create final bed file # Merge (combineSeqPrimerPos) takes about an hour to run ssh kkstore02 cd /cluster/data/hg18/bed/sts time /cluster/bin/scripts/combineSeqPrimerPos stsMarkers.final \ primers/primers.final # real 55m33.254so wc -l stsMarkers_pos.rdb # 307082 stsMarkers_pos.rdb time /cluster/bin/scripts/createSTSbed \ /cluster/data/ncbi/sts.10/stsInfo2.bed stsMarkers_pos.rdb > stsMap.bed # real 0m13.351s wc -l stsMap.bed # 300492 stsMap.bed # Set up sequence files ssh hgwdev mkdir /gbdb/hg18/sts.10/ ln -s /cluster/data/ncbi/sts.10/all.STS.fa /gbdb/hg18/sts.10/all.STS.fa ln -s /cluster/data/ncbi/sts.10/all.primers.fa \ /gbdb/hg18/sts.10/all.primers.fa # Load all files cd /cluster/data/hg18/bed/sts hgLoadSeq hg18 /gbdb/hg18/sts.10/all.STS.fa /gbdb/hg18/sts.10/all.primers.fa # Advisory lock created # Creating .tab file # Adding /gbdb/hg18/sts.10/all.STS.fa # 93698 sequences # Adding /gbdb/hg18/sts.10/all.primers.fa # 306885 sequences # Updating seq table # Advisory lock has been released # All done # real 1m25.459s hgsql hg18 < $HOME/kent/src/hg/lib/stsInfo2.sql hgsql hg18 < $HOME/kent/src/hg/lib/stsAlias.sql # these two files are already here from previous operations above # cp /cluster/data/ncbi/sts.10/{stsInfo2.bed,stsAlias.bed} . hgsql hg18 -e 'load data local infile "stsInfo2.bed" into table stsInfo2' hgsql hg18 -e 'load data local infile "stsAlias.bed" into table stsAlias' # a couple minutes for each load above hgLoadBed -notItemRgb -noBin -tab \ -sqlTable=$HOME/kent/src/hg/lib/stsMap.sql \ hg18 stsMap stsMap.bed hgLoadPsl -nobin -table=all_sts_primer hg18 primers/primers.psl # load of all_sts_primer did not go as planned: 454869 record(s), # 0 row(s) skipped, 10 warning(s) loading primers/primers.psl hgLoadPsl -nobin -table=all_sts_seq hg18 stsMarkers.lifted.psl # PRUNE stsMap RECORDS (DONE 3/3/06) hgsql hg18 -e 'delete from stsMap where chromEnd-chromStart > 5000' ########################################################################### # CREATE HAPLOTYPEPOS TRACK (DONE 1/31/06, Fan) ssh kkstore02 cd /cluster/data/hg18/bed mkdir haplotypePos cd haplotypePos cp /cluster/data/hg18/*hap*/*.fa . -p ls *.fa|sed -e 's/chr/split1 chr/' |sed -e 's/.fa//' >splitAll cat << '_EOF_' > split1 echo processing $1 faSplit2 -lift=$1.lft -overlap=500 size $1.fa 3500 split/$1 '_EOF_' chmod +x split* mkdir split mkdir result splitAll ls ./split/*.fa > split.lst cat << '_EOF_' > gsub #LOOP /cluster/store11/gs.19/build36/bed/haplotypePos/hblat1 $(file1) {check out line+ /cluster/store11/gs.19/build36/bed/haplotypePos/result/$(root1).psl} #ENDLOOP '_EOF_' gensub2 split.lst single gsub jobList ssh pk cd /cluster/data/hg18/bed/haplotypePos mkdir result para create jobList para try, push, check ... # Completed: 3091 of 3092 jobs # Crashed: 1 jobs # CPU time in finished jobs: 33164s 552.73m 9.21h 0.38d 0.001 y # IO & Wait Time: 172783s 2879.72m 48.00h 2.00d 0.005 y # Average job time: 67s 1.11m 0.02h 0.00d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 300s 5.00m 0.08h 0.00d # Submission to last job: 743s 12.38m 0.21h 0.01d # The single job that crashed was due to chr5_h2_hap1368.fa, which # does not have a decent alignment on chr5. 
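    # To confirm which split piece the single crashed job corresponds to
    # before collecting results, list the split FASTAs that have no usable
    # PSL in result/ (illustrative bash check, not part of the original run;
    # assuming the hblat1 wrapper writes nothing for a failed piece, this
    # should report only the chr5_h2_hap1368 piece noted above):
    bash
    for f in split/*.fa
    do
        b=`basename $f .fa`
        test -s result/$b.psl || echo "no alignments for $b"
    done
    exit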
# collect BLAT results cat result/*.psl >all.psl # keep the main alignments pslReps -nohead -minCover=0.80 -minAli=0.80 -nearTop=0.002 all.psl all_filtered.psl all.psr cat chr*.lft > hap.lft liftUp lifted.psl hap.lft warn all_filtered.psl -pslQ mkdir tNibs qNibs cp -p /cluster/data/hg18/nib/*hap*.nib qNibs cp -p /cluster/data/hg18/nib/chr5.nib tNibs cp -p /cluster/data/hg18/nib/chr6.nib tNibs cp -p /cluster/data/hg18/nib/chr22.nib tNibs axtChain -psl -linearGap=medium lifted.psl tNibs qNibs out.chain chainAntiRepeat tNibs qNibs out.chain final.chain cat << '_EOF_' > hap.chrom.lis /cluster/data/hg18/nib/chr5.nib /cluster/data/hg18/nib/chr6.nib /cluster/data/hg18/nib/chr22.nib '_EOF_' ls *.fa >q.lis chainToPsl final.chain /cluster/data/hg18/chrom.sizes \ /cluster/data/hg18/chrom.sizes hap.chrom.lis q.lis haplotypePos.psl # took about 20 minutes hgLoadPsl hg18 haplotypePos.psl # add haplotypePos entry in trackDb.ra ########################################################################### # LOAD AFFYRATIO (DONE - 2006-02-01 - Fan) # Copied from Hg17 doc # NOTE: Jim recommends that, in the future, all AFFY blat alignments should drop # -mask=lower for blat and drop -minIdentity=95 to -minIdentity=90 as the # higher minIdentity is causing alignments to be dropped that should not be. # e.g. # /cluster/bin/i386/blat -fine -minIdentity=90 -ooc=/cluster/bluearc/hg18/11.ooc $(path1) $(path2) {check out line+ psl/$(root1)_$(root2).psl} # pslReps can be used to handle filtering at a later step. Blat's minIdentity # seems to be more severe than that for pslReps as it takes insertions and # deletions into account. # # NOTE FROM QA (brooke, 8/28/07): In the future, run hgLoadBed without the # -sqlTable=$HOME/src/hg/lib/affyRatio.sql option, so that tableDescriptions # will be built properly. affyRatio.sql was needed before Jim added bed15 # capability to hgLoadBed (in Oct. 2003), but now bed15 tables can use the # default bedExp.as and bedExp.sql files. # # Set up cluster job to align consenesus/exemplars to hg18 ssh kkstore02 mkdir /cluster/bluearc/hg18/affyGnf cp -p /projects/compbio/data/microarray/affyGnf/sequences/HG-U95/HG-U95Av2_all.fa \ /cluster/bluearc/hg18/affyGnf ssh kkr1u00 mkdir -p /iscratch/i/affyGnf cp -p /cluster/bluearc/hg18/affyGnf/* /iscratch/i/affyGnf /cluster/bin/iSync ssh kki mkdir /cluster/data/hg18/bed/affyGnf.2004-06-09 cd /cluster/data/hg18/bed/affyGnf.2004-06-09 ls -1 /iscratch/i/affyGnf/* > affy.lst ls -1 /iscratch/i/gs.19/build36/maskedContigs/* > allctg.lst cat << '_EOF_' > template.sub #LOOP /cluster/bin/i386/blat -fine -mask=lower -minIdentity=95 -ooc=/cluster/bluearc/hg18/11.ooc $(path1) $(path2) {check out line+ psl/$(root1)_$(root2).psl} #ENDLOOP '_EOF_' # << this line makes emacs coloring happy gensub2 allctg.lst affy.lst template.sub jobList mkdir psl para create jobList para try, push, check # Completed: 378 of 378 jobs # CPU time in finished jobs: 3055s 50.91m 0.85h 0.04d 0.000 y # IO & Wait Time: 1267s 21.12m 0.35h 0.01d 0.000 y # Average job time: 11s 0.19m 0.00h 0.00d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 78s 1.30m 0.02h 0.00d # Submission to last job: 367s 6.12m 0.10h 0.00d # Do sort, best in genome filter, and convert to chromosome coordinates # to create affyU95.psl ssh kkstore02 cd /cluster/data/hg18/bed/affyGnf.2004-06-09 pslSort dirs raw.psl tmp psl # change filter parameters for these sequences. only use alignments that # cover 30% of sequence and have at least 95% identity in aligned # region. # minAli = 0.97 too high. 
low minCover as a lot of n's in these # sequences pslReps -minCover=0.3 -minAli=0.95 -nearTop=0.005 raw.psl contig.psl /dev/null liftUp affyU95.psl ../../jkStuff/liftAll.lft warn contig.psl # Eliminate the long names sed -e "s/U95Av2://" affyU95.psl | sed -e "s/;//" > affyU95shortQname.psl # Merge with spot data and load into database. added -chip flag to # affyPslAndAtlasToBed to allow correct parsing ssh hgwdev cd /cluster/data/hg18/bed/affyGnf.2004-06-09 bash /cluster/home/sugnet/bin/i386/affyPslAndAtlasToBed -chip=U95Av2 \ affyU95shortQname.psl \ /projects/compbio/data/microarray/affyGnf/human_atlas_U95_gnf.noquotes.txt \ affyRatio.bed affyRatio.exr > affyPslAndAtlasToBed.log 2>&1 hgLoadBed -sqlTable=$HOME/src/hg/lib/affyRatio.sql hg18 \ affyRatio affyRatio.bed # Loaded 13043 elements of size 15 mkdir affyU95 hgLoadPsl hg18 -table=affyU95 affyU95shortQname.psl # sequences loaded 2006-02-1 hgLoadSeq -abbr=U95Av2: hg18 /gbdb/hgFixed/affyProbes/HG-U95Av2_all.fa # Advisory lock created # Creating .tab file # Adding /gbdb/hgFixed/affyProbes/HG-U95Av2_all.fa # 12386 sequences # Updating seq table # Advisory lock has been released # All done # Load AFFYUCLANORM, extended version of affyUcla track. Hopefully # final freeze of data set. (DONE - 2006-02-01 - Fan) ssh hgwdev mkdir /cluster/data/hg18/bed/affyUclaNorm cd /cluster/data/hg18/bed/affyUclaNorm cp -p /projects/compbio/data/microarray/affyUcla/sequences/HG-U133AB_all.fa . ssh pk cd /cluster/data/hg18/bed/affyUclaNorm ls -1 /scratch/hg/gs.19/build36/maskedContigs/* > contig.lst cat << '_EOF_' > gsub #LOOP /cluster/bin/x86_64/blat -fine -mask=lower -minIdentity=95 -ooc=/scratch/hg/h/11.ooc $(path1) $(path2) {check out line+ psl/$(root1)_$(root2).psl} #ENDLOOP '_EOF_' # << keep emacs happy mkdir psl ls HG-U133AB_all.fa > affy.lst gensub2 contig.lst affy.lst gsub jobList para create jobList para try para check para push ... 
etc # Completed: 378 of 378 jobs # CPU time in finished jobs: 6766s 112.77m 1.88h 0.08d 0.000 y # IO & Wait Time: 1541s 25.68m 0.43h 0.02d 0.000 y # Average job time: 22s 0.37m 0.01h 0.00d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 202s 3.37m 0.06h 0.00d # Submission to last job: 302s 5.03m 0.08h 0.00d ssh kkstore02 cd /cluster/data/hg18/bed/affyUclaNorm pslSort dirs hg18.affyU133AB_all.psl tmp psl wc hg18.affyU133AB_all.psl # 62043 1302842 13163424 hg18.affyU133AB_all.psl liftUp hg18.affyU133AB_all.lifted.psl \ /cluster/data/hg18/jkStuff/liftAll.lft warn hg18.affyU133AB_all.psl pslReps -minCover=0.5 -sizeMatters -minAli=0.97 \ -nearTop=0.005 hg18.affyU133AB_all.lifted.psl \ hg18.affyU133AB_all.lifted.pslReps.psl out.psr # Processed 62038 alignments ~/kent/src/hg/affyGnf/affyUclaMergePslData \ -pslFile=hg18.affyU133AB_all.lifted.pslReps.psl \ -affyFile=/projects/compbio/data/microarray/affyUcla/data/feature_biomaterial_chip_logratios_formatForTrack.txt \ -bedOut=hg18.affyUcla.bed \ -expRecordOut=hg18.affyUcla.expRecords \ -expFile=/projects/compbio/data/microarray/affyUcla/data/expNames.sorted.txt ~/kent/src/hg/affyGnf/addUclaAnnotations.pl hg18.affyUcla.expRecords \ /projects/compbio/data/microarray/affyUcla/data/normal_tissue_database_annotations2.txt > hg18.affyUcla.annotations.expRecords # Load the databases ssh hgwdev cd /cluster/data/hg18/bed/affyUclaNorm sed -e 's/affyRatio/affyUclaNorm/' ~/kent/src/hg/lib/affyRatio.sql > affyUclaNorm.sql hgLoadBed hg18 affyUclaNorm hg18.affyUcla.bed -sqlTable=affyUclaNorm.sql ############################################################################ # MAKE AFFY U133 - made after above affyUclaNorm (DONE - 2006-02-01 - Fan) # Someday the names can be fixed. ssh hgwdev mkdir /cluster/data/hg18/bed/affyU133 cd /cluster/data/hg18/bed/affyU133 ln -s ../affyUclaNorm/hg18.affyU133AB_all.lifted.pslReps.psl affyU133.psl hgLoadPsl hg18 affyU133.psl hgsql -e "select count(*) from affyU133;" hg18 # row count in hg17: 44620, in hg18: 45559 hgLoadSeq hg18 /gbdb/hgFixed/affyProbes/HG-U133AB_all.fa # 44792 sequences # GNF ATLAS 2 (DONE - 2006-02-01 - Fan) # Align probes from GNF1H chip. ssh pk cd /cluster/data/hg18/bed mkdir -p geneAtlas2/run/psl cd geneAtlas2/run # This bluearc/geneAtlas2 directory already exists # mkdir -p /cluster/bluearc/geneAtlas2 # cp /projects/compbio/data/microarray/geneAtlas2/human/gnf1h.fa /cluster/bluearc/geneAtlas2 ls -1 /scratch/hg/gs.19/build36/maskedContigs > genome.lst ls -1 /cluster/bluearc/geneAtlas2/gnf1h.fa > mrna.lst cat << '_EOF_' > gsub #LOOP /cluster/bin/x86_64/blat -fine -ooc=/scratch/hg/h/11.ooc /scratch/hg/gs.19/build36/maskedContigs/$(path1) $(path2) {check out line+ psl/$(root1)_$(root2).psl} #ENDLOOP '_EOF_' # << this line makes emacs coloring happy gensub2 genome.lst mrna.lst gsub jobList para create jobList para try para check para push para time # Completed: 378 of 378 jobs # CPU time in finished jobs: 4038s 67.29m 1.12h 0.05d 0.000 y # IO & Wait Time: 2182s 36.37m 0.61h 0.03d 0.000 y # Average job time: 16s 0.27m 0.00h 0.00d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 250s 4.17m 0.07h 0.00d # Submission to last job: 322s 5.37m 0.09h 0.00d # Estimated complete: 0s 0.00m 0.00h 0.00d # Do sort, best in genome filter, and convert to chromosome coordinates # to create gnf1h.psl. 
pslSort dirs raw.psl tmp psl pslReps -minCover=0.3 -minAli=0.95 -nearTop=0.005 raw.psl contig.psl /dev/null # Processed 79733 alignments liftUp ../affyGnf1h.psl ../../../jkStuff/liftAll.lft warn contig.psl rm -r contig.psl raw.psl psl # Load probes and alignments from GNF1H into database. ssh hgwdev cd /cluster/data/hg18/bed/geneAtlas2 # Already symlinked # ln -s /projects/compbio/data/microarray/geneAtlas2/human/gnf1h.fa \ # /gbdb/hgFixed/affyProbes hgLoadPsl hg18 affyGnf1h.psl hgLoadSeq hg18 /gbdb/hgFixed/affyProbes/gnf1h.fa grep -v U133B ../affyUclaNorm/hg18.affyU133AB_all.lifted.pslReps.psl \ | sed -e "s/exemplar://; s/consensus://; s/U133A://" \ | sed -e "s/;//" > affyU133A.psl hgMapMicroarray gnfAtlas2.bed hgFixed.gnfHumanAtlas2MedianRatio \ affyU133A.psl /cluster/data/hg18/bed/geneAtlas2/affyGnf1h.psl # Loaded 44696 rows of expression data from hgFixed.gnfHumanAtlas2MedianRatio # Mapped 32926, multiply-mapped 2000, missed 48, unmapped 11770 hgLoadBed hg18 gnfAtlas2 gnfAtlas2.bed # Loaded 34926 elements of size 15 ######################################################################## # Creating the ideoband data track (DONE - 2006-02-02 - Hiram) # This was reloaded upon completion of the cytoband sequence # mentioned above. # Received the following files in email from Wonhee Jang from NCBI: # -rw-rw-r-- 1 1917 Feb 2 14:01 setBands.txt # -rw-rw-r-- 1 39058 Feb 2 14:01 human_ideogram.dat # -rw-rw-r-- 1 673148 Feb 2 14:01 fish.markers.bed # placed them into /cluster/data/hg18/bed/ideogram ssh hgwdev mkdir /cluster/data/hg18/bed/ideogram cd /cluster/data/hg18/bed/ideogram cat << '_EOF_' > mkBands.sh #!/bin/sh T=/cluster/data/hg18/bed/ideogram HI=${T}/human_ideogram.dat FM=${T}/fish.markers.bed SB=${T}/setBands.txt bander chr1 ${HI} ${FM} ${SB} 1 247199719 100 2.0 2 bander chr2 ${HI} ${FM} ${SB} 2 242751149 100 2.0 2 bander chr3 ${HI} ${FM} ${SB} 3 199446827 100 2.0 2 bander chr4 ${HI} ${FM} ${SB} 4 191263063 100 2.0 2 bander chr5 ${HI} ${FM} ${SB} 5 180837866 100 2.0 2 bander chr6 ${HI} ${FM} ${SB} 6 170896992 100 2.0 2 bander chr7 ${HI} ${FM} ${SB} 7 158821424 100 2.0 2 bander chr8 ${HI} ${FM} ${SB} 8 146274826 100 2.0 2 bander chr9 ${HI} ${FM} ${SB} 9 140273252 100 2.0 2 bander chr10 ${HI} ${FM} ${SB} 10 135374737 100 2.0 2 bander chr11 ${HI} ${FM} ${SB} 11 134452384 100 2.0 2 bander chr12 ${HI} ${FM} ${SB} 12 132289534 100 2.0 2 bander chr13 ${HI} ${FM} ${SB} 13 114127980 100 2.0 2 bander chr14 ${HI} ${FM} ${SB} 14 106360585 100 2.0 2 bander chr15 ${HI} ${FM} ${SB} 15 100338915 100 2.0 2 bander chr16 ${HI} ${FM} ${SB} 16 88822254 100 2.0 2 bander chr17 ${HI} ${FM} ${SB} 17 78654742 100 2.0 2 bander chr18 ${HI} ${FM} ${SB} 18 76117153 100 2.0 2 bander chr19 ${HI} ${FM} ${SB} 19 63806651 100 2.0 2 bander chr20 ${HI} ${FM} ${SB} 20 62435964 100 2.0 2 bander chr21 ${HI} ${FM} ${SB} 21 46944323 100 2.0 2 bander chr22 ${HI} ${FM} ${SB} 22 49591432 100 2.0 2 bander chrX ${HI} ${FM} ${SB} X 154913754 100 2.0 2 bander chrY ${HI} ${FM} ${SB} Y 57443437 100 2.0 2 for I in 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 X Y do cat chr${I}.bed done > cytobands.bed '_EOF_' # happy emacs chmod +x mkBands.sh ./mkBands.sh # should be 862 wc cytobands.bed # 862 4310 29911 cytobands.bed hgLoadBed -noBin -tab -sqlTable=${HOME}/src/hg/lib/cytoBand.sql \ hg18 cytoBand cytobands.bed hgLoadBed -noBin -tab -sqlTable=${HOME}/src/hg/lib/cytoBandIdeo.sql \ hg18 cytoBandIdeo cytobands.bed ############################################################################ # H-INVITATIONAL GENE ANNOTATION 
DATABASE (DONE 2006-0202, Fan) # http://www.jbirc.aist.go.jp/hinv/top.html # Create knownGene table to reference HINV gene ID's # for link on knownGenes details page # Also, create an HINV gene track # download CDNA file release 2.2 (Jan 20, 2006) -- got release # from downloads page). ssh kkstore03 cd /cluster/data/hinv mkdir 2005-02-02 cd 2005-02-02 wget --timestamp http://www.jbirc.aist.go.jp/hinv/download/alldata/flatfile/FCDNA.gz gunzip FCDNA.gz mv FCDNA FCDNA.2.2 # set up assembly work area ssh kkstore02 cd /cluster/data/hg18 mkdir -p bed/hinv cd bed/hinv # extract H-INV ID's and Genbank accessions of mRNAs awk '/CDNA_ACCESSION-NO:/ {print $2}' < /cluster/data/hinv/2005-02-02/FCDNA.2.2 > accessions.txt awk '/CDNA_H-INVITATIONAL-ID:/ {print $2}' < /cluster/data/hinv/2005-02-02/FCDNA.2.2 > ids.txt paste accessions.txt ids.txt > queries.txt wc -l ids.txt # 56419 ids.txt # create PSL file from alignments for these mRNA's, extracted from the # table of all aligned mRNA's ssh hgwdev cd /cluster/data/hg18/bed/hinv hgsql hg18 -s -e "SELECT * FROM all_mrna" | cut -f 2- > all_mrna.tab ssh kkstore02 cd /cluster/data/hg18/bed/hinv pslReps /dev/null stdout /dev/null | cat - all_mrna.tab > all_mrna.psl # using pslReps to generate the PSL file header pslSelect -queryPairs=queries.txt all_mrna.psl hinv_mrna.psl # NEXT TIME, LOAD HInvGeneMrna TABLE AFTER HInv TABLE IS LOADED TO AVOID # joinerCheck TO COMPLAIN. # load track of mrna alignments ssh hgwdev cd /cluster/data/hg18/bed/hinv hgLoadPsl hg18 -table=HInvGeneMrna hinv_mrna.psl hgsql hg18 -s -e \ "select distinct(qName) from HInvGeneMrna order by qName" > hg18.mrna hgsql hg17 -s -e \ "select distinct(qName) from HInvGeneMrna order by qName" > hg17.mrna wc -l hg*.mrna # 41023 hg17.mrna # 54974 hg18.mrna comm -1 -3 *.mrna > hg18.aligned wc -l hg18.aligned # 14758 (transcripts newly aligned in hg18) comm -2 -3 *.mrna > hg17.aligned wc -l hg17.aligned # 807 (transcripts no longer aligned in hg18) comm -2 -3 ids.txt hg18.mrna > hg18.notaligned wc -l hg18.notaligned # 1445 (transcripts not aligned in hg18 -- checking on why...) # also make a table with various useful items for each transcript ssh hgwdev hgsql hg18 < ~/kent/src/hg/lib/HInv.sql cd /cluster/data/hg18/bed/hinv /cluster/data/hinv/hinvToTable.pl < /cluster/data/hinv/2005-02-02/FCDNA.2.2 > HInv.tab echo 'load data local infile "HInv.tab" into table HInv' | hgsql hg18 hgsql hg17 -s -e "select count(*) from HInv" # 41118 hgsql hg18 -s -e "select count(*) from HInv" # 56419 # !!! DO THIS AFTER KG IS BUILD !!! # DONE (4/13/06 Fan). # create table for knownGenes detail page ssh hgwdev cd /cluster/data/hg18/bed/hinv hgMapToGene hg18 HInvGeneMrna knownGene knownToHInv # QA NOTE (3-6-2006): did a mytouch to update the time for the HInvGeneMrna table # (because joinerCheck was complaining during -times check): # sudo mytouch hg18 HInvGeneMrna 200602031600.00 # touch -t 200602031600.00 /var/lib/mysql/hg18/HInvGeneMrna.MYD # PRODUCE FUGU BLAT ALIGNMENT (DONE - 2006-02-02 - Fan) ssh kk mkdir /cluster/data/hg18/bed/blatFr1 cd /cluster/data/hg18/bed/blatFr1 mkdir psl # next time, use N?_?????? 
(to pick up NG_ contigs) foreach f ( `cat /cluster/data/hg18/contig.lst` ) set c=$f:t:r echo $c mkdir psl/$c end # create cluster job mkdir run cd run ls -1S /iscratch/i/fugu/trfFa/*.fa > fugu.lst ls -1S /scratch/hg/gs.19/build36/maskedContigs/*.fa > human.lst cat << 'EOF' > gsub #LOOP /cluster/bin/i386/blat -mask=lower -qMask=lower -q=dnax -t=dnax {check in line+ $(path1)} {check in line+ $(path2)} {check out line+ /cluster/data/hg18/bed/blatFr1/psl/$(root1)/$(root1)_$(root2).psl} #ENDLOOP 'EOF' # << keep emacs happy gensub2 human.lst fugu.lst gsub jobList para create jobList # 218484 jobs written to batch para try para check para push -maxQueue=300000 -maxPush=220000 para check # Completed: 218484 of 218484 jobs # CPU time in finished jobs: 5073329s 84555.48m 1409.26h 58.72d 0.161 y # IO & Wait Time: 692572s 11542.87m 192.38h 8.02d 0.022 y # Average job time: 26s 0.44m 0.01h 0.00d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 910s 15.17m 0.25h 0.01d # Submission to last job: 14753s 245.88m 4.10h 0.17d # cd ../psl # count files with aligments # find . -not -size 427c | wc -l # 44458 # count files with no aligments # find . -size 427c | wc -l # 174405 # When cluster run is done, sort alignments # into chrom directory ssh kkstore02 cd /cluster/data/hg18/bed/blatFr1 pslCat -dir psl/N?_?????? | \ liftUp -type=.psl stdout \ /cluster/data/hg18/jkStuff/liftAll.lft warn stdin | \ pslSortAcc nohead chrom temp stdin # Processed 218887 lines into 1 temp files # Rename to correspond with tables and load into database: ssh hgwdev cd /cluster/data/hg18/bed/blatFr1/chrom foreach i (chr*.psl) set r = $i:r echo mv $i ${r}_blatFr1.psl mv $i ${r}_blatFr1.psl end # lift fugu scaffolds to Fugu browser chrUn, # so you can link to other browser. And don't need to load sequence cd /cluster/data/hg18/bed/blatFr1 liftUp -pslQ all.psl /cluster/data/fr1/fugu_v3.masked.lft warn chrom/*.psl hgLoadPsl -table=blatFr1 hg18 all.psl nice featureBits hg18 blatFr1 refGene:CDS # 14636876 bases of 2881515245 (0.508%) in intersection nice featureBits hg17 blatFr1 refGene:CDS # 14488047 bases of 2866216770 (0.505%) in intersection ####################################################################### # OPOSSUM BLASTZ - (DONE - 2006-02-10 - Hiram) ssh kk # this was done again after this, see 2006-02-13 mkdir /cluster/data/hg18/bed/blastzMonDom4.2006-02-10 cd /cluster/data/hg18/bed/blastzMonDom4.2006-02-10 cat << '_EOF_' > DEF # human vs. 
opossum export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/i386:/parasol/bin BLASTZ=blastz.v7 # settings for more distant organism alignments BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=10000 BLASTZ_K=2200 BLASTZ_M=50 BLASTZ_Q=/cluster/data/blastz/HoxD55.q BLASTZ_ABRIDGE_REPEATS=0 # TARGET: Human (hg18) SEQ1_DIR=/scratch/hg/hg18/nib SEQ1_LEN=/scratch/hg/hg18/chrom.sizes SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY: Opossum monDom4 SEQ2_DIR=/iscratch/i/monDom4/monDom4.2bit SEQ2_LEN=/iscratch/i/monDom4/chrom.sizes SEQ2_CHUNK=30000000 SEQ2_LAP=0 BASE=/cluster/data/hg18/bed/blastzMonDom4.2006-02-10 TMPDIR=/scratch/tmp '_EOF_' # happy emacs time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=kk -chainMinScore=5000 -chainLinearGap=loose \ -stop=net `pwd`/DEF > blastz.out 2>&1 & # running 2006-02-10 # Completed: 43469 of 43470 jobs # Crashed: 1 jobs # CPU time in finished jobs: 25745592s 429093.20m 7151.55h 297.98d 0.816 y # IO & Wait Time: 8466642s 141110.70m 2351.85h 97.99d 0.268 y # Average job time: 787s 13.12m 0.22h 0.01d # Longest finished job: 51561s 859.35m 14.32h 0.60d # Submission to last job: 103470s 1724.50m 28.74h 1.20d # There wasn't actually an outstanding job, it had been completed. # Completed: 345 of 345 jobs # CPU time in finished jobs: 620s 10.33m 0.17h 0.01d 0.000 y # IO & Wait Time: 1631s 27.19m 0.45h 0.02d 0.000 y # Average job time: 7s 0.11m 0.00h 0.00d # Longest finished job: 69s 1.15m 0.02h 0.00d # Submission to last job: 255s 4.25m 0.07h 0.00d # Completed: 49 of 49 jobs # CPU time in finished jobs: 224697s 3744.94m 62.42h 2.60d 0.007 y # IO & Wait Time: 4790s 79.84m 1.33h 0.06d 0.000 y # Average job time: 4683s 78.06m 1.30h 0.05d # Longest finished job: 115041s 1917.35m 31.96h 1.33d # Submission to last job: 115147s 1919.12m 31.99h 1.33d time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=kk -chainMinScore=5000 -chainLinearGap=loose \ -continue=cat -stop=net `pwd`/DEF > cat-net.out 2>&1 & # running 2006-02-11 time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=kk -chainMinScore=5000 -chainLinearGap=loose \ -continue=load -stop=load `pwd`/DEF > load.out 2>&1 & ssh kolossus cd /cluster/data/hg18/bed/blastz.monDom4 time nice -n +19 featureBits hg18 chainMonDom4Link \ > fb.hg18.chainMonDom4Link 2>&1 & cat fb.hg18.chainMonDom4Link # 356865888 bases of 2881515245 (12.385%) in intersection #################################################################################### # BUILD KNOWN GENES TABLES (STARTED 2/1/06, DONE 2/13/06 Fan) # First build protein databases, sp060115 and proteins060115 # See makeProteins060115.doc for details. 
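    # Before setting up the KG work area, confirm that those protein
    # databases are present and populated on hgwdev (illustrative check,
    # not part of the original log):
    hgsql -N -e 'show databases like "sp060115"' mysql
    hgsql -N -e 'show databases like "proteins060115"' mysql
    hgsql -N sp060115 -e 'select count(*) from protein'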
# Create working subdirectories and temporary databases (kgHg18A) ssh hgwdev cd /cluster/store11/kg mkdir kgHg18A ln -s /cluster/store11/kg/kgHg18A /cluster/store6/kgDB/bed/kgHg18A ln -s /cluster/store11/kg/kgHg18A /cluster/data/hg18/bed/kgHg18A hgsql hg18 -e "create database kgHg18A" hgsql hg18 -e "create database kgHg18ATemp" mkdir /cluster/bluearc/kgDB/kgHg18A mkdir /cluster/bluearc/kgDB/kgHg18A/protBlat ln -s /cluster/bluearc/kgDB/kgHg18A/protBlat /cluster/store11/kg/kgHg18A/protBlat cd /cluster/store11/kg/kgHg18A/protBlat # Get all human protein sequences hgsql -N sp060115 -e \ 'select p.acc, p.val from protein p, accToTaxon x where x.taxon=9606 and p.acc=x.acc'\ |awk '{print ">" $1;print $2}' >humanProt.fa hgsql -N sp060115 -e \ 'select v.varAcc, p.val from varAcc v, protein p, accToTaxon x where v.parAcc = p.acc and x.taxon=9606 and v.parAcc=x.acc'\ |awk '{print ">" $1;print $2}' \ >humanVarProt.fa # append var proteins to humanProt.fa cat humanVarProt.fa >>humanProt.fa # Prepare and perform cluster run for protein/genome alignment ssh pk cd /cluster/data/hg18/bed/kgHg18A/protBlat mkdir prot faSplit sequence humanProt.fa 2000 prot/prot ls /cluster/bluearc/kgDB/kgHg18A/protBlat/prot/* > prot.lis ssh hgwdev cd /cluster/data/hg18/bed/kgHg18A/protBlat hgsql hg18 -N -e 'select chrom from chromInfo' > chrom.lis exit cat << '_EOF_' > gsub #LOOP /cluster/bin/x86_64/blat -t=dnax -q=prot /cluster/data/hg18/nib/$(path1).nib $(path2) {check out line+ /cluster/bluearc/kgDB/kgHg18A/protBlat/result/$(root1)_$(root2).psl} #ENDLOOP '_EOF_' mkdir result gensub2 chrom.lis prot.lis gsub jobList para create jobList para try para check para push para check ... # Completed: 97020 of 97020 jobs # CPU time in finished jobs: 16070335s 267838.92m 4463.98h 186.00d 0.510 y # IO & Wait Time: 279789s 4663.15m 77.72h 3.24d 0.009 y # Average job time: 169s 2.81m 0.05h 0.00d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 152051s 2534.18m 42.24h 1.76d # Submission to last job: 152235s 2537.25m 42.29h 1.76d # This cluster run took a little less than 2 days. # collect BLAT results pslSort -nohead dirs raw.psl temp result pslReps -nohead -minCover=0.80 -minAli=0.80 -nearTop=0.002 raw.psl protBlat.psl /dev/null ssh hgwdev cd /cluster/bluearc/kgDB/kgHg18A/protBlat hgLoadPsl hg18 protBlat.psl # create all_mrna.psl and tight_mrna.psl hgsql hg18 -N -e "select * from all_mrna" |cut -f 2-22 >all_mrna.psl pslReps -minCover=0.40 -minAli=0.97 -nearTop=0.002 \ all_mrna.psl tight_mrna.psl /dev/null # Save a copy of the following hg18 tables: all_mrna gbCdnaInfo gbExtFile gbLoaded gbSeq gbStatus genbank.lis refFlat refGene refLink refSeqAli refSeqStatus refSeqSummary xenoMrna xenoRefFlat xenoRefGene xenoRefSeqAli # Use overlapSelect to get protein and mRNA alignment overlaps overlapSelect -statsOutput -dropped=protOut.psl -overlapThreshold=0.90 \ -selectFmt=psl -inFmt=psl tight_mrna.psl protBlat.psl protMrna.stat overlapSelect -mergeOutput -dropped=protOut.psl -overlapThreshold=0.90 -selectFmt=psl \ -inFmt=psl tight_mrna.psl protBlat.psl protMrna.out # Create protein/mRNA pair and protein lists cut -f 10,31 protMrna.out|sort -u >spMrna.tab cut -f 10 protMrna.out|sort -u >protein.lis mv protein.lis .. # Load spMrna.tab into spMrna table in temp DB. hgsql kgHg18ATemp < ~/src/hg/lib/spMrna.sql hgsql kgHg18ATemp -e 'load data local infile "spMrna.tab" into table spMrna' hgsql kgHg18ATemp -e 'create index mrnaID on spMrna(mrnaID)' # Prepare and perform cluster run of protein/mRNA alignment # Get mRNA fa file. 
cd /cluster/data/hg18/bed/kgHg18A /cluster/data/genbank/bin/i386/gbGetSeqs -native -db=hg18 \ -gbRoot=/cluster/data/genbank genbank mrna mrna.fa # Create mrnaSeq table in kgHg18ATemp DB. faToTab mrna.fa mrnaSeq.tab hgsql kgHg18ATemp -e 'drop table mrnaSeq' hgsql kgHg18ATemp <~/src/hg/lib/mrnaSeq.sql hgsql kgHg18ATemp -e 'load data local infile "mrnaSeq.tab" into table mrnaSeq' # Prepare files for cluster run cd /cluster/bluearc/kgDB/kgHg18A ~/src/hg/protein/KG2B.sh kgHg18A hg18 060115 # Perform cluster run of protein/mRNA alignment ~/src/hg/protein/KG3.sh kgHg18A hg18 060115 # Collect cluster run results cd kgBestMrna ls out | sed -e 's/prot/do1 prot/g' >doall # create do1 with the following 2 lines: cat << '_EOF_' > do1 echo processing $1 cat out/$1/*.out >>protMrnaRaw.psl '_EOF_' chmod +x do* doall # Filter out low quality alignments pslReps -nohead -singleHit -minAli=0.9 protMrnaRaw.psl protMrnaBlat.psl /dev/null cut -f 10,14 protMrnaBlat.psl |sort -u >protMrna.lis wc protMrna.lis # Load BLAT results into temp DB. ssh hgwdev cd /cluster/store11/kg/kgHg18A/kgBestMrna hgsql kgHg18ATemp < ~/src/hg/lib/protMrnaBlat.sql hgsql kgHg18ATemp -e 'load data local infile "protMrnaBlat.psl" into table protMrnaBlat' hgsql kgHg18ATemp -e 'create index tName on protMrnaBlat(tName)' # Create CDS files from protein/mRNA alignment results. hgsql kgHg18ATemp -N -e \ 'select qName,"_",tName,tStart+1,":",tEnd+3 from protMrnaBlat order by qName,tName,tEnd-tStart desc'\ |sed 's/\t_\t/_/g'|sed 's/\t:\t/../g' >protMrna.cds # Create protMrna.psl with proteinID_mrnaID as query ID. cut -f 22-30 ../protBlat/protMrna.out > j1.tmp cut -f 32-42 ../protBlat/protMrna.out > j2.tmp cut -f 10,31 ../protBlat/protMrna.out|sed -e 's/\t/_/g' >j3.tmp paste j1.tmp j3.tmp j2.tmp >protMrna.psl rm j1.tmp j2.tmp j3.tmp # Run mrnaToGene to create protMrna.gp bash mrnaToGene -cdsFile=protMrna.cds protMrna.psl protMrna.gp 2>protMrna.err >protMrna.log exit # move kgBestMrna to /san/sanvol1 to save space on store11 mv /cluster/store11/kg/kgHg18A/kgBestMrna/clusterRun /san/sanvol1/scratch/fan/hg18/kgHg18A/kgBestMrna ln -s /san/sanvol1/scratch/fan/hg18/kgHg18A/kgBestMrna/clusterRun \ /cluster/store11/kg/kgHg18A/kgBestMrna/clusterRun # Prepare refGene and all_mrna gp files. cd .. cp -p base/refGene.tab ref.gp # hgsql hg18 -N -e 'select * from refGene' >ref.gp hgsql hg18 -N -e \ 'select gbCdnaInfo.acc,cds.name from gbCdnaInfo,cds,all_mrna where all_mrna.qName=gbCdnaInfo.acc and gbCdnaInfo.cds=cds.id' \ |sort -u > all_mrna.cds cat base/all_mrna.tab |cut -f 2-22 >all_mrna.psl bash mrnaToGene -cdsFile=all_mrna.cds all_mrna.psl all_mrna.gp 2>all_mrna.err > all_mrna.log exit # Align proteins to RefSeq. overlapSelect -inCds -statsOutput -overlapThreshold=0.90 -selectFmt=psl -inFmt=genePred\ protBlat/protBlat.psl ref.gp ref.stat overlapSelect -inCds -dropped=refOut1.gp -overlapThreshold=0.90 -selectFmt=psl -inFmt=genePred\ protBlat/protBlat.psl ref.gp protRef.gp overlapSelect -mergeOutput -selectCds -dropped=protOut1.psl -overlapThreshold=0.80 -inFmt=psl\ -selectFmt=genePred ref.gp protBlat/protBlat.psl protRef.out cut -f 10,22 protRef.out | sort -u >spRef.tab cut -f 10 protRef.out | sort -u >protRef.lis hgsql kgHg18ATemp -e 'drop table spRef' hgsql kgHg18ATemp <~/src/hg/lib/spRef.sql hgsql kgHg18ATemp -e 'load data local infile "spRef.tab" into table spRef' # Prepare and perform cluster runs for protein/RefSeq alignments ~/src/hg/protein/KGRef2.sh kgHg18A hg18 060115 # Took 7 hours. This step should be investigated and improved. 
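# (Illustration only, added -- the accessions below are made up.  The SQL/sed
# pipeline that created protMrna.cds above writes one line per protein_mRNA
# pair in the "start..end" CDS form that mrnaToGene -cdsFile expects, e.g.
#   P04637_BC003596	91..1272
# where 91 is tStart+1 and 1272 is tEnd+3 from the protMrnaBlat alignment.)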
~/src/hg/protein/KGRef3.sh kgHg18A hg18 060115 cd kgBestRef ls out | sed -e 's/prot/do1 prot/g' >doall cat << '_EOF_' > do1 echo processing $1 cat out/$1/*.out >>protRefRaw.psl '_EOF_' chmod +x do* doall # Filter out low quality alignments. pslReps -nohead -singleHit -minAli=0.9 protRefRaw.psl protRefBlat.psl /dev/null cut -f 10,14 protRefBlat.psl |sort -u >protRef.lis wc protRef.lis hgsql kgHg18ATemp -e 'drop table protRefBlat' hgsql kgHg18ATemp < ~/src/hg/lib/protRefBlat.sql hgsql kgHg18ATemp -e 'load data local infile "protRefBlat.psl" into table protRefBlat' hgsql kgHg18ATemp -e 'create index tName on protRefBlat(tName)' # Run gene-check to filter out invalid gp entries cd /cluster/data/hg18/bed/kgHg18A cat ref.gp kgBestMrna/protMrna.gp all_mrna.gp >kgCandidate0.gp gene-check -incl-ok -ok-genepred-out kgCandidate0.passed.gp -nib-dir /cluster/data/hg18/nib kgCandidate0.gp kgCandidate0.check hgsql kgHg18ATemp -e 'drop table kgCandidate0' hgsql kgHg18ATemp < ~/src/hg/lib/kgCandidate0.sql hgsql kgHg18ATemp -e 'load data local infile "kgCandidate0.gp" into table kgCandidate0' hgsql kgHg18ATemp -e 'drop table geneCheck' hgsql kgHg18ATemp < ~/src/hg/lib/geneCheck.sql hgsql kgHg18ATemp -e 'load data local infile "kgCandidate0.check" into table geneCheck ignore 2 lines' # Run kgCheck to get all KG candidates that pass the KG gene check criteria kgCheck kgHg18ATemp hg18 kgCandidate0 geneCheck kgCandidate.tab hgsql kgHg18ATemp -e 'drop table kgCandidate' hgsql kgHg18ATemp < ~/src/hg/lib/kgCandidate.sql hgsql kgHg18ATemp -e 'load data local infile "kgCandidate.tab" into table kgCandidate' hgsql kgHg18ATemp -e 'create index alignID on kgCandidate(alignID)' # ####### NEXT TIME AROUND PUT IN AN EXTRA STEP TO BRING IN ITEMS ON A "PUT BACK" LIST # FOR SPECIAL CASES LIKE SELENOCYSTEINE, NON-AUG INITIATION CODON, RIBOSOMAL SLIPPAGE, ETC. # ####### # Construct the kgCandidateX table that has alignID in the name field. cut -f 2-10 kgCandidate.tab >j2.tmp cut -f 11 kgCandidate.tab >j1.tmp paste j1.tmp j2.tmp >kgCandidateX.tab hgsql kgHg18ATemp -e 'drop table kgCandidateX' hgsql kgHg18ATemp < ~/src/hg/lib/kgCandidateX.sql hgsql kgHg18ATemp -e 'load data local infile "kgCandidateX.tab" into table kgCandidateX' # Score protein/mRna and protein/RefSeq alignments ln -s protBlat/protein.lis protein.lis kgResultBestMrna2 060115 kgHg18ATemp hg18 protMrnaBlat|sort -u >protMrnaBlatScore.tab kgResultBestRef2 060115 kgHg18ATemp hg18 protRefBlat|sort -u >protRefScore.tab # Combine scoring results and load them into temp DB. cat protMrnaBlatScore.tab protRefScore.tab >protMrnaScore.tab hgsql kgHg18ATemp -e 'drop table protMrnaScore' hgsql kgHg18ATemp < ~/src/hg/lib/protMrnaScore.sql hgsql kgHg18ATemp -e 'load data local infile "protMrnaScore.tab" into table protMrnaScore' hgsql kgHg18ATemp -e 'create index mrnaAcc on protMrnaScore(mrnaAcc)' # Run kgGetCds to get CDS structure of each gene kgGetCds kgHg18ATemp 060115 kgCandidateX jY.tmp cat jY.tmp |sort -u >kgCandidateY.tab rm jY.tmp hgsql kgHg18ATemp -e 'drop table kgCandidateY' hgsql kgHg18ATemp < ~/src/hg/lib/kgCandidateY.sql hgsql kgHg18ATemp -e 'load data local infile "kgCandidateY.tab" into table kgCandidateY' # Run kgPickPrep to replace long cds structure string with cdsId. 
kgPickPrep kgHg18ATemp kgCandidateZ.tab
hgsql kgHg18ATemp -e 'drop table kgCandidateZ'
hgsql kgHg18ATemp < ~/src/hg/lib/kgCandidateZ.sql
hgsql kgHg18ATemp -e 'load data local infile "kgCandidateZ.tab" into table kgCandidateZ'
hgsql kgHg18ATemp -e 'create index cdsId on kgCandidateZ(cdsId)'

# Run kgPick to pick a representative mRNA/protein pair for each unique CDS structure.
kgPick kgHg18ATemp hg18 sp060115 kg3.tmp dupSpMrna.tmp
sort -u dupSpMrna.tmp >dupSpMrna.tab

# Create put back list
# gbGetSeqs2, a modified version of gbGetSeqs, outputs the RefSeq IDs at the beginning of each output line.
gbGetSeqs2 -gbRoot=/cluster/data/genbank -db=hg18 -get=ra RefSeq mrna ref.ra
cat ref.ra | sed -e 's/ /\t/' | sort -u >refRa.tab

hgsql hg18 -e 'drop table refRa'
hgsql hg18 < ~/src/hg/lib/refRa.sql
hgsql hg18 -e 'load data local infile "refRa.tab" into table refRa ignore 1 lines'

hgsql hg18 -N -e \
'select r.acc, r.attr, r.val from refRa r, refRa r2, refRa r3 where r.attr="selenocysteine" and r.acc=r2.acc and r2.attr="rss" and r2.val="rev" and r3.acc=r.acc and r3.attr="org" and r3.val="Homo sapiens"' \
>kgPutBack2.tab

hgsql hg18 -N -e \
'select r.acc, r.attr, r.val from refRa r, refRa r2, refRa r3 where r.attr="cno" and r.val like "%ribosomal frameshift%" and r.acc=r2.acc and r2.attr="rss" and r2.val="rev" and r3.acc=r.acc and r3.attr="org" and r3.val="Homo sapiens"' \
>>kgPutBack2.tab

hgsql hg18 -N -e \
'select r.acc, r.attr, r.val from refRa r, refRa r2, refRa r3 where r.attr="cno" and r.val like "%non-AUG%" and r.acc=r2.acc and r2.attr="rss" and r2.val="rev" and r3.acc=r.acc and r3.attr="org" and r3.val="Homo sapiens"' \
>>kgPutBack2.tab

hgsql hg18 -N -e \
'select r.acc, r.attr, r.val from refRa r, refRa r2, refRa r3 where r.attr="translExcept" and r.acc=r2.acc and r2.attr="rss" and r2.val="rev" and r3.acc=r.acc and r3.attr="org" and r3.val="Homo sapiens"' \
>>kgPutBack2.tab

hgsql hg18 -N -e \
'select r.acc, r.attr, r.val from refRa r, refRa r2, refRa r3 where r.attr="exception" and r.acc=r2.acc and r2.attr="rss" and r2.val="rev" and r3.acc=r.acc and r3.attr="org" and r3.val="Homo sapiens"' \
>>kgPutBack2.tab

hgsql kgHg18ATemp -e 'drop table kgPutBack2'
hgsql kgHg18ATemp < ~/src/hg/lib/kgPutBack2.sql
hgsql kgHg18ATemp -e 'load data local infile "kgPutBack2.tab" into table kgPutBack2'

kgPutBack kgHg18ATemp hg18 sp060115 kgPutBack2 kgPutBack2.gp
# No matching protein found for NM_201397.
# No matching protein found for NM_203341.
# No matching protein found for NM_213593.
# No matching protein found for NM_052987.
# No matching protein found for NM_201397.
# No matching protein found for NM_203341.
# No matching protein found for NM_213593.

# Sort KG genes to make the kg4.gp table file.
cat kgPutBack2.gp kg3.tmp > kg4.tmp
~/kent/src/hg/protein/sortKg.pl kg4.tmp >knownGene.tab

hgsql kgHg18ATemp -e 'drop table knownGene'
hgsql kgHg18ATemp < ~/src/hg/lib/knownGene.sql
hgsql kgHg18ATemp -e 'load data local infile "knownGene.tab" into table knownGene'

# Load data into hg18 knownGene table.
hgsql hg18 -e 'drop table knownGene'
hgsql hg18 < ~/src/hg/lib/knownGene.sql
hgsql hg18 -e 'load data local infile "knownGene.tab" into table knownGene'
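# (Clarifying note, added, on the five put-back queries above: r, r2 and r3 are
# three aliases of the same refRa attribute table.  Each query keeps a RefSeq
# accession only if that same accession also carries rss = "rev" -- apparently
# the RefSeq status field, so the list is restricted to reviewed entries --
# and org = "Homo sapiens".)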
# Load dupSpMrna table after knownGene table is loaded so that joinerCheck does not complain.
hgsql hg18 -e 'drop table dupSpMrna'
hgsql hg18 < ~/src/hg/lib/dupSpMrna.sql
hgsql hg18 -e 'load data local infile "dupSpMrna.tab" into table dupSpMrna'

# Perform analysis on KG
# Build knownGeneMrna and knownGenePep tables.
kgPepMrna kgHg18ATemp hg18 060115
hgsql hg18 -e 'drop table knownGeneMrna'
hgsql hg18 < ~/src/hg/lib/knownGeneMrna.sql
hgsql hg18 -e 'load data local infile "knownGeneMrna.tab" into table knownGeneMrna'
hgsql hg18 -e 'drop table knownGenePep'
hgsql hg18 < ~/src/hg/lib/knownGenePep.sql
hgsql hg18 -e 'load data local infile "knownGenePep.tab" into table knownGenePep'

# Build kgXref table
kgXref2 kgHg18ATemp 060115 hg18
hgsql hg18 -e 'drop table kgXref'
hgsql hg18 < ~/src/hg/lib/kgXref.sql
hgsql hg18 -e 'load data local infile "kgXref.tab" into table kgXref'

# Build spMrna table
hgsql hg18 -N -e 'select proteinID, name from knownGene' >kgSpMrna.tab
hgsql hg18 -e 'drop table spMrna'
hgsql hg18 <~/src/hg/lib/spMrna.sql
hgsql hg18 -e 'load data local infile "kgSpMrna.tab" into table spMrna'

# Build kgProtMap table
~/src/hg/protein/kgProtMap2.sh kgHg18A hg18 060115
# Found that the kgProtMap table had fewer than 20,000 rows,
# indicating that a lot of entries were missing.  The problem was
# that tight_mrna.psl was now located in ~/hg18Kg/protBlat.
# Manually ran the following to correct the problem:
cd ~/hg18Kg/kgProtMap/psl.tmp
cat ~/hg18Kg/protBlat/tight_mrna.psl refSeqAli.psl > both.psl
pslMap kgProtMrna.psl both.psl stdout | sort -u| \
sort -k 14,14 -k 16,16n -k 17,17n > kgProtMap.psl
hgsql hg18 -e "drop table kgProtMap;"
hgLoadPsl -tNameIx hg18 kgProtMap.psl

#####################################
# Build alias tables.
kgAliasM hg18 proteins060115

# kgAliasKgXref reads from hg18.knownGene.proteinID,
# hg18.knownGene.name, hg18.kgXref.geneSymbol
# to create kgAliasKgXref.tab
kgAliasKgXref hg18

# kgAliasRefseq reads from hg18.knownGene.name,
# hg18.knownGene.proteinID, hg18.kgXref.refseq
# to create kgAliasRefseq.tab
kgAliasRefseq hg18

hgsql sp060115 -N -e 'select name,gene.val from hg18.knownGene,displayId,gene where displayId.val=proteinID and displayId.acc=gene.acc' \
| sort -u > kgAliasP.tab

hgsql hg18 -N -e 'select name, name from knownGene' >kgAliasDup.tab
hgsql hg18 -N -e 'select mrnaID, dupMrnaID from dupSpMrna' >>kgAliasDup.tab

cat kgAliasM.tab kgAliasRefseq.tab kgAliasKgXref.tab kgAliasP.tab kgAliasDup.tab| \
sort |uniq > kgAlias.tab

hgsql -e "drop table kgAlias;" hg18
hgsql hg18 < ~/kent/src/hg/lib/kgAlias.sql
hgsql hg18 -e 'LOAD DATA local INFILE "kgAlias.tab" into table kgAlias'

# kgProtAlias reads from hg18.knownGene.name,
# hg18.knownGene.proteinID, hg18.knownGene.alignID,
# proteins060115.spXref3.accession, proteins060115.spSecondaryID, proteins060115.pdbSP.pdb
# to create kgProtAlias.tab
kgProtAlias hg18 060115

hgsql hg18 -N -e \
'select kgID, spDisplayID, protAcc from kgXref where protAcc != ""'\
| sort -u >kgProtAliasNCBI.tab

# include variant splice protein IDs
hgsql hg18 -N -e \
'select name, proteinID, parAcc from knownGene,sp060115.varAcc where varAcc=proteinID'\
|sort -u >kgProtAliasDup.tab

# include duplicate protein IDs from dupSpMrna table
hgsql hg18 -N -e \
'select name, knownGene.proteinID, dupProteinID from knownGene, dupSpMrna where name=mrnaID'\
|sort -u >>kgProtAliasDup.tab

# catch parent acc from dupProteinID too
hgsql hg18 -N -e\
'select name, knownGene.proteinID, parAcc from knownGene,dupSpMrna,sp060115.varAcc where name=mrnaID and dupProteinID=varAcc.varAcc'\
|sort -u >>kgProtAliasDup.tab

cat kgProtAliasNCBI.tab kgProtAlias.tab kgProtAliasDup.tab |
sort -u > kgProtAliasAll.tab echo "`date` creating table kgProtAlias" hgsql hg18 -e "drop table kgProtAlias;" hgsql hg18 <~/src/hg/lib/kgProtAlias.sql; hgsql hg18 -e 'LOAD DATA local INFILE "kgProtAliasAll.tab" into table kgProtAlias;' # Build kgSpAlias table hgsql hg18 -e \ 'select kgXref.kgID, spID, alias from kgXref, kgAlias where kgXref.kgID=kgAlias.kgID' >j.tmp hgsql hg18 -e \ 'select kgXref.kgID, spID, alias from kgXref, kgProtAlias where kgXref.kgID=kgProtAlias.kgID'\ >>j.tmp cat j.tmp|sort -u |grep -v 'kgID' >hg18.kgSpAlias.tab rm j.tmp hgsql hg18 -e 'drop table kgSpAlias'; hgsql hg18 < ~/src/hg/lib/kgSpAlias.sql hgsql hg18 -e 'load data local infile "hg18.kgSpAlias.tab" into table kgSpAlias' # QA NOTE (3-6-2006): did a mytouch to update the time for the knownGene table # (because joinerCheck was complaining during -times check): # [hgwdev:~/joiner> sudo mytouch hg18 knownGene 200602061707 # touch -t 200602061707 /var/lib/mysql/hg18/knownGene.MYD # MAKE FOLDUTR TABLES (DONE 2006-02-09, Fan) # First set up directory structure and extract UTR sequence on hgwdev ssh hgwdev cd /cluster/data/hg18/bed mkdir rnaStruct.2006-02-09 rm rnaStruct ln -s rnaStruct.2006-02-09 rnaStruct cd rnaStruct mkdir -p utr3/split utr5/split utr3/fold utr5/fold utrFa hg18 knownGene utr3 utr3/utr.fa utrFa hg18 knownGene utr5 utr5/utr.fa # Split up files and make files that define job. ssh pk cd /cluster/data/hg18/bed/rnaStruct faSplit sequence utr3/utr.fa 50000 utr3/split/s faSplit sequence utr5/utr.fa 50000 utr5/split/s ls -1 utr3/split > utr3/in.lst ls -1 utr5/split > utr5/in.lst cd utr3 cat > gsub < cgapBIOCARTAdescSorted.tab hgsql hg18 -e "drop table cgapAlias" hgsql hg18 -e "drop table cgapBiocDesc" hgsql hg18 -e "drop table cgapBiocPathway" hgsql hg18 <~/src/hg/lib/cgapAlias.sql hgsql hg18 <~/src/hg/lib/cgapBiocDesc.sql hgsql hg18 <~/src/hg/lib/cgapBiocPathway.sql hgsql hg18 -e 'load data local infile "cgapAlias.tab" \ into table cgapAlias' hgsql hg18 -e 'load data local infile "cgapBIOCARTAdescSorted.tab" into table cgapBiocDesc' hgsql hg18 -e 'load data local infile "cgapBIOCARTA.tab" into table cgapBiocPathway' # Build hg18 PROTEOME BROWSER TABLES # These are instructions for building tables # needed for the Proteome Browser. # DON'T START THESE UNTIL TABLES FOR KNOWN GENES AND kgProtMap table # ARE REBUILT. # This build is based on proteins DBs dated 060115. 
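# (Optional pre-flight check, added -- not in the original log.  Since this
# section depends on the rebuilt Known Genes and kgProtMap tables, it may be
# worth confirming they are present and non-empty before starting.)
hgsql hg18 -N -e 'select count(*) from knownGene'
hgsql hg18 -N -e 'select count(*) from kgProtMap'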
# Create the working directory
ssh hgwdev
mkdir /cluster/store11/kg/kgHg18A/pb-2006-02-10
cd /cluster/data/hg18/bed
rm pb
ln -s /cluster/store11/kg/kgHg18A/pb-2006-02-10 pb
cd pb

# Define pep* tables in hg18 DB
cat ~/kent/src/hg/lib/pep*.sql > pepAll.sql
# First edit out pepPred table definition, then
hgsql hg18 < pepAll.sql

# Build the pepMwAa table
hgsql proteins060115 -N -e \
"select info.acc, molWeight, aaSize from sp060115.info, sp060115.accToTaxon where accToTaxon.taxon=9606 and accToTaxon.acc = info.acc" > pepMwAa.tab
hgsql hg18 -e 'load data local infile "pepMwAa.tab" into table pepMwAa'

# Build the pepPi table
hgsql proteins060115 -e \
"select info.acc from sp060115.info, sp060115.accToTaxon where accToTaxon.taxon=9606 and accToTaxon.acc = info.acc" > protAcc.lis
hgsql hg18 -N -e 'select proteinID from knownGene where proteinID like "%-%"' | sort -u >> protAcc.lis
pbCalPi protAcc.lis sp060115 pepPi.tab
hgsql hg18 -e 'delete from pepPi'
hgsql hg18 -e 'load data local infile "pepPi.tab" into table hg18.pepPi'

# Calculate and load pep distributions
pbCalDist sp060115 proteins060115 9606 hg18 >pbCalDist.out
wc pbCalDist.out
hgsql hg18
load data local infile "pepExonCntDist.tab" into table hg18.pepExonCntDist;
load data local infile "pepCCntDist.tab" into table hg18.pepCCntDist;
load data local infile "pepHydroDist.tab" into table hg18.pepHydroDist;
load data local infile "pepMolWtDist.tab" into table hg18.pepMolWtDist;
load data local infile "pepResDist.tab" into table hg18.pepResDist;
load data local infile "pepIPCntDist.tab" into table hg18.pepIPCntDist;
load data local infile "pepPiDist.tab" into table hg18.pepPiDist;
quit

# Calculate frequency distributions
pbCalResStd sp060115 9606 hg18

# Create pbAnomLimit and pbResAvgStd tables
hgsql hg18 -e "drop table pbAnomLimit"
hgsql hg18 -e "drop table pbResAvgStd"
hgsql hg18 < ~/src/hg/lib/pbAnomLimit.sql
hgsql hg18 < ~/src/hg/lib/pbResAvgStd.sql
hgsql hg18 -e 'load data local infile "pbResAvgStd.tab" into table hg18.pbResAvgStd;'
hgsql hg18 -e 'load data local infile "pbAnomLimit.tab" into table hg18.pbAnomLimit;'

# Create pbStamp table for PB
hgsql hg18 -e "drop table pbStamp"
hgsql hg18 < ~/src/hg/lib/pbStamp.sql
hgsql hg17 -N -e 'select * from pbStamp' > pbStamp.tab
hgsql hg18 -e 'load data local infile "pbStamp.tab" into table hg18.pbStamp'

# Turn on Proteome Browser for hg18.
hgsql -e 'delete from dbDb where name="hg18"' \
    -h genome-testdb hgcentraltest
hgsql -e 'INSERT INTO dbDb (name, description, nibPath, organism, \
    defaultPos, active, orderKey, genome, scientificName, \
    htmlPath, hgNearOk, hgPbOk, sourceName) \
    VALUES("hg18", "Feb. 2006", "/gbdb/hg18/nib", "Human", \
    "chr7:127,664,479-127,689,005", 1, 10, "Human", "Homo sapiens", \
    "/gbdb/hg18/html/description.html", 0, 1, "NCBI Build 36.1");' \
    -h genome-testdb hgcentraltest

# Adjust drawing parameters for Proteome Browser stamps.
# Now invoke the Proteome Browser and adjust various drawing parameters (mostly
# the ymax of each stamp) if necessary, by updating the pbStamp.tab file and
# then deleting and reloading the pbStamp table.
hgsql hg18 -e "drop table pbStamp"
hgsql hg18 < ~/src/hg/lib/pbStamp.sql
hgsql hg18 -e 'load data local infile "pbStamp.tab" into table hg18.pbStamp'

# Perform preliminary review of Proteome Browser for hg18, then notify QA for
# formal review.
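# (Optional check, added -- not in the original log: confirm the hgcentraltest
# dbDb entry inserted above carries the expected flags, e.g. hgPbOk=1.)
hgsql -h genome-testdb hgcentraltest -N -e \
  'select name, active, hgNearOk, hgPbOk from dbDb where name="hg18"'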
# First build entrez DB tables.
cd /cluster/store10/entrez
mkdir 060208
ln -s /cluster/store10/entrez/060208 /cluster/data/entrez/060208
cd /cluster/data/entrez/060208
wget --timestamp ftp://ftp.ncbi.nlm.nih.gov/gene/DATA/gene2accession.gz
wget --timestamp ftp://ftp.ncbi.nlm.nih.gov/gene/DATA/gene_info.gz
wget --timestamp ftp://ftp.ncbi.nlm.nih.gov/gene/DATA/gene2refseq.gz
gzip -d *.gz

cut -f 2,4 gene2accession | sort -u | grep -v "-" | grep -v "NM_" | sed -e 's/\./\t/g' > entrezMrna.tab
cut -f 2,4 gene2refseq | grep "NM_"| sort -u | grep -v "-" | sed -e 's/\./\t/g' > entrezRefseq.tab
cut -f 2,4,6 gene2accession | grep "NM_"| grep "NP_"|sort -u | sed -e 's/\./\t/g' > entrezRefProt.tab

hgsql entrez -e 'drop table entrezRefseq'
hgsql entrez -e 'drop table entrezMrna'
hgsql entrez -e 'drop table entrezRefProt'
hgsql entrez < ~/src/hg/lib/entrezRefseq.sql
hgsql entrez < ~/src/hg/lib/entrezMrna.sql
hgsql entrez < ~/src/hg/lib/entrezRefProt.sql
hgsql entrez -e 'load data local infile "entrezRefseq.tab" into table entrezRefseq'
hgsql entrez -e 'load data local infile "entrezMrna.tab" into table entrezMrna'
hgsql entrez -e 'load data local infile "entrezRefProt.tab" into table entrezRefProt'

cd /cluster/store11/kg/kgHg18A
hgsql entrez -N -e \
'select mrna, refseq from entrezRefseq, entrezMrna, hg18.all_mrna where qName=mrna and entrezRefseq.geneID=entrezMrna.geneID' \
>mrnaRefseq1.tab

# Include RefSeq as valid mRNA too.
hgsql hg18 -N -e 'select name, name from refGene' >mrnaRefseq2.tab
cat mrnaRefseq1.tab mrnaRefseq2.tab |sort -u >mrnaRefseq.tab

hgsql hg18 -e 'drop table mrnaRefseq'
hgsql hg18 < ~/src/hg/lib/mrnaRefseq.sql
hgsql hg18 -e 'load data local infile "mrnaRefseq.tab" into table mrnaRefseq'

# CREATE FULL TEXT INDEX FOR KNOWN GENES (DONE 2/16/06 Fan)
# This depends on the go and uniProt databases as well as
# the kgAlias and kgProtAlias tables.  The hgKgGetText takes
# about 5 minutes when the database is not too busy.  The rest
# is real quick.
ssh hgwdev
cd /cluster/store11/kg/kgHg18A
mkdir index
cd index
hgKgGetText hg18 knownGene.text
ixIxx knownGene.text knownGene.ix knownGene.ixx
ln -s /cluster/store11/kg/kgHg18A/index/knownGene.ix /gbdb/hg18/knownGene.ix
ln -s /cluster/store11/kg/kgHg18A/index/knownGene.ixx /gbdb/hg18/knownGene.ixx

# BUILD KNOWN GENE LIST FOR GOOGLE. (REDONE 8/12/08 JK)
# make knownGeneLists.html hg18GeneList.html mm5GeneList.html rm3GeneList.html
cd /cluster/data/hg18/bed
rm -rf knownGeneList/hg18
# Run hgKnownGeneList to generate the tree of HTML pages
# under ./knownGeneList/hg18
hgKnownGeneList hg18
# copy over to /usr/local/apache/htdocs
rm -rf /usr/local/apache/htdocs/knownGeneList/hg18
mkdir -p /usr/local/apache/htdocs/knownGeneList/hg18
cp -Rfp knownGeneList/hg18/* /usr/local/apache/htdocs/knownGeneList/hg18

##################################################################################
# Create description.html for hg18
mkdir -p ~/kent/src/hg/makeDb/trackDb/human/hg18
cd ~/kent/src/hg/makeDb/trackDb/human/hg18
cp ../hg17/description.html .
vi description.html
# Change release date and build number and change hg17 to hg18
# Check it into CVS
mkdir -p /cluster/data/hg18/html
cp -p description.html /cluster/data/hg18/html
ln -s /cluster/data/hg18/html/description.html /gbdb/hg18/html/description.html

# BUILD GENE SORTER TABLES (AKA: FAMILY BROWSER) (STARTED 2006-02-11, DONE 2006-02-14 - Fan)
# This should be done after the KG tables are complete from the known genes
# build process.
#
# Cluster together various alt-splicing isoforms.
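# (Background note, added: hgClusterGenes in the next step groups overlapping
# same-strand isoforms into clusters; knownIsoforms maps each cluster ID to all
# of its member knownGene transcripts, and knownCanonical keeps one
# representative transcript per cluster.)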
# Creates the knownIsoforms and knownCanonical tables ssh hgwdev mkdir /cluster/data/hg18/bed/geneSorter.2006-02-11 # remove old symbolic link rm /cluster/data/hg18/bed/geneSorter ln -s /cluster/data/hg18/bed/geneSorter.2006-02-11 /cluster/data/hg18/bed/geneSorter cd /cluster/data/hg18/bed/geneSorter hgClusterGenes hg18 knownGene knownIsoforms knownCanonical # Extract peptides from knownGenes into fasta file # and create a blast database out of them. mkdir /cluster/data/hg18/bed/geneSorter/blastp cd /cluster/data/hg18/bed/geneSorter/blastp pepPredToFa hg18 knownGenePep known.faa # You may need to build this binary in src/hg/near/pepPredToFa /scratch/blast/formatdb -i known.faa -t known -n known # This command is in /projects/compbio/bin/$MACH/formatdb # Copy over database to bluearc rm -fr /cluster/bluearc/hg18/blastp mkdir -p /cluster/bluearc/hg18/blastp cp -p /cluster/data/hg18/bed/geneSorter/blastp/known.* /cluster/bluearc/hg18/blastp # Split up fasta file into bite sized chunks for cluster cd /cluster/data/hg18/bed/geneSorter/blastp mkdir split faSplit sequence known.faa 8000 split/kg # Make parasol run directory ssh pk mkdir /cluster/data/hg18/bed/geneSorter/blastp/self cd /cluster/data/hg18/bed/geneSorter/blastp/self mkdir run cd run mkdir out # Make blast script cat << '_EOF_' > blastSome #!/bin/sh BLASTMAT=/cluster/bluearc/blast229/data /cluster/bluearc/blast229/blastall \ -p blastp -d /cluster/bluearc/hg18/blastp/known -i $1 -o $2 \ -e 0.01 -m 8 -b 1000 '_EOF_' # << keep emacs happy chmod +x blastSome # Make gensub2 file cat << '_EOF_' > gsub #LOOP blastSome {check in line+ $(path1)} {check out line out/$(root1).tab} #ENDLOOP '_EOF_' # << keep emacs happy # Create parasol batch # 'ls ../../split/*.fa' is too much, hence the echo echo ../../split/*.fa | wordLine stdin > split.lst gensub2 split.lst single gsub jobList para create jobList para try para push para check # Completed: 7733 of 7733 jobs # CPU time in finished jobs: 56608s 943.47m 15.72h 0.66d 0.002 y # IO & Wait Time: 467120s 7785.33m 129.76h 5.41d 0.015 y # Average job time: 68s 1.13m 0.02h 0.00d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 119s 1.98m 0.03h 0.00d # Submission to last job: 1433s 23.88m 0.40h 0.02d # Load into database. This takes about 20 minutes ssh hgwdev cd /cluster/data/hg18/bed/geneSorter/blastp/self/run/out bash time hgLoadBlastTab hg18 knownBlastTab *.tab # Scanning through 7733 files # Loading database with 9647176 rows # real 21m51.039s cd /cluster/data/hg18/bed/geneSorter # Create table that maps between known genes and RefSeq hgMapToGene hg18 refGene knownGene knownToRefSeq # may need to build this command in src/hg/near/hgMapToGene # hgsql -e "select count(*) from knownToRefSeq;" hg18 # row count changed 34267 # Create table that maps between known genes and LocusLink hgsql --skip-column-names -e "select mrnaAcc,locusLinkId from refLink" hg18 > refToLl.txt hgMapToGene hg18 refGene knownGene knownToLocusLink -lookup=refToLl.txt # hgsql -e "select count(*) from knownToLocusLink;" hg18 # row count changed to 34267 # Create table that maps between known genes and Pfam domains hgMapViaSwissProt hg18 knownGene name proteinID Pfam knownToPfam # hgsql -e "select count(*) from knownToPfam;" hg18 # row count changed to 34177 # Create table to map between known genes and GNF Atlas2 # expression data. 
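# (Clarifying note, added, on the blastSome script further above: blastall is
# run with -p blastp (protein query vs. protein database), -e 0.01 (E-value
# cutoff), -m 8 (tab-separated output, which is what hgLoadBlastTab expects)
# and -b 1000 (report up to 1000 database hits per query).)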
hgMapToGene hg18 gnfAtlas2 knownGene knownToGnfAtlas2 '-type=bed 12' # hgsql -e "select count(*) from knownToGnfAtlas2;" hg18 # row count changed to 32015 # Create expression distance table - takes about an hour hgExpDistance hg18 hgFixed.gnfHumanAtlas2MedianRatio \ hgFixed.gnfHumanAtlas2MedianExps gnfAtlas2Distance \ -lookup=knownToGnfAtlas2 & # Have 44696 elements in hgFixed.gnfHumanAtlas2MedianRatio # Got 32015 unique elements in hgFixed.gnfHumanAtlas2MedianRatio # hgsql -e "select count(*) from gnfAtlas2Distance;" hg18 # row count changed to 32015000 # Create a table that maps between known genes and # the nice affy expression data. hgMapToGene "-type=bed 12" hg18 affyUclaNorm knownGene knownToU133 # hgsql -e "select count(*) from knownToU133;" hg18 # row count changed to 32632 # Create expression distance table. This will take about 2.5 hours cd /tmp cp -p ~/kent/src/hg/near/hgExpDistance/affyUcla.weight . time hgExpDistance hg18 affyUclaNorm affyUclaExp knownExpDistance \ -weights=affyUcla.weight -lookup=knownToU133 & # Have 43039 elements in affyUclaNorm # 211 genes, 42 weights, 26.500000 total wieght # Got 32965 unique elements in affyUclaNorm # Create table that maps between known genes and # the GNF data. cd /tmp hgMapToGene hg18 affyU95 knownGene knownToU95 # row count changed to 17401 # hgFixed.gnfHumanU95Exps argument is unused, no need to exist hgExpDistance hg18 hgFixed.gnfHumanU95MedianRatio \ hgFixed.gnfHumanU95Exps gnfU95Distance -lookup=knownToU95 & # Have 11545 elements in hgFixed.gnfHumanU95MedianRatio # Got 16378 unique elements in hgFixed.gnfHumanU95MedianRatio # row count changed to 16378000 # Create known gene mapping table and expression distance tables # for GNF Atlas 2. (The hgExpDistance takes only 10 minutes.) hgMapToGene hg18 affyGnf1h knownGene knownToGnf1h hgExpDistance hg18 hgFixed.gnfHumanAtlas2MedianRatio \ hgFixed.gnfHumanAtlas2MedianExps gnfAtlas2Distance \ -lookup=knownToGnf1h & # Have 44696 elements in hgFixed.gnfHumanAtlas2MedianRatio # Got 8739 unique elements in hgFixed.gnfHumanAtlas2MedianRatio # AFFYMETRIX HG-U133 PLUS TRACK (DONE, 2006-02-11, Fan) # Loaded the HG-U133 Plus 2 sequences for hg18 (DONE, 2006-03-29, hartera) # The below was already done. # vvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvv ssh hgwdev mkdir -p /projects/compbio/data/microarray/affyHuman/HG-U133Plus2 # Go to http://www.affymetrix.com/support/technical/byproduct.affx?product=hg-u133-plus # and download the consensus and exemplar sequences to this directory cd /projects/compbio/data/microarray/affyHuman/HG-U133Plus2 unzip HG-U133_Plus_2_consensus.zip unzip HG-U133_Plus_2_exemplar.zip cat HG-U133_Plus_2_consensus HG-U133_Plus_2_exemplar >> U133Plus2_all.fa perl -pi.bak -e "s/(consensus|exemplar):HG-U133_Plus_2:/U133+2:/" \ U133Plus2_all.fa # remove ";" from probe set names perl -pi.bak -e "s/;//" U133Plus2_all.fa # clean up rm *.zip *.bak # Set up cluster job to align consensus/exemplars to hg16 ssh kkr1u00 mkdir -p /iscratch/i/affy mv /cluster/data/hg18/bed/affyU133Plus2.2006-02-11/U133Plus2_all.fa \ /iscratch/i/affy iSync # ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ # The above is already done by Rachel during hg17 build. 
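# (Optional check, added -- not in the original log: after the perl renaming
# above, the FASTA headers staged on /iscratch should start with "U133+2:".)
grep '^>' /iscratch/i/affy/U133Plus2_all.fa | head -3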
ssh hgwdev
cd /projects/compbio/data/microarray/affyHuman/HG-U133Plus2
mkdir -p /cluster/data/hg18/bed/affyU133Plus2.2006-02-11
cp -p U133Plus2_all.fa /cluster/data/hg18/bed/affyU133Plus2.2006-02-11

ssh kk
cd /cluster/data/hg18/bed/affyU133Plus2.2006-02-11
ls -1 /iscratch/i/affy/U133Plus2_all.fa > affy.lst
ls -1 /iscratch/i/gs.19/build36/maskedContigs/* > allctg.lst
cat << '_EOF_' > template.sub
#LOOP
/cluster/bin/i386/blat -fine -mask=lower -minIdentity=95 -ooc=/cluster/bluearc/hg18/11.ooc $(path1) $(path2) {check out line+ psl/$(root1)_$(root2).psl}
#ENDLOOP
'_EOF_'
# << for emacs

gensub2 allctg.lst affy.lst template.sub para.spec
mkdir psl
para create para.spec
para try, para check, para push ...
# Completed: 378 of 378 jobs
# CPU time in finished jobs: 24764s 412.74m 6.88h 0.29d 0.001 y
# IO & Wait Time: 13823s 230.38m 3.84h 0.16d 0.000 y
# Average job time: 102s 1.70m 0.03h 0.00d
# Longest running job: 0s 0.00m 0.00h 0.00d
# Longest finished job: 782s 13.03m 0.22h 0.01d
# Submission to last job: 827s 13.78m 0.23h 0.01d

# Do sort, best in genome filter, and convert to chromosome coordinates
# to create affyU133Plus2.psl
pslSort dirs raw.psl tmp psl
# use filter parameters for these sequences. only use alignments that
# cover 30% of sequence and have at least 95% identity in aligned region.
# minAli = 0.97 too high. low minCover as a lot of n's in these sequences
pslReps -minCover=0.3 -minAli=0.95 -nearTop=0.005 raw.psl contig.psl /dev/null
liftUp affyU133Plus2.psl ../../jkStuff/liftAll.lft warn contig.psl
perl -pi.bak -e "s/U133\+2://" affyU133Plus2.psl

# load into the database
ssh hgwdev
cd /cluster/data/hg18/bed/affyU133Plus2.2006-02-11
hgLoadPsl hg18 affyU133Plus2.psl

# The below was already done.
# vvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvv
# Add sequence data to database
# Copy probe sequence to /gbdb if it isn't already
mkdir -p /gbdb/hgFixed/affyProbes
cd /gbdb/hgFixed/affyProbes
ln -s /projects/compbio/data/microarray/affyHuman/HG-U133Plus2/U133Plus2_all.fa .
# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
# The above is already done by Rachel during hg17 build.

cd /cluster/data/hg18/bed/affyU133Plus2.2006-02-11
# the sequences need to be loaded for the hg18 database
# (2006-03-29, hartera)
hgLoadSeq -abbr=U133+2: hg18 /gbdb/hgFixed/affyProbes/U133Plus2_all.fa
# clean up
rm -r psl tmp err contig.psl raw.psl *.bak psl.tab seq.tab

# Added knownToU133Plus2 track
cd /cluster/data/hg18/bed/geneSorter
hgMapToGene hg18 affyU133Plus2 knownGene knownToU133Plus2
# row count changed to 34745

# Make knownToCdsSnp table (DONE Sept 12, 2007, jk)
ssh hgwdev
hgMapToGene hg18 snp126 knownGene knownToCdsSnp -all -cds
# approx. 5 minutes running time

# UPDATE GO DATABASE
# Download the terms and make the database.
ssh hgwdev
mkdir /cluster/store1/geneOntology/20060211
cd /cluster/store1/geneOntology/20060211
wget --timestamping http://www.godatabase.org/dev/database/archive/latest/go_200601-assocdb-data.gz
hgsql mysql <j.tmp
hgsql go060211

# Create the config.ra file for the doHgNearBlastp.pl run below.
cat << _EOF_ > config.ra
# Latest human vs. other Gene Sorter orgs:
# mouse, rat, zebrafish, worm, yeast, fly
targetGenesetPrefix human
targetDb hg18
queryDbs mm7 rn3 danRer3 ce2 sacCer1 dm2
hg18Fa /cluster/data/hg18/bed/blastp/known.faa
mm7Fa /cluster/data/mm7/bed/geneSorter/blastp/known.faa
rn3Fa /cluster/data/rn3/bed/blastp/known.faa
danRer3Fa /cluster/data/danRer3/bed/blastp/ensembl.faa
ce2Fa /cluster/data/ce2/bed/blastp/wormPep154.faa
sacCer1Fa /cluster/data/sacCer1/bed/blastp/sgdPep.faa
dm2Fa /cluster/data/dm2/bed/flybase4.1/flybasePep.fa
buildDir /cluster/data/hg18/bed/hgNearBlastp
scratchDir /san/sanvol1/scratch/hg18HgNearBlastp
_EOF_

# doHgNearBlastp.pl config.ra >& do.log &
doHgNearBlastp.pl config.ra >do3.log
# tail -f do.log
# Scanning through 671 files
# Loading database with 14488 rows

# ssh -x pk rm -rf /san/sanvol1/scratch/hg18HgNearBlastp/hg18.split
# ssh -x pk rm -rf /san/sanvol1/scratch/hg18HgNearBlastp/hg18.formatdb
# ssh -x pk rm -rf /san/sanvol1/scratch/hg18HgNearBlastp/mm7.split
# ssh -x pk rm -rf /san/sanvol1/scratch/hg18HgNearBlastp/mm7.formatdb
# ssh -x pk rm -rf /san/sanvol1/scratch/hg18HgNearBlastp/rn3.split
# ssh -x pk rm -rf /san/sanvol1/scratch/hg18HgNearBlastp/rn3.formatdb
# ssh -x pk rm -rf /san/sanvol1/scratch/hg18HgNearBlastp/danRer3.split
# ssh -x pk rm -rf /san/sanvol1/scratch/hg18HgNearBlastp/danRer3.formatdb
# ssh -x pk rm -rf /san/sanvol1/scratch/hg18HgNearBlastp/ce2.split
# ssh -x pk rm -rf /san/sanvol1/scratch/hg18HgNearBlastp/ce2.formatdb
# ssh -x pk rm -rf /san/sanvol1/scratch/hg18HgNearBlastp/sacCer1.split
# ssh -x pk rm -rf /san/sanvol1/scratch/hg18HgNearBlastp/sacCer1.formatdb
# ssh -x pk rm -rf /san/sanvol1/scratch/hg18HgNearBlastp/dm2.split
# ssh -x pk rm -rf /san/sanvol1/scratch/hg18HgNearBlastp/dm2.formatdb
# ssh -x pk rmdir /san/sanvol1/scratch/hg18HgNearBlastp

*** All done!
*** Check these tables in hg18:
*** humanBlastTab mmBlastTab rnBlastTab drBlastTab ceBlastTab scBlastTab dmBlastTab
*** and hgBlastTab in these databases:
*** mm7 rn3 danRer3 ce2 sacCer1 dm2

# MAKE ORGANISM-SPECIFIC HGNEARDATA FILES
cd ~/kent/src/hg/near/hgNear/hgNearData
# any updates necessary?

# ENABLE HGNEAR FOR HG18 IN HGCENTRALTEST
echo "update dbDb set hgNearOk = 1 where name = 'hg18';" \
    | hgsql -h genome-testdb hgcentraltest

# END OF HGNEAR STUFF

#############################################################################
# UPDATE BIOCYCTABLES NEEDED BY hgGene (DONE 2/16/06 Fan)

# First register with BioCyc to download their HumanCyc database
# The site will email you the URL for download
wget --timestamping http://bioinformatics.ai.sri.com/ecocyc/dist/pdff-XXXXXX/humancyc-flatfiles.zip
unzip humancyc-flatfiles.zip
cp genes.col genes.tab
cp pathways.col pathways.tab
# delete the first 20 or so header lines from these two files.
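# (Non-interactive alternative to the manual edit below, added as a suggestion;
# it assumes the header really is exactly 20 lines, so check with head first.)
# tail -n +21 genes.col > genes.tab
# tail -n +21 pathways.col > pathways.tab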
vi genes.tab vi pathways.tab hgsql hg18 -e 'create database bioCyc060216' hgsql bioCyc060216 < ~/src/hg/lib/bioCycGenes.sql hgsql bioCyc060216 -e 'load data local infile "genes.tab" into table genes' hgsql bioCyc060216 < ~/src/hg/lib/bioCycPathways.sql hgsql bioCyc060216 -e 'load data local infile "pathways.tab" into table pathways' # Create bioCycMapDesc.tab hgsql bioCyc060216 -N -e 'select UNIQUE_ID, NAME from pathways' |sort -u > bioCycMapDesc.tab # Create bioCycPathway.tab kgBioCyc0 bioCyc060216 hg18 hg17 hgsql hg18 -e 'delete from bioCycPathway' hgsql hg18 -e 'delete from bioCycMapDesc' hgsql hg18 < ~/src/hg/lib/bioCycPathway.sql hgsql hg18 < ~/src/hg/lib/bioCycMapDesc.sql # Load results into hg18. hgsql hg18 -e 'LOAD DATA local INFILE "bioCycMapDesc.tab" into table bioCycMapDesc' hgsql hg18 -e 'LOAD DATA local INFILE "bioCycPathway.tab" into table bioCycPathway' ############################################################################# # BLASTZ/CHAIN/NET RN4 (DONE 2/17/06 Fan) ssh kkstore02 cd /cluster/store11/gs.19/build36 cp -Rp linSpecRep /san/sanvol1/scratch/hg18 cp -Rp nib /san/sanvol1/scratch/hg18 mkdir /cluster/data/hg18/bed/blastz.rn4.2006-02-17 cd /cluster/data/hg18/bed/blastz.rn4.2006-02-17 cat << '_EOF_' > DEF # human vs. rat BLASTZ_ABRIDGE_REPEATS=1 # TARGET: Human SEQ1_DIR=/san/sanvol1/scratch/hg18/nib SEQ1_SMSK=/san/sanvol1/scratch/hg18/linSpecRep/notInRat SEQ1_LEN=/cluster/data/hg18/chrom.sizes SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY: Rat SEQ2_DIR=/san/sanvol1/scratch/rn4/nib SEQ2_SMSK=/san/sanvol1/scratch/rn4/linSpecRep.notInHuman SEQ2_LEN=/cluster/data/rn4/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 BASE=/cluster/data/hg18/bed/blastz.rn4.2006-02-17 '_EOF_' # << for emacs doBlastzChainNet.pl DEF -chainLinearGap medium \ -bigClusterHub pk -smallClusterHub pk -workhorse pk \ -blastzOutRoot /san/sanvol1/scratch/blastzHg17Rn4Out >& do.log & tail -f do.log rm -f /cluster/data/hg18/bed/blastz.rn4 ln -s blastz.rn4.2006-02-17 /cluster/data/hg18/bed/blastz.rn4 ############################################################################# # BUILD WGRNA TRACK (DONE, 2006-02-22, Fan) # rebuilt below: RE-BUILD WGRNA TRACK (DONE, 2006-05-15, Fan) # rebuilt below: RE-BUILD WGRNA TRACK (DONE, 2007-02-09, Fan) # rebuilt below: RE-BUILD WGRNA TRACK AGAIN (DONE, 2007-02-12, Fan) # rebuilt below: RE-BUILD WGRNA TRACK (DONE, 2007-05-31, Fan) # rebuilt below: RE-BUILD WGRNA TRACK (DONE, 2007-10-05, Fan) ssh hgwdev cd /cluster/data/hg18/bed mkdir wgRna-2006-02-22 cd wgRna-2006-02-22 # Received the data file, wg_hg18_track.txt, from Michel Weber's email # (Michel.Weber at ibcg.biotoul.fr) # and place it under cd /cluster/data/hg18/bed/wgRna-2006-02-22. cp -p wg_hg18_track.txt wgRna.tab hgLoadBed -sqlTable=/cluster/home/fanhsu/src/hg/lib/wgRna.sql hg18 wgRna wgRna.tab ############################################################################# # RE-BUILD WGRNA TRACK (DONE, 2006-05-15, Fan) # rebuilt below: RE-BUILD WGRNA TRACK (DONE, 2007-02-09, Fan) # rebuilt below: RE-BUILD WGRNA TRACK AGAIN (DONE, 2007-02-12, Fan) # rebuilt below: RE-BUILD WGRNA TRACK (DONE, 2007-05-31, Fan) # rebuilt below: RE-BUILD WGRNA TRACK (DONE, 2007-10-05, Fan) # rebuilt below: RE-BUILD WGRNA TRACK (DONE, 2008-05-29, Fan) ssh hgwdev cd /cluster/data/hg18/bed mkdir wgRna-2008-05-28 cd wgRna-2008-05-28 # Received the data file, wgtrack_may2008.doc, from Michel Weber's # email # (Michel.Weber at ibcg.biotoul.fr) # Save it as .txt file and change all blanks into tabs. 
# and place it under cd /cluster/data/hg18/bed/wgRna-2008-05-28. cp -p wgtrack_may2008.txt wgRna.tab hgLoadBed -sqlTable=/cluster/home/fanhsu/src/hg/lib/wgRna.sql hg18 wgRna wgRna.tab ############################################################################# # 17-WAY MULTIZ ALIGNMENTS (DONE - 2006-02-22 Fan) # copy net mafs to cluster-friendly storage for multiz run ssh kkstore02 ln -s /cluster/data/hg18/bed/blastzMonDom4.2006-02-13 /cluster/data/hg18/bed/blastz.monDom4 cd /cluster/data/hg18/bed/blastz.monDom4 cd /cluster/data/hg18/bed mkdir -p multiz17way.2006-02-18 ln -s multiz17way.2006-02-18 multiz17way cd multiz17way # copy MAF's to cluster-friendly server # These MAF's already on bluearc: # canFam2, fr1, galGal2, panTro1, rn4 mkdir -p /san/sanvol1/scratch/hg18/mafNet cd /san/sanvol1/scratch/hg18/mafNet ln -s /cluster/bluearc/hg18/mafNet/{*} . # copy others foreach s (bosTau2 canFam2 danRer3 dasNov1 echTel1 fr1 galGal2 loxAfr1 \ mm8 monDom4 oryCun1 panTro1 rn4 tetNig1 xenTro1 rheMac2) echo $s cp -Rp /cluster/data/hg18/bed/blastz.$s/mafNet $s end # danRer3 directory structure is different. It is under /san/sanvol1/scratch/hg18/blastzDanRer3/chromsRun ln -s /san/sanvol1/scratch/hg18/blastzDanRer3/chromsRun/mafNet /san/sanvol1/scratch/hg18/mafNet/danRer3 # thanks for the tree, Hiram! Taken from mm7 17way... cd /cluster/data/hg18/bed/multiz17way cat << '_EOF_' > 17way.nh ((((((((( (human_hg18:0.006690,chimp_panTro1:0.007571):0.024272, macaque_rheMac2:0.0592):0.023960, ((rat_rn4:0.081728,mouse_mm8:0.077017):0.229273, rabbit_oryCun1:0.206767):0.1065):0.023026, (cow_bosTau2:0.159182,dog_canFam2:0.147731):0.039450):0.028505, armadillo_dasNov1:0.149862):0.015994, (elephant_loxAfr1:0.104891,tenrec_echTel1:0.259797):0.040371):0.218400, monodelphis_monDom4:0.371073):0.189124, chicken_galGal2:0.454691):0.123297, xenopus_xenTro1:0.782453):0.156067, ((tetraodon_tetNig1:0.199381,fugu_fr1:0.239894):0.492961, zebrafish_danRer3:0.782561):0.156067); '_EOF_' /cluster/bin/draw_tree 17way.nh > 17way.ps /cluster/bin/phast/all_dists 17way.nh > 17way.distances.txt grep hg18 17way.distances.txt | sort -k3,3n | \ awk '{printf ("%.4f\t%s\n", $3, $2)}' > distances.txt # edit distances.txt to include featureBits, and chain parameters # from blastz run. 
cat distances.txt # 0.0143 chimp_panTro1 # 0.0902 macaque_rheMac2 # 0.2563 armadillo_dasNov1 # 0.2651 dog_canFam2 # 0.2677 elephant_loxAfr1 # 0.2766 cow_bosTau2 # 0.3682 rabbit_oryCun1 # 0.4226 tenrec_echTel1 # 0.4677 mouse_mm8 # 0.4724 rat_rn4 # use loose chain params and score from here, down (5000) # 0.7119 monodelphis_monDom4 # 0.9847 chicken_galGal2 # 1.4357 xenopus_xenTro1 # 1.6577 tetraodon_tetNig1 # 1.6983 fugu_fr1 # 1.7480 zebrafish_danRer3 # the order in the browser display will be by tree topology, # not by distance, so it will be: # >> # 0.0143 chimp_panTro1 # >> # 0.0902 macaque_rheMac2 # >> # 0.4677 mouse_mm8 # >> # 0.4724 rat_rn4 # >> # 0.3682 rabbit_oryCun1 # >> # 0.2651 dog_canFam2 # >> # 0.2766 cow_bosTau2 # >> # 0.2563 armadillo_dasNov1 # >> # 0.2677 elephant_loxAfr1 # >> # 0.4226 tenrec_echTel1 # >> # 0.7119 monodelphis_monDom4 # >> # 0.9847 chicken_galGal2 # >> # 1.4357 xenopus_xenTro1 # >> # 1.6577 tetraodon_tetNig1 # >> # 1.6983 fugu_fr1 # >> # 1.7480 zebrafish_danRer3 # make output dir and run dir ssh pk cd /cluster/data/hg18/bed/multiz17way.2006-02-18 # create species list and stripped down tree for autoMZ sed 's/[a-z][a-z]*_//g; s/:[0-9\.][0-9\.]*//g; s/;//' 17way.nh > tmp.nh echo `cat tmp.nh` > tree-commas.nh echo `cat tree-commas.nh` | sed 's/ //g; s/,/ /g' > tree.nh sed 's/[()]//g; s/,/ /g' tree.nh > species.lst mkdir -p maf run cd run # stash binaries mkdir penn cp -p /cluster/bin/penn/multiz.v11.x86_64/multiz-tba/multiz penn cp -p /cluster/bin/penn/multiz.v11.x86_64/multiz-tba/maf_project penn cp -p /cluster/bin/penn/multiz.v11.x86_64/multiz-tba/autoMZ penn cat > autoMultiz.csh << 'EOF' #!/bin/csh -ef set db = hg18 set c = $1 set maf = $2 set run = `pwd` set tmp = /scratch/tmp/$db/multiz.$c set pairs = /san/sanvol1/scratch/$db/mafNet rm -fr $tmp mkdir -p $tmp cp ../tree/tree.nh ../species.lst $tmp pushd $tmp foreach s (`cat species.lst`) set in = $pairs/$s/$c.maf set out = $db.$s.sing.maf if ($s == hg18) then continue endif if (-e $in.gz) then zcat $in.gz > $out else if (-e $in) then cp $in $out else echo "##maf version=1 scoring=autoMZ" > $out endif end set path = ($run/penn $path); rehash $run/penn/autoMZ + T=$tmp E=$db "`cat tree.nh`" $db.*.sing.maf $c.maf popd cp $tmp/$c.maf $maf rm -fr $tmp 'EOF' # << happy emacs chmod +x autoMultiz.csh cat << 'EOF' > spec #LOOP ./autoMultiz.csh $(root1) {check out line+ /cluster/data/hg18/bed/multiz17way.2006-02-18/maf/$(root1).maf} #ENDLOOP 'EOF' # << happy emacs awk '{print $1}' /cluster/data/hg18/chrom.sizes > chrom.lst gensub2 chrom.lst single spec jobList para create jobList # 49 files para try para check para push # NOTE: much faster than V10 (40 hrs for hg17 V10, 14.53 hrs for hg17 V11) # Completed: 49 of 49 jobs # CPU time in finished jobs: 341776s 5696.26m 94.94h 3.96d 0.011 y # IO & Wait Time: 122801s 2046.69m 34.11h 1.42d 0.004 y # Average job time: 9481s 158.02m 2.63h 0.11d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 81334s 1355.57m 22.59h 0.94d # Submission to last job: 81334s 1355.57m 22.59h 0.94d # Load into database ssh hgwdev cd /cluster/data/hg18/bed/multiz17way/maf mkdir -p /gbdb/hg18/multiz17way/maf ln -s /cluster/data/hg18/bed/multiz17way/maf/*.maf \ /gbdb/hg18/multiz17way/maf cat > loadMaf.csh << 'EOF' time hgLoadMaf -pathPrefix=/gbdb/hg18/multiz17way/maf hg18 multiz17way cat *.maf | \ nice hgLoadMafSummary hg18 -minSize=30000 -mergeGap=1500 -maxSize=200000 multiz17waySummary stdin 'EOF' # 3213116 #<< happy emacs # expect lengthy load time for this -- a few hours ? 
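# (Clarifying note, added: hgLoadMafSummary builds the multiz17waySummary
# table, a precomputed per-window summary of alignment coverage that the
# browser uses to draw the track when zoomed way out, instead of reading every
# maf block; the -minSize/-mergeGap/-maxSize settings control how blocks are
# merged into those summary windows.)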
# csh loadMaf.csh >&! loadMaf.log & script loadMaf.log csh loadMaf.csh exit ############################################################### # PHASTCONS CONSERVATION (DONE, 2ND TIME, 2006-03-28 Fan) # This process is distilled from Hiram and Adam's experiments # on mouse (mm7) 17way track. Many parameters are now fixed, without # being experimentally derived, either because the experiments # were lengthy and produced similar results, or because they # weren't runnable given the alignment size. # These parameters are: # --rho # --expected-length # --target-coverage # Also, instead of generating cons and noncons tree models, # we use a single, pre-existing tree model -- Elliot Margulies' model # from the (37-way) ENCODE alignments. # NOTE: reusing cluster-friendly chrom fasta files created earlier ssh kkstore02 mkdir /cluster/bluearc/hg18/chrom cd /cluster/data/hg18 foreach f (`cat chrom.lst`) echo $f cp $f/*.fa /cluster/bluearc/hg18/chrom end # Split chromosome MAF's into windows and use to generate # "sufficient statistics" (ss) files for phastCons input # NOTE: as the SAN fs has lotsa space, we're leaving these # big (temp) files unzipped, to save time during phastCons run. # Note also the larger chunk sizes from previous runs -- this # reduces run-time on the split, slows down the actual phastCons # enough so jobs don't crash (jobs are very quick, just a minute # or so), and according to Adam, will produce better results. # The previous small chunks were probably required by # the phyloFit step, which we are no longer using for the # human alignments. ssh pk mkdir /cluster/data/hg18/bed/multiz17way.2006-02-18/cons cd /cluster/data/hg18/bed/multiz17way.2006-02-18/cons cp /cluster/store5/gs.18/build35/bed/multiz17way.2005-12-20/cons/elliotsEncode.mod . # edit, change to hg18, monDom4, mm8, and rn4. 
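# (Clarifying note, added: elliotsEncode.mod is a phastCons/phyloFit tree model
# file whose TREE line names the assemblies at the leaves, so "edit" above
# means renaming the leaves from the previous run's assemblies to this run's
# (hg18, monDom4, mm8, rn4) so they match the species names in the 17-way
# mafs.  "grep TREE elliotsEncode.mod" is a quick way to see what needs
# changing.)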
mkdir run.split cd run.split set WINDOWS = /san/sanvol1/scratch/hg18/multiz17way.2006-02-18/cons/ss rm -fr $WINDOWS mkdir -p $WINDOWS cat << 'EOF' > doSplit.csh #!/bin/csh -ef # unfortunately this exhausts 2G mem limit currently on pk # next time, run on mini-cluster set MAFS = /cluster/data/hg18/bed/multiz17way.2006-02-18/maf set WINDOWS = /san/sanvol1/scratch/hg18/multiz17way.2006-02-18/cons/ss cd $WINDOWS set c = $1 echo $c rm -fr $c mkdir $c /cluster/bin/phast/$MACHTYPE/msa_split $MAFS/$c.maf -i MAF \ -M /cluster/bluearc/hg18/chrom/$c.fa \ -o SS -r $c/$c -w 10000000,0 -I 1000 -B 5000 echo "Done" >> $c.done 'EOF' # << happy emacs chmod +x doSplit.csh rm -f jobList foreach f (../../maf/*.maf) set c = $f:t:r echo "doSplit.csh $c {check out line+ $WINDOWS/$c.done}" >> jobList end para create jobList # 49 jobs para try para check para push # Completed: 49 of 49 jobs # CPU time in finished jobs: 9254s 154.24m 2.57h 0.11d 0.000 y # IO & Wait Time: 15027s 250.44m 4.17h 0.17d 0.000 y # Average job time: 496s 8.26m 0.14h 0.01d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 1916s 31.93m 0.53h 0.02d # Submission to last job: 1921s 32.02m 0.53h 0.02d # check tree model on 5MB chunk, using params recommended by Adam, # (to verify branch lengths on 2X species) # he ok'ed the results -- not necessary for next human run ssh kolossus cd /cluster/data/hg18/bed/multiz17way.2006-02-18/cons /cluster/bin/phast/$MACHTYPE/phyloFit -i SS -E -p MED -s HKY85 \ --tree "`cat ../tree-commas.nh`" \ /san/sanvol1/scratch/hg18/multiz17way.2006-02-18/cons/ss/chr7/chr7.110000001-120000000.ss \ -o phyloFit.tree # Run phastCons # This job is I/O intensive in its output files, thus it is all # working over in /scratch/tmp/ # cd .. mkdir run.cons cd run.cons cat > doPhast.csh << 'EOF' #!/bin/csh -fe set c = $1 set f = $2 set len = $3 set cov = $4 set rho = $5 set tmp = /scratch/tmp/$f mkdir -p $tmp set san = /san/sanvol1/scratch/hg18/multiz17way.2006-02-18/cons cp -p $san/ss/$c/$f.ss ../elliotsEncode.mod $tmp pushd $tmp > /dev/null /cluster/bin/phast/$MACHTYPE/phastCons $f.ss elliotsEncode.mod \ --rho $rho --expected-length $len --target-coverage $cov --quiet \ --not-informative panTro1,rheMac2 \ --seqname $c --idpref $c --viterbi $f.bed --score > $f.pp popd > /dev/null mkdir -p $san/pp/$c $san/bed/$c sleep 1 mv $tmp/$f.pp $san/pp/$c mv $tmp/$f.bed $san/bed/$c rm -fr $tmp 'EOF' # emacs happy chmod a+x doPhast.csh # root1 == chrom name, file1 == ss file name without .ss suffix # Create gsub file cat > template << 'EOF' #LOOP doPhast.csh $(root1) $(file1) 14 .008 .28 #ENDLOOP 'EOF' # happy emacs # Create parasol batch and run it pushd /san/sanvol1/scratch/hg18/multiz17way.2006-02-18/cons # mkdir /cluster/data/hg18/bed/multiz17way/cons/run.cons ls -1 ss/chr*/chr*.ss | sed 's/.ss$//' > \ /cluster/data/hg18/bed/multiz17way/cons/run.cons/in.list ssh pk cd /cluster/store11/gs.19/build36/bed/multiz17way.2006-02-18/cons/run.cons gensub2 in.list single template jobList para create jobList # 337 jobs para try para check para push # NOTE: some jobs crashed due to can not stat some /san/... 
# files, but worked when pushed once again
# Completed: 337 of 337 jobs
# CPU time in finished jobs: 16000s 266.66m 4.44h 0.19d 0.001 y
# IO & Wait Time: 13307s 221.79m 3.70h 0.15d 0.000 y
# Average job time: 87s 1.45m 0.02h 0.00d
# Longest running job: 0s 0.00m 0.00h 0.00d
# Longest finished job: 173s 2.88m 0.05h 0.00d
# Submission to last job: 225s 3.75m 0.06h 0.00d

# create Most Conserved track
ssh kolossus
cd /san/sanvol1/scratch/hg18/multiz17way.2006-02-18/cons
# The sed's and the sort get the file names in chrom,start order
# (Hiram tricks -- split into columns on [.-/] with
# identifying x,y,z, to allow column sorting and
# restoring the filename.  Warning: the sort column
# will depend on how deep you are in the dir)
find ./bed -name "chr*.bed" | \
sed -e "s/\// x /g" -e "s/\./ y /g" -e "s/-/ z /g" | \
sort -k7,7 -k9,9n | \
sed -e "s/ x /\//g" -e "s/ y /\./g" -e "s/ z /-/g" | \
xargs cat | \
awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", $1, $2, $3, $5, $5;}' | \
/cluster/bin/scripts/lodToBedScore /dev/stdin > mostConserved.bed
# ~ 1 minute
cp -p mostConserved.bed /cluster/data/hg18/bed/multiz17way/cons

# load into database
ssh hgwdev
cd /cluster/data/hg18/bed/multiz17way/cons
hgLoadBed -strict hg18 phastConsElements17way mostConserved.bed
# Loaded 2037557 elements

# compare with previous tracks
hgsql hg18 -e "select count(*) from phastConsElements17way"
# 2260575
# hgsql hg18 -e "select count(*) from phastConsElements"
# hg18 does not have phastConsElements table
# 1601903

# Try for 5% overall cov, and 70% CDS cov (used elen=13, tcov=.007, rho=.27)
featureBits hg18 -enrichment refGene:cds phastConsElements17way
# refGene:cds 1.072%, phastConsElements17way 5.510%, both 0.759%, cover 70.83%, enrich 12.86x
featureBits hg17 -enrichment refGene:cds phastConsElements17way
# refGene:cds 1.064%, phastConsElements17way 5.104%, both 0.748%, cover 70.29%, enrich 13.77x
# compare with previous tracks
featureBits hg18 -enrichment refGene:cds phastConsElements10way
# refGene:cds 1.062%, phastConsElements10way 5.003%, both 0.734%, cover 69.18%, enrich 13.83x
featureBits hg18 -enrichment refGene:cds phastConsElements
# refGene:cds 1.062%, phastConsElements 4.810%, both 0.771%, cover 72.65%, enrich 15.11x

# Create merged posterior probability file and wiggle track data files
# pk is currently closer to the san than any other machine
ssh pk
cd /san/sanvol1/scratch/hg18/multiz17way.2006-02-18/cons/
# sort by chromName, chromStart so that items are in numerical order
# for wigEncode
find ./pp -name "chr*.pp" | \
sed -e "s/\// x /g" -e "s/\./ y /g" -e "s/-/ z /g" | \
sort -k7,7 -k9,9n | \
sed -e "s/ x /\//g" -e "s/ y /\./g" -e "s/ z /-/g" | \
xargs cat | \
nice wigEncode stdin phastCons17way.wig phastCons17way.wib
# about 23 minutes for above
cp -p phastCons17way.wi? /cluster/data/hg18/bed/multiz17way/cons

# Load gbdb and database with wiggle.
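# (Clarifying note, added: wigEncode above wrote a pair of files -- the .wib
# holds the binary per-base values and gets symlinked under /gbdb, while the
# .wig index is what hgLoadWiggle loads into MySQL; hence the symlink and
# hgLoadWiggle commands below.)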
ssh hgwdev
cd /cluster/data/hg18/bed/multiz17way/cons
ln -s `pwd`/phastCons17way.wib /gbdb/hg18/multiz17way/phastCons17way.wib
hgLoadWiggle -pathPrefix=/gbdb/hg18/multiz17way hg18 \
    phastCons17way phastCons17way.wig
# ~ 3 minute load

# Downloads (2006-02-22 Fan)
ssh hgwdev
cd /cluster/data/hg18/bed/multiz17way
mkdir mafDownloads
cd mafDownloads
# upstream mafs (mafFrags takes a while)
cat > mafFrags.csh << 'EOF'
date
foreach i (1000 2000 5000)
    echo "making upstream$i.maf"
    nice featureBits hg18 refGene:upstream:$i -fa=/dev/null -bed=up.bad
    awk -F '\t' '{printf("%s\t%s\t%s\t%s\t%s\t%s\n", $1, $2, $3, substr($4, 0, 9), 0, $5)}' up.bad > up.bed
    rm up.bad
    nice mafFrags hg18 multiz17way up.bed upstream$i.maf \
        -orgs=../species.lst
    rm up.bed
end
date
'EOF'
time csh mafFrags.csh > mafFrags.log
nice gzip up*.maf

ssh kkstore02
cd /cluster/data/hg18/bed/multiz17way/mafDownloads
cat > downloads.csh << 'EOF'
date
foreach f (../maf/chr*.maf)
    set c = $f:t:r
    echo $c
    nice gzip -c $f > $c.maf.gz
end
md5sum *.gz > md5sum.txt
date
'EOF'
time csh downloads.csh > downloads.log

ssh hgwdev
set dir = /usr/local/apache/htdocs/goldenPath/hg18/multiz17way
mkdir $dir
ln -s /cluster/data/hg18/bed/multiz17way/mafDownloads/{*.gz,md5sum.txt} $dir

##############################################################################
# SET DEFAULT POSITION TO chrX:151,073,054-151,383,976, TO SHOW GENE GABRA3
hgsql -e 'delete from dbDb where name="hg18"' \
    -h genome-testdb hgcentraltest
hgsql -e 'INSERT INTO dbDb (name, description, nibPath, organism, \
    defaultPos, active, orderKey, genome, scientificName, \
    htmlPath, hgNearOk, hgPbOk, sourceName) \
    VALUES("hg18", "Feb. 2006", "/gbdb/hg18/nib", "Human", \
    "chrX:151,073,054-151,383,976", 1, 9, "Human", "Homo sapiens", \
    "/gbdb/hg18/html/description.html", 1, 1, "NCBI Build 36.1");' \
    -h genome-testdb hgcentraltest

############################################################################
# HG16/HG17 -> HG18 LIFTOVER CHAINS (DONE 2/24/06 Fan)
# These chains hopefully don't suck.
# Sorry I only used the makeLoChain-align script from the set of scripts
# already created for this task.  I wanted more control.  I should mention
# I used a size of 10kb instead of 3kb for the split (blat query) sizes in
# hg18.  This had a huge effect on the number of hits in the blat, which
# then had a huge effect on the number of chains.  I should also mention
# that hg18 chromosomes chr1 and chr2 were split further
# into more than a single query file.  This helped a LOT in avoiding
# cluster hippos classically associated with those chroms.

######## LIFTOVER PREPARATION
# Split up hg18
ssh pk
cd /san/sanVol1/scratch/hg18
mkdir -p liftSplits/{split,lift}
bash
for fa in /cluster/data/hg18/?{,?,*hap*}/*.fa; do
    c=`basename $fa .fa`
    echo $c
    faSplit -lift=liftSplits/lift/${c}.lft size $fa -oneFile 10000 liftSplits/split/$c
done
mkdir -p biggerSplits/split
cd biggerSplits/
ln -s ../liftSplits/lift
cd split/
ln -s ../../liftSplits/split/* .
faSplit sequence chr1.fa 5 chr1_
faSplit sequence chr2.fa 5 chr2_
rm chr{1,2}.fa

# Make some dirs
cd /san/sanVol1/scratch
mkdir -p hg{15,16,17}
# Copy 11.ooc files to each of hg15, hg16, hg17 dirs.
cp -p /cluster/store5/gs.16/build33/11.ooc hg15
cp -p /cluster/store4/gs.17/build34/11.ooc hg16
cp -p /cluster/store5/gs.18/build35/11.ooc hg17

## First, copy over Andy's scripts.
mkdir -p /san/sanVol1/scratch/fan cp -p /san/sanVol1/scratch/fan/*.sh /san/sanVol1/scratch/fan cp /san/sanVol1/scratch/andy/psl.header /san/sanVol1/scratch/fan ######## LIFTOVER BLATING # HG16 ssh pk cd /cluster/data/hg16 makeLoChain-align hg16 /scratch/hg/hg16/bothMaskedNibs hg18 \ /san/sanVol1/scratch/hg18/biggerSplits/split cd bed/ mv blat.hg18.2006-02-24 /san/sanVol1/scratch/hg16 cd /san/sanVol1/scratch/hg16/blat.hg18.2006-02-24/run/ sed 's/^blat/blat.sh/; s/\}.*$/}/' spec | awk '{print "/san/sanVol1/scratch/fan/" $0 " hg16ToHg18"}' > newspec para create newspec para try para push # Completed: 2394 of 2394 jobs # CPU time in finished jobs: 623927s 10398.79m 173.31h 7.22d 0.020 y # IO & Wait Time: 13255s 220.91m 3.68h 0.15d 0.000 y # Average job time: 266s 4.44m 0.07h 0.00d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 3613s 60.22m 1.00h 0.04d # Submission to last job: 4112s 68.53m 1.14h 0.05d # HG17 ssh pk cd /cluster/data/hg17 makeLoChain-align hg17 /scratch/hg/hg17/bothMaskedNibs hg18 /san/sanVol1/scratch/hg18/biggerSplits/split cd bed/ mv blat.hg18.2006-02-24/ /san/sanVol1/scratch/hg17 cd /san/sanVol1/scratch/hg17/blat.hg18.2006-02-24/run/ sed 's/^blat/blat.sh/; s/\}.*$/}/' spec | awk '{print "/san/sanVol1/scratch/fan/" $0 " hg17ToHg18"}' > newspec para create newspec para try para push # Completed: 2622 of 2622 jobs # CPU time in finished jobs: 618557s 10309.28m 171.82h 7.16d 0.020 y # IO & Wait Time: 13735s 228.92m 3.82h 0.16d 0.000 y # Average job time: 241s 4.02m 0.07h 0.00d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 3655s 60.92m 1.02h 0.04d # Submission to last job: 4228s 70.47m 1.17h 0.05d ######## LIFTOVER CHAINING # LIFTING ssh pk cd /san/sanVol1/scratch/fan cp mm7SplitLift.sh hg18SplitLift.sh # change andy to fan, mm7 to hg18, and chrX to chr2, and remove chrUn_random vi hg18SplitLift.sh cat << 'EOF' > hg18ChainMergeSplit.sh #!/bin/bash cp -r chainRaw/ /scratch/fan/hg18Lifts pushd /scratch/fan/hg18Lifts mkdir chain /cluster/bin/x86_64/chainMergeSort chainRaw/*.chain | /cluster/bin/x86_64/chainSplit chain stdin cp -r chain `dirs +1` rm -rf chain chainRaw 'EOF' chmod +x hg18ChainMergeSplit.sh # HG16 cd /san/sanVol1/scratch/hg16/blat.hg18.2006-02-24/raw /san/sanVol1/scratch/fan/hg18SplitLift.sh cd ../ mkdir chainRun chainRaw cd chainRun cat > gsub << 'EOF' #LOOP /cluster/bin/x86_64/axtChain -verbose=0 -linearGap=medium -psl $(path1) /scratch/hg/hg16/bothMaskedNibs /san/sanVol1/scratch/hg18/nib {check out line+ ../chainRaw/$(root1).chain} #ENDLOOP 'EOF' ls -1S ../psl/*.psl > in.lst gensub2 in.lst single gsub spec para create spec para push para time # Completed: 49 of 49 jobs # CPU time in finished jobs: 3599s 59.98m 1.00h 0.04d 0.000 y # IO & Wait Time: 1040s 17.34m 0.29h 0.01d 0.000 y # Average job time: 95s 1.58m 0.03h 0.00d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 303s 5.05m 0.08h 0.00d # Submission to last job: 303s 5.05m 0.08h 0.00d # HG17 cd /san/sanVol1/scratch/hg17/blat.hg18.2006-02-24/raw /san/sanVol1/scratch/fan/hg18SplitLift.sh cd ../ mkdir chainRun chainRaw cd chainRun cat > gsub << 'EOF' #LOOP /cluster/bin/x86_64/axtChain -verbose=0 -linearGap=medium -psl $(path1) /scratch/hg/hg17/bothMaskedNibs /san/sanVol1/scratch/hg18/nib {check out line+ ../chainRaw/$(root1).chain} #ENDLOOP 'EOF' ls -1S ../psl/*.psl > in.lst gensub2 in.lst single gsub spec para create spec para try para push para time # Completed: 49 of 49 jobs # CPU time in finished jobs: 3671s 61.19m 1.02h 0.04d 0.000 y # IO & Wait 
Time: 1186s 19.76m 0.33h 0.01d 0.000 y # Average job time: 99s 1.65m 0.03h 0.00d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 282s 4.70m 0.08h 0.00d # Submission to last job: 282s 4.70m 0.08h 0.00d ######### CHAINMERGE/NET/NETSUBSET ssh kolossus mkdir -p /scratch/fan/hg18Lifts cd /scratch/fan/hg18Lifts cp -rp /san/sanvol1/scratch/hg17/blat.hg18.2006-02-24/chainRaw/ . mkdir chain time /cluster/bin/x86_64/chainMergeSort chainRaw/* | /cluster/bin/x86_64/chainSplit chain stdin cp -rp chain /san/sanvol1/scratch/hg17/blat.hg18.2006-02-24/ mv chain chain.17 # remove it later rm -rf chain.17 cp -r /san/sanvol1/scratch/hg16/blat.hg18.2006-02-24/chainRaw/ . mkdir chain /cluster/bin/x86_64/chainMergeSort chainRaw/* | /cluster/bin/x86_64/chainSplit chain stdin # about 30 minutes. cp -rp chain /san/sanvol1/scratch/hg16/blat.hg18.2006-02-24/ rm -rf chain* ssh pk cd /san/sanvol1/scratch/fan cat << 'EOF' > netOver.sh #!/bin/bash chain=$1 chrom=`basename $chain .chain` sizesHGOld=$2 sizesHG18=/cluster/data/hg18/chrom.sizes chainDir=`dirname $chain` blatDir=`dirname $chainDir` net=${blatDir}/net/${chrom}.net over=${blatDir}/over/${chrom}.over mkdir -p ${blatDir}/{over,net} /cluster/bin/x86_64/chainNet $chain $sizesHGOld $sizesHG18 $net /dev/null /cluster/bin/x86_64/netChainSubset $net $chain $over 'EOF' chmod +x netOver.sh mkdir netRun cd netRun/ find /san/sanVol1/scratch/hg16/blat.hg18.2006-02-24/chain -name "*.chain" \ | awk '{print "/san/sanVol1/scratch/fan/netOver.sh " $1 " /cluster/data/hg16/chrom.sizes"}' >> spec find /san/sanVol1/scratch/hg17/blat.hg18.2006-02-24/chain -name "*.chain" \ | awk '{print "/san/sanVol1/scratch/fan/netOver.sh " $1 " /cluster/data/hg17/chrom.sizes"}' >> spec para create spec para push para time # Completed: 88 of 88 jobs # CPU time in finished jobs: 881s 14.68m 0.24h 0.01d 0.000 y # IO & Wait Time: 284s 4.74m 0.08h 0.00d 0.000 y # Average job time: 13s 0.22m 0.00h 0.00d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 33s 0.55m 0.01h 0.00d # Submission to last job: 73s 1.22m 0.02h 0.00d # seems much faster than mm7. ########## FINISHING ssh hgwdev # HG16 cd /san/sanvol1/scratch/hg16/blat.hg18.2006-02-24/over cat * >> ../hg16ToHg18.over.chain cd ../ rm -rf psl/ net/ chain/ chainRaw/ over/ cd ../ cp -rp blat.hg18.2006-02-24/ /cluster/data/hg16/bed cd /cluster/data/hg16/bed ln -s blat.hg18.2006-02-24 blat.hg18 ln -s `pwd`/blat.hg18/hg16ToHg18.over.chain liftOver/hg16ToHg18.over.chain ln -s `pwd`/liftOver/hg16ToHg18.over.chain /gbdb/hg16/liftOver/hg16ToHg18.over.chain mkdir -p /usr/local/apache/htdocs/goldenPath/hg16/liftOver cd /usr/local/apache/htdocs/goldenPath/hg16/liftOver cp /gbdb/hg16/liftOver/hg16ToHg18.over.chain . gzip hg16ToHg18.over.chain hgAddLiftOverChain hg16 hg18 /gbdb/hg16/liftOver/hg16ToHg18.over.chain # HG17 cd /san/sanvol1/scratch/hg17/blat.hg18.2006-02-24/over cat * >> ../hg17ToHg18.over.chain cd ../ rm -rf psl/ net/ chain/ chainRaw/ over/ cd ../ cp -r blat.hg18.2006-02-24/ /cluster/data/hg17/bed cd /cluster/data/hg17/bed ln -s blat.hg18.2006-02-24 blat.hg18 ln -s `pwd`/blat.hg18/hg17ToHg18.over.chain liftOver/hg17ToHg18.over.chain ln -s `pwd`/liftOver/hg17ToHg18.over.chain /gbdb/hg17/liftOver/hg17ToHg18.over.chain mkdir -p /usr/local/apache/htdocs/goldenPath/hg17/liftOver cd /usr/local/apache/htdocs/goldenPath/hg17/liftOver cp /gbdb/hg17/liftOver/hg17ToHg18.over.chain . 
gzip hg17ToHg18.over.chain hgAddLiftOverChain hg17 hg18 /gbdb/hg17/liftOver/hg17ToHg18.over.chain ############################################################################ ## BLASTZ swap from mm8 alignments (DONE - 2006-02-18 - Hiram) ssh pk cd /cluster/data/mm8/bed/blastzHg18.2006-02-16 time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -swap -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \ `pwd`/DEF > swap.out 2>&1 & time nice -n +19 featureBits hg18 chainMm8Link # 994530182 bases of 2881515245 (34.514%) in intersection # GENOSCOPE TETRAODON (tetNig1) ECORES (DONE, 2006-03-03, Fan) # GENOSCOPE TETRAODON (tetNig1) ECORES (REBUILT, 2006-04-04, Fan) ssh kkstore02 mkdir -p /cluster/data/hg18/bed/ecoresTetNig1 cd /cluster/data/hg18/bed/ecoresTetNig1 wget --timestamp \ http://www.genoscope.cns.fr/externe/tetraodon/Data3/ecoresHumBuild36/EXOFISH_HS_WITH_TN.gff wget --timestamp \ http://www.genoscope.cns.fr/externe/tetraodon/Data3/ecoresHumBuild36/EXOFISH_TN_WITH_HS.gff # this is in gff format # remove "Ecotig" from name field sed -e 's/Ecotig EG/EG/g' EXOFISH_HS_WITH_TN.gff |sed -e 's/CHR//' > ExofishHs36Tnig1.gff # sed -e 's/Ecotig EG/EG/g' ExofishHs36Tnig1 > ExofishHs36Tnig1.gff # need to have tabs between fields not a space to load file into table sed -e 's/ /\t/g' ExofishHs36Tnig1.gff > Hs36Tnig1format.gff # if "ecore" is changed to "CDS" and "ecotig" to "transcript" this loads # correctly into the table. sed -e 's/ecore/CDS/' Hs36Tnig1format.gff | sed -e 's/ecotig/transcript/' \ | cut -f 1-8,11 > Hg18vstetNig1.gff # add "chr" in front of the chromsome name in first field (2005-02-08) perl -pi.bak -e 's/^([0-9XYM]{1,2})/chr$1/' Hg18vstetNig1.gff rm *.bak # need to reload table ssh hgwdev cd /cluster/data/hg18/bed/ecoresTetNig1 echo 'drop table ecoresTetNig1;' | hgsql hg18 nice ldHgGene hg18 ecoresTetNig1 Hg18vstetNig1.gff ######################################################################### # BUILD MAF ANNOTATION FOR MULTIZ17WAY (DONE 2006-03-07, Fan) ssh kkstore01 cd /cluster/data/monDom4 twoBitInfo -nBed monDom4.2bit monDom4.N.bed cd /cluster/data/rn4 twoBitInfo -nBed rn4.2bit rn4.N.bed cd /cluster/data/mm8 twoBitInfo -nBed mm8.2bit mm8.N.bed ssh kolossus cd /cluster/data/hg18/bed/multiz17way mkdir anno cd anno mkdir maf run cd run rm sizes nBeds foreach i (`cat /cluster/data/hg18/bed/multiz17way/species.lst`) ln -s /cluster/data/$i/chrom.sizes $i.len ln -s /cluster/data/$i/$i.N.bed $i.bed echo $i.bed >> nBeds echo $i.len >> sizes end echo date > jobs.csh foreach i (../../maf/*.maf) echo nice mafAddIRows -nBeds=nBeds -sizes=sizes $i /cluster/data/hg18/hg18.2bit ../maf/`basename $i` >> jobs.csh echo "echo $i" >> jobs.csh end echo date >> jobs.csh # do smaller jobs first tac jobs.csh > jobsRev.csh mv jobsRev.csh jobs.csh csh jobs.csh > jobs.log # This took 10 hours. Hg17 took 1.5 hrs. ssh kolossus # loading here because summary table load crashed on hgwdev cd /cluster/data/hg18/bed/multiz17way/anno/maf mkdir -p /gbdb/hg18/multiz17way/anno/maf ln -s /cluster/data/hg18/bed/multiz17way/anno/maf/*.maf \ /gbdb/hg18/multiz17way/anno/maf cat > loadMaf.csh << 'EOF' date hgLoadMaf -pathPrefix=/gbdb/hg18/multiz17way/anno/maf \ hg18 multiz17way date cat *.maf | \ nice hgLoadMafSummary hg18 -minSize=30000 -mergeGap=1500 \ -maxSize=200000 multiz17waySummary stdin date 'EOF' csh loadMaf.csh > loadMaf.log ssh kkstore02 cd /cluster/data/hg18/bed/multiz17way mkdir frames cd frames cp /cluster/data/mm7/bed/multiz17wayFrames/mkMafFrames . 
cp /cluster/data/mm7/bed/multiz17wayFrames/Makefile . #edit Makefile to correct species names cat > copy.csh << 'EOF' set dir = /cluster/bluearc/hg18/multiz17way/frames/maf mkdir -p $dir foreach i (../maf/*.maf) echo $i cp -p $i $dir end 'EOF' csh copy.csh > copy.log ssh hgwdev cd /cluster/data/hg18/bed/multiz17way/frames time make getGenes > getGenes.log # 26.100u 4.360s 1:02.78 48.5% 0+0k 0+0io 29643pf+0w time make getFrames > getFrames.log # Batch failed after 4 tries on ../mkMafFrames bosTau2 hg18 /san/sanvol1/scratch/hg18/multiz17way/frames/genes/bosTau2.gp.gz /cluster/data/hg18/bed/multiz17way/maf/chr1.maf /san/sanvol1/scratch/hg18/multiz17way/frames/mafFrames/bosTau2/chr1.mafFrames #make[1]: *** [mafFrames/bosTau2.cluster.done] Error 255 # copy Makefile to Makefile.try2 and remove bosTau2 time make -f Makefile.try2 getFrames > getFrames.try2.log # copy Makefile to Makefile.try3 and with only bosTau2 remains time make -f Makefile.try3 getGenes > getGenes.try3.log time make -f Makefile.try3 getFrames > getFrames.try3.log time make -f Makefile.try3 getFrames > getFrames.try5.log time make -f Makefile.try3 getFrames > getFrames.try6.log # Finally after Mark fixed the bug and recompiled, it worked. time make -f Makefile.try3 getFrames > getFrames.try7.log time make loadDb > loadDb.log ######################################################################### # Build maf annotation for multiz17way (STARTED 2006-02-28, DONE 2006-03-09, Fan) # rebuild frames to get bug fix, using 1-pass maf methodology (2006-06-09 markd) ssh kkstore01 cd /cluster/data/monDom4 twoBitInfo -nBed monDom4.2bit monDom4.N.bed cd /cluster/data/rn4 twoBitInfo -nBed rn4.2bit rn4.N.bed cd /cluster/data/mm8 twoBitInfo -nBed mm8.2bit mm8.N.bed ssh kolossus cd /cluster/data/hg18/bed/multiz17way mkdir anno cd anno mkdir maf run cd run rm sizes nBeds foreach i (`cat /cluster/data/hg18/bed/multiz17way/species.lst`) ln -s /cluster/data/$i/chrom.sizes $i.len ln -s /cluster/data/$i/$i.N.bed $i.bed echo $i.bed >> nBeds echo $i.len >> sizes end echo date > jobs.csh foreach i (../../maf/*.maf) echo nice mafAddIRows -nBeds=nBeds -sizes=sizes $i /cluster/data/hg18/hg18.2bit ../maf/`basename $i` >> jobs.csh echo "echo $i" >> jobs.csh end echo date >> jobs.csh # do smaller jobs first tac jobs.csh > jobsRev.csh mv jobsRev.csh jobs.csh csh jobs.csh > jobs.log # This took 10 hours. Hg17 took 1.5 hrs. ssh hgwdev # loading here because summary table load crashed on hgwdev cd /cluster/data/hg18/bed/multiz17way/anno/maf mkdir -p /gbdb/hg18/multiz17way/anno/maf ln -s /cluster/data/hg18/bed/multiz17way/anno/maf/*.maf \ /gbdb/hg18/multiz17way/anno/maf cat > loadMaf.csh << 'EOF' date hgLoadMaf -pathPrefix=/gbdb/hg18/multiz17way/anno/maf \ hg18 multiz17way date cat *.maf | \ nice hgLoadMafSummary hg18 -minSize=30000 -mergeGap=1500 \ -maxSize=200000 multiz17waySummary stdin date 'EOF' csh loadMaf.csh > loadMaf.log # Dropped unused indexes (2006-05-09 kate) # NOTE: this is not required in the future, as the loader # has been fixed to not generate these indexes hgsql hg18 -e "alter table multiz17waySummary drop index chrom_2" hgsql hg18 -e "alter table multiz17waySummary drop index chrom_3" ssh kkstore02 cd /cluster/data/hg18/bed/multiz17way mkdir frames cd frames cp /cluster/data/mm7/bed/multiz17wayFrames/mkMafFrames . cp /cluster/data/mm7/bed/multiz17wayFrames/Makefile . # !!! NEXT TIME, COPY ALL maf FILES OVER TO san TO AVOID kkstore02 OVERLOAD. 
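# (Added suggestion, not run for this build.)  Staging the maf copies on the
# san, as the note above suggests, could be as simple as pointing the copy
# loop (or an rsync) at the san path used by the commented-out loop below:
#   rsync -av /cluster/data/hg18/bed/multiz17way/maf/ \
#       /san/sanvol1/scratch/hg18/multiz17wayFrames/maf/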
# edit Makefile to correct species names cat > copy.csh << 'EOF' set dir = /cluster/bluearc/hg18/multiz17way/frames/maf mkdir -p $dir foreach i (../maf/*.maf) echo $i cp -p $i $dir end 'EOF' csh copy.csh > copy.log #for i in ../../maf/*.maf; do echo $i; cp $i /san/sanvol1/scratch/hg18/multiz17wayFrames/maf/$i; done ssh hgwdev cd /cluster/data/hg18/bed/multiz17way/frames time make getGenes > getGenes.log # 26.100u 4.360s 1:02.78 48.5% 0+0k 0+0io 29643pf+0w time make getFrames > getFrames.log # ~2 hours time make loadDb > loadDb.log ### # rebuild frames to get bug fix, using 1-pass maf methodology # (2006-06-09 markd) ssh kkstore02 cd /cluster/data/hg18/bed/multiz17way/frames mv mafFrames/ mafFrames.old2 nice tcsh # easy way to get process niced (cat ../maf/*.maf | time genePredToMafFrames hg18 stdin stdout bosTau2 genes/bosTau2.gp.gz canFam2 genes/canFam2.gp.gz danRer3 genes/danRer3.gp.gz fr1 genes/fr1.gp.gz galGal2 genes/galGal2.gp.gz hg18 genes/hg18.gp.gz mm8 genes/mm8.gp.gz oryCun1 genes/oryCun1.gp.gz panTro1 genes/panTro1.gp.gz rheMac2 genes/rheMac2.gp.gz rn4 genes/rn4.gp.gz xenTro1 genes/xenTro1.gp.gz | gzip >multiz17way.mafFrames.gz)>&frames.log& ssh hgwdev cd /cluster/data/hg18/bed/multiz17way/frames hgLoadMafFrames hg18 multiz17wayFrames multiz17way.mafFrames.gz >&log& ########################################################################## # BUILD ALLEN BRAIN TRACK (DONE 03/11/06 Fan) # Make the working directory ssh hgwdev cd /cluster/data/hg18/bed mkdir allenBrain cd allenBrain # Remap the probe alignments from mm7 to hg18 zcat /gbdb/mm7/liftOver/mm7ToHg18.over.chain.gz \ | pslMap -chainMapFile -swapMap \ /cluster/data/mm7/bed/allenBrain/allenBrainAli.psl stdin stdout \ | sort -k 14,14 -k 16,16n > unscored.psl pslRecalcMatch unscored.psl /cluster/data/hg18/nib \ /cluster/data/mm7/bed/allenBrain/allProbes.fa allenBrainAli.psl # Load the database hgsql hg18 < ~/kent/src/hg/lib/allenBrainUrl.sql hgsql hg18 -e 'load data local infile "/cluster/data/mm7/bed/allenBrain/allenBrainUrl.tab" into table allenBrainUrl;' hgLoadPsl hg18 allenBrainAli.psl mkdir /gbdb/hg18/allenBrain ln -s /cluster/data/mm7/bed/allenBrain/allProbes.fa /gbdb/hg18/allenBrain/allProbes.fa hgLoadSeq hg18 /gbdb/hg18/allenBrain/allProbes.fa # Make mapping between known genes and allenBrain hgMapToGene hg18 allenBrainAli -type=psl knownGene knownToAllenBrain ########################################################################## #### Blat knownGene proteins to determine exons # (DONE - 2006-03-15 - 2006-03-24 - hiramc) ssh hgwdev cd /cluster/data/hg18/bed mkdir blat.hg18KG.2006-03-15 rm blat.hg18KG ln -s blat.hg18KG.2006-03-15 blat.hg18KG cd blat.hg18KG pepPredToFa hg18 knownGenePep known.fa # The kluster run ssh pk cd /cluster/data/hg18/bed/blat.hg18KG cat << '_EOF_' > blatSome #!/bin/csh -fe blat -t=dnax -q=prot -out=pslx /scratch/hg/gs.19/build36/bothMaskedNibs/$1.nib \ kgfa/$2.fa $3 '_EOF_' # << keep emacs happy chmod +x blatSome ls -1S /scratch/hg/gs.19/build36/bothMaskedNibs > human.lst mkdir kgfa cd kgfa # This split should be done on the file server, not over NFS faSplit sequence ../known.fa 3000 kg ls -1S *.fa > ../kg.lst cd .. cat << '_EOF_' > template #LOOP blatSome $(root1) $(root2) {check out line psl/$(root1)/$(root2).psl} #ENDLOOP '_EOF_' # << keep emacs happy gensub2 human.lst kg.lst template jobList mkdir psl cd psl sed -e "s/.nib//" ../human.lst | xargs mkdir cd .. para create jobList para try ... check ... push ... 
etc # Completed: 142100 of 142100 jobs # CPU time in finished jobs: 7520598s 125343.30m 2089.06h 87.04d 0.238 y # IO & Wait Time: 415523s 6925.38m 115.42h 4.81d 0.013 y # Average job time: 56s 0.93m 0.02h 0.00d # Longest finished job: 5737s 95.62m 1.59h 0.07d # Submission to last job: 72538s 1208.97m 20.15h 0.84d ssh kkstore02 cd /cluster/data/hg18/bed/blat.hg18KG.2006-03-15 pslSort dirs raw.psl /tmp psl/* # -rw-rw-r-- 1 568238823 Mar 20 13:30 raw.psl pslReps -nohead -minCover=0.9 -minAli=0.9 raw.psl cooked.psl /dev/null # -rw-rw-r-- 1 43446007 Mar 24 11:13 cooked.psl pslUniq cooked.psl hg18KG.psl # -rw-rw-r-- 1 41321225 Mar 24 11:14 hg18KG.psl cut -f 10 hg18KG.psl > kgName.lst faSomeRecords known.fa kgName.lst hg18KG.fa faSize hg18KG.fa # 16419953 bases (12961273 N's 3458680 real 3458680 upper 0 lower) # in 36727 sequences in 1 files faSize known.fa # 16430067 bases (12969298 N's 3460769 real 3460769 upper 0 lower) # in 36798 sequences in 1 files # You may need to build this pslxToFa - it is not in the standard build pslxToFa hg18KG.psl hg18KG_ex.fa -liftTarget=genome.lft \ -liftQuery=protein.lft # -rw-rw-r-- 1 11294262 Mar 24 11:31 protein.lft # -rw-rw-r-- 1 21428637 Mar 24 11:31 hg18KG_ex.fa # -rw-rw-r-- 1 14324928 Mar 24 11:31 genome.lft wc -l *.psl *.lft *.fa kgName.lst # 39908 cooked.psl # 36727 hg18KG.psl # 1521400 raw.psl # 303516 genome.lft # 303516 protein.lft # 383037 hg18KG.fa # 607032 hg18KG_ex.fa # 383348 known.fa # 36727 kgName.lst # 3615211 total # back on hgwdev ssh hgwdev cd /cluster/data/hg18/bed/blat.hg18KG kgName hg18 hg18KG.psl blastKGRef04 # After about an hour, it exited with this message: # sqlFreeConnection called on cache (hg18) that doesn't contain # the given connection # This may be a lurking error in this program, because the # resulting file seems to have the correct number of lines: hgsql hg18 < ~/kent/src/hg/lib/blastRef.sql echo "rename table blastRef to blastKGRef04" | hgsql hg18 echo "load data local infile 'blastKGRef04' into table blastKGRef04" | hgsql hg18 wc -l kgName.lst blastKGRef04 hg18KG.psl # 36727 kgName.lst # 36727 blastKGRef04 # 36727 hg18KG.psl # 110181 total hgPepPred hg18 generic blastKGPep04 hg18KG.fa # end blat proteins ########################################################################## # BUILD NIBB IMAGE PROBES (DONE 2006-03-14 galt following Jim's hg17 example) # Make directory on san for cluster job and copy in sequence ssh pk mkdir /san/sanvol1/scratch/hg18/nibbPics cd /san/sanvol1/scratch/hg18/nibbPics cp /cluster/data/xenTro1/bed/nibbPics/nibbImageProbes.fa . # Make parasol job dir and sequence list files mkdir run cd run mkdir psl ls -1 /san/sanvol1/scratch/hg18/nib/*.nib > genome.lst echo ../nibbImageProbes.fa > mrna.lst # Create parasol gensub file file cat << '_EOF_' > gsub #LOOP blatz -rna -minScore=6000 -out=psl $(path1) $(path2) psl/$(root1)_$(root2).psl #ENDLOOP '_EOF_' # Create parasol batch gensub2 genome.lst mrna.lst gsub spec para create spec # Do para try/push/time etc. 
#Completed: 49 of 49 jobs
#CPU time in finished jobs: 12585s 209.74m 3.50h 0.15d 0.000 y
#IO & Wait Time: 411s 6.86m 0.11h 0.00d 0.000 y
#Average job time: 265s 4.42m 0.07h 0.00d
#Longest running job: 0s 0.00m 0.00h 0.00d
#Longest finished job: 1145s 19.08m 0.32h 0.01d
#Submission to last job: 1195s 19.92m 0.33h 0.01d

# Make sort and filter
catDir psl | sort -k 10 \
    | pslReps stdin stdout /dev/null -nohead -minAli=0.60 -nearTop=0.001 -minCover=0.10 -minNearTopSize=80 \
    | sort -k 14,14 -k 16,16n \
    | sed 's#/san/sanvol1/scratch/hg18/nib/chr#chr#' \
    | sed 's/.nib//' > ../nibbImageProbes.psl

# Make bed file and copy in stuff
ssh hgwdev
cd /cluster/data/hg18/bed
mkdir nibbPics
cd nibbPics
cp /cluster/data/xenTro1/bed/nibbPics/nibbImageProbes.fa .
cp /san/sanvol1/scratch/hg18/nibbPics/nibbImageProbes.psl .

# Load into database
ln -s /cluster/data/xenTro1/bed/nibbPics/nibbImageProbes.fa /gbdb/hg18/nibbImageProbes.fa
hgLoadSeq hg18 /gbdb/hg18/nibbImageProbes.fa
hgLoadPsl hg18 nibbImageProbes.psl

##########################################################################
# UPDATED hg18.knownToVisiGene (2006-03-15 galt)
# after making sure hg18.vgAllProbes was up to date (see makeVisiGene.doc)
ssh hgwdev
knownToVisiGene hg18 -fromProbePsl=vgAllProbes

##########################################################################
# GENERATE SUMMARY STATISTICS (DONE, Fan 3/18/06)
ssh hgwdev
cd /cluster/data/hg18
mkdir stat
cd stat
stats.pl ~/hg18 >hg18.pl.out
hgCalStat hg18.pl.out hg18 hg18.out
cp hg18.out hg18.out.sorted
# Edit hg18.out.sorted to order by chromosomes and
# replace the "?" in the Y chrom line with 6265435 and align its position.
vi hg18.out.sorted
# Add the hg18 stats to goldenPath/stats.html
cd ~/browser/goldenPath
# insert hg18.out.sorted into stats.html and add the necessary
# surrounding HTML lines for the hg18 section.
vi stats.html
cvs update stats.html
cvs commit stats.html

# Change description of hg18, per suggestion by Kim at NCBI (3/20/06, Fan).
ssh hgwdev
echo "update dbDb set description='Mar. 2006' where name = 'hg18';" \
    | hgsql -h genome-testdb hgcentraltest

############################################################################
# hg18 -> hg17 LIFTOVER CHAINS (DONE 3/20/06 Fan)

# I used a size of 10kb instead of 3kb for the split (blat query) sizes in
# hg17.  This had a huge effect on the number of hits in the blat, which
# then had a huge effect on the number of chains.  I should also mention
# that hg17 chromosomes chr1 and chr2 were split further
# into more than a single query file.  This helped a LOT in avoiding
# cluster hippos classically associated with those chroms.

######## LIFTOVER PREPARATION

# The following paragraph was already done during the hg15 to hg17 liftover build.

# Split up hg17
ssh pk
cd /san/sanVol1/scratch/hg17
mkdir -p liftSplits/{split,lift}
bash
for fa in /cluster/data/hg17/?{,?,*hap*}/*.fa; do
    c=`basename $fa .fa`
    echo $c
    faSplit -lift=liftSplits/lift/${c}.lft size $fa -oneFile 10000 liftSplits/split/$c
done
mkdir -p biggerSplits/split
cd biggerSplits/
ln -s ../liftSplits/lift
cd split/
ln -s ../../liftSplits/split/* .
faSplit sequence chr1.fa 5 chr1_
faSplit sequence chr2.fa 5 chr2_
rm chr{1,2}.fa

# Make some dirs
cd /san/sanVol1/scratch
mkdir -p hg18

# Copy 11.ooc files to hg18 subdirectory.
# cp -p /cluster/store5/gs.16/build33/11.ooc hg18

## First, copy over scripts.
(Already done before) # mkdir -p /san/sanVol1/scratch/fan # cp -p /san/sanVol1/scratch/fan/*.sh /san/sanVol1/scratch/fan # cp /san/sanVol1/scratch/andy/psl.header /san/sanVol1/scratch/fan ######## LIFTOVER BLATING # HG18 ssh pk cd /cluster/data/hg18 makeLoChain-align hg18 /scratch/hg/hg18/nib hg17 /san/sanVol1/scratch/hg17/biggerSplits/split cd bed mv blat.hg17.2006-03-20 /san/sanVol1/scratch/hg18 cd /san/sanVol1/scratch/hg18/blat.hg17.2006-03-20/run/ sed 's/^blat/blat.sh/; s/\}.*$/}/' spec | awk '{print "/san/sanVol1/scratch/fan/" $0 " hg18ToHg17"}' > newspec para create newspec para try para push # Completed: 2646 of 2646 jobs # CPU time in finished jobs: 633021s 10550.35m 175.84h 7.33d 0.020 y # IO & Wait Time: 14063s 234.39m 3.91h 0.16d 0.000 y # Average job time: 245s 4.08m 0.07h 0.00d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 3645s 60.75m 1.01h 0.04d # Submission to last job: 6153s 102.55m 1.71h 0.07d ######## LIFTOVER CHAINING # LIFTING ssh pk cd /san/sanVol1/scratch/fan cp mm7SplitLift.sh hg17SplitLift.sh # change andy to fan, mm7 to hg17, and chrX to chr2, and remove chrUn_random vi hg17SplitLift.sh cat << 'EOF' > hg17ChainMergeSplit.sh #!/bin/bash cp -r chainRaw/ /scratch/fan/hg17Lifts pushd /scratch/fan/hg17Lifts mkdir chain /cluster/bin/x86_64/chainMergeSort chainRaw/*.chain | /cluster/bin/x86_64/chainSplit chain stdin cp -r chain `dirs +1` rm -rf chain chainRaw 'EOF' chmod +x hg17ChainMergeSplit.sh # HG18 cd /san/sanVol1/scratch/hg18/blat.hg17.2006-03-20/raw /san/sanVol1/scratch/fan/hg17SplitLift.sh cd ../ mkdir chainRun chainRaw cd chainRun cat > gsub << 'EOF' #LOOP /cluster/bin/x86_64/axtChain -verbose=0 -linearGap=medium -psl $(path1) /scratch/hg/hg18/nib /san/sanVol1/scratch/hg17/nib {check out line+ ../chainRaw/$(root1).chain} #ENDLOOP 'EOF' ls -1S ../psl/*.psl > in.lst gensub2 in.lst single gsub spec para create spec para try para push para time # Completed: 46 of 46 jobs # CPU time in finished jobs: 3713s 61.88m 1.03h 0.04d 0.000 y # IO & Wait Time: 1284s 21.41m 0.36h 0.01d 0.000 y # Average job time: 109s 1.81m 0.03h 0.00d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 310s 5.17m 0.09h 0.00d # Submission to last job: 310s 5.17m 0.09h 0.00d ######### CHAINMERGE/NET/NETSUBSET ssh kolossus mkdir -p /scratch/fan/hg17Lifts cd /scratch/fan/hg17Lifts cp -r /san/sanvol1/scratch/hg18/blat.hg17.2006-03-20/chainRaw/ . mkdir chain /cluster/bin/x86_64/chainMergeSort chainRaw/* | /cluster/bin/x86_64/chainSplit chain stdin # about 30 minutes. 
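# (Added note, not in the original log.)  chainMergeSort needs all of the
# per-job chain files in one pass so it can renumber the chain ids uniquely
# and emit them in score order; chainSplit then writes one chain file per
# target chromosome, which is what the per-chromosome netOver.sh jobs below
# expect.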
cp -rp chain /san/sanvol1/scratch/hg18/blat.hg17.2006-03-20/
rm -rf chain
rm -rf chainRaw

ssh pk
cd /san/sanvol1/scratch/fan
cat << 'EOF' > netOver.sh
#!/bin/bash
chain=$1
chrom=`basename $chain .chain`
sizesHGOld=$2
sizesHG17=/cluster/data/hg17/chrom.sizes
chainDir=`dirname $chain`
blatDir=`dirname $chainDir`
net=${blatDir}/net/${chrom}.net
over=${blatDir}/over/${chrom}.over
mkdir -p ${blatDir}/{over,net}
/cluster/bin/x86_64/chainNet $chain $sizesHGOld $sizesHG17 $net /dev/null
/cluster/bin/x86_64/netChainSubset $net $chain $over
'EOF'
chmod +x netOver.sh
mkdir netRun
cd netRun/
find /san/sanVol1/scratch/hg18/blat.hg17.2006-03-20/chain -name "*.chain" \
    | awk '{print "/san/sanVol1/scratch/fan/netOver.sh " $1 " /cluster/data/hg18/chrom.sizes"}' > spec
para create spec
para push
para time
# Completed: 49 of 49 jobs
# CPU time in finished jobs: 431s 7.18m 0.12h 0.00d 0.000 y
# IO & Wait Time: 151s 2.52m 0.04h 0.00d 0.000 y
# Average job time: 12s 0.20m 0.00h 0.00d
# Longest running job: 0s 0.00m 0.00h 0.00d
# Longest finished job: 30s 0.50m 0.01h 0.00d
# Submission to last job: 43s 0.72m 0.01h 0.00d

########## FINISHING
ssh hgwdev
# HG18
cd /san/sanvol1/scratch/hg18/blat.hg17.2006-03-20/over
cat * >> ../hg18ToHg17.over.chain
cd ../
rm -rf psl/ net/ chain/ chainRaw/ over/
cd ../
cp -rp blat.hg17.2006-03-20/ /cluster/data/hg18/bed
cd /cluster/data/hg18/bed
ln -s blat.hg17.2006-03-20 blat.hg17
ln -s `pwd`/blat.hg17/hg18ToHg17.over.chain liftOver/hg18ToHg17.over.chain
ln -s `pwd`/liftOver/hg18ToHg17.over.chain /gbdb/hg18/liftOver/hg18ToHg17.over.chain
mkdir -p /usr/local/apache/htdocs/goldenPath/hg18/liftOver
cd /usr/local/apache/htdocs/goldenPath/hg18/liftOver
cp /gbdb/hg18/liftOver/hg18ToHg17.over.chain .
gzip hg18ToHg17.over.chain
hgAddLiftOverChain hg18 hg17 /gbdb/hg18/liftOver/hg18ToHg17.over.chain

##########################################################################
# NSCAN track - (markd)

# hg17 had both NSCAN and NSCAN-EST tracks, in a composite track.
# currently have only NSCAN for hg18
cd /cluster/data/hg18/bed/nscan/

# obtained NSCAN predictions from Michael Brent's group
# at WUSTL
wget -nv http://genes.cse.wustl.edu/jeltje/hg18/hg18.nscan.gtf
wget -r -np -nv http://genes.cse.wustl.edu/jeltje/hg18/chr_ptx/
mv genes.cse.wustl.edu/jeltje/hg18/chr_ptx .
rm -rf genes.cse.wustl.edu chr_ptx/index.html*
gzip -9 hg18.nscan.gtf chr_ptx/*.fa
chmod a-w hg18.nscan.gtf.gz chr_ptx/*.gz

# load tracks.  Note that these have *utr features, rather than
# exon features.  currently ldHgGene creates separate genePred exons
# for these.
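# (Hypothetical illustration, not copied from the delivered file.)  The GTF
# carries UTR feature lines alongside the CDS lines, roughly of the form:
#   chr1  NSCAN  5utr  <start>  <end>  .  +  .  gene_id "chr1.1"; transcript_id "chr1.1.a";
#   chr1  NSCAN  CDS   <start>  <end>  .  +  0  gene_id "chr1.1"; transcript_id "chr1.1.a";
# so ldHgGene ends up making separate genePred exons for the UTR and CDS
# pieces of what is biologically a single exon.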
ldHgGene -bin -gtf -genePredExt hg18 nscanGene hg18.nscan.gtf.gz # add .a suffix to match transcript id hgPepPred -suffix=.a hg18 generic nscanPep chr_ptx/*.fa.gz rm -f *.tab # update trackDb; need a hg18-specific page to describe informants human/hg18/nscanGene.html human/hg18/trackDb.ra # QA NOTE [ASZ 9-11-2006]: mytouch nscanPep 200603271900.00 ########################################################################## # UPDATED hg18.knownToVisiGene (2006-04-05 galt) # after making sure hg18.vgAllProbes was up to date (see makeVisiGene.doc) ssh hgwdev knownToVisiGene hg18 -fromProbePsl=vgAllProbes ############################################################################## # BLASTZ CHIMP PanTro1 second time (STARTED - 2006-01-05, DONE 2006-01-13 Fan) ssh pk mkdir /cluster/data/hg18/bed/blastzPanTro1.2006-01-05 cd /cluster/data/hg18/bed rm blastz.panTro1 ln -s blastzPanTro1.2006-01-05 blastz.panTro1 cd blastzPanTro1.2006-01-05 cat << '_EOF_' > DEF # human vs chimp export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/cluster/home/angie/schwartzbin:/parasol/bin BLASTZ=blastz.v7.x86_64 BLASTZ_H=2000 BLASTZ_M=50 # TARGET: Human Hg18 SEQ1_DIR=/scratch/hg/hg18/hg18.2bit SEQ1_CHUNK=100000000 SEQ1_LAP=10000 SEQ1_LEN=/scratch/hg/hg18/chrom.sizes # QUERY: Chimp PanTro1 - single chunk big enough to run entire genome SEQ2_DIR=/san/sanvol1/scratch/panTro1/panTro1.2bit SEQ2_LEN=/scratch/hg/panTro1/chrom.sizes SEQ2_CHUNK= 30000000 SEQ2_LAP=0 BASE=/cluster/data/hg18/bed/blastzPanTro1.2006-01-05 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs # establish a screen to control this job screen bash time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \ -workhorse=pk \ -stop=load \ `pwd`/DEF > load.out 2>&1 & # Started Thu Jan 5 11:26:45 PST 2006 # Encountered an error at the net step: startStep: 0, at step 5 net to stopStep 6 # chmod a+x /cluster/data/hg18/bed/blastzPanTro1.2006-01-05/axtChain/netChains.csh # ssh -x pk nice /cluster/data/hg18/bed/blastzPanTro1.2006-01-05/axtChain/netChains.csh cd /cluster/data/hg18/bed/blastzPanTro1.2006-01-05/axtChain chainPreNet hg18.panTro1.all.chain.gz /scratch/hg/hg18/chrom.sizes /scratch/hg/panTro1/chrom.sizes stdout chainNet stdin -minSpace=1 /scratch/hg/hg18/chrom.sizes /scratch/hg/panTro1/chrom.sizes stdout /dev/null netSyntenic stdin noClass.net Got 49 chroms in /scratch/hg/hg18/chrom.sizes, 52 in /scratch/hg/panTro1/chrom.sizes Finishing nets writing stdout writing /dev/null memory usage 363347968, utime 1042 s/100, stime 56 netChainSubset -verbose=0 noClass.net hg18.panTro1.all.chain.gz stdout chainSort stdin stdout gzip -c Out of memory needMem - request size 6 bytes gzip: stdout: Broken pipe Command failed: ssh -x pk nice /cluster/data/hg18/bed/blastzPanTro1.2006-01-05/axtChain/netChains.csh # 1/9/06, Retry again ssh pk cd /cluster/data/hg18/bed cd blastzPanTro1.2006-01-05 screen bash time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \ -workhorse=pk \ -continue=net \ -stop=load \ `pwd`/DEF > load2.out 2>&1 & # Same error. # Try with kolossus time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \ -continue=net \ -stop=load \ `pwd`/DEF > load3.out 2>&1 & # Still have problems, which seem to be related to the # wrong $MACHTYPE and $PATH on kolossus. 
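# (Sketch of the kind of check involved, not copied from the session.)
#   ssh kolossus 'echo $MACHTYPE; which netChainSubset chainSort'
# kolossus is a 64-bit machine, so the /cluster/bin/x86_64 binaries need to
# be the ones found on PATH.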
Updated my .cshrc # Did the following manually on kolossus: # vvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvv /cluster/bin/x86_64/netChainSubset -verbose=0 noClass.net hg18.panTro1.all.chain.gz stdout | chainSort stdin stdout | gzip -c > hg18.panTro1.over.chain.gz mkdir -p /cluster/data/hg18/bed/liftOver cp -p hg18.panTro1.over.chain.gz /cluster/data/hg18/bed/liftOver/hg18ToPanTro1.over.chain.gz # Make axtNet for download: one .axt per hg18 seq. netSplit noClass.net net cd .. mkdir axtNet foreach f (axtChain/net/*.net) netToAxt $f axtChain/chain/$f:t:r.chain \ /scratch/hg/hg18/hg18.2bit /san/sanvol1/scratch/panTro1/panTro1.2bit stdout \ | axtSort stdin stdout \ | gzip -c > axtNet/$f:t:r.hg18.panTro1.net.axt.gz end # Make mafNet for multiz: one .maf per hg18 seq. mkdir mafNet foreach f (axtNet/*.hg18.panTro1.net.axt.gz) axtToMaf -tPrefix=hg18. -qPrefix=panTro1. $f \ /scratch/hg/hg18/chrom.sizes /scratch/hg/panTro1/chrom.sizes \ stdout \ | gzip -c > mafNet/$f:t:r:r:r:r:r.maf.gz end # ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ above by hand. ssh pk cd /cluster/data/hg18/bed cd blastzPanTro1.2006-01-05 screen bash time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \ -continue=load \ -stop=load \ `pwd`/DEF > load4.out 2>&1 & time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \ -swap -stop=load \ `pwd`/DEF > swap-load.out 2>&1 & time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \ -continue=download \ `pwd`/DEF > download.out 2>&1 & time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \ -swap -continue=download \ `pwd`/DEF > swap-download.out 2>&1 & # Measurements: # Go to kolossus to run featureBits to avoid out of memory problem. 
ssh kolossus
bash
time HGDB_CONF=~/.hg.conf.read-only featureBits panTro1 chainHg18Link
# 2641472125 bases of 2733948177 (96.617%) in intersection
time HGDB_CONF=~/.hg.conf.read-only featureBits hg18 chainPanTro1Link
# 2681146909 bases of 2881515245 (93.046%) in intersection
time HGDB_CONF=~/.hg.conf.read-only featureBits panTro1 chainHg17Link
# 0 bases of 2733948177 (0.000%) in intersection
time HGDB_CONF=~/.hg.conf.read-only featureBits hg17 chainPanTro1Link
# 2633869032 bases of 2866216770 (91.894%) in intersection

#########################################################################
# BLASTZ RAT Rn3 (STARTED - 2005-12-22, DONE 2006-01-05 Fan)
ssh pk
mkdir /cluster/data/hg18/bed/blastzRn3.2005-12-22
cd /cluster/data/hg18/bed
rm blastz.rn3
ln -s blastzRn3.2005-12-22 blastz.rn3
cd blastzRn3.2005-12-22

cat << '_EOF_' > DEF
# human vs rat
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/cluster/home/angie/schwartzbin:/parasol/bin

BLASTZ=blastz.v7.x86_64
BLASTZ_ABRIDGE_REPEATS=1

# TARGET: Human Hg18
SEQ1_DIR=/scratch/hg/hg18/nib
SEQ1_SMSK=/scratch/hg/hg18/linSpecRep/notInRat
SEQ1_LEN=/scratch/hg/hg18/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000

# QUERY: Rat Rn3 - chunk big enough to do all chroms in single whole pieces
SEQ2_DIR=/scratch/rat/rn3/softNib
SEQ2_SMSK=/cluster/bluearc/rat/rn3/linSpecRep.notInHuman
SEQ2_LEN=/cluster/bluearc/rat/rn3/chrom.sizes
SEQ2_CHUNK=300000000
SEQ2_LAP=0

BASE=/cluster/data/hg18/bed/blastzRn3.2005-12-22
TMPDIR=/scratch/tmp
'_EOF_'
# happy emacs

# establish a screen to control this job
screen
bash
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
    -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
    -stop=load \
    `pwd`/DEF > to-load.out 2>&1 &

# start processing again on 12/31/05.
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
    -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
    -workhorse=pk \
    -swap \
    -stop=load \
    `pwd`/DEF > swap.out 2>&1 &

# Either the UCSC RR and hgwdev systems or the network went down around 11 AM 12/31/05.
# After the holidays, start again on 1/3/06 and again on 1/5/06.
ssh pk
cd /cluster/data/hg18/bed
cd blastzRn3.2005-12-22
screen
bash
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
    -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
    -workhorse=pk \
    -swap \
    -continue=net \
    -stop=load \
    `pwd`/DEF > swap6.out 2>&1 &
# DONE!
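# (Added note, not in the original log.)  BLASTZ_ABRIDGE_REPEATS=1 in the DEF
# above tells the blastz pipeline to cut out the lineage-specific repeats
# listed under SEQ1_SMSK/SEQ2_SMSK before aligning and then to restore the
# original coordinates afterwards, which keeps rodent runs from bogging down
# in repeat-vs-repeat alignments.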
Jan 5 13:39 # Measurements: nice featureBits rn3 chainHg18Link # 962630574 bases of 2571104688 (37.440%) in intersection nice featureBits hg18 chainRn3Link # 964251210 bases of 2881515245 (33.463%) in intersection ######################################################################### # BLASTZ ARMADILLO DasNov1 (STARTED - 2006-01-06 - 2006-01-09 Fan) ssh pk mkdir /cluster/data/hg18/bed/blastzDasNov1.2006-01-06 cd /cluster/data/hg18/bed rm blastz.dasNov1 ln -s blastzDasNov1.2006-01-06 blastz.dasNov1 cd blastzDasNov1.2006-01-06 cat << '_EOF_' > DEF # human vs armadillo export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/cluster/home/angie/schwartzbin:/parasol/bin BLASTZ=blastz.v7.x86_64 # Specific settings for armadillo (per Webb email to Brian Raney) # TARGET: Human Hg18 SEQ1_DIR=/scratch/hg/hg18/nib SEQ1_LEN=/scratch/hg/hg18/chrom.sizes SEQ1_CHUNK=30000000 SEQ1_LAP=10000 # QUERY: Armadillo DasNov1 SEQ2_DIR=/scratch/hg/dasNov1/dasNov1.2bit SEQ2_LEN=/scratch/hg/dasNov1/chrom.sizes SEQ2_LIMIT=400 SEQ2_CHUNK=30000000 SEQ2_LAP=0 BASE=/cluster/data/hg18/bed/blastzDasNov1.2006-01-06 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs # establish a screen to control this job screen bash time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \ -workhorse=pk \ -stop=load \ `pwd`/DEF > load.out 2>&1 & # Started Fri Jan 6 06:20:12 PST 2006 # 1:20 PM, 1/7/06 # The blastz cluster run seemed finished OK, but make jobList some how # does not end, even after creating the run.time file manually. Kill it manually. time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \ -workhorse=pk \ -continue=cat \ -stop=load \ `pwd`/DEF > load2.out 2>&1 & time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \ -workhorse=pk \ -swap -stop=load \ `pwd`/DEF > swap-load.out 2>&1 & # Done, Jan 8 21:40. time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \ -continue=download \ `pwd`/DEF > download.out 2>&1 & time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \ -swap -continue=download \ `pwd`/DEF > swap-download.out 2>&1 & # Done! Jan 9 06:11 # Reciprocal best net mafs for multiz (kate) ssh kkstore02 cd /cluster/data/hg18/bed/blastz.dasNov1 ~/kent/src/hg/utils/automation/doRecipBest.pl hg18 dasNov1 >&! 
rbest.log & # Load nets (2007-03-12 kate) ssh hgwdev cd /cluster/data/hg18/bed/blastz.dasNov1/axtChain netFilter -minGap=10 noClass.net | hgLoadNet -warn hg18 netDasNov1 stdin netFilter -minGap=10 hg18.dasNov1.rbest.net.gz | \ hgLoadNet -warn hg18 netRBestDasNov1 stdin ######################################################################### # BLASTZ DOG CanFam2 second time (DONE - 2005-12-28 - 2005-12-29 Fan) ssh pk mkdir /cluster/data/hg18/bed/blastzCanFam2.2005-12-28 cd /cluster/data/hg18/bed rm blastz.canFam2 ln -s blastzCanFam2.2005-12-28 blastz.canFam2 cd blastzCanFam2.2005-12-28 cat << '_EOF_' > DEF # human vs dog export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/cluster/home/angie/schwartzbin:/parasol/bin BLASTZ=blastz.v7.x86_64 # Specific settings for dog (per Webb email to Brian Raney) BLASTZ_ABRIDGE_REPEATS=1 # TARGET: Human Hg18 SEQ1_DIR=/scratch/hg/hg18/nib SEQ1_SMSK=/cluster/bluearc/hg18/linSpecRep/notInOthers SEQ1_LEN=/scratch/hg/hg18/chrom.sizes SEQ1_IN_CONTIGS=0 SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY: Dog CanFam2 - single chunk big enough to run entire genome SEQ2_DIR=/scratch/hg/canFam2/nib SEQ2_LEN=/cluster/bluearc/canFam2/chrom.sizes SEQ2_SMSK=/san/sanvol1/scratch/canFam2/linSpecRep.notInHuman SEQ2_IN_CONTIGS=0 SEQ2_CHUNK=200000000 SEQ2_LAP=0 BASE=/cluster/data/hg18/bed/blastzCanFam2.2005-12-28 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs # establish a screen to control this job screen bash time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \ -stop=load \ `pwd`/DEF > load.out 2>&1 & # Started 2005-12-28 21:33 # Two jobs stuck in the same node. Did manual para stop and para push. # Both finished within a few minutes. # Done! On Thu Dec 29 05:27:31 PST 2005. # system seems hang on kolossus (3 processes of [tcsh -c nice chainMergeSort], not moving) # manually killed the jobs. # now use pk as the workhorse. time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \ -workhorse=pk \ -continue=chainMerge \ -stop=load \ `pwd`/DEF > load2.out 2>&1 & # Done! Thu Dec 29 09:10:02 PST 2005. time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \ -workhorse=pk \ -swap -stop=load \ `pwd`/DEF > swap-load.out 2>&1 & # Had an error at the load step, # mySQL error 2013: Lost connection to MySQL server during query, # probably due to sys admin working on network connections, # continue at the load step time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \ -workhorse=pk \ -swap -continue=load -stop=load \ `pwd`/DEF > swap-load2.out 2>&1 & time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \ -workhorse=pk \ -continue=download \ `pwd`/DEF > download.out 2>&1 & time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -workhorse=pk \ -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \ -swap -continue=download \ `pwd`/DEF > swap-download.out 2>&1 & # Done! 
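# (Sketch of the manual recovery mentioned above for the two stuck jobs; the
# exact commands were not captured in this log.)  para stop kills the running
# jobs and para push reschedules them, usually on different nodes:
#   cd /cluster/data/hg18/bed/blastzCanFam2.2005-12-28/run.blastz
#   para stop
#   para push
#   para time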
Dec 29 13:21 # Measurements: ssh hgwdev nice featureBits canFam2 chainHg18Link # 1477551526 bases of 2384996543 (61.952%) in intersection nice featureBits hg18 chainCanFam2Link # 1524764349 bases of 2881515245 (52.915%) in intersection nice featureBits canFam2 chainHg17Link # 1487483112 bases of 2384996543 (62.368%) in intersection nice featureBits hg17 chainCanFam2Link # 1530197469 bases of 2866216770 (53.387%) in intersection ######################################################################### # BLASTZ ELEPHANT LoxAfr1 second time (STARTED - 2006-01-03 Fan) ssh pk mkdir /cluster/data/hg18/bed/blastzLoxAfr1.2006-01-03 cd /cluster/data/hg18/bed rm blastz.loxAfr1 ln -s blastzLoxAfr1.2006-01-03 blastz.loxAfr1 cd blastzLoxAfr1.2006-01-03 cat << '_EOF_' > DEF # human vs elephant export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/cluster/home/angie/schwartzbin:/parasol/bin BLASTZ=blastz.v7.x86_64 # TARGET: Human Hg18 SEQ1_DIR=/scratch/hg/hg18/nib SEQ1_LEN=/scratch/hg/hg18/chrom.sizes SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY: Elephant LoxAfr1 - single chunk big enough to run entire genome SEQ2_DIR=/scratch/hg/loxAfr1/loxAfr1.2bit SEQ2_LEN=/scratch/hg/loxAfr1/chrom.sizes SEQ2_LIMIT=300 SEQ2_CHUNK=30000000 SEQ2_LAP=0 BASE=/cluster/data/hg18/bed/blastzLoxAfr1.2006-01-03 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs # establish a screen to control this job screen bash time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \ -workhorse=pk \ -stop=load \ `pwd`/DEF > load.out 2>&1 & # failed at step 2 due to kki cluster not started. time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \ -workhorse=pk \ -smallClusterHub=pk \ -continue=cat \ -stop=load \ `pwd`/DEF > load2.out 2>&1 & time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \ -smallClusterHub=pk \ -continue=net \ -stop=load \ `pwd`/DEF > load3.out 2>&1 & # Same broken pipe error. netChainSubset -verbose=0 noClass.net hg18.loxAfr1.all.chain.gz stdout chainSort stdin stdout gzip -c Out of memory needMem - request size 28 bytes gzip: stdout: Broken pipe Command failed: ssh -x kolossus nice /cluster/data/hg18/bed/blastzLoxAfr1.2006-01-03/axtChain/netChains.csh time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \ -continue=net \ -stop=load \ `pwd`/DEF > load4.out 2>&1 & # Finally, a success! tail load4.out #... # cd /cluster/data/hg18/bed/blastzLoxAfr1.2006-01-03/axtChain #netClass -verbose=0 -noAr noClass.net hg18 loxAfr1 hg18.loxAfr1.net #netFilter -minGap=10 hg18.loxAfr1.net #hgLoadNet -verbose=0 hg18 netLoxAfr1 stdin #startStep: 5, at step 7 download to stopStep 6 # *** All done! # *** Add {chain,net}LoxAfr1 tracks to trackDb.ra if necessary. # The swap-load was not successful, after several tries. # Last one seems was due to out of memory problem. # Per Hiram, we no longer do swap for 2X genomes, unless specifically requested. # Mark made an inquiry, but said he can get by with hg18->loxAfr1 nets. # reciprocal best net mafs for multiz (2007-03-09 kate) ssh kkstore02 cd /cluster/data/hg18/bed/blastz.loxAfr1 ~/kent/src/hg/utils/automation/doRecipBest.pl hg18 loxAfr1 >&! 
rbest.log & # load net and reciprocal best net for comparison # note sure why these tables and cleanup aren't done -- ask Fan ssh hgwdev cd /cluster/data/hg18/bed/blastz.loxAfr1/axtChain netFilter -minGap=10 noClass.net | hgLoadNet -warn hg18 netLoxAfr1 stdin netFilter -minGap=10 hg18.loxAfr1.rbest.net.gz | \ hgLoadNet -warn hg18 netRBestLoxAfr1 stdin ######################################################################### # BLASTZ COW BosTau2 second time (STARTED - 2006-01-07 Fan) ssh pk mkdir /cluster/data/hg18/bed/blastzBosTau2.2006-01-07 cd /cluster/data/hg18/bed rm blastz.bosTau2 ln -s blastzBosTau2.2006-01-07 blastz.bosTau2 cd blastzBosTau2.2006-01-07 cat << '_EOF_' > DEF # human vs cow export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/cluster/home/angie/schwartzbin:/parasol/bin BLASTZ=blastz.v7.x86_64 BLASTZ_M=50 # TARGET: Human Hg18 SEQ1_DIR=/scratch/hg/hg18/nib SEQ1_LEN=/scratch/hg/hg18/chrom.sizes SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY: Cow BosTau2 - single chunk big enough to run entire genome SEQ2_DIR=/san/sanvol1/scratch/bosTau2/bosTau2.2bit SEQ2_LEN=/san/sanvol1/scratch/bosTau2/chrom.sizes SEQ2_CHUNK=3200000000 SEQ2_LAP=0 BASE=/cluster/data/hg18/bed/blastzBosTau2.2006-01-07 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs # establish a screen to control this job screen bash time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \ -stop=load \ -workhorse=pk \ `pwd`/DEF > load.out 2>&1 & # Started Sat Jan 7 07:57:22 PST 2006 # blastz run (and load) done Jan 8 00:13 time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \ -swap -stop=load \ `pwd`/DEF > swap-load.out 2>&1 & # took a long time to finish. time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \ -continue=download \ `pwd`/DEF > download.out 2>&1 & time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \ -swap -continue=download \ `pwd`/DEF > swap-download.out 2>&1 & # Done! 
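# (Added note, not in the original log.)  Setting SEQ2_CHUNK=3200000000,
# larger than the whole bosTau2 assembly, means the query side is never
# split: each cluster job aligns one 10 Mb human chunk against the entire
# cow genome, so only SEQ1_CHUNK/SEQ1_LAP determine the job count.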
Jan 8 21:10 # Measurements: ssh hgwdev nice featureBits bosTau2 chainHg18Link # 1357027317 bases of 2812203870 (48.255%) in intersection nice featureBits hg18 chainBosTau2Link # 1357291762 bases of 2881515245 (47.103%) in intersection nice featureBits bosTau2 chainHg17Link # 0 bases of 2812203870 (0.000%) in intersection # nice featureBits hg17 chainBosTau2Link 1350076765 bases of 2866216770 (47.103%) in intersection ######################################################################### # BLASTZ TENREC EchTel1 second time (STARTED - 2006-01-09 DONE 2006-01-12 Fan) ssh pk mkdir /cluster/data/hg18/bed/blastzEchTel1.2006-01-09 cd /cluster/data/hg18/bed rm blastz.echTel1 ln -s blastzEchTel1.2006-01-09 blastz.echTel1 cd blastzEchTel1.2006-01-09 cat << '_EOF_' > DEF # human vs tenrec export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/cluster/home/angie/schwartzbin:/parasol/bin BLASTZ=blastz.v7.x86_64 # TARGET: Human Hg18 SEQ1_DIR=/scratch/hg/hg18/nib SEQ1_LEN=/scratch/hg/hg18/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: Tenrec EchTel1 SEQ2_DIR=/scratch/hg/echTel1/echTel1.2bit SEQ2_LEN=/scratch/hg/echTel1/chrom.sizes SEQ2_LIMIT=400 SEQ2_CHUNK=30000000 SEQ2_LAP=0 BASE=/cluster/data/hg18/bed/blastzEchTel1.2006-01-09 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs # establish a screen to control this job screen bash time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \ -stop=load \ `pwd`/DEF > load.out 2>&1 & # Started Mon Jan 9 08:09:03 PST 2006 # Found over a thousand jobs failed, all with the following 7 hosts. [pk:run.blastz> fgrep host j1.err | sort -u host: kkr10u06.kilokluster.ucsc.edu host: kkr10u58.kilokluster.ucsc.edu host: kkr10u62.kilokluster.ucsc.edu host: kkr11u34.kilokluster.ucsc.edu host: kkr11u39.kilokluster.ucsc.edu host: kkr12u18.kilokluster.ucsc.edu host: kkr12u29.kilokluster.ucsc.edu # manually created /scratch/tmp on above machines (except one). # 2 jobs still running for more than 5 hours each. para stop para recover jobList newJobList # newJobList contains only 2 jobs. Checked the .psl files under psl confirming only two files missing. para create newJobList para push # This 2 jobs finished within a couple of mintues! para time >run.time bash time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \ -continue=cat \ -stop=load \ `pwd`/DEF > load2.out 2>&1 & time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \ -swap -stop=load \ `pwd`/DEF > swap-load.out 2>&1 & time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \ -continue=net \ -swap \ -stop=load \ `pwd`/DEF > swap-load3.out 2>&1 & time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \ -continue=download \ `pwd`/DEF > download.out 2>&1 & time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \ -swap -continue=download \ `pwd`/DEF > swap-download.out 2>&1 & # Done! On Jan 12 09:18 # reciprocal best net mafs for multiz (2007-03-09 kate) ssh kkstore02 cd /cluster/data/hg18/bed/blastz.echTel1 ~/kent/src/hg/utils/automation/doRecipBest.pl hg18 echTel1 >&! 
rbest.log & # reloading chains which disappeared (2007-04-17 kate) cd /cluster/data/hg18/bed/blastz.echTel1/axtChain # edit loadUp.csh --> create loadUp2.csh and loadUp3.csh # run loadUp2.csh (does chainSplit) on kkstore02 # run loadUp3.csh (does hgLoadChain) on hgwdev ######################################################################### # BLASTZ CHICKEN GalGal2 second time (DONE - 2005-12-28 Fan) ssh pk mkdir /cluster/data/hg18/bed/blastzGalGal2.2005-12-28 cd /cluster/data/hg18/bed rm blastz.galGal2 ln -s blastzGalGal2.2005-12-28 blastz.galGal2 cd blastzGalGal2.2005-12-28 cat << '_EOF_' > DEF # human vs chicken export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/cluster/home/angie/schwartzbin:/parasol/bin BLASTZ=blastz.v7.x86_64 # Specific settings for chicken (per Webb email to Brian Raney) BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=10000 BLASTZ_K=2200 BLASTZ_Q=/cluster/data/blastz/HoxD55.q BLASTZ_ABRIDGE_REPEATS=1 # TARGET: Human Hg18 SEQ1_DIR=/scratch/hg/hg18/nib SEQ1_SMSK=/cluster/bluearc/hg18/linSpecRep/notInOthers SEQ1_LEN=/scratch/hg/hg18/chrom.sizes SEQ1_IN_CONTIGS=0 SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY: Chicken GalGal2 - single chunk big enough to run entire genome SEQ2_DIR=/scratch/hg/galGal2/nib SEQ2_LEN=/cluster/bluearc/galGal2/chrom.sizes SEQ2_SMSK=/scratch/hg/galGal2/linSpecRep SEQ2_IN_CONTIGS=0 SEQ2_CHUNK=200000000 SEQ2_LAP=0 BASE=/cluster/data/hg18/bed/blastzGalGal2.2005-12-28 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs # establish a screen to control this job screen bash time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \ -stop=load \ `pwd`/DEF > load.out 2>&1 & # Started 2005-12-28 10:35 # Two jobs stuck in the same node. Did manual para stop and para push. # Both finished within a few minutes. # Done! On Wed Dec 28 15:32:45 PST 2005. time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \ -swap -stop=load \ `pwd`/DEF > swap-load.out 2>&1 & # Had an error at the net step time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \ -swap -continue=net -stop=load \ `pwd`/DEF > swap-load.out 2>&1 & time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \ -continue=download \ `pwd`/DEF > download.out 2>&1 & # the gzip job on kolossus seems not moving at all. # killed it manually. Try again. # Seemed not moving, kill it again. Now use pk instead of kolossus. time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -workhorse=pk \ -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \ -continue=download \ `pwd`/DEF > download.out 2>&1 & time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -workhorse=pk \ -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \ -swap -continue=download \ `pwd`/DEF > swap-download.out 2>&1 & # Done! 
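# (Added note, not in the original log.)  HoxD55.q is the more permissive
# scoring matrix used in this doc for the distant, non-mammal comparisons
# (chicken, frog, tetraodon, fugu); those runs are also chained with
# -chainMinScore=5000 rather than the 3000 used for most of the mammals.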
Wed Dec 28 20:39:44 PST 2005 # Measurements: ssh hgwdev nice featureBits galGal2 chainHg18Link # 91564024 bases of 1054197620 (8.686%) in intersection nice featureBits hg18 chainGalGal2Link # 102417858 bases of 2881515245 (3.554%) in intersection nice featureBits galGal2 chainHg17Link # 93277286 bases of 1054197620 (8.848%) in intersection nice featureBits hg17 chainGalGal2Link # 103882699 bases of 2866216770 (3.624%) in intersection # BLASTZ FROG XenTro1 second time (DONE - 2006-01-07 Fan) ssh pk mkdir /cluster/data/hg18/bed/blastzXenTro1.2006-01-06 cd /cluster/data/hg18/bed rm blastz.xenTro1 ln -s blastzXenTro1.2006-01-06 blastz.xenTro1 cd blastzXenTro1.2006-01-06 cat << '_EOF_' > DEF # human vs frog export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/cluster/home/angie/schwartzbin:/parasol/bin BLASTZ=blastz.v7.x86_64 BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=8000 BLASTZ_K=2200 BLASTZ_Q=/cluster/data/blastz/HoxD55.q # TARGET: Human Hg18 SEQ1_DIR=/scratch/hg/hg18/nib SEQ1_LEN=/scratch/hg/hg18/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: Frog XenTro1 - single chunk big enough to run entire genome SEQ2_DIR=/scratch/hg/xenTro1/xenTro1.2bit SEQ2_LEN=/scratch/hg/xenTro1/chrom.sizes SEQ2_LIMIT=400 SEQ2_CHUNK=30000000 SEQ2_LAP=0 BASE=/cluster/data/hg18/bed/blastzXenTro1.2006-01-06 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs # establish a screen to control this job screen bash time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \ -stop=load \ `pwd`/DEF > load.out 2>&1 & # Started Fri Jan 6 20:19:30 PST 2006 # Blastz run done. Jan 7 02:07 load.out time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \ -swap -stop=load \ `pwd`/DEF > swap-load.out 2>&1 & # got the following error: startStep: 4, at step 5 net to stopStep 6 netChains: looks like previous stage was not successful (can't find [xenTro1.hg18.]all.chain[.gz]). # Try it with pk instead of kolossus: time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \ -workhorse=pk \ -swap -stop=load \ `pwd`/DEF > swap-load2.out 2>&1 & # It worked, swap-load done. Jan 7 06:05 time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \ -workhorse=pk \ -continue=download \ `pwd`/DEF > download.out 2>&1 & time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -workhorse=pk \ -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \ -swap -continue=download \ `pwd`/DEF > swap-download.out 2>&1 & # Done! 
Jan 7 06:18 # Measurements: ssh hgwdev nice featureBits xenTro1 chainHg18Link # 61197900 bases of 1381238994 (4.431%) in intersection nice featureBits hg18 chainXenTro1Link # 67810866 bases of 2881515245 (2.353%) in intersection nice featureBits xenTro1 chainHg17Link # 81777842 bases of 1381238994 (5.921%) in intersection nice featureBits hg17 chainXenTro1Link # 85701475 bases of 2866216770 (2.990%) in intersection # BLASTZ TETRAODON TetNig1 second time (DONE - 2006-01-07 Fan) ssh pk mkdir /cluster/data/hg18/bed/blastzTetNig1.2006-01-07 cd /cluster/data/hg18/bed rm blastz.tetNig1 ln -s blastzTetNig1.2006-01-07 blastz.tetNig1 cd blastzTetNig1.2006-01-07 cat << '_EOF_' > DEF # human vs tetraodon export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/cluster/home/angie/schwartzbin:/parasol/bin BLASTZ=blastz.v7.x86_64 BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=6000 BLASTZ_K=2200 BLASTZ_Q=/cluster/data/blastz/HoxD55.q # TARGET: Human Hg18 SEQ1_DIR=/scratch/hg/hg18/nib SEQ1_LEN=/scratch/hg/hg18/chrom.sizes SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY: Tetraodon TetNig1 - single chunk big enough to run entire genome SEQ2_DIR=/san/sanvol1/scratch/tetNig1/tetNig1.2bit SEQ2_LEN=/san/sanvol1/scratch/tetNig1/chrom.sizes SEQ2_CHUNK=410000000 SEQ2_LAP=0 BASE=/cluster/data/hg18/bed/blastzTetNig1.2006-01-07 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs # establish a screen to control this job screen bash time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \ -stop=load \ `pwd`/DEF > load.out 2>&1 & # Started Sat Jan 7 05:40:51 PST 2006 # Encountered an error: startStep: 0, at step 5 net to stopStep 6 netChains: looks like previous stage was not successful (can't find [hg18.tetNig1.]all.chain[.gz]). # Try it with pk as the workhorse. time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \ -workhorse=pk \ -continue=net \ -stop=load \ `pwd`/DEF > load2.out 2>&1 & # Load done. Sat Jan 7 07:34:56 PST 2006 time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \ -workhorse=pk \ -swap -stop=load \ `pwd`/DEF > swap-load.out 2>&1 & time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \ -workhorse=pk \ -continue=download \ `pwd`/DEF > download.out 2>&1 & time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \ -workhorse=pk \ -swap -continue=download \ `pwd`/DEF > swap-download.out 2>&1 & # Done! Sat Jan 7 08:02:14 PST 2006 # The download and swap-download took less than 10 seconds each. ??? # Measurements: ssh hgwdev nice featureBits tetNig1 chainHg18Link # 50026847 bases of 342403326 (14.611%) in intersection nice featureBits hg18 chainTetNig1Link # 57654754 bases of 2881515245 (2.001%) in intersection nice featureBits tetNig1 chainHg17Link # 34379509 bases of 342403326 (10.041%) in intersection nice featureBits hg17 chainTetNig1Link # 35910128 bases of 2866216770 (1.253%) in intersection ######################################################################### # BLASTZ FUGU fr1 (STARTED - 2005-12-20, DONE 2006-01-04 Fan) ssh pk mkdir /cluster/data/hg18/bed/blastzFr1.2005-12-20 cd /cluster/data/hg18/bed ln -s blastzFr1.2005-12-20 blastz.fr1 cd blastzFr1.2005-12-20 cat << '_EOF_' > DEF # human vs. 
fugu export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/cluster/home/angie/schwartzbin:/parasol/bin BLASTZ=blastz.v7.x86_64 # Reuse parameters from human-chicken, except L=6000 (more relaxed) BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=6000 BLASTZ_K=2200 BLASTZ_Q=/cluster/data/blastz/HoxD55.q # TARGET: Human Hg18 - testing 100,000,000 sized chunk on pk kluster SEQ1_DIR=/scratch/hg/hg18/nib SEQ1_LEN=/scratch/hg/hg18/chrom.sizes SEQ1_CHUNK=100000000 SEQ1_LAP=10000 # QUERY: Fugu Fr1 - chunk big enough to run the whole chrom at once SEQ2_DIR=/san/sanvol1/scratch/fr1/nib SEQ2_LEN=/san/sanvol1/scratch/fr1/chrom.sizes SEQ2_CHUNK=400000000 SEQ2_LAP=0 BASE=/cluster/data/hg18/bed/blastzFr1.2005-12-20 '_EOF_' # << happy emacs # establish a screen to control this job ssh pk cd /cluster/data/hg18/bed/blastzFr1.2005-12-20 screen bash time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=5000 -stop=load \ `pwd`/DEF > thruLoad.out 2>&1 & ssh pk cd /cluster/data/hg18/bed/blastzFr1.2005-12-20 screen bash time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=5000 -continue=chainMerge -stop=load \ `pwd`/DEF > thruLoad.out 2>&1 & time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=5000 -continue=download \ `pwd`/DEF > download.clean.out 2>&1 & time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=5000 -swap \ `pwd`/DEF > swap.out 2>&1 & # Finish the remaining step, 1/4/05. ssh pk cd /cluster/data/hg18/bed/blastzFr1.2005-12-20 screen bash time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=5000 \ -swap -continue=download \ `pwd`/DEF > DownloadSwap.out 2>&1 & # First try found the DEF was some how altered for rn3. # Re-generated DEF and try again. time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=5000 \ -swap -continue=download \ `pwd`/DEF > DownloadSwap2.out 2>&1 & # Done. Jan 4 09:48. # measurements nice featureBits hg18 chainFr1Link # 51795958 bases of 2881515245 (1.798%) in intersection nice featureBits hg17 chainFr1Link #50831650 bases of 2866216770 (1.773%) in intersection nice featureBits hg18 netFr1 # 691148929 bases of 2881515245 (23.986%) in intersection nice featureBits hg17 netFr1 # 714234935 bases of 2866216770 (24.919%) in intersection nice featureBits fr1 chainHg18Link # 43267869 bases of 315518167 (13.713%) in intersection # nice featureBits fr1 chainHg17Link 0 bases of 315518167 (0.000%) in intersection nice featureBits fr1 netHg18 # 140843080 bases of 315518167 (44.639%) in intersection nice featureBits fr1 netHg17 # 0 bases of 315518167 (0.000%) in intersection ################################################## # For blastz runs between hg18 and other organisms, they are documented in # makeMm8.doc makeRn4.doc, makeRheMac2.doc, makeDanRer3.doc. # PHASTCONS SCORES DOWNLOADABLES FOR 17WAY (2006-04-06 Fan) ssh kkstore02 cd /cluster/data/hg18/bed/multiz17way mkdir phastConsDownloads cd phastConsDownloads cat > downloads.csh << 'EOF' date cd /san/sanvol1/scratch/hg18/multiz17way.2006-02-18/cons/pp foreach chr (`awk '{print $1}' /cluster/data/hg18/chrom.sizes`) echo $chr cat `ls -1 $chr/$chr.*.pp | sort -t\. -k2,2n` \ | nice gzip -c \ > /cluster/data/hg18/bed/multiz17way/phastConsDownloads/$chr.gz end date 'EOF' csh downloads.csh >&! 
downloads.log & # ~20 minutes # << happy emacs ssh hgwdev cd /cluster/data/hg18/bed/multiz17way/phastConsDownloads md5sum *.gz > md5sum.txt set dir = /usr/local/apache/htdocs/goldenPath/hg18/phastCons17way mkdir $dir ln -s /cluster/data/hg18/bed/multiz17way/phastConsDownloads/{*.gz,md5sum.txt} $dir cp /usr/local/apache/htdocs/goldenPath/hg17/phastCons17way/README.txt $dir # edit this file to reflect the latest releases used. vi $dir/README.txt ########################################################################## # RE-BUILT GO DATABASE (DONE 4/12/06, Fan) # GO changed the content of gene_association.goa_uniprot.gz. # Tho original one we use no longer has human, mouse, etc in it. # They are placed in separate files. # Per GO's suggestion, we now get the file from the submission sub-directory. # This seems cover more than concatenating the individual goa... files. # Download the terms and make the database. ssh hgwdev mkdir /cluster/store1/geneOntology/20060330 cd /cluster/store1/geneOntology/20060330 wget --timestamping http://www.godatabase.org/dev/database/archive/latest/go_200603-assocdb-data.gz hgsql mysql <j.tmp hgsql go060330 chr\S+)/$1.1/' $F done >> geneid.fa # one of the files in this delivery, chr1.prot, did *not* have a # terminal character and it caused the next protein in the # next file processed, chr10.prot, to be a continuation of the # last protein in chr1.prot. To check for this: grep ">" geneid.fa | grep -v "^>" # shows a line: # AVSET>chr10_1.1 # This turns out to have been the result of a truncated file. # Fetch that file again: mv chr1.prot chr1.prot.orig wget http://genome.imim.es/genepredictions/H.sapiens/golden_path_200603/geneid_v1.2/chr1.prot # That's better: wc -l chr1.prot chr1.prot.orig # 24494 chr1.prot # 4524 chr1.prot.orig rm chr1.prot.orig # run the above loop again to generate geneid.fa after: rm geneid.fa ldHgGene -gtf -genePredExt hg18 geneid *.gtf # Read 33410 transcripts in 275347 lines in 49 files # 33410 groups 49 seqs 1 sources 3 feature types # 33410 gene predictions hgPepPred hg18 generic geneidPep geneid.fa # verify same names in both tables: awk '{print $1}' geneidPep.tab | sort > pep.names awk '{print $1}' genePred.tab | sort > id.names wc -l pep.names id.names # 33410 pep.names # 33410 id.names comm -12 pep.names id.names | wc -l # 33410 # QA NOTE (ASZ 5-11-2006) I dropped the geneidPep table and the reference # to it from the trackDb.ra file. This functionality is now done on the # fly and this table is no longer needed. # Added back the geneidPep table as requested by a user # (hartera, 2006-07-11) ssh hgwdev cd /cluster/data/hg18/bed/geneid hgPepPred hg18 generic geneidPep geneid.fa # The trackDb.ra file in kent/src/makeDb seems to have a reference # to the geneidPep table already. ########################################################################## # BLASTZ/CHAIN/NET XENTRO2 (DONE 4/20/06 angie) ssh kkstore02 mkdir /cluster/data/hg18/bed/blastz.xenTro2.2006-04-20 cd /cluster/data/hg18/bed/blastz.xenTro2.2006-04-20 cat << '_EOF_' > DEF # human vs. 
frog BLASTZ=/cluster/bin/penn/x86_64/blastz.v7.x86_64 # Use same params as used for mammal-xenTro1 (see makeXenTro1.doc) BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=8000 BLASTZ_K=2200 BLASTZ_Q=/cluster/data/blastz/HoxD55.q # TARGET: Human hg18 SEQ1_DIR=/scratch/hg/hg18/nib SEQ1_LEN=/cluster/data/hg18/chrom.sizes SEQ1_CHUNK=50000000 SEQ1_LAP=10000 # QUERY: Frog xenTro2 - single chunk big enough to run two of the # largest scaffolds in one job SEQ2_DIR=/scratch/hg/xenTro2/xenTro2.2bit SEQ2_LEN=/san/sanvol1/scratch/xenTro2/chrom.sizes SEQ2_CHUNK=20000000 SEQ2_LAP=0 SEQ2_LIMIT=100 BASE=/cluster/data/hg18/bed/blastz.xenTro2.2006-04-20 '_EOF_' # << emacs doBlastzChainNet.pl -blastzOutRoot=/san/sanvol1/hg18XenTro2 \ -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose DEF \ >& do.log & tail -f do.log ln -s blastz.xenTro2.2006-04-20 /cluster/data/hg18/bed/blastz.xenTro2 ########################################################################### # BLASTZ CHAIN SWAP FOR ZEBRAFISH (danRer4) (DONE, 2006-04-25, hartera) # CREATE CHAIN AND NET TRACKS, AXTNET, MAFNET, LIFTOVER AND ALIGNMENT DOWNLOADS # See also makeDanRer4.doc # alignments are in: /cluster/data/hg18/bed/blastz.danRer4.swap # Blastz parameters used were: # BLASTZ_H=2000 # BLASTZ_Y=3400 # BLASTZ_L=6000 # BLASTZ_K=2200 # BLASTZ_Q=/san/sanvol1/scratch/blastz/HoxD55.q # There are no lineage-specific repeats defined for this species pair so # all repeats were used as lineage-specific. ssh pk cd /cluster/data/danRer4/bed/blastz.hg18.2006-04-24 nice /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \ -swap `pwd`/DEF >& doSwap.log & # Took about 15 minutes. # check with featureBits and compare to danRer3 chains: featureBits hg18 chainDanRer4Link # 57415379 bases of 2881515245 (1.993%) in intersection featureBits hg18 chainDanRer3Link # 64801985 bases of 2881515245 (2.249%) in intersection featureBits -chrom=chr1 hg18 refGene:cds chainDanRer4Link -enrichment # refGene:cds 1.389%, chainDanRer4Link 2.337%, both 0.937%, cover 67.47%, # enrich 28.87x featureBits -chrom=chr1 hg18 refGene:cds chainDanRer3Link -enrichment # refGene:cds 1.389%, chainDanRer3Link 2.601%, both 0.931%, cover 67.01%, # enrich 25.76x featureBits -chrom=chr1 hg17 refGene:cds chainDanRer2Link -enrichment # refGene:cds 1.395%, chainDanRer2Link 2.742%, both 0.911%, cover 65.31%, # enrich 23.82x # similar coverage and enrichment for danRer4 and danRer3 chains # which is good. featureBits -chrom=chr1 hg18 refGene:cds netDanRer4 -enrichment # refGene:cds 1.389%, netDanRer4 31.001%, both 1.096%, cover 78.91%, # enrich 2.55x featureBits -chrom=chr1 hg18 refGene:cds netDanRer3 -enrichment # refGene:cds 1.389%, netDanRer3 29.929%, both 1.080%, cover 77.72%, # enrich 2.60x # Similar coverage and enrichment for danRer4 net on hg18 as for danRer3. 
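# (Illustrative sketch, not part of the original checks.)  The
# refGene:cds coverage/enrichment comparisons above can be collected
# in one pass by looping featureBits over the old and new zebrafish
# chain/net tables:
bash
for t in chainDanRer4Link chainDanRer3Link netDanRer4 netDanRer3
do
    featureBits -chrom=chr1 hg18 refGene:cds $t -enrichment
done
exit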
# LOAD FIRSTEF TRACK (DONE 2006-04-25 Fan) ssh hgwdev mkdir -p /cluster/data/hg18/bed/firstEF cd /cluster/data/hg18/bed/firstEF # receive the file firstEFMar05New.bed.gz from email (ramana.davuluri at osumc.edu) into this subdirectory cat << '_EOF_' > sedScript s/chr23/chrX/g s/chr24/chrY/g /^>/d /^$/d /^No/d '_EOF_' # << this line keeps emacs coloring happy bash zcat firstEFMar05New.bed.gz | sed -f sedScript | awk "{OFS=\"\t\"} {\$3 +=1; print \$0}" > firstEF.bed exit hgLoadBed hg18 firstEF firstEF.bed rm firstEF.bed bed.tab #done firstEF ########################################################################### # ALTGRAPHX TRACK (sugnet) Wed Apr 26 13:46:46 PDT 2006 cd /cluster/store1/sugnet/altSplice/ mkdir hg18-2006.04.13 cd hg18-2006.04.13 mkdir rnaCluster cd rnaCluster # Don't use RAGE libraries for clone bounds. ~/latestJk/kent/src/hg/geneBounds/clusterRna/generateRageAccList.csh hg18 rage.libs # Make spec file to run. foreach c (`echo 'select chrom from chromInfo' | hgsql hg18 | grep -v chrom`) set out = chrom/$c.bed echo "clusterRna -mrnaExclude=hg18.rage.libs hg18 /dev/null $out -chrom=$c" >> clusterRna.spec end # Tried running it on the minicluster, but can't connect to the # cluster accounts so run it from here on hgwdev. chmod 755 clusterRna.spec mkdir chrom ./clusterRna.spec >& clusterRna.log cd .. # Make script to setup parasol job file for raw altGraphX files on human cat << '_EOF_' > makeRun.sh #!/bin/sh for chrom in `echo "select chrom from chromInfo" | hgsql hg18 | grep -v chrom`; do echo 'echo "Doing $chrom"' echo "/cluster/home/sugnet/bin/i386/altSplice -db=hg18 -beds=rnaCluster/chrom/$chrom.bed -agxOut=agxs/hg18.$chrom.agx -consensus -weightMrna -localMem -minAli=.95 -minCover=.5 -chromNib=/cluster/data/hg18/nib/$chrom.nib" done '_EOF_' # << this line makes emacs coloring happy mkdir agxs chmod 755 makeRun.sh chmod 755 toRun.sh ./toRun.sh >& toRun.log & cat agxs/*.agx > hg18.agx mkdir hg18 mv agxs/ makeRun.sh toRun.log toRun.sh hg18.agx hg18 cd .. mkdir mm7 cd mm7 # make the rnaClusters mkdir rnaCluster cd rnaCluster/ mkdir chrom # Don't use RAGE libraries for clone bounds. ~/latestJk/kent/src/hg/geneBounds/clusterRna/generateRageAccList.csh mm7 rage.libs foreach c (`echo 'select chrom from chromInfo' | hgsql mm7 | grep -v chrom`) set out = chrom/$c.bed echo "clusterRna -mrnaExclude=mm7.rage.libs mm7 /dev/null $out -chrom=$c" >> clusterRna.spec end # tried to run on kki, but no longer can access db from minicluster. chmod 755 clusterRna.spec ./clusterRna.spec >& clusterRna.log & cd .. cat << '_EOF_' > makeRun.sh #!/bin/sh for chrom in `echo "select chrom from chromInfo" | hgsql mm7 | grep -v chrom`; do echo 'echo "Doing $chrom"' echo "/cluster/home/sugnet/bin/i386/altSplice -db=mm7 -beds=rnaCluster/chrom/$chrom.bed -agxOut=agxs/mm7.$chrom.agx -consensus -weightMrna -localMem -minAli=.95 -minCover=.5 -chromNib=/cluster/data/mm7/nib/$chrom.nib" done '_EOF_' # << this line keeps emacs coloring happy chmod 755 makeRun.sh ./makeRun.sh > toRun.sh chmod 755 toRun.sh mkdir agxs ./toRun.sh >& toRun.log & cat agxs/*.agx > mm7.agxc cd .. mkdir orthoSpliceExoniphy cd orthoSpliceExoniphy/ echo "select chrom, txStart, txEnd, name, id, strand from exoniphy order by chrom, txStart;" | hgsql hg17 | grep -v txStart > hg17.exoniphy.bed liftOver hg17.exoniphy.bed /gbdb/hg17/liftOver/hg17ToHg18.over.chain.gz hg18.exoniphy.bed hg17.exoniphy.unmapped.bed mkdir orthoSplice cd orthoSplice ln -s ../orthoSpliceExoniphy/hg18.exoniphy.bed . 
echo 'select chrom, size from chromInfo' | hgsql hg18 | grep -v chrom > chromSizes.tab cp /cluster/data/hg18/bed/blastz.mm7/axtChain/hg18.mm7.all.chain.gz . chainSplit chains hg18.mm7.all.chain cp /cluster/data/hg18/bed/blastz.mm7/axtChain/hg18.mm7.net.gz . netSplit hg18.mm7.net.gz nets mkdir agx report logs cat << '_EOF_' > makeRun.sh #!/usr/bin/perl -w open(IN, 'chromSizes.tab') or die "Can't open chromSizes.tab\n"; while() { chomp; @w = split; print "/cluster/home/sugnet/bin/i386/orthoSplice -chromSize=$w[1] -exonFile=hg18.exoniphy.bed -trumpNum=3 -chrom=$w[0] -altInFile=../hg18/agxs/hg18.$w[0].agx -orthoAgxFile=../mm7/mm7.agx -db=hg18 -orthoDb=mm7 -netFile=nets/$w[0].net -chainFile=chains/$w[0].chain -commonFile=agx/$w[0].hg18.mm7.cons.t3.agx -reportFile=report/$w[0].hg18.report -edgeFile=report/$w[0].hg18.edge.report >& logs/$w[0].test.log\n"; } '_EOF_' # << this line keeps emacs coloring happy # clean up disk space we're not using rm hg18.mm7.all.chain hg18.mm7.net.gz nets/* chains/* chmod 755 makeRun.sh ./makeRun.sh > orthoSplice.para.spec ssh kki cd /cluster/store1/sugnet/altSplice/hg18-2006.04.13/orthoSplice para create orthoSplice.para.spec para push cat agx/*.agx > hg18.mm7.t3.exoniphy.agx cp ~/latestJk/kent/src/hg/lib/altGraphX.sql . hgLoadBed -notItemRgb -sqlTable=altGraphX.sql hg18 altGraphX hg18.mm7.t3.exoniphy.agx # end AltGraphX track. #################################################################### # EXONWALK TRACK (sugnet) Wed Apr 26 13:51:14 PDT 2006 # first make altGraphX track (see above) cd /cluster/store1/sugnet/altSplice/hg18-2006.04.13/orthoSplice mkdir exonWalk mkdir beds cd exonWalk mkdir beds foreach file (`ls ../agx/*.agx`) set base=`basename $file .agx` echo "/cluster/home/sugnet/bin/i386/exonWalk db=hg18 minPercent=0 trumpSize=100000 $file beds/$base.bed" >> exonWalk.para.spec end para create exonWalk.para.spec para push cat beds/*.bed > hg18.mm7.cons.t3.exoniphy.bed mkdir orfs cd orfs mkdir bedOrf beds fa borf cp ~/store1/altSplice/hg17-2005.01.09/orthoSpliceExonify/exonWalk/orfs.mrna2/*.sh ./ splitFile ../../hg18.mm7.cons.t3.exoniphy.bed 500 exonWalk. cat << '_EOF_' > makeFa.sh #!/bin/sh for file in "$@" do base=`basename $file` echo "Doing $file" echo "sequenceForBed -db=hg18 -bedIn=$file -fastaOut=fa/$base.fa " sequenceForBed -db=hg18 -bedIn=$file -fastaOut=fa/$base.fa done '_EOF_' chmod 755 makeFa.sh makeFa.sh beds/* cat << '_EOF_' > makeGenePred.sh #!/bin/sh for file in "$@" do base=`basename $file` /cluster/home/sugnet/bin/i386/borfMatcher -keepNmd beds/$base borf/$base.borf bedOrf/$base.bed genePred/$base.gp done '_EOF_' chmod 755 makeGenePred.sh makeGenePred.sh beds/* cat beds/* > hg18.mm7.exonWalk.bed cat genePred/*.gp > hg18.mm7.exonWalk.gp ldHgGene -predTab hg18 exonWalk hg18.mm7.exonWalk.gp cat << '_EOF_' > makeNoNmdGenePred.sh #!/bin/sh for file in "$@" do base=`basename $file` /cluster/home/sugnet/bin/i386/borfMatcher beds/$base borf/$base.borf bedOrfNoNmd/$base.bed genePredNoNmd/$base.gp done '_EOF_' mkdir bedOrfNoNmd genePredNoNmd chmod 755 ./makeNoNmdGenePred.sh wc beds/* 275987 3311844 57319256 total wc genePredNoNmd/*.gp 169203 1692030 59907679 total wc genePred/*.gp 225252 2252520 83619240 total cat genePred/*.gp > hg18.mm7.exonWalk.nmd.gp cat genePredNoNmd/*.gp > hg18.mm7.exonWalk.noNmd.gp cat beds/* > hg18.mm7.exonWalk.all.bed # Plain "exonWalk" track is the only one used on regular genome browser. 
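# (Sketch only, not part of the original load.)  Before loading, a quick
# consistency check in the style of the geneid name comparison above:
# every transcript in the noNmd set should also appear in the -keepNmd
# set (this assumes borfMatcher assigns the same names in both runs).
awk '{print $1}' hg18.mm7.exonWalk.noNmd.gp | sort > noNmd.names
awk '{print $1}' hg18.mm7.exonWalk.nmd.gp | sort > nmd.names
comm -23 noNmd.names nmd.names | wc -l
# expect 0 lines unique to the noNmd set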
ldHgGene -predTab hg18 exonWalk hg18.mm7.exonWalk.noNmd.gp hgLoadBed hg18 exonWalkAll hg18.mm7.exonWalk.all.bed ldHgGene -predTab hg18 exonWalkWithNmd hg18.mm7.exonWalk.nmd.gp cat hg18.mm7.exonWalk.noNmd.gp | cut -f 1,2 -d '.' | sort | uniq -c | sort -rnk 1 > counts.txt Q1 1.000000 median 3.000000 Q3 7.000000 average 10.670556 min 1.000000 max 3844.000000 count 15857 total 169203.000000 standard deviation 63.330761 cat hg18.mm7.exonWalk.nmd.gp | cut -f 1,2 -d '.' | sort | uniq -c | sort -rnk 1 > counts.txt ave counts.txt Q1 1.000000 median 3.000000 Q3 8.000000 average 14.037891 min 1.000000 max 7278.000000 count 16046 total 225252.000000 standard deviation 99.406890 trackGenome hg18 all refGene:cds trackGenome.spec Track Specification track overlap track cov track new cum size size geno track cov cov cov ----------------------------------------------------------------------------- exonWalk:cds 31207765 27951670 1.00% 89.57% 90.24% 90.24% 90.24% # end ExonWalk track. ########################################################################### # ALTGRAPHX2 TRACK (kent) in progress Fri Jan 19 11:27:45 PST 2007 # The exoniphy and human/mouse blastz/chain/nets need to be done before # this. ssh hgwdev cd /cluster/store1/sugnet/altSplice/ mkdir hg18-2007.01.19 cd hg18-2007.01.19 mkdir rnaCluster cd rnaCluster # Don't use RAGE libraries for clone bounds. ~/kent/src/hg/geneBounds/clusterRna/generateRageAccList.csh hg18 rage.libs # Make spec file to run. echo "#!/bin/tcsh -ef@ > clusterRna.spec foreach c (`echo 'select chrom from chromInfo' | hgsql hg18 | grep -v chrom`) set out = chrom/$c.bed echo "clusterRna -mrnaExclude=hg18.rage.libs hg18 /dev/null $out -chrom=$c" >> clusterRna.spec end # Run the file. Needs to be done on machine with database access. # Takes an hour or so. chmod 755 clusterRna.spec mkdir chrom ./clusterRna.spec >& clusterRna.log cd .. # Make script to setup job file for raw altGraphX files on human # If we had a cluster with database access this could be run there. # As it is, run it on hgwdev. This took 45 minutes. cat << '_EOF_' > makeRun.sh #!/bin/sh echo "#!/bin/tcsh -ef" for chrom in `echo "select chrom from chromInfo" | hgsql hg18 | grep -v chrom`; do echo "echo 'Doing $chrom'" echo "altSplice -db=hg18 -beds=rnaCluster/chrom/$chrom.bed -agxOut=agxs/hg18.$chrom.agx -consensus -weightMrna -localMem -minAli=.95 -minCover=.5 -chromNib=/cluster/data/hg18/nib/$chrom.nib" done '_EOF_' # << this line makes emacs coloring happy mkdir agxs chmod 755 makeRun.sh ./makeRun.sh > toRun.sh chmod 755 toRun.sh ./toRun.sh >& toRun.log & cat agxs/*.agx > hg18.agx mkdir hg18 mv agxs/ makeRun.sh toRun.log toRun.sh hg18.agx hg18 cd .. mkdir mm8 cd mm8 # make the rnaClusters mkdir rnaCluster cd rnaCluster/ mkdir chrom # Don't use RAGE libraries for clone bounds. ~/kent/src/hg/geneBounds/clusterRna/generateRageAccList.csh mm8 rage.libs echo "#!/bin/tcsh -ef" > clusterRna.spec foreach c (`echo 'select chrom from chromInfo' | hgsql mm8 | grep -v chrom`) set out = chrom/$c.bed echo "clusterRna -mrnaExclude=mm8.rage.libs mm8 /dev/null $out -chrom=$c" >> clusterRna.spec end # Could make this a cluster run if had a cluster with database access. # as is, took about 15 minutes on hgwdev. (Faster than human since less ESTs.) chmod 755 clusterRna.spec ./clusterRna.spec >& clusterRna.log & cd .. # Make batch file file to run altSplice program (by making a batch file). 
cat << '_EOF_' > makeRun.sh #!/bin/sh echo "#!/bin/tcsh -ef" for chrom in `echo "select chrom from chromInfo" | hgsql mm8 | grep -v chrom`; do echo "echo 'Doing $chrom'" echo "/cluster/home/sugnet/bin/i386/altSplice -db=mm8 -beds=rnaCluster/chrom/$chrom.bed -agxOut=agxs/mm8.$chrom.agx -consensus -weightMrna -localMem -minAli=.95 -minCover=.5 -chromNib=/cluster/data/mm8/nib/$chrom.nib" done '_EOF_' # << this line keeps emacs coloring happy chmod 755 makeRun.sh ./makeRun.sh > toRun.sh chmod 755 toRun.sh # Run altSplice. This takes about 12 minutes. mkdir agxs ./toRun.sh >& toRun.log & cat agxs/*.agx > mm8.agx cd .. mkdir orthoSpliceExoniphy cd orthoSpliceExoniphy/ echo "select chrom, txStart, txEnd, name, id, strand from exoniphy order by chrom, txStart;" | hgsql hg18 | grep -v txStart > hg18.exoniphy.bed mkdir orthoSplice cd orthoSplice echo 'select chrom, size from chromInfo' | hgsql hg18 | grep -v chrom > chromSizes.tab zcat /cluster/data/hg18/bed/blastz.mm8/axtChain/hg18.mm8.all.chain.gz | chainSplit chains stdin zcat /cluster/data/hg18/bed/blastz.mm8/axtChain/hg18.mm8.net.gz | netSplit stdin nets mkdir agx report logs cat << '_EOF_' > makeRun.sh #!/usr/bin/perl -w open(IN, 'chromSizes.tab') or die "Can't open chromSizes.tab\n"; while() { chomp; @w = split; print "orthoSplice -chromSize=$w[1] -exonFile=../hg18.exoniphy.bed -trumpNum=3 -chrom=$w[0] -altInFile=../../hg18/agxs/hg18.$w[0].agx -orthoAgxFile=../../mm8/mm8.agx -db=hg18 -orthoDb=mm8 -netFile=nets/$w[0].net -chainFile=chains/$w[0].chain -commonFile=agx/$w[0].hg18.mm8.cons.t3.agx -reportFile=report/$w[0].hg18.report -edgeFile=report/$w[0].hg18.edge.report >& logs/$w[0].test.log\n"; } '_EOF_' # << this line keeps emacs coloring happy chmod 755 makeRun.sh ./makeRun.sh > orthoSplice.para.spec # do a little cluster run ssh kki cd /cluster/store1/sugnet/altSplice/hg18-2007.01.19/orthoSpliceExoniphy/orthoSplice para create orthoSplice.para.spec para push # Do para check, etc until done. Here's the para time results. # # 49 jobs in batch # 147 jobs (including everybody's) in Parasol queue. # Checking finished jobs # Completed: 47 of 49 jobs # Crashed: 2 jobs # CPU time in finished jobs: 7002s 116.70m 1.94h 0.08d 0.000 y # IO & Wait Time: 196s 3.27m 0.05h 0.00d 0.000 y # Average job time: 153s 2.55m 0.04h 0.00d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 1283s 21.38m 0.36h 0.01d # Submission to last job: 1283s 21.38m 0.36h 0.01d # # The two jobs that crashed are ok, it was simply the result of no input on # some of the small random chroms. It'd be good to take the jobs out earlier # somehow. Probably Angie could figure out a way to add a file existence # test in a line of the perl script above. The altInFile is missing in this # case. # Concatenate cluster output and load it into the database. ssh hgwdev cd /cluster/store1/sugnet/altSplice/hg18-2007.01.19/orthoSpliceExoniphy/orthoSplice cat agx/*.agx > hg18.mm8.t3.exoniphy.agx cp ~/kent/src/hg/lib/altGraphX.sql . hgLoadBed -notItemRgb -sqlTable=altGraphX.sql hg18 altGraphX2 hg18.mm8.t3.exoniphy.agx # clean up disk space we're not using rm hg18.mm7.all.chain hg18.mm7.net.gz nets/* chains/* # end AltGraphX2 track. 
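# (Addendum, illustrative only - not run for this build.)  The two
# crashed orthoSplice jobs above came from missing per-chrom .agx input
# on small random chroms; the job-generating perl script could simply
# skip those chroms with a file-existence test.  The script name
# makeRunChecked.sh below is hypothetical; everything else mirrors the
# makeRun.sh used above.
cat << '_EOF_' > makeRunChecked.sh
#!/usr/bin/perl -w
# same as makeRun.sh above, but only emit a job line when the per-chrom
# altGraphX input file actually exists
open(IN, 'chromSizes.tab') or die "Can't open chromSizes.tab\n";
while(<IN>) {
    chomp;
    @w = split;
    $agx = "../../hg18/agxs/hg18.$w[0].agx";
    next unless -e $agx;    # skip chroms with no altGraphX input
    print "orthoSplice -chromSize=$w[1] -exonFile=../hg18.exoniphy.bed -trumpNum=3 -chrom=$w[0] -altInFile=$agx -orthoAgxFile=../../mm8/mm8.agx -db=hg18 -orthoDb=mm8 -netFile=nets/$w[0].net -chainFile=chains/$w[0].chain -commonFile=agx/$w[0].hg18.mm8.cons.t3.agx -reportFile=report/$w[0].hg18.report -edgeFile=report/$w[0].hg18.edge.report >& logs/$w[0].test.log\n";
}
'_EOF_'
chmod 755 makeRunChecked.sh
./makeRunChecked.sh > orthoSplice.para.spec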
#################################################################### # EXONWALK2 TRACK (kent) Tue Jan 24 2007 # first make altGraphX2 track (see above) ssh hgwdev cd /cluster/store1/sugnet/altSplice/hg18-2007.01.19/orthoSpliceExoniphy mkdir exonWalk mkdir beds cd exonWalk mkdir beds foreach file (`ls ../orthoSplice/agx/*.agx`) set base=`basename $file .agx` echo "exonWalk db=hg18 minPercent=0 trumpSize=100000 $file beds/$base.bed" >> exonWalk.para.spec end # Execute para spec as batch file since wants database access. # takes about 2.5 hours #para create exonWalk.para.spec #para push #cat beds/*.bed > hg18.mm7.cons.t3.exoniphy.bed time tcsh -efx exonWalk.para.spec #8256.940u 21.747s 2:18:07.32 99.8% 0+0k 0+0io 0pf+0w mkdir orfs cd orfs mkdir bedOrf beds fa borf genePred cd beds # cp /cluster/store1/sugnet/store1/altSplice/hg17-2005.01.09/orthoSpliceExonify/exonWalk/orfs.mrna2/*.sh ./ cat ../../beds/*.bed | splitFile stdin 500 exonWalk. cd .. cat << '_EOF_' > makeFa.sh #!/bin/sh for file in "$@" do base=`basename $file` echo "Doing $file" echo "sequenceForBed -db=hg18 -bedIn=$file -fastaOut=fa/$base.fa " sequenceForBed -db=hg18 -bedIn=$file -fastaOut=fa/$base.fa done '_EOF_' chmod 755 makeFa.sh makeFa.sh beds/* cat << '_EOF_' > makeBorf.sh #!/bin/sh for file in "$@" do base=`basename $file` echo "Doing $file" echo "borfBig $file borf/$base.borf " borfBig $file borf/$base.borf done '_EOF_' chmod 755 makeBorf.sh makeBorf.sh fa/*.fa # Alternatively do this on the cluster. It takes a little doing to # get a version of bestorf set up to be cluster accessible. I # just copied it in from /projects/compbio/bin/borf, including # copying in some binary fiels that script referenced. # As a parasol job on kk, here's what para time said: CPU time in finished jobs: 51577s 859.61m 14.33h 0.60d 0.002 y IO & Wait Time: 25442s 424.04m 7.07h 0.29d 0.001 y Average job time: 132s 2.19m 0.04h 0.00d Longest running job: 0s 0.00m 0.00h 0.00d Longest finished job: 179s 2.98m 0.05h 0.00d Submission to last job: 307s 5.12m 0.09h 0.00d cat << '_EOF_' > makeGenePred.sh #!/bin/sh for file in "$@" do base=`basename $file` borfMatcher -keepNmd beds/$base borf/$base.borf bedOrf/$base.bed genePred/$base.gp done '_EOF_' chmod 755 makeGenePred.sh makeGenePred.sh beds/* cat beds/* > hg18.mm7.exonWalk.bed cat genePred/*.gp | ldHgGene -predTab hg18 exonWalk2 stdin cat << '_EOF_' > makeNoNmdGenePred.sh #!/bin/sh for file in "$@" do base=`basename $file` /cluster/home/sugnet/bin/i386/borfMatcher beds/$base borf/$base.borf bedOrfNoNmd/$base.bed genePredNoNmd/$base.gp done '_EOF_' mkdir bedOrfNoNmd genePredNoNmd chmod 755 ./makeNoNmdGenePred.sh wc beds/* 275987 3311844 57319256 total wc genePredNoNmd/*.gp 169203 1692030 59907679 total wc genePred/*.gp 225252 2252520 83619240 total cat genePred/*.gp > hg18.mm7.exonWalk.nmd.gp cat genePredNoNmd/*.gp > hg18.mm7.exonWalk.noNmd.gp cat beds/* > hg18.mm7.exonWalk.all.bed # Plain "exonWalk" track is the only one used on regular genome browser. ldHgGene -predTab hg18 exonWalk hg18.mm7.exonWalk.noNmd.gp hgLoadBed hg18 exonWalkAll hg18.mm7.exonWalk.all.bed ldHgGene -predTab hg18 exonWalkWithNmd hg18.mm7.exonWalk.nmd.gp cat hg18.mm7.exonWalk.noNmd.gp | cut -f 1,2 -d '.' | sort | uniq -c | sort -rnk 1 > counts.txt Q1 1.000000 median 3.000000 Q3 7.000000 average 10.670556 min 1.000000 max 3844.000000 count 15857 total 169203.000000 standard deviation 63.330761 cat hg18.mm7.exonWalk.nmd.gp | cut -f 1,2 -d '.' 
| sort | uniq -c | sort -rnk 1 > counts.txt ave counts.txt Q1 1.000000 median 3.000000 Q3 8.000000 average 14.037891 min 1.000000 max 7278.000000 count 16046 total 225252.000000 standard deviation 99.406890 trackGenome hg18 all refGene:cds trackGenome.spec Track Specification track overlap track cov track new cum size size geno track cov cov cov ----------------------------------------------------------------------------- exonWalk:cds 31207765 27951670 1.00% 89.57% 90.24% 90.24% 90.24% # end ExonWalk track. #################################################################### # LOAD ENSEMBL GENES (DONE, 2006-05-02, Fan) # ADDED STABLE URL TO TRACKDB (DONE, 2006-05-29, hartera) # ADDED RELEASE ALPHA AND RELEASE BETA VERSIONS OF TRACK ENTRY IN # trackDb.ra SO THAT CORRECT ENSEMBL BUILD VERSION DISPLAYED AND LINKED TO # AS DIFFERENT ENSEMBL BUILDS ON RR AND HGWDEV (DONE, 2007-09-25, hartera) mkdir /cluster/data/hg18/bed/ensembl cd /cluster/data/hg18/bed/ensembl # Get the ensembl protein data from # http://www.ensembl.org/Homo_sapiens/martview # Follow this sequence through the pages: # Page 1) Make sure that the Homo_sapiens choice is selected. Hit next. # Page 2) Uncheck the "Limit to" box in the region choice. Then hit next. # Page 3) Choose the "Structures" box. # Page 4) Choose GTF as the ouput. choose gzip compression. hit export. # Save as ensemblGene.gtf.gz # Ensembl handles random chromosomes differently than us, so we # strip this data. Fortunately it just loses a couple of genes. # Add "chr" to front of each line in the gene data gtf file to make # it compatible with our software. # Finally, get rid of the ".1" or ".2" after the name gunzip -c ensemblGene.gtf.gz \ |sed -e 's/c22_H2/22_h2_hap1/'\ |sed -e 's/c5_H2/5_h2_hap1/'\ |sed -e 's/c6_COX/6_cox_hap1/'\ |sed -e 's/c6_QBL/6_qbl_hap2/'\ | perl -wpe 's/^([0-9]|X|Y|Un|MT|5_h2_hap1|22_h2_hap1|6_cox_hap1|6_qbl_hap2)/chr$1/ || die "Line $. doesnt start with human chrom:\n$_"' \ | sed -e 's/\..\"/\"/g' \ | sed -e 's/chrMT/chrM/' \ > ensGene.gtf ssh hgwdev cd /cluster/data/hg18/bed/ensembl # Remove hap chroms entries because Ensembl is using different genomic coordinates. fgrep -v hap ensGene.gtf > ensGeneNew.gtf /cluster/bin/i386/ldHgGene hg18 ensGene ensGeneNew.gtf # Read 58424 transcripts in 1014240 lines in 1 files # 58424 groups 25 seqs 1 sources 4 feature types # 58424 gene predictions # ensGtp associates geneId/transcriptId/proteinId for hgPepPred and # hgKnownToSuper. Use ensMart to create it as above, except: # Page 3) Choose the "Features" box. In "Ensembl Attributes", check # Ensembl Gene ID, Ensembl Transcript ID, Ensembl Peptide ID. # Choose Text, tab-separated as the output format. Result name ensGtp. # Save file as ensGtp.txt.gz gunzip ensGtp.txt.gz hgsql hg18 < ~/kent/src/hg/lib/ensGtp.sql # remove header line from ensGtp.txt echo "load data local infile 'ensGtp.txt' into table ensGtp ignore 1 lines" | hgsql -N hg18 # Load Ensembl peptides: # Get them from ensembl as above in the gene section except for # Page 2) Choose protein_coding for gene type # Page 3) Choose the "Sequences" box. # Page 4) check Ensembl Gene ID, Transcript ID, and Peptid ID, uncheck chrom, Transcripts/Proteins. Peptide. Format = FASTA. 
# Save file as ensemblPep.fa.gz gunzip ensemblPep.fa.gz hgPepPred hg18 ensembl ensemblPep.fa # Added stable archive URL for Ensembl v38 to human/hg18/trackDb.ra # (2006-05-29, hartera) # Changed url line for ensGene entry to: # url http://apr2006.archive.ensembl.org/perl/transview?transcript=$$ # (2007-09-25, hartera) # Created a release beta version of this track in human/hg18/trackDb.ra # with the ensArchive setting set to apr2006 to create the correct URL # as above and add the correct version (version 38) in the label: track ensGene release beta shortLabel Ensembl Genes longLabel Ensembl (Build 38) Gene Predictions group genes priority 40 visibility hide color 150,0,0 type genePred ensPep ensArchive apr2006 # A separate trackDb entry (release alpha) was made for the updated # track on hgwdev which is Build 46 (aug2007). This means that the # correct version will be displayed and the correct links made on both # the RR and hgwdev. # Create knownToEnsembl column (updated 2007-11-15 - Jim Kent) hgMapToGene hg18 ensGene knownGene knownToEnsembl # QA NOTE [ASZ: 9-11-2006]: mytouch on ensGtp and ensPep. This is because # ensGene was updated later than they were. Ensembl treats hap chroms # differently than we do. So the ensGene table was reloaded. # sudo mytouch hg18 ensGtp 200605241000.00 # sudo mytouch hg18 ensPep 200605241000.00 # SGP GENES (DONE 5/3/06 Fan) # See below for: SGP GENES Update (DONE - 2007-10-02 - Hiram) ssh hgwdev mkdir /cluster/data/hg18/bed/sgp cd /cluster/data/hg18/bed/sgp foreach chr (`awk '{print $1;}' ../../chrom.sizes`) wget http://genome.imim.es/genepredictions/H.sapiens/golden_path_200603/SGP/$chr.gtf wget http://genome.imim.es/genepredictions/H.sapiens/golden_path_200603/SGP/$chr.prot end ldHgGene -gtf -genePredExt hg18 sgpGene chr*.gtf # VEGA LIFT FROM HG17 (DONE 5/22/06 acs) # This can be replaced when the new version comes out (Tim Hubbard says soon) ssh hgwdev cd /cluster/store8/ensembl/vega33_35f # there's a bad record at the top of both of these files awk 'NF == 15 ' vegaGene.gp > tmp.gp awk 'NF == 15 ' vegaPseudo.gp > tmp2.gp zcat /gbdb/hg17/liftOver/hg17ToHg18.over.chain.gz | liftOver tmp.gp stdin vegaGeneHg18.gp unMapped.gp -genePred # only 6 dropped zcat /gbdb/hg17/liftOver/hg17ToHg18.over.chain.gz | liftOver tmp2.gp stdin vegaPseudoGeneHg18.gp unMappedPseudo.gp -genePred # only 11 dropped ldHgGene hg18 vegaGene -predTab vegaGeneHg18.gp -genePredExt ldHgGene hg18 vegaPseudoGene -predTab vegaPseudoGeneHg18.gp -genePredExt hgsql hg18 -N -B < /cluster/home/acs/kent/src/hg/lib/vegaInfo.sql echo 'load data local infile "vegaInfo.tab" into table vegaInfo' | hgsql hg18 -N -B # SYNTENIC NETS FOR PANTRO2, RHEMAC2, MM8, RN4, AND CANFAM2 AS COMPOSITE TRACK (DONE 5/22/06 acs) # (for use in defining orthologs for macaque paper) ssh hgwdev # load syntenic nets created previously by Robert hgLoadNet hg18 netSyntenyPanTro2 /cluster/data/hg18/bed/blastz.panTro2/axtChain/hg18.panTro2.syn.net zcat /cluster/data/hg18/bed/blastz.rheMac2/axtChain/hg18.rheMac2.syn.net.gz | hgLoadNet hg18 netSyntenyRheMac2 stdin zcat /cluster/data/hg18/bed/blastz.mm8/axtChain/hg18.mm8.syn.net.gz | hgLoadNet hg18 netSyntenyMm8 stdin zcat /cluster/data/hg18/bed/blastz.rn4/axtChain/hg18.rn4.syn.net.gz | hgLoadNet hg18 netSyntenyRn4 stdin zcat /cluster/data/hg18/bed/blastz.canFam2/axtChain/hg18.canFam2.syn.net.gz | hgLoadNet hg18 netSyntenyCanFam2 stdin # add more distant vertebrates to track so we can evaluate # syntenic netting for multiple alignment (2007-03-10 kate) cd /cluster/data/hg18/bed 
netFilter -syn blastz.danRer4/axtChain/hg18.danRer4.net.gz | \ hgLoadNet hg18 netSyntenyDanRer4 stdin netFilter -syn blastz.galGal3/axtChain/hg18.galGal3.net.gz | \ hgLoadNet hg18 netSyntenyGalGal3 stdin netFilter -syn blastz.monDom4/axtChain/hg18.monDom4.net.gz | \ hgLoadNet -warn hg18 netSyntenyMonDom4 stdin netFilter -syn blastz.ornAna1/axtChain/hg18.ornAna1.net.gz | \ hgLoadNet hg18 netSyntenyOrnAna1 stdin netFilter -syn blastz.anoCar1/axtChain/hg18.anoCar1.net.gz | \ hgLoadNet hg18 netSyntenyAnoCar1 stdin netFilter -syn blastz.xenTro2/axtChain/hg18.xenTro2.net.gz | \ hgLoadNet hg18 netSyntenyXenTro2 stdin netFilter -syn blastz.fr2/axtChain/hg18.fr2.net.gz | \ hgLoadNet hg18 netSyntenyFr2 stdin netFilter -syn blastz.equCab1/axtChain/hg18.equCab1.net.gz | \ hgLoadNet hg18 netSyntenyEquCab1 stdin netFilter -syn blastz.bosTau3/axtChain/hg18.bosTau3.net.gz | \ hgLoadNet -warn hg18 netSyntenyBosTau3 stdin netFilter -syn blastz.oryLat1/axtChain/hg18.oryLat1.net.gz | \ hgLoadNet hg18 netSyntenyOryLat1 stdin cat > netCov.csh << 'EOF' #!/bin/csh -ef foreach db (PanTro2 RheMac2 Mm8 Rn4 CanFam2 EquCab1 BosTau3 MonDom4 OrnAna1 GalGal3 AnoCar1 XenTro2 DanRer4 Fr2 OryLat1) echo -n " " featureBits -countGaps -chrom=chr1 hg18 refGene:cds net$db -enrichment featureBits -countGaps -chrom=chr1 hg18 refGene:cds netSynteny$db -enrichment echo "" end 'EOF' csh netCov.csh >&! netCov.log & cat netCov.log #refGene:cds 1.282%, netPanTro2 99.979%, both 1.282%, cover 100.00%, enrich 1.00x #refGene:cds 1.282%, netSyntenyPanTro2 99.978%, both 1.282%, cover 100.00%, enrich 1.00x #refGene:cds 1.282%, netRheMac2 99.970%, both 1.282%, cover 100.00%, enrich 1.00x #refGene:cds 1.282%, netSyntenyRheMac2 99.961%, both 1.282%, cover 99.97%, enrich 1.00x #refGene:cds 1.282%, netMm8 98.650%, both 1.278%, cover 99.69%, enrich 1.01x #refGene:cds 1.282%, netSyntenyMm8 98.352%, both 1.255%, cover 97.89%, enrich 1.00x #refGene:cds 1.282%, netRn4 98.404%, both 1.281%, cover 99.89%, enrich 1.02x #refGene:cds 1.282%, netSyntenyRn4 98.074%, both 1.258%, cover 98.10%, enrich 1.00x #refGene:cds 1.282%, netCanFam2 99.527%, both 1.281%, cover 99.91%, enrich 1.00x #refGene:cds 1.282%, netSyntenyCanFam2 99.274%, both 1.272%, cover 99.16%, enrich 1.00x #refGene:cds 1.282%, netEquCab1 99.457%, both 1.281%, cover 99.87%, enrich 1.00x #refGene:cds 1.282%, netSyntenyEquCab1 99.020%, both 1.270%, cover 99.06%, enrich 1.00x #refGene:cds 1.282%, netBosTau3 99.641%, both 1.282%, cover 100.00%, enrich 1.00x #refGene:cds 1.282%, netSyntenyBosTau3 99.493%, both 1.280%, cover 99.81%, enrich 1.00x #refGene:cds 1.282%, netMonDom4 98.718%, both 1.279%, cover 99.72%, enrich 1.01x #refGene:cds 1.282%, netSyntenyMonDom4 98.029%, both 1.260%, cover 98.26%, enrich 1.00x #refGene:cds 1.282%, netOrnAna1 68.119%, both 1.168%, cover 91.06%, enrich 1.34x #refGene:cds 1.282%, netSyntenyOrnAna1 56.729%, both 0.714%, cover 55.67%, enrich 0.98x #refGene:cds 1.282%, netGalGal3 82.246%, both 1.189%, cover 92.68%, enrich 1.13x #refGene:cds 1.282%, netSyntenyGalGal3 80.379%, both 1.101%, cover 85.86%, enrich 1.07x #refGene:cds 1.282%, netAnoCar1 63.263%, both 1.128%, cover 87.97%, enrich 1.39x #refGene:cds 1.282%, netSyntenyAnoCar1 54.068%, both 0.816%, cover 63.65%, enrich 1.18x #refGene:cds 1.282%, netXenTro2 45.072%, both 1.057%, cover 82.44%, enrich 1.83x #refGene:cds 1.282%, netSyntenyXenTro2 31.985%, both 0.596%, cover 46.44%, enrich 1.45x #refGene:cds 1.282%, netDanRer4 28.211%, both 1.012%, cover 78.87%, enrich 2.80x #refGene:cds 1.282%, netSyntenyDanRer4 7.631%, 
both 0.177%, cover 13.83%, enrich 1.81x #refGene:cds 1.282%, netFr2 26.938%, both 0.975%, cover 76.03%, enrich 2.82x #refGene:cds 1.282%, netSyntenyFr2 7.991%, both 0.200%, cover 15.62%, enrich 1.95x # Conclusion: CDS coverage loss is small in all placentals and opossum, so # use syntenic net mafs for these in multiz. # Ask about chicken -- it's marginal # Robert prepped synMafNet's for some species, but the files lack # soft-masked sequence, so redo if time. # (set up trackDb.ra entry for composite track) # SYNTENIC NET MAFS FOR MULTIZ (2007-03-09 kate) # Compare with Robert's ssh kkstore02 cd /cluster/data/hg18/bed/blastz.rheMac2 mv mafSynNet mafSynNet.robert ~/kent/src/hg/utils/automation/doBlastzChainNet.pl DEF \ -syntenicNet -continue syntenicNet >&! synnet.log & ssh kkstore02 cd /cluster/data/hg18/bed/blastz.panTro2 # need DEF file for syntenic net, but this was # a swapped run, so we will simulate cp /cluster/data/panTro2/bed/blastz.hg18/DEF . # edit to reverse target and query, and change BASE dir ~/kent/src/hg/utils/automation/doBlastzChainNet.pl DEF \ -syntenicNet -continue syntenicNet >&! synnet.log & rm DEF # edit DEF file to reference kolossus-accessible sequence and chrom.sizes cd /cluster/data/hg18/bed/blastz.monDom4 ~/kent/src/hg/utils/automation/doBlastzChainNet.pl DEF \ -syntenicNet -continue syntenicNet >&! synnet.log & cd /cluster/data/hg18/bed/blastz.equCab1 ~/kent/src/hg/utils/automation/doBlastzChainNet.pl DEF \ -syntenicNet -continue syntenicNet >&! synnet.log & cd /cluster/data/hg18/bed/blastz.bosTau3 ~/kent/src/hg/utils/automation/doBlastzChainNet.pl DEF \ -syntenicNet -continue syntenicNet >&! synnet.log & cd /cluster/data/hg18/bed/blastz.mm8 cp /cluster/data/mm8/bed/blastz.hg18/DEF . # edit to reverse target & query, change BASE ~/kent/src/hg/utils/automation/doBlastzChainNet.pl DEF \ -syntenicNet -continue syntenicNet >&! synnet.log rm -f DEF cd /cluster/data/hg18/bed/blastz.rn4 ~/kent/src/hg/utils/automation/doBlastzChainNet.pl DEF \ -syntenicNet -continue syntenicNet >&! synnet.log cd /cluster/data/hg18/bed/blastz.canFam2 ~/kent/src/hg/utils/automation/doBlastzChainNet.pl DEF \ -syntenicNet -continue syntenicNet >&! synnet.log & # use syntenic net on opossum too cd /cluster/data/hg18/bed/blastz.monDom4 ~/kent/src/hg/utils/automation/doBlastzChainNet.pl DEF \ -syntenicNet -continue syntenicNet >&! synnet.log & # NET AND RECIPROCAL BEST TABLES FOR 2X MAMMALS # load net and reciprocal best net for comparison # rabbit ssh hgwdev cd /cluster/data/hg18/bed/blastz.oryCun1/axtChain netFilter -minGap=10 hg18.oryCun1.net | hgLoadNet -warn hg18 netOryCun1 stdin netFilter -minGap=10 hg18.oryCun1.rbest.net.gz | \ hgLoadNet -warn hg18 netRBestOryCun1 stdin # tenrec ssh hgwdev cd /cluster/data/hg18/bed/blastz.echTel1/axtChain netFilter -minGap=10 hg18.echTel1.net.gz | hgLoadNet -warn hg18 netEchTel1 stdin netFilter -minGap=10 hg18.echTel1.rbest.net.gz | \ hgLoadNet -warn hg18 netRBestEchTel1 stdin # net coverage ssh hgwdev cd /cluster/data/hg18/bed cat > netRBestCov.csh << 'EOF' #!/bin/csh -ef foreach db (OtoGar1 OryCun1 CavPor2 LoxAfr1 EchTel1 DasNov1) echo -n " " featureBits -countGaps -chrom=chr1 hg18 refGene:cds net$db -enrichment featureBits -countGaps -chrom=chr1 hg18 refGene:cds netRBest$db -enrichment echo "" end 'EOF' # << emacs csh netRBestCov.csh >&! 
netRBestCov.log & ########################################################################## # EVOFOLD (Done, 05/12/06) Jakob Skou Pedersen # RNA secondary structure predictions lifted from hg17 and filtered ssh -C hgwdev mkdir -p /cluster/data/hg18/bed/evofold cd /cluster/data/hg18/bed/evofold echo "select chrom, chromStart, chromEnd, name, score, strand, size, secStr, conf from evofold;" | hgsql hg17 | sed -e 1d > foldsHg17.bed liftOver -minMatch=1.0 foldsHg17.bed /cluster/data/hg17/bed/liftOver/hg17ToHg18.over.chain.gz tmp.bed unmapped.bed # remove elements which are wrong size after lifting awk '$3-$2 == $7' tmp.bed | sort -k4,4 > rawFoldsHg18.bed # structure filters # first, remove pairs that can't form in human cut -f 1-6 rawFoldsHg18.bed > tmp.bed # sequenceForBed can be found and compiled from here: $HOME/kent/src/hg/altSplice/altSplice/ nice /cluster/home/sugnet/bin/i386/sequenceForBed -db=hg18 -bedIn=tmp.bed -fastaOut=tmp.fa cat tmp.fa | sed -e 's/\.[+-]\.chr.*$//' \ | sed -e '/^>/s/$/\t/' | tr -d '\n' | sed -e 's/>/\n/g' | sed -e '1d' -e '$s/$/\n/' | sort -k1,1 > foldsHg18Seq.tab join -1 4 -2 1 -o "1.4 1.8 2.2" rawFoldsHg18.bed foldsHg18Seq.tab | sed -e 's/ */\t/g' | sort -k1,1 \ | /cluster/home/jsp/scripts/tabFoldFilter.py > cleanFolds.tab join -1 4 -2 1 -o "1.1 1.2 1.3 1.4 1.5 1.6 1.7 2.2 1.9" rawFoldsHg18.bed cleanFolds.tab | sed -e 's/ */\t/g' > tmp1.bed # second, remove poor predictions # scripts can be found in cvs tree at: cvsroot/jsp/scripts/. They use a few modules which can be found at: cvsroot/jsp/py_modules cat tmp1.bed | /cluster/home/jsp/scripts/bedRnassFilter.py --dangling --minAvrStemSize=3 | /cluster/home/jsp/scripts/bedRnassFilter.sh 1 3 \ | /cluster/home/jsp/scripts/roundListFloats.py -c9 > foldsHg18.bed # clean up rm tmp.bed tmp1.bed foldsHg17.bed foldsHg18Seq.tab rawFoldsHg18.bed tmp.fa cleanFolds.tab # upload hgLoadBed -notItemRgb -sqlTable=$HOME/kent/src/hg/lib/evofold.sql hg18 evofold foldsHg18.bed ######################################################################### # BLASTZ CHICKEN galGal3 (DONE 5/23/06 angie) ssh pk mkdir /cluster/data/hg18/bed/blastz.galGal3.2006-05-22 cd /cluster/data/hg18/bed/blastz.galGal3.2006-05-22 cat << '_EOF_' > DEF # human vs chicken BLASTZ=blastz.v7.x86_64 # Specific settings for chicken (per Webb email to Brian Raney) BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=10000 BLASTZ_K=2200 BLASTZ_Q=/cluster/data/blastz/HoxD55.q BLASTZ_ABRIDGE_REPEATS=1 # TARGET: Human hg18 SEQ1_DIR=/scratch/hg/hg18/nib SEQ1_SMSK=/cluster/bluearc/hg18/linSpecRep/notInOthers SEQ1_LEN=/scratch/hg/hg18/chrom.sizes SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY: Chicken galGal3 - single chunk big enough to run entire chrom SEQ2_DIR=/san/sanvol1/galGal3/nib SEQ2_LEN=/cluster/data/galGal3/chrom.sizes SEQ2_SMSK=/san/sanvol1/galGal3/linSpecRep SEQ2_CHUNK=200000000 SEQ2_LAP=0 BASE=/cluster/data/hg18/bed/blastz.galGal3.2006-05-22 '_EOF_' # << emacs ~/kent/src/utils/doBlastzChainNet.pl DEF \ -bigClusterHub=pk -smallClusterHub=pk \ -chainMinScore=5000 -chainLinearGap=loose \ >& do.log & tail -f do.log ln -s blastz.galGal3.2006-05-22 /cluster/data/hg18/bed/blastz.galGal3 # running syntenicNet 2008-10-30 # had to update the DEF file to correspond to new hive layout cd /cluster/data/hg18/bed/blastz.galGal3.2006-05-22 mv DEF DEF.0 cat << '_EOF_' > DEF # human vs chicken BLASTZ=blastz.v7.x86_64 # Specific settings for chicken (per Webb email to Brian Raney) BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=10000 BLASTZ_K=2200 BLASTZ_Q=/scratch/data/blastz/HoxD55.q 
BLASTZ_ABRIDGE_REPEATS=1 # TARGET: Human hg18 SEQ1_DIR=/scratch/data/hg18/nib SEQ1_SMSK=/scratch/data/hg18/linSpecRep/notInMouseRat SEQ1_LEN=/scratch/data/hg18/chrom.sizes SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY: Chicken galGal3 - single chunk big enough to run entire chrom SEQ2_DIR=/scratch/data/galGal3/nib SEQ2_LEN=/scratch/data/galGal3/chrom.sizes SEQ2_SMSK=/scratch/data/galGal3/linSpecRep SEQ2_CHUNK=200000000 SEQ2_LAP=0 BASE=/hive/data/genomes/hg18/bed/blastz.galGal3.2006-05-22 '_EOF_' # << happy emacs time nice -n +19 doBlastzChainNet.pl `pwd`/DEF \ -bigClusterHub=swarm -smallClusterHub=memk \ -continue=syntenicNet -syntenicNet \ -chainMinScore=5000 -chainLinearGap=loose > synNet.log 2>&1 # worked OK in about 3 minutes ######################################################################### # REGULATORY POTENTIAL (DONE - 2006-06-09 - Hiram) # download data from "James Taylor" ssh kkstore02 mkdir /cluster/data/hg18/bed/regPotential7X cd /cluster/data/hg18/bed/regPotential7X # This is a lot of data for C in 1 2 3 4 5 6 7 8 9 X Y 10 11 12 13 14 15 16 17 18 19 20 21 22 do wget --timestamping \ "http://www.bx.psu.edu/~james/esperr_rp_7way_scores/genome_scores_hg18/chr${C}.scores.truncated.bz2" done wget --timestamping \ "http://www.bx.psu.edu/~james/esperr_rp_7way_scores/genome_scores_hg18/trackDb.html" -O description.html time for C in 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 X Y do bzcat chr${C}.scores.truncated.bz2 done | wigEncode -noOverlap stdin regPotential7X.wig regPotential7X.wib # Converted stdin, upper limit 1.00, lower limit 0.00 # real 23m27.454s # user 22m41.058s # sys 0m41.850s # Loading the table on hgwdev ssh hgwdev cd /cluster/data/hg18/bed/regPotential7X ln -s /cluster/data/hg18/bed/regPotential7X/regPotential7X.wib \ /gbdb/hg18/wib/regPotential7X.wib # using the tmpDir is faster since it is on local disk and it will # clean up any temporary .tab file it creates there time hgLoadWiggle -tmpDir=/scratch/tmp \ hg18 regPotential7X regPotential7X.wig # How about a histogram of the data. # find min and max for everything to verify it is 0 to 1 ssh kkstore02 cd /cluster/data/hg18/bed/regPotential7X time for C in 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 X Y do echo " ============ ${C} =======================" bzcat chr${C}.scores.truncated.bz2 | ave -col=2 stdin done > stats.all 2>&1 grep "^min" stats.all | sort -u # min 0.000000 grep "^max" stats.all | sort -u # max 1.000000 ssh kolossus cd /cluster/data/hg18/bed/regPotential7X time hgWiggle -verbose=2 -doHistogram -hBinSize=0.01 -hBinCount=100 \ -hMinVal=0.0 -db=hg18 regPotential7X > histogram.data 2>&1 # real 2m42.311s # 73 % of the data values are zero # create download gzip files from the bz2 files: ssh kkstore02 cd /cluster/data/hg18/bed/regPotential7X for F in chr*.scores.truncated.bz2 do C=`echo $F | awk -F'.' '{print $1}'` echo -n "${C}.regPotential7X.hg18.gz working ... " bzcat ${F} | gzip > ${C}.regPotential7X.hg18.gz echo done ######################################################################### # create md5sum.txt under bigZips (DONE, 6/7/06, Fan) cd /cluster/store11/gs.19/build36/downloads/bigZips md5sum *.zip *.2bit README.txt > md5sum.txt ######################################################################### # UPDATE BACENDS track (DONE - 2006-06-16 - Hiram) # An attempt to recover some of the missing clones from the # bacEnds track. 
It turns out the perl processing script wasn't # properly catagorizing all the clone ends, thus a lot of them # were being left out of the final track ssh hgwdev mkdir /cluster/data/hg18/bed/updateCloneEnds cd /cluster/data/hg18/bed/updateCloneEnds ln -s ../cloneend/all.txt.gz . # Checked this script into the source tree and fixed it up to # recognize more of the catagories of clone ends zcat all.txt.gz | $HOME/kent/src/hg/utils/cloneEndParse.pl /dev/stdin # Reading in end info # Writing out pair info # Writing out singleton info # 301377 pairs and 204698 singles # Note that there are none marked at "unclassified" - this script # will print out that message to stderr if it doesn't recognize # any marker classifications. This produces the files: # -rw-rw-r-- 1 9645568 Jun 16 14:09 cloneEndPairs.txt # -rw-rw-r-- 1 4906468 Jun 16 14:09 cloneEndSingles.txt wc -l clone*.txt # 301377 cloneEndPairs.txt # 204698 cloneEndSingles.txt # This is a lot better than previous: wc -l ../cloneend/cloneEnd*.txt # 249619 ../cloneend/cloneEndPairs.txt # 318500 ../cloneend/cloneEndSingles.txt mkdir /san/sanvol1/scratch/hg18/updateBacEnds cd /san/sanvol1/scratch/hg18/updateBacEnds ln -s ../bacends/bacEnds.sorted.psl . ln -s ../bacends/lifted . pslPairs -tInsert=10000 -minId=0.91 -noBin -min=25000 -max=350000 \ -slopval=10000 -hardMax=500000 -slop -short -long -orphan \ -mismatch -verbose bacEnds.sorted.psl \ /cluster/data/hg18/bed/updateCloneEnds/cloneEndPairs.txt \ all_bacends bacEnds echo -e \ 'chr\tstart\tend\tclone\tscore\tstrand\tall\tfeatures\tstarts\tsizes' > header echo -e '10\t10N\t10N\t10\t10N\t10\t10\t10N\t10\t10' >> header cat header bacEnds.pairs | row score ge 300 | sorttbl chr start \ | headchg -del > bacEndPairs.bed cat header bacEnds.slop bacEnds.short bacEnds.long \ bacEnds.mismatch bacEnds.orphan \ | row score ge 300 | sorttbl chr start | headchg -del \ > bacEndPairsBad.bed extractPslLoad -noBin bacEnds.sorted.psl bacEndPairs.bed \ bacEndPairsBad.bed | \ sorttbl tname tstart | headchg -del > bacEnds.load.psl # looks like we are getting a lot more now in every catagory: wc -l bacEnds.* bacEndPairs* | sort -n 49 bacEnds.long 1399 bacEnds.mismatch 4516 bacEnds.slop 7202 bacEnds.short 66861 bacEnds.orphan 78900 bacEndPairsBad.bed 205443 bacEndPairs.bed 207997 bacEnds.pairs 1727387 bacEnds.load.psl # Previously: wc -l ../bacends/bacEnds.* ../bacends/bacEndPairs* | sort -n 40 ../bacends/bacEnds.long 1061 ../bacends/bacEnds.mismatch 3954 ../bacends/bacEnds.slop 6279 ../bacends/bacEnds.short 59245 ../bacends/bacEnds.orphan 69788 ../bacends/bacEndPairsBad.bed 159268 ../bacends/bacEndPairs.bed 161251 ../bacends/bacEnds.pairs 1249956 ../bacends/bacEnds.load.psl # Move the previous build out of the way and copy these # results over to the primary hg18 bed location: mv /cluster/data/hg18/bed/bacends /cluster/data/hg18/bed/bacends.2006-02-02 mkdir /cluster/data/hg18/bed/bacends cp -p bacEnd* /cluster/data/hg18/bed/bacends cp -p lifted/bacEnds.lifted.psl /cluster/data/hg18/bed/bacends # load them into the database ssh hgwdev cd /cluster/data/hg18/bed/bacends # CHECK bacEndPairs.bed ID's to make sure they have no blanks in them awk '{print $5}' bacEndPairs.bed | sort | uniq -c # result should be the scores, no extraneous strings: # 202488 1000 # 255 300 # 416 375 # 384 500 # 1900 750 # edit the file and fix it if it has a bad name. 
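# (Sketch, not part of the original QA.)  The same check can be made to
# fail loudly instead of relying on reading the uniq -c output: print
# any row whose score column is not a plain integer.
awk '$5 !~ /^[0-9]+$/ {print "bad score field, line " NR ": " $0}' bacEndPairs.bed
# no output means the file is clean and ready for the loads below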
sed -e "s/bacEndPairs /bacEndPairsUpdate /" \ $HOME/kent/src/hg/lib/bacEndPairs.sql > bacEndPairsUpdate.sql hgLoadBed -notItemRgb hg18 bacEndPairsUpdate bacEndPairs.bed \ -sqlTable=bacEndPairsUpdate.sql # Loaded 205443 elements of size 11 # Previously was: # Loaded 159268 # note - this track isn't pushed to RR, just used for assembly QA sed -e "s/bacEndPairsBad /bacEndPairsBadUpdate /" \ $HOME/kent/src/hg/lib/bacEndPairsBad.sql > bacEndPairsBadUpdate.sql hgLoadBed -notItemRgb hg18 bacEndPairsBadUpdate bacEndPairsBad.bed \ -sqlTable=bacEndPairsBadUpdate.sql # Loaded 78900 elements of size 11 # Previously was: # Loaded 69788 #hgLoadPsl hg18 -nobin -table=all_bacends bacEnds.load.psl # NOTE: truncates file to 0 if -nobin is used # NOTE: truncates file to 0 if -nobin is used hgLoadPsl hg18 -table=all_bacendsUpdate bacEnds.load.psl # no complaints ! Usually there are, this loaded: hgsql -N -e "select count(*) from all_bacendsUpdate;" hg18 # 1727387 # Previously this was: # 1249956 nice featureBits hg18 all_bacendsUpdate # 227770876 bases of 2881515245 (7.905%) in intersection nice featureBits hg18 all_bacends # 191078854 bases of 2881515245 (6.631%) in intersection nice featureBits hg17 all_bacends # 225763317 bases of 2866216770 (7.877%) in intersection nice featureBits hg18 bacEndPairsUpdate # 162690030 bases of 2881515245 (5.646%) in intersection nice featureBits hg18 bacEndPairs # 130270940 bases of 2881515245 (4.521%) in intersection nice featureBits hg17 bacEndPairs # 162099487 bases of 2866216770 (5.656%) in intersection nice featureBits hg18 bacEndPairsBadUpdate # 37326990 bases of 2881515245 (1.295%) in intersection nice featureBits hg18 bacEndPairsBad # 33650226 bases of 2881515245 (1.168%) in intersection nice featureBits hg17 bacEndPairsBad # 37437558 bases of 2866216770 (1.306%) in intersection # Renamed the new BAC End Pairs tables (7-27-2006 Brooke) mysql> alter table all_bacends rename all_bacendsOld; Query OK, 0 rows affected (0.01 sec) mysql> alter table bacEndPairs rename bacEndPairsOld; Query OK, 0 rows affected (0.00 sec) mysql> alter table all_bacendsUpdate rename all_bacends; Query OK, 0 rows affected (0.00 sec) mysql> alter table bacEndPairsUpdate rename bacEndPairs; Query OK, 0 rows affected (0.00 sec) ######################################################################### # dbSNP BUILD 126 (Heather, June 2006) # Set up directory structure ssh kkstore02 cd /cluster/data/dbSNP mkdir 126 cd 126 mkdir human cd human mkdir data mkdir schema mkdir rs_fasta # Get data from NCBI (anonymous FTP) cd /cluster/data/dbSNP/126/human/data ftp ftp.ncbi.nih.gov cd snp/organisms/human_9606/database/organism_data # ContigLoc table has coords, orientation, loc_type, and refNCBI allele get b126_SNPContigLoc_36_1.bcp.gz # ContigLocusId has function get b126_SNPContigLocusId_36_1.bcp.gz get b126_ContigInfo_36_1.bcp.gz # MapInfo has alignment weights get b126_SNPMapInfo_36_1.bcp.gz # SNP has univar_id, validation status and heterozygosity get SNP.bcp.gz # Get schema from NCBI cd /cluster/data/dbSNP/126/human/schema ftp ftp.ncbi.nih.gov cd snp/organisms/human_9606/database/organism_schema get human_9606_table.sql.gz # Get fasta files from NCBI # using headers of fasta files for molType cd /cluster/data/dbSNP/126/human/rs_fasta ftp ftp.ncbi.nih.gov cd snp/organisms/human_9606/rs_fasta mget *.gz # Simplify names of data files cd /cluster/data/dbSNP/126/human/data mv b126_SNPContigLoc_36_1.bcp.gz ContigLoc.gz mv b126_SNPContigLocusId_36_1.bcp.gz ContigLocusId.gz mv b126_ContigInfo_36_1.bcp.gz 
ContigInfo.gz mv b126_SNPMapInfo_36_1.bcp.gz MapInfo.gz mv SNP.bcp.gz SNP.gz ls -1 *.gz > filelist # edit table descriptions cd /cluster/data/dbSNP/126/human/schema # get CREATE statements from human_9606_table.sql for our 5 tables # store in table.tmp # convert and rename tables sed -f 'mssqlToMysql.sed' table.tmp > table2.tmp rm table.tmp sed -f 'tableRename.sed' table2.tmp > table.sql rm table2.tmp # Get updated UniVariation table cd /cluster/data/dbSNP/126/shared ftp ftp.ncbi.nih.gov cd snp/organisms/human_9606/database/shared_data get UniVariation.bcp.gz cd ../shared_schema get dbSNP_main_table.sql.gz # get UniVariation CREATE statement from dbSNP_main_table.sql # use mssqlToMysql.sed to convert # get header lines from rs_fasta cd /cluster/data/dbSNP/126/human/rs_fasta /bin/csh gnl.csh # add rs_fasta to seq/extFile # 2 edits first: strip header to just rsId, and remove duplicates # work on /cluster/store12 (kkstore05) which has more disk space # also for human, don't include chrUn cp rs_ch*.fas.gz /cluster/store12/snp/126/human/rs_fasta ssh kkstore05 cd /cluster/store12/snp/126/human/rs_fasta mkdir unarchive mv rs_chUn.fas.gz unarchive # concat into rsAll.fas cat << '_EOF_' > concat.csh #!/bin/csh -ef rm -f rsAll.fas foreach file (rs_ch*.fas.gz) echo $file zcat $file >> rsAll.fas end '_EOF_' # << emacs # snpCleanSeq strips the header and skips duplicates /cluster/home/heather/kent/src/hg/snp/snpLoad/snpCleanSeq rsAll.fas snp.fa rm rsAll.fas # load on hgwdev ssh hgwdev mkdir /gbdb/hg18/snp ln -s /cluster/store12/snp/126/human/rs_fasta/snp.fa /gbdb/hg18/snp/snp.fa cd /cluster/store12/snp/126/human/rs_fasta hgLoadSeq hg18 /gbdb/hg18/snp/snp.fa # look up id in extFile # move into separate table hgsql hg18 < snpSeq.sql hgsql -e 'insert into snpSeq select acc, file_offset from seq where extFile = 15200238' hg18 hgsql -e 'delete from seq where extFile = 15200238' hg18 hgsql -e 'alter table snpSeq add index acc (acc)' hg18 # clean up after hgLoadSeq rm seq.tab # load on kkr5u00 ssh kkr5u00 hgsql -e mysql 'create database hg18snp126' cd /cluster/data/dbSNP/126/human/schema hgsql hg18snp126 < table.sql cd ../data /bin/csh load.csh # note rowcount # ContigLoc 27007176 # SNP 11961761 # MapInfo 11712346 # ContigLocusId 11854143 cd /cluster/data/dbSNP/126/shared hgsql hg18snp126 < UniVariation.sql zcat UniVariation.bcp.gz | hgsql -e 'load data local infile "/dev/stdin" into table UniVariation' hg18snp126 # create working /scratch dir cd /scratch/snp mkdir 126 cd 126 mkdir human cd human # get hg18 ctgPos, load into dbSnpHumanBuild126, compare contig list between ctgPos and ContigInfo # Note: missing chrY PAR regions # get gnl files cp /cluster/data/dbSNP/126/human/rs_fasta/*.gnl . 
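# (Sketch only, not run for this build.)  One way to do the "compare
# contig list between ctgPos and ContigInfo" step noted above.  The
# column names, the version-suffix stripping, and running both queries
# from a single host are assumptions to verify against schema/table.sql
# and the actual table contents.
hgsql -N -e 'select contig from ctgPos' hg18 | sort -u > ctgPos.contigs
hgsql -N -e 'select contig_acc from ContigInfo' hg18snp126 \
    | sed -e 's/\.[0-9]*$//' | sort -u > ContigInfo.contigs
diff ctgPos.contigs ContigInfo.contigs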
# examine ContigInfo for group_term and edit pipeline.csh # use "ref_assembly" cd /scratch/snp/126/human # filter ContigLoc into ContigLocFilter # this lifts from contig coords to chrom coords # phys_pos_from is used to check coords for non-random chroms # errors reported to stdout # this gets rid of alternate assemblies (using ContigInfo) # this also gets rid of poor quality alignments (weight == 10 || weight == 0 in MapInfo) # assumes all contigs are positively oriented; will abort if not true mysql> desc ContigLocFilter; # +---------------+-------------+------+-----+---------+-------+ # | Field | Type | Null | Key | Default | Extra | # +---------------+-------------+------+-----+---------+-------+ # | snp_id | int(11) | NO | | | | # | ctg_id | int(11) | NO | | | | # | chromName | varchar(32) | NO | | | | # | loc_type | tinyint(4) | NO | | | | # | start | int(11) | NO | | | | # | end | int(11) | YES | | NULL | | # | orientation | tinyint(4) | NO | | | | # | allele | blob | YES | | NULL | | # +---------------+-------------+------+-----+---------+-------+ /cluster/home/heather/kent/src/hg/snp/snpLoad/snpContigLocFilter hg18snp126 ref_assembly reference # note rowcount # ContigLocFilter 12368145 # how many are positive strand? hopefully 90% mysql> select count(*) from ContigLocFilter where orientation = 0; # 10622168 # note count by loc_type mysql> select count(*), loc_type from ContigLocFilter group by loc_type; # +----------+----------+ # | count(*) | loc_type | # +----------+----------+ # | 205359 | 1 | # | 10678378 | 2 | # | 1464642 | 3 | # | 9025 | 4 | # | 1117 | 5 | # | 9624 | 6 | # +----------+----------+ # filter ContigLocusId into ContigLocusIdFilter /cluster/home/heather/kent/src/hg/snp/snpLoad/snpContigLocusIdFilter hg18snp126 ref_assembly # note rowcount # ContigLocusIdFilter 5812538 # condense ContigLocusIdFilter into ContigLocusIdCondense (one SNP can have multiple functions) # assumes SNPs are in numerical order; will errAbort if not true /cluster/home/heather/kent/src/hg/snp/snpLoad/snpContigLocusIdCondense hg18snp126 # note rowcount; expect about 50% for human # ContigLocusIdCondense 3975405 (note this is smaller than hg17/snp125) # could delete ContigLocusIdFilter table here # create chrN_snpFasta tables from *.gnl files # we are just using molType, but also storing class and observed # 266,366 duplicates detected in snpMoltype.errors /cluster/home/heather/kent/src/hg/snp/snpLoad/snpLoadFasta hg18snp126 # (could start using pipeline.csh here) # (pipeline.csh takes about 35 minutes to run) # split ContigLocFilter by chrom # create the first chrN_snpTmp # we will reuse this table name, adding/changing columns as we go # at this point chrN_snpTmp will have the same description as ContigLocFilter # this opens a file handle for every chrom, so will not scale to scaffold-based assemblies /cluster/home/heather/kent/src/hg/snp/snpLoad/snpSplitByChrom hg18snp126 ref_assembly # adjust coords using loc_type # possible errors logged to snpLocType.error: # Unknown locType # Between with end != start + 1 # Between with allele != '-' # Exact with end != start # Range with end < start # possible exceptions logged to snpLocType.exceptions: # RefAlleleWrongSize # This run no errors, no exceptions # I do note that out of 25K rows where loc_type == 6, 12259 have asn_from == asn_to # All of loc_type == 1, 4, 5 have zero rows where asn_from == asn_to # This was also true in build125 # morph chrN_snpTmp mysql> desc chr1_snpTmp; # +---------------+-------------+------+-----+---------+-------+ # | 
Field | Type | Null | Key | Default | Extra | # +---------------+-------------+------+-----+---------+-------+ # | snp_id | int(11) | NO | | | | # | ctg_id | int(11) | NO | | | | # | chromStart | int(11) | NO | | | | # | chromEnd | int(11) | NO | | | | # | loc_type | tinyint(4) | NO | | | | # | orientation | tinyint(4) | NO | | | | # | allele | blob | YES | | NULL | | # +---------------+-------------+------+-----+---------+-------+ /cluster/home/heather/kent/src/hg/snp/snpLoad/snpLoctype hg18snp126 ref_assembly # expand allele as necessary # report syntax errors to snpExpandAllele.errors # possible exceptions logged to snpExpandAllele.exceptions: # RefAlleleWrongSize # This run no errors, no exceptions # 8092 alleles expanded /cluster/home/heather/kent/src/hg/snp/snpLoad/snpExpandAllele hg18snp126 ref_assembly # the next few steps prepare for working in UCSC space # sort by position /cluster/home/heather/kent/src/hg/snp/snpLoad/snpSort hg18snp126 ref_assembly # rename MT --> M (pipeline.csh takes care of this) hgsql -e "rename table chrMT_snpTmp to chrM_snpTmp" hg18snp126 # get hg18 nib files # get hg18 chromInfo, load into hg18snp126 with editted path # lookup reference allele in nibs # keep reverse complement to use in error checking (snpCheckAlleles) # check here for SNPs larger than 1024 # errAbort if detected # check for coords that are too large, log to snpRefUCSC.error and skip # This run we got 30678 lines in snpRefUCSC.error # 12178 from chr14 (reported to dbSNP) # also 18423 from chr1_random and 77 from chr6_random /cluster/home/heather/kent/src/hg/snp/snpLoad/snpRefUCSC hg18snp126 # morph chrN_snpTmp mysql> desc chr1_snpTmp; # +--------------------+-------------+------+-----+---------+-------+ # | Field | Type | Null | Key | Default | Extra | # +--------------------+-------------+------+-----+---------+-------+ # | snp_id | int(11) | NO | | | | # | ctg_id | int(11) | NO | | | | # | chromStart | int(11) | NO | | | | # | chromEnd | int(11) | NO | | | | # | loc_type | tinyint(4) | NO | | | | # | orientation | tinyint(4) | NO | | | | # | allele | blob | YES | | NULL | | # | refUCSC | blob | YES | | NULL | | # | refUCSCReverseComp | blob | YES | | NULL | | # +--------------------+-------------+------+-----+---------+-------+ # compare allele from dbSNP to refUCSC # locType between is excluded from this check # log exceptions to snpCheckAllele.exceptions # if SNP is positive strand, expect allele == refUCSC # log RefAlleleMismatch if not # if SNP is negative strand, if not allele == refUCSC, then check for allele == refUCSCReverseComp # If allele == refUCSCRevComp, log RefAlleleNotRevComp # If allele doesn't match either of refUCSC or refUCSCReverseComp, log RefAlleleMismatch # This run we got: # 0 RefAlleleMismatch # 119366 RefAlleleNotRevComp # Note this is double from build125 /cluster/home/heather/kent/src/hg/snp/snpLoad/snpCheckAlleles hg18snp126 # add class and observed using univar_id from SNP table # to get class (subsnp_class) and observed (var_str) from UniVariation # log errors to snpClassAndObserved.errors # errors detected: # class = 0 in UniVariation # class > 8 in UniVariation # univar_id = 0 in SNP # no row in SNP for snp_id in chrN_snpTmp # This run we got: # 3 class = 0 in UniVariation # 0 class > 8 in UniVariation # 39059 univar_id = 0 in SNP # 879 no row in SNP for snp_id in chrN_snpTmp (all chr6) # dbSNP has class = 'in-del' # we promote this to 'deletion' for locType 1&2 and to 'insertion' for locType 3 
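# (Aside, not part of the original pipeline: the refUCSCReverseComp column
#  kept above is simply the reverse complement of refUCSC.  A minimal sh
#  sketch of that operation, using a made-up example allele:)
revComp() { echo "$1" | rev | tr 'ACGTacgt' 'TGCAtgca'; }
revComp ACCGT    # prints ACGGT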
/cluster/home/heather/kent/src/hg/snp/snpLoad/snpClassAndObserved hg18snp126 # morph chrN_snpTmp # +--------------------+---------------+------+-----+---------+-------+ # | Field | Type | Null | Key | Default | Extra | # +--------------------+---------------+------+-----+---------+-------+ # | snp_id | int(11) | NO | | | | # | chromStart | int(11) | NO | | | | # | chromEnd | int(11) | NO | | | | # | loc_type | tinyint(4) | NO | | | | # | class | varchar(255) | NO | | | | # | orientation | tinyint(4) | NO | | | | # | allele | blob | YES | | NULL | | # | refUCSC | blob | YES | | NULL | | # | refUCSCReverseComp | blob | YES | | NULL | | # | observed | blob | YES | | NULL | | # +--------------------+---------------+------+-----+---------+-------+ # generate exceptions for class and observed # SingleClassBetweenLocType # SingleClassRangeLocType # NamedClassWrongLocType # ObservedWrongFormat # ObservedWrongSize (twice as many as hg17/snp125) # ObservedMismatch (nearly 3x as many as hg17/snp125) # RangeSubstitutionLocTypeExactMatch # SingleClassTriAllelic # SingleClassQuadAllelic # This will also detect IUPAC symbols in allele /cluster/home/heather/kent/src/hg/snp/snpLoad/snpCheckClassAndObserved hg18snp126 # add function /cluster/home/heather/kent/src/hg/snp/snpLoad/snpFunction hg18snp126 # add validation status and heterozygosity # log error if validation status > 31 or missing # this run we got 8 missing /cluster/home/heather/kent/src/hg/snp/snpLoad/snpSNP hg18snp126 # add molType # errors detected: missing or duplicate molType # no errors this run /cluster/home/heather/kent/src/hg/snp/snpLoad/snpMoltype hg18snp126 # generate chrN_snp126 and snp126Exceptions tables cp snpCheckAlleles.exceptions snpCheckAlleles.tab cp snpCheckClassAndObserved.exceptions snpCheckClassAndObserved.tab cp snpExpandAllele.exceptions snpExpandAllele.tab cp snpLocType.exceptions snpLocType.tab /cluster/home/heather/kent/src/hg/snp/snpLoad/snpFinalTable hg18snp126 126 # handle chrY PAR SNPs (still missing from dbSNP) /cluster/home/heather/kent/src/hg/snp/snpLoad/snpPAR hg18snp126 hgsql -e 'load data local infile "snpPARexceptions.tab" into table snp126Exceptions' hg18snp126 # concat into snp126.tab # cat chr*_snp126.tab >> snp126.tab # note chr18_random_snp126.tab is empty (just 2 rows in hg17/snp125) /bin/sh concat.sh # check for multiple alignments /cluster/home/heather/kent/src/hg/snp/snpLoad/snpMultiple hg18snp126 mysql> load data local infile 'snpMultiple.tab' into table snp126Exceptions; # run and review snpCompareLoctype # load snp125subset /cluster/home/heather/kent/src/hg/snp/snpLoad/snpCompareLoctype hg18snp126 snp125subset snp126 # cat snpCompareLoctypeCounts.out # note: rangeToExact is 2x 124/125 conversion rate # exactToExact = 8747888 # exactToBetween = 1071 # exactToRange = 6673 # betweenToBetween = 321371 # betweenToExact 1323 # betweenToRange 514 # rangeToRange = 95562 # rangeToBetween = 1794 # rangeToExact = 15148 # oldToNew = 10649 # run and review snpCompareWeight # load into database snp125snp126 /cluster/home/heather/kent/src/hg/snp/snpLoad/snpCompareWeight snp125snp126 weight125 weight126 # cat snpCompareWeightCounts.out # oneToOne = 9161896 # oneToTwo = 0 <-- good # oneToThree = 531 <--- interesting but minor # twoToTwo = 38 <-- okay # twoToOne = 1896 <--- improvement # twoToThree = 0 <-- good # threeToThree = 494 <-- okay # threeToOne = 37571 <-- improvement # threeToTwo = 12 <-- improvement # load on hgwdev cp snp126.tab /cluster/home/heather/transfer/snp hgsql hg18snp126 -e 'select * from 
snp126Exceptions' > /cluster/home/heather/transfer/snp/snp126Exceptions.tab ssh hgwdev mysql> load data local infile 'snp126.tab' into table snp126; mysql> load data local infile 'snp126Exceptions.tab' into table snp126Exceptions; # create indexes mysql> alter table snp126 add index name (name); mysql> alter table snp126 add index chrom (chrom, bin); mysql> alter table snp126Exceptions add index name(name); # create snp126ExceptionDesc table cd /cluster/data/dbSNP hgsql hg18 < snp126ExceptionDesc.sql # add counts to exception.human.126, can start with exception.template hgsql -e 'select count(*), exception from snp126Exceptions group by exception' hg18 mysql> load data local infile 'exception.human.126' into table snp126ExceptionDesc; ################################################################ # SNP126 edit: condense UTR/intron func into just intron at Jim's request ssh kkr5u00 cd /scratch/snp/126/human /cluster/home/heather/kent/src/hg/snp/snpLoad/snpContigLocusIdCondense hg18snp126 /bin/csh pipeline.csh ssh hgwdev cd /cluster/home/heather/transfer/snp hgsql hg18 -e 'drop table snp126' hgsql hg18 < /cluster/home/heather/kent/src/hg/lib/snp126.sql hgsql hg18 -e 'load data local infile "snp126.tab" into table snp126' hgsql hg18 -e 'alter table snp126 add index name (name)' hgsql hg18 -e 'alter table snp126 add index chrom (chrom, bin)' ################################################################ # SNP126 edit: detect clustering errors (Heather, Sept. 2006) # for locType = 'between' (class = 'insertion') # 1,393,040 candidates # exceptions: # DuplicatedObserved (3020 of these) # MixedObserved (1312 of these) # create and populate a simple table snp126insertions mysql> insert into snp126insertions select chrom, chromStart, chromEnd, name, score, strand, observed from snp126 where locType = 'between' and class = 'insertion'; # generate and load data cd /cluster/home/heather/kent/src/hg/snp/snpLoad ./snpCheckCluster hg18 snp126insertions mysql> load data local infile 'snpCheckCluster.tab' into table snp126Exceptions; # update snp126ExceptionDesc ################################################################ # generate snpMasked sequence for snp126 (Heather, Sept. 2006) # snpMaskChrom was run too, not documented here. # OBSOLETED by snp128Mask, see below. 
# 3 steps: simple filtering, advanced filtering, generate sequence # simple filtering: create and populate tables # insertions: 1,393,040 # deletions: 783,454 ssh hgwdev mysql> insert into snp126insertions select * from snp126 where locType = 'between' and class = 'insertion'; mysql> insert into snp126deletions select * from snp126 where class = 'deletion'; # advanced filtering -- insertions cd /cluster/home/heather/kent/src/hg/snp/snpLoad # this removes SNPs with weight != 1 # this removes SNPs that align to more than one position # this removes SNPs that cluster together with conflicting observations # (these should be class = 'mixed') # this removes SNPs with invalid observed string # this asserts end == start # final count 1,352,380 # written to insertions.tab ./snpGetInsertions hg18 snp126insertions snp126Exceptions # advanced filtering -- deletions cd /cluster/home/heather/kent/src/hg/snp/snpLoad # this removes SNPs with weight != 1 # this removes SNPs that align to more than one position # this removes SNPs with invalid observed string # this removes SNPs with exception ObservedWrongSize # this asserts end > start # final count 621,024 # written to deletions.tab ./snpGetDeletions hg18 snp126deletions snp126Exceptions # Note: the advanced filtering pretty much removes all SNPs from chrN_random # generate sequence -- insertions # use kent/src/hg/snp/snpMask/seqWithInsertions.c # this asserts that position doesn't exceed chromSize # this will reverse complement observed if strand is negative # if no SNPs found, output sequence == input sequence # write to chrN.fat ssh kkr5u00 mysql> load data local infile "/cluster/home/heather/kent/src/hg/snp/snpLoad/insertions.tab" into table snp126insertionsClean; cd /scratch/snp126/human/fat /bin/sh fat.sh cp *.fat /cluster/data/hg18/snpMask/insertions ssh kkstore02 cd /cluster/data/hg18/snpMask/insertions nice gzip *.fat # generate sequence -- deletions # use kent/src/hg/snp/snpMask/seqWithoutDeletions.c # this asserts that position doesn't exceed chromSize # if no SNPs found, output sequence == input sequence # write to chrN.skinny ssh kkr5u00 mysql> load data local infile "/cluster/home/heather/kent/src/hg/snp/snpLoad/deletions.tab" into table snp126deletionsClean; cd /scratch/snp126/human/skinny /bin/sh skinny.sh cp *.skinny /cluster/data/hg18/snpMask/deletions ssh kkstore02 cd /cluster/data/hg18/snpMask/deletions nice gzip *.skinny # create links on hgwdev ssh hgwdev cd /usr/local/apache/htdocs/goldenPath/hg18/snpMask/insertions /bin/sh link.sh cd /usr/local/apache/htdocs/goldenPath/hg18/snpMask/deletions /bin/sh link.sh ############################################################################ # Lift simple bi-allelic SNPs to rheMac2 and panTro2 (Heather, August 2006) # OBSOLETED by snp128Ortho, see below. ssh hgwdev cd /cluster/data/dbSNP/ortho/hg18/snpDump # dump raw data -- this creates snpGetSimple.chr* # exceptions table is used to skip SNPs that align in multiple places # We also skip SNPs on chrN_random # We also skip triallelic and quadallelic # We don't filter on weight # This yields 9,092,533 SNPs # This data is also stored into hg18.snp126simple for later use /cluster/home/heather/kent/src/hg/snp/snpLoad/snpGetSimple hg18 snp126 snp126Exceptions # split up into just under 200 files to make for an efficient pk run # using file size of 60K lines # this creates /cluster/data/dbSNP/ortho/hg18/split/chr1-01, chr1-02, chr1-03, etc. 
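# (split.csh itself is not reproduced in this doc; the following is only a
#  hypothetical sketch of that splitting step using GNU split.  Paths follow
#  the conventions above; the real script's suffix numbering may differ.)
cd /cluster/data/dbSNP/ortho/hg18
mkdir -p split
for f in snpDump/snpGetSimple.chr*
do
    c=`echo $f | sed -e 's/.*snpGetSimple\.//'`
    split -d -l 60000 $f split/$c-
done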
# 165 files created # 140 files have 60k lines /bin/csh split.csh # prepare cluster runs # I didn't use -bedPlus=6, didn't seem to need it cp /cluster/data/dbSNP/ortho/hg18/split/* /san/sanvol1/snp/liftOver/hg18/rheMac2/input cp /cluster/data/dbSNP/ortho/hg18/split/* /san/sanvol1/snp/liftOver/hg18/panTro2/input cd /san/sanvol1/snp/liftOver/hg18/rheMac2 /bin/csh makeJobList.csh rm -f jobList foreach fileName (`ls input/chr*`) set baseName = $fileName:t echo liftOver $fileName /cluster/data/hg18/bed/liftOver/hg18ToRheMac2.over.chain.gz output/$baseName.out unmapped/$baseName.unmapped >> jobList end cd /san/sanvol1/snp/liftOver/hg18/panTro2 /bin/csh makeJobList.csh rm -f jobList foreach fileName (`ls input/chr*`) set baseName = $fileName:t echo liftOver $fileName /cluster/data/hg18/bed/liftOver/hg18ToPanTro2.over.chain.gz output/$baseName.out unmapped/$baseName.unmapped >> jobList end # do cluster runs # this only took a few minutes # got 7321537 lifts for rheMac2 # got 8517465 lifts for panTro2 ssh pk cd /san/sanvol1/snp/liftOver/hg18/rheMac2 para create jobList para try; para check; para push; para check; etc. cd /san/sanvol1/snp/liftOver/hg18/panTro2 para create jobList para try; para check; para push; para check; etc. # concatenate output files into all.out cd /san/sanvol1/snp/liftOver/hg18/rheMac2/output /bin/csh concat.csh cd /san/sanvol1/snp/liftOver/hg18/panTro2/output /bin/csh concat.csh # load into panTro2 and rheMac2 # Doing the load and split so I can easily load sequence for a full chrom ssh hgwdev cp /san/sanvol1/snp/liftOver/hg18/rheMac2/output/all.out /cluster/data/dbSNP/ortho/hg18/rheMac2Lift cd /cluster/data/dbSNP/ortho/hg18/rheMac2Lift hgsql rheMac2 < snp126hg18ortho.sql hgsql -e 'load data local infile "all.out" into table snp126hg18ortho' rheMac2 cp /san/sanvol1/snp/liftOver/hg18/panTro2/output/all.out /cluster/data/dbSNP/ortho/hg18/panTro2Lift cd /cluster/data/dbSNP/ortho/hg18/panTro2Lift hgsql panTro2 < snp126hg18ortho.sql hgsql -e 'load data local infile "all.out" into table snp126hg18ortho' panTro2 # split by chrom # this creates tables chrN_snp126hg18ortho and can be run from anywhere # it will create chrN_snp126hg18ortho.tab files which can be deleted cd /cluster/data/dbSNP/ortho/hg18/rheMac2Seq /cluster/home/heather/kent/src/hg/snp/snpLoad/snpSplitByChrom2 rheMac2 snp126hg18ortho rm chr*.tab # rm snp126ortho.tab cd /cluster/data/dbSNP/ortho/hg18/panTro2Seq /cluster/home/heather/kent/src/hg/snp/snpLoad/snpSplitByChrom2 panTro2 snp126hg18ortho rm chr*.tab # rm snp126ortho.tab # get sequence # this creates chrN_snp126hg18orthoPrelim.tab files # random chroms are okay here # note we are including Ns # This will log to fetchSeq.errors any examples where chromEnd != chromStart + 1 # It will also check for coordinates past the end of the chrom. # No errors for rheMac2 or panTro2. 
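# (Optional pre-check, not in the original run: rows where
#  chromEnd != chromStart + 1 end up in fetchSeq.errors, so they can be
#  counted ahead of time.  Table names follow the loads above.)
hgsql rheMac2 -N -e 'select count(*) from snp126hg18ortho where chromEnd != chromStart + 1'
hgsql panTro2 -N -e 'select count(*) from snp126hg18ortho where chromEnd != chromStart + 1'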
cd /cluster/data/dbSNP/ortho/hg18/rheMac2Seq /cluster/home/heather/kent/src/hg/snp/snpLoad/fetchSeq rheMac2 /cluster/data/rheMac2/rheMac2.2bit # ssh kkstore02 # cd /cluster/data/dbSNP/ortho/hg18/rheMac2Seq /bin/csh concat.csh # cleanup; remove split tables from rheMac2, keep snp126hg18orthoPrelim hgsql rheMac2 < drop.sql rm chr*.tab cd /cluster/data/dbSNP/ortho/hg18/panTro2Seq /cluster/home/heather/kent/src/hg/snp/snpLoad/fetchSeq panTro2 /cluster/data/panTro2/panTro2.2bit # ssh kkstore02 # cd /cluster/data/dbSNP/ortho/hg18/panTro2Seq /bin/sh concat.sh # cleanup; remove split tables from panTro2, keep snp126hg18orthoPrelim hgsql panTro2 < drop.sql rm chr*.tab # do a preliminary load -- combine chimp and macaque cd /cluster/data/dbSNP/ortho/hg18/rheMac2Seq hgsql hg18 < snp126orthoPrelim.sql hgsql -e 'load data local infile "snp126orthoPrelim.tab" into table snp126orthoPrelim' hg18 cd /cluster/data/dbSNP/ortho/hg18/panTro2Seq hgsql -e 'load data local infile "snp126orthoPrelim.tab" into table snp126orthoPrelim' hg18 # add human chrom, chromStart, chromEnd, allele, variant # liftOver loses the chrom, chromStart and chromEnd # liftOver does retain the allele and variant cd /cluster/data/dbSNP/ortho/hg18/integrate /cluster/home/heather/kent/src/hg/snp/snpLoad/snpOrthoLookup hg18 snp126simple snp126orthoPrelim # load final table with separate rows for chimp and macaque # drop snp126orthoPrelim because it has non-human coords # rm tab file because it is huge hgsql hg18 < snp126ortho.sql load data local infile "snpOrthoLookup.tab" into table snp126ortho drop table snp126orthoPrelim rm snpOrthoLookup.tab # create indices mysql> alter table snp126ortho add index name (name); mysql> alter table snp126ortho add index chrom (chrom, bin); # manually validate a few examples on various chroms, various strands # I used rheMac2: # rs533274, hg18 chr1 +, rheMac2 chr18 - # rs1690550, hg18 chr1 -, rheMac2 chr19 + # rs3121568, hg18 chr1 -, rheMac2 chr19 - # rs28709562, hg18 chr1 +, rheMac2 chr19 + # rs34675838, also hg18 chr1 +, rheMac2 chr19 + # create alternate format with both alleles in same row /cluster/home/heather/kent/src/hg/snp/snpLoad/snpOrthoJoin hg18 snp126simple snp126ortho # 8517465 rows in hash for panTro2 # 7321537 rows in hash for rheMac2 # humanCount = 9092533 # chimpOnlyCount = 1418324 # macaqueOnlyCount = 222396 # missingCount = 352672 # bothCount = 7098141 # confirm that chimpOnly + macaqueOnly + missing + both = human hgsql hg18 < snp126orthoPanTro2RheMac2.sql hgsql -e "load data local infile 'snpOrthoJoin.tab' into table snp126orthoPanTro2RheMac2" hg18 mysql> alter table snp126orthoPanTro2RheMac2 add index name (name); mysql> alter table snp126orthoPanTro2RheMac2 add index chrom (chrom, bin); ################################################################ ### CREATE chimpHiQualDiff -- panTro2 (Daryl; May 1, 2006) # Make file/table of high quality single base pair differences # between hg18 and panTro2 set bedDir = /cluster/data/hg18/bed/chimpHiQualDiffs mkdir -p $bedDir cd $bedDir sed 's/simpleNucDiff/chimpHiQualDiffs/' ~/kent/src/hg/lib/simpleNucDiff.sql >! 
chimpHiQualDiffs.sql
set axtDir = /cluster/data/hg18/bed/blastz.panTro2/axtRBestNet
mkdir -p chroms; cd chroms
ls -1 $axtDir | grep chr | grep axt | sed 's/.hg18.panTro2.net.axt.gz//' | grep -v random | grep -v "_" | xargs mkdir
set workDir = /scratch/chqd
mkdir -p $workDir
touch $workDir/chqd.log
# time nice /cluster/home/daryl/bin/i386/chimpHiQualDiffs $workDir/$f /cluster/data/panTro2/bed/quality/qac/panTro2.qac $f.chimpHiQualDiffs.bed >>& $workDir/chqd.log
foreach f (chr*)
echo -n $f " "
mkdir -p $workDir/$f/
cp $axtDir/$f.*.axt.gz $workDir/$f/
gunzip $workDir/$f/$f.*.axt.gz
time nice /cluster/home/daryl/bin/i386/chimpHiQualDiffs $workDir/$f /cluster/data/panTro2/bed/quality/qac/panTro2.qac $f.chimpHiQualDiffs.bed
rm -f $workDir/$f/$f.*axt
rmdir $workDir/$f/
end
mv $workDir/chqd.log .
cat chr*bed >! ../chimpHiQualDiffs.bed
## The load (sort) ran out of memory on hgwdev, so sort the
## file first on kolossus and then load it on hgwdev
ssh kolossus
time hgLoadBed -strict -sqlTable=chimpHiQualDiffs.sql -noLoad hg18 chimpHiQualDiffs chimpHiQualDiffs.bed
# 110.214u 10.836s 2:24.42 83.8% 0+0k 0+0io 1pf+0w
exit
## hgwdev
time hgLoadBed -hasBin -noSort -sqlTable=chimpHiQualDiffs.sql hg18 chimpHiQualDiffs bed.tab
# 328.890u 113.230s 42:26.00 17.3% 0+0k 0+0io 197676pf+0w
## TODO: need to filter out polymorphic sites (SNPs)
#################################################################
###### BUILD SUPERFAMILY RELATED TABLES (DONE - 2006-06-20 - Fan)
# Build Superfamily track and create sf tables needed for PB
ssh hgwdev
hgsql hg18 < ~/src/hg/lib/sfAssign.sql
cd /cluster/data/superfamily/060619
hgsql hg18 -e 'load data local infile "ass_18-Jun-2006.tab" into table hg18.sfAssign;'
# If hg18.sfDes already exists, drop it.
mkdir /cluster/data/hg18/bed/sf
cd /cluster/data/hg18/bed/sf
hgsql superfam060619 -N -e "select * from des" >sfDes.tab
hgsql hg18 < ~/src/hg/lib/sfDes.sql
hgsql hg18 -e 'load data local infile "sfDes.tab" into table sfDes'
# Build ensemblXref3
# Get the ensembl gene/protein cross-reference data from Ensembl BioMart
# http://www.ensembl.org/Multi/martview
# Follow this sequence through the pages:
# Page 1) Select Ensembl39 and Homo sapiens. Hit next.
# Page 2) Do not select anything. Hit next.
# Page 3) Choose the "Feature" box, select Ensembl gene ID, transcript ID, peptide ID, UniProt/TrEMBL ID, UniProt/SWISSPROT ID, and UniProt/SWISSPROT Accession
# Page 4) Choose "Text, tab separated". Choose gzip compression. Hit export.
# Save as ensemblXref3.gz
ssh hgwdev
cd /cluster/data/hg18/bed/ensembl
gzip -d ensemblXref3.gz
hgsql hg18 < ~/src/hg/lib/ensemblXref3Temp.sql
hgsql hg18 -e \
'load data local infile "ensemblXref3" into table ensemblXref3Temp ignore 1 lines'
hgsql hg18 -N -e \
'select gene, "0", transcript, "0", protein, "0", tremblAcc, swissDisplayId, swissAcc from ensemblXref3Temp' \
> ensemblXref3.tab
hgsql hg18 -e 'drop table ensemblXref3'
hgsql hg18 <~/src/hg/lib/ensemblXref3.sql
hgsql hg18 -e 'load data local infile "ensemblXref3.tab" into table ensemblXref3'
# If hg18.superfamily already exists, drop it.
cd /cluster/data/hg18/bed/sf
hgSuperfam hg18 superfam060619 > sf.log
# It is normal that many proteins do not have corresponding Superfamily entries.
# If hg18.sfDescription exists, drop it.
hgsql hg18 < ~/src/hg/lib/sfDescription.sql
hgsql hg18 -e 'LOAD DATA local INFILE "sfDescription.tab" into table hg18.sfDescription;'
# Finally, load the superfamily table.
hgLoadBed hg18 superfamily superfamily.tab -tab
# Create knownToSuperfamily table
# Note hs is changed into ht for this Superfamily release.
cat /cluster/data/superfamily/060619/ass_18-Jun-2006.tab \
| hgKnownToSuper hg18 hs stdin
# created 27,511 rows in knownToSuper
############################################################################
# SEGMENTAL DUPLICATIONS (DONE 7/14/06 angie)
# File emailed from Xinwei She
mkdir /cluster/data/hg18/bed/genomicSuperDups
cd /cluster/data/hg18/bed/genomicSuperDups
# The sed command is necessary to fix "_" used as strand.
# The awk command was necessary for some recent other species
# genomicSuperDups that had some too-short regions.  It does not seem
# to be necessary here, but doesn't hurt and may be useful in
# future builds.
sed -e 's/\t_\t/\t-\t/' hg18genomicSuperDup.tab \
| awk '($3 - $2) >= 1000 && ($9 - $8) >= 1000 {print;}' \
| hgLoadBed hg18 genomicSuperDups stdin \
-sqlTable=$HOME/kent/src/hg/lib/genomicSuperDups.sql
# fix off-by-one error:
sed -e 's/\t_\t/\t-\t/' hg18genomicSuperDup.tab \
| awk '($3 - $2) >= 1000 && ($9 - $8) >= 1000 {print;}' > hg18.gsd.bed
# run this perl script:
#!/usr/bin/env perl
use strict;
use warnings;
open (FH, "<hg18.gsd.bed");
while (my $line = <FH>) {
    chomp $line;
    my ($chr, $start, $rest) = split('\s+', $line, 3);
    printf "%s\t%d\t%s\n", $chr, $start-1, $rest;
}
close (FH);
# ./addOne.pl > oneLarger.bed
# check the chromStart column:
ave -col=2 hg18.gsd.bed
ave -col=2 oneLarger.bed
# reload table
hgLoadBed hg18 genomicSuperDups oneLarger.bed \
-sqlTable=$HOME/kent/src/hg/lib/genomicSuperDups.sql
############################################################################
# GENE BOUNDS (RNACLUSTER) (DONE 08-09-2006 Fan)
# Create rnaCluster table (depends on {est,mrna}OrientInfo)
cd /cluster/data/hg18/bed
mkdir rnaCluster
cd rnaCluster/
mkdir chrom
# Create a list of accessions that come from RAGE libraries and need to be excluded.
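# (Optional pre-check, not in the original steps: since rnaCluster depends on
#  the est/mrna OrientInfo tables noted above, confirm they are present
#  before running the steps that follow.)
hgsql hg18 -N -e 'show tables like "%OrientInfo"'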
~/kent/src/hg/geneBounds/clusterRna/generateRageAccList.csh hg18 rage.libs foreach f (/cluster/data/hg18/nib/chr*.nib) set c = $f:t:r set out = chrom/$c.bed # Exclude accesions in the RAGE file echo clusterRna -mrnaExclude=hg18.rage.libs hg18 /dev/null $out -chrom=$c clusterRna -mrnaExclude=hg18.rage.libs hg18 /dev/null $out -chrom=$c end hgLoadBed hg18 rnaCluster chrom/*.bed ############################################################################ ############################################################################ # POLYA_DB TRACK (DONE 08-28-2006 Andy) mkdir /cluster/data/hg18/bed/polyaDB cd /cluster/data/hg18/bed/polyaDB wget http://polya.umdnj.edu/download/polyAsite.gz gunzip polyAsite.gz find /cluster/data/hg16/ -name 'ordered.lft' | xargs cat > hg16.lft sed 's/\(\s\).*\//\1/; s/chr/hg16.chr/' hg16.lft > tmp mv tmp hg16.lft cut -f2 hg16.lft > hg16.lft.names grep -F -f hg16.lft.names polyAsite > hg16.polyAsite awk '{printf("%s\t%d\t%d\t%s\n", $3, ($5-1), $5, $1);}' hg16.polyAsite > hg16.polyAsite.bed liftUp lifted.bed hg16.lft warn hg16.polyAsite.bed sed 's/hg16\.//' lifted.bed > final.bed liftOver final.bed /gbdb/hg16/liftOver/hg16ToHg18.over.chain.gz hg18.bed unmapped hgLoadBed hg18 polyaDB hg18.bed # trackDb entry/html in human/hg18 # redmine issue 19: wrong bed file format used on hgwdev # reload it with table from beta (2010-10-28 - Chin) cd /cluster/data/hg18/bed/polyaDB hgLoadBed hg18 polyaDb hg18.beta.polyaDb.bed ############################################################################ # Translate SNP Array data from hg17 (Heather August 2006) # Affy500 cd /cluster/data/hg18/bed/snp/affy # get rsId/affy name pairs from hg17 where rsId != 'unknown' # 257954 candidates from Nsp (4311 with unknown rsId) # 234765 candidates from Sty (3540 with unknown rsId) hgsql hg17 < getHg17-Nsp.sql > nsp.hg17 hgsql hg17 < getHg17-Sty.sql > sty.hg17 # get name, chrom, chromStart, chromEnd, strand, observed from snp126simple # snp126simple contains only class = "simple", locType = "exact", # chromEnd = chromStart + 1, biallelic, singly-aligning hgsql hg18 < getHg18.sql > snp126simple.hg18 # sort and join # 257213in nsp.join # 233941 in sty.join # 741 in nsp.missing # 824 in sty.missing sort nsp.hg17 > nsp.hg17.sort sort sty.hg17 > sty.hg17.sort sort snp126simple.hg18 > snp126simple.hg18.sort join nsp.hg17.sort snp126simple.hg18.sort > nsp.join join sty.hg17.sort snp126simple.hg18.sort > sty.join join -v 1 nsp.hg17.sort snp126simple.hg18.sort > nsp.missing join -v 1 sty.hg17.sort snp126simple.hg18.sort > sty.missing # fix column order awk '{print $3, $4, $5, $2, 0, $6, $7, $1}' nsp.join > nsp.bed awk '{print $3, $4, $5, $2, 0, $6, $7, $1}' sty.join > sty.bed # load hgLoadBed hg18 snpArrayAffy250Nsp nsp.bed -sqlTable=snpArrayAffy250Nsp.sql hgLoadBed hg18 snpArrayAffy250Sty sty.bed -sqlTable=snpArrayAffy250Sty.sql # cleanup rm nsp.hg17 nsp.hg17.sort nsp.join rm sty.hg17 sty.hg17.sort sty.join rm snp126simple.hg18 bed.tab mv snp126simple.hg18.sort ../illumina gzip nsp.bed sty.bed # Illumina300 cd /cluster/data/hg18/bed/snp/illumina # 317,100 candidates from hg17 hgsql -e 'select name from snpArrayIllumina300' hg17 > hg17.data # sort and join # 314,093 in join.out # 3,007 in join.missing sort hg17.data > hg17.data.sort join hg17.data.sort hg18.data.sort > join.out join -v 1 hg17.data.sort hg18.data.sort > join.missing # fix column order awk '{print $2, $3, $4, $1}' join.out > illumina.bed # load hgsql hg18 < snpArrayIllumina300.sql hgLoadBed hg18 snpArrayIllumina300 illumina.bed 
-sqlTable=snpArrayIllumina300.sql
# cleanup
rm hg17.data hg17.data.sort hg18.data.sort bed.tab join.out
gzip illumina.bed
##########################################################################
# New SNP Array data (Heather April 2007)
# Affymetrix introduced a new genotyping array in February
# I got the data from Venu in April
# It is based on dbSNP build 126
# Venu reviewed the load
ssh hgwdev
cd /cluster/data/hg18/bed/snp/affy
# There were 60 lines with no chrom, chromEnd or strand
grep -v NULL GenomeWideSNP_5_ucsc.tsv > genomewide.in
# little Perl script to add chromEnd & score for bed format
genomewide.pl < genomewide.in > genomewide.bed
# preliminary load
hgLoadBed hg18 snpArrayAffyGenomeWidePrelim genomewide.bed -tab -sqlTable=snpArrayAffyGenomeWidePrelim.sql
# based on position, lookup rsId
# 2 runs
# first run: don't include dbSNP if class != single or locType != exact or
# chromEnd != chromStart + 1
/cluster/home/heather/kent/src/hg/snp/snpLoad/affyLookup hg18 snpArrayAffyGenomeWidePrelim snp126
# missing count = 5279
# multiple count = 44
# second run: use all of snp126
/cluster/home/heather/kent/src/hg/snp/snpLoad/affyLookup hg18 snpArrayAffyGenomeWidePrelim snp126
# missing count = 5210
# multiple count = 724
# Use the first run (better to avoid nearly 700 multiples at the cost of
# 69 more unknown)
hgLoadBed hg18 snpArrayAffy5 affyLookup.out -tab -sqlTable=snpArrayAffy5.sql
##########################################################################
# More new SNP Array data from Affymetrix (Heather May 2007)
# Source: Venu_Valmeekam at affymetrix.com
# This is the 6.0 array, announced mid-May
# It contains 2 components: single-base substitutions and copy-number probes
# Single-base substitutions are based on snp127
ssh hgwdev
cd /cluster/data/hg18/bed/snp/affy/6.0/single
unzip GenomeWideSNP_6_ucsc_1.tsv.zip
unzip GenomeWideSNP_6_ucsc_2.tsv.zip
format.pl < GenomeWideSNP_6_ucsc_1.tsv > 1.bed
format.pl < GenomeWideSNP_6_ucsc_2.tsv > 2.bed
cp 1.bed all.bed
cat 2.bed >> all.bed
hgLoadBed hg18 snpArrayAffy6Prelim all.bed -tab -sqlTable=snpArrayAffy6Prelim.sql
mysql> update snpArrayAffy6Prelim set chrom = "chrM" where chrom = "chrMT";
/cluster/home/heather/kent/src/hg/snp/snpLoad/affyLookup hg18 snpArrayAffy6Prelim snp127
# missing count = 1149
# multiple count = 2396
# used the strict version of affyLookup (class="single", locType="exact", size=1)
hgLoadBed hg18 snpArrayAffy6 affyLookup.out -tab -sqlTable=snpArrayAffy6.sql
mysql> alter table snpArrayAffy6 add index name(name);
mysql> alter table snpArrayAffy6 add index chrom(chrom, bin);
cd /cluster/data/hg18/bed/snp/affy/6.0/sv
unzip GenomeWideSNP_6_CN_ucsc_1.tsv.zip
unzip GenomeWideSNP_6_CN_ucsc_2.tsv.zip
format.pl < GenomeWideSNP_6_CN_ucsc_1.tsv > 1.bed
format.pl < GenomeWideSNP_6_CN_ucsc_2.tsv > 2.bed
cp 1.bed all.bed
cat 2.bed >> all.bed
hgLoadBed hg18 snpArrayAffy6SV all.bed -tab
mysql> delete from snpArrayAffy6SV where chrom = "chr0";
mysql> update snpArrayAffy6SV set chromStart = chromStart - 1;
##########################################################################
# Venu from Affy requested to remove about 25,000 items from
# snpArrayAffy6 track.
#
# Imported the list into the table, snpArrayAffy6Remove, in hg18.
#
# Issued a simple MySQL command to delete the records in snpArrayAffy6
# whose ids appear in snpArrayAffy6Remove (the exact command was not
# recorded; a plausible form is sketched below).
#
# This was done 10/8/07. Fan.
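# (As noted above, the actual delete command was not written down; this is
#  only a plausible reconstruction, assuming snpArrayAffy6Remove stores the
#  ids to drop in a column named "name" matching snpArrayAffy6.name.)
hgsql hg18 -e 'delete snpArrayAffy6 from snpArrayAffy6, snpArrayAffy6Remove where snpArrayAffy6.name = snpArrayAffy6Remove.name'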
########################################################################## # New Illumina Array data (Heather April 2007) # HumanHap300v3, HumanHap550v3, HumanHap650v3 # Data from Luana Galver (lgalver at illumina.com) # Based on dbSNP build 126 ssh hgwdev cd /cluster/data/hg18/bed/snp/illumina # split off chrM from zips bed.pl < 300.in > 300.bed bed.pl < 550.in > 550.bed bed.pl < 650.in > 650.bed chrM.pl < 550.in.M > 550.bed.M chrM.pl < 650.in.M > 650.bed.M hgLoadBed hg18 snpArrayIllumina300 300.bed -sqlTable=snpArrayIllumina300.sql -tab hgLoadBed hg18 snpArrayIllumina550 550.bed -sqlTable=snpArrayIllumina550.sql -tab hgLoadBed hg18 snpArrayIllumina650 650.bed -sqlTable=snpArrayIllumina650.sql -tab hgLoadBed hg18 snpArrayIllumina550 550.bed.M -tab -oldTable hgLoadBed hg18 snpArrayIllumina650 650.bed.M -tab -oldTable # add indices mysql> alter table snpArrayIllumina300 add index name (name); mysql> alter table snpArrayIllumina300 add index chrom (chrom, bin); mysql> alter table snpArrayIllumina550 add index name (name); mysql> alter table snpArrayIllumina550 add index chrom (chrom, bin); mysql> alter table snpArrayIllumina650 add index name (name); mysql> alter table snpArrayIllumina650 add index chrom (chrom, bin); # fix strand convention mysql> update snpArrayIllumina300 set strand = "+" where strand = "F"; mysql> update snpArrayIllumina300 set strand = "-" where strand = "R"; mysql> update snpArrayIllumina550 set strand = "+" where strand = "F"; mysql> update snpArrayIllumina550 set strand = "-" where strand = "R"; mysql> update snpArrayIllumina650 set strand = "+" where strand = "F"; mysql> update snpArrayIllumina650 set strand = "-" where strand = "R"; # Note no A/T or C/G!! mysql> select distinct(observed) from snpArrayIllumina300; # +----------+ # | observed | # +----------+ # | [A/G] | # | [T/C] | # | [A/C] | # | [T/G] | # +----------+ # fix observed mysql> update snpArrayIllumina300 set observed = "A/C" where observed = "[A/C]"; mysql> update snpArrayIllumina550 set observed = "A/C" where observed = "[A/C]"; mysql> update snpArrayIllumina650 set observed = "A/C" where observed = "[A/C]"; mysql> update snpArrayIllumina300 set observed = "A/G" where observed = "[A/G]"; mysql> update snpArrayIllumina550 set observed = "A/G" where observed = "[A/G]"; mysql> update snpArrayIllumina650 set observed = "A/G" where observed = "[A/G]"; mysql> update snpArrayIllumina300 set observed = "C/T" where observed = "[T/C]"; mysql> update snpArrayIllumina550 set observed = "C/T" where observed = "[T/C]"; mysql> update snpArrayIllumina650 set observed = "C/T" where observed = "[T/C]"; mysql> update snpArrayIllumina300 set observed = "G/T" where observed = "[T/G]"; mysql> update snpArrayIllumina550 set observed = "G/T" where observed = "[T/G]"; mysql> update snpArrayIllumina650 set observed = "G/T" where observed = "[T/G]"; # Note 2 rows in 300 and 15 rows in 550 and 650 where chrom = "chrXY" # validation /cluster/home/heather/kent/src/hg/snp/snpLoad/illuminaLookup hg18 snpArrayIllumina300 snp126 snp126Exceptions illuminaLookup.hg18.300 /cluster/home/heather/kent/src/hg/snp/snpLoad/illuminaLookup hg18 snpArrayIllumina550 snp126 snp126Exceptions illuminaLookup.hg18.550 /cluster/home/heather/kent/src/hg/snp/snpLoad/illuminaLookup hg18 snpArrayIllumina650 snp126 snp126Exceptions illuminaLookup.hg18.650 # Not found: 2 in 300, 15 in 550 and 650 # These are in snp127 # Mixed: 55 in 300, 74 in 550, 81 in 650 # Found 2 strange things here: # First of all, for snps that are illumina forward strand, dbSNP 
reverse strand: # in all cases, the observed polymorphism is identical. # Counts: # 36k on the HumanHap300v3 # 52k on the HumanHap550v3 # 59k on the HumanHap650v3 # This surprises me, because the dbSNP observation is intended to be reverse-complemented. # Examples from HumanHap300v3 include rs1000007, rs1000031, rs1000041, rs1000071, rs1000078. # Secondly, for snps that are illumina reverse strand: # in all cases is that your observed polymorphism is the reverse complement of the dbSNP polymorphism. # this could only make sense for the dbSNP forward strand OR the dbSNP reverse strand, although I don't think it matters which one. # examples: # rs3934834: illumina A/G (-), dbSNP C/T (+) # rs6687776: illumina A/G (-), dbSNP C/T (+) # rs2298217: illumina A/G (-), dbSNP C/T (+) # rs9442380: illumina A/G (-), dbSNP C/T (+) # rs3737728: illumina A/G (-), dbSNP C/T (-) # rs3813199: illumina A/G (-), dbSNP C/T (-) # rs880051: illumina A/G (-), dbSNP C/T (-) # rs12562034: illumina C/T (-), dbSNP A/G (+) # rs9442372: illumina C/T (-), dbSNP A/G (+) # rs11260588: illumina C/T (-), dbSNP A/G (+) # rs12726255: illumina C/T (-), dbSNP A/G (+) # rs2887286: illumina C/T (-), dbSNP A/G (-) # rs2649588: illumina C/T (-), dbSNP A/G (-) # rs2296716: illumina C/T (-), dbSNP A/G (-) # rs2474460: illumina C/T (-), dbSNP A/G (-) # redo this, just using name/chrom/pos from illumina bed2.pl < 300.in > 300.bed.2 hgLoadBed hg18 snpArrayIllumina300Prelim 300.bed.2 -tab /cluster/home/heather/kent/src/hg/snp/snpLoad/illuminaLookup2 hg18 snpArrayIllumina300Prelim snp126 snp126Exceptions mv illuminaLookup.out lookup.300 mv illuminaLookup.err lookup.300.err hgLoadBed hg18 snpArrayIllumina300 lookup.300 -tab -sqlTable=snpArrayIllumina300.sql hgsql -N -e 'drop table snpArrayIllumina300Prelim' hg18 bed2.pl < 550.in > 550.bed.2 hgLoadBed hg18 snpArrayIllumina550Prelim 550.bed.2 -tab /cluster/home/heather/kent/src/hg/snp/snpLoad/illuminaLookup2 hg18 snpArrayIllumina550Prelim snp126 snp126Exceptions mv illuminaLookup.err lookup.550.err mv illuminaLookup.out lookup.550 hgLoadBed hg18 snpArrayIllumina550 lookup.550 -tab -sqlTable=snpArrayIllumina550.sql hgsql -N -e 'drop table snpArrayIllumina550Prelim' hg18 bed2.pl < 650.in > 650.bed.2 hgLoadBed hg18 snpArrayIllumina650Prelim 650.bed.2 -tab /cluster/home/heather/kent/src/hg/snp/snpLoad/illuminaLookup2 hg18 snpArrayIllumina650Prelim snp126 snp126Exceptions mv illuminaLookup.out lookup.650 mv illuminaLookup.err lookup.650.err hgLoadBed hg18 snpArrayIllumina650 lookup.650 -tab -sqlTable=snpArrayIllumina650.sql hgsql -N -e 'drop table snpArrayIllumina650Prelim' hg18 # add indices mysql> alter table snpArrayIllumina300 add index name (name); mysql> alter table snpArrayIllumina300 add index chrom (chrom, bin); mysql> alter table snpArrayIllumina550 add index name (name); mysql> alter table snpArrayIllumina550 add index chrom (chrom, bin); mysql> alter table snpArrayIllumina650 add index name (name); mysql> alter table snpArrayIllumina650 add index chrom (chrom, bin); ########################################################################## # Added gvPos table for Locus Variants (Belinda Giardine Sept 2006) # This uses the gv* tables in hgFixed for the related data. The track has # been on hg17, just added to hg18. Most variants were mapped directly to # hg18 only the LSDB BGMUT was lifted using liftOver. 
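# (Hypothetical sketch of the BGMUT lift mentioned above; the bed file names
#  are assumptions, and hg17 is assumed as the source assembly.  The chain
#  file is the standard hg17-to-hg18 liftOver chain.)
liftOver bgmutPos.hg17.bed /gbdb/hg17/liftOver/hg17ToHg18.over.chain.gz bgmutPos.hg18.bed bgmutPos.unmapped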
# Update, reloaded table Dec 2006 Belinda Giardine # new entries for previous sources and more IDbases # Update, reloaded table January 2007 Belinda Giardine # new source (first set of LOVD) and some fixes to IDbases and HbVar # Update most LSDBs, add more genes for LMDp(LOVD) Jan 11, 2008 # loaded and tested first at PSU #update old dbs and add dbPEX March 22-23, 2007 #need to truncate and reload all tables (new entries in old) #prepare positions for loading cd gvNov2006 cat gvPosARdb.hg17.txt gvPosSrd5a2.hg17.txt gvPosPah.hg17.txt > ../gvMar2007/gvPosNov2006.hg17.txt cd ../gvMar2007 cat ../gvJan2007/gvPosLOVD.hg17.txt *.hg17.txt > gvPos.Hg17.txt grep "^chr" gvPos.Hg17.txt | sort -k1,1 -k2,2n > gvPosSorted.Hg17.bed cd gvNov2006 cat gvPosARdb.hg18.txt gvPosSrd5a2.hg18.txt gvPosPah.hg18.txt > ../gvMar2007/gvPosNov2006.hg18.txt cd ../gvMar2007 cat ../gvJan2007/gvPosLOVD.hg18.txt *.hg18.txt > gvPos.Hg18.txt grep "^chr" gvPos.Hg18.txt | sort -k1,1 -k2,2n > gvPosSorted.Hg18.bed #run checks ~giardine/gv/checkLinksRaFile.pl /cluster/store6/giardine/gvMar2007/ ~giardine/gv/checkSeq.pl hg18 < gvPos.Hg18.txt > errors.txt ~giardine/gv/checkSeq.pl hg17 < gvPos.Hg17.txt > errors17.txt #start reload hgsql hgFixed < emptyTables.sql #copy and paste from reloadHgFixed.txt #load new dbs hgLoadSqlTab -oldTable hgFixed gv ~/humPhen/kent/src/hg/lib/gv.sql gvRettBASE.txt hgLoadSqlTab -oldTable hgFixed gvAttr ~/humPhen/kent/src/hg/lib/gv.sql gvAttrRettBASE.txt hgLoadSqlTab -oldTable hgFixed gvLink ~/humPhen/kent/src/hg/lib/gv.sql gvLinkRettBASE.txt hgLoadSqlTab -oldTable hgFixed gv ~/humPhen/kent/src/hg/lib/gv.sql gvdbPEX.txt hgLoadSqlTab -oldTable hgFixed gvAttr ~/humPhen/kent/src/hg/lib/gv.sql gvAttrdbPEX.txt hgLoadSqlTab -oldTable hgFixed gvLink ~/humPhen/kent/src/hg/lib/gv.sql gvLinkdbPEX.txt #load position tables hgLoadBed hg18 gvPos gvPosSorted.Hg18.bed -noSort -oldTable -tab hgLoadBed hg17 gvPos gvPosSorted.Hg17.bed -noSort -oldTable -tab #run remaining checks select distinct attrType from gvAttr; select distinct attrType from gvLink; #and compare against gvAttrTypeKey in hg/lib/gvUi.c ~/gv/joinerChecks.pl table1 IDfield1 table2 IDfield2 #for gv, gvPos, gvSrc, gvAttr, and gvLink #script to check for non unique rows in database ~/gv/uniqueCheck.pl gvAttr > gvAttrNonunique.txt ~/gv/uniqueCheck.pl gvLink > gvLinkNonunique.txt #add IPNMDB and reload LOVD with more genes April 12, 2007 cat *.hg17.txt > gvPos.Hg17.txt grep "^chr" gvPos.Hg17.txt | sort -k1,1 -k2,2n > gvPosSorted.Hg17.bed cat *.hg18.txt > gvPos.Hg18.txt grep "^chr" gvPos.Hg18.txt | sort -k1,1 -k2,2n > gvPosSorted.Hg18.bed #run checks ~giardine/gv/checkLinksRaFile.pl /cluster/store6/giardine/gvMar2007/ ~giardine/gv/checkSeq.pl hg18 < gvPos.Hg18.txt > errors.txt ~giardine/gv/checkSeq.pl hg17 < gvPos.Hg17.txt > errors17.txt #remove old LOVD entries hgsql hgFixed delete from gvLink where id like 'FKRP%'; delete from gvAttr where id like 'FKRP%'; delete from gv where id like 'FKRP%'; insert into gvSrc values ('IPNMDB', 'LSDB', 'Mutation Database of Inherited Peripheral Neuropathies'); #load new dbs hgLoadSqlTab -oldTable hgFixed gv ~/humPhen/kent/src/hg/lib/gv.sql gvLOVD.txt hgLoadSqlTab -oldTable hgFixed gvAttr ~/humPhen/kent/src/hg/lib/gv.sql gvAttrLOVD.txt hgLoadSqlTab -oldTable hgFixed gvLink ~/humPhen/kent/src/hg/lib/gv.sql gvLinkLOVD.txt hgLoadSqlTab -oldTable hgFixed gv ~/humPhen/kent/src/hg/lib/gv.sql gvIPNMDB.txt hgLoadSqlTab -oldTable hgFixed gvAttr ~/humPhen/kent/src/hg/lib/gv.sql gvAttrIPNMDB.txt adSqlTab -oldTable hgFixed gvLink 
~/humPhen/kent/src/hg/lib/gv.sql gvLinkIPNMDB.txt hgsql hg18 truncate table gvPos; hgsql hg17 truncate table gvPos; #load position tables hgLoadBed hg18 gvPos gvPosSorted.Hg18.bed -noSort -oldTable -tab hgLoadBed hg17 gvPos gvPosSorted.Hg17.bed -noSort -oldTable -tab #run remaining checks select distinct attrType from gvAttr; select distinct attrType from gvLink; #and compare against gvAttrTypeKey in hg/lib/gvUi.c ~/gv/joinerChecks.pl table1 IDfield1 table2 IDfield2 #for gv, gvPos, gvSrc, gvAttr, and gvLink #script to check for non unique rows in database ~/gv/uniqueCheck.pl gvAttr ~/gv/uniqueCheck.pl gvLink #found missing common names hgLoadSqlTab -oldTable hgFixed gvAttr ~/humPhen/kent/src/hg/lib/gv.sql gvAttrIPNMDBcommonName.txt ########################################################################## # hars 1 to 202 Sol 09/10/2006 set bedDir = /gbdb/hg18/haseq/bed mkdir -p $bedDir/hars pushd /projects/hg/wet/Sol/hars1to49 cp -p hars_1to202.hg18.bed $bedDir/hars/hars_1to202.bed hgLoadBed hg18 hars $bedDir/hars/hars_1to202.bed rm -f $bedDir/hars/hars_1to202.bed popd # BUILD HPRD DATA FOR KNOWN GENE DETAILS PAGE LINKS (DONE 9/11/06) # Download HPRD_XML_060106.tar.gz from www.hprd.org gzip -d HPRD_XML_060106.tar.gz tar -xvf HPRD_XML_060106.tar.gz # This will create 18838 xxxx.xml files under HPRD_XML_060106 # Create hprdToCdna table echo 'grep -H entry_cdna HPRD_XML_060106/$1.xml' >do1Cdna ls HPRD_XML_060106 >j cat j |sed -e 's/.xml/\tdo1Cdna/g' >jj cut -f 1 jj >j.2 cut -f 2 jj >j.1 paste j.1 j.2 >doAllCdna chmod +x do* ./doAllCdna >j.cdna cat j.cdna| sed -e 's/\//\t/' | sed -e 's/.xml/\t/' |\ sed -e 's//\t/' | sed -e 's/<\//\t/'| sed -e 's/\./\t/'| cut -f 2,4|\ grep -v None >hprdToCdna.tab hgsql hg18 -e 'drop table hprdToCdna' hgsql hg18 <~/src/hg/lib/hprdToCdna.sql hgsql hg18 -e 'load data local infile "hprdToCdna.tab" into table hprdToCdna' # Create hprdToUniProt table echo 'fgrep -H Swiss HPRD_XML_060106/$1.xml' >do1 ls HPRD_XML_060106 >j cat j |sed -e 's/.xml/\tdo1/g' >jj cut -f 1 jj >j.2 cut -f 2 jj >j.1 paste j.1 j.2 >doall chmod +x do* ./doall >j.out cat j.out|grep SwissProt | sed -e 's/\//\t/' | sed -e 's/.xml/\t/' | \ sed -e 's/Prot>/\t/' | sed -e 's/<\//\t/'| cut -f 2,4|grep -v None >hgrdToUniProt.tab hgsql hg18 -e 'drop table hprdToUniProt' hgsql hg18 <~/src/hg/lib/hprdToUniProt.sql hgsql hg18 -e 'load data local infile "hprdToUniProt.tab" into table hprdToUniProt' # build knownToHprd table hgsql hg18 -N -e 'select kgId,hprdId from hprdToCdna, kgXref where cdnaId=kgId' >j.kg1 hgsql hg18 -N -e 'select kgId,hprdId from hprdToUniProt, kgXref where uniProtId=spId' >j.kg2 cat j.kg1 j.kg2 |sort -u >knownToHprd.tab wc knownToHprd.tab hgsql hg18 -e 'drop table knownToHprd' hgsql hg18 <~/src/hg/lib/knownToHprd.sql hgsql hg18 -e 'load data local infile "knownToHprd.tab" into table knownToHprd' hgsql hg18 -e 'select count(*) from knownToHprd' # 19,646 records created. # remove temporary files. rm j* # Do the same for hg17. See hg17.txt for details. ########################################################################## # ORegAnno: oreganno, oregannoAttr, oregannoLink # Belinda Giardine August 3, 2007 # updated Oct 26, 2007 # updated July 7, 2008 # This has regulatory annotations from ORegAnno. # Get updated file from ORegAnno wiki page # http://www.bcgsc.ca/wiki/display/oreganno/DataFiles # Parse flat file into 3 tables, truncate tables, load. # Has other species but only Human, Fly, sacSer1 has enough entries for now. 
cd /cluster/store6/giardine/oreganno/20071026/ ~giardine/oreganno/parseOra hg18 < oreganno_UCSC_25Oct07.txt hgsql hg18 truncate table oreganno; truncate table oregannoAttr; truncate table oregannoLink; quit; grep "^chr" oreganno.hg18.txt | sort -k1,1 -k2,2n > oreganno.bed hgLoadBed hg18 oreganno oreganno.bed -noSort -oldTable -tab hgLoadSqlTab -oldTable hg18 oregannoAttr ~/humPhen/kent/src/hg/lib/oreganno.sql oregannoAttr.hg18.txt hgLoadSqlTab -oldTable hg18 oregannoLink ~/humPhen/kent/src/hg/lib/oreganno.sql oregannoLink.hg18.txt ########################################################################## # LIFT ACEMBLY FROM HG17 TO HG18 (DONE, Fan, 9/28/06) # OBSOLETED BY LOAD OF NEW DATA, SEE BELOW 8/28/07 angie # get acembly data from hg17 hgsql hg17 -N -e 'select * from acembly' >hg17Acembly.gp # lift to hg18 zcat /gbdb/hg17/liftOver/hg17ToHg18.over.chain.gz | \ liftOver hg17Acembly.gp stdin acembly.gp unMapped.gp -genePred # load the genePred table ldHgGene hg18 acembly -predTab acembl.gp # get acemblyPep and acemblyClass table from hg17 and load them into hg18. hgsql hg17 -N -e 'select * from acemblyPep' >acemblyPep.tab hgsql hg18 -e 'drop table acemblyPep' hgsql hg18 < ~/src/hg/lib/acemblyPep.sql hgsql hg18 -e 'load data local infile "acemblyPep.tab" into table acemblyPep' hgsql hg17 -N -e 'select * from acemblyClass' >acemblyClass.tab hgsql hg18 -e 'drop table acemblyClass' hgsql hg18 < ~/src/hg/lib/acemblyClass.sql hgsql hg18 -e 'load data local infile "acemblyClass.tab" into table acemblyClass' ########################################################################## # LIFT RNAGENE FROM HG17 TO HG18 (DONE, Robert, 10/3/06) mkdir /cluster/data/hg18/bed/rnaGene cd /cluster/data/hg18/bed/rnaGene hgsql hg18 < rnaGene.sql liftOver ~/hg17/rnaGene/rnaGenes.tab /gbdb/hg17/liftOver/hg17ToHg18.over.chain.gz rnaGenes.bed unmapped -bedPlus=10 -tab hgLoadBed hg18 rnaGene rnaGenes.bed -oldTable -tab -noBin ########################################################################## # SWAP/CHAIN/NET GASACU1 (DONE 10/17/06 angie) ssh kkstore02 mkdir /cluster/data/hg18/bed/blastz.gasAcu1.swap cd /cluster/data/hg18/bed/blastz.gasAcu1.swap doBlastzChainNet.pl -swap /cluster/data/gasAcu1/bed/blastz.hg18/DEF \ -chainMinScore=2000 -chainLinearGap=loose >& do.log & tail -f do.log ln -s blastz.gasAcu1.swap /cluster/data/hg18/bed/blastz.gasAcu1 nice featureBits hg18 chainGasAcu1Link #55424609 bases of 2881515245 (1.923%) in intersection ########################################################################## # YALE TRANSCRIPTIONALLY ACTIVE REGIONS (TARs/TransFrags) TRACK IDENTIFIED # # USING A WHOLE GENOME TILING ARRAY (DONE, 2006-10-12 - 2006-10-13, hartera) # Data is from the paper: Bertone et al. Science 24 December 2004: # Vol. 306. no. 5705, pp. 2242 - 2246. From Mark Gerstein's lab at Yale. # Contact at Yale: Joel S. Rozowsky, joel.rozowsky at yale.edu # The data consist of Transcriptionally Active Regions (TARs or TransFrags) # found using Affymetrix genome tiling arrays. The data is from the lab # of Mark Gerstein at Yale. ssh kkstore02 mkdir /cluster/data/hg18/bed/yaleBertoneTars/ cd /cluster/data/hg18/bed/yaleBertoneTars/ # download Bertone et al. data from this URL: #http://dart.gersteinlab.org/cgi-bin/ar/download.cgi?ID=TAR_data_NCBI31.txt # and put it in this directory. # The sequences used to design the microarrays were from # UCSC hg13/NCBI Build 31 so the sequences # should be aligned again using Blat since this is probably better # than using liftOver across so many assemblies. 
# Get sequences from TARs file and put in FASTA format: # Remove characters from Windows: dos2unix TAR_data_NCBI31.txt # The TARs are in order of IDs in the file so the first TAR has ID 1, the # second is 2 up to the last which is 17517. These IDs are used to link # to the DART database of TARs at Yale so use these IDs in the FASTA # header lines. Need to add "TAR" as prefix to ID so that it is unique # in the seq table. awk 'BEGIN {FS="\t";n=0;}{if ($1 ~ /^chr/) print ">TAR"n"\n"$14"\n";n++;}' \ TAR_data_NCBI31.txt > yaleBertoneTARSeqs.fa ssh pk mkdir -p /san/sanvol1/scratch/hg18/TARs/ cp /cluster/data/hg18/bed/yaleBertoneTars/yaleBertoneTARSeqs.fa \ /san/sanvol1/scratch/hg18/TARs/ # Set up to Blat the TAR sequences against hg18 cd /cluster/data/hg18/bed/yaleBertoneTars ls -1 /san/sanvol1/scratch/hg18/TARs/yaleBertoneTARSeqs.fa > tars.lst ls -1 /san/sanvol1/scratch/hg18/nib/*.nib > genome.lst # output dir mkdir psl cat << '_EOF_' > template.sub #LOOP /cluster/bin/x86_64/blat -repeats=lower -minIdentity=90 -ooc=/san/sanvol1/scratch/hg18/11.ooc $(path1) $(path2) {check out line+ psl/$(root1)_$(root2).psl} #ENDLOOP '_EOF_' # << for emacs gensub2 genome.lst tars.lst template.sub para.spec para create para.spec para try, para check, para push ... para time # Completed: 49 of 49 jobs #CPU time in finished jobs: 396s 6.61m 0.11h 0.00d 0.000y #IO & Wait Time: 198s 3.29m 0.05h 0.00d 0.000 y #Average job time: 12s 0.20m 0.00h 0.00d #Longest running job: 0s 0.00m 0.00h 0.00d #Longest finished job: 39s 0.65m 0.01h 0.00d #Submission to last job: 253s 4.22m 0.07h 0.00d # sort and then filter pslSort dirs raw.psl tmp psl # use these parameters as for Genbank alignments of native mRNAs # for finished assemblies. pslCDnaFilter -minId=0.96 -minCover=0.25 -localNearBest=0.001 \ -minQSize=20 -minNonRepSize=16 -ignoreNs -bestOverlap \ raw.psl yaleBertoneTars.psl # seqs aligns # total: 17512 38243 # drop minNonRepSize: 159 403 # drop minIdent: 3822 14798 # drop minCover: 563 895 # weird over: 242 832 # kept weird: 204 210 # drop localBest: 2410 4018 # kept: 17469 18129 # 99.75% were kept. # check how many aligned grep '>' yaleBertoneTARSeqs.fa | wc -l # 17517 # 99.7% of the original set of sequences are in this filtered PSL file. pslCheck yaleBertoneTars.psl # psl is ok # load into database ssh hgwdev cd /cluster/data/hg18/bed/yaleBertoneTars hgLoadPsl hg18 yaleBertoneTars.psl # Add sequences to /gbdb/hg18 and to seq and extFile tables. mkdir -p /gbdb/hg18/yaleTARs/ ln -s /cluster/data/hg18/bed/yaleBertoneTars/yaleBertoneTARSeqs.fa \ /gbdb/hg18/yaleTARs/ hgLoadSeq hg18 /gbdb/hg18/yaleTARs/yaleBertoneTARSeqs.fa # Add trackDb.ra entry to trackDb/human/trackDb.ra and create # a description page. ############################################################################## # Update upstream maf files, fixing a problem of RefSeq ID being trucated. 
(2006-10-20 Fan) ssh hgwdev cd /cluster/data/hg18/bed/multiz17way cd mafDownloads # upstream mafs (mafFrags takes a while) cat > mafFrags.csh << 'EOF' date foreach i (1000 2000 5000) echo "making upstream$i.maf" nice featureBits hg18 refGene:upstream:$i -fa=/dev/null -bed=up.bad cat up.bad|sed -e "s/_up_${i}_/\t/" >up.bad2 awk -F '\t' '{printf("%s\t%s\t%s\t%s\t%s\t%s\n", $1, $2, $3, $4, 0, $6)}' up.bad2 > up.bed rm up.bad up.bad2 nice mafFrags hg18 multiz17way up.bed upstream$i.maf \ -orgs=/cluster/store11/gs.19/build36/bed/multiz17way.2006-02-18/species.lst rm up.bed end date 'EOF' # << happy emacs time csh mafFrags.csh > mafFrags.log nice gzip up*.maf md5sum up*.gz >> md5sum.txt ######################################################################### # BLASTZ/CHAIN/NET FELCAT3 (Done Nov 09 2006 heather) # working in /cluster/data/felCat3 because /cluster/data/hg18 is 96% full # make this a link in /cluster/data/hg18 mkdir /cluster/data/felCat3/bed/blastz.hg18.2006-11-09 ln -s /cluster/data/felCat3/bed/blastz.hg18.2006-11-09 /cluster/data/hg18/bed/blastz.felCat3 cd /cluster/data/felCat3/bed/blastz.hg18.2006-11-09 cat << '_EOF_' > DEF BLASTZ_M=50 # TARGET: Human Hg18 # Can we use 2bit here? SEQ1_DIR=/scratch/hg/hg18/nib SEQ1_LEN=/scratch/hg/hg18/chrom.sizes SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY: Cat felCat3 SEQ2_DIR=/san/sanvol1/scratch/felCat3/felCat3.2bit SEQ2_LEN=/san/sanvol1/scratch/felCat3/chrom.sizes # Maximum number of scaffolds that can be lumped together SEQ2_LIMIT=500 SEQ2_CHUNK=30000000 SEQ2_LAP=0 BASE=/cluster/data/felCat3/bed/blastz.hg18.2006-11-09 TMPDIR=/scratch/tmp '_EOF_' # << this line keeps emacs coloring happy doBlastzChainNet.pl DEF \ -bigClusterHub pk \ -chainMinScore=3000 -chainLinearGap=medium \ -blastzOutRoot /cluster/bluearc/felCat3/blastz.hg18 >& do.log & tail -f do.log nice featureBits -chrom=chr1 hg18 chainFelCat3Link # 86932463 bases of 224999719 (38.637%) in intersection # reciprocal best net mafs for multiz ~/kent/src/hg/utils/automation/doRecipBest.pl hg18 felCat3 >&! rbest.log & ######################################################################### # BLASTZ/CHAIN/NET BOSTAU3 (Done Feb 2007 heather) mkdir /cluster/data/hg18/bed/blastz.bosTau3.2007-02-23 ln -s /cluster/data/hg18/bed/blastz.bosTau3.2007-02-23 /cluster/data/hg18/bed/blastz.bosTau3 cd /cluster/data/hg18/bed/blastz.bosTau3 cat << '_EOF_' > DEF BLASTZ_M=50 # TARGET: Human Hg18 SEQ1_DIR=/san/sanvol1/scratch/hg18/nib SEQ1_LEN=/cluster/data/hg18/chrom.sizes SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY: Cow bosTau3 SEQ2_DIR=/san/sanvol1/scratch/bosTau3/bosTau3.2bit SEQ2_LEN=/san/sanvol1/scratch/bosTau3/chrom.sizes # Maximum number of scaffolds that can be lumped together SEQ2_LIMIT=500 SEQ2_CHUNK=50000000 SEQ2_LAP=0 BASE=/cluster/data/hg18/bed/blastz.bosTau3.2007-02-23 TMPDIR=/scratch/tmp '_EOF_' # << this line keeps emacs coloring happy doBlastzChainNet.pl DEF \ -bigClusterHub pk \ -chainMinScore=3000 -chainLinearGap=medium \ -blastzOutRoot /cluster/bluearc/bosTau3/blastz.hg18 >& do.log & tail -f do.log nice featureBits -chrom=chr1 hg18 chainBosTau3Link # 114562908 bases of 224999719 (50.917%) in intersection ############################################################################## # MAKE tfbsConsSites and tfbsConsFactors for TFBS conserved track (DONE weirauch 11/19/06) # Questions? 
weirauch at soe.ucsc.edu or braney at soe.ucsc.edu ssh hgwdev mkdir /cluster/data/hg18/bed/tfbsCons cd /cluster/data/hg18/bed/tfbsCons # Define all parameters in 'PARAMS.txt' # Define all chromosomes in 'CHROMS.txt' # Get tfbsConsUtils.tar.gz from Matt Weirauch with Perl scripts weirauch at soe.ucsc.edu set tarfile=/cluster/data/hg18/bed/tfbsCons/tfbsConsUtils.tar.gz tar zxf $tarfile nice ./getRefseqStats.pl & nice ./getBatchQueries.pl & ssh kk mkdir /cluster/bluearc/braney/tfloc # Copy ./tmp/ctfbs_batch_list.txt to this dir # Copy ./scripts/doit to this dir para create ctfbs_batch_list.txt para try para push # When the run is done (within a day or so), the results will be in individual dirs, one for each chromosome. ssh kksilo (or hgwdev, or whatever) nice ./getBedFile.pl & hgLoadBed -noSort hg18 tfbsConsSites -sqlTable=$HOME/kent/src/hg/lib/tfbsConsSites.sql tfbsConsSites.bed -tab hgLoadBed -noSort hg18 tfbsConsFactors -sqlTable=$HOME/kent/src/hg/lib/tfbsConsFactors.sql tfbsConsFactors.bed -tab # Feel free to delete or gzip anything in ./tmp (particularly the huge .maf and .bed files) after the final two bed files are sucessfully loaded # fixed up the tfbsConsSites.bed file to remove extra indexes, then: hgsql -e "drop index chrom_2 on tfbsConsSites;" hg18 hgsql -e "drop index chrom_3 on tfbsConsSites;" hg18 # the tfbsConsFactors table had extra names, they were removed: for N in `cat extra.tfbsConsFactors.name` do echo "delete from tfbsConsFactors where name=\"${N}\";" hg18 hgsql -e "delete from tfbsConsFactors where name=\"${N}\";" hg18 done # the extra names were: # B$CRP_C F$DDE1_B F$STRE_01 P$GBP_Q6 V$ACAAT_B V$APOLYA_B V$ATATA_B # V$BARBIE_01 V$BEL1_B V$CAAT_01 V$CAAT_C V$CAP_01 V$DTYPEPA_B V$E2F_Q2 # V$ETF_Q6 V$ETS_Q6 V$GC_01 V$GEN_INI2_B V$GEN_INI3_B V$GEN_INI_B V$HFH8_01 # V$HOGNESS_B V$LBP1_Q6 V$LDSPOLYA_B V$LEF1_Q2 V$LPOLYA_B V$MEF3_B V$MINI19_B # V$MINI20_B V$MTATA_B V$MUSCLE_INI_B V$PADS_C V$PEA3_Q6 V$POLY_C V$SRY_01 # V$STAT4_01 V$STAT5A_03 V$STAT5A_04 V$STAT6_02 V$TAACC_B V$TANTIGEN_B # V$TEF1_Q6 V$USF2_Q6 # And re-load once again since the above data was based on transfac data that # is too new (2006-11-03 - Hiram) cd /cluster/data/hg18/bed/tfbsCons hgLoadBed -tab -strict hg18 tfbsConsSites \ -sqlTable=$HOME/kent/src/hg/lib/tfbsConsSites.sql tfbsConsSites.bed # And this leads once again to a bunch of extra names in Factors hgsql -N -e "select name from tfbsConsSites;" hg18 | sort -u > names.new hgsql -N -e "select name from tfbsConsFactors;" hg18 \ | sort -u > names.factors comm -13 names.new names.factors > names.extra.factors for N in `cat names.extra.factors` do echo "delete from tfbsConsFactors where name=\"${N}\";" hg18 hgsql -e "delete from tfbsConsFactors where name=\"${N}\";" hg18 done # Reload tfbsCons to correct errors (2007-07-17 - Hiram) cd /cluster/data/hg18/bed/tfbsCons hgLoadBed -tab hg18 tfbsConsSites \ -sqlTable=$HOME/kent/src/hg/lib/tfbsConsSites.sql tfbsConsSites.bed hgsql -N -e "select name from tfbsConsSites;" hg18 | sort -u \ > names.new.2007-07-17 # showing zero difference still, nothing more to be done comm -13 names.new.2007-07-17 names.factors ############################################################################## # REWORK PLACE ASSEMBLY CLONES ON CONTIGS AND SEQUENCE # (WORKING - 2006-10-23 - Hiram) # five different cluster runs are described here for different classes # of clones # runPlacedNotSplit - all placed clones split or not split with blat # runFish - 392 fish clones against all 378 contigs, with blat # runUnPlaced - 14,569 
clones on known contigs - with psLayout # runUnPlacedChr - 297 clones on known chroms - with psLayout # runLastOnes - 1,877 clones against 378 contigs - with blat # The original run of this forgot to split of the BAC clones that were just # a fasta file full of unordered pieces. They need to be split up # to work properly. ssh pk mkdir /san/sanvol1/scratch/hg18/coverage.2006-10-23 cd /san/sanvol1/scratch/hg18/coverage.2006-10-23 # Going to copy over the BAC clones from the previous runs and split # them up if they have too many N's (>100) (indicating pieces) # This may actually split up a couple of BACs that are not actually # pieces, but in the cases I could find, and they were rare, the big # BACs appear to break into only two pieces. # The first set to do are the clones that were used in the assembly # Since they were placed, we know where they all belong. Only 50 of # them end up being split, and then usually only in 2 pieces. # We could tediously go through each of these 50 and determine if they # are actually unordered pieces. Although this raises the question, # how could unordered pieces be used in the assembly ? Doesn't make any # sense. cat << '_EOF_' > placedClones.sh #!/bin/sh D0=placedNotSplit D1=placedSplit export D0 D1 find ../coverage/placedClones -type f | grep -v faCount.all.txt | while read F do BN=`basename "${F}"` DN=`dirname "${F}"` CHROM=`basename "${DN}"` Ncount=`faSize "${F}" | sed -e "s/N's.*//; s/.* bases (//;"` if [ "${Ncount}" -gt 99 ]; then out="${D1}/${CHROM}/${BN}" mkdir -p ${D1}/${CHROM} echo "gapSplit -minGap=100 ${F} ${out}" gapSplit -minGap=100 ${F} stdout | gzip > ${out} faSize "${F}" faSize "${out}" else out="${D0}/${CHROM}/${BN}" mkdir -p ${D0}/${CHROM} echo "cp -p ${F} ${out}" cp -p ${F} ${out} fi done '_EOF_' # << happy emacs # Going to use blat this time instead of psLayout # It is faster and appears to do just about the same exact job mkdir runPlacedNotSplit cd runPlacedNotSplit # Re-use the previous jobList sed -e "s/runPsLayout.sh/runBlat.csh/" \ ../../coverage/runPlaced/masterJobList > jobList cat << '_EOF_' > runBlat.csh #!/bin/csh -fe set chrom = $1 set clone = $2 set contig = $3 set result = $4 set target = /san/sanvol1/scratch/hg18/coverage/maskedContigs/$chrom/$contig.fa.gz set query = /san/sanvol1/scratch/hg18/coverage.2006-10-23/placedNotSplit/$chrom/$clone.fa.gz if ( ! -f $query ) then set query = /san/sanvol1/scratch/hg18/coverage.2006-10-23/placedSplit/$chrom/$clone.fa.gz endif set scrTmp = "/scratch/tmp/$contig/$clone" set ooc = /san/sanvol1/scratch/hg18/coverage/contigOoc/$contig.10.ooc mkdir -p "$scrTmp" zcat $target > $scrTmp/$contig.fa zcat $query > $scrTmp/$clone.fa cp -p $ooc $scrTmp/10.ooc pushd $scrTmp pwd ls -l blat -minIdentity=98 -fastMap -tileSize=10 -t=dna -q=dna -ooc=10.ooc $contig.fa $clone.fa $clone.psl popd mkdir -p psl/$chrom/$contig cp -p $scrTmp/$clone.psl $result rm $scrTmp/* rmdir $scrTmp rmdir --ignore-fail-on-non-empty /scratch/tmp/$contig '_EOF_' # << happy emacs para create jobList para try; para check; etc ... 
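# (added sketch, not part of the original run) placedClones.sh above only
# splits a clone when faSize reports more than 99 N's; a quick tally of the
# two output directories checks the "only 50 of them end up being split"
# note (run from this runPlacedNotSplit directory):
find ../placedSplit -type f | wc -l
find ../placedNotSplit -type f | wc -l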
para time # Completed: 27093 of 27093 jobs # CPU time in finished jobs: 435042s 7250.69m 120.84h 5.04d 0.014 y # IO & Wait Time: 74031s 1233.86m 20.56h 0.86d 0.002 y # Average job time: 19s 0.31m 0.01h 0.00d # Longest finished job: 463s 7.72m 0.13h 0.01d # Submission to last job: 3079s 51.32m 0.86h 0.04d # combine the results into one large raw.psl file time pslSort dirs raw.psl tmp psl/*/* ls -og raw.psl # -rw-rw-r-- 1 52067774 Oct 31 12:06 raw.psl # This raw.psl file will be included in the overall results, but as a # check, it is possible to turn just these results into a .bed file for # uploading as a custom track to take a look at them. time pslReps -nohead -nearTop=0.001 -singleHit \ raw.psl repsSingle.psl /dev/null clusterClone -allowDuplicates -agp -minCover=80 \ -maxGap=60000 repsSingle.psl > single.agp 2> single.out sort -k1,1 -k2,2n single.agp | ../../coverage/fixPhase.pl \ /cluster/data/hg18/bed/coverage/phase.txt > contig_overlaps.agp \ 2> singleToOverlaps.out awk -F'\t' '{printf "%s\t%s\t%s\t%s\t0\t%s\n", $1,$2,$3,$6,$9}' \ contig_overlaps.agp > cOverlaps.bed liftUp chrOverlaps.bed /san/sanvol1/scratch/hg18/bacends/liftContigs.lft \ warn cOverlaps.bed # Load up that chrOverlaps.bed as a custom track to see these results ################################################################## # The next big group are the FISH clones cd /san/sanvol1/scratch/hg18/coverage.2006-10-23 # Split or not split depending on gap count >= 100 cat << '_EOF_' > splitFishClones.sh #!/bin/sh D0=fishSplit export D0 find ../coverage/fishClones/sequence -type f | while read F do BN=`basename "${F}"` Ncount=`faSize "${F}" | sed -e "s/N's.*//; s/.* bases (//;"` if [ "${Ncount}" -gt 99 ]; then out="${D0}/fishPieces/${BN}" echo "gapSplit -minGap=100 ${F} ${out}" gapSplit -minGap=100 ${F} stdout | gzip > ${out} faSize "${F}" faSize "${out}" else out="${D0}/noPieces/${BN}" echo "cp -p ${F} ${out}" cp -p "${F}" "${out}" fi done '_EOF_' # << happy emacs mkdir fishSplit chmod +x splitFishClones.sh time ./splitFishClones.sh # combine them all into large fasta files to lower the file count cd fishSplit for F in fishPieces/* noPieces/* do zcat "${F}" done | gzip > all.fa.gz faSplit about all.fa.gz 500000 split/f_ mkdir /san/sanvol1/scratch/hg18/coverage.2006-10-23/runFish cd /san/sanvol1/scratch/hg18/coverage.2006-10-23/runFish ls ../fishSplit/split | sed -e "s/.fa.gz//" > fish.list ls /san/sanvol1/scratch/hg18/coverage/maskedContigs/?/* | \ sed -e \ "s#/san/sanvol1/scratch/hg18/coverage/maskedContigs/##; s#.fa.gz##" \ > contig.list ls /san/sanvol1/scratch/hg18/coverage/maskedContigs/??/* | \ sed -e \ "s#/san/sanvol1/scratch/hg18/coverage/maskedContigs/##; s#.fa.gz##" \ >> contig.list ls /san/sanvol1/scratch/hg18/coverage/maskedContigs/?_*/* | \ sed -e \ "s#/san/sanvol1/scratch/hg18/coverage/maskedContigs/##; s#.fa.gz##" \ >> contig.list ls /san/sanvol1/scratch/hg18/coverage/maskedContigs/??_*/* | \ sed -e \ "s#/san/sanvol1/scratch/hg18/coverage/maskedContigs/##; s#.fa.gz##" \ >> contig.list cat << '_EOF_' > template #LOOP ./runBlat.csh $(path1) $(path2) {check out line+ psl/$(root1)/$(root2).psl} #ENDLOOP '_EOF_' # << happy emacs cat << '_EOF_' > runBlat.csh #!/bin/csh -fe set target = /san/sanvol1/scratch/hg18/coverage/maskedContigs/$1.fa.gz set query = /san/sanvol1/scratch/hg18/coverage.2006-10-23/fishSplit/split/$2.fa.gz set contig = $target:t:r:r set fishPiece = $query:t:r:r set result = psl/$contig/$fishPiece.psl set scrTmp = "/scratch/tmp/$contig/$fishPiece" set ooc = 
/san/sanvol1/scratch/hg18/coverage/contigOoc/$contig.10.ooc mkdir -p "$scrTmp" zcat $target > $scrTmp/$contig.fa zcat $query > $scrTmp/$fishPiece.fa cp -p $ooc $scrTmp/10.ooc pushd $scrTmp pwd ls -l blat -fastMap -tileSize=10 -t=dna -q=dna -ooc=10.ooc $contig.fa $fishPiece.fa $fishPiece.psl popd mkdir -p psl/$contig cp -p $scrTmp/$fishPiece.psl $result rm $scrTmp/* rmdir $scrTmp rmdir --ignore-fail-on-non-empty /scratch/tmp/$contig '_EOF_' # << happy emacs chmod +x runBlat.csh para create contig.list fish.list template jobList para try; para create; etc ... para time # Completed: 148176 of 148176 jobs # CPU time in finished jobs: 2884533s 48075.56m 801.26h 33.39d 0.091 y # IO & Wait Time: 385142s 6419.03m 106.98h 4.46d 0.012 y # Average job time: 22s 0.37m 0.01h 0.00d # Longest finished job: 270s 4.50m 0.07h 0.00d # Submission to last job: 9510s 158.50m 2.64h 0.11d # put all the results together into a single file pslSort dirs raw.psl tmp psl/* # this is a big result ls -og raw.psl # -rw-rw-r-- 1 6972351482 Oct 25 16:25 raw.psl # can do the same thing as above to look at these results individually # not listed here ################################################################## # The next big group are the unplaced clones. In the original run, the # contig location of these items were inferred from Hg17 results, and # thus many of them can be aligned against their respective contig. For # some cases, the contig isn't known, but the chrom is, thus they can be # aligned to all the contigs for a chrom. And finally, those completely # unknown have to be aligned to all contigs. # There are two sections here, those for which contig details are # unknown, and those for which contigs are known. First, those for # which details are unknown: cd /san/sanvol1/scratch/hg18/coverage.2006-10-23 cat << '_EOF_' > splitUnplacedClones.sh #!/bin/sh find ../coverage/unPlacedClones -type f | while read F do BN=`basename "${F}"` DN=`dirname "${F}"` CONTIG=`basename "${DN}"` DN=`dirname "${DN}"` CHROM=`basename "${DN}"` out="unPlacedSplit/${CHROM}/${CONTIG}/${BN}" # echo "${CHROM}/${CONTIG}/${BN}" mkdir -p unPlacedSplit/${CHROM}/${CONTIG} Ncount=`faSize "${F}" | sed -e "s/N's.*//; s/.* bases (//;"` if [ "${Ncount}" -gt 99 ]; then echo "gapSplit -minGap=100 ${F} ${out}" gapSplit -minGap=100 ${F} stdout | gzip > ${out} faSize "${F}" faSize "${out}" fi done '_EOF_' # << happy emacs chmod +x splitUnplacedClones.sh mkdir unPlacedSplit time ./splitUnplacedClones.sh > unPlaced.out 2>&1 mkdir /san/sanvol1/scratch/hg18/coverage.2006-10-23/runUnPlacedChr cd /san/sanvol1/scratch/hg18/coverage.2006-10-23/runUnPlacedChr ls ../unPlacedSplit/*/XX*/*.fa.gz > bac.list cat << '_EOF_' > mkJobList.sh #!/bin/sh cat bac.list | while read F do CHR=`echo "${F}" | sed -e "s#.*unPlacedSplit/##; s#/.*##"` CLONE=`basename ${F} | sed -e "s/.fa.gz//"` case $CHR in U|Un) for C in /san/sanvol1/scratch/hg18/coverage/maskedContigs/? \ /san/sanvol1/scratch/hg18/coverage/maskedContigs/?? 
\ /san/sanvol1/scratch/hg18/coverage/maskedContigs/?_* \ /san/sanvol1/scratch/hg18/coverage/maskedContigs/??_* do CH=`basename ${C}` for CT in /san/sanvol1/scratch/hg18/coverage/maskedContigs/${CH}/* do CONTIG=`basename ${CT} | sed -e "s/.fa.gz//"` echo "./runPsLayout.sh $CH $CLONE $CONTIG {check out line+ psl/$ CH/$CONTIG/$CLONE.psl}" done done ;; *) for CT in /san/sanvol1/scratch/hg18/coverage/maskedContigs/${CHR}/* do CONTIG=`basename ${CT} | sed -e "s/.fa.gz//"` echo "./runPsLayout.sh $CHR $CLONE $CONTIG {check out line+ psl/$CHR /$CONTIG/$CLONE.psl}" done ;; esac '_EOF_' # << happy emacs chmod +x mkJobList.sh ./mkJobList.sh > jobList cat << '_EOF_' > runPsLayout.sh #!/bin/sh # runPsLayout.sh # where is the chrom this contig is on # is one of the .fa.gz files in # /san/sanvol1/scratch/hg18/coverage.2006-10-23/unPlacedSplit//.fa.gz # is one of the contigs found in: # /san/sanvol1/scratch/hg18/coverage/maskedContigs//.fa.gz # HERE=`pwd` CHROM=$1 CLONE=$2 CONTIG=$3 TARGET=/san/sanvol1/scratch/hg18/coverage/maskedContigs/$CHROM/$CONTIG.fa.gz CLONESRC=/san/sanvol1/scratch/hg18/coverage.2006-10-23/unPlacedSplit/$CHROM/XX_000000/$CLONE.fa.gz OOC=/san/sanvol1/scratch/hg18/coverage/maskedContigs/ooc/$CHROM/$CONTIG.10.ooc RESULT="${HERE}/psl/${CHROM}/${CONTIG}/${CLONE}.psl" export CHROM CLONE CONTIG TARGET CLONESRC RESULT mkdir -p psl/${CHROM}/${CONTIG} if [ ! -s ${CLONESRC} ]; then CLONESRC=/san/sanvol1/scratch/hg18/coverage.2006-10-23/unPlacedSplit/U/XX_000000/$CLONE.fa.gz if [ ! -s ${CLONESRC} ]; then CLONESRC=/san/sanvol1/scratch/hg18/coverage.2006-10-23/unPlacedSplit/Un/XX_000000/$CLONE.fa.gz if [ ! -s ${CLONESRC} ]; then echo "Can not find: ${CLONESRC}" 1>/dev/stderr exit 255 fi fi fi if [ ! -s ${TARGET} ]; then echo "Can not find: ${TARGET}" 1>/dev/stderr exit 255 fi if [ ! -s ${OOC} ]; then echo "Can not find: ${OOC}" 1>/dev/stderr exit 255 fi WRKDIR="/scratch/tmp/hg18_${CHROM}/${CONTIG}/${CLONE}" mkdir -p "${WRKDIR}" cd ${WRKDIR} zcat ${CLONESRC} > ${CLONE}.fa zcat ${TARGET} > ${CONTIG}.fa cp -p ${OOC} ./10.ooc /cluster/bin/x86_64/psLayout ${CONTIG}.fa ${CLONE}.fa genomic 10.ooc ${RESULT} RET=$? cd ${HERE} rm -fr ${WRKDIR} rmdir --ignore-fail-on-non-empty "/scratch/tmp/hg18_${CHROM}/${CONTIG}" rmdir --ignore-fail-on-non-empty "/scratch/tmp/hg18_${CHROM}" exit ${RET} '_EOF_' # << happy emacs chmod +x ./runPsLayout.sh mkdir psl para create jobList para try; para check; ... etc ... 
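# (added sketch, not part of the original run) each jobList line is
# "./runPsLayout.sh <chrom> <clone> <contig> ...", so a per-chromosome tally
# of field 2 is a cheap sanity check on the ~40k-job batch:
awk '{print $2}' jobList | sort | uniq -c | sort -rn | head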
para time # Completed: 40509 of 40509 jobs # CPU time in finished jobs: 5354801s 89246.69m 1487.44h 61.98d 0.170 y # IO & Wait Time: 115279s 1921.31m 32.02h 1.33d 0.004 y # Average job time: 135s 2.25m 0.04h 0.00d # Longest finished job: 164276s 2737.93m 45.63h 1.90d # Submission to last job: 187712s 3128.53m 52.14h 2.17d # combine into one result file pslSort dirs raw.psl tmp psl/*/* ################################################################## # Now, for those unplaced clones for which contig details are known ssh pk mkdir /san/sanvol1/scratch/hg18/coverage.2006-10-23/runUnPlaced cd /san/sanvol1/scratch/hg18/coverage.2006-10-23/runUnPlaced cat << '_EOF_' > mkJobList.sh #!/bin/sh find ../unPlacedSplit -type f | grep -v XX_ | while read F do BN=`basename ${F} | sed -e "s/.fa.gz//"` DN=`dirname ${F}` CONTIG=`basename ${DN}` DN=`dirname ${DN}` CHROM=`basename ${DN}` echo "./runPsLayout.sh ${CHROM} ${BN} ${CONTIG} {check out line+ psl/${CHROM }/${CONTIG}/${BN}.psl}" done '_EOF_' # << happy emacs chmod +x mkJobList.sh ./mkJobList.sh > jobList cat << '_EOF_' > runPsLayout.sh #!/bin/sh # runPsLayout.sh # where is the chrom this contig is on # is one of the .fa.gz files in # /san/sanvol1/scratch/hg18/coverage.2006-10-23/unPlacedSplit//.fa.gz # is one of the contigs found in: # /san/sanvol1/scratch/hg18/coverage/maskedContigs//.fa.gz # HERE=`pwd` CHROM=$1 CLONE=$2 CONTIG=$3 TARGET=/san/sanvol1/scratch/hg18/coverage/maskedContigs/$CHROM/$CONTIG.fa.gz CLONESRC=/san/sanvol1/scratch/hg18/coverage.2006-10-23/unPlacedSplit/$CHROM/$CONTIG/$CLONE.fa.gz OOC=/san/sanvol1/scratch/hg18/coverage/maskedContigs/ooc/$CHROM/$CONTIG.10.ooc RESULT="${HERE}/psl/${CHROM}/${CONTIG}/${CLONE}.psl" mkdir -p psl/${CHROM}/${CONTIG} if [ ! -s ${CLONESRC} ]; then echo "Can not find: ${CLONESRC}" 1>/dev/stderr exit 255 fi if [ ! -s ${TARGET} ]; then echo "Can not find: ${TARGET}" 1>/dev/stderr exit 255 fi if [ ! -s ${OOC} ]; then echo "Can not find: ${OOC}" 1>/dev/stderr exit 255 fi WRKDIR="/scratch/tmp/hg18_${CHROM}/${CONTIG}/${CLONE}" mkdir -p "${WRKDIR}" cd ${WRKDIR} zcat ${CLONESRC} > ${CLONE}.fa zcat ${TARGET} > ${CONTIG}.fa cp -p ${OOC} ./10.ooc /cluster/bin/x86_64/psLayout ${CONTIG}.fa ${CLONE}.fa genomic 10.ooc ${RESULT} RET=$? cd ${HERE} rm -fr ${WRKDIR} rmdir --ignore-fail-on-non-empty "/scratch/tmp/hg18_${CHROM}/${CONTIG}" rmdir --ignore-fail-on-non-empty "/scratch/tmp/hg18_${CHROM}" exit ${RET} '_EOF_' # << happy emacs chmod +x runPsLayout.sh para create jobList para try; para check; ... etc ... para time # Completed: 14569 of 14569 jobs # CPU time in finished jobs: 4863551s 81059.19m 1350.99h 56.29d 0.154 y # IO & Wait Time: 64196s 1069.93m 17.83h 0.74d 0.002 y # Average job time: 338s 5.64m 0.09h 0.00d # Longest finished job: 36681s 611.35m 10.19h 0.42d # Submission to last job: 68213s 1136.88m 18.95h 0.79d # combine into a single result pslSort dirs raw.psl tmp psl/*/* # combine into a single result time pslSort dirs raw.psl tmp psl/* # real 550m57.744s # user 324m56.251s # sys 10m15.358s ls -og raw.psl # -rw-rw-r-- 1 39273644954 Nov 2 20:23 raw.psl # Wow ... 
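# (added sketch, not part of the original run) before filtering a ~39 GB
# raw.psl it is worth a quick census -- total alignments and distinct clone
# names (qName is column 10 of a headerless psl):
wc -l raw.psl
cut -f10 raw.psl | sort -T /tmp -u | wc -l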
time pslReps -nohead -nearTop=0.001 -singleHit \ raw.psl repsSingle.psl /dev/null # real 15m14.462s # user 13m6.580s # sys 1m50.304s ls -og repsSingle.psl # -rw-rw-r-- 1 73403317 Nov 3 09:44 repsSingle.psl ########################################################### # And now, combining all results together mkdir /san/sanvol1/scratch/hg18/coverage.2006-10-23/finalPsl cd /san/sanvol1/scratch/hg18/coverage.2006-10-23/finalPsl ln -s ../runLastOnes/repsSingle.psl lastOnes.psl ln -s ../runFish/raw.psl fish.psl ln -s ../runUnPlaced/raw.psl unPlaced.psl ln -s ../runUnPlacedChr/raw.psl unPlacedChr.psl ln -s ../runPlacedNotSplit/raw.psl placed.psl cd /san/sanvol1/scratch/hg18/coverage.2006-10-23 time pslSort dirs raw.psl tmp finalPsl # real 18m53.770s # user 12m19.002s # sys 1m17.504s ls -og raw.psl # -rw-rw-r-- 1 7742802124 Nov 3 10:10 raw.psl time pslReps -nohead -nearTop=0.001 -singleHit \ raw.psl repsSingle.psl /dev/null clusterClone -allowDuplicates -agp -minCover=80 \ -maxGap=60000 repsSingle.psl > single.agp 2> single.out sort -k1,1 -k2,2n single.agp | ../coverage/fixPhase.pl \ /cluster/data/hg18/bed/coverage/phase.txt > contig_overlaps.agp \ 2> singleToOverlaps.out awk -F'\t' '{printf "%s\t%s\t%s\t%s\t0\t%s\n", $1,$2,$3,$6,$9}' \ contig_overlaps.agp > cOverlaps.bed liftUp chrOverlaps.bed /san/sanvol1/scratch/hg18/bacends/liftContigs.lft \ warn cOverlaps.bed # Load up that chrOverlaps.bed as a custom track to see these results # And back to the original business of eliminating obsolete clones awk '{print $6}' contig_overlaps.agp | sort -u > clone.coverage.list time $HOME/kent/src/hg/makeDb/hgClonePos/ckMultipleVersions.pl \ clone.coverage.list > /dev/null 2> obsolete.clones time $HOME/kent/src/hg/makeDb/hgClonePos/removeObsoleteClones.sh \ contig_overlaps.agp obsolete.clones > clean_overlaps.agp # looks like it removes 295 lines wc -l contig_overlaps.agp clean_overlaps.agp # 613577 contig_overlaps.agp # 613507 clean_overlaps.agp mv contig_overlaps.agp contig_overlapsWithObsoletes.agp mv clean_overlaps.agp contig_overlaps.agp cd /cluster/data/hg18 # save all existing .gl files before we overwrite them all tar cvzf ./save.glFiles.tgz ./?/*.gl ./??/*.gl ./?_*/*.gl \ ./??_*/*.gl ./?/*/*.gl ./??/*/*.gl ./?_*/*/*.gl ./??_*/*/*.gl time agpToGl contig_overlaps.agp . 
-md=seq_contig.md
# real 1m4.253s
time ./jkStuff/liftGl.csh contig.gl
# saw some errors such as: NT_113974/contig.gl doesn't exist, skipping
# I'm guessing they were contigs with no alignment results
# capture these new .gl files for future reference
tar cvzf ./new.glFiles.tgz ./?/*.gl ./??/*.gl ./?_*/*.gl \
    ./??_*/*.gl ./?/*/*.gl ./??/*/*.gl ./?_*/*/*.gl ./??_*/*/*.gl
# now reload all the _gold, _gap and _gl tables
# Tested this load on a dummy database and found that the contents of
# the gold and gap tables do not change
hgGoldGapGl -chromLst=chrom.lst hg18 /cluster/store11/gs.19 build36
# Then hgClonePos uses those tables to create the Coverage track
# table: clonePos
hgClonePos -maxErr=600 -maxWarn=50000 -chromLst=chrom.lst \
    hg18 /cluster/data/hg18 ./cleanedSequence.inf /cluster/store11/gs.19 \
    > updated.clone.pos.errors 2>&1
# Now let's check for clones that are excessively wrong
cd /tmp
hgsql -N -e \
    "select chrom,chromStart,chromEnd,name,chromEnd-chromStart,seqSize from clonePos;" \
    hg18 > clonePos.hg18.lengths
awk '{if ($6 > 0) { printf "%.2f\t%s\n", 100.0*$5/$6,$0}}' \
    clonePos.hg18.lengths | sort -n > clonePos.hg18.deviations
# Looking at that list of deviations, there are still a number of them
# that are extreme deviants, but there are a lot less than there were
# before. Previously:
ave clonePos.hg18.deviations
# Q1 100.000000
# median 100.000000
# Q3 109.172500
# average 350.043843
# min 80.000000
# max 23574.310000
# count 44978
# total 15744271.980000
# standard deviation 851.762186
# Over 3,500 of them larger than 10 times too large:
awk '{if ($1 > 1000) {print}}' clonePos.hg18.deviations | wc
# 3881 27167 223039
# This new lot:
ave clonePos.hg18.deviations
# Q1 100.000000
# median 100.000000
# Q3 100.360000
# average 140.353820
# min 0.250000
# max 40838.840000
# count 43734
# total 6138233.960000
# standard deviation 381.871589
# Only 277 are larger than 10 times too big:
awk '{if ($1 > 1000) {print}}' clonePos.hg18.deviations | wc
# 277 1939 15747
# QA NOTE: ran mytouch on the *gold and *gap tables because the values were
# unchanged, but they got a new date/time in the above process (ASZ
# 11-14-2006):
# sudo mytouch hg18 'chr*_gold' 200604060800.00
# sudo mytouch hg18 'chr*_gap' 200604060800.00
##############################################################################
# LongSAGE (2006-10-20 markd)
# Load LongSAGE composite tag with genome mappings of tag clusters
# obtained from "Martin Hirst"
ftp ftp2.bcgsc.ca user: ucsc download SHE*_u.map
chmod a-w *.map
~/compbio/kent/src/hg/makeDb/outside/bcgscSage/bcgscSageLoad hg18 *_u.map
####################################################################
# MAKE UNIGENE/SAGE TRACK (DONE - 2006-11-20 Fan)
# Create the uniGene alignments
# /cluster/data/hg18/uniGene/hg18.uniGene.lifted.pslReps.psl
# Download of the latest UniGene version is now automated by a
# cron job -- see /cluster/home/angie/crontab ,
# /cluster/home/angie/unigeneVers/unigene.csh .
# If hgwdev gets rebooted, that needs to be restarted... maybe there's
# a more stable place to set up that cron job.
ssh hgwdev
cd /cluster/store11/gs.19/build36/bed
mkdir uniGene
cd uniGene
set Version = 196
zcat /cluster/store7/uniGene/uniGene.$Version/Hs.seq.uniq.gz|\
    sed -e "s#>.*/ug=#>#; s# /len.*##;" > Hs.seq.uniq.simpleHeader.fa
ssh pk
set Version = 196
mkdir -p /san/sanvol1/scratch/hg18/uniGene/
cd /san/sanvol1/scratch/hg18/uniGene/
cp -p /cluster/store11/gs.19/build36/bed/uniGene/Hs.seq.uniq.simpleHeader.fa .
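# (added sketch, not part of the original run) spot-check that the header
# rewrite above left only the UniGene cluster IDs before setting up the
# cluster run:
grep '^>' Hs.seq.uniq.simpleHeader.fa | head -3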
ls -1 /san/sanvol1/scratch/hg18/nib/*.nib > genome.lst ls -1S \ /cluster/store11/gs.19/build36/bed/uniGene/Hs.seq.uniq.simpleHeader.fa \ > uniGene.lst cat << '_EOF_' > template.sub #LOOP /cluster/bin/x86_64/blat -repeats=lower -minIdentity=95 ooc=/san/sanvol1/scratch/hg18/11.ooc $(path1) $(path2) {check out line+ psl/$(root1)_$(root2).psl} #ENDLOOP '_EOF_' gensub2 genome.lst uniGene.lst template.sub para.spec para create para.spec mkdir psl para try para check para push # Completed: 49 of 49 jobs # CPU time in finished jobs: 46855s 780.92m 13.02h 0.54d 0.001 y # IO & Wait Time: 240s 3.99m 0.07h 0.00d 0.000 y # Average job time: 961s 16.02m 0.27h 0.01d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 3629s 60.48m 1.01h 0.04d # Submission to last job: 4337s 72.28m 1.20h 0.05d pslSort dirs raw.psl tmp psl >& pslSort.log cat raw.psl|\ pslReps -minCover=0.2 -sizeMatters -minAli=0.965 -nearTop=0.002 \ stdin hg18.uniGene.pslReps.psl /dev/null # Processed 553470 alignments gzip raw.psl gzip Hs.seq.uniq.simpleHeader.fa ssh hgwdev cd /cluster/store11/gs.19/build36/bed/uniGene cp -p /san/sanvol1/scratch/hg18/uniGene/hg18.uniGene.pslReps.psl . hgLoadPsl -table=uniGene_3 hg18 hg18.uniGene.pslReps.psl #################################################################### # EXONIPHY (2006-12-05 acs) # predictions provided by Brona Brejova in Siepel Lab (bb248 at cornell.edu). # stored in /cluster/data/hg18/bed/exoniphy/exoniphy.gff ldHgGene -genePredExt -gtf hg18 exoniphy exoniphy.gff #################################################################### # HapMap CNVRs (copy number variable regions) from Matt Hurles (Heather Dec. 2006) # Change bed3 to bed6 to match hg17 cd /cluster/data/hg18/bed/sv redon.pl < cnpRedon.hg18 > redon.bed hgLoadBed hg18 cnpRedon cnpRedon.bed ######################################################### # Structural Variation from Lars Feuk (Heather Jan - April 2007) # These tables are all tiny so I'm not using indices # I kept the bin column in all but Sebat but I could have done without that, # too ssh hgwdev cd /cluster/data/hg18/bed/sv # 8 *txt files from Lars # Sharp (format different from hg17) cp Sharp*txt sharp.in # use editor to remove header from sharp.in # grab the data we need sharp.pl < sharp.in > sharp.prelim # adjust sharp2.pl < sharp.prelim > sharp.bed hgLoadBed hg18 cnpSharp2 sharp.bed -tab -sqlTable=/cluster/home/heather/kent/src/hg/lib/cnpSharp2.sql # Iafrate (format different from hg17) cp Iafrate*txt iafrate.in # use editor to change TABTAB to TAB0TAB and get rid of header iafrate.pl < iafrate.in > iafrate.bed hgLoadBed hg18 cnpIafrate2 iafrate.bed -tab -sqlTable=/cluster/home/heather/kent/src/hg/lib/cnpIafrate2.sql # Sebat (format different from hg17) cp Sebat*txt sebat.in # use editor to get rid of header sebat.pl < sebat.in > sebat.bed hgLoadBed hg18 cnpSebat2 sebat.bed -noBin -tab -sqlTable=/cluster/home/heather/kent/src/hg/lib/cnpSebat2.sql # Tuzun (I called this cnpFosmid in hg17) # simple bed 4 . cp Tuzun*txt tuzun.in # use editor to get rid of header tuzun.pl < tuzun.in > tuzun.bed hgLoadBed hg18 cnpTuzun tuzun.bed -tab # McCarroll (same format as hg17, simple bed 4 .) 
# need to sort and assign ids cp McCarroll*txt mccarroll.in # use editor to get rid of header mccarroll.pl < mccarroll.in > mccarroll.prelim sort -g mccarroll.prelim > mccarroll.sort # sort isn't perfect, use editor to finish mccarroll2.pl < mccarroll.sort > mccarroll.bed hgLoadBed hg18 delMccarroll mccarroll.bed -tab -sqlTable=/cluster/home/heather/kent/src/hg/lib/delMccarroll.sql # Conrad (different format from hg17) cp Conrad*txt conrad.prelim # use editor to shorten "Study" column conrad.pl < conrad.prelim > conrad.prelim2 cp conrad.prelim2 conrad.prelim3 # use editor to sort conrad.prelim3 (lame) # assign Ids conradId.pl < conrad.prelim3 > conrad.bed hgLoadBed hg18 delConrad2 conrad.bed -tab -sqlTable=/cluster/home/heather/kent/src/hg/lib/delConrad2.sql # Hinds (different format from hg17) cp Hinds*txt hinds.in # use editor to remove header hinds.pl < hinds.in > hinds.prelim sort -g hinds.prelim > hinds.sort # sort isn't perfect, use editor to finish hinds2.pl < hinds.sort > hinds.bed hgLoadBed hg18 delHinds2 hinds.bed -tab -sqlTable=/cluster/home/heather/kent/src/hg/lib/delHinds2.sql # Locke (new data) cp Locke*txt locke.in locke.pl < locke.in > locke.prelim sort -g locke.prelim > locke.?? locke2.pl ######################################################### # BUILD GAD TRACK (Done, 12/12/06, Fan) mkdir /cluster/store12/gad061211 rm /cluster/data/gad ln -s /cluster/store12/gad061211 /cluster/data/gad # Receive "GAD-Hg18DATA.txt" from GAD/NIA # contact person: Shenoy, Narmada, shenoyn at grc.nia.nih.gov hgsql hg18 -e 'drop table gadAll' hgsql hg18 <~/src/hg/lib/gadAll.sql hgsql hg18 -e 'load data local infile "GAD-Hg18DATA.txt" into table gadAll ignore 1 lines' hgsql hg18 -e 'create index geneSymbol on gadAll(geneSymbol(10))' # create gad table hgsql hg18 -N -e \ 'select "chr",chromosome, chromStart, chromEnd, geneSymbol from gadAll where chromStart <>0 and chromEnd <>0 and chromosome<>""'|\ sed -e 's/chr\t/chr/' |grep -v "chr\." |grep -v " "|sort -u >gadHg18.bed hgLoadBed hg18 gad gadHg18.bed ######################################################################### # BLASTZ/CHAIN/NET oryLat1 (DONE - 2006-12-14 - Hiram) # third time with randoms and chrUn in scaffolds on both sequences ssh kkstore02 mkdir /cluster/data/hg18/bed/blastz.oryLat1.2006-12-14 cd /cluster/data/hg18/bed/blastz.oryLat1.2006-12-14 cat << '_EOF_' > DEF # Human vs. 
Medaka # Try "human-fugu" (more distant, less repeat-killed than mammal) params # +M=50: BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=6000 BLASTZ_K=2200 BLASTZ_M=50 BLASTZ_Q=/cluster/data/blastz/HoxD55.q # TARGET: Human hg18, randoms in contigs, lifted to their chr*_random SEQ1_DIR=/san/sanvol1/scratch/hg18/hg18.sdTrf.2bit SEQ1_LEN=/cluster/data/hg18/chrom.sizes SEQ1_CTGDIR=/san/sanvol1/scratch/hg18/hg18.randomContigs.sdTrf.2bit SEQ1_CTGLEN=/san/sanvol1/scratch/hg18/hg18.randomContigs.sdTrf.sizes SEQ1_LIFT=/san/sanvol1/scratch/hg18/hg18.randomContigs.lift SEQ1_CHUNK=10000000 SEQ1_LIMIT=1 SEQ1_LAP=10000 # QUERY: Medaka oryLat1 (40M chunks covers the largest chroms in one gulp) # chrUn in Scaffolds for this alignment run SEQ2_DIR=/san/sanvol1/scratch/oryLat1/oryLat1.sdTrf.2bit SEQ2_LEN=/san/sanvol1/scratch/oryLat1/chrom.sizes SEQ2_CTGDIR=/san/sanvol1/scratch/oryLat1/oryLat1UnScaffolds.2bit SEQ2_CTGLEN=/san/sanvol1/scratch/oryLat1/oryLat1UnScaffolds.sizes SEQ2_LIFT=/san/sanvol1/scratch/oryLat1/chrUn.lift SEQ2_CHUNK=40000000 SEQ2_LIMIT=50 SEQ2_LAP=0 BASE=/cluster/data/hg18/bed/blastz.oryLat1.2006-12-14 TMPDIR=/scratch/tmp '_EOF_' # << this line keeps emacs coloring happy time doBlastzChainNet.pl DEF -chainMinScore=2000 -chainLinearGap=loose \ -tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \ -bigClusterHub=pk -verbose=2 \ -blastzOutRoot /cluster/bluearc/hg18OryLat1 > do.log 2>&1 & ### this did not work, abandoned ######################################################################### # BLASTZ/CHAIN/NET oryLat1 (DONE - 2006-12-14 - Hiram) # fourth time with randoms and chrUn in scaffolds for only Medaka # All chroms and randoms as they are complete on Human ssh kkstore04 mkdir /cluster/data/hg18/bed/blastz.oryLat1.2007-02-24 cd /cluster/data/hg18/bed/blastz.oryLat1.2007-02-24 cat << '_EOF_' > DEF # Human vs. Medaka # Try "human-fugu" (more distant, less repeat-killed than mammal) params # +M=50: BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=6000 BLASTZ_K=2200 BLASTZ_M=50 BLASTZ_Q=/cluster/data/blastz/HoxD55.q # TARGET: Human hg18, randoms complete, as they are, no contig confusion SEQ1_DIR=/san/sanvol1/scratch/hg18/hg18.sdTrf.2bit SEQ1_LEN=/cluster/data/hg18/chrom.sizes SEQ1_CHUNK=10000000 SEQ1_LAP=10000 SEQ1_LIMIT=1 # QUERY: Medaka oryLat1 (40M chunks covers the largest chroms in one gulp) # chrUn in Scaffolds for this alignment run SEQ2_DIR=/san/sanvol1/scratch/oryLat1/oryLat1.sdTrf.2bit SEQ2_LEN=/san/sanvol1/scratch/oryLat1/chrom.sizes SEQ2_CTGDIR=/san/sanvol1/scratch/oryLat1/oryLat1UnScaffolds.2bit SEQ2_CTGLEN=/san/sanvol1/scratch/oryLat1/oryLat1UnScaffolds.sizes SEQ2_LIFT=/san/sanvol1/scratch/oryLat1/chrUn.lift SEQ2_CHUNK=40000000 SEQ2_LIMIT=50 SEQ2_LAP=0 BASE=/cluster/data/hg18/bed/blastz.oryLat1.2007-02-24 TMPDIR=/scratch/tmp '_EOF_' # << this line keeps emacs coloring happy time doBlastzChainNet.pl DEF -chainMinScore=2000 -chainLinearGap=loose \ -tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \ -bigClusterHub=pk -verbose=2 \ -blastzOutRoot /cluster/bluearc/hg18OryLat1 > do.log 2>&1 & # real 318m45.339s # typical failure: # HgStepManager: executing step 'net'. # netChains: looks like previous stage was not successful # (can't find [hg18.oryLat1.]all.chain[.gz]). 
# continuing net: time doBlastzChainNet.pl DEF -chainMinScore=2000 -chainLinearGap=loose \ -tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \ -continue=net -bigClusterHub=pk -verbose=2 \ -blastzOutRoot /cluster/bluearc/hg18OryLat1 > net.log 2>&1 & # real 39m25.853s ssh hgwdev cd /cluster/data/hg18/bed/blastz.oryLat1.2007-02-24 nice -n +19 featureBits hg18 chainOryLat1Link \ > fb.hg18.chainOryLat1Link.txt 2>&1 & # 57393910 bases of 2881515245 (1.992%) in intersection ssh kkstore04 mkdir /cluster/data/oryLat1/bed/blastz.hg18.swap cd /cluster/data/oryLat1/bed/blastz.hg18.swap time doBlastzChainNet.pl -chainMinScore=2000 -chainLinearGap=loose \ /cluster/data/hg18/bed/blastz.oryLat1.2007-02-24/DEF \ -tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \ -bigClusterHub=pk -verbose=2 -swap > swap.log 2>&1 & ssh hgwdev cd /cluster/data/oryLat1/bed/blastz.hg18.swap nice -n +19 featureBits oryLat1 chainHg18Link \ > fb.oryLat1.chainHg18Link.txt 2>&1 & # 48002423 bases of 700386597 (6.854%) in intersection ########################################################################## # AFFY HUEX1 OFF-BY-ONE FIX (Andy 2006-12-14) ssh hgwdev cd /cluster/data/hg18/bed/affyHumanExon liftOver /cluster/data/hg17/bed/affyHumanExon/affyHuEx1.bed \ /gbdb/hg17/liftOver/hg17ToHg18.over.chain.gz affyHuEx1.bed affyHuEx1.unmapped awk 'BEGIN{OFS="\t"}{print $4,$3-$2}' affyHuEx1.bed | sort -k2,2nr | head #2440970 81664 #3016074 9552 #3641787 8061 #2321649 8054 # It seems the liftOver problem still happens for that segmental dupe. # So the start is correct and the end is correct. Just make two entries, both # with size == 305. grep -v "\b2440970\b" affyHuEx1.bed > tmp.bed grep "\b2440970\b" affyHuEx1.bed > bad.bed awk 'BEGIN{OFS="\t"}{print $1,$2,$2+305,$4,$5,$6}' bad.bed > good.bed awk 'BEGIN{OFS="\t"}{print $1,$3-305,$3,$4,$5,$6}' bad.bed >> good.bed cat tmp.bed good.bed > affyHuEx1.bed bedSort affyHuEx1.bed tmp.bed mv tmp.bed affyHuEx1.bed rm good.bed bad.bed hgLoadBed hg18 affyHuEx1 affyHuEx1.bed ########################################################################## # CGAP SAGE (In progress Andy 2007-01-09) # This is the BED part. ssh hgwdev cd /cluster/data/hg18/bed mkdir /san/sanVol1/scratch/andy/cgapSage ln -s /san/sanVol1/scratch/andy/cgapSage cgapSage wget ftp://ftp1.nci.nih.gov/pub/SAGE/SAGE_hs_long_forward_v36.1.tar.gz wget ftp://ftp1.nci.nih.gov/pub/SAGE/SAGE_hs_long_reverse_v36.1.tar.gz tar xfz SAGE_hs_long_forward_v36.1.tar.gz tar xfz SAGE_hs_long_reverse_v36.1.tar.gz cd hs_forward/ cat * | awk 'BEGIN{OFS="\t"}{print $1, $3, $4, $2, 1000, "+"}' > ../unlifted.bed cd ../hs_reverse/ cat * | awk 'BEGIN{OFS="\t"}{print $1, $4, $3, $2, 1000, "-"}' >> ../unlifted.bed cd ../ rm -rf hs* liftUp lifted.bed /cluster/data/hg18/jkStuff/liftAll.lft warn unlifted.bed #Got 378 lifts in /cluster/data/hg18/jkStuff/liftAll.lft #Lifting unlifted.bed #Expecting number field 3 line 13868252 of unlifted.bed, got CCATCGGATGCCCACCT # Looks like there was a funny line in unlifted.bed: grep CCATCGGATGCCCACCT unlifted.bed #NT_011362 24364534NT_004321 CCATCGGATGCCCACCT AATAAGCCAGAGTCTAT 1000 - #NT_004321 7900 7884 CCATCGGATGCCCACCT 1000 - # Ok so there's one record for CCATCGGATGCCCACCT in addition... and for # AATAAGCCAGAGTCTAT? grep AATAAGCCAGAGTCTAT unlifted.bed #NT_011362 24364534NT_004321 CCATCGGATGCCCACCT AATAAGCCAGAGTCTAT 1000 - #NT_011362 24364534 24364518 AATAAGCCAGAGTCTAT 1000 - # Looks like that one's got a record too. 
So just get rid of the stupid # line: grep -v 24364534NT_004321 unlifted.bed > tmp mv tmp unlifted.bed liftUp lifted.bed /cluster/data/hg18/jkStuff/liftAll.lft warn unlifted.bed rm unlifted.bed head lifted.bed #chr1 649 665 TGTCTGCGCCTGCGCCG 1000 - #chr1 670 686 CTAGCGCGTCGGGGTGG 1000 + nibFrag /cluster/data/hg18/nib/chr1.nib 669 686 "+" /dev/stdout #>/cluster/data/hg18/nib/chr1.nib:669-686 #ctagcgcgtcggggtgg nibFrag /cluster/data/hg18/nib/chr1.nib 649 665 m /dev/stdout #>/cluster/data/hg18/nib/chr1.nib:649-665 #tgtctgcgcctgcgcc # It looks like there's off-by-one errors, so fix em: awk 'BEGIN{OFS="\t"}{start=$2; end=$3;if ($6 == "-") { end = end+1; } else { start = start-1 } print $1, start, end, $4, $5, $6}' \ < lifted.bed > mapping.bed6 rm lifted.bed # Add thickStart/thickEnd fields awk 'BEGIN{OFS="\t"}{thickStart=$2; thickEnd=$3; if ($6=="-") {thickStart = thickStart+13; } else { thickEnd = thickEnd-13; } print $0, thickStart, thickEnd}' \ < mapping.bed6 > mapping.bed ########################################################################## # xxBlastTab - Help filter out unwanted paralogs (Galt 2007-01-10) # # Background: The xxBlastTab tables are made with a simple blastall # (blastp with -b 1) which chooses the best match. Unfortunately this # means that if there is no proper match it will still pick something # even though it's probably not orthologous. This is especially a problem # in organisms like rat knownGene which has only 30% gene coverage. # The strategy here is to filter our xxBlastTab using synteny mappings from # the chains. This is done by simply taking hg18.kg and using /gbdb/$db chains # and pslMap to lift the genes to the target xx assembly. Then hgMapToGene # will find which of those mapped ids have good overlap with xx.knownGene. # The final mapping is then created by doing an inner join between # the traditional xxBlastTab and the mapping table produced above. # Then simply drop the old table and rename the new table. # # # We are starting with xxBlastTab tables already built in the usual way with # blastall/blastp, probably with doHgNearBlastp.pl script. # # I created a new utility script called synBlastp.csh since I have to do this # several times. # # we want to update hg18 for rat and mouse, # so check ./hgGeneData/Human/hg18/otherOrgs.ra for current settings ssh hgwdev synBlastp.csh hg18 rn4 #hg18.rnBlastTab results: #new number of unique query values: # 13120 #new number of unique target values # 6431 #old number of unique query values: # 26982 #old number of unique target values # 6732 synBlastp.csh hg18 mm8 #hg18.mmBlastTab results: #new number of unique query values: # 28733 #new number of unique target values # 15366 #old number of unique query values: # 33016 #old number of unique target values # 15918 ########################################################################## # GenBank gbMiscDiff table (markd 2007-01-10) # Supports `NCBI Clone Validation' section of mgcGenes details page # genbank release 157.0 now contains misc_diff fields for MGC clones # reloading mRNAs results in gbMiscDiff table being created. ./bin/gbDbLoadStep -reload -srcDb=genbank -type=mrna hg18 ################################################# # BUILD ncRna TRACK (DONE, 1/12/07, Fan) # Download the terms and make the database. 
ssh hgwdev
cd /cluster/store11/gs.19/build36
cd bed
mkdir ncRna
# copy Perl file at:
# http://cvs.sanger.ac.uk/cgi-bin/viewcvs.cgi/biomart-perl/scripts/webExample.pl?view=markup
# into getBiomart.pl
# create the following query xml file, ncRna.xml:
cat << '_EOF_' >ncRna.xml
'_EOF_'
# get Ensembl gene data from BioMart and filter out protein-coding genes
perl getBiomart.pl ncRna.xml | grep -v protein_coding >ncRna0.tab
# cut and paste different cols to form ncRna.tab
cat ncRna0.tab | sed -e 's/ENSG/chr\tENSG/'>j1
cut -f 2 j1 >j.chr0
cut -f 1 j1 >j.chr
cat j.chr0|sed -e 's/chr/0/' >j.0
cut -f 6 j1 >j.strand
cut -f 4,5 j1 >j.startEnd
cut -f 3 j1 >j.name
cut -f 7 j1 >j.type
cut -f 8 j1 >j.extGeneId
paste j.chr0 j.chr j.startEnd j.name j.0 j.strand j.0 j.0 j.type j.extGeneId >j.all
cat j.all|grep -v c6_COX|grep -v c6_QBL|grep -v c5_H2\
 |sed -e 's/chr\t/chr/'\
 |grep -v NT_\
 |sed -e 's/\t-1\t/\t-\t/' |sed -e 's/\t1\t/\t+\t/' \
 |sed -e 's/chrMT/chrM/'\
 |sort -k1,1 -k2,2n -k3,3n >ncRna.tab
hgLoadBed -strict -tab -sqlTable=/cluster/home/fanhsu/src/hg/lib/ncRna.sql hg18 ncRna ncRna.tab
rm j.*
rm j1
###########################################################
# MAKE Drosophila Proteins track (DONE 2007-02-06 braney)
ssh kkstore02
sandir=/san/sanvol1/scratch/hg18
mkdir $sandir
cd /cluster/data/hg18
cat noUn/chr*fa > temp.fa
faSplit gap temp.fa 1000000 $sandir/blastDb/x -lift=$sandir/blastDb.lft
cat randomContigs/*.fa > temp.fa
faSplit sequence temp.fa 150 $sandir/blastDb/y
rm temp.fa
cd $sandir/blastDb
for i in *.fa
do
    /cluster/bluearc/blast229/formatdb -i $i -p F
done
rm *.fa
mkdir -p /cluster/data/hg18/bed/tblastn.dm2FB
cd /cluster/data/hg18/bed/tblastn.dm2FB
echo /san/sanvol1/scratch/hg18/blastDb/*.nsq | xargs ls -S | sed "s/\.nsq//" > query.lst
wc -l query.lst
# 3066 query.lst
# we want around 150000 jobs
calc `wc /cluster/data/dm2/bed/blat.dm2FB/dm2FB.psl | awk "{print \\\$1}"`/\(150000/`wc query.lst | awk "{print \\\$1}"`\)
# 18929/(150000/3066) = 386.908760
mkdir -p /cluster/bluearc/hg18/bed/tblastn.dm2FB/fbfa
split -l 387 /cluster/data/dm2/bed/blat.dm2FB/dm2FB.psl /cluster/bluearc/hg18/bed/tblastn.dm2FB/fbfa/kg
ln -s /cluster/bluearc/hg18/bed/tblastn.dm2FB/fbfa
cd fbfa
for i in *; do nice pslxToFa $i $i.fa; rm $i; done
cd ..
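# (added note, sketch only -- not part of the original run) the chunking
# arithmetic above works out as: splitting the 18929-line dm2FB.psl into
# 387-line pieces gives ceil(18929/387) = 49 protein files, and 49 files x
# 3066 blast database pieces = 150234 cluster jobs, matching the
# "Completed: 150234 of 150234 jobs" para time report below.
echo $(( (18929 + 386) / 387 * 3066 ))
# 150234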
ls -1S fbfa/*.fa > fb.lst mkdir -p /cluster/bluearc/hg18/bed/tblastn.dm2FB/blastOut ln -s /cluster/bluearc/hg18/bed/tblastn.dm2FB/blastOut for i in `cat fb.lst`; do mkdir blastOut/`basename $i .fa`; done tcsh cd /cluster/data/hg18/bed/tblastn.dm2FB cat << '_EOF_' > blastGsub #LOOP blastSome $(path1) {check in line $(path2)} {check out exists blastOut/$(root2)/q.$(root1).psl } #ENDLOOP '_EOF_' cat << '_EOF_' > blastSome #!/bin/sh BLASTMAT=/cluster/bluearc/blast229/data export BLASTMAT g=`basename $2` f=/tmp/`basename $3`.$g for eVal in 0.01 0.001 0.0001 0.00001 0.000001 1E-09 1E-11 do if /cluster/bluearc/blast229/blastall -M BLOSUM80 -m 0 -F no -e $eVal -p tblastn -d $1 -i $2 -o $f.8 then mv $f.8 $f.1 break; fi done if test -f $f.1 then if /cluster/bin/i386/blastToPsl $f.1 $f.2 then liftUp -nosort -type=".psl" -nohead $f.3 /san/sanvol1/scratch/hg18/blastDb.lft carry $f.2 liftUp -nosort -type=".psl" -pslQ -nohead $3.tmp /cluster/data/dm2/bed/blat.dm2FB/protein.lft warn $f.3 if pslCheck -prot $3.tmp then mv $3.tmp $3 rm -f $f.1 $f.2 $f.3 $f.4 fi exit 0 fi fi rm -f $f.1 $f.2 $3.tmp $f.8 $f.3 $f.4 exit 1 '_EOF_' # << happy emacs exit chmod +x blastSome gensub2 query.lst fb.lst blastGsub blastSpec ssh pk cd /cluster/data/hg18/bed/tblastn.dm2FB para create blastSpec # para try, check, push, check etc. para time # Completed: 150234 of 150234 jobs # CPU time in finished jobs: 8313632s 138560.53m 2309.34h 96.22d 0.264 y # IO & Wait Time: 882301s 14705.02m 245.08h 10.21d 0.028 y # Average job time: 61s 1.02m 0.02h 0.00d # Longest finished job: 545s 9.08m 0.15h 0.01d # Submission to last job: 40693s 678.22m 11.30h 0.47d ssh kkstore02 cd /cluster/data/hg18/bed/tblastn.dm2FB mkdir chainRun cd chainRun tcsh cat << '_EOF_' > chainGsub #LOOP chainOne $(path1) #ENDLOOP '_EOF_' cat << '_EOF_' > chainOne (cd $1; cat q.*.psl | simpleChain -prot -outPsl -maxGap=150000 stdin /cluster/bluearc/hg18/bed/tblastn.dm2FB/blastOut/c.`basename $1`.psl) '_EOF_' exit chmod +x chainOne ls -1dS /cluster/bluearc/hg18/bed/tblastn.dm2FB/blastOut/kg?? > chain.lst gensub2 chain.lst single chainGsub chainSpec # do the cluster run for chaining ssh kk cd /cluster/data/hg18/bed/tblastn.dm2FB/chainRun para create chainSpec para maxNode 30 para try, check, push, check etc. # Completed: 48 of 49 jobs # Crashed: 1 jobs # CPU time in finished jobs: 209872s 3497.86m 58.30h 2.43d 0.007 y # IO & Wait Time: 48501s 808.35m 13.47h 0.56d 0.002 y # Average job time: 5383s 89.71m 1.50h 0.06d # Longest finished job: 19336s 322.27m 5.37h 0.22d # Submission to last job: 19336s 322.27m 5.37h 0.22d ssh kkstore02 cd /cluster/data/hg18/bed/tblastn.dm2FB/blastOut for i in kg?? do cat c.$i.psl | awk "(\$13 - \$12)/\$11 > 0.6 {print}" > c60.$i.psl sort -rn c60.$i.psl | pslUniq stdin u.$i.psl awk "((\$1 / \$11) ) > 0.60 { print }" c60.$i.psl > m60.$i.psl echo $i done sort -T /tmp -k 14,14 -k 16,16n -k 17,17n u.*.psl m60* | uniq > /cluster/data/hg18/bed/tblastn.dm2FB/unliftBlastDm2FB.psl cd .. 
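# (added sketch, not part of the original run) the awk filters above use psl
# columns $1=matches, $11=qSize, $12=qStart, $13=qEnd: "c60" keeps alignments
# spanning >60% of the protein and "m60" additionally requires
# matches/qSize > 0.60.  Count the distinct FlyBase proteins that survived
# (qName is column 10):
cut -f10 unliftBlastDm2FB.psl | sort -u | wc -l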
pslCheck unliftBlastDm2FB.psl sed "s/[0-9XY]*\///" unliftBlastDm2FB.psl | liftUp -type=.psl -nohead stdout ../../randomContigs/hg18.randomContigs.lift carry stdin | sort -T /tmp -k 14,14 -k 16,16n -k 17,17n > blastDm2FB.psl # load table ssh hgwdev cd /cluster/data/hg18/bed/tblastn.dm2FB hgLoadPsl hg18 blastDm2FB.psl # check coverage featureBits hg18 blastDm2FB # 5976178 bases of 2881515245 (0.207%) in intersection featureBits hg18 knownGene:cds blastDm2FB -enrichment # knownGene:cds 1.111%, blastDm2FB 0.207%, both 0.130%, cover 11.71%, enrich 56.45x ssh kkstore04 rm -rf /cluster/data/hg18/bed/tblastn.dm2FB/blastOut rm -rf /cluster/bluearc/hg18/bed/tblastn.dm2FB/blastOut #end tblastn ########################################################################## ######################################################################### # BLASTZ/CHAIN/NET FR2 (DONE - 2007-01-26 - Hiram) ## Align to fr2 scaffolds, ## results lifted to fr2 chrUn coordinates ssh kkstore02 mkdir /cluster/data/hg18/bed/blastz.fr2.2007-01-24 cd /cluster/data/hg18/bed/blastz.fr2.2007-01-24 cat << '_EOF_' > DEF # Human vs. Fugu # Try "human-fugu" (more distant, less repeat-killed than mammal) params # +M=50: BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=6000 BLASTZ_K=2200 BLASTZ_M=50 BLASTZ_Q=/cluster/data/blastz/HoxD55.q # TARGET: Human hg18 SEQ1_DIR=/san/sanvol1/scratch/hg18/hg18.sdTrf.2bit SEQ1_LEN=/cluster/data/hg18/chrom.sizes SEQ1_CHUNK=10000000 SEQ1_LIMIT=1 SEQ1_LAP=10000 # QUERY: Fugu fr2 # Align to the scaffolds, results lifed up to chrUn.sdTrf coordinates SEQ2_DIR=/san/sanvol1/scratch/fr2/fr2.2bit SEQ2_LEN=/san/sanvol1/scratch/fr2/chrom.sizes SEQ2_CTGDIR=/san/sanvol1/scratch/fr2/fr2.scaffolds.2bit SEQ2_CTGLEN=/san/sanvol1/scratch/fr2/fr2.scaffolds.sizes SEQ2_LIFT=/san/sanvol1/scratch/fr2/liftAll.lft SEQ2_CHUNK=20000000 SEQ2_LIMIT=30 SEQ2_LAP=0 BASE=/cluster/data/hg18/bed/blastz.fr2.2007-01-24 TMPDIR=/scratch/tmp '_EOF_' # << this line keeps emacs coloring happy time doBlastzChainNet.pl DEF -chainMinScore=2000 -chainLinearGap=loose \ -tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \ -verbose=2 -bigClusterHub=pk \ -blastzOutRoot /cluster/bluearc/hg18Fr2 > do.log 2>&1 & # real 414m47.505s ## Swap back to fr2 (duplicated in fr2.txt also) mkdir /cluster/data/fr2/bed/blastz.hg18.swap cd /cluster/data/fr2/bed/blastz.hg18.swap time doBlastzChainNet.pl -verbose=2 \ /cluster/data/hg18/bed/blastz.fr2.2007-01-24/DEF \ -chainMinScore=2000 -chainLinearGap=loose \ -tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \ -bigClusterHub=pk -swap > swap.log 2>&1 & # real 47m14.554s ssh hgwdev cd /cluster/data/fr2/bed/blastz.hg18.swap time nice -n +19 featureBits fr2 chainHg18Link \ > fb.fr2.chainHg18Link.txt 2>&1 & # 42875664 bases of 393312790 (10.901%) in intersection ############################################################################ ## BLASTZ mm8 test with WindowMasker sequence (DONE - 2007-01-30 - Hiram) ssh kkstore04 mkdir /cluster/data/hg18/bed/blastz.mm8.2007-01-30 cd /cluster/data/hg18/bed/blastz.mm8.2007-01-30 cat << '_EOF_' > DEF # human vs mouse BLASTZ_M=50 # TARGET: Human Hg18 SEQ1_DIR=/san/sanvol1/scratch/hg18/hg18.noUn.sdTrf.2bit SEQ1_LEN=/san/sanvol1/scratch/hg18/hg18.noUn.sdTrf.sizes SEQ1_CHUNK=50000000 SEQ1_LAP=10000 # QUERY: Mouse Mm8 - single chunk big enough to run each chrom by itself SEQ2_DIR=/san/sanvol1/scratch/mm8/sdTrf/mm8.noUn.sdTrf.2bit SEQ2_LEN=/san/sanvol1/scratch/mm8/sdTrf/noUn.sdTrf.sizes SEQ2_CHUNK=200000000 SEQ2_LAP=0 BASE=/cluster/data/hg18/bed/blastz.mm8.2007-01-30 TMPDIR=/scratch/tmp 
'_EOF_' # << happy emacs time doBlastzChainNet.pl -verbose=2 DEF -bigClusterHub=pk \ -blastzOutRoot /cluster/bluearc/hg18Mm8 \ -chainMinScore=3000 -chainLinearGap=medium > do.out 2>&1 & time doBlastzChainNet.pl -verbose=2 DEF -bigClusterHub=pk \ -blastzOutRoot /cluster/bluearc/hg18Mm8 \ -continue=cat -stop=net \ -chainMinScore=3000 -chainLinearGap=medium > cat.out 2>&1 & # real 635m55.126s nice -n +19 featureBits -noRandom hg18 chainMm8Link \ > fb.noRandom.hg18.chainMm8Link.txt 2>&1 # 991429484 bases of 2868834265 (34.559%) in intersection nice -n +19 featureBits -noRandom hg18 chainMm8WMLink \ > fb.noRandom.hg18.chainMm8WMLink.txt 2>&1 # 1071083201 bases of 2868834265 (37.335%) in intersection ## swap to mm8 mkdir /cluster/data/mm8/bed/blastz.hg18.swap.2007-02-01 cd /cluster/data/mm8/bed/blastz.hg18.swap.2007-02-01 time doBlastzChainNet.pl -verbose=2 -bigClusterHub=pk \ /cluster/data/hg18/bed/blastz.mm8.2007-01-30/DEF \ -swap -stop=net -chainMinScore=3000 \ -chainLinearGap=medium > swap.out 2>&1 & # this created the directory /cluster/data/mm8/bed/blastz.hg18.swap # after it was done, move to here blastz.hg18.swap.2007-02-01 since # it is on a filesystem with some free space nice -n +19 featureBits -noRandom mm8 chainHg18Link \ > fb.noRandom.mm8.chainHg18Link.txt 2>&1 # 983004750 bases of 2550172871 (38.547%) in intersection nice -n +19 featureBits -noRandom mm8 chainHg18WMLink \ > fb.noRandom.mm8.chainHg18WMLink.txt 2>&1 # 976774811 bases of 2550172871 (38.302%) in intersection ########################################################### # MAKE C. elegans proteins track ssh kkstore02 sandir=/san/sanvol1/scratch/hg18 mkdir -p /cluster/data/hg18/bed/tblastn.ce3WB cd /cluster/data/hg18/bed/tblastn.ce3WB echo /san/sanvol1/scratch/hg18/blastDb/*.nsq | xargs ls -S | sed "s/\.nsq//" > query.lst wc -l query.lst # 3066 query.lst # we want around 200000 jobs calc `wc /cluster/data/ce3/bed/blat.ce3WB/ce3WB.psl | awk "{print \\\$1}"`/\(200000/`wc query.lst | awk "{print \\\$1}"`\) # 22395/(200000/3066) = 343.315350 mkdir -p /cluster/bluearc/hg18/bed/tblastn.ce3WB/wbfa split -l 343 /cluster/data/ce3/bed/blat.ce3WB/ce3WB.psl /cluster/bluearc/hg18/bed/tblastn.ce3WB/wbfa/wb ln -s /cluster/bluearc/hg18/bed/tblastn.ce3WB/wbfa cd wbfa for i in *; do nice pslxToFa $i $i.fa; rm $i; done cd .. 
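# (added note, sketch only -- not part of the original run) the split above
# yields ceil(22395/343) = 66 worm protein files, which is why the chaining
# run farther below reports "Completed: 66 of 66 jobs":
ls wbfa/*.fa | wc -l
# 66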
ls -1S wbfa/*.fa > wb.lst mkdir -p /cluster/bluearc/hg18/bed/tblastn.ce3WB/blastOut ln -s /cluster/bluearc/hg18/bed/tblastn.ce3WB/blastOut for i in `cat wb.lst`; do mkdir blastOut/`basename $i .fa`; done tcsh cd /cluster/data/hg18/bed/tblastn.ce3WB cat << '_EOF_' > blastGsub #LOOP blastSome $(path1) {check in line $(path2)} {check out exists blastOut/$(root2)/q.$(root1).psl } #ENDLOOP '_EOF_' cat << '_EOF_' > blastSome #!/bin/sh BLASTMAT=/cluster/bluearc/blast229/data export BLASTMAT g=`basename $2` f=/tmp/`basename $3`.$g for eVal in 0.01 0.001 0.0001 0.00001 0.000001 1E-09 1E-11 do if /cluster/bluearc/blast229/blastall -M BLOSUM80 -m 0 -F no -e $eVal -p tblastn -d $1 -i $2 -o $f.8 then mv $f.8 $f.1 break; fi done if test -f $f.1 then if /cluster/bin/i386/blastToPsl $f.1 $f.2 then liftUp -nosort -type=".psl" -nohead $f.3 /san/sanvol1/scratch/hg18/blastDb.lft carry $f.2 liftUp -nosort -type=".psl" -pslQ -nohead $3.tmp /cluster/data/ce3/bed/blat.ce3WB/protein.lft warn $f.3 if pslCheck -prot $3.tmp then mv $3.tmp $3 rm -f $f.1 $f.2 $f.3 $f.4 fi exit 0 fi fi rm -f $f.1 $f.2 $3.tmp $f.8 $f.3 $f.4 exit 1 '_EOF_' # << happy emacs exit chmod +x blastSome gensub2 query.lst wb.lst blastGsub blastSpec ssh pk cd /cluster/data/hg18/bed/tblastn.ce3WB para create blastSpec # para try, check, push, check etc. para time # Completed: 195603 of 195603 jobs # CPU time in finished jobs: 12047221s 200787.01m 3346.45h 139.44d 0.382 y # IO & Wait Time: 9089287s 151488.12m 2524.80h 105.20d 0.288 y # Average job time: 108s 1.80m 0.03h 0.00d # Longest finished job: 1002s 16.70m 0.28h 0.01d # Submission to last job: 192221s 3203.68m 53.39h 2.22d ssh kkstore02 cd /cluster/data/hg18/bed/tblastn.ce3WB mkdir chainRun cd chainRun tcsh cat << '_EOF_' > chainGsub #LOOP chainOne $(path1) #ENDLOOP '_EOF_' cat << '_EOF_' > chainOne (cd $1; cat q.*.psl | simpleChain -prot -outPsl -maxGap=150000 stdin /cluster/bluearc/hg18/bed/tblastn.ce3WB/blastOut/c.`basename $1`.psl) '_EOF_' exit chmod +x chainOne ls -1dS /cluster/bluearc/hg18/bed/tblastn.ce3WB/blastOut/wb?? > chain.lst gensub2 chain.lst single chainGsub chainSpec # do the cluster run for chaining ssh kk cd /cluster/data/hg18/bed/tblastn.ce3WB/chainRun para create chainSpec para maxNode 30 para try, check, push, check etc. # Completed: 66 of 66 jobs # CPU time in finished jobs: 161714s 2695.23m 44.92h 1.87d 0.005 y # IO & Wait Time: 40315s 671.92m 11.20h 0.47d 0.001 y # Average job time: 3061s 51.02m 0.85h 0.04d # Longest finished job: 9372s 156.20m 2.60h 0.11d # Submission to last job: 11599s 193.32m 3.22h 0.13d ssh kkstore02 cd /cluster/data/hg18/bed/tblastn.ce3WB/blastOut for i in wb?? do cat c.$i.psl | awk "(\$13 - \$12)/\$11 > 0.6 {print}" > c60.$i.psl sort -rn c60.$i.psl | pslUniq stdin u.$i.psl awk "((\$1 / \$11) ) > 0.60 { print }" c60.$i.psl > m60.$i.psl echo $i done sort -T /tmp -k 14,14 -k 16,16n -k 17,17n u.*.psl m60* | uniq > /cluster/data/hg18/bed/tblastn.ce3WB/unliftBlastCe3WB.psl cd .. 
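# (added sketch, not part of the original run) same c60/m60 filtering as the
# fly track above; count the distinct worm proteins remaining before the lift
# and load (qName is column 10):
cut -f10 unliftBlastCe3WB.psl | sort -u | wc -l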
pslCheck unliftBlastCe3WB.psl sed "s/[0-9XY]*\///" unliftBlastCe3WB.psl | liftUp -type=.psl -nohead stdout ../../randomContigs/hg18.randomContigs.lift carry stdin | sort -T /tmp -k 14,14 -k 16,16n -k 17,17n > blastCe3WB.psl # load table ssh hgwdev cd /cluster/data/hg18/bed/tblastn.ce3WB hgLoadPsl hg18 blastCe3WB.psl # check coverage featureBits hg18 blastCe3WB # 4326489 bases of 2881515245 (0.150%) in intersection featureBits hg18 knownGene:cds blastCe3WB -enrichment # knownGene:cds 1.111%, blastCe3WB 0.150%, both 0.086%, cover 7.76%, enrich 51.67x ssh kkstore04 rm -rf /cluster/data/hg18/bed/tblastn.ce3WB/blastOut rm -rf /cluster/bluearc/hg18/bed/tblastn.ce3WB/blastOut #end tblastn ########################################################################## ############################################################################# # RE-BUILD WGRNA TRACK (DONE, 2007-02-09, Fan) # rebuilt below: RE-BUILD WGRNA TRACK AGAIN (DONE, 2007-02-12, Fan) # rebuilt below: RE-BUILD WGRNA TRACK (DONE, 2007-05-31, Fan) # rebuilt below: RE-BUILD WGRNA TRACK (DONE, 2007-10-05, Fan) ssh hgwdev cd /cluster/data/hg18/bed mkdir wgRna-2007-02-07 cd wgRna-2007-02-07 # Received the data file, wg_feb2007.txt (saved from wg_feb2007.doc) # from Michel Weber's email # (Michel.Weber at ibcg.biotoul.fr) # and place it under cd /cluster/data/hg18/bed/wgRna-2007-02-07. # The record of hsa-mir-770 was found missing the strand info. # manually add "+" to wg_feb2007.txt for the record of hsa-mir-770. cat wg_feb2007.txt|sed -e 's/ /\t/g' > wgRna.tab hgLoadBed -sqlTable=/cluster/home/fanhsu/src/hg/lib/wgRna.sql hg18 wgRna wgRna.tab ############################################################################# # RE-BUILD WGRNA TRACK AGAIN (DONE, 2007-02-12, Fan) # rebuilt below: RE-BUILD WGRNA TRACK (DONE, 2007-05-31, Fan) # rebuilt below: RE-BUILD WGRNA TRACK (DONE, 2007-10-05, Fan) ssh hgwdev cd /cluster/data/hg18/bed mkdir wgRna-2007-02-12 cd wgRna-2007-02-12 # Received the data file, wg_feb2007_corrected.txt (saved from wg_feb2007_corrected.doc) # from Michel Weber's email # (Michel.Weber at ibcg.biotoul.fr) # and place it under cd /cluster/data/hg18/bed/wgRna-2007-02-12. # The record of hsa-mir-770 was found missing the strand info. # manually add "+" to wg_feb2007_corrected.txt for the record of hsa-mir-770. 
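# (added sketch, not part of the original run) a field-count check catches
# records like hsa-mir-770 that arrive without a strand, before the load:
awk '{print NF}' wg_feb2007_corrected.txt | sort -n | uniq -c
# every row should report the same number of fields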
cat wg_feb2007_corrected.txt|sed -e 's/ /\t/g' > wgRna.tab hgLoadBed -sqlTable=/cluster/home/fanhsu/src/hg/lib/wgRna.sql hg18 wgRna wgRna.tab ######################################################################### ## BLASTZ ANOCAR1 - Lizard - (DONE - 2007-02-17 - 2007-02-18 - Hiram) ssh kkstore02 mkdir /cluster/data/hg18/bed/blastz.anoCar1.2007-02-17 cd /cluster/data/hg18/bed/blastz.anoCar1.2007-02-17 cat << '_EOF_' > DEF # human vs lizard BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=6000 BLASTZ_K=2200 BLASTZ_Q=/cluster/data/blastz/HoxD55.q # TARGET: Human Hg18 SEQ1_DIR=/san/sanvol1/scratch/hg18/hg18.sdTrf.2bit SEQ1_LEN=/cluster/data/hg18/chrom.sizes SEQ1_CHUNK=10000000 SEQ1_LAP=10000 SEQ1_LIMIT=1 # QUERY: Lizard AnoCar1 - largest chunk big enough for largest scaffold SEQ2_DIR=/san/sanvol1/scratch/anoCar1/anoCar1.2bit SEQ2_LEN=/san/sanvol1/scratch/anoCar1/chrom.sizes SEQ2_CHUNK=20000000 SEQ2_LIMIT=30 SEQ2_LAP=0 BASE=/cluster/data/hg18/bed/blastz.anoCar1.2007-02-17 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs time doBlastzChainNet.pl DEF -chainMinScore=5000 -chainLinearGap=loose \ -tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \ -verbose=2 -bigClusterHub=pk \ -blastzOutRoot /cluster/bluearc/hg18AnoCar1 > do.log 2>&1 & # real 684m40.568s # there was a pause in there as the pk kluster was corrected during the # first kluster run to get it to finish. # appears to have successfully finished ssh hgwdev cd /cluster/data/hg18/bed/blastz.anoCar1.2007-02-17 time nice -n +19 featureBits hg18 chainAnoCar1Link \ > fb.hg18.chainAnoCar1Link.txt 2>&1 # real 2m28.318s # 137554843 bases of 2881515245 (4.774%) in intersection # running the swap to anoCar1 - instructions in anoCar1.txt cd /cluster/data/anoCar1/bed/blastz.hg18.swap time nice -n +19 featureBits anoCar1 chainHg18Link \ > fb.anoCar1.chainHg18Link.txt 2>&1 # real 3m16.810s # 112434396 bases of 1741478929 (6.456%) in intersection # reciprocal best net mafs for multiz 2008-10-30 - Hiram time nice -n +19 doRecipBest.pl hg18 anoCar1 > rbest.log 2>&1 & # this failed immediately: # cd /cluster/data/hg18/bed/blastz.anoCar1/axtChain # chainStitchId hg18.anoCar1.over.chain.gz stdout # chainSwap stdin stdout # chainSort stdin anoCar1.hg18.tBest.chain # t end mismatch -526389042 vs 10481870 line 1920305 of stdin # Command failed: # ssh -x kkr14u04 nice /cluster/data/hg18/bed/blastz.anoCar1/axtChain/doRecipBest.csh # but, then, when run locally on hgwdev, it proceeded just fine: time nice -n +19 ./doRecipBest.csh > doRecipBest.log 2>&1 & # real 175m54.202s doRecipBest.pl -continue=download hg18 anoCar1 ########################################################################## # UPDATED hg18.knownToVisiGene (DONE galt 2007-02-15) # after making sure hg18.vgAllProbes was up to date (see makeVisiGene.doc) ssh hgwdev knownToVisiGene hg18 -fromProbePsl=vgAllProbes ######################################################################### ## BLASTZ OTOGAR1 - Bushbaby - (2007-02-26 kate) # # NOTE: using masked sequence (unlike Brian Raney's alignments) cd /cluster/data/otoGar1 ln -s otoGar1.rmsk.2bit otoGar1.2bit mkdir -p /san/sanvol1/scratch/otoGar1 cp -p otoGar1.2bit chrom.sizes /san/sanvol1/scratch/otoGar1 ssh pk mkdir /cluster/data/hg18/bed/blastz.otoGar1.2007-02-26 cd /cluster/data/hg18/bed/blastz.otoGar1.2007-02-26 cat << '_EOF_' > DEF # human vs bushbaby # params from Hiram & Brian BLASTZ=blastz.v7.x86_64 # TARGET: Human Hg18 SEQ1_DIR=/san/sanvol1/scratch/hg18/hg18.sdTrf.2bit SEQ1_LEN=/cluster/data/hg18/chrom.sizes SEQ1_CHUNK=10000000 SEQ1_LAP=10000 
SEQ1_LIMIT=1 # QUERY: Bushbaby otoGar1 - single chunk big enough to run largest scaffold SEQ2_DIR=/san/sanvol1/scratch/otoGar1/otoGar1.2bit SEQ2_LEN=/cluster/data/otoGar1/chrom.sizes SEQ2_LIMIT=400 SEQ2_CHUNK=30000000 SEQ2_LAP=0 BASE=/cluster/data/hg18/bed/blastz.otoGar1.2007-02-26 TMPDIR=/scratch/tmp '_EOF_' # << emacs /cluster/bin/scripts/doBlastzChainNet.pl DEF \ -bigClusterHub=pk -smallClusterHub=pk \ -chainMinScore=3000 -chainLinearGap=medium \ >& do.log & tail -f do.log # problems on cluster -- stale NFS mounts and a routing problem # so batch failed with 4 retries. I restarted cluster run # with retries=8, and all finished except 38. These failed due # to output files existing; as the results look OK, I'm proceeding. para time > run.time /cluster/bin/scripts/doBlastzChainNet.pl DEF \ -continue=cat -bigClusterHub=pk \ -chainMinScore=3000 -chainLinearGap=medium \ >&! do2.log & tail -f do2.log # failed due to pre-existing liftOver chain from Brian's run /cluster/bin/scripts/doBlastzChainNet.pl DEF \ -continue=net -bigClusterHub=pk \ >&! do3.log & tail -f do3.log # reciprocal best net mafs for multiz ~/kent/src/hg/utils/automation/doRecipBest.pl hg18 otoGar1 >&! rbest.log & # Load net (2007-03-12 kate) ssh hgwdev cd /cluster/data/hg18/bed/blastz.otoGar1/axtChain netFilter -minGap=10 hg18.otoGar1.rbest.net.gz | \ hgLoadNet -warn hg18 netRBestOtoGar1 stdin ######################################################################### # BLASTZ/CHAIN/NET CAVPOR2 (IN PROGRESS 2007-03-06 kate) ssh kkstore02 mkdir /cluster/data/hg18/bed/blastz.cavPor2.2007-03-06 cd /cluster/data/hg18/bed/blastz.cavPor2.2007-03-06 cat << '_EOF_' > DEF # human vs. guinea pig # dynamic masking param BLASTZ_M=50 # TARGET: Human Hg18 SEQ1_DIR=/scratch/hg/hg18/nib SEQ1_LEN=/cluster/data/hg18/chrom.sizes SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY: Guinea pig cavPor2 # using cat-like params, as this has similar #scaffolds SEQ2_DIR=/san/sanvol1/scratch/cavPor2/cavPor2.2bit SEQ2_LEN=/san/sanvol1/scratch/cavPor2/chrom.sizes # Maximum number of scaffolds that can be lumped together # this makes ~200K jobs SEQ2_LIMIT=500 SEQ2_CHUNK=30000000 SEQ2_LAP=0 BASE=/cluster/data/hg18/bed/blastz.cavPor2.2007-03-06 '_EOF_' # << this line keeps emacs coloring happy doBlastzChainNet.pl DEF \ -bigClusterHub pk \ -chainMinScore=3000 -chainLinearGap=medium >& do.log & tail -f do.log # cluster brought down by site work # restart on 3/7 ssh pk cd /cluster/data/hg18/bed/blastz.cavPor2.2007-03-06/run.blastz para recover jobList jobList2 para make jobList2 >&! do2.log & para time > run.time # entire run took probably 36 hours cluster time ssh kkstore02 cd /cluster/data/hg18/bed/blastz.cavPor2.2007-03-06 /cluster/bin/scripts/doBlastzChainNet.pl DEF \ -bigClusterHub pk -continue=cat -stop cleanup \ -chainMinScore=3000 -chainLinearGap=medium >& do3.log & # reciprocal best net mafs for multiz ~/kent/src/hg/utils/automation/doRecipBest.pl hg18 cavPor2 >&! 
rbest.log & # load nets manually -- automated loading fails as classification info # not available (no database) ssh hgwdev cd /cluster/data/hg18/bed/blastz.cavPor2/axtChain netFilter -minGap=10 noClass.net | hgLoadNet -warn hg18 netCavPor2 stdin netFilter -minGap=10 hg18.cavPor2.rbest.net.gz | \ hgLoadNet -warn hg18 netRBestCavPor2 stdin ######################################################################### # BLASTZ/CHAIN/NET ERIEUR1 (IN PROGRESS 2007-03-08 kate) ssh kkstore02 mkdir /cluster/data/hg18/bed/blastz.eriEur1.2007-03-08 cd /cluster/data/hg18/bed/blastz.eriEur1.2007-03-08 cat << '_EOF_' > DEF # human vs. hedgehog # dynamic masking param BLASTZ_M=50 # TARGET: Human Hg18 SEQ1_DIR=/scratch/hg/hg18/nib SEQ1_LEN=/cluster/data/hg18/chrom.sizes SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY: hedgehog eriEur1 # using cat-like params, as this has similar #scaffolds SEQ2_DIR=/san/sanvol1/scratch/eriEur1/eriEur1.2bit SEQ2_LEN=/san/sanvol1/scratch/eriEur1/chrom.sizes # Maximum number of scaffolds that can be lumped together # this makes ~200K jobs SEQ2_LIMIT=500 SEQ2_CHUNK=30000000 SEQ2_LAP=0 BASE=/cluster/data/hg18/bed/blastz.eriEur1.2007-03-08 '_EOF_' # << this line keeps emacs coloring happy doBlastzChainNet.pl DEF \ -bigClusterHub pk \ -chainMinScore=3000 -chainLinearGap=medium >& do.log & tail -f do.log # Reciprocal best net mafs for multiz (kate) ssh kkstore02 cd /cluster/data/hg18/bed/blastz.eriEur1 ~/kent/src/hg/utils/automation/doRecipBest.pl hg18 eriEur1 >&! rbest.log & #GOT HERE # Load nets (2007-03-12 kate) ssh hgwdev cd /cluster/data/hg18/bed/blastz.dasNov1/axtChain netFilter -minGap=10 noClass.net | hgLoadNet -warn hg18 netDasNov1 stdin netFilter -minGap=10 hg18.dasNov1.rbest.net.gz | \ hgLoadNet -warn hg18 netRBestDasNov1 stdin ######################################################################### # BLASTZ/CHAIN/NET SORARA1 (IN PROGRESS 2007-03-08 kate) ssh kkstore02 mkdir /cluster/data/hg18/bed/blastz.sorAra1.2007-03-08 cd /cluster/data/hg18/bed/blastz.sorAra1.2007-03-08 cat << '_EOF_' > DEF # human vs. hedgehog # dynamic masking param BLASTZ_M=50 # TARGET: Human Hg18 SEQ1_DIR=/scratch/hg/hg18/nib SEQ1_LEN=/cluster/data/hg18/chrom.sizes SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY: shrew sorAra1 # using cat-like params, as this has similar #scaffolds SEQ2_DIR=/san/sanvol1/scratch/sorAra1/sorAra1.2bit SEQ2_LEN=/san/sanvol1/scratch/sorAra1/chrom.sizes # Maximum number of scaffolds that can be lumped together # this makes ~200K jobs SEQ2_LIMIT=500 SEQ2_CHUNK=30000000 SEQ2_LAP=0 BASE=/cluster/data/hg18/bed/blastz.sorAra1.2007-03-08 '_EOF_' # << this line keeps emacs coloring happy doBlastzChainNet.pl DEF \ -bigClusterHub pk \ -chainMinScore=3000 -chainLinearGap=medium >& do.log & tail -f do.log # stopped during load step due to missing database for classifying net # Reciprocal best net mafs for multiz (2007-03-12 kate) ssh kkstore02 cd /cluster/data/hg18/bed/blastz.sorAra1 ~/kent/src/hg/utils/automation/doRecipBest.pl hg18 sorAra1 >&! rbest.log & # GOT HERE # Load nets (2007-03-12 kate) ssh hgwdev cd /cluster/data/hg18/bed/blastz.sorAra1/axtChain netFilter -minGap=10 noClass.net | hgLoadNet -warn hg18 netSorAra1 stdin netFilter -minGap=10 hg18.sorAra1.rbest.net.gz | \ hgLoadNet -warn hg18 netRBestSorAra1 stdin ######################################################################### # BLASTZ ORNANA1 (PLATYPUS) - (DONE 2007-02-23, REDONE 2007-04-04 angie) # The first time around, the copy of ornAna1.2bit still had the pre-release -- # doh! 
# Differences are minuscule (a couple contigs' orientation changed),
# but redo just to get it 100% right.
# In the re-run, I changed SEQ2_LIMIT, which made the cluster run more
# efficient but had side-effects on the results because blastz's dynamic
# masking was applied differently (different groupings of sequences) --
# in retrospect, it would have been better to use the suboptimal SEQ2_LIMIT
# and have fewer differences to slog through.
ssh kkstore02
mkdir /cluster/data/hg18/bed/blastz.ornAna1.2007-04-02
cd /cluster/data/hg18/bed/blastz.ornAna1.2007-04-02
cat << '_EOF_' > DEF
# human vs. platypus
# Use same params as used for hg18-danRer4
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_M=50
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
# TARGET: hg18
SEQ1_DIR=/scratch/hg/hg18/hg18.2bit
SEQ1_LEN=/scratch/hg/hg18/chrom.sizes
SEQ1_CHUNK=20000000
SEQ1_LAP=10000
# QUERY: ornAna1
SEQ2_DIR=/iscratch/i/ornAna1/ornAna1.2bit
SEQ2_LEN=/iscratch/i/ornAna1/chrom.sizes
SEQ2_CHUNK=20000000
SEQ2_LIMIT=400
SEQ2_LAP=0
BASE=/cluster/data/hg18/bed/blastz.ornAna1.2007-04-02
TMPDIR=/scratch/tmp
'_EOF_'
# << emacs
doBlastzChainNet.pl DEF \
    -blastzOutRoot /cluster/bluearc/hg18.ornAna1 \
    >& do.log &
tail -f do.log
cd /cluster/data/hg18/bed/blastz.ornAna1.2007-04-02
time nice -n +19 doRecipBest.pl hg18 ornAna1 > rbest.log 2>&1 &
# real    238m22.247s
# worked OK

########################################################################
# 28-WAY VERTEBRATE MULTIZ (2007-03-20 kate)
ssh kkstore02
cd /cluster/data/hg18/bed
mkdir multiz28way.2007-03-20
ln -s multiz28way.2007-03-20 multiz28way
cd multiz28way
# start with 17way tree; update assemblies and add new species
mkdir tree
cd tree
cp /cluster/data/hg18/bed/multiz17way/tree.nh tree.asm.nh
# edit and create tree.28.nh, with Webb's assistance
echo `sed 's/[a-zA-Z0-9]*_//g' tree.asm.nh` > tree.28.nh
#
# create version for download that includes common names and assemblies
cp tree.asm.nh ../28way.nh
# edit
# create version for phyloGif program (replace spaces with commas)
cp 28way.gif /usr/local/apache/htdocs/images/phylo/hg18_28way.gif
# create species list
cd ..
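# (The pipeline below flattens the tree file into a sorted list of assembly
# names, one per line; the resulting species.28.lst includes hg18 itself.)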
sed -e 's/[()]//g' -e 's/ /\n/g' tree/tree.28.nh | \
    sed -e '/^$/d' | sort > species.28.lst
wc -l species.28.lst
ln -s species.28.lst species.lst

# Organisms: (N)ew, (U)pdated, (S)ame species since 17way:
#   U chimp (panTro2)
#   S rhesus (rheMac2)
#   -N bushbaby (otoGar1) "Otolemur garnetti" (galago) 2X
#   N tree_shrew (tupBel1) "Tupaia belangeri"
#   S rat (rn4)
#   S mouse (mm8)
#   -N guinea_pig (cavPor2) "Cavia porcellus" 2X
#   S rabbit (oryCun1) 2X
#   -N shrew (sorAra1) "Sorex araneus" 2X
#   -N hedgehog (eriEur1) "Erinaceus europaeus" 2X
#   S dog (canFam2)
#   N cat (felCat3) "Felis catus" 2X
#   -N horse (equCab1) "Equus caballus"
#   U cow (bosTau3)
#   S armadillo (dasNov1) "Dasypus novemcinctus" 2X
#   S elephant (loxAfr1) 2X
#   S tenrec (echTel1) 2X
#   S opossum (monDom4)
#   N platypus (ornAna1) "Ornithorhynchus anatinus"
#   U chicken (galGal3)
#   N lizard (anoCar1) "Anolis carolinensis" (Green Anole), Iguana family
#   U frog (xenTro2)
#   U fugu (fr2)
#   S tetraodon (tetNig1)
#   N stickleback (gasAcu1) "Gasterosteus aculeatus"
#   N medaka (oryLat1) "Oryzias latipes"
#   U zebrafish (danRer4)

ssh kkstore02
cd /cluster/data/hg18/bed/multiz28way
# verify all blastz's exist
cat > listMafs.csh << 'EOF'
foreach db (`cat species.lst`)
    set bdir = /cluster/data/hg18/bed/blastz.$db
    if (-e $bdir/mafRBestNet/chr1.maf.gz) then
        echo "$db mafRBestNet"
    else if (-e $bdir/mafSynNet/chr1.maf.gz) then
        echo "$db mafSynNet"
    else if (-e $bdir/mafNet/chr1.maf.gz) then
        echo "$db mafNet"
    else
        echo "$db mafs not found"
    endif
end
'EOF'

# gather chain stats
ssh hgwdev
cd /cluster/data/hg18/bed/multiz28way
cat > getChainStats.csh << 'EOF'
set species = $1
foreach db (`cat $species`)
    echo -n "${db} "
    set Db = `echo $db | perl -wpe 's/(.*)/\u$1/'`
    set fb = /cluster/data/hg18/bed/blastz.$db/fb.hg18.chain${Db}Link.txt
    if (! -e $fb || -z $fb ) then
        nice featureBits hg18 chain${Db}Link >& $fb
    endif
    sed 's/.*(\(.*\)).*/\1/' $fb
end
'EOF'
# << happy emacs
csh getChainStats.csh species.lst >&! species.chainStats

# Maf types:
#   2X mammals -> reciprocal best net
#   high cov placental mammals and opossum -> syntenic net
#   other -> standard net
csh listMafs.csh > listMafs.log &
cat listMafs.log

# add links of the form blastz.<db> -> blastz.<db>.<date> dirs:
cd /cluster/data/hg18/bed
ln -s blastz.fr2.2007-01-24 blastz.fr2
ln -s blastz.ornAna1.2007-02-21 blastz.ornAna1
ln -s blastz.oryLat1.swap blastz.oryLat1

# copy net mafs to cluster-friendly storage, splitting chroms
# into 50MB chunks to improve run-time
# NOTE: splitting will be different for scaffold-based reference assemblies
ssh hgwdev
cd /cluster/data/hg18/bed/multiz28way
mkdir run.split
cd run.split
mafSplitPos hg18 50 mafSplit.bed
ssh kki
cd /cluster/data/hg18/bed/multiz28way
cd run.split
cat << 'EOF' > doSplit.csh
#!/bin/csh -ef
set db = $1
set sdir = /san/sanvol1/scratch/hg18/splitStrictMafNet
mkdir -p $sdir
if (-e $sdir/$db) then
    echo "directory $sdir/$db already exists -- remove and retry"
    exit 1
endif
set bdir = /cluster/data/hg18/bed/blastz.$db
if (!
-e $bdir) then echo "directory $bdir not found" exit 1 endif mkdir -p $sdir/$db if (-e $bdir/mafRBestNet) then set mdir = $bdir/mafRBestNet else if (-e $bdir/mafSynNet) then set mdir = $bdir/mafSynNet else if (-e $bdir/mafNet) then set mdir = $bdir/mafNet else echo "$bdir maf dir not found" exit 1 endif echo $mdir foreach f ($mdir/*) set c = $f:t:r:r echo " $c" nice mafSplit mafSplit.bed $sdir/$db/ $f end echo "gzipping $sdir/$db mafs" nice gzip $sdir/$db/* endif echo $mdir > $db.done 'EOF' # << happy emacs chmod +x doSplit.csh grep -v hg18 ../species.28.lst > split.lst cat > spec << 'EOF' #LOOP doSplit.csh $(path1) {check out line+ $(path1).done} #ENDLOOP 'EOF' gensub2 split.lst single spec jobList para create jobList # 24 jobs para try para check para push # till complete para time >&! run.time # 30 minutes # run multiz ssh pk cd /cluster/data/hg18/bed/multiz28way mkdir -p maf run cd run mkdir penn # use latest penn utilities set PENN_BIN = /cluster/bin/penn/multiz.v11.2007-03-19 cp -p $PENN_BIN/{autoMZ,multiz,maf_project} penn # list chrom chunks, any db dir will do; better would be for the # splitter to generate this file # We temporarily use __ instead of . to delimit chunk in filename # so we can use $(root) to get basename set mdir = /san/sanvol1/scratch/hg18/splitStrictMafNet ls $mdir/fr2 | sed -e 's/.maf.gz//' -e 's/\./__/' > chromChunks.lst wc -l chromChunks.lst # 93 cat > autoMultiz.csh << 'EOF' #!/bin/csh -ef set db = hg18 set c = $1 set maf = $2 set run = `pwd` set tmp = /scratch/tmp/$db/multiz.$c set pairs = /san/sanvol1/scratch/$db/splitStrictMafNet rm -fr $tmp mkdir -p $tmp cp ../tree/tree.28.nh ../species.28.lst $tmp pushd $tmp foreach s (`cat species.28.lst`) set c2 = `echo $c | sed 's/__/./'` set in = $pairs/$s/$c2.maf set out = $db.$s.sing.maf if ($s == hg18) then continue endif if (-e $in.gz) then zcat $in.gz > $out else if (-e $in) then cp $in $out else echo "##maf version=1 scoring=autoMZ" > $out endif end set path = ($run/penn $path); rehash $run/penn/autoMZ + T=$tmp E=$db "`cat tree.28.nh`" $db.*.sing.maf $c.maf popd cp $tmp/$c.maf $maf rm -fr $tmp 'EOF' # << happy emacs chmod +x autoMultiz.csh cat << 'EOF' > spec #LOOP ./autoMultiz.csh $(root1) {check out line+ /cluster/data/hg18/bed/multiz28way/maf/$(root1).maf} #ENDLOOP 'EOF' # << emacs gensub2 chromChunks.lst single spec jobList para create jobList # 93 jobs para try para check para push para time > run.time # 4 hours! (~9 min/species) # load tables for a look ssh hgwdev mkdir -p /gbdb/hg18/multiz28way/maf ln -s /cluster/data/hg18/bed/multiz28way/maf/*.maf \ /gbdb/hg18/multiz28way/maf cd /cluster/data/hg18/bed/multiz28way cat > loadMaf.csh << 'EOF' date hgLoadMaf -pathPrefix=/gbdb/hg18/multiz28way/maf hg18 multiz28way # load summary table cat maf/*.maf | nice hgLoadMafSummary hg18 -minSize=30000 -mergeGap=1500 \ -maxSize=200000 multiz28waySummary stdin 'EOF' csh loadMaf.csh >&! 
loadMaf.log & # look at coverage ssh kkstore02 cd /cluster/data/hg18/bed/multiz25wayStrict mkdir mafCov cd mafCov cat ../maf/chr7__*.maf | nice mafRanges stdin hg18 \ -otherDb=canFam2 chr7.canFam2.bed | bedSort > chr7.canFam2.bed echo canFam2 > species.lst cat ../maf/chr7__*.maf | mafSpeciesSubset stdin species.lst stdout | \ mafToAxt stdin hg18 canFam2 stdout | \ axtToPsl stdin /cluster/data/hg18/chrom.sizes \ /cluster/data/canFam2/chrom.sizes chr7.canFam2.psl cat ../maf/chr7__*.maf | nice mafRanges stdin hg18 -otherDb=oryCun1 chr7.oryCun1.bed cat ../maf/chr7__*.maf | nice mafRanges stdin hg18 -otherDb=tetNig1 chr7.tetNig1.bed ssh hgwdev cd /cluster/data/hg18/bed/multiz25wayStrict/mafCov # canFam2 syntenic net vs standard net nice featureBits hg18 -chrom=chr7 chr7.canFam2.bed # 82967535 bases of 154952424 (53.544%) in intersection nice featureBits hg18 -chrom=chr7 ../../multiz25way/mafCov/chr7.canFam2.bed # 86391682 bases of 154952424 (55.754%) in intersection nice featureBits hg18 -chrom=chr7 ../../multiz17way/mafCov/chr7.canFam2.bed # 86248995 bases of 154952424 (55.662%) in intersection # compare using another method cat ../maf/chr7__*.maf | mafSpeciesSubset stdin species.lst chr7.canFam2.maf mafToAxt chr7.canFam2.maf hg18 canFam2 chr7.canFam2.axt axtToPsl chr7.canFam2.axt /cluster/data/hg18/chrom.sizes \ /cluster/data/canFam2/chrom.sizes chr7.canFam2.psl nice featureBits hg18 -chrom=chr7 chr7.canFam2.psl # 75497734 bases of 154952424 (48.723%) in intersection # oryCun1 reciprocal best net vs standard net nice featureBits hg18 -chrom=chr7 chr7.oryCun1.bed # 53157578 bases of 154952424 (34.306%) in intersection nice featureBits hg18 -chrom=chr7 ../../multiz25way/mafCov/chr7.oryCun1.bed # 56858022 bases of 154952424 (36.694%) in intersection # tetNig1 both used standard net nice featureBits hg18 -chrom=chr7 chr7.tetNig1.bed # 2905058 bases of 154952424 (1.875%) in intersection nice featureBits hg18 -chrom=chr7 ../../multiz25way/mafCov/chr7.tetNig1.bed # 2901708 bases of 154952424 (1.873%) in intersection # NOTE: Next time concatenate split mafs before proceeding further # Gap Annotation # prepare bed files with gap info ssh kkstore02 cd /cluster/data/hg18/bed/multiz28way mkdir anno cd anno mkdir maf run cd run cat > doNBed.csh << 'EOF' foreach db (`cat species.lst`) echo -n "$db " set cdir = /cluster/data/$db if (! -e $cdir/$db.N.bed) then echo "creating N.bed" twoBitInfo -nBed $cdir/$db.2bit $cdir/$db.N.bed else echo "" endif end 'EOF' csh doNBed.csh >&! 
doNBed.log & rm -f nBeds sizes foreach db (`grep -v hg18 ../../species.lst`) echo "$db " ln -s /cluster/data/$db/$db.N.bed $db.bed echo $db.bed >> nBeds ln -s /cluster/data/$db/chrom.sizes $db.len echo $db.len >> sizes end ssh kki cd /cluster/data/hg18/bed/multiz28way/anno/run cat > doAnno.csh << 'EOF' #!/bin/csh -ef set dir = /cluster/data/hg18/bed/multiz28way set c = $1 cat $dir/maf/${c}__*.maf | \ nice mafAddIRows -nBeds=nBeds -sizes=sizes stdin \ /cluster/data/hg18/hg18.2bit $2 'EOF' #<< happy emacs chmod +x doAnno.csh cat > spec << 'EOF' #LOOP ./doAnno.csh $(root1) {check out line+ /cluster/data/hg18/bed/multiz28way/anno/maf/$(root1).maf} #ENDLOOP 'EOF' #<< happy emacs awk '{print $1}' /cluster/data/hg18/chrom.sizes > chroms.lst gensub2 chroms.lst single spec jobList para create jobList para try ssh hgwdev cd /cluster/data/hg18/bed/multiz28way/anno mkdir -p /gbdb/hg18/multiz28way/anno/maf ln -s /cluster/data/hg18/bed/multiz28way/anno/maf/*.maf \ /gbdb/hg18/multiz28way/anno/maf cat > loadMaf.csh << 'EOF' date nice hgLoadMaf -pathPrefix=/gbdb/hg18/multiz28way/anno/maf \ hg18 multiz28wayAnno date cat maf/*.maf | \ nice hgLoadMafSummary hg18 -minSize=30000 -mergeGap=1500 \ -maxSize=200000 multiz28wayAnnoSummary stdin date 'EOF' csh loadMaf.csh >& loadMaf.log & # NOTE: rebuilt hgLoadMafSummary to exclude chroms<1MB (2007-06-21 kate) ######################################################################## # ANNOTATE 28-WAY ALIGNMENT WITH QUALITY DATA (2007-06-11 rico at bx.psu.edu) # # The basic idea here is to create a qac file which has quality data for each # (chromosome/scaffold/etc) and then index the qac file. Once this is done, # mafAddQRows can be used to add the quality data to a given maf. The agp # files are used so that gaps can be represented in the qac files as a special # value. ## create .qac and .qdx files for each species in the 28-way alignment o human (hg18) Unable to find quality data. o chimp (panTro2) /cluster/data/panTro2/bed/quality/qac/*.qac /cluster/data/panTro2/wustl/*.agp qacAddGapIdx in.agp in.qac panTro2.qac panTro2.qdx o rhesus (rheMac2) /cluster/data/rheMac2/qual/foo.qv /cluster/data/rheMac2/downloads/foo.agp qacAddGapIdx in.agp in.qac rheMac2.qac rheMac2.qdx o bushbaby (otoGar1) http://www.broad.mit.edu/ftp/pub/assemblies/mammals/bushbaby/otoGar1 assembly.agp Draft_v1.agp.chromosome.qual.gz qaAgpToQacIdx assembly.agp Draft_v1.agp.chromosome.qual.gz otoGar1.qac otoGar1.qdx o treeshrew (tupBel1) http://www.broad.mit.edu/ftp/pub/assemblies/mammals/treeShrew/tupBel1 assembly.agp Draft_v1.agp.chromosome.qual.gz qaAgpToQacIdx assembly.agp Draft_v1.agp.chromosome.qual.gz tupBel1.qac tupBel1.qdx o rat (rn4) /cluster/data/rn4/downloads/foo.qual /cluster/data/rn4/CHROM/foo.agp qacAddGapIdx in.agp in.qac rn4.qac rn4.qdx o mouse (mm8) Unable to find quality data. 
o guinea pig (cavPor2) http://www.broad.mit.edu/ftp/pub/assemblies/mammals/guineaPig/cavPor2 assembly.agp Draft_v2.agp.chromosome.qual.gz qaAgpToQacIdx assembly.agp Draft_v2.agp.chromosome.qual.gz cavPor2.qac cavPor2.qdx o rabbit (oryCun1) http://www.broad.mit.edu/ftp/pub/assemblies/mammals/rabbit/oryCun1 assembly.agp Draft_v1.agp.chromosome.qual.gz qaAgpToQacIdx assembly.agp Draft_v1.agp.chromosome.qual.gz oryCun1.qac oryCun1.qdx o shrew (sorAra1) http://www.broad.mit.edu/ftp/pub/assemblies/mammals/commonShrew/sorAra1 assembly.agp Draft_v1.agp.chromosome.qual.gz qaAgpToQacIdx assembly.agp Draft_v1.agp.chromosome.qual.gz sorAra1.qac sorAra1.qdx o hedgehog (eriEur1) http://www.broad.mit.edu/ftp/pub/assemblies/mammals/hedgehog/eriEur1 assembly.agp Draft_v1.agp.chromosome.qual.gz qaAgpToQacIdx assembly.agp Draft_v1.agp.chromosome.qual.gz eriEur1.qac eriEur1.qdx o dog (canFam2) /cluster/data/canFam2/bed/quality/chrom.qac /cluster/data/canFam2/broad/foo.agp qacAddGapIdx in.agp in.qac canFam2.qac canFam2.qdx o cat (felCat3) /cluster/data/felCat3/downloads/assembly.agp /cluster/data/felCat3/downloads/Draft_v3.agp.chromosome.qual.gz qaAgpToQacIdx assembly.agp Draft_v3.agp.chromosome.qual.gz felCat3.qac felCat3.qdx o horse (equCab1) /cluster/data/equCab1/downloads/assembly.agp /cluster/data/equCab1/downloads/Draft_v1.agp.chromosome.qual.gz qaAgpToQacIdx assembly.agp Draft_v1.agp.chromosome.qual.gz equCab1.qac equCab1.qdx o cow (bosTau3) /cluster/data/bosTau3/baylor/chroms/foo.qual /cluster/data/bosTau3/baylor/foo.agp qacAddGapIdx in.agp in.qac bosTau3.qac bosTau3.qdx o armadillo (dasNov1) http://www.broad.mit.edu/ftp/pub/assemblies/mammals/armadillo/dasNov1 assembly.agp assembly.quals.gz combineQuals assembly.agp assembly.quals.gz combined.quals qaAgpToQacIdx assembly.agp combined.quals.gz dasNov1.qac dasNov1.qdx o elephant (loxAfr1) http://www.broad.mit.edu/ftp/pub/assemblies/mammals/elephant/loxAfr1 assembly.agp assembly.quals.gz combineQuals assembly.agp assembly.quals.gz combined.quals qaAgpToQacIdx assembly.agp combined.quals.gz loxAfr1.qac loxAfr1.qdx o tenrec (echTel1) http://www.broad.mit.edu/ftp/pub/assemblies/mammals/tenrec/echTel1 assembly.agp Draft_v1.agp.chromosome.qual.gz qaAgpToQacIdx assembly.agp Draft_v1.agp.chromosome.qual.gz echTel1.qac echTel1.qdx o opossum (monDom4) /cluster/data/monDom4/broad.mit.edu/foo.qac /cluster/data/monDom4/broad.mit.edu/foo.agp qacAddGapIdx in.agp in.qac monDom4.qac monDom4.qdx o platypus (ornAna1) /cluster/data/ornAna1 agp files are present, but there are no quality files o chicken (galGal3) Unable to find quality data. o lizard (anoCar1) /cluster/data/anoCar1/downloads/assembly.agp /cluster/data/anoCar1/downloads/scaffold.lifted.qac qacAddGapIdx in.agp in.qac anoCar1.qac anoCar1.qdx o frog (xenTro2) Unable to find quality data. o tetraodon (tetNig1) Unable to find quality data. o fugu (fr2) Unable to find quality data. o stickleback (gasAcu1) /cluster/data/gasAcu1/downloads/foo.agp /cluster/data/gasAcu1/downloads/foo.qual qacAddGapIdx in.agp in.qac gasAcu1.qac gasAcu1.qdx o medaka (oryLat1) /cluster/data/oryLat1/bed/qual/foo.qual /cluster/data/oryLat1/downloads/foo.agp qacAddGapIdx in.agp in.qac oryLat1.qac oryLat1.qdx o zebrafish (danRer4) Unable to find quality data. 
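## To summarize the two command patterns used above (a sketch only -- the
## "assembly.agp", "in.qac" and "quality.qual.gz" arguments are placeholders,
## not actual file names):
##   existing UCSC-style .qac quality archive:
##     qacAddGapIdx  assembly.agp in.qac            db.qac db.qdx
##   Broad-style per-chromosome .qual file:
##     qaAgpToQacIdx assembly.agp quality.qual.gz   db.qac db.qdx
## Both produce the db.qac/db.qdx pair that mafAddQRows reads; the AGP file
## lets assembly gaps be encoded in the qac as a special quality value.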
## NOTE quality data for chrM needed: dog, guineapig, horse, hedgehog, stickleback, medaka, rat quality data for chrUn needed: medaka ## copy all .qac and .qdx files to the san cp *.qac *.qdx /san/sanvol1/rico/quality ## create species list (species.lst) containing the following anoCar1 /san/sanvol1/rico/quality bosTau3 /san/sanvol1/rico/quality canFam2 /san/sanvol1/rico/quality cavPor2 /san/sanvol1/rico/quality dasNov1 /san/sanvol1/rico/quality echTel1 /san/sanvol1/rico/quality equCab1 /san/sanvol1/rico/quality eriEur1 /san/sanvol1/rico/quality felCat3 /san/sanvol1/rico/quality gasAcu1 /san/sanvol1/rico/quality loxAfr1 /san/sanvol1/rico/quality monDom4 /san/sanvol1/rico/quality oryCun1 /san/sanvol1/rico/quality oryLat1 /san/sanvol1/rico/quality otoGar1 /san/sanvol1/rico/quality panTro2 /san/sanvol1/rico/quality rheMac2 /san/sanvol1/rico/quality rn4 /san/sanvol1/rico/quality sorAra1 /san/sanvol1/rico/quality tupBel1 /san/sanvol1/rico/quality ## the following script will add quality data to each of the mafs cat > addQData << 'EOF' #!/bin/sh INPUT_DIR=/cluster/data/hg18/bed/multiz28way/anno/maf OUTPUT_DIR=/cluster/store12/rico/hg18/bed/multiz28way/qual/maf for maf in `ls -1Sr ${INPUT_DIR}/*.maf` do file=`basename $maf` mafAddQRows species.lst $maf ${OUTPUT_DIR}/$file done 'EOF' # Gene frames ssh hgwdev cd /cluster/data/hg18/bed/multiz28way mkdir frames cd frames cat > showGenes.csh << 'EOF' foreach db (`grep -v hg18 ../species.lst`) echo " $db" echo -n "Tables: " set tables = `hgsql $db -N -e "show tables like '%Gene%'"` foreach table ($tables) if ($table == "ensGene" || $table == "refGene" || $table == "mgcGenes" || \ $table == "knownGene") then echo -n "${table}: " hgsql $db -N -e "select count(*) from $table" endif end echo -n "Mrnas: " set orgName = `hgsql hgcentraltest -N -e \ "select scientificName from dbDb where name='$db'"` set orgId = `hgsql hg18 -N -e \ "select id from organism where name='$orgName'"` if ($orgId == "") then echo "0" else hgsql hg18 -N -e "select count(*) from gbCdnaInfo where organism=$orgId" endif end 'EOF' # based on output, pick gene tables, according to the following criteria: # KG if present, else refGene if >10000 entries, else ensGene (unless dog), # else mgcGenes, else mrnas if > 10000 else none. In all cases # except none, add in refGene. hg18: knownGene bosTau3: mrna canFam2: mrna cavPor2: mrna danRer4: refGene (13K) or ensGene (36K ?) equCab1: mrna fr2: ensGene galGal3: mrna gasAcu1: ensGene mm8: knownGene monDom4: ensGene oryCun1: mrna panTro2: refGene rheMac2: ensGene rn4: knownGene ? (8K) or refGene (10K) or ensGene(34K) ? tetNig1: mrna xenTro2: mrna # get the genes for all genomes # mRNAs with CDS. 
single select to get cds+psl, then split that up and # create genePred # using mrna table as genes: bostau3, canFam2, cavPor2, equCab1, galGal3, oryCun1, tetNig1, xenTro2 cat > getGenes.csh << 'EOF' rm -fr genes mkdir -p genes #set mrnaDbs = "bosTau3 canFam2 cavPor2 equCab1 galGal3 oryCun1 tetNig1 xenTro2" # use only those with databases for now set mrnaDbs = "bosTau3 canFam2 equCab1 galGal3 oryCun1 tetNig1 xenTro2" foreach queryDb ($mrnaDbs) set tmpExt = `mktemp temp.XXXXXX` set tmpMrnaCds = ${queryDb}.mrna-cds.${tmpExt} set tmpMrna = ${queryDb}.mrna.${tmpExt} set tmpCds = ${queryDb}.cds.${tmpExt} echo $queryDb hgsql -N -e 'select all_mrna.qName,cds.name,all_mrna.* \ from all_mrna,gbCdnaInfo,cds \ where (all_mrna.qName = gbCdnaInfo.acc) and \ (gbCdnaInfo.cds != 0) and (gbCdnaInfo.cds = cds.id)' \ $queryDb > ${tmpMrnaCds} cut -f 1-2 ${tmpMrnaCds} > ${tmpCds} cut -f 4-100 ${tmpMrnaCds} > ${tmpMrna} mrnaToGene -cdsFile=${tmpCds} -smallInsertSize=8 -quiet ${tmpMrna} stdout | \ genePredSingleCover stdin stdout | gzip -2c > /scratch/tmp/$queryDb.tmp.gz rm ${tmpMrnaCds} ${tmpMrna} ${tmpCds} mv /scratch/tmp/$queryDb.tmp.gz genes/$queryDb.gp.gz rm -f $tmpExt end # using knownGene for rn4 mm8 hg18 # using refGene for panTro2 # using ensGene for danRer4, fr2, gasSAcu1, monDom4, rheMac2 # genePreds; (must keep only the first 10 columns for knownGene) set geneDbs = "hg18 mm8 rn4 danRer4 panTro2 fr2 gasAcu1 monDom4 rheMac2" foreach queryDb ($geneDbs) if ($queryDb == "danRer4" || $queryDb == "fr2" || $queryDb == "gasAcu1" || \ $queryDb == "monDom4" || $queryDb == "rheMac2") then set geneTbl = ensGene else if ($queryDb == "panTro2") then set geneTbl = refGene else if ($queryDb == "hg18" || $queryDb == "mm8" || $queryDb == "rn4") then set geneTbl = knownGene endif hgsql -N -e "select name,chrom,strand,txStart,txEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds from $geneTbl" ${queryDb} \ | genePredSingleCover stdin stdout | gzip -2c \ > /scratch/tmp/$queryDb.tmp.gz mv /scratch/tmp/$queryDb.tmp.gz genes/$queryDb.gp.gz end 'EOF' csh getGenes.csh >&! getGenes.log & ssh kkstore02 cd /cluster/data/hg18/bed/multiz28way/frames # leaving out cavPor2 (no db) and tetNig1 (too few gene preds) (cat ../maf/*.maf | nice genePredToMafFrames hg18 stdin stdout bosTau3 genes/bosTau3.gp.gz canFam2 genes/canFam2.gp.gz danRer4 genes/danRer4.gp.gz fr2 genes/fr2.gp.gz galGal3 genes/galGal3.gp.gz hg18 genes/hg18.gp.gz mm8 genes/mm8.gp.gz oryCun1 genes/oryCun1.gp.gz panTro2 genes/panTro2.gp.gz rheMac2 genes/rheMac2.gp.gz rn4 genes/rn4.gp.gz xenTro2 genes/xenTro2.gp.gz gasAcu1 genes/gasAcu1.gp.gz monDom4 genes/monDom4.gp.gz equCab1 genes/equCab1.gp.gz | gzip > multiz28way.mafFrames.gz) >& frames.log & ssh hgwdev cd /cluster/data/hg18/bed/multiz28way/frames nice hgLoadMafFrames hg18 multiz28wayFrames multiz28way.mafFrames.gz >& loadFrames. log & # from 17way: hg18 = knownGene rn4 = knownGene mm8 = knownGene panTro1 = ensGene rheMac2 = mrna oryCun1 = mrna #dasNov1 = canFam2 = mrna #loxAfr1 = bosTau2 = mrna #echTel1 = #monDom4 = galGal2 = refGene xenTro1 = mgcGenes #tetNig1 = fr1 = ensGene danRer3 = mrna ############################################################################ # PHASTCONS FOR 28WAY (2007-04-04 kate) # generate tree model with branch lengths using phyloFit from Adam # Siepel's # phastCons package. Input is 28way alignments of # 4-fold degenerate sites (4d sites) determined from a # nonredundant (non-overlapping) gene set. 
Elliott Margulies # has a perl script (extract_coding_alignments.pl) that he used # with the ENCODE alignments. # Adam uses his msa_view tool with the --4d option. # For first try, use Gencode Oct '05 reference set filtered # to longest transcript, then lifted to hg18 # Compare results with hgClusterGenes and /cluster/bin/phast/refeature, # and genePredSingleCover hgsql hg17 -N -e 'select * from encodeGencodeGeneKnownOct05' > gencodeKnown.gp wc -l gencodeKnown.gp # 2608 gencodeKnown.gp hgsql hg17 -N -e "select count(*) from encodeGencodeGeneKnownOct05 where cdsStart <> 0 and cdsEnd <> 0" # 1097 hgsql hg17 -N -e "select count(*) from encodeGencodeGeneKnownOct05 where cdsStartStat='cmpl' and cdsEndStat='cmpl'" # 752 # Jim's gene uniquifier hgClusterGenes -noProt hg17 encodeGencodeGeneKnownOct05 \ encodeGencodeGeneKnownOct05Clusters encodeGencodeGeneKnownOct05Canonical # Got 457 clusters, from 2608 genes in 46 chromosomes hgsql hg17 -N -e "select transcript from encodeGencodeGeneKnownOct05Canonical order by transcript" > genes.jim # Adam's feature uniquifier # requires cdsStart and cdsEnd in gene pred hgsql hg17 -N -e 'select * from encodeGencodeGeneKnownOct05 where cdsStart<>0 and cdsEnd <> 0' > gencodeKnownCds.gp wc -l gencodeKnownCds.gp # 1097 gencodeKnownCds.gp /cluster/bin/phast/refeature --unique gencodeKnownCds.gp > \ gencodeKnownCdsNR.gff awk '{print $10}' gencodeKnownCdsNR.gff | sort | uniq | wc -l # 333 /cluster/bin/phast/refeature -o genepred --unique \ gencodeKnownCds.gp | sort > gencodeKnownCdsNR.gp wc -l gencodeKnownCdsNR.gp # 333 awk '{print $1}' gencodeKnownCdsNR.gp | sort > genes.adam # get intersection comm -1 -2 genes.jim genes.adam > genes.both wc -l genes.both # 235 # genePredSingleCover filters but leaves extended gene pred genePredSingleCover gencodeKnownCds.gp stdout | sort > gencodeKnownCdsNR2.gp wc -l gencodeKnownCdsNR2.gp # 423 awk '{print $1}' gencodeKnownCdsNR2.gp | sort > genes.scov comm -1 -2 genes.scov genes.both > genes.all wc -l genes.all # 224 -- all 3 methods picked these liftOver -genePred gencodeKnownCdsNR2.gp \ /cluster/data/hg17/bed/liftOver/hg17ToHg18.over.chain.gz \ gencodeKnown.hg18.gp unmapped.gp genePredCheck gencodeKnown.hg18.gp # checked: 423 failed: 0 # all genes mapped # consider using only intersection of above 3 methods grep chr22 gencodeKnown.hg18.gp > gencodeKnown.hg18.chr22.gp /cluster/bin/phast/msa_view --4d --features gencodeKnown.hg18.chr22.gp \ -i MAF ../maf/chr22__0.maf > chr22.mfa # extract ENCODE regions from MAF's ssh hgwdev cd /cluster/data/hg18/bed/multiz28way/4d hgsql hg18 -N -e \ "select chrom, chromStart, chromEnd, name from encodeRegions" \ > encodeRegions.bed ssh kkstore02 cd /cluster/data/hg18/bed/multiz28way/4d cat > encodeMafs.csh << 'EOF' mkdir -p encodeMafs set chroms = `awk '{print $1}' encodeRegions.bed | sort | uniq` foreach c ($chroms) echo $c # needed till mafsInRegion is fixed to handle split maf files cat ../maf/${c}__?.maf > $c.maf awk -v CHR=$c '$1 == CHR {print}' encodeRegions.bed > regions.bed mafsInRegion regions.bed -outDir encodeMafs/ $c.maf end 'EOF' csh encodeMafs.csh >&! 
encodeMafs.log & # try it out on a few regions set r = "ENm001" set r = "ENr231" perl -wpe 's/^s ([^.]+)\.\S+/s $1/' encodeMafs/$r.maf > $r.clean.maf # generate ss file /cluster/bin/phast/msa_view --4d --features gencodeKnown.hg18.gp \ -i MAF $r.clean.maf -o SS > $r.4d.3.ss /cluster/bin/phast/msa_view -i SS -o FASTA $r.4d.3.ss > $r.4d.3.mfa /cluster/bin/phast/msa_view -i SS --tuple-size 1 $r.4d.3.ss -o SS > $r.4d.1.ss /cluster/bin/phast/msa_view -i SS -o FASTA $r.4d.1.ss > $r.4d.1.mfa # now on all regions cat > encode4d.csh << 'EOF' mkdir mfa4d foreach f (encodeMafs/*.maf) set r = $f:t:r echo $r perl -wpe 's/^s ([^.]+)\.\S+/s $1/' $f > clean.maf /cluster/bin/phast/msa_view --4d --features gencodeKnown.hg18.gp \ -i MAF clean.maf -o SS | \ /cluster/bin/phast/msa_view -i SS --tuple-size 1 - > mfa4d/$r.4d.mfa # remove empties to satisfy msa_view --aggregate if (-z mfa4d/$r.4d.mfa) then rm mfa4d/$r.4d.mfa endif end 'EOF' csh encode4d.csh >&! encode4d.log & set species1 = `sed 's/$/,/g' ../species.lst` set species = `echo $species1 | sed -e 's/ //g' -e 's/,$//'` # From Elliott's script: #/cluster/bin/phast/msa_view --aggregate $species EN*.mfa | \ #sed s/"> "/">"/ > some-4d_align.mfa /cluster/bin/phast/msa_view --aggregate $species mfa4d/EN*.4d.mfa | \ sed s/"> "/">"/ > all-4d_align.mfa # tweak input tree -- remove common names, include commas sed 's/[a-z][a-z]*_//g' ../tree/tree.web.commas.nh > tree.commas.nh # From Elliott's script with Adam's mods (use --EM, MED) /cluster/bin/phast/phyloFit --EM --precision MED \ --msa-format FASTA --subst-mod REV \ --tree tree.commas.nh all-4d_align.mfa grep TREE phyloFit.mod | sed 's/TREE\:\ //' > tree_4d.28way.nh /cluster/bin/phast/tree_doctor --dissect tree_4d.28way.nh | \ awk '$1 == "dparent" {x += $3} END {print x}' # 9.0516 # extract species distances /cluster/bin/phast/all_dists tree_4d.28way.nh > 28way.distances.txt grep hg18 28way.distances.txt | sort -k3,3n | \ awk '{printf ("%.4f\t%s\n", $3, $2)}' > distances.txt # get chain stats ordered by distance awk '{print $2}' distances.txt > species.byDistance csh ../getChainStats.csh species.byDistance >&! 
species.chainStats # spruce up names for tree drawing /cluster/bin/phast/tree_doctor \ --rename="hg18 -> human ; panTro2 -> chimp ; rheMac2 -> macaque ; otoGar1 -> bushbaby ; tupBel1 -> tree_shrew ; rn4 -> rat ; mm8 -> mouse ; cavPor2 -> guinea_pig ; oryCun1 -> rabbit ; sorAra1 -> shrew ; eriEur1 -> hedgehog ; canFam2 -> dog ; felCat3 -> cat ; equCab1 -> horse ; bosTau3 -> cow ; dasNov1 -> armadillo ; loxAfr1 -> elephant ; echTel1 -> tenrec ; monDom4 -> opossum ; ornAna1 -> platypus ; galGal3 -> chicken ; anoCar1 -> lizard ; xenTro2 -> frog ; tetNig1 -> tetraodon ; fr2 -> fugu ; gasAcu1 -> stickleback ; oryLat1 -> medaka ; danRer4 -> zebrafish" \ tree_4d.28way.nh > tree_4d.28way.common.nh # compare to Elliott's latest ENCODE tree, pruned to match /cluster/bin/phast/tree_doctor \ --prune-all-but=human,chimp,macaque,galago,rat,mouse,guinea_pig,rabbit,cow,cat,dog,hedgehog,shrew,armadillo,elephant,tenrec,monodelphis,platypus,chicken,xenopus \ --rename="xenopus -> frog ; galago -> bushbaby; monodelphis -> opossum"\ encode2007.nh > encode2007.pruned.nh # my 4d tree with only species in the pruned ENCODE tree /cluster/bin/phast/tree_doctor \ --prune-all-but=human,chimp,macaque,bushbaby,rat,mouse,guinea_pig,rabbit,cow,cat,dog,hedgehog,shrew,armadillo,elephant,tenrec,opossum,platypus,chicken,frog \ tree_4d.28way.common.nh > tree_4d.20way.common.nh # Create chrom mafs from split mafs (do this earlier next time) ssh kki cd /cluster/data/hg18/bed/multiz28way mkdir chromMaf mkdir run.merge cd run.merge cat > doMerge.csh << 'EOF' #!/bin/csh -ef set c = $1 set cmaf = ../chromMaf/${c}.maf # NOTE: need to change mafFilter to retain (and uniquify) comments # begin with ##maf header head -1 ../maf/${c}__0.maf > $cmaf grep -h '# ' ../maf/${c}__?.maf | sed 's/\/scratch\/tmp.* //' | sort | uniq \ >> $cmaf # don't filter out blocks with alignment this time -- might be needed # for symmetry with irows version, or for analysis. Check on this. 
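# (mafFilter with -minRow=1 passes blocks through even when only the
#  reference row aligned, so per the note above, single-species blocks are
#  retained while the chunk mafs are concatenated into the chrom maf.)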
mafFilter -minRow=1 ../maf/${c}__?.maf >> $cmaf 'EOF' # << happy emacs chmod a+x doMerge.csh cat > spec << 'EOF' #LOOP ./doMerge.csh $(root1) {check out line+ ../chromMaf/$(root1).maf} #ENDLOOP 'EOF' # << happy emacs awk '{print $1}' /cluster/data/hg18/chrom.sizes > chrom.lst gensub2 chrom.lst single spec jobList para create jobList # 49 jobs para try para check para push # Split chromosome MAF's into windows and use to generate # "sufficient statistics" (ss) files for phastCons input # large mem jobs so use mini-cluster ssh kki cd /cluster/data/hg18/bed/multiz28way mkdir cons cd cons # Create tree model for phastCons # Adjust model file base composition background and rate matrix to be # representative of whole-genome (.41 -- as was done for ENCODE) # using utility, 'modFreqs' from Adam (5/07) # NOTE: updated all phast source and rebuilt to phast.2007-05-04 set gc = `grep BACKGROUND /cluster/data/hg18/bed/multiz17way/cons/elliotsEncode.mod | \ awk '{printf "%0.3f\n", $3 + $4;}'` echo $gc # .41 /cluster/bin/phast.2007-05-04/modFreqs ../4d/phyloFit.mod $gc > 28way.mod # split 28way mafs into 10M chunks and generate sufficient statistics # files for # phastCons mkdir run.split cd run.split set WINDOWS = /san/sanvol1/scratch/hg18/multiz28way/cons/ss rm -fr $WINDOWS mkdir -p $WINDOWS cat << 'EOF' > doSplit.csh #!/bin/csh -ef set MAFS = /cluster/data/hg18/bed/multiz28way/chromMaf set WINDOWS = /san/sanvol1/scratch/hg18/multiz28way/cons/ss cd $WINDOWS set c = $1 echo $c rm -fr $c mkdir $c # need to truncate odd-ball scaffold/chrom names that include dots # as phastCons utils can't handle them set TMP = /scratch/tmp/$c.clean.maf.$$ #perl -wpe 's/^s ([^.]+)\.\S+/s $1/' $MAFS/$c.maf > $TMP perl -wpe 's/^s ([^.]+\.[^. ]+)\.\S+/s $1/' $MAFS/$c.maf > $TMP /cluster/bin/phast/$MACHTYPE/msa_split $TMP -i MAF \ -M /cluster/bluearc/hg18/chrom/$c.fa \ -o SS -r $c/$c -w 10000000,0 -I 1000 -B 5000 rm -f $TMP echo "Done" >> $c.done 'EOF' # << happy emacs chmod +x doSplit.csh rm -f jobList foreach f (../../chromMaf/*.maf) set c = $f:t:r echo "doSplit.csh $c {check out line+ $WINDOWS/$c.done}" >> jobList end para create jobList # 49 jobs para try para check para push # completed shorter jobs in a few hours, but others failed on memory. # redo on kolossus -- 14 hours! # NOTE: next time try harder working with split mafs! # Run phastCons # This job is I/O intensive in its output files, thus it is all # working over in /scratch/tmp/ ssh pk cd /cluster/data/hg18/bed/multiz28way/cons mkdir run.cons cd run.cons cat > doPhast.csh << 'EOF' #!/bin/csh -fe set PHASTBIN = /cluster/bin/phast.2007-05-04 set c = $1 set f = $2 set len = $3 set cov = $4 set rho = $5 set grp = $cwd:t set tmp = /scratch/tmp/$f mkdir -p $tmp set san = /san/sanvol1/scratch/hg18/multiz28way/cons cp -p $grp/$grp.mod $grp/$grp.non-inf . cp -p $san/ss/$c/$f.ss ../../$grp/$grp.mod ../../$grp/$grp.non-inf $tmp pushd $tmp > /dev/null $PHASTBIN/phastCons $f.ss $grp.mod \ --rho $rho --expected-length $len --target-coverage $cov --quiet \ --not-informative `cat $grp.non-inf` \ --seqname $c --idpref $c --most-conserved $f.bed --score > $f.pp popd > /dev/null mkdir -p $san/$grp/pp/$c $san/$grp/bed/$c sleep 1 mv $tmp/$f.pp $san/$grp/pp/$c mv $tmp/$f.bed $san/$grp/bed/$c rm -fr $tmp 'EOF' # << happy emacs chmod a+x doPhast.csh # Create parasol batch and run it pushd /san/sanvol1/scratch/hg18/multiz28way/cons ls -1 ss/chr*/chr*.ss | sed 's/.ss$//' > \ /cluster/data/hg18/bed/multiz28way/cons/run.cons/in.list popd # run for all species cd .. 
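# Note on the doPhast.csh arguments: the cluster template below passes three
# numbers after the chrom and file names; in the script above these become
# $3 = --expected-length, $4 = --target-coverage and $5 = --rho, so a
# template of "45 .3 .31" runs phastCons with --expected-length 45
# --target-coverage .3 --rho .31.  The parameter experiments recorded
# further down vary these same three values.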
mkdir -p all run.cons/all cd all cp ../28way.mod all.mod # non-informative option for closest relatives (exclude regions with only these aligning), # and till Adam fixes the problem, also exclude all species removed from tree (below) echo "panTro2,rheMac2" > all.non-inf cd ../run.cons # Create template file # root1 == chrom name, file1 == ss file name without .ss suffix cat > template << 'EOF' #LOOP ../doPhast.csh $(root1) $(file1) 45 .3 .31 #ENDLOOP 'EOF' # << happy emacs cd all gensub2 ../in.list single ../template jobList para create jobList # 337 jobs para try para check para push # NOTE: These jobs regularly crash (too quick ?), and have to be repushed. # Also, a few hang, and need to be stopped and restarted. # The whole batch runs so fast, this isn't a problem # CPU time in finished jobs: 34253s 570.89m 9.51h 0.40d 0.001 y IO & Wait Time: 61148s 1019.13m 16.99h 0.71d 0.002 y Average job time: 283s 4.72m 0.08h 0.00d Longest running job: 0s 0.00m 0.00h 0.00d Longest finished job: 496s 8.27m 0.14h 0.01d Submission to last job: 995s 16.58m 0.28h 0.01d # create Most Conserved track ssh kolossus cd /san/sanvol1/scratch/hg18/multiz28way/cons/all cat bed/*/chr*.bed | ~/bin/${MACHTYPE}/bedSort stdin stdout | \ awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", $1, $2, $3, $5, $5;}' | \ /cluster/bin/scripts/lodToBedScore /dev/stdin > mostConserved.bed # ~ 1 minute cp -p mostConserved.bed /cluster/data/hg18/bed/multiz28way/cons/all # load into database ssh hgwdev cd /cluster/data/hg18/bed/multiz28way/cons/all hgLoadBed hg18 phastConsElements28way mostConserved.bed # Loaded 2183600 elements # compare with previous tracks hgsql hg18 -s -N -e "select count(*) from phastConsElements17way" # 2229902 hgsql hg18 -s -N -e "select count(*) from phastConsElements17way where chrom='chr7'" # 114703 # Try for 5% overall cov, and 70% CDS cov featureBits hg18 -enrichment refGene:cds phastConsElements28way >& fb.out & # Compare to chr7 for 17way -- chr7 is .7% lower than whole genome, # so aim for 4.3% on chr7 featureBits hg18 -chrom=chr7 -enrichment refGene:cds phastConsElements28way # USED FOR 17WAY # too little coverage # 14 .008 .28 # refGene:cds 0.911%, phastConsElements28way 3.551%, both 0.653%, cover 71.74%, enrich 20.20x # 14 .1 .28 # refGene:cds 0.911%, phastConsElements28way 3.954%, both 0.648%, cover 71.12%, enrich 17.98x # 12 .1 .28 # refGene:cds 0.911%, phastConsElements28way 3.914%, both 0.644%, cover 70.74%, enrich 18.08x # 14 .2 .3 # 234653 elements # refGene:cds 0.911%, phastConsElements28way 4.423%, both 0.659%, cover 72.34%, enrich 16.36x # 13 .2 .28 # refGene:cds 0.911%, phastConsElements28way 4.266%, both 0.644%, cover 70.73%, enrich 16.58x # USE THIS ONE # minimum change to params to achieve coverage # 14 .2 .28 # 249585 elements # refGene:cds 0.911%, phastConsElements28way 4.269%, both 0.646%, cover 70.92%, enrich 16.61x # 15 .2 .28 # refGene:cds 0.911%, phastConsElements28way 4.271%, both 0.647%, cover 71.08%, enrich 16.64x # 14 .3 .28 # refGene:cds 0.911%, phastConsElements28way 4.644%, both 0.645%, cover 70.89%, enrich 15.27x # 14 .35 .28 # refGene:cds 0.911%, phastConsElements28way 4.879%, both 0.646%, cover 70.90%, enrich 14.53x # 14 .15 .3 # 207188 elements # refGene:cds 0.912%, phastConsElements28way 4.260%, both 0.660%, cover 72.34%, enrich 16.98x # 16 .15 .3 # 193531 elements # refGene:cds 0.912%, phastConsElements28way 4.289%, both 0.663%, cover 72.66%, enrich 16.94x # 20 .15 .3 # 173668 elements # refGene:cds 0.912%, phastConsElements28way 4.321%, both 0.667%, cover 73.11%, enrich 
16.92x # 24 .15 .3 # 159646 elements # refGene:cds 0.912%, phastConsElements28way 4.338%, both 0.670%, cover 73.40%, enrich 16.92x # 30 .15 .3 # 144399 elements # refGene:cds 0.912%, phastConsElements28way 4.349%, both 0.673%, cover 73.72%, enrich 16.95x # 40 .15 .3 # 128087 elements # refGene:cds 0.912%, phastConsElements28way 4.353%, both 0.676%, cover 74.09%, enrich 17.02x # 50 .15 .3 # 117338 elements # refGene:cds 0.912%, phastConsElements28way 4.352%, both 0.678%, cover 74.32%, enrich 17.08x # 50 .1 .3 # 116930 elements # refGene:cds 0.912%, phastConsElements28way 4.347%, both 0.678%, cover 74.32%, enrich 17.10x # 50 .05 .3 # 93391 elements # refGene:cds 0.912%, phastConsElements28way 4.193%, both 0.680%, cover 74.57%, enrich 17.78x # 50 .07 .3 # 99358 # refGene:cds 0.912%, phastConsElements28way 4.231%, both 0.680%, cover 74.51%, enrich 17.61x # 45 .07 .3 # 102864 elements # refGene:cds 0.912%, phastConsElements28way 4.227%, both 0.679%, cover 74.41%, enrich 17.60x # USE THIS ONE # matches element count for 17way # 45 .1 .3 # 110836 elements # refGene:cds 0.912%, phastConsElements28way 4.277%, both 0.678%, cover 74.33%, enrich 17.38x # 75 .1 .3 # Try for really long elements # 93524 elements # refGene:cds 0.912%, phastConsElements28way 4.279%, both 0.682%, cover 74.73%, enrich 17.47x # 100 .1 .3 # 85757 elements # refGene:cds 0.912%, phastConsElements28way 4.270%, both 0.683%, cover 74.90%, enrich 17.54 # 71218 elements # 200 .1 .3 # refGene:cds 0.912%, phastConsElements28way 4.225%, both 0.686%, cover 75.16%, enrich 17.79x # 200 .12 .3 # refGene:cds 0.912%, phastConsElements28way 4.241%, both 0.686%, cover 75.13%, enrich 17.72x # USE THIS ONE # for really long elements # 200 .15 .3 # 75659 # refGene:cds 0.912%, phastConsElements28way 4.261%, both 0.685%, cover 75.11%, enrich 17.63x featureBits hg18 -chrom=chr7 -enrichment refGene:cds phastConsElements17way # refGene:cds 0.911%, phastConsElements17way 4.838%, both 0.639%, cover 70.22%, enrich 14.51x featureBits hg18 -enrichment refGene:cds phastConsElements17way # refGene:cds 1.072%, phastConsElements17way 5.510%, both 0.759%, cover 70.83%, enrich 12.86x # compare element sizes to other runs: # e.g. select min(chromEnd-chromStart) from encodeTbaPhastConsEl # hg17 ENCODE TBA phastCons: min=1, max=1961 # hg17 ENCODE TBA gerp: min=3, max=1426 # hg18 17way: min=1, max=12590 #el on chr7: 114703 # 45 .3 .31 # featureBits hg18 -enrichment refGene:cds phastConsElements28way refGene:cds 1.095%, phastConsElements28way 4.920%, both 0.827%, cover 75.48%, enrich 15.34x # 2906254 elements # Create merged posterier probability file and wiggle track data files # pk is currently closer to the san than any other machine ssh pk cd /san/sanvol1/scratch/hg18/multiz28way/cons/all # sort by chromName, chromStart so that items are in numerical order # for wigEncode cat > listPp.csh << 'EOF' foreach d (pp/chr*/) ls $d/*.pp | sort -n -t\. -k2 end 'EOF' csh listPp.csh | xargs cat | \ nice wigEncode stdin phastCons28way.wig phastCons28way.wib # about 23 minutes for above cp -p phastCons28way.wi? /cluster/data/hg18/bed/multiz28way/cons/all # Load gbdb and database with wiggle. 
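# (wigEncode above produced a pair of files: the .wib holds the binary
#  per-base values and is referenced from /gbdb via the symlink below, while
#  the .wig is the index that hgLoadWiggle loads into the database table.)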
ssh hgwdev cd /cluster/data/hg18/bed/multiz28way/cons/all ln -s /cluster/data/hg18/bed/multiz28way/cons/all/phastCons28way.wib \ /gbdb/hg18/multiz28way hgLoadWiggle -pathPrefix=/gbdb/hg18/multiz28way hg18 \ phastCons28way phastCons28way.wig # ~ 3 minute load ## Run phastCons on subgroup (placentals) ssh pk cd /cluster/data/hg18/bed/multiz28way/cons # create pruned tree set species = `cat ../species.lst` echo $species | sed 's/ /,/g' #anoCar1,bosTau3,canFam2,cavPor2,danRer4,dasNov1,echTel1,equCab1,eriEur1,felCat3,fr2,galGal3,gasAcu1,hg18,loxAfr1,mm8,monDom4,ornAna1,oryCun1,oryLat1,otoGar1,panTro2,rheMac2,rn4,sorAra1,tetNig1,tupBel1,xenTro2 # setup placental-only run mkdir placental run.cons/placental cd placental # placental-only: exclude from phastCons: 10 non-placentals # (platypus, opossum, 5 fish, chicken, lizard, frog) /cluster/bin/phast.new/tree_doctor ../28way.mod \ --prune-all-but=bosTau3,canFam2,cavPor2,dasNov1,echTel1,equCab1,eriEur1,felCat3,hg18,loxAfr1,mm8,oryCun1,otoGar1,panTro2,rheMac2,rn4,sorAra1,tupBel1 \ > placental.mod echo "panTro2,rheMac2,anoCar1,danRer4,fr2,galGal3,gasAcu1,monDom4,ornAna1,oryLat1,tetNig1,xenTro2" \ > placental.non-inf cd ../run.cons/placental gensub2 ../in.list single ../template jobList para create jobList para try para check para push # ~30 minutes on pk # NOTE: sometimes jobs crash or hang due to access problems on SAN # para stop then push to recover cd ../../ mkdir hqAll run.cons/hqAll cd hqAll # high-qual only: exclude 10 low-qual mammals /cluster/bin/phast.new/tree_doctor 28way.mod \ --prune-all-but=anoCar1,bosTau3,canFam2,danRer4,equCab1,fr2,galGal3,gasAcu1,hg18,mm8,monDom4,ornAna1,oryLat1,panTro2,rheMac2,rn4,tetNig1,xenTro2 \ > hqAll.mod echo "panTro2,rheMac2,cavPor2,dasNov1,echTel1,loxAfr1,eriEur1,felCat3,oryCun1,otoGar1,sorAra1,tupBel1" \ > hqAll.non-inf cd ../run.cons/hqAll gensub2 ../in.list single ../template jobList para create jobList para try para check para push cd ../../ mkdir hqPlacental run.cons/hqPlacental cd hqPlacental # high-qual placental only: exclude 10 non-placentals and 10 low-qual mammals, /cluster/bin/phast.new/tree_doctor ../28way.mod \ --prune-all-but=bosTau3,canFam2,equCab1,hg18,mm8,panTro2,rheMac2,rn4 \ > hqPlacental.mod echo "panTro2,rheMac2,cavPor2,dasNov1,echTel1,loxAfr1,eriEur1,felCat3,oryCun1,otoGar1,sorAra1,tupBel1,anoCar1,danRer4,fr2,galGal3,gasAcu1,monDom4,ornAna1,oryLat1,tetNig1,xenTro2" \ > hqPlacental.non-inf cd ../run.cons/hqPlacental gensub2 ../in.list single ../template jobList para create jobList para try para check para push # add placental elements to Most Conserved track ssh kolossus cd /san/sanvol1/scratch/hg18/multiz28way/cons/placental cat bed/*/chr*.bed | ~/bin/${MACHTYPE}/bedSort stdin stdout | \ awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", $1, $2, $3, $5, $5;}' | \ /cluster/bin/scripts/lodToBedScore /dev/stdin > mostConserved.bed # ~ 1 minute cp -p mostConserved.bed /cluster/data/hg18/bed/multiz28way/cons/placental # load into database ssh hgwdev cd /cluster/data/hg18/bed/multiz28way/cons/placental hgLoadBed hg18 phastConsElements28wayPlacMammal mostConserved.bed featureBits hg18 -enrichment refGene:cds phastConsElements28wayPlacMammal >&! 
../run.cons/placental/fb.out # experiments # USING THIS ONE: min change from 17way to achieve coverage # 14.2.28 # 169516 elements # 169518 # refGene:cds 0.912%, phastConsElements28wayPlacMammalChr7Short 3.437%, both 0.615%, cover 67.40%, enrich 19.61x # refGene:cds 0.912%, phastConsElements28wayPlacMammalChr7Short 3.437%, both 0.615%, cover 67.40%, enrich 19.61x # USING THIS ONE: vertebrate elements have similar count to 17way ("medium") # 45.1.3 # 76715 elements # 76718 elements # refGene:cds 0.912%, phastConsElements28wayPlacMammalChr7Med 3.312%, both 0.642%, cover 70.33%, enrich 21.24x #refGene:cds 0.912%, phastConsElements28wayPlacMammalChr7Med 3.312%, both 0.642%, cover 70.33%, enrich 21.24x # Create merged posterier probability file and wiggle track data files # pk is currently closer to the san than any other machine ssh pk cd /san/sanvol1/scratch/hg18/multiz28way/cons/placental # sort by chromName, chromStart so that items are in numerical order # for wigEncode cat > listPp.csh << 'EOF' foreach d (pp/chr*/) ls $d/*.pp | sort -n -t\. -k2 end 'EOF' csh ../listPp.csh | xargs cat | \ nice wigEncode stdin \ phastCons28wayPlacMammal.wig phastCons28wayPlacMammal.wib # about 23 minutes for above cp -p phastCons28wayPlacMammal.wi? /cluster/data/hg18/bed/multiz28way/cons/placental # Load gbdb and database with wiggle. ssh hgwdev cd /cluster/data/hg18/bed/multiz28way/cons/placental ln -s \ /cluster/data/hg18/bed/multiz28way/cons/placental/phastCons28wayPlacMammal.wib \ /gbdb/hg18/multiz28way hgLoadWiggle -pathPrefix=/gbdb/hg18/multiz28way hg18 \ phastCons28wayPlacMammal phastCons28wayPlacMammal.wig # WARNING: Exceeded chr4_random size 842649 > 842648. dropping 2 data point(s) # NOTE: weird msa_split on this chrom -- sent inquiry to Adam about this # ~ 3 minute load ######################################################################## # phyloP conservation # split SS files into 100K chunks (5 min./job) ssh kki cd /cluster/data/hg18/bed/multiz28way/cons/ mkdir run.phyloP.split cd run.phyloP.split cat << 'EOF' > doSplit.csh #!/bin/csh -ef set c = $1 set san = /san/sanvol1/scratch/hg18/multiz28way set in = $san/cons/ss set out = $san/phyloP/ss set PHASTBIN = /cluster/bin/phast.2007-05-04 @ i=0 foreach f (`ls $in/$c/*.ss | sort -n -t\. 
-k2`) @ i++ mkdir -p $out/$c/$i $PHASTBIN/msa_split $f -i SS -o SS \ -r $out/$c/$i/$c.$i -w 100000,0 -I 1000 -B 5000 end echo "Done" >> $out/$c.done 'EOF' # << happy emacs chmod +x doSplit.csh set san = /san/sanvol1/scratch/hg18/multiz28way set JOBS = /cluster/data/hg18/bed/multiz28way/cons/run.phyloP.split/jobList rm -f $JOBS foreach c (`awk '{print $1}' /cluster/data/hg18/chrom.sizes`) echo "doSplit.csh $c {check out line+ $san/phyloP/ss/$c.done}" >> $JOBS end para create jobList # 49 jobs para try para check para push para time # Completed: 49 of 49 jobs # CPU time in finished jobs: 8827s 147.12m 2.45h 0.10d 0.000 y # IO & Wait Time: 6837s 113.95m 1.90h 0.08d 0.000 y # Average job time: 320s 5.33m 0.09h 0.00d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 1343s 22.38m 0.37h 0.02d # Submission to last job: 1528s 25.47m 0.42h 0.02d ######################################################################## # phyloP scoring method experiments on chr7 (2008-11-11 kate) ssh pk cd /cluster/data/hg18/bed/multiz28way/cons mkdir -p run.phyloPMethod cd run.phyloPMethod cat > doPhyloP.csh << 'EOF' set method = $1 set f = $2 set out = $3 set c = $f:r:r set n = $f:r:e set tmp = /scratch/tmp/$f mkdir -p $tmp cp -p /san/sanvol1/scratch/hg18/multiz28way/phyloP/ss/$c/$n/$f.ss ../tree.mod $tmp pushd $tmp > /dev/null # Built phast from CornellCVS on 11/11/08 in /cluster/bin/phast.build. # Symlinked the bin to /cluster/bin/phast.2008 set PHASTBIN = /cluster/bin/phast.2008-11-13 # PHAST version is 0.9.9.8b $PHASTBIN/phyloP --method $method --mode CONACC --wig-scores --chrom $c \ -i SS tree.mod $f.ss > $f.wig popd > /dev/null mkdir -p $out:h mv $tmp/$f.wig $out rm -fr $tmp 'EOF' # Create list of chunks (just chr7 for now) pushd /san/sanvol1/scratch/hg18/multiz28way/phyloP/ss ls chr7/*/chr7.*.ss | sed -e 's/.ss$//' -e 's/^\.\///' > \ /cluster/data/hg18/bed/multiz28way/cons/run.phyloPMethod/in.list # setup run mkdir -p all cd all cp ../../28way.mod tree.mod mkdir -p SCORE cd SCORE # Create template file # file1 == $chr/$chunk/file name without .ss suffix cat > template << 'EOF' #LOOP csh ../../doPhyloP.csh SCORE $(file1) {check out line+ /san/sanvol1/scratch/hg18/multiz28way/phyloPMethod/all/SCORE/$(path1).wig} #ENDLOOP 'EOF' # << happy emacs gensub2 ../../in.list single template jobList para create jobList para try para check para push # Completed: 1552 of 1552 jobs # CPU time in finished jobs: 15411s 256.84m 4.28h 0.18d 0.000 y # IO & Wait Time: 7678s 127.97m 2.13h 0.09d 0.000 y # Average job time: 15s 0.25m 0.00h 0.00d # Longest finished job: 29s 0.48m 0.01h 0.00d # Submission to last job: 236s 3.93m 0.07h 0.00d # Estimated complete: 0s 0.00m 0.00h 0.00d cd .. mkdir -p LRT cd LRT # Create template file # file1 == $chr/$chunk/file name without .ss suffix cat > template << 'EOF' #LOOP csh ../../doPhyloP.csh LRT $(file1) {check out line+ /san/sanvol1/scratch/hg18/multiz28way/phyloPMethod/all/LRT/$(path1).wig} #ENDLOOP 'EOF' # << happy emacs gensub2 ../../in.list single template jobList para create jobList para try para check para push # sort by chromName, chromStart so that items are in numerical order # for wigEncode cd /san/sanvol1/scratch/hg18/multiz28way/phyloP/all cat > listPp.csh << 'EOF' foreach c (`ls -d chr*`) foreach d (`ls -d $c/[1-9]* | sort -t/ -k2 -n`) ls -1 $d/*.wig | sort -n -t\. -k3 end end 'EOF' csh listPp.csh | xargs cat | \ nice wigEncode stdin phyloP28way.wig phyloP28way.wib mkdir /cluster/data/hg18/bed/multiz28way/cons/phyloP/all cp -p phyloP28way.wi? 
/cluster/data/hg18/bed/multiz28way/cons/phyloP/all # setup placental run mkdir -p placental cd all cp ../../placental.mod tree.mod # Create template file # file1 == $chr/$chunk/file name without .ss suffix cat > template << 'EOF' #LOOP csh ../doPhyloP.csh $(file1) {check out line+ /san/sanvol1/scratch/hg18/multiz28way/phyloP/placental/$(path1).wig} #ENDLOOP 'EOF' # << happy emacs gensub2 ../in.list single template jobList para create jobList para try para check para push #CPU time in finished jobs: 1934553s 32242.55m 537.38h 22.39d 0.061 y #IO & Wait Time: 82007s 1366.78m 22.78h 0.95d 0.003 y #Average job time: 70s 1.16m 0.02h 0.00d #Longest running job: 0s 0.00m 0.00h 0.00d #Longest finished job: 147s 2.45m 0.04h 0.00d #Submission to last job: 37642s 627.37m 10.46h 0.44d # sort by chromName, chromStart so that items are in numerical order # for wigEncode ssh pk cd /san/sanvol1/scratch/hg18/multiz28way/phyloP/placental # check for clean dir here -- chr* will match garbage if it's there cat > listPp.csh << 'EOF' foreach c (`ls -d chr*`) foreach d (`ls -d $c/[1-9]* | sort -t/ -k2 -n`) ls -1 $d/*.wig | sort -n -t\. -k3 end end 'EOF' csh listPp.csh | xargs cat | \ nice wigEncode stdin phyloP28wayPlacMammal.wig phyloP28wayPlacMammal.wib mkdir /cluster/data/hg18/bed/multiz28way/cons/phyloP/placental cp -p phyloP28wayPlacMammal.wi? /cluster/data/hg18/bed/multiz28way/cons/phyloP/placental # Load gbdb and database with wiggle. ssh hgwdev cd /cluster/data/hg18/bed/multiz28way/cons/phyloP/all ln -s \ /cluster/data/hg18/bed/multiz28way/cons/phyloP/all/phyloP28way.wib \ /gbdb/hg18/multiz28way/phyloP28way.wib hgLoadWiggle -pathPrefix=/gbdb/hg18/multiz28way hg18 \ phyloP28way phyloP28way.wig # WARNING: Exceeded chr4_random size 842649 > 842648. dropping 2 data point(s) cd ../placental ln -s \ /cluster/data/hg18/bed/multiz28way/cons/phyloP/all/phyloP28wayPlacMammal.wib \ /gbdb/hg18/multiz28way/phyloP28wayPlacMammal.wib hgLoadWiggle -pathPrefix=/gbdb/hg18/multiz28way hg18 \ phyloP28wayPlacMammal phyloP28wayPlacMammal.wig hgWiggle phyloP28wayChr7 | textHistogram -col=2 -real -skip=7 -binSize=.2 0.000000 **************** 26649187 0.200000 ************************************************************ 101774235 0.400000 ********** 16325655 0.600000 *** 4331032 0.800000 * 1029490 1.000000 0 1.200000 456666 1.400000 0 1.600000 240876 1.800000 0 2.000000 246969 2.200000 0 2.400000 0 2.600000 0 2.800000 134764 cd ../placental hgWiggle phyloP28wayPlacMammalChr7 | textHistogram -col=2 -real -skip=7 -binSize=.2 stdin cd ../../all hgWiggle phastCons28wayChr7Short | textHistogram -col=2 -real -skip=7 -binSize=.1 stdin 0.000000 ************************************************************ 128445730 0.100000 **** 7648620 0.200000 ** 3473415 0.300000 * 1986801 0.400000 * 1399849 0.500000 * 1096292 0.600000 912539 0.700000 893991 0.800000 1008630 0.900000 * 2940535 1.000000 * 1383115 ############################################################################ # PhyloP experiments with new scoring methods: LRT and SCORE, implemented in 2008 # Using new PHAST package (rebuilt from cornellCVS) # chr7-only # 2008-11-11 kate ############################################################################ # DOWNLOADS FOR 28WAY (2007-05-30 kate) ssh kkstore02 cd /cluster/data/hg18/bed/multiz28way cat > downloads.csh << 'EOF' date set dir = /cluster/data/hg18/bed/multiz28way mkdir -p mafDownloads cd $dir/mafDownloads foreach f (../maf/chr*.maf) set c = $f:t:r echo $c nice gzip -c $f > $c.maf.gz end md5sum *.gz > md5sum.txt cd $dir mkdir 
-p phastConsDownloads/vertebrate phastConsDownloads/placental
cd /san/sanvol1/scratch/hg18/multiz28way/cons
foreach chr (`awk '{print $1}' /cluster/data/hg18/chrom.sizes`)
    echo $chr
    cat `ls -1 all/pp/$chr/$chr.*.pp | sort -t\. -k2,2n` \
        | nice gzip -c \
        > $dir/phastConsDownloads/vertebrate/$chr.pp.gz
    cat `ls -1 placental/pp/$chr/$chr.*.pp | sort -t\. -k2,2n` \
        | nice gzip -c \
        > $dir/phastConsDownloads/placental/$chr.pp.gz
end
cd /cluster/data/hg18/bed/multiz28way/phastConsDownloads/vertebrate
md5sum *.gz > md5sum.txt
cd ../placental
md5sum *.gz > md5sum.txt
date
'EOF'
csh downloads.csh >&! downloads.log &
# << happy emacs

ssh hgwdev
set dir = /usr/local/apache/htdocs/goldenPath/hg18/phastCons28way
mkdir -p $dir/vertebrate $dir/placental
ln -s /cluster/data/hg18/bed/multiz28way/phastConsDownloads/vertebrate/{*.gz,md5sum.txt} $dir/vertebrate
ln -s /cluster/data/hg18/bed/multiz28way/phastConsDownloads/placental/{*.gz,md5sum.txt} $dir/placental
cp /usr/local/apache/htdocs/goldenPath/hg18/phastCons17way/README.txt $dir
# edit this file to reflect the latest releases used.
vi $dir/README.txt

set dir = /usr/local/apache/htdocs/goldenPath/hg18/multiz28way/maf
mkdir $dir
ln -s /cluster/data/hg18/bed/multiz28way/mafDownloads/{*.gz,md5sum.txt} $dir

# upstream mafs (mafFrags takes a while)
ssh hgwdev
cd /cluster/data/hg18/bed/multiz28way/mafDownloads
cat > mafFrags.csh << 'EOF'
date
foreach i (1000 2000 5000)
    echo "making upstream$i.maf"
    nice featureBits hg18 refGene:upstream:$i -fa=/dev/null -bed=up.bad
    cat up.bad|sed -e "s/_up_${i}_/\t/" >up.bad2
    awk -F '\t' '{printf("%s\t%s\t%s\t%s\t%s\t%s\n", $1, $2, $3, $4, 0, $6)}' up.bad2 > up.bed
    rm up.bad up.bad2
    nice mafFrags hg18 multiz28way up.bed upstream$i.maf \
        -orgs=/cluster/data/hg18/bed/multiz28way/species.lst
    rm up.bed
end
date
'EOF'
# << happy emacs
ssh kkstore02
cd /cluster/data/hg18/bed/multiz28way/mafDownloads
csh mafFrags.csh > mafFrags.log &
nice gzip up*.maf
md5sum up*.gz >> md5sum.txt

ssh hgwdev
cd /cluster/data/hg18/bed/multiz28way
# link filtered nets and chains to downloads area (doRecipBest.pl could
# be changed for this)
# Species where syntenic net was used
foreach db (panTro2 rheMac2 equCab1 canFam2 bosTau3 mm8 rn4 monDom4)
    echo $db
    set cd = /cluster/data/hg18/bed/blastz.$db/axtChain
    cd $cd
    set Db = `echo $db | perl -wpe 's/(.*)/\u$1/'`
    set f = hg18.$db.syn.net.gz
    if (! -e $f) then
        netFilter -syn hg18.$db.net.gz > $f
    endif
    set d = /usr/local/apache/htdocs/goldenPath/hg18/vs$Db
    ln -s $cd/$f $d
    nice md5sum $f >> $d/md5sum.txt
end

# Create downloads dir for new species without genome databases
#foreach db (tupBel1 cavPor2 eriEur1 sorAra1)
# NOTE: Keeping these only on genome-test for now.
foreach db (tupBel1 cavPor2 eriEur1 sorAra1)
    echo $db
    set Db = `echo $db | perl -wpe 's/(.*)/\u$1/'`
    set d = /usr/local/apache/htdocs/goldenPath/hg18
    mkdir -p $d/vs$Db
    cp $d/vsOryCun1/README.txt $d/vs$Db
    set bd = /cluster/data/hg18/bed/blastz.$db
    cd $bd/axtChain
    set f = hg18.$db.net.gz
    if (! -e $f) then
        cat net/*.net | gzip -c > $f
    endif
    nice md5sum hg18.$db.{all.chain,net}.gz > md5sum.txt
    cd ..
    nice md5sum axtNet/*.gz >> axtChain/md5sum.txt
    ln -s $bd/axtChain/hg18.$db.{all.chain,net}.gz $d/vs$Db
    ln -s $bd/axtChain/md5sum.txt $d/vs$Db
    ln -s $bd/axtNet $d/vs$Db
end
# EDIT README's for the species

# Post reciprocal best nets
# NOTE: Keeping these only on genome-test for now.
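# (downloads4.csh below symlinks the reciprocal-best chains and nets produced
#  by the doRecipBest.pl runs above into each species' vs<Db> downloads
#  directory and appends their md5 checksums.)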
cat > downloads4.csh << 'EOF' foreach db (felCat3 otoGar1 loxAfr1 oryCun1 echTel1 dasNov1 \ tupBel1 cavPor2 eriEur1 sorAra1) echo $db set Db = `echo $db | perl -wpe 's/(.*)/\u$1/'` set d = /usr/local/apache/htdocs/goldenPath/hg18/vs$Db set cd = /cluster/data/hg18/bed/blastz.$db/axtChain ln -s $cd/hg18.$db.rbest.{chain,net}.gz $d cd $d md5sum hg18.$db.rbest.{chain,net}.gz >> md5sum.txt end 'EOF' # EDIT README's to include reciprocal best chains & nets ############################################################################ # 28-way PhyloP downloads # 2008-10-21 kate ssh kolossus cd /san/sanvol1/scratch/hg18/multiz28way/phyloP cat > merge.csh << 'EOF' set out = $1 rm -f *.lst foreach c (`ls -d chr*`) echo $c touch $c.lst foreach d (`ls -d $c/[1-9]* | sort -t/ -k2 -n`) ls -1 $d/*.wig | sort -n -t\. -k3 >> $c.lst xargs < $c.lst cat > $out/$c.wigFix end end 'EOF' # all species cd all csh ../merge.csh /cluster/data/hg18/bed/multiz28way/cons/phyloP/all > merge.log cd ../placental csh ../merge.csh /cluster/data/hg18/bed/multiz28way/cons/phyloP/placental > merge.log cd /cluster/data/hg18/bed/multiz28way/cons/phyloP # post to downloads cd /usr/local/apache/htdocs/goldenPath/hg18 mkdir phyloP28way cd phyloP28way ln -s /cluster/data/hg18/bed/multiz28way/cons/phyloP/{all,placental} . cd all nice gzip $out/$c.wigFix cd ../placental nice gzip $out/$c.wigFix ############################################################################ # Reload CCDS (ccdsInfo, ccdsGene, ccdsKgMap) (2007-03-02 markd) # see hg17.txt for build temporary ccds database for CCDS.20070228 # create and load ccdsGene and ccdsInfo tables from imported database /cluster/data/genbank/bin/x86_64/ccdsMkTables -loadDb ccds hg18 ccdsInfo ccdsGene /cluster/data/genbank/bin/x86_64/mkCcdsGeneMap -db=hg18 -loadDb ccdsGene knownGene ccdsKgMap checkTableCoords hg18 -verbose=2 ccdsGene # update all.jointer to include hg18 in ccdsDb joinerCheck -database=hg18 -identifier=ccdsGeneId ~/compbio/genbank/kent/src/hg/makeDb/schema/all.joiner # build initial version of ccdsMgcMap table, updated by nightly genbank update /cluster/data/genbank/bin/x86_64/mkCcdsGeneMap -loadDb -db=hg18 ccdsGene mgcGenes ccdsMgcMap # load trackDb cd kent/src/hg/makeDb/trackDb make alpha # check in browser # request push of ccdsGene ccdsInfo ccdsKgMap ccdsMgcMap # << emacs ######################################################################### # RECIPROCAL BEST CHIMP PANTRO2 (2007-03-02 kate) # Requested by Daryl cd /cluster/data/hg18/bed/blastz.panTro2 doRecipBest.pl hg18 panTro2 >&! rbest.log & ######################################################################### # EPONINE-TSS (TRANSCRIPTON START SITE) PREDICTION # (DONE, 2007-03-08, hartera) # The Eponine software is version 2 and has not changed in several years # (contact: Thomas Down at Sanger, td2 at sanger.ac.uk). The version downloaded # for hg16 should be the same as the current version but download again just # to check. The application includes the TSS model file: eponine-tss2.xml ssh kkstore02 # Eponine runs fine on 2.5Mb contig, but barfs on much larger contig; # chop up sequence at gaps into ~2.5Mb chunks for cluster run. 
mkdir /san/sanvol1/scratch/hg18/chunks cd /cluster/data/hg18 foreach f (?{,?}/NT_*/NT_??????.fa) set ctg = $f:t:r /cluster/bin/x86_64/faSplit -minGapSize=10 \ -lift=/san/sanvol1/scratch/hg18/chunks/${ctg}.lft \ gap $f 2500000 /san/sanvol1/scratch/hg18/chunks/${ctg}.chunk end # seems to ignore the chunk part of the file name mkdir /cluster/data/hg18/bed/eponine cd /cluster/data/hg18/bed/eponine wget --timestamping \ http://www.sanger.ac.uk/Software/analysis/eponine/eponine-scan.jar # file has the same date and same size as the one downloaded for hg16 # the script requires all of the path setting found in my .tcshrc file. # Using only set path = (/usr/java/jre1.5.0_06/bin $path) # as in the doEpo file for hg16 does not work. cat << '_EOF_' > doEpo #!/bin/csh -ef set path = (/usr/java/jre1.5.0_06/bin /bin /usr/bin /usr/X11R6/bin \ /usr/local/bin . /cluster/home/hartera/bin/x86_64 \ /cluster/bin/x86_64 /projects/compbio/bin/x86_64 \ /projects/compbio/bin /projects/compbio/bin/x86_64-linux \ /cluster/bin/scripts) java -jar ./eponine-scan.jar -threshold 0.999 -seq $1 > $2 '_EOF_' chmod a+x doEpo cp /dev/null jobList foreach f (/san/sanvol1/scratch/hg18/chunks/NT*.fa) echo "./doEpo {check in line+ $f} {check out exists out/$f:t:r.gff}" \ >> jobList end mkdir out ssh pk cd /cluster/data/hg18/bed/eponine /parasol/bin/para create jobList /parasol/bin/para try, check, push, check etc..... /parasol/bin/para time # Completed: 1408 of 1408 jobs # CPU time in finished jobs: 105248s 1754.13m 29.24h 1.22d 0.003 y # IO & Wait Time: 4369s 72.82m 1.21h 0.05d 0.000 y # Average job time: 78s 1.30m 0.02h 0.00d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 104s 1.73m 0.03h 0.00d # Submission to last job: 1295s 21.58m 0.36h 0.01d # lift chunks -> contigs mkdir contigs/ foreach l (/san/sanvol1/scratch/hg18/chunks/*.lft) set ctg = $l:t:r liftUp contigs/$ctg.gff $l warn out/${ctg}*.gff end # lift contigs -> chrom liftUp eponine.gff /cluster/data/hg18/jkStuff/liftAll.lft \ warn contigs/NT_*.gff # Translate to bed 4 + float-score -- it would be a shame to lose # those scores in genePred or bed 5 (int score) awk 'BEGIN {i=0;} \ {printf "%s\t%d\t%d\t%s.%d\t%s\t%s\n", $1, $4-1, $5, $1, i, $6, $7; \ i = i + 1;}' \ eponine.gff > eponine.bed # load up ssh hgwdev cd /cluster/data/hg18/bed/eponine sed -e 's/bed6FloatScore/eponine/g' \ $HOME/kent/src/hg/lib/bed6FloatScore.sql > eponine.sql hgLoadBed hg18 eponine eponine.bed -tab -sqlTable=eponine.sql # Loaded 61359 elements of size 6 # trackDb.ra entry and eponine.html already exist in trackDb directory. ########################################################################### # ACEScan Track (DONE 2007-03-15 Andy ssh hgwdev cd /cluster/data/hg18/bed mkdir acescan cd acescan/ cp /cluster/data/hg17/bed/acescan/acescan.hg17.gp . 
liftOver -genePred acescan.hg17.gp /gbdb/hg17/liftOver/hg17ToHg18.over.chain.gz \ acescan.hg18.gp unmapped ldHgGene -predTab hg18 acescan acescan.hg18.gp ############################################################################## # Update central DB gdbPdb table in preparation for KG III (DONE 3/22/07, Fan) mysql -u hgcat -p$HGPSWD -h genome-testdb -A hgcentraltest update gdbPdb set proteomeDb = "proteins070202" where genomeDb = "hg18"; quit ############################################################################## # UPDATE CGAP TABLES (DONE, 3/26/07, Fan) cd /cluster/data/hg18/bed/ucsc.10 mkdir cgap cd cgap wget --timestamping -O Hs_GeneData.dat "ftp://ftp1.nci.nih.gov/pub/CGAP/Hs_GeneData.dat" hgCGAP Hs_GeneData.dat cat cgapSEQUENCE.tab cgapSYMBOL.tab cgapALIAS.tab|sort -u > cgapAlias.tab hgLoadSqlTab hg18 cgapAlias ~/kent/src/hg/lib/cgapAlias.sql ./cgapAlias.tab hgLoadSqlTab hg18 cgapBiocPathway ~/kent/src/hg/lib/cgapBiocPathway.sql ./cgapBIOCARTA.tab cat cgapBIOCARTAdesc.tab|sort -u > cgapBIOCARTAdescSorted.tab hgLoadSqlTab hg18 cgapBiocDesc ~/kent/src/hg/lib/cgapBiocDesc.sql cgapBIOCARTAdescSorted.tab ############################################################################## # UPDATE CGAP TABLES (DONE, 8/05/08, JK) cd /cluster/data/hg18/bed/ucsc.11 mkdir cgap cd cgap wget --timestamping -O Hs_GeneData.dat "ftp://ftp1.nci.nih.gov/pub/CGAP/Hs_GeneData.dat" hgCGAP Hs_GeneData.dat cat cgapSEQUENCE.tab cgapSYMBOL.tab cgapALIAS.tab|sort -u > cgapAlias.tab hgLoadSqlTab hg18 cgapAlias ~/kent/src/hg/lib/cgapAlias.sql ./cgapAlias.tab hgLoadSqlTab hg18 cgapBiocPathway ~/kent/src/hg/lib/cgapBiocPathway.sql ./cgapBIOCARTA.tab cat cgapBIOCARTAdesc.tab|sort -u > cgapBIOCARTAdescSorted.tab hgLoadSqlTab hg18 cgapBiocDesc ~/kent/src/hg/lib/cgapBiocDesc.sql cgapBIOCARTAdescSorted.tab ############################################################################## ## BLASTZ HUMAN HG18 (DONE - 2007-03-26 - Hiram) ssh kkstore02 mkdir /cluster/data/hg18/bed/blastz.braFlo1.2007-03-26 cd /cluster/data/hg18/bed/blastz.braFlo1.2007-03-26 cat << '_EOF_' > DEF # human vs lancelet BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=6000 BLASTZ_K=2200 BLASTZ_Q=/cluster/data/blastz/HoxD55.q # TARGET: Human Hg18 SEQ1_DIR=/san/sanvol1/scratch/hg18/hg18.sdTrf.2bit SEQ1_LEN=/cluster/data/hg18/chrom.sizes SEQ1_CHUNK=10000000 SEQ1_LAP=10000 SEQ1_LIMIT=1 # QUERY: Lancelet braFlo1 - largest chunk big enough for largest scaffold # Largest scaffold 7,200,735 - 3032 scaffolds + chrM SEQ2_DIR=/san/sanvol1/scratch/braFlo1/braFlo1.2bit SEQ2_LEN=/san/sanvol1/scratch/braFlo1/chrom.sizes SEQ2_CTGDIR=/san/sanvol1/scratch/braFlo1/braFlo1UnScaffolds.2bit SEQ2_CTGLEN=/san/sanvol1/scratch/braFlo1/braFlo1UnScaffolds.sizes SEQ2_LIFT=/san/sanvol1/scratch/braFlo1/braFlo1.lift SEQ2_CHUNK=20000000 SEQ2_LIMIT=30 SEQ2_LAP=0 BASE=/cluster/data/hg18/bed/blastz.braFlo1.2007-03-26 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs time doBlastzChainNet.pl DEF -chainMinScore=2000 -chainLinearGap=loose \ -tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \ -bigClusterHub=pk -verbose=2 \ -blastzOutRoot /cluster/bluearc/hg18BraFlo1 > do.log 2>&1 & # real 458m43.961s cat fb.hg18.chainBraFlo1Link.txt # 26455595 bases of 2881515245 (0.918%) in intersection # test reciprocal best chains/nets for 5-way maf alignments # on braFlo1, this did not work right there ssh hgwdev cd /cluster/data/hg18/bed/blastz.braFlo1.2007-03-26 time nice -n +19 /cluster/bin/scripts/doRecipBest.pl hg18 braFlo1 \ > rbest.log 2>&1 & # real 105m14.176s # and now the swap, also documented in 
braFlo1.txt
mkdir /cluster/data/braFlo1/bed/blastz.hg18.swap
cd /cluster/data/braFlo1/bed/blastz.hg18.swap
time doBlastzChainNet.pl -chainMinScore=2000 -chainLinearGap=loose \
    /cluster/data/hg18/bed/blastz.braFlo1.2007-03-26/DEF \
    -tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
    -bigClusterHub=pk -verbose=2 \
    -swap > swap.log 2>&1 &
# real 83m46.258s
cat fb.braFlo1.chainHg18Link.txt
# 30912893 bases of 923355587 (3.348%) in intersection
##############################################################################
# RE-BUILD knownGeneList (DONE, 3/29/07, Fan)
cd /cluster/data/hg18/bed
rm -rf knownGeneList/hg18
# Run hgKnownGeneList to generate the tree of HTML pages
# under ./knownGeneList/hg18
hgKnownGeneList hg18
# copy over to /usr/local/apache/htdocs
rm -rf /usr/local/apache/htdocs/knownGeneList/hg18
mkdir -p /usr/local/apache/htdocs/knownGeneList/hg18
cp -Rfp knownGeneList/hg18/* /usr/local/apache/htdocs/knownGeneList/hg18
##############################################################################
# Update entrez DB tables.
cd /cluster/store10/entrez
mkdir 070329
ln -s /cluster/store10/entrez/070329 /cluster/data/entrez/070329
cd /cluster/data/entrez/070329
wget --timestamping ftp://ftp.ncbi.nlm.nih.gov/gene/DATA/gene2accession.gz
wget --timestamping ftp://ftp.ncbi.nlm.nih.gov/gene/DATA/gene_info.gz
wget --timestamping ftp://ftp.ncbi.nlm.nih.gov/gene/DATA/gene2refseq.gz
gzip -d *.gz
cut -f 2,4 gene2accession | sort -u | grep -v "-" | grep -v "NM_" | sed -e 's/\./\t/g'|cut -f 1-2 > entrezMrna.tab
cut -f 2,4 gene2refseq | grep "NM_"| sort -u | grep -v "-" | sed -e 's/\./\t/g' > entrezRefseq.tab
cut -f 2,4 gene2refseq | grep "NM_"| sort -u | grep -v "-" | sed -e 's/\./\t/g'|cut -f 1-2 > entrezRefseq.tab
cut -f 2,4,6 gene2accession | grep "NM_"| grep "NP_"|sort -u | sed -e 's/\./\t/g'|cut -f 1,2,4 > entrezRefProt.tab
hgLoadSqlTab entrez entrezRefseq ~/src/hg/lib/entrezRefseq.sql ./entrezRefseq.tab
hgLoadSqlTab entrez entrezMrna ~/src/hg/lib/entrezMrna.sql ./entrezMrna.tab
hgLoadSqlTab entrez entrezRefProt ~/src/hg/lib/entrezRefProt.sql ./entrezRefProt.tab
cd /cluster/data/hg18/bed/ucsc.10
hgsql entrez -N -e \
    'select mrna, refseq from entrezRefseq, entrezMrna, hg18.all_mrna where qName=mrna and entrezRefseq.geneID=entrezMrna.geneID' \
    >mrnaRefseq1.tab
# Include RefSeq as valid mRNA too.
hgsql hg18 -N -e 'select name, name from refGene' >mrnaRefseq2.tab
cat mrnaRefseq1.tab mrnaRefseq2.tab |sort -u >mrnaRefseq.tab
hgLoadSqlTab hg18 mrnaRefseq ~/src/hg/lib/mrnaRefseq.sql ./mrnaRefseq.tab
##############################################################################
# RE-BUILD KEGG RELATED TABLES FOR KG III. (DONE, 3/29/07, Fan)
wget --timestamping -O hsa.html \
    "http://www.genome.ad.jp/dbget-bin/www_bfind_sub?dbkey=pathway&keywords=hsa&mode=bfind&max_hit=1000&.cgifields=max_hit"
grep href hsa.html | perl -wpe "s/<[^>]+>//g" > hsa.lis
# edit hsa.lis to remove the first (blank) line and the last line, which is unrelated.
~/kent/src/hg/protein/getKeggList2.pl hsa > keggList.tab
hgLoadSqlTab hg18 keggList ~/src/hg/lib/keggList.sql ./keggList.tab
# Before running hgKegg3, make sure entrez DB is updated.
hgKegg3 hg18 hg18
# Load resulting data
hgLoadSqlTab hg18 keggPathway ~/src/hg/lib/keggPathway.sql ./keggPathway.tab
hgLoadSqlTab hg18 keggMapDesc ~/src/hg/lib/keggMapDesc.sql ./keggMapDesc.tab
##############################################################################
# REATTACH KEGG TO KNOWN GENES.
# (DONE, 8/12/08, JK)
mkdir -p /cluster/data/hg18/bed/ucsc.11/kegg
cd /cluster/data/hg18/bed/ucsc.11/kegg
kgAttachKegg hg18 ../../ucsc.10/kegg/keggList.tab keggPathway.tab
hgLoadSqlTab hg18 keggPathway ~/src/hg/lib/keggPathway.sql ./keggPathway.tab
##############################################################################
# REATTACH SPMRNA TABLE TO KNOWN GENES. (DONE, 8/12/08, JK)
hgsql hg18 -N -e "select spDisplayID,kgID from kgXref where spDisplayID != ''" > spMrna.tab
hgLoadSqlTab hg18 spMrna ~/kent/src/hg/lib/spMrna.sql spMrna.tab
##############################################################################
# UPDATE BIOCYCTABLES NEEDED BY hgGene (DONE 3/27/07 Fan)
# First register with BioCyc to download their HumanCyc database.
# The site will email you the URL for the download.
wget --timestamping \
    http://bioinformatics.ai.sri.com/ecocyc/dist/flatfiles-52983746/humancyc-flatfiles.zip
unzip humancyc-flatfiles.zip
cp genes.col genes.tab
cp pathways.col pathways.tab
# delete the first 20 or so header lines from these two files.
vi genes.tab
vi pathways.tab
hgsql hg18 -e 'create database bioCyc070327'
hgLoadSqlTab bioCyc070327 genes ~/src/hg/lib/bioCycGenes.sql ./genes.tab
hgLoadSqlTab bioCyc070327 pathways ~/src/hg/lib/bioCycPathways.sql ./pathways.tab
# Create bioCycMapDesc.tab
hgsql bioCyc070327 -N -e 'select UNIQUE_ID, NAME from pathways' |sort -u > bioCycMapDesc.tab
# Create bioCycPathway.tab
kgBioCyc0 bioCyc070327 hg18 hg18
hgLoadSqlTab hg18 bioCycPathway ~/kent/src/hg/lib/bioCycPathway.sql ./bioCycPathway.tab
hgLoadSqlTab hg18 bioCycMapDesc ~/kent/src/hg/lib/bioCycMapDesc.sql ./bioCycMapDesc.tab
##########################################################################
# PARTIAL UPDATE OF BIOCYCTABLES NEEDED BY hgGene (DONE 8/05/08 JK)
# Note: ideally we would get new data from BioCyc, but they never sent the
# URL for the files even though I filled out their web form a week ago, so
# this reuses the 3/27/07 pathways.col and genes.col files. I did write a new
# kgBioCyc1 to do the actual load, and it is used on the new UCSC genes. It
# looks to be a slight improvement: about 10% more genes in pathways.
mkdir /cluster/data/hg18/bed/ucsc.11/bioCyc
cd /cluster/data/hg18/bed/ucsc.11/bioCyc
grep -v '^#' /cluster/data/hg18/bed/ucsc.10/bioCyc/pathways.col > pathways.tab
grep -v '^#' /cluster/data/hg18/bed/ucsc.10/bioCyc/genes.col > genes.tab
kgBioCyc1 genes.tab pathways.tab hg18 bioCycPathway.tab bioCycMapDesc.tab
hgLoadSqlTab hg18 bioCycPathway ~/kent/src/hg/lib/bioCycPathway.sql ./bioCycPathway.tab
hgLoadSqlTab hg18 bioCycMapDesc ~/kent/src/hg/lib/bioCycMapDesc.sql ./bioCycMapDesc.tab
###########################################################################
# SwitchDB TSS Track (DONE 2007-04-12 Andy)
ssh hgwdev
mkdir /cluster/data/hg18/bed/switchDbTss
cd /cluster/data/hg18/bed/switchDbTss
ln -s /cluster/data/hg17/bed/switchDbTss/switchDbTss.bed hg17.bed
liftOver -bedPlus=5 hg17.bed /gbdb/hg17/liftOver/hg17ToHg18.over.chain.gz hg18.bed unMapped
wc -l unMapped
#12 unMapped (12 are "deleted in new")
ln -s ~/kent/src/hg/lib/switchDbTss.sql
hgLoadBed -sqlTable=switchDbTss.sql hg18 switchDbTss hg18.bed
###########################################################################
# ADD KG TO TREEFAM LINKS (DONE, 2007-04-13 Fan)
# Generate ucscToEnsembl.txt and send it to TreeFam
# zhongzhongchen [chenzhzh at genomics.org.cn]
hgsql hg18 -N -e 'select * from knownToEnsembl' >ucscToEnsembl.txt
ssh hgwdev
cd /cluster/store12
mkdir treeFam070413
ln -s /cluster/store12/treeFam070413 /cluster/data/treeFam
cd /cluster/data/treeFam
# Receive the following files from TreeFam
ucscToEnsemblToTreefamToRefToUniprot.txt
ucscToEnsemblToTreefamToRef.txt
ucscToEnsemblTotreefam.txt
# Use ucscToEnsemblTotreefam.txt to construct knownToTreefam table.
cut -f 1,3 ucscToEnsemblTotreefam.txt >knownToTreefam.tab
hgLoadSqlTab hg18 knownToTreefam \
    ~/src/hg/lib/knownToTreefam.sql ./knownToTreefam.tab
# Add the following section into kent/src/hg/hgGene/hgGeneData/links.ra
name treeFam
shortLabel Treefam
tables knownToTreefam
idSql select value from knownToTreefam where name = '%s';
url http://www.treefam.org/cgi-bin/TFinfo.pl?ac=%s
priority 10
###########################################################################
# BLASTZ/CHAIN/NET HORSE (equCab1) (STARTED 2/16/07, DONE 2/21/07, Fan)
ssh kkstore05
mkdir /cluster/data/equCab1/bed/blastz.hg18.2007-02-15
cd /cluster/data/equCab1/bed/blastz.hg18.2007-02-15
# NOTE: THE TARGET WAS ORIGINALLY INTENDED TO BE HORSE, BUT I DID NOT
# DISCOVER THIS UNTIL THE TASK WAS DONE.
cat << '_EOF_' > DEF
# Horse vs.
Human BLASTZ_M=50 # TARGET: Human Hg18 SEQ1_DIR=/scratch/hg/hg18/nib SEQ1_LEN=/cluster/data/hg18/chrom.sizes SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY: Horse equCab1 SEQ2_DIR=/san/sanvol1/scratch/equCab1/equCab1.2bit SEQ2_LEN=/san/sanvol1/scratch/equCab1/chrom.sizes # Maximum number of scaffolds that can be lumped together SEQ2_LIMIT=500 SEQ2_CHUNK=30000000 SEQ2_LAP=0 BASE=/cluster/data/equCab1/bed/blastz.hg18.2007-02-15 TMPDIR=/scratch/tmp '_EOF_' # << this line keeps emacs coloring happy doBlastzChainNet.pl DEF \ -bigClusterHub pk \ -chainMinScore=3000 -chainLinearGap=medium \ -blastzOutRoot /cluster/bluearc/equCab1/blastz.hg18 >& do.log & tail -f do.log ln -s blastz.hg18.2007-02-15 /cluster/data/hg18/bed/blastz.equCab1 nice featureBits hg18 -chrom=chr1 chainEquCab1Link # 132947074 bases of 224999719 (59.088%) in intersection ssh hgwdev cd /cluster/data/equCab1/bed/blastz.hg18.2007-02-15 bash time nice -n 19 featureBits hg18 chainEquCab1Link \ > fb.hg18.chainEquCab1Link.txt 2>&1 & # 1643928877 bases of 2881515245 (57.051%) in intersection ######################################################################### # enable ORFeome track build. (markd 2007-05-02) cd ~/kent/src/hg/makeDb/genbank cvs update -d etc # edit etc/genbank.conf to add hg18.orfeomeTables.hgwdev = yes hg18.orfeomeTables.hgwbeta = yes # will need to enable for rr later. In the future, this can just be enabled # as part the normal genbank build. Change above to: hg18.orfeomeTables.default = yes ######################################################################### # exaptedRepeats track (4/30/07, Craig) # for full methods an analysis see: Lowe, Bejerano, Haussler. # Thousands of human mobile element fragments undergo # strong purifying selection near developmental genes. # PNAS. (in press). Epub 2007 Apr 26. # # Code to re-make this track is in: # build36/bed/exapted/create.csh # # To re-make the track all you have to do is run that c-shell # while you are in its directory. # It is easiest if you are on hgwdev since it uses featureBits a few times # and gets some info from the sql database. I would say it takes # about two hours to run. # ####################################################################### # UCSC GENES (DONE 2007-03-xx kent) see file: ucscGenes10.txt ####################################################################### # ENCODE Regulation track (DONE June 2010. DNAse and TFBS redone April 30 # 2011 kent) #make root dir mkdir -p /cluster/data/hg18/bed/wgEncodeReg cd /cluster/data/hg18/bed/wgEncodeReg # Create the DNAse peak clusters subtrack. # Get all of the narrowPeak format files for the wgEncodeUwDnaseSeq # linked into directory /hive/users/kent/regulate/dnase/peaks mkdir dnase cd dnase mkdir peaks ln -s /hive/groups/encode/dcc/analysis/ftp/pipeline/hg18/wgEncodeUwDnase/*.narrowPeak.gz peaks # Process these into clusters in a bed file and load clusters into # table. /bin/ls -1 peaks/*.narrowPeak.gz > peak.lst regClusterMakeTableOfTables uw01 peak.lst peak.table regCluster peak.table /dev/null peak.bed awk '$4 > 1 || $5 >= 100' peak.bed > wgEncodeRegDnaseClustered.bed hgLoadBed hg18 wgEncodeRegDnaseClustered wgEncodeRegDnaseClustered.bed # Make wgEncodeRegDnaseClusteredInput table. Start with mdbQuery, and # then do some massaging since not completely in sync with file list. 
mdbQuery out=tab "select obj,cell,treatment,replicate,lab,dateUnrestricted from hg18 where obj like 'wgEncodeUwDnase%' and view='Peaks'" | sed 's/n\/a/None/' > inputMdb.tab cut -f 1 peak.table | sed 's/\.narrowPeak\.gz//' | sed 's/peaks\///' > inputs.lst weedLines inputs.lst inputMdb.tab wgEncodeRegDnaseClusteredInputs.tab -invert hgLoadSqlTab hg18 wgEncodeRegDnaseClusteredInputs ~/kent/src/hg/lib/clusterInputDnase.sql \ wgEncodeRegDnaseClusteredInputs.tab # Create the Transcription Factor Binding Site subtrack. This is a bit # complex because it is merging data from the Snyder lab (yale) and from # HudsonAlpha (hud), and the hud data has replicates while the yale # data does not. # Create hud/replicates directory full of gzipped narrow peak files, # converting broad peak files as needed. mkdir -p /cluster/data/hg18/bed/wgEncodeReg/tfbs/hud/replicates cd /hive/groups/encode/dcc/analysis/ftp/pipeline/hg18/wgEncodeHudsonalphaChipSeq foreach i (*.narrowPeak*.gz) cp $i /cluster/data/hg18/bed/wgEncodeReg/tfbs/hud/replicates end foreach i (*.broadPeak*.gz) zcat $i | awk '{printf("%s\t%d\n", $0, ($3-$2)/2);}' > \ /cluster/data/hg18/bed/wgEncodeReg/tfbs/hud/replicates/$i:r:r.narrowPeak end cd /cluster/data/hg18/bed/wgEncodeReg/tfbs/hud/replicates gzip *.narrowPeak # Get ra file that includes the file name and other info we need for hud cd /cluster/data/hg18/bed/wgEncodeReg/tfbs/hud mkdir hud cd hud mdbQuery "select tableName,fileName,antibody,cell,replicate,treatment,lab from hg18 \ where view='Peaks' and lab='HudsonAlpha' and dataType='ChipSeq'" \ -out=ra | sed s/broadPeak/narrowPeak/ > hud.ra # Generate merged dir full of merged replicates. mkdir merged encodeMergeReplicatesBatch hud.ra replicates merge.sh merged.ra merged chmod a+x merge.sh merge.sh # Generate symbolic links to directories containing pooled peak files # for yale and for hud cd /cluster/data/hg18/bed/wgEncodeReg/tfbs ln -s /cluster/data/hg18/bed/wgEncodeReg/tfbs/hud/merged hudPeaks ln -s /hive/groups/encode/dcc/analysis/ftp/pipeline/hg18/wgEncodeYaleChIPseq yalePeaks # Get ra file that includes the file name and other info we need for yale mdbQuery "select tableName,fileName,antibody,cell,treatment,lab from hg18 where lab like '%Snyder%' and dataType like 'ChipSeq' and view='Peaks'" out=ra | sed 's/wgEncode/yalePeaks\/wgEncode/' > yale.ra # Get ra file for hud cat hud/merged.ra | sed 's/wgEncode/hudPeaks\/wgEncode/' > hud.ra # Combine both ra files and convert to three column tab-separated cat yale.ra hud.ra | raToTab stdin bothLabs.tab -cols=fileName,cell,antibody # Set up config file for clustering job. This includes calculating some # normalization factors for the score. The cellLetter.tab assigns # letters to cell lines, and is created by hand in the source tree. cp ~/kent/src/hg/regulate/regClusterBedExpCfg/cellLetter.tab . 
regClusterBedExpCfg -tabList bothLabs.tab bothLabs.cfg -cellLetter=cellLetter.tab # Do the actual clustering and load results into database hgBedsToBedExps -dupeLetterOk bothLabs.cfg peak.bed peak.exps awk '$2 != $3' peak.bed > filtered.bed hgLoadBed hg18 wgEncodeRegTfbsClustered filtered.bed hgLoadSqlTab hg18 wgEncodeRegTfbsCells ~/kent/src/hg/lib/expRecord.sql peak.exps # Create inputTrackTable - three columns: #mdbQuery "select tableName,cell,antibody from hg18 where (lab like '%Snyder%' or lab='HudsonAlpha') and dataType like 'ChipSeq' and view='Peaks' and antibody not like 'Pol2%'" -out=tab > wgEncodeRegTfbsClusteredInputs.tab cat yale.ra hud.ra | raToTab stdin stdout -cols=tableName,cell,antibody,cell,treatment,lab \ | awk 'BEGIN {OFS="\t";} {if ($5 != "None") $2=$2 "+" $5;print}' \ > wgEncodeRegTfbsClusteredInputs.tab hgLoadSqlTab hg18 wgEncodeRegTfbsClusteredInputs ~/kent/src/hg/lib/clusterInputTrackTable4.sql wgEncodeRegTfbsClusteredInputs.tab ####################################################################### # CGAP SAGE (DONE 2007-04-17 Andy) ssh hgwdev bash mkdir /san/sanVol1/scratch/andy/cgapSage cd /san/sanVol1/scratch/andy/cgapSage echo "select * from cgapSageLib" | hgsql hg18 | tail +2 > libs.txt echo "select * from snp127 where class='single' and locType='exact'" | hgsql hg18 | tail +2 | cut -f2- > allSnpss.txt echo "select name from snp127Exceptions where exception='ObservedWrongSize' or exception='SingleClassBetweenLocType' or exception='SingleClassRangeLocType' or exception='MultipleAlignment'" | hgsql hg18 | tail +2 > exceptions tabGrep -v exceptions 4 allSnps.txt > snps.txt rm allSnps.txt exceptions echo select chrom,chromStart,chromEnd,name from simpleRepeat | hgsql hg18 | tail +2 > trf.bed cut -f1-4 snps.txt > snps.bed overlapSelect -selectFmt=bed -inFmt=bed -nonOverlapping trf.bed snps.bed snps.noTrf.bed cut -f4 snps.noTrf.bed > snps.noTrf tabGrep snps.noTrf 4 snps.txt > snps.noTrf.txt mv snps.noTrf.txt snps.txt grep -v random /cluster/data/hg18/chrom.sizes | grep -v hap > chrom.sizes mkdir chromSnps for c in `cat chrom.sizes | cut -f1`; do awk "{if (\$1==\"$c\") print;}" snps.txt > chromSnps/$c.snps.txt; echo $c; done rm snps.txt wget ftp://ftp1.nci.nih.gov/pub/SAGE/HUMAN/Hs.libraries.gz gunzip Hs.libraries.gz cat << "EOF" > cleanLibs.awk BEGIN{FS="\t"} { for (i = 1; i <= 12; i++) { printf("%s\t", $i); } sex = ""; if ($13=="male") { sex = "male,"; } else if ($13=="female") { sex = "female,"; } else if ($13=="male and female") { sex = "male,female,"; } else if ($13=="unknown") { sex = ""; } printf("%s\t", sex); for (i = 14; i <= 20; i++) { printf("%s\t", $i); } printf("%s\n", $21); } EOF tail +2 Hs.libraries | awk -f cleanLibs.awk > libs.txt ln -s ~/kent/src/hg/lib/cgapSage/cgapSageLib.sql hgLoadSqlTab hg18 cgapSageLib cgapSageLib.sql libs.txt partitionSequence.pl -lstDir small 5000000 30 hg18.2bit chrom.sizes 0 > sequence.lst grep -v small sequence.lst > seq.lst cat small/* >> seq.lst mv seq.lst sequence.lst rm -rf small/ wget ftp://ftp1.nci.nih.gov/pub/SAGE/HUMAN/Hs_long.frequencies.gz gunzip Hs_long.frequencies.gz cat << "EOF" > doJobList.sh #!/bin/bash # basic vars part=$1; range=${part#*2bit:}; chrom=${range%:*}; nums=${range#*:} firstnum=${nums:0:1} outDir=output/${chrom}/${firstnum} mkdir -p $outDir echo ./doFind.sh $1 {check out exists `pwd`/${outDir}/${range}.bed} EOF chmod +x doJobList.sh for part in `cat sequence.lst`; do ./doJobList.sh $part >> jobList; done cat << "EOF" > doFind.sh #!/bin/bash # basic vars part=$1; range=${part#*2bit:}; 
chrom=${range%:*}; nums=${range#*:} firstnum=${nums:0:1} # dirs/files startDir=`pwd` scratch=/scratch/tmp/$part output=$2 # begin mkdir -p $scratch pushd $scratch twoBitToFa -noMask $startDir/"$part" part.fa cgapSageFind part.fa $startDir/Hs_long.frequencies $startDir/libs.txt \ $startDir/chromSnps/${chrom}.snps.txt output.bed cp output.bed $output popd rm -rf $scratch EOF chmod +x doFind.sh ssh pk cd /san/sanVol1/scratch/andy/cgapSage para create jobList para try para push # takes like 5-10 min exit # back to hgwdev find output/ -name '*.bed' -exec cat '{}' >> output.bed \; cgapSageDupeRemove output.bed tmp.bed cgapSageDupeRemove -unique tmp.bed final.bed ln -s ~/kent/src/hg/lib/cgapSage/cgapSage.sql hgLoadBed -sqlTable=cgapSage.sql -tab hg18 cgapSage final.bed ######################################################################### # HapMap SNPs (DONE 2007-05-23 Andy) # rel22 # OBSOLETED by Phase II+III SNPs 3/09 angie (see HAPMAP REL27 GENOTYPES) # Tables renamed to [originalName]PhaseII 3/9/09 ssh hgwdev bash cd /cluster/data/hg18/bed mkdir -p hapmap/zips cd hapmap/zips # archived to http://www.hapmap.org/genotypes/2007-03 wget -nd -r -N -A html http://www.hapmap.org/genotypes/latest_ncbi_build36/rs_strand/non-redundant/ grep gz index.html | sed 's/^.*href=\"\(geno.*\.txt\.gz\)\".*$/\1/' > files.txt wget -N -i files.txt --base=http://www.hapmap.org/genotypes/latest_ncbi_build36/rs_strand/non-redundant/ rm index.html robots.txt files.txt cd ../ mkdir samples cd samples/ wget http://www.hapmap.org/downloads/samples_individuals/pedinfo2sample_CEU.txt.gz wget http://www.hapmap.org/downloads/samples_individuals/pedinfo2sample_CHB.txt.gz wget http://www.hapmap.org/downloads/samples_individuals/pedinfo2sample_JPT.txt.gz wget http://www.hapmap.org/downloads/samples_individuals/pedinfo2sample_YRI.txt.gz cp /cluster/store12/snp/hapmap/rel21a/genotypes/2007-01/rs_strand/non-redundant/*.pl . ln -s ../zips ./filterPedigree.pl < pedinfo2sample_CEU.txt > filtered.CEU ./filterPedigree.pl < pedinfo2sample_YRI.txt > filtered.YRI zcat zips/*chr22_CEU* | head -1 | tr ' ' '\n' > header.CEU zcat zips/*chr22_YRI* | head -1 | tr ' ' '\n' > header.YRI grep -n -f filtered.CEU header.CEU | cut -f1 -d':' > offsets.CEU grep -n -f filtered.YRI header.YRI | cut -f1 -d':' > offsets.YRI for pop in CEU YRI CHB JPT; do for f in zips/genotypes_chr*_${pop}_r22_nr.b36.txt.gz; do zcat $f | ./filter${pop}.pl >> ../${pop}.merge echo Done with $f done done cd ../ for pop in CEU YRI CHB JPT; do ~/kent/src/hg/snp/snpLoad/hapmap1 ${pop}.merge ${pop}.condense mv hapmap1.log ${pop}.hapmap1.log done wc -l *.log #0 CEU.hapmap1.log #0 CHB.hapmap1.log #0 JPT.hapmap1.log #0 YRI.hapmap1.log #0 total rm *.log cp ~/kent/src/hg/lib/hapmapSnps.sql . for pop in CEU CHB JPT YRI; do sed "s/hapmapSnps/hapmapSnps$pop/" hapmapSnps.sql > hapmapSnps${pop}.sql hgLoadBed -sqlTable=hapmapSnps${pop}.sql hg18 hapmapSnps$pop ${pop}.condense done # Don't worry if you see: #load of hapmapSnpsCEU did not go as planned... etc. # unless it says rows skipped. ~/kent/src/hg/snp/snpLoad/hapmap2 hg18 #building CEU hash... 
#Can't start query: #select * from hapmapAllelesCEU # #mySQL error 1146: Table 'hg18.hapmapAllelesCEU' doesn't exist # But this works: ~heather/kent/src/hg/snp/snpLoad/hapmap2 hg18 # (gotta bug Heather about that one) ln -s ~/kent/src/hg/lib/hapmapSnpsCombined.sql hgLoadBed -sqlTable=hapmapSnpsCombined.sql hg18 hapmapSnpsCombined hapmapSnpsCombined.tab # Checks: ~heather/kent/src/hg/snp/snpLoad/snpCheckCluster2 hg18 hapmapSnpsCombined #match count = 0 ### clean up rm *.sql hapmapSnpsCombined.tab bed.tab tar cfvz merge.tar.gz *.merge tar cfvz condense.tar.gz *.condense rm *.condense *.merge mkdir logs mv *.errors *.log *.out logs mkdir orthos cd orthos/ # hgWiggle output has the chromosome in a comment, followed by the values # This script prints that chromosome on every line cat << "EOF" > joinify.awk { if ($1 == "variableStep") { sub("chrom=", "", $2); chrom = $2; } else if ($1 != "#") { printf("%s,%s\t%s\n", chrom, $1, $2); } } EOF cat << "EOF" > join.sh #!/bin/bash sed 's/\(^chr\w\+\)\t/\1,/' $1 > bed sort -k1,1 bed > tmp; mv tmp bed awk -f joinify.awk $2 > scores sort -k1,1 scores > tmp; mv tmp scores join -1 1 -2 1 bed scores | tr ',' ' ' | awk '{printf("%s\t%s\t%s\t%s\t%d\t%s\t%s\n", $1, $2, $3, $4, $8, $6, $7);}' > qual.tab rm scores bed EOF chmod +x join.sh # chimp alleles cd /cluster/data/dbSNP/ortho/hg18/panTro2Seq awk '{printf("%s\t%s\t%s\t%s\t0\t%s\t%s\n", $2, $3, $4, $5, $7, $8);}' snp126orthoPrelim.tab > snp126orthoPrelim.bed cp snp126orthoPrelim.bed /cluster/data/hg18/bed/hapmap/orthos/panTro2.bed.new cd /cluster/data/hg18/bed/hapmap/orthos hgWiggle -db=panTro2 -bedFile=panTro2.bed quality > panTro2.scores # create qual.tab; combine panTro2 sequence with panTro2 quality score ./join.sh panTro2.bed.new panTro2.scores grep chr21 panTro2.bed.new >> qual.tab grep chrY panTro2.bed.new >> qual.tab # create snpOrtho.tab; a table in human coords that has associated ortho alleles ~heather/kent/src/hg/snp/snpLoad/snpOrtho hg18 snp126 qual.tab sed 's/snpOrtho/snp126OrthoPanTro2/' ~/kent/src/hg/lib/snpOrtho.sql > snpOrthoPanTro2.sql hgLoadBed -tab -sqlTable=snpOrthoPanTro2.sql hg18 snp126OrthoPanTro2 snpOrtho.tab mysql> update snp126OrthoPanTro2 set orthoScore = 98 where orthoChrom = "chr21"; mysql> update snp126OrthoPanTro2 set orthoScore = 98 where orthoChrom = "chrY"; mysql> update snp126OrthoPanTro2 set orthoScore = 98 where orthoChrom = "chrY_random"; # get the HapMap subset sed 's/hapmapAllelesOrtho/hapmapAllelesChimp/' ~/kent/src/hg/lib/hapmapAllelesOrtho.sql > hapmapAllelesChimp.sql ~heather/kent/src/hg/snp/snpLoad/hapmapOrtho hg18 hapmapSnpsCombined snp126OrthoPanTro2 hgLoadBed -tab -sqlTable=hapmapAllelesChimp.sql hg18 hapmapAllelesChimp hapmapOrtho.tab # sanity check mysql> select count(*) from hapmapAllelesChimp where chrom = orthoChrom; # 3,492,708 mysql> select count(*) from hapmapAllelesChimp where chrom != orthoChrom; # 374,010 # macaque alleles cd /cluster/data/dbSNP/ortho/hg18/rheMac2Seq awk '{printf("%s\t%s\t%s\t%s\t0\t%s\t%s\n", $2, $3, $4, $5, $7, $8);}' snp126orthoPrelim.tab > snp126orthoPrelim.bed cp snp126orthoPrelim.bed /cluster/data/hg18/bed/hapmap/orthos/rheMac2.bed.new cd /cluster/data/hg18/bed/hapmap/orthos hgWiggle -db=rheMac2 -bedFile=rheMac2.bed quality > rheMac2.scores # create qual.tab: combine rheMac2 sequence with rheMac2 quality score ./join.sh rheMac2.bed.new rheMac2.scores # create snpOrtho.tab; a table in human coords that has associated ortho alleles ~heather/kent/src/hg/snp/snpLoad/snpOrtho hg18 snp126 qual.tab sed 
's/snpOrtho/snp126OrthoRheMac2/' ~/kent/src/hg/lib/snpOrtho.sql > snpOrthoRheMac2.sql hgLoadBed -tab -sqlTable=snpOrthoRheMac2.sql hg18 snp126OrthoRheMac2 snpOrtho.tab # get the HapMap subset sed 's/hapmapAllelesOrtho/hapmapAllelesMacaque/' ~/kent/src/hg/lib/hapmapAllelesOrtho.sql > hapmapAllelesMacaque.sql ~heather/kent/src/hg/snp/snpLoad/hapmapOrtho hg18 hapmapSnpsCombined snp126OrthoRheMac2 hgLoadBed -tab -sqlTable=hapmapAllelesMacaque.sql hg18 hapmapAllelesMacaque hapmapOrtho.tab # create summary table ~heather/kent/src/hg/snp/snpLoad/hapmapSummary hg18 hapmapSnpsCombined hapmapAllelesChimp hapmapAllelesMacaque ln -s ~/kent/src/hg/lib/hapmapAllelesSummary.sql hgLoadBed -tab -sqlTable=hapmapAllelesSummary.sql hg18 hapmapAllelesSummary hapmapSummary.tab ############################################################################# # RE-BUILD WGRNA TRACK (DONE, 2007-05-31, Fan) # rebuilt below: RE-BUILD WGRNA TRACK (DONE, 2007-10-05, Fan) ssh hgwdev cd /cluster/data/hg18/bed mkdir wgRna-2007-05-31 cd wgRna-2007-05-31 # Received the data file, wg_may2007.txt (saved from wg_may2007.doc) # from Michel Weber's email # (Michel.Weber at ibcg.biotoul.fr) # and place it under cd /cluster/data/hg18/bed/wgRna-2007-05-31. cat wg_may2007.txt|sed -e 's/ /\t/g' > wgRna.tab hgLoadBed -sqlTable=/cluster/home/fanhsu/src/hg/lib/wgRna.sql hg18 wgRna wgRna.tab ############################################################################# # N-SCAN GENES track (2007-06-21 markd) # create a composite track with exists ab-inito and new PASA N-SCAN predictions # download pasa predictions cd /cluster/data/hg18/bed/nscan/pasa wget http://mblab.wustl.edu/predictions/human/hg18_PASA/hg18.pasa.gtf wget http://mblab.wustl.edu/predictions/human/hg18_PASA/hg18.prot.fa bzip2 hg18.* chmod a-w hg18.* ldHgGene -gtf -genePredExt hg18 nscanPasaGene hg18.pasa.gtf.bz2 hgPepPred hg18 generic nscanPasaPep hg18.prot.fa.bz2 rm *.tab # update trackDb; need a hg18-specific page to describe informants and PASA human/hg18/nscan.html human/hg18/trackDb.ra # remove old human/hg18/nscanGene.html ########################################################################### # AUGUSTUS track (DONE 2007-7-3 Mario) # # augustusHints subtrack mkdir -p /cluster/data/hg18/bed/augustus/usingHints/predictions/Trefseq.hmRNA.hsEST.R.X.final cd /cluster/data/hg18/bed/augustus/usingHints/predictions/Trefseq.hmRNA.hsEST.R.X.final wget http://augustus.gobics.de/predictions/hg18/usingEvidence/augustus.hg18.Trefseq.hmRNA.hsEST.R.X.pep.gff wget http://augustus.gobics.de/predictions/hg18/usingEvidence/augustus.hg18.Trefseq.hmRNA.hsEST.R.X.pep.aa ldHgGene -bin hg18 augustusHints augustus.hg18.Trefseq.hmRNA.hsEST.R.X.gff hgPepPred hg18 generic augustusHintsPep augustus.hg18.Trefseq.hmRNA.hsEST.R.X.pep.aa # augustus de novo subtrack mkdir -p /cluster/data/hg18/bed/augustus/usingHints/predictions/Xp.RA.it cd /cluster/data/hg18/bed/augustus/usingHints/predictions/Xp.RA.it wget http://augustus.gobics.de/predictions/hg18/deNovo/augustus.hg18.Xp.RA.it.pep.gff wget http://augustus.gobics.de/predictions/hg18/deNovo/augustus.hg18.Xp.RA.it.pep.aa ldHgGene -bin hg18 augustusXRA augustus.hg18.Xp.RA.it.gff hgPepPred hg18 generic augustusXRAPep augustus.hg18.Xp.RA.it.pep.aa # augustus ab initio subtrack mkdir -p /cluster/data/hg18/bed/augustus/abinitio cd /cluster/data/hg18/bed/augustus/abinitio wget http://augustus.gobics.de/predictions/hg18/abinitio/augustus.pep.gff wget http://augustus.gobics.de/predictions/hg18/abinitio/augustus.pep.aa ldHgGene -bin hg18 augustusAbinitio 
augustus.gff hgPepPred hg18 generic augustusAbinitioPep augustus.pep.aa ############################################################################# # Stanford NRSF ChIP-seq (DONE, Heather, July 2007) # Add color-by-strand and overlap table (2008-05-27 kate) # BED file of sites provided May 2008 by Tim Reddy (treddy@gmail.com) ssh kkstore03 cd /cluster/data/encode/stanford/2007-03-14 # lift to hg18 liftOver fix.bed /gbdb/hg17/liftOver/hg17ToHg18.over.chain.gz hg18.bed core.unmapped liftOver control_fix.bed /gbdb/hg17/liftOver/hg17ToHg18.over.chain.gz hg18.control.bed control.unmapped # add color by strand (red for +, blue for minus) awk 'OFS="\t" {$6=="+" ? c="255,0,0" : c="0,0,255"; print $1, $2, $3, "", $5, $6, $2, $3, c}' hg18.bed > hg18.fixc.bed awk 'OFS="\t" {$6=="+" ? c="255,0,0" : c="0,0,255"; print $1, $2, $3, "", $5, $6, $2, $3, c}' hg18.control.bed > hg18.control_fixc.bed # load into database hgwdev cd /cluster/data/encode/stanford/2007-03-14 hgLoadBed hg18 stanfordNRSFEnriched hg18.fixc.bed -tab hgLoadBed hg18 stanfordNRSFControl hg18.control_fixc.bed -tab # overlap tables set prefix = /gbdb/hg18/wib set table = stanfordNRSFEnrichedOverlaps sort -k1,1 -k2,2n hg18.bed | bedItemOverlapCount hg18 stdin | \ wigEncode stdin ${table}.wig ${table}.wib ln -s /cluster/data/encode/stanford/2007-03-14/${table}.wib $prefix hgLoadWiggle -pathPrefix=$prefix hg18 $table ${table}.wig set table = stanfordNRSFControlOverlaps sort -k1,1 -k2,2n hg18.control.bed | bedItemOverlapCount hg18 stdin | \ wigEncode stdin ${table}.wig ${table}.wib ln -s /cluster/data/encode/stanford/2007-03-14/${table}.wib $prefix hgLoadWiggle -pathPrefix=$prefix hg18 $table ${table}.wig # peaks (provided May 2008) sort -k1,1 -k2,2n lab/NRSF_Peak_Calls.bed | \ awk '{print $1, $2, $3}' > peaks.bed wc -l peaks.bed # 2116 hgLoadBed -noBin hg18 stanfordNRSFSites peaks.bed ######################################################################### # REGULATORY POTENTIAL UPDATE (DONE - 2007-08-01 - Hiram) # download data from "James Taylor" ssh kkstore02 mkdir /cluster/data/hg18/bed/regPotential7X.update cd /cluster/data/hg18/bed/regPotential7X.update # This is a lot of data for C in 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 X Y do wget --timestamping \ "http://www.bx.psu.edu/~james/esperr_rp_7way_scores/genome_scores_hg18/chr${C}.scores.truncated.bz2" echo "DONE - chr${C}.scores.truncated.bz2" done # create download gzip files from the bz2 files: time for F in chr*.scores.truncated.bz2 do C=`echo $F | awk -F'.' '{print $1}'` echo -n "${C}.regPotential7X.hg18.gz working ... " bzcat ${F} | gzip > ${C}.regPotential7X.hg18.gz touch -r "${F}" "${C}.regPotential7X.hg18.gz" echo "done" done time for C in 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 X Y do zcat chr${C}.regPotential7X.hg18.gz done | wigEncode -noOverlap stdin regPotential7X.wig regPotential7X.wib # Converted stdin, upper limit 1.00, lower limit 0.00 # real 16m40.347s # Loading the table on hgwdev ssh hgwdev cd /cluster/data/hg18/bed/regPotential7X.update mkdir /gbdb/hg18/wib/070118 ln -s /cluster/data/hg18/bed/regPotential7X.update/regPotential7X.wib \ /gbdb/hg18/wib/070118/regPotential7X.wib # using the tmpDir is faster since it is on local disk and it will # clean up any temporary .tab file it creates there time nice -n +19 hgLoadWiggle -tmpDir=/scratch/tmp \ -pathPrefix=/gbdb/hg18/wib/070118 hg18 regPotential7X regPotential7X.wig # real 0m38.247s # How about a histogram of the data. 
ssh kolossus cd /cluster/data/hg18/bed/regPotential7X.update time nice -n +19 hgWiggle -verbose=2 -doHistogram -hBinSize=0.01 \ -hBinCount=100 -hMinVal=0.0 -db=hg18 regPotential7X > histogram.data 2>&1 # real 3m15.934s # 73 % of the data values are zero # create download gzip files from the bz2 files: ssh kkstore02 cd /cluster/data/hg18/bed/regPotential7X for F in chr*.scores.truncated.bz2 do C=`echo $F | awk -F'.' '{print $1}'` echo -n "${C}.regPotential7X.hg18.gz working ... " bzcat ${F} | gzip > ${C}.regPotential7X.hg18.gz echo done # renaming file directory -- kuhn 08-17-2007 cd /gbdb/hg18/wib mv 070118 regPot070118 hgsql -e " update regPotential7X SET file = " \ "/gbdb/hg18/wib/regPot070118/regPotential7X.wib" hg18 Query OK, 2341572 rows affected (31.59 sec) Rows matched: 2341572 Changed: 2341572 Warnings: 0 ############################################################################# # SIB Transcriptome (DONE Aug 29, 2007 - JK) # Create working directory and download data from where Christian Iseli # (Christian.Iseli at licr.org) put it, and unpack. The download takes about # ten minutes (161M file). cd /cluster/data/hg18/bed mkdir sibTranscriptome cd sibTranscriptome wget ftp://ftp.licr.org/pub/databases/trome/human/txg.tar.gz wget ftp://ftp.licr.org/pub/databases/trome/human/HTR.gtf.gz tar -zxvf txg.tar.gz # Load up sibGene table zcat HTR.gtf.gz | ldHgGene hg18 sibGene stdin # Do a little data cleanup and transformation and load splice graphs into database. sed 's/altGraphX/sibTxGraph/' ~/src/hg/lib/altGraphX.sql > sibTxGraph.sql sed 's/chrMt/chrM/' txg/chromMt.txg > txg/chromM.txg rm txg/chromMt.txt cat txg/*.txg | txgToAgx stdin stdout | hgLoadBed -notItemRgb -sqlTable=sibTxGraph.sql hg18 sibTxGraph stdin # Create sibAltEvents track for analysed alt-splices. cat txg/*.txg | txgAnalyze stdin /cluster/data/hg18/hg18.2bit sibAltEvents.bed awk '$2 >= 0' sibAltEvents.bed | sort | uniq > foo.bed hgLoadBed hg18 sibAltEvents foo.bed ######################################################################### # BLASTZ MOUSE Mm9 (DONE - 2007-08-20 - Hiram) ssh kkstore02 mkdir /cluster/data/hg18/bed/blastzMm9.2007-08-09 cd /cluster/data/hg18/bed/blastzMm9.2007-08-09 # Started this before the rsync to /scratch/data/mm9/ had completed, # hence the /cluster/bluearc/scratch/data/mm9/ location is used # here. 
(hg18 was also in transition to a new location) cat << '_EOF_' > DEF # human vs mouse BLASTZ_ABRIDGE_REPEATS=1 # TARGET: Human Hg18 SEQ1_DIR=/cluster/bluearc/scratch/data/hg18/nib SEQ1_SMSK=/cluster/bluearc/scratch/data/hg18/linSpecRep/notInMouseRat SEQ1_LEN=/cluster/data/hg18/chrom.sizes SEQ1_CHUNK=10000000 SEQ1_LAP=0 # QUERY: Mouse Mm9 SEQ2_DIR=/cluster/bluearc/scratch/data/mm9/nib SEQ2_SMSK=/cluster/bluearc/scratch/data/mm9/notInOthers SEQ2_LEN=/cluster/data/mm9/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=10000 BASE=/cluster/data/hg18/bed/blastzMm9.2007-08-09 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs # establish a screen to control this job screen time nice -n +19 doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \ `pwd`/DEF > blastz.out 2>&1 & # real 1480m54.483s # failed due to pk node difficulties, finish the run.blastz # manually # Completed: 102120 of 102120 jobs # CPU time in finished jobs: 6908585s 115143.08m 1919.05h 79.96d 0.219 y # IO & Wait Time: 50958894s 849314.90m 14155.25h 589.80d 1.616 y # Average job time: 567s 9.44m 0.16h 0.01d # Longest finished job: 3000s 50.00m 0.83h 0.03d # Submission to last job: 446177s 7436.28m 123.94h 5.16d # continuing time nice -n +19 ~/kent/src/hg/utils/automation/doBlastzChainNet.pl \ -verbose=2 -bigClusterHub=pk -chainMinScore=3000 \ -chainLinearGap=medium -continue=cat `pwd`/DEF > cat.out 2>&1 & # real 111m59.041s cat /cluster/data/hg18/bed/blastzMm9.2007-08-09/fb.hg18.chainMm9Link.txt # 1014323175 bases of 2881515245 (35.201%) in intersection cat /cluster/data/hg18/bed/blastz.mm8/fb.hg18.chainMm8Link.txt # 994530182 bases of 2881515245 (34.514%) in intersection cd /cluster/data/hg18/bed ln -s blastzMm9.2007-08-09 blastz.mm9 # Then to swap over to Mm9 (also in mm9.txt) mkdir /cluster/data/mm9/bed/blastz.hg18.swap cd /cluster/data/mm9/bed/blastz.hg18.swap time nice -n +19 ~/kent/src/hg/utils/automation/doBlastzChainNet.pl \ -verbose=2 -swap -bigClusterHub=pk -chainMinScore=3000 \ -chainLinearGap=medium \ /cluster/data/hg18/bed/blastz.mm9/DEF > swap.out 2>&1 & # real 67m21.146s cat /cluster/data/mm9/bed/blastz.hg18.swap/fb.mm9.chainHg18Link.txt # 1008812599 bases of 2620346127 (38.499%) in intersection cat /cluster/data/mm8/bed/blastz.hg18/fb.mm8.chainHg18Link # 984380268 bases of 2567283971 (38.343%) in intersection cd /cluster/data/mm9/bed ln -s blastz.hg18.swap blastz.hg18 ## make syntenic net (DONE - 2007-08-20 - Hiram) cd /cluster/data/hg18/bed/blastzMm9.2007-08-09 time nice -n +19 ~/kent/src/hg/utils/automation/doBlastzChainNet.pl \ -verbose=2 -bigClusterHub=pk -chainMinScore=3000 \ -syntenicNet -chainLinearGap=medium -continue=syntenicNet \ `pwd`/DEF > syntenic.out 2>&1 & ## real 25m47.767s ######################################################################### # LOAD ACEMBLY (DONE 8/28/07 angie) ssh kkstore02 cd /cluster/data/hg18/bed/acembly # Move aside liftOver run results mkdir liftOver mv a* g* h* j* u* liftOver wget --timestamping ftp://ftp.ncbi.nih.gov/repository/acedb/ncbi_36.human.genes/AceView.ncbi_36.genes_gff.tar.gz wget --timestamping ftp://ftp.ncbi.nih.gov/repository/acedb/ncbi_36.human.genes/AceView.ncbi_36.good_proteins_fasta.tar.gz tar xvzf AceView.ncbi_36.genes_gff.tar.gz tar xvzf AceView.ncbi_36.good_proteins_fasta.tar.gz cd AceView.ncbi_36.genes_gff # If the result of this command is > 0, then some lines have end < start # and need to be fixed: awk '$5 < $4 {print;}' *.gff | wc -l #0 # Filter out empty lines, lines where the product_id has a stray # newline before it, 
and $chr|Hs# IDs that don't appear liftable. egrep -h -v '^(| ?product_id.*|..?\|Hs.*)$' *.gff \ | sed -e 's/^/chr/;' \ > acembly.gff # Extract annotation classes from original gff: egrep -h -v '^(| ?product_id.*|..?\|Hs.*)$' *.gff \ | perl -wpe 's/^.*Gene_type (\w+); transcript_id (\S+);.*/$2\t$1/; \ s/Main$/main/ || s/Putative$/putative/ || \ die "Unrecognized class/Gene_type:\n$_\n";' \ | sort -u \ > acemblyClass.tab # Some gff transcript_id's end in -unspliced (no intron), but the # corresponding protein fasta IDs to not have that suffix. We need # them to match, so add where necessary. # Use perl to make a perl script to add -unspliced to protein IDs # where necessary: grep unspliced acemblyClass.tab | wc -l #70156 egrep -h -v '^(| ?product_id.*|..?\|Hs.*)$' *.gff \ | perl -wpe 's@^.*transcript_id (\S+)-unspliced;.*$@\$unsp{"$1"} = 1;@ \ || s/^.*\n$//;' \ | sort -u \ > ../addUnspliced.pl wc -l ../addUnspliced.pl #70156 ../addUnspliced.pl cat >> ../addUnspliced.pl <<'_EOF_' while (<>) { if (/^>(\S+)$/) { if ($unsp{$1}) { s/^>(\S+)/>$1-unspliced/; } } print; } '_EOF_' # << emacs # Add -unspliced suffix to protein IDs where necessary, and pare down # proteins to just the ones that we have transcripts for: cd /cluster/data/hg18/bed/acembly/AceView.ncbi_36.good_proteins_fasta awk '{print $1;}' ../AceView.ncbi_36.genes_gff/acemblyClass.tab \ > transcriptNames.txt perl ../addUnspliced.pl *.fasta \ | faSomeRecords stdin transcriptNames.txt acemblyPep.fa grep unspliced acemblyPep.fa | wc -l #55931 # Danielle Thierry-Mieg explained that noncoding genes are included so # the number of proteins can be smaller than the number of transcripts. # Load tables ssh hgwdev cd /cluster/data/hg18/bed/acembly/AceView.ncbi_36.genes_gff ldHgGene -gtf hg18 acembly acembly.gff #Read 258618 transcripts in 3451107 lines in 1 files # 258618 groups 24 seqs 1 sources 5 feature types #258618 gene predictions hgLoadSqlTab hg18 acemblyClass ~/kent/src/hg/lib/acemblyClass.sql \ acemblyClass.tab cd /cluster/data/hg18/bed/acembly/AceView.ncbi_36.good_proteins_fasta hgPepPred hg18 generic acemblyPep acemblyPep.fa rm acemblyPep.tab runJoiner.csh hg18 acembly # hg18.acemblyPep.name - hits 210003 of 210003 ok # hg18.acemblyClass.name - hits 258618 of 258618 ok ########################################################################### ## Create gc5Base download raw data file (DONE - 2007-08-29 - Hiram) ssh kkstore02 cd /cluster/data/hg18/bed/gc5Base hgGcPercent -wigOut -doGaps -file=stdout -win=5 \ hg18 /cluster/data/hg18/hg18.2bit 2> /dev/null \ | gzip > hg18.gc5Base.txt.gz ssh hgwdev mkdir /usr/local/apache/htdocs/goldenPath/hg18/gc5Base cd /usr/local/apache/htdocs/goldenPath/hg18/gc5Base ln -s /cluster/data/hg18/bed/gc5Base/hg18.gc5Base.txt.gz . ########################################################################### # GENE BOUNDS (RNACLUSTER) (REBUILT 08-30-2007 Fan) # Create rnaCluster table (depends on {est,mrna}OrientInfo) cd /cluster/data/hg18/bed mv rnaCluster rnaCluster.old mkdir rnaCluster cd rnaCluster/ mkdir chrom # Create a list of accessions that come from RAGE libraries and need to be excluded. 
~/kent/src/hg/geneBounds/clusterRna/generateRageAccList.csh hg18 rage.libs foreach f (/cluster/data/hg18/nib/chr*.nib) set c = $f:t:r set out = chrom/$c.bed # Exclude accesions in the RAGE file echo clusterRna -mrnaExclude=hg18.rage.libs hg18 /dev/null $out -chrom=$c clusterRna -mrnaExclude=hg18.rage.libs hg18 /dev/null $out -chrom=$c end hgLoadBed hg18 rnaCluster chrom/*.bed ########################################################################### # RE-LOAD FISH CLONES after bacEnds update (DONE - 2007-09-04 - Hiram) # The bacEnds processing results are used here ssh hgwdev mkdir /cluster/data/hg18/bed/fishClones.2007-08-29 cd /cluster/data/hg18/bed/fishClones.2007-08-29 ln -s ../fishClones/cl_acc_gi_len . ln -s ../fishClones/fhcrc.sts . # have to be on hgwdev for this since it is going to read from the db time nice -n +19 fishClones -verbose=2 -fhcrc=fhcrc.sts -noBin hg18 \ /cluster/data/ncbi/fishClones/fishClones.2006-01/fixed.hbrc.txt \ /cluster/data/ncbi/fishClones/fishClones.2006-01/clac.out \ ./cl_acc_gi_len \ /cluster/data/hg18/bed/bacends/bacEnds.lifted.psl \ fishClones > fishClones.out 2>&1 # real 0m53.783s # Reading Fish Clones file /cluster/data/ncbi/fishClones/fishClones.2006-01/fixed.hbrc.txt # reading fishInfo file /cluster/data/ncbi/fishClones/fishClones.2006-01/fixed.hbrc.txt # Reading Clone/Acc (clac.out) file /cluster/data/ncbi/fishClones/fishClones.2006-01/clac.out # Reading BAC Ends file ./cl_acc_gi_len # Reading BAC Ends psl file /cluster/data/hg18/bed/bacends/bacEnds.lifted.psl # Reading additional STS Marker links fhcrc.sts # Determining good positions # findClonePos: determining positions of fish clones # Writing output file # ERROR: at line # 177, no cytoband info for chrX:104048913-104206974 # RP11-79L11 # ERROR: at line # 178, no cytoband info for chrX:104048913-104206974 # RP11-79L11 # Load the track hgLoadBed -notItemRgb -noBin -tab \ -sqlTable=$HOME/kent/src/hg/lib/fishClones.sql \ hg18 fishClones fishClones.bed # Loaded 9788 elements of size 16 ############################################################################ # INDEL-BASED CONSERVATION TRACK (DONE, 2007-09-03 - 2007-09-17, hartera) # Data from the Gerton Lunter (gerton.lunter at anat.ox.ac.uk), MRC # Functional Genetics Unit, University of Oxford, United Kingdom. # Data is from the paper: # Lunter G, Ponting CP and Hein J Genome-wide identification of human # functional DNA using a neutral indel model. PLoS Comput Biol. 2006 # Jan;2(1):e5. ssh kkstore02 mkdir -p /cluster/data/hg18/bed/consIndels/data cd /cluster/data/hg18/bed/consIndels # Add a README.indels with the e-mail from Gerton Lunter # get the data wget --timestamping \ http://wwwfgu.anat.ox.ac.uk/~gerton/igs-hg18mm8cf2.zip # 38 Mb zip file in GFF format. This contains data for hg18 # comparing it to mm8 and cf2 (canFam2). unzip igs-hg18mm8cf2.zip mv *.gff ./data/ foreach f (./data/*.gff) set r = $f:r echo $r grep -v "track" $f > ${r}NoHeader.gff end # strip off the end of the name e.g. IGS0001.1:p=.74; FDR 0.27 # so that the name displayed is short - IGS0001.1. 
The score field # is used to determine colouring and this is calculated from FDR ssh kkstore02 cd /cluster/data/hg18/bed/consIndels perl -pi.bak -e \ 's/(IGS[0-9a-z]+\.[0-9XY]+):p=?> consIndelsHg18Mm8CanFam2.bed end # load data ssh hgwdev cd /cluster/data/hg18/bed/consIndels hgsql -e 'drop table consIndelsHg18Mm8CanFam2;' hg18 hgLoadBed hg18 consIndelsHg18Mm8CanFam2 consIndelsHg18Mm8CanFam2.bed # Loaded 2603017 elements of size 5 # Get the IDs, posterior probabilities (p) for the segment being neutral, # and the FDR from the original GFFs for a separate table. Some items # have p<.001. Can not do Table Browser queries restricting # p to <, =, or > a specified value unless all values are floats. # Contacted the data contributor, Gerton Lunter, and he said it would be # ok to change all p<.001 to p=0.0005 ssh kkstore02 cd /cluster/data/hg18/bed/consIndels/ foreach c (`cat /cluster/data/hg18/chrom.lst`) echo $c foreach f (./data/igs.chr${c}.gff) echo $f awk 'BEGIN {FS="\t"} {if ($9 ~ /IGS/) print $9;}' $f \ | sed -e 's/:/\t/' \ | sed -e 's/p=\./0\./' | sed -e 's/p<\.001/0\.0005/' \ | sed -e 's/;\sFDR/\t/' >> consIndelsHg18Mm8CanFam2Conf.txt end end # there are no GFF files for the haplotype chroms # Create a table definition for the table of identifier, posterior # probability and false discovery rate (FDR). cat << 'EOF' > $HOME/kent/src/hg/lib/itemConf.as table itemConf "Probability and false discovery rate (FDR) for an element in a track." ( string id; "Identifier of element" float probability; "Probability associated with element" float fdr; "False Discovery Rate (FDR) associated with element" ) 'EOF' # << emacs cd $HOME/kent/src/hg/lib autoSql itemConf.as itemConf mv itemConf.h ../inc/ # commit ../inc/itemConf.h, itemConf.c, itemConf.as and # itemConf.sql to CVS. Add itemConf.o to src/hg/lib/makefile ssh hgwdev cd /cluster/data/hg18/bed/consIndels hgLoadSqlTab hg18 consIndelsHg18Mm8CanFam2Conf \ $HOME/kent/src/hg/lib/itemConf.sql \ consIndelsHg18Mm8CanFam2Conf.txt # check that all itesm are in this table. hgsql -N -e 'select distinct(name) from consIndelsHg18Mm8CanFam2;' hg18 \ | sort > consIndels.names.sort hgsql -N -e 'select distinct(id) from consIndelsHg18Mm8CanFam2Conf;' hg18 \ | sort > consIndels.idsfromConf.sort wc -l *.sort # 2603017 consIndels.idsfromConf.sort # 2603017 consIndels.names.sort comm -12 consIndels.names.sort consIndels.idsfromConf.sort | wc -l # 2603017 # so all element IDs are in both tables. # cleanup rm ./data/*.bak *.sort # add trackDb/human/hg18/trackDb.ra entry and add description that # was written by the data contributor. Add code to hgc.c to display # the posterior probability and the FDR on the details page for # track elements. Gerton Lunter provided a description for the data # on 2007-09-12. 
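# The per-item confidence values live in a separate table keyed by element
# name. As an illustrative check (not part of the original build), the query
# below shows the kind of join the details page can use to pull probability
# and FDR for one element; the ID "IGS0001.1" is only an example value.
hgsql hg18 -N -e \
    'select c.chrom, c.chromStart, c.chromEnd, i.probability, i.fdr from consIndelsHg18Mm8CanFam2 c, consIndelsHg18Mm8CanFam2Conf i where c.name = i.id and c.name = "IGS0001.1"'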
############################################################################ # Promote UCSD genome-wide ENCODE Chip tracks: # UCSD TAF1 IMR90 Chip/chip to Regulation group # (2007-09-14 kate) hgsql hg18 -e "alter table encodeUcsdNgChipSignal rename to wgEncodeUcsdNgTaf1Signal" hgsql hg18 -e "update wgEncodeUcsdNgTaf1Signal set file='/gbdb/hg18/encode/wib/wgEncodeUcsdNgTaf1Signal.wib'" hgsql hg18 -e "alter table encodeUcsdNgChipKnownSites rename to wgEncodeUcsdNgTaf1KnownSites" hgsql hg18 -e "alter table encodeUcsdNgChipNovelSites rename to wgEncodeUcsdNgTaf1NovelSites" hgsql hg18 -e "alter table encodeUcsdNgValChipH3K4me rename to wgEncodeUcsdNgTaf1ValidH3K4me" hgsql hg18 -e "alter table encodeUcsdNgValChipH3ac rename to wgEncodeUcsdNgTaf1ValidH3ac" hgsql hg18 -e "alter table encodeUcsdNgValChipRnap rename to wgEncodeUcsdNgTaf1ValidRnap" hgsql hg18 -e "alter table encodeUcsdNgValChipTaf rename to wgEncodeUcsdNgTaf1ValidTaf" ############################################################################ # NESTED REPEATS (DONE 9/20/07 angie) # This track is now generated by doRepeatMasker.pl; added to this older # assembly for interest. ssh kkstore02 # First, re-liftUp the .out -- liftUp has been enhanced to uniquify the # RepeatMasker IDs. cd /cluster/data/hg18 foreach c ( `cat chrom.lst` ) echo lifting chr$c chunks to contigs foreach d ( ${c}/N{C,G,T}_* ) cd $d set contig = $d:t liftUp $contig.IDs.fa.out $contig.lft warn ${contig}_?{,?,??}.fa.out \ > /dev/null cd ../.. end echo lifting contigs to chr$c cd $c if (-e lift/ordered.lft && ! -z lift/ordered.lft) then liftUp chr$c.IDs.fa.out lift/ordered.lft warn \ `sed -e 's/.fa.out$/.IDs.fa.out/' lift/oOut.lst` \ > /dev/null endif if (-e lift/random.lft && ! -z lift/random.lft) then liftUp chr${c}_random.IDs.fa.out lift/random.lft warn \ `sed -e 's/.fa.out$/.IDs.fa.out/' lift/rOut.lst` \ > /dev/null endif cd .. 
end # Now join fragments using shared IDs: ssh kolossus mkdir /cluster/data/hg18/bed/nestedRepeats cd /cluster/data/hg18/bed/nestedRepeats extractNestedRepeats.pl ../../?{,?}/chr*.IDs.fa.out \ > hg18.nestedRepeats.bed # Load table: ssh hgwdev cd /cluster/data/hg18/bed/nestedRepeats hgLoadBed hg18 nestedRepeats hg18.nestedRepeats.bed \ -sqlTable=$HOME/kent/src/hg/lib/nestedRepeats.sql ############################################################################ # Promote GIS genome-wide ENCODE tracks: # GIS PET RNA and GIS ChIP-PET to Regulation group # (2007-09-20 kate) hgsql hg18 -e "alter table encodeGisChipPet rename to wgEncodeGisChipPet" hgsql hg18 -e "alter table encodeGisChipPetHes3H3K27me3 rename to wgEncodeGisChipPetHes3H3K27me3" hgsql hg18 -e "alter table encodeGisChipPetHes3H3K4me3 rename to wgEncodeGisChipPetHes3H3K4me3" hgsql hg18 -e "alter table encodeGisChipPetMycP493 rename to wgEncodeGisChipPetMycP493" hgsql hg18 -e "alter table encodeGisChipPetStat1Gif rename to wgEncodeGisChipPetStat1Gif" hgsql hg18 -e "alter table encodeGisChipPetStat1NoGif rename to wgEncodeGisChipPetStat1NoGif" hgsql hg18 -e "alter table encodeGisRnaPetHCT116 rename to wgEncodeGisRnaPetHCT116" hgsql hg18 -e "alter table encodeGisRnaPetHes3 rename to wgEncodeGisRnaPetHes3" hgsql hg18 -e "alter table encodeGisRnaPetMCF7 rename to wgEncodeGisRnaPetMCF7" hgsql hg18 -e "alter table encodeGisRnaPetMCF7Estr rename to wgEncodeGisRnaPetMCF7Estr" ########################################################## # Case Control Consortium (DONE 2007-09-20 (Andy) ssh hgwdev bash mkdir /cluster/data/hg17/bed/caseControl cd /cluster/data/hg17/bed/caseControl wget ftp://ftp.sanger.ac.uk/pub/WTCCC/summary_stats/summary_stats_auto_all.zip unzip summary_stats_auto_all.zip cd basic/ for disease in BD CAD CD HT RA T1D T2D; do echo $disease jkDisease=${disease:0:1}`echo ${disease:1} | tr [[:upper:]] [[:lower:]]` for f in *${disease}*.txt; do tail +2 $f | awk '{if ($21 == "1") print;}' | \ cut -f1,15 >> ../chromGraphs/cccTrendPval${jkDisease}.cg done done cd ../chromGraphs/ mkdir hg17 hg18 for f in *.cg; do table=${f%.cg}; echo $table hgLoadChromGraph -idTable=affy500k -minusLog10 -pathPrefix=/gbdb/hg17/chromGraph hg17 $table $f 2> ${table}.hg17.errors mv ${table}.cgb hg17/ hgLoadChromGraph -idTable=affy500k -minusLog10 -pathPrefix=/gbdb/hg18/chromGraph hg18 $table $f 2> ${table}.hg18.errors mv ${table}.cgb hg18/ done pushd /gbdb/hg18/chromGraph ln -s /cluster/data/hg17/bed/caseControl/chromGraphs/hg18/*.cgb . popd pushd /gbdb/hg17/chromGraph ln -s /cluster/data/hg17/bed/caseControl/chromGraphs/hg17/*.cgb . popd # Add the hack row into metaChromGraph for the composite tracks. hgsql hg17 -e 'insert into metaChromGraph (name, minVal, maxVal, binaryFile) values ("caseControl", 0, 0, "composite")' hgsql hg18 -e 'insert into metaChromGraph (name, minVal, maxVal, binaryFile) values ("caseControl", 0, 0, "composite")' ############################################################################# # RGD HUMAN QTL (DONE 9/24/07 angie) ssh hgwdev mkdir /cluster/data/hg18/bed/rgdQtl cd /cluster/data/hg18/bed/rgdQtl wget ftp://rgd.mcw.edu/pub/data_release/QTLS # Pick out the human QTLs and liftOver hg17 --> hg18. 
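# (Clarifying note on the QTLS dump, inferred from the perl one-liner just
# below rather than from RGD documentation, so treat the column positions
# as assumptions: the file is tab-separated with field 2 = species,
# field 3 = QTL symbol, field 6 = chromosome, and fields 16/17 = 1-based
# start/stop in hg17 coords, which is why the start is decremented and
# clipped at 0 when making BED.)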
# Make bed4 and rgdQtlLink: perl -we 'open(BED, ">rgdQtl.bed") || die; \ open(LINK, ">rgdQtlLink.txt") || die; \ while (<>) { \ chomp; my @w = split("\t"); \ next unless ($w[1] eq "human" && $w[15]); \ $w[5] =~ s/^/chr/; \ $w[15] =~ s/^([-\d]+).*$/$1/ || die "parse start pos"; \ $w[16] =~ s/^(\d+).*$/$1/ || die "parse end pos"; \ if ($w[15] > $w[16]) { \ $tmp = $w[15]; $w[15] = $w[16]; $w[16] = $tmp; \ } \ $w[15]--; \ $w[15] = 0 if ($w[15] < 0); \ print BED "$w[5]\t$w[15]\t$w[16]\t$w[2]\n"; \ print LINK "$w[0]\t$w[2]\t$w[3]\n"; \ } \ close(BED); close(LINK);' \ QTLS mv rgdQtl.bed hg17.rgdQtl.bed # Using a fairly loose minMatch -- the regions covered are huge. liftOver -minMatch=0.5 hg17.rgdQtl.bed \ /cluster/data/hg17/bed/liftOver/hg17ToHg18.over.chain.gz \ hg18.rgdQtl.{bed,unmapped} wc -l hg18* # 254 hg18.rgdQtl.bed # 2 hg18.rgdQtl.unmapped ssh hgwdev cd /cluster/data/hg18/bed/rgdQtl hgLoadBed hg18 rgdQtl hg18.rgdQtl.bed hgLoadSqlTab hg18 rgdQtlLink ~/kent/src/hg/lib/rgdQtlLink.sql rgdQtlLink.txt # Make sure there aren't any illegal coords: checkTableCoords -verbose=2 hg18 rgdQtl ############################################################################# # RGD RAT QTL MAPPED TO HUMAN (DONE 9/26/07 angie) #====== Begin work that was discarded because its output was too voluminous # to be very useful IMHO. Keeping it in the doc as a lesson learned. # See below for what I ended up loading. ssh hgwdev cd /cluster/data/hg18/bed/rgdQtl genePredToPsl -bedFormat rn4 /cluster/data/rn4/bed/rgdQtl/rgdQtl.bed \ rn4.rgdQtl.psl time ssh -x kolossus pslMap `pwd`/rn4.rgdQtl.psl \ -chainMapFile /cluster/data/hg18/bed/liftOver/hg18ToRn4.over.chain.gz \ `pwd`/hg18.rgdRatQtl.psl #0.011u 0.006s 10:58.56 0.0% 0+0k 0+0io 0pf+0w # That created an 11G monstrosity of a file that dwarfs the original # input. Linecount increased 3 orders of magnitude, filesize increased # 5 orders of magnitude. wc -l rn4.rgdQtl.psl #1067 rn4.rgdQtl.psl ssh -x kkstore02 wc -l `pwd`/hg18.rgdRatQtl.psl #1228306 /cluster/store11/gs.19/build36/bed/rgdQtl/hg18.rgdRatQtl.psl # Let's see what liftOver does... time ssh -x kolossus \ liftOver -minMatch=0.5 -multiple \ /cluster/data/rn4/bed/rgdQtl/rgdQtl.bed \ /cluster/data/rn4/bed/liftOver/rn4ToHg18.over.chain.gz \ `pwd`/hg18.rgdRatQtl.lo.{bed,unmapped} #0.014u 0.004s 0:59.27 0.0% 0+0k 0+0io 0pf+0w wc -l hg18.rgdRatQtl.lo.{bed,unmapped} # 1214366 hg18.rgdRatQtl.lo.bed # 14 hg18.rgdRatQtl.lo.unmapped # Still got 1M lines... ugh. Mapped all over the place, of course. #====== end discarded work. # Use a stringently filtered version of over.chain to do the mapping, # so we only pick up large chunks (targeting >10,000bases) of these # enormous regions (up to 235M in rn4). ssh kolossus cd /cluster/data/hg18/bed/rgdQtl # rn4ToHg18 was built before doBlastz included chainStitchId in the # pipe to create over.chain. Run it here, to repair any chain breaks: chainStitchId /cluster/data/rn4/bed/liftOver/rn4ToHg18.over.chain.gz \ rn4ToHg18Stitch.over.chain # I looked at the summed scores from chainStitchId vs. the length # spanned by the stitched chains, and arbitrarily picked what I # think is a sweet spot for mapping very large ranges: at scores # near 500000, chains seem to span 40-60k bases. Pretty much all # of the rat and human chromosomes (except human randoms) have at # least some chains with scores >= 500000. So I'll filter the # stitched chains to keep those with score >= 500000. # NOTE FOR NEXT TIME: consider filtering by length (see jaxQtl below). 
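# (Sketch of that score-vs-span eyeballing, not part of the original run;
# it assumes the standard chain header format
# "chain score tName tSize tStrand tStart tEnd qName qSize qStrand qStart qEnd id":)
awk '/^chain/ {print $2, $7-$6;}' rn4ToHg18Stitch.over.chain \
    | sort -k1,1nr > scoreVsSpan.txt
head scoreVsSpan.txt
# column 1 is the chain score, column 2 the target bases spanned (tEnd-tStart)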
chainFilter rn4ToHg18Stitch.over.chain -minScore=500000 \ > rn4ToHg18Coarse.over.chain # I tried liftOver with -minMatch=0.5, 0.33, 0.25 and 0.2. These are the # wc -l stats for each run -- not surprisingly, many more matches with # lower minMatch: #0.5: # 1256 hg18.rgdRatQtl.coarse.lo.bed # 998 hg18.rgdRatQtl.coarse.lo.unmapped #0.33: # 6748 hg18.rgdRatQtl.coarse.lo.bed # 92 hg18.rgdRatQtl.coarse.lo.unmapped #0.25: # 9609 hg18.rgdRatQtl.coarse.lo.bed # 36 hg18.rgdRatQtl.coarse.lo.unmapped #0.2: # 10529 hg18.rgdRatQtl.coarse.lo.bed # 30 hg18.rgdRatQtl.coarse.lo.unmapped # I spot-checked by viewing a rat QTL and hg18 chains in rn4, and # eyeballing whether the net track looked like there were solid # matches for large regions. With minMatch=0.25, most mappings # and unmapped looked pretty reasonable, but I still saw a few # (like Alc4) where a nice long chain was not being used, so I # kicked it down to 0.2 and checked again -- looks good. time liftOver -minMatch=0.2 -multiple \ /cluster/data/rn4/bed/rgdQtl/rgdQtl.bed rn4ToHg18Coarse.over.chain \ hg18.rgdRatQtl.coarse.lo.{bed,unmapped} #100.476u 10.925s 1:52.31 99.1% 0+0k 0+0io 0pf+0w wc -l hg18.rgdRatQtl.coarse.lo.{bed,unmapped} # see above. # Many of the records are completely contained within other records # for the same QTL (inversions I suppose) -- they don't really tell # us anything new about the murky QTL region, so merge them in. # NOTE FOR NEXT TIME: instead of the perl+sort, use something like this: # liftOverMerge -mergeGap=10000 hg18.rgdRatQtl.coarse.lo.bed stdout \ # | mergeOverlapBed4.pl - > hg18.rgdRatQtl.coarse.lo.pruned.bed # liftOverMerge joins items separated by small (a relative term) gaps. perl -we \ 'while (<>) { \ chomp; ($chrom, $start, $end, $name) = split; \ push @{$item2coords{"$chrom.$name"}}, [$start, $end]; \ } \ foreach $item (keys %item2coords) { \ @sortedCoords = sort { $a->[0] <=> $b->[0] } @{$item2coords{$item}}; \ ($chrom, $name) = split(/\./, $item); \ ($mergeStart, $mergeEnd) = @{shift @sortedCoords}; \ foreach $rangeRef (@sortedCoords) { \ ($rangeStart, $rangeEnd) = @{$rangeRef}; \ next if ($rangeEnd <= $mergeEnd); \ if ($rangeStart > $mergeEnd) { \ print "$chrom\t$mergeStart\t$mergeEnd\t$name\n"; \ ($mergeStart, $mergeEnd) = ($rangeStart, $rangeEnd); \ } else { \ $mergeEnd = $rangeEnd; \ } \ } \ print "$chrom\t$mergeStart\t$mergeEnd\t$name\n" if ($mergeEnd); \ } \ ' hg18.rgdRatQtl.coarse.lo.bed \ | sort -k1,1 -k2n,2n -k4,4r \ > hg18.rgdRatQtl.coarse.lo.pruned.bed ssh hgwdev cd /cluster/data/hg18/bed/rgdQtl hgLoadBed hg18 rgdRatQtl hg18.rgdRatQtl.coarse.lo.pruned.bed # Just use rn4's non-positional associated info: sed -e 's/rgdQtlLink/rgdRatQtlLink/' ~/kent/src/hg/lib/rgdQtlLink.sql \ > rgdRatQtlLink.sql hgLoadSqlTab hg18 rgdRatQtlLink rgdRatQtlLink.sql \ /cluster/data/rn4/bed/rgdQtl/rgdQtlLink.txt # Make sure there aren't any illegal coords: checkTableCoords -verbose=2 hg18 rgdRatQtl runJoiner.csh hg18 rgdRatQtl #====== more discarded work 10/2/07: ssh kolossus cd /cluster/data/hg18/bed/rgdQtl # Try pslMap with the same filtered chains: time pslMap -swapMap rn4.rgdQtl.psl \ -chainMapFile rn4ToHg18Coarse.over.chain \ hg18.rgdRatQtl.coarse.pm.psl #444.915u 29.914s 11:20.08 69.8% 0+0k 0+0io 0pf+0w wc -l hg18.rgdRatQtl.coarse.pm.psl #10755 hg18.rgdRatQtl.coarse.pm.psl # Again, linecount is comparable to liftOver, but the block-by-block # detail from pslMap creates an enormous file (10GB) even with the # filtered chains. 
# Recover 21G of disk space: rm hg18.rgdRatQtl.psl hg18.rgdRatQtl.coarse.pm.psl #====== end discarded work. ############################################################################# # N-SCAN GENES partial reload (2007-09-26 markd) # reload nscanPasaGene to get fixed names and to fix search criteria # download pasa predictions cd /cluster/data/hg18/bed/nscan/pasa2 wget http://mblab.wustl.edu/predictions/human/hg18_PASA/hg18.pasa.gtf wget http://mblab.wustl.edu/predictions/human/hg18_PASA/hg18.prot.fa bzip2 hg18.* chmod a-w hg18.* ldHgGene -gtf -genePredExt hg18 nscanPasaGene hg18.pasa.gtf.bz2 hgPepPred hg18 generic nscanPasaPep hg18.prot.fa.bz2 rm *.tab # update trackDb to add correct termRegex entries human/hg18/trackDb.ra # push nscanPasaGene nscanPasaPep and trackDb ############################################################################# # Blastz hg18 to J. Craig Venter chrom attempt (DONE - 2007-09-27 - Hiram) ssh kkstore06 screen # use a screen to control this job mkdir /cluster/data/hg18/bed/blastzVenter1.2007-09-27 cd /cluster/data/hg18/bed/blastzVenter1.2007-09-27 cat << '_EOF_' > DEF # human reference vs J. Craig Venter # using -chainMinScore=10000 and -chainLinearGap=medium # during doBlastzChainNet.pl run # parameters on advice from Webb for K and Q # M as in hg18 self, O and E from Q # Y and T as in hg18-panTro2 and mm9-rn4 BLASTZ_K=10000 BLASTZ_M=400 BLASTZ_O=600 BLASTZ_E=150 BLASTZ_Y=15000 BLASTZ_T=2 BLASTZ_Q=/cluster/data/blastz/human_chimp.v2.q # TARGET: Human Hg18 SEQ1_DIR=/san/sanvol1/scratch/hg18/selfNib SEQ1_LEN=/san/sanvol1/scratch/hg18/self.sizes SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY: Venter1 SEQ2_DIR=/iscratch/i/venter1/venter1.unmasked.2bit SEQ2_LEN=/cluster/data/venter1/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 BASE=/cluster/data/hg18/bed/blastzVenter1.2007-09-27 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs time nice -n +19 ~/kent/src/hg/utils/automation/doBlastzChainNet.pl \ -verbose=2 -chainMinScore=10000 -chainLinearGap=medium \ -bigClusterHub=kk -noDbNameCheck DEF > do.log 2>&1 & # real 163m10.634s # this doesn't work, it failed due to mistakenly thinking it was a self # alignment. Plus, we need to do the raw scaffolds, not these fake # chroms. 
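# (Diagnostic sketch, not from the original run: the venter1 pseudo-chroms
# apparently reuse chr* names, and duplicated sequence names between target
# and query are one plausible way a run can look like a self alignment.
# A quick check for name collisions between the two assemblies:)
cut -f1 /cluster/data/hg18/chrom.sizes | sort > /tmp/hg18.seqNames
cut -f1 /cluster/data/venter1/chrom.sizes | sort > /tmp/venter1.seqNames
comm -12 /tmp/hg18.seqNames /tmp/venter1.seqNames | wc -l
# a non-zero count means the two assemblies share sequence names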
#############################################################################
# CONTRAST GENES (2007-10-02 markd)
# received predictions from Sam Gross
cd /cluster/data/hg18/bed/contrastGene/
wget http://www.stanford.edu/~ssgross/contrast.hg18.bed
# this is a custom track, not a pure BED
tail +2 contrast.hg18.bed | hgLoadBed -tab hg18 contrastGene stdin
# verify
# load track db (ra and contrastGene.html are global)
# request push of contrastGene
###########################################################################
# SGP GENES Update (DONE - 2007-10-02 - Hiram)
ssh kkstore02
mkdir /cluster/data/hg18/bed/sgp.2007-10-02
cd /cluster/data/hg18/bed/sgp.2007-10-02
SITE="genome.imim.es/genepredictions/H.sapiens/golden_path_200603_x_mm9"
for C in `cut -f1 ../../chrom.sizes`
do
    wget --timestamping "http://${SITE}/SGP/${C}.gtf" -O ${C}.gtf
    wget --timestamping "http://${SITE}/SGP/${C}.prot" -O ${C}.prot
done
# before reloading the table, measure the previous set:
nice -n +19 featureBits -enrichment hg18 refGene:CDS sgpGene
# refGene:CDS 1.123%, sgpGene 1.272%, both 0.964%, cover 85.83%, enrich 67.47x
nice -n +19 featureBits -enrichment hg18 knownGene:CDS sgpGene
# knownGene:CDS 1.185%, sgpGene 1.272%, both 0.989%, cover 83.43%, enrich 65.58x
# now reload the table
ldHgGene -gtf -genePredExt hg18 sgpGene chr*.gtf
# Read 34023 transcripts in 288520 lines in 49 files
# 34023 groups 46 seqs 1 sources 3 feature types
# 34023 gene predictions
# and now measure this new set
nice -n +19 featureBits -enrichment hg18 refGene:CDS sgpGene
# refGene:CDS 1.123%, sgpGene 1.270%, both 0.964%, cover 85.84%, enrich 67.59x
nice -n +19 featureBits -enrichment hg18 knownGene:CDS sgpGene
# knownGene:CDS 1.185%, sgpGene 1.270%, both 0.988%, cover 83.41%, enrich 65.68x
###########################################################################
# Blastz Orangutan ponAbe2 (DONE - 2007-10-02 - 2007-10-05 - Hiram)
ssh kkstore02
screen # use screen to control this job
mkdir /cluster/data/hg18/bed/blastzPonAbe2.2007-10-02
cd /cluster/data/hg18/bed/blastzPonAbe2.2007-10-02
cat << '_EOF_' > DEF
# Human vs orangutan
BLASTZ_M=50
# TARGET: Human Hg18
SEQ1_DIR=/cluster/bluearc/scratch/data/hg18/nib
SEQ1_LEN=/cluster/data/hg18/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Orangutan ponAbe2
SEQ2_DIR=/cluster/bluearc/scratch/data/ponAbe2/ponAbe2.2bit
SEQ2_LEN=/cluster/data/ponAbe2/chrom.sizes
SEQ2_CHUNK=10000000
SEQ2_LAP=0
BASE=/cluster/data/hg18/bed/blastzPonAbe2.2007-10-02
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
time nice -n +19 doBlastzChainNet.pl -verbose=2 `pwd`/DEF \
    -chainMinScore=3000 -chainLinearGap=medium \
    -bigClusterHub=pk > blastz.log 2>&1 &
# real 388m20.443s
# Completed: 126960 of 126960 jobs
# CPU time in finished jobs: 7068824s 117813.73m 1963.56h 81.82d 0.224 y
# IO & Wait Time: 517624s 8627.07m 143.78h 5.99d 0.016 y
# Average job time: 60s 1.00m 0.02h 0.00d
# Longest finished job: 4940s 82.33m 1.37h 0.06d
# Submission to last job: 62056s 1034.27m 17.24h 0.72d
# some jobs failed (because they were done but parasol didn't realize that)
# after recovery, continuing:
time nice -n +19 doBlastzChainNet.pl -verbose=2 `pwd`/DEF \
    -chainMinScore=3000 -chainLinearGap=medium \
    -continue=cat -bigClusterHub=pk > cat.log 2>&1 &
# real 390m56.934s
cat fb.hg18.chainPonAbe2Link.txt
# 2676696124 bases of 2881515245 (92.892%) in intersection
# And the swap
mkdir /cluster/data/ponAbe2/bed/blastz.hg18.swap
cd /cluster/data/ponAbe2/bed/blastz.hg18.swap
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
/cluster/data/hg18/bed/blastzPonAbe2.2007-10-02/DEF \ -chainMinScore=3000 -chainLinearGap=medium \ -swap -bigClusterHub=pk > swap.log 2>&1 & # real 123m9.197s cat fb.ponAbe2.chainHg18Link.txt # 2824501297 bases of 3093572278 (91.302%) in intersection ############################################################## # NIMH Bipolar Genome Graphs built-in (DONE 2007-10-04 Galt) ssh hgwdev mkdir /cluster/data/hg17/bed/nimhBipolar # I registered and downloaded : wget http://mapgenetics.nimh.nih.gov/BP_POOLING/german_data_share.csv.zip \ --user=galt --password=mypassword wget http://mapgenetics.nimh.nih.gov/BP_POOLING/nimh_data_share.csv.zip \ --user=galt --password=mypassword unzip german_data_share.csv.zip unzip nimh_data_share.csv.zip mkdir chromGraphs tail +2 nimh_data_share.csv | tr -d '"' | gawk -F ',' '{print $1 "\t" $9}' \ > chromGraphs/nimhBipolarUs.cgt tail +2 german_data_share.csv | tr -d '"' | gawk -F ',' '{print $1 "\t" $9}' \ > chromGraphs/nimhBipolarDe.cgt cd chromGraphs/ mkdir hg17 hg18 hgLoadChromGraph -idTable=snpArrayIllumina550 -minusLog10 -pathPrefix=/gbdb/hg17/chromGraph hg17 nimhBipolarUs nimhBipolarUs.cgt \ >& nimhBipolarUs.hg17.errors mv nimhBipolarUs.cgb hg17/ hgLoadChromGraph -idTable=snpArrayIllumina550 -minusLog10 -pathPrefix=/gbdb/hg17/chromGraph hg17 nimhBipolarDe nimhBipolarDe.cgt \ >& nimhBipolarDe.hg17.errors mv nimhBipolarDe.cgb hg17/ hgLoadChromGraph -idTable=snpArrayIllumina550 -minusLog10 -pathPrefix=/gbdb/hg18/chromGraph hg18 nimhBipolarUs nimhBipolarUs.cgt \ >& nimhBipolarUs.hg18.errors mv nimhBipolarUs.cgb hg18/ hgLoadChromGraph -idTable=snpArrayIllumina550 -minusLog10 -pathPrefix=/gbdb/hg18/chromGraph hg18 nimhBipolarDe nimhBipolarDe.cgt \ >& nimhBipolarDe.hg18.errors mv nimhBipolarDe.cgb hg18/ pushd /gbdb/hg17/chromGraph ln -s /cluster/data/hg17/bed/nimhBipolar/chromGraphs/hg17/*.cgb . popd pushd /gbdb/hg18/chromGraph ln -s /cluster/data/hg17/bed/nimhBipolar/chromGraphs/hg18/*.cgb . popd # Add the hack row into metaChromGraph for the composite tracks. hgsql hg17 -e 'insert into metaChromGraph (name, minVal, maxVal, binaryFile) values ("bipolar", 0, 0, "composite")' hgsql hg18 -e 'insert into metaChromGraph (name, minVal, maxVal, binaryFile) values ("bipolar", 0, 0, "composite")' #Add composite track info to src/hg/makeDb/trackDb/human/trackDb.ra: ############################################################################ # MGI MOUSE QTL MAPPED TO HUMAN (DONE 10/10/07 angie) # Use a stringently filtered version of over.chain to do the mapping, # so we only pick up large chunks (targeting >10,000bases) of the # large fuzzy QTL regions. # Of the MGI QTLs, some are large as expected, but most are tiny -- # they have only the peak STS marker coords, no indication of the # range. Jim suggested padding those out to 100k. So I will process # these in two batches, and make subtracks -- one for original, one # for our modified set. ### NOTE FOR NEXT TIME ### ### Use jaxQtl instead of jaxQTL throughout. ssh kolossus mkdir /cluster/data/hg18/bed/jaxQTL cd /cluster/data/hg18/bed/jaxQTL # mm8ToHg18 was built before doBlastz included chainStitchId in the # pipe to create over.chain. Run it here, to repair any chain breaks: chainStitchId /cluster/data/mm8/bed/liftOver/mm8ToHg18.over.chain.gz \ /scratch/tmp/mm8ToHg18Stitch.over.chain # For rn4->hg18 (rgdRatQtl above), I eyeballed scores vs. spans of # stitched chains, to try to find a score threshold over which almost # all spans were at least 10 or 20k, most >50k. 
For mm8->hg18, the # correspondence is not quite so smooth, and in order to keep all spans # >= 100k, the score threshold would have to be 170k (compared to # 500k for rn4-hg18) and would pick up a lot of short chains. # So this time I'll try filtering directly by span instead of score # (but add a reasonable minScore to kick out some outliers). chainFilter /scratch/tmp/mm8ToHg18Stitch.over.chain \ -tMinSize=20000 -qMinSize=20000 -minScore=10000\ > mm8ToHg18Coarse.over.chain # Separate the mm8 jaxQtl's by size and reduce to bed4: awk 'BEGIN{OFS="\t";} \ ($3-$2) < 1000 {s = $2 > 50000 ? $2-50000 : 0; \ print $1, s, $3+50000, $4;}' \ /cluster/data/mm8/bed/jax/2007_09/jaxQtl.bed \ > mm8.jaxQtl.padded.bed cp /dev/null tmp.bed foreach chr (`awk '{print $1;}' /cluster/data/mm8/chrom.sizes`) set size = `awk '$1 == "'$chr'" {print $2;}' /cluster/data/mm8/chrom.sizes` awk 'BEGIN{OFS="\t";} \ $1 == "'$chr'" && $3 > '$size' {$3 = '$size';} \ $1 == "'$chr'" && $3 > $2 {print;}' \ mm8.jaxQtl.padded.bed >> tmp.bed end mv tmp.bed mm8.jaxQtl.padded.bed awk 'BEGIN{OFS="\t";} ($3-$2) > 100000 {print $1, $2, $3, $4;}' \ /cluster/data/mm8/bed/jax/2007_09/jaxQtl.bed \ > mm8.jaxQtl.asIs.bed # Make sure we didn't miss any between those two size ranges (except for # the 4 markers whose coords are completely off the end of mm8 chroms): wc -l mm8.*.bed # 73 mm8.jaxQtl.asIs.bed # 1468 mm8.jaxQtl.padded.bed # 1541 total wc -l /cluster/data/mm8/bed/jax/2007_09/jaxQtl.bed #1545 /cluster/data/mm8/bed/jax/2007_09/jaxQtl.bed # Try liftOver with various -minMatch settings. Compare the number # mapped and unmapped; eyeball some of the unmapped in mm8, see if # the hg18 Nets are truly weak there. foreach minMatch (0.1 0.2 0.25 0.33) time liftOver -minMatch=$minMatch -multiple \ mm8.jaxQtl.asIs.bed mm8ToHg18Coarse.over.chain \ hg18.jaxQTL.asIs.$minMatch.{bed,unmapped} time liftOver -minMatch=$minMatch -multiple \ mm8.jaxQtl.padded.bed mm8ToHg18Coarse.over.chain \ hg18.jaxQTL.padded.$minMatch.{bed,unmapped} wc -l hg18.jaxQTL.*.$minMatch.{bed,unmapped} echo "" end #typical time: 23s for asIs, 45s for padded # 757 hg18.jaxQTL.asIs.0.1.bed # 1471 hg18.jaxQTL.padded.0.1.bed # 0 hg18.jaxQTL.asIs.0.1.unmapped # 54 hg18.jaxQTL.padded.0.1.unmapped # 634 hg18.jaxQTL.asIs.0.2.bed # 1429 hg18.jaxQTL.padded.0.2.bed # 0 hg18.jaxQTL.asIs.0.2.unmapped # 128 hg18.jaxQTL.padded.0.2.unmapped # 532 hg18.jaxQTL.asIs.0.25.bed # 1345 hg18.jaxQTL.padded.0.25.bed # 2 hg18.jaxQTL.asIs.0.25.unmapped # 282 hg18.jaxQTL.padded.0.25.unmapped # 362 hg18.jaxQTL.asIs.0.33.bed # 1146 hg18.jaxQTL.padded.0.33.bed # 8 hg18.jaxQTL.asIs.0.33.unmapped # 670 hg18.jaxQTL.padded.0.33.unmapped # I eyeballed the 0.1 .bed and .unmapped files, and they look # pretty good, esp. for mapped... we could probably get away with # 0.2 for the asIs but 0.1 looks OK. # Many of the records are completely contained within other records # for the same QTL (inversions I suppose) -- they don't really tell # us anything new about the murky QTL region, so merge them in. # NOTE FOR NEXT TIME: try this: # liftOverMerge -mergeGap=10000 hg18.jaxQTL.asIs.0.1.bed stdout \ # | mergeOverlapBed4.pl - > hg18.jaxQTL.asIs.0.1.pruned.bed # liftOverMerge joins items separated by small (a relative term) gaps. 
mergeOverlapBed4.pl hg18.jaxQTL.asIs.0.1.bed \
    > hg18.jaxQTL.asIs.0.1.pruned.bed
mergeOverlapBed4.pl hg18.jaxQTL.padded.0.1.bed \
    > hg18.jaxQTL.padded.0.1.pruned.bed
wc -l hg18.jaxQTL.*.pruned.bed
# 398 hg18.jaxQTL.asIs.0.1.pruned.bed
# 1463 hg18.jaxQTL.padded.0.1.pruned.bed
ssh hgwdev
cd /cluster/data/hg18/bed/jaxQTL
### NOTE FOR NEXT TIME ###
### Call the tables jaxQtl* instead of jaxQTL* -- QA doesn't like jaxQTL.
hgLoadBed hg18 jaxQTLAsIs hg18.jaxQTL.asIs.0.1.pruned.bed
hgLoadBed hg18 jaxQTLPadded hg18.jaxQTL.padded.0.1.pruned.bed
# Make sure there aren't any illegal coords:
checkTableCoords -verbose=2 hg18 jaxQTLAsIs
checkTableCoords -verbose=2 hg18 jaxQTLPadded
runJoiner.csh hg18 jaxQTLAsIs
runJoiner.csh hg18 jaxQTLPadded
# Tables renamed kuhn 10-12-2007
# jaxQTLAsIs to jaxQtlAsIs
# jaxQTLPadded to jaxQtlPadded
###########################################################################
# Build targetScanS track - (DONE - 2007-10-05 - 2007-10-31 - Hiram)
# requested by: George Bell gbell at wi.mit.edu
ssh hgwdev
mkdir -p /cluster/data/hg18/bed/targetScanS
cd /cluster/data/hg18/bed/targetScanS
wget --timestamping \
    http://jura.wi.mit.edu/targetscan/vert_40/ucsc/hg18/hg18ConsChrALL.bed
hgLoadBed hg18 targetScanS -tmpDir=/scratch/tmp hg18ConsChrALL.bed
# Loaded 50764 elements of size 6
featureBits hg18 targetScanS
# 313293 bases of 2881515245 (0.011%) in intersection
################################
# previous attempts listed below
# they don't supply them all, but we don't know which ones they
# don't. So, ask for them all, and remove the files that are empty.
for C in `cut -f1 ../../chrom.sizes | sed -e "s/chr//"`
do
    wget --timestamping \
        "http://jura.wi.mit.edu/targetscan/vert_40/ucsc/NR/hg18ConsChr${C}.bed" \
        -O hg18ConsChr${C}.bed
    if [ ! -s "hg18ConsChr${C}.bed" ]; then
        rm -f "hg18ConsChr${C}.bed"
    fi
done
# Remove the browser/track lines from these custom track files
# and load into the hg18.targetScanS table
egrep -h -v "^browser|^track" hg*.bed | \
    hgLoadBed hg18 targetScanS -tmpDir=/scratch/tmp stdin
# Loaded 50802 elements of size 6
featureBits hg18 targetScanS
# 312951 bases of 2881515245 (0.011%) in intersection
# Create/edit/check in targetScans.html and trackDb.ra under
# kent/src/hg/makeDb/trackDb/human/hg18
###########################################################################
# RE-BUILD WGRNA TRACK (DONE, 2007-10-05, Fan)
ssh hgwdev
cd /cluster/data/hg18/bed
mkdir wgRna-2007-10-05
cd wgRna-2007-10-05
# Received the data file, wgtrack_oct2007.txt (saved from wgtrack_oct2007.doc)
# from Michel Weber's email
# (Michel.Weber at ibcg.biotoul.fr)
# and placed it under /cluster/data/hg18/bed/wgRna-2007-10-05.
cat wg_track_oct2007.txt|sed -e 's/ /\t/g' > wgRna.tab
hgLoadBed -sqlTable=/cluster/home/fanhsu/src/hg/lib/wgRna.sql hg18 wgRna wgRna.tab
#############################################################################
# BLASTZ calJac1 - Marmoset (2007-10-09 kate)
ssh kkstore02
mkdir /cluster/data/hg18/bed/blastz.calJac1.2007-10-07
cd /cluster/data/hg18/bed/blastz.calJac1.2007-10-07
cat << '_EOF_' > DEF
# human vs. marmoset
# dynamic masking param
BLASTZ_M=50
# TARGET: Human Hg18
SEQ1_DIR=/scratch/hg/hg18/nib
SEQ1_LEN=/cluster/data/hg18/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Marmoset (calJac1)
SEQ2_DIR=/san/sanvol1/scratch/calJac1/calJac1.2bit
SEQ2_LEN=/san/sanvol1/scratch/calJac1/chrom.sizes
SEQ2_LIMIT=500
SEQ2_CHUNK=30000000
SEQ2_LAP=0
BASE=/cluster/data/hg18/bed/blastz.calJac1.2007-10-07
'_EOF_'
# << happy emacs
doBlastzChainNet.pl DEF \
    -bigClusterHub pk \
    -chainMinScore=3000 -chainLinearGap=medium >& do.log &
tail -f do.log
# failed at download step due to pre-existing file of Brian's
doBlastzChainNet.pl DEF \
    -bigClusterHub pk -continue=download \
    -chainMinScore=3000 -chainLinearGap=medium >& do2.log &
tail -f do2.log
#########################################################
# RE-BUILD GAD TRACK (Done, 10/17/07, Fan)
mkdir /cluster/store12/gad071011
rm /cluster/data/gad
ln -s /cluster/store12/gad071011 /cluster/data/gad
cd /cluster/data/gad
# Receive "all.txt" from GAD
# contact person: Garner, John (NIH/NIA/IRP) [F] [garnerjr@mail.nih.gov]
hgsql hg18 -e 'drop table gadAll'
hgsql hg18 <~/src/hg/lib/gadAll.sql
hgsql hg18 -e 'load data local infile "all.txt" into table gadAll ignore 3 lines'
# create gad table
gadPos hg18 j18.tmp
cat j18.tmp |sort -u >hg18.gad.tab
hgLoadBed hg18 gad hg18.gad.tab
rm j18.tmp
#########################################################################
# HAPMAP LD (DONE 10/26/07 angie -- phased REDONE 1/30/08)
# Based on Daryl's hg17 work. Data version here is release #22,
# March 2007 (2007-03).
# 1/30/08: HapMap re-released the phased genotypes 1/22/08 -- re-run,
# but without the removal of question marks that we had to do the
# first time around.
# hapmap.org offers ld_data downloads that look like the output of
# makeDcc -- but only for older versions. To get LD for the latest
# release (and for hg18 coords), compute LD from genotype as Daryl did.
############################# unphased ##############################
#*** NOTE FOR NEXT TIME: don't bother with individual CHB and JPT subsets,
#*** {CEU, CHB+JPT, YRI} is what we display.
#*** Actually, if there is a next time, we'll probably just start with
#*** phased and ignore unphased.
ssh kolossus
mkdir -p /san/sanvol1/scratch/hg18/bed/hapmapLd/genotypes_2007-03/run.Haploview
cd /san/sanvol1/scratch/hg18/bed/hapmapLd/genotypes_2007-03
# wget all genotype data:
wget ftp://ftp.hapmap.org/pub/hapmap/public/00README.releasenotes_rel22
wget ftp://ftp.hapmap.org/pub/hapmap/public/genotypes/2007-03/fwd_strand/non-redundant/genotypes_chr\*.txt.gz
# Use latest Haploview to compute LD scores:
wget http://www.broad.mit.edu/mpg/haploview/downloads/Haploview.jar
# Haploview cluster run on whole-chrom genotype files was a bust.
# Even on kki nodes, with java memory maxed out, 47 of 120 jobs crashed
# and one was still running after 5.5 days so I killed it.
# Meanwhile, Daryl suggested using the phased data instead. It is
# not yet available for all chrom/pops, but start with what's there
# to iron out the flow.
# New approach to unphased -> LD -- split, run Haploview, merge.
ssh pk
# Note: although the genotypes_ files are *mostly* sorted by position,
# they're not completely sorted! That can cause splitGenotype.pl to
# screw up (as well as other downstream stuff), so sort them on the way
# into splitGenotype.
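# (Quick illustration of the sort problem described above, not part of the
# original run; position is column 4, the same column runSplit.csh sorts
# on below, and the first line of each file is assumed to be a header:)
foreach f (/san/sanvol1/scratch/hg18/bed/hapmapLd/genotypes_2007-03/genotypes_chr*.txt.gz)
    echo $f
    zcat $f | tail +2 | awk '{print $4;}' | sort -c -n |& tail -1
end
# any "disorder" message from sort -c confirms a file is not fully sorted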
mkdir /san/sanvol1/scratch/hg18/bed/hapmapLd/run.splitUnphased cd /san/sanvol1/scratch/hg18/bed/hapmapLd/run.splitUnphased cat > runSplit.csh <<'_EOF_' #!/bin/csh -ef set f = $1 set base = $f:t:r:r set scriptBin = ~/kent/src/hg/snp/hapmapLd set tmpDir = `mktemp -d -p /scratch/tmp runSplit.XXXXXX` zcat $f \ | sort -k4n,4n \ | $scriptBin/splitGenotype.pl -suffix .txt.gz \ 10000000 250000 $tmpDir/$base mv $tmpDir/$base.* ../splitUnphased/$base/ rmdir $tmpDir '_EOF_' # << emacs chmod a+x runSplit.csh cp /dev/null jobList foreach f (../genotypes_2007-03/genotypes_chr*.txt.gz) mkdir -p ../splitUnphased/$f:t:r:r echo ./runSplit.csh $f >> jobList end para make jobList para time #Completed: 120 of 120 jobs #CPU time in finished jobs: 826s 13.77m 0.23h 0.01d 0.000 y #IO & Wait Time: 457s 7.61m 0.13h 0.01d 0.000 y #Average job time: 11s 0.18m 0.00h 0.00d #Longest finished job: 22s 0.37m 0.01h 0.00d #Submission to last job: 29s 0.48m 0.01h 0.00d # Run Haploview on split files. ssh pk mkdir /san/sanvol1/scratch/hg18/bed/hapmapLd/run.HaploviewSplitUnphased cd /san/sanvol1/scratch/hg18/bed/hapmapLd/run.HaploviewSplitUnphased set scriptBin = ~/kent/src/hg/snp/hapmapLd set hvPath = /san/sanvol1/scratch/hg18/bed/hapmapLd/Haploview.jar # Latest installed java on the cluster nodes (not on the para hub machine): set javaPath = /usr/java/jre1.5.0_12/bin/java set javaMemSize = 1500M find /san/sanvol1/scratch/hg18/bed/hapmapLd/splitUnphased \ -name \*.txt.gz -ls \ | awk '{print $7, $11;}' | sort -nr > filesBySize cp /dev/null jobList foreach f (`awk '{print $2;}' filesBySize`) echo $scriptBin/runHaploview.csh $f $javaPath $hvPath $javaMemSize \ >> jobList end para make jobList para time #Completed: 1493 of 1493 jobs #CPU time in finished jobs: 582015s 9700.25m 161.67h 6.74d 0.018 y #IO & Wait Time: 6558s 109.30m 1.82h 0.08d 0.000 y #Average job time: 394s 6.57m 0.11h 0.00d #Longest finished job: 1711s 28.52m 0.48h 0.02d #Submission to last job: 1740s 29.00m 0.48h 0.02d # Merge Haploview results. 
ssh pk mkdir /san/sanvol1/scratch/hg18/bed/hapmapLd/run.mergeSplitHapLD cd /san/sanvol1/scratch/hg18/bed/hapmapLd/run.mergeSplitHapLD cat > runMerge.csh <<'_EOF_' #!/bin/csh -ef set mapFile = $1 set outFile = $2 set scriptBin = ~/kent/src/hg/snp/hapmapLd set tmpOut = `mktemp -p /scratch/tmp runMerge.XXXXXX` $scriptBin/mergeHaploviewLD.pl $mapFile $tmpOut mv $tmpOut $outFile '_EOF_' # << emacs chmod a+x runMerge.csh mkdir ../mergedUnphasedLD cp /dev/null jobList foreach f (`ls -1S ../splitUnphased/genotypes_chr*/genotypes_chr*.map`) set base = $f:t:r echo ./runMerge.csh $f ../mergedUnphasedLD/$base.txt.LD.gz >> jobList end para make jobList para time #Completed: 120 of 120 jobs #CPU time in finished jobs: 16035s 267.25m 4.45h 0.19d 0.001 y #IO & Wait Time: 17282s 288.03m 4.80h 0.20d 0.001 y #Average job time: 278s 4.63m 0.08h 0.00d #Longest finished job: 737s 12.28m 0.20h 0.01d #Submission to last job: 738s 12.30m 0.20h 0.01d # Compare results of unsplit run with split/merge: ssh kolossus cd /san/sanvol1/scratch/hg18/bed/hapmapLd # Compare SNP pairs: zcat genotypes_2007-03/genotypes_chr9_JPT_r22_nr.b36_fwd.txt.LD.gz \ | awk '{print $1, $2;}' > /tmp/1 zcat mergedUnphasedLD/genotypes_chr9_JPT_r22_nr.b36_fwd.txt.LD.gz \ | awk '{print $1, $2;}' > /tmp/2 wc -l /tmp/1 /tmp/2 # 32514982 /tmp/1 # 32514982 /tmp/2 cmp /tmp/1 /tmp/2 # Compare entire files: zcat genotypes_2007-03/genotypes_chr9_JPT_r22_nr.b36_fwd.txt.LD.gz > /tmp/1 zcat mergedUnphasedLD/genotypes_chr9_JPT_r22_nr.b36_fwd.txt.LD.gz > /tmp/2 head /tmp/1 /tmp/2 cmp /tmp/1 /tmp/2 # Woohoo! ############################# phased ############################## # For this build, Daryl suggested using the phased data (output of # Jonathan Marchini's PHASE program) instead of raw genotype data ssh kolossus mkdir -p /san/sanvol1/scratch/hg18/bed/hapmapLd/phased_2007-08_rel22 cd /san/sanvol1/scratch/hg18/bed/hapmapLd cd /san/sanvol1/scratch/hg18/bed/hapmapLd/phased_2007-08_rel22 # 1/30/08: re-run from this point on, to pick up re-release (same URL) wget --timestamping \ ftp://ftp.hapmap.org/pub/hapmap/public/phasing/2007-08_rel22/phased/\*.gz # Downstream stuff depends on the inputs being sorted by position -- check: cd /san/sanvol1/scratch/hg18/bed/hapmapLd foreach f (phased_2007-08_rel22/*_legend.txt.gz) echo $f zcat $f | tail +2 | awk '{print $2;}' > /tmp/1 sort -n /tmp/1 > /tmp/2 cmp /tmp/1 /tmp/2 end rm -f /tmp/1 /tmp/2 # kki cluster run -- need lots of memory! more than pk's 2G hard limit. 
# (would use memk but it doesn't have java and kki is sufficient) ssh kki mkdir /san/sanvol1/scratch/hg18/bed/hapmapLd/run.HaploviewPhased cd /san/sanvol1/scratch/hg18/bed/hapmapLd/run.HaploviewPhased set scriptBin = $HOME/kent/src/hg/snp/hapmapLd set hv = $scriptBin/runHaploviewPhased.csh set phaseDir = /san/sanvol1/scratch/hg18/bed/hapmapLd/phased_2007-08_rel22 set hvPath = /san/sanvol1/scratch/hg18/bed/hapmapLd/Haploview.jar # Latest installed java on the cluster nodes (not on the para hub machine): set javaPath = /usr/java/jre1.5.0_12/bin/java set javaMemSize = 4G # Sort by size (descending) to kick off the biggest jobs first: cp /dev/null jobList foreach f (`ls -1S $phaseDir/genotypes_chr*.phase.gz`) echo $hv $f:r:r $javaPath $hvPath $javaMemSize >> jobList end para make jobList para time #Completed: 66 of 66 jobs #CPU time in finished jobs: 406845s 6780.76m 113.01h 4.71d 0.013 y #IO & Wait Time: 1517s 25.28m 0.42h 0.02d 0.000 y #Average job time: 6187s 103.12m 1.72h 0.07d #Longest finished job: 15667s 261.12m 4.35h 0.18d #Submission to last job: 29868s 497.80m 8.30h 0.35d # Our software assumes that LD scores are given for consecutive SNPs # without gaps in between, so scores in the encoded lists can be # associated with other SNPs just by their position in the list. # Make sure that's the case! I suspect this also depends on the # inputs to Haploview being sorted by position -- checked those above. ssh kolossus cd /san/sanvol1/scratch/hg18/bed/hapmapLd cp /dev/null checkLD.log foreach f ( mergedUnphasedLD/*.LD.gz phased_2007-08_rel22/*.LD.gz ) echo $f >> checkLD.log $scriptBin/checkLDSnpOrder.pl $f >>& checkLD.log echo "" >> checkLD.log date end # Takes a long time (~4 minutes for 184 files -> 11-12 hours) -- # left to run overnight. # Cluster run to translate Haploview .LD output into the DCC's # ld_data downloads format, and in turn into our bed4+ format. 
ssh pk mkdir /san/sanvol1/scratch/hg18/bed/hapmapLd/{dcc,bed}{Phased,Unphased} mkdir /san/sanvol1/scratch/hg18/bed/hapmapLd/run.formatUnphased cd /san/sanvol1/scratch/hg18/bed/hapmapLd/run.formatUnphased cat > runFormatsUnphased.csh <<'_EOF_' #!/bin/csh -ef set base = $1 set db = hg18 set scriptBin = ~/kent/src/hg/snp/hapmapLd set hapDir = /san/sanvol1/scratch/$db/bed/hapmapLd set unphDir = $hapDir/genotypes_2007-03 set unphLDDir = $hapDir/mergedUnphasedLD set dccOut = `echo $base | sed -e 's/^genotypes_/ld_/; s/$/.txt.gz/;'` set chr = `echo $base | perl -wpe 's/^.*_(chr[0-9MXY]+)_.*/$1/'` set pop = `echo $base | perl -wpe 's/^.*_chr[0-9MXY]+_([A-Z+]+)_.*/$1/'` set bedOut = $db.${pop}_$chr.bed.gz $scriptBin/makeDccAndLdBed.pl \ $unphDir/$base.txt.gz $unphLDDir/$base.txt.LD.gz \ $hapDir/dccUnphased/$dccOut $hapDir/bedUnphased/$bedOut '_EOF_' # << emacs chmod a+x runFormatsUnphased.csh cp /dev/null jobList foreach f (`ls -1S ../mergedUnphasedLD/genotypes_chr*.txt.LD.gz`) echo ./runFormatsUnphased.csh $f:t:r:r:r >> jobList end para make jobList para time #Completed: 120 of 120 jobs #CPU time in finished jobs: 101968s 1699.46m 28.32h 1.18d 0.003 y #IO & Wait Time: 0s 0.00m 0.00h 0.00d 0.000 y #Average job time: 847s 14.11m 0.24h 0.01d #Longest finished job: 2276s 37.93m 0.63h 0.03d #Submission to last job: 2276s 37.93m 0.63h 0.03d mkdir /san/sanvol1/scratch/hg18/bed/hapmapLd/run.formatPhased cd /san/sanvol1/scratch/hg18/bed/hapmapLd/run.formatPhased cat > runFormatsPhased.csh <<'_EOF_' #!/bin/csh -ef set basePath = $1 set base = $basePath:t set db = hg18 set scriptBin = ~/kent/src/hg/snp/hapmapLd set hapDir = /san/sanvol1/scratch/$db/bed/hapmapLd set dccOut = `echo $base | sed -e 's/^genotypes_/ld_/; s/$/.txt.gz/;'` set chr = `echo $base | perl -wpe 's/^.*_(chr[0-9MXY]+)_.*/$1/'` set pop = `echo $base | perl -wpe 's/^.*_chr[0-9MXY]+_([A-Z+]+)_.*/$1/'` set bedOut = $db.${pop}_$chr.bed.gz $scriptBin/makeDccAndLdBed.pl ${basePath}_legend.txt.gz $basePath.LD.gz \ $hapDir/dccPhased/$dccOut $hapDir/bedPhased/$bedOut '_EOF_' # << emacs chmod a+x runFormatsPhased.csh cp /dev/null jobList foreach f (`ls -1S ../phased_2007-08_rel22/genotypes_chr*.LD.gz`) echo ./runFormatsPhased.csh $f:r:r >> jobList end para make jobList para time #Completed: 66 of 66 jobs #CPU time in finished jobs: 66155s 1102.58m 18.38h 0.77d 0.002 y #IO & Wait Time: 0s 0.00m 0.00h 0.00d 0.000 y #Average job time: 972s 16.20m 0.27h 0.01d #Longest finished job: 2292s 38.20m 0.64h 0.03d #Submission to last job: 2292s 38.20m 0.64h 0.03d # Create empty tables, then load one pop_chr at a time in order # to avoid thrashing. # hg17 took about half an hour to an hour per population on hgwdev. # Load on kolossus, then ask cluster-admin to rsync to hgwdev. ssh kolossus cd /san/sanvol1/scratch/hg18/bed/hapmapLd cat > loadOne.csh <<'_EOF_' #!/bin/csh -ef set tableBase = $1 set Pop = $2 set bedDir = $3 set table = $tableBase$Pop hgsql hg18 -e "drop table if exists $table;" sed "s/ld2/$table/" $HOME/kent/src/hg/lib/ld2.sql \ | hgsql hg18 set pop = `echo $Pop | perl -wpe 's/ChbJpt/JPT+CHB/; tr/a-z/A-Z/;'` foreach c (1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 X Y) set bed = $bedDir/hg18.${pop}_chr$c.bed.gz if (-e $bed) then echo $bed hgLoadBed -noSort -oldTable hg18 $table $bed else echo "\n$bed does not exist\n" endif echo "" end echo -n "\nDone with $table. 
"; date '_EOF_' # << emacs chmod a+x loadOne.csh # phased: cp /dev/null loadPhased.log foreach Pop (Ceu ChbJpt Yri) ./loadOne.csh hapmapLdPh $Pop bedPhased >>& loadPhased.log end # ~16 minutes for all phased on kolossus # 1/30/08: ~11 minutes for all phased on hgwdev! bg load ~1.25 # unphased: cp /dev/null loadUnphased.log foreach Pop (Ceu Chb ChbJpt Jpt Yri) ./loadOne.csh hapmapLd $Pop bedUnphased >>& loadUnphased.log end # ~21 minutes -- got segfaults for empty gzipped chrY files, debug later. rm -f bed.tab # Repeat hg17 sanity checks on the unphased results. ssh pk mkdir /san/sanvol1/scratch/hg18/bed/hapmapLd/run.maxDist cd /san/sanvol1/scratch/hg18/bed/hapmapLd/run.maxDist # Find the largest distance between any paired SNPs in DCC ld_* files. # Should be 249999 or less. Also count the number of unique starting # coords. We can compare those to the SNP counts in checkLD.log. cat > runMaxDist.csh <<'_EOF_' #!/bin/csh -ef set dccIn = $1 set out = $dccIn:r:r.check echo -n "$dccIn:t " > $out zcat $dccIn \ | awk '{if ($2-$1>max) max=$2-$1} \ {if (prevStart && $1 != prevStart) count++; prevStart = $1;} \ END {print max "\t" count; \ if (max > 249999) print "ERROR: maxDistance too large!";}' \ >> $out '_EOF_' # << emacs chmod a+x runMaxDist.csh cp /dev/null jobList foreach f (../dccUnphased/ld_*.txt.gz) echo ./runMaxDist.csh $f >> jobList end para make jobList para time #Completed: 120 of 120 jobs #CPU time in finished jobs: 12274s 204.56m 3.41h 0.14d 0.000 y #IO & Wait Time: 4137s 68.96m 1.15h 0.05d 0.000 y #Average job time: 137s 2.28m 0.04h 0.00d #Longest finished job: 365s 6.08m 0.10h 0.00d #Submission to last job: 365s 6.08m 0.10h 0.00d cd .. cat dccUnphased/*.check > maxDist.txt grep -B1 ERROR maxDist.txt # Other cleanup: rm -r splitUnphased ######################################################################### # University of Uppsala, Sweden Chip-chip (2007-10-18 kate) # 3 datasets (Usf1, Usf2, H3ac) -- wiggle and bed for each, in hg16 coords # Submitted by Adam Ameur ssh kkstore02 cd /cluster/data/hg18/bed mkdir uppsalaChip cd uppsalaChip foreach f (H3ac Usf1 Usf2) #wget -nd http://www.lcb.uu.se/~mada/UUtracks_hg16/${f}_hg16.wig.gz wget -nd http://www.lcb.uu.se/~mada/UUtracks_hg16/${f}_hg16.bed end wget -nd http://www.lcb.uu.se/~mada/UUtracks_hg16/UCSCdescription.html # lift to hg18 foreach f (lab/*hg16.bed) set b = `echo $f:t | sed 's/_.*//'` echo $b tail +2 $f | \ liftOver stdin \ /cluster/data/hg16/bed/liftOver/hg16ToHg18.over.chain.gz \ $b.bed $b.bed.unmapped end ssh kolossus cd /cluster/data/hg18/bed cd uppsalaChip # remove duplicate regions resulting from liftOver cat > trimDups.awk << 'EOF' BEGIN {chr=""; start="";} { if (!(($1 == chr) && ($2 == start))) print; chr = $1; start = $2; } 'EOF' # process in 2 unix pipelines, so as not to overload machine cat > load.csh << 'EOF' foreach f (lab/*hg16.wig.gz) set b = `echo $f:t | sed 's/_.*//'` echo $b date nice zcat $f | tail +2 | \ nice varStepToBedGraph.pl stdin | \ nice liftOver stdin \ /cluster/data/hg16/bed/liftOver/hg16ToHg18.over.chain.gz \ $b.wigBed $b.wigBed.unmapped nice bedSort $b.wigBed stdout | \ nice awk -f trimDups.awk | \ nice wigEncode stdin $b.wig $b.wib date end 'EOF' csh load.csh >&! load.log & # approx. 
50 minutes to process the 3 datasets # load bed and wiggles into database ssh hgwdev cd /cluster/data/hg18/bed/uppsalaChip cat > load2.csh << 'EOF' foreach f (*.wig) set b = $f:r echo $b date set table = uppsalaChip${b}Sites hgLoadBed hg18 $table $b.bed set table = uppsalaChip${b}Signal ln -s /cluster/data/hg18/bed/uppsalaChip/$b.wib /gbdb/hg18/wib/uppsalaChip${b}Signal.wib hgLoadWiggle hg18 $table $f date end 'EOF' csh load2.csh >&! load2.log & # just a few minutes runtime # somehow 2 beds were left out above (lifted files were missing) cat > loadBed.csh << 'EOF' foreach f (*.bed) set b = $f:r echo $b hgLoadBed hg18 uppsalaChip${b}Sites $f end 'EOF' # << emacs csh loadBed.csh >& loadBed.log & # data distribution textHistogram H3ac.wigBed -minVal=-2 -real -col=4 -binSize=.5 -2.000000 611 -1.500000 5711 -1.000000 * 391229 -0.500000 ************************************************************ 21240336 0.000000 ******************************************************* 19325712 0.500000 ** 689267 1.000000 99083 1.500000 24453 2.000000 4635 2.500000 635 3.000000 49 3.500000 3 =4.000000 562 ######################################################################### # BLASTZ Zebrafish danRer5 (DONE - 2007-10-18 - Hiram) ssh kkstore02 mkdir /cluster/data/hg18/bed/blastzDanRer5.2007-10-17 cd /cluster/data/hg18/bed/blastzDanRer5.2007-10-17 cat << '_EOF_' > DEF # Human (hg18) vs zebrafish (danRer5) BLASTZ_Y=3400 BLASTZ_L=6000 BLASTZ_K=2200 BLASTZ_M=50 BLASTZ_Q=/cluster/data/blastz/HoxD55.q # TARGET: Human Hg18 SEQ1_DIR=/scratch/data/hg18/nib SEQ1_LEN=/cluster/data/hg18/chrom.sizes SEQ1_CHUNK=10000000 SEQ1_LAP=0 # QUERY - zebrafish (danRer5) SEQ2_DIR=/scratch/data/danRer5/danRer5.2bit SEQ2_LEN=/cluster/data/danRer5/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LIMIT=50 SEQ2_LAP=0 BASE=/cluster/data/hg18/bed/blastzDanRer5.2007-10-17 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs time nice -n +19 doBlastzChainNet.pl DEF -chainMinScore=5000 \ -chainLinearGap=loose -bigClusterHub=pk -verbose=2 > do.log 2>&1 & # real 369m20.490s cat fb.hg18.chainDanRer5Link.txt # 73923439 bases of 2881515245 (2.565%) in intersection mkdir /cluster/data/danRer5/bed/blastz.hg18.swap cd /cluster/data/danRer5/bed/blastz.hg18.swap time nice -n +19 doBlastzChainNet.pl \ -chainMinScore=5000 \ /cluster/data/hg18/bed/blastzDanRer5.2007-10-17/DEF \ -swap -chainLinearGap=loose -bigClusterHub=pk -verbose=2 \ > swap.log 2>&1 & # real 11m35.536s cat fb.danRer5.chainHg18Link.txt # 74166352 bases of 1435609608 (5.166%) in intersection ######################################################################### # Vista Enhancers (2007-10-18, conodera) # see also /projects/compbiousr/wet/browser/vista_enhancer/17Oct2007/Makefile # # download data file from the vista browser (coordinates are for hg17) # http://enhancer.lbl.gov/cgi-bin/imagedb.pl?show=1;search.result=yes;form=search;search.form=no;action=search;search.sequence=1 # save as enhancerbrowser.datadownload.txt cd /projects/compbiousr/wet/browser/vista_enhancer/ # liftOver hg17 file liftOver vista_enhancer.hg17.bed /cluster/data/hg17/bed/liftOver/hg17ToHg18.over.chain.gz vista_enhancer.hg18.bed vista_enhancer.hg17ToHg18.unMapped hgLoadBed hg18 vistaEnhancers vista_enhancer.hg18.bed ############################################################################ # Reload CCDS (ccdsInfo, ccdsGene, ccdsKgMap) (2007-10-30 markd) cd /cluster/data/genbank/data/ccds/ ftp ftp-private.ncbi.nih.gov (user ccds, needs password) get CCDS.20071030.tar.gz mkdir /scratch/tmp/ccds cd /scratch/tmp/ccds tar -zxf 
/cluster/data/genbank/data/ccds/CCDS.20071030.tar.gz # import ccds database tables /cluster/data/genbank/bin/x86_64/ccdsImport ccds data/*.txt # create and load ccdsGene and ccdsInfo tables from imported database /cluster/data/genbank/bin/x86_64/ccdsMkTables -loadDb ccds hg18 ccdsInfo ccdsGene /cluster/data/genbank/bin/x86_64/mkCcdsGeneMap -db=hg18 -loadDb ccdsGene knownGene ccdsKgMap checkTableCoords hg18 -verbose=2 ccdsGene # update all.jointer to include hg18 in ccdsDb joinerCheck -database=hg18 -identifier=ccdsGeneId ~/compbio/genbank/kent/src/hg/makeDb/schema/all.joiner # build initial version of ccdsMgcMap table, updated by nightly genbank update /cluster/data/genbank/bin/x86_64/mkCcdsGeneMap -loadDb -db=hg18 ccdsGene mgcGenes ccdsMgcMap # request push of ccdsGene ccdsInfo ccdsKgMap ccdsMgcMap # << emacs ######################################################################### # Load ENSEMBL ver 45 (2007-09-5 markd) mkdir /cluster/data/hg18/bed/ensembl45 cd /cluster/data/hg18/bed/ensembl45 ## # need to find bounds of haplotype chromosomes ## # get unmasked haplotype pseudochroms from ensemble (dna, NOT dna_rm) wget ftp://ftp.ensembl.org/pub/current_homo_sapiens/data/fasta/dna/ Homo_sapiens.NCBI36.46.dna.chromosome.c22_H2.fa.gz Homo_sapiens.NCBI36.46.dna.chromosome.c5_H2.fa.gz Homo_sapiens.NCBI36.46.dna.chromosome.c6_COX.fa.gz Homo_sapiens.NCBI36.46.dna.chromosome.c6_QBL.fa.gz # get gap locations and create hap.lift foreach f ( *.fa.gz ) faGapLocs $f $f:r:r.lift end # build lift file for randons and haps (mkRandomNTLift hg18 && cat hap.lift) > randHap.lift # load ensembl genes hgLoadEnsembl -l randHap.lift -p homo_sapiens core_45_36g hg18>&log # got 1 genes with CDS exons with no frame: ENST00000374459 # add this to problem ids and rerun hgLoadEnsembl -l randHap.lift -f problem.ids homo_sapiens core_45_36g hg18>&log # load pseudogenes hgLoadEnsembl -l randHap.lift -p homo_sapiens core_45_36g hg18>&log # got 3 pseudogenes with CDS bounds outside of exons ENST00000342841 ENST00000361218 ENST00000388856 # add this to problem ids and rerun hgLoadEnsembl -l randHap.lift -f problem.ids -p homo_sapiens core_45_36g hg18>&log # vega code is not working in robert's scripts. 
# done to support CCDS; push not requested awaiting resolution of vega # stuff ######################################################################### # AFFY TRANSCRIPTOME PHASE 3 (2007-11-06, Andy) ssh hgwdev bash cd /san/sanVol1/scratch/andy/transcriptome mkdir splits cd originalWigs/ for f in *.wigVar; do table=${f%.wigVar}; mkdir ../splits/$table grep -v "^track" $f | splitWig stdin 1000000 ../splits/${table}/split echo Done with $table done # Done with cluster run ssh kolossus cd /san/sanVol1/scratch/andy/transcriptome/lift/bed for tab in *; do for split in ${tab}/*; do cat $split >> ${tab}.bed done echo done catting $tab done # Split into chrom beds (with a cluster run) for f in `ls -1 hg18.bed`; do tab=${f%.bed}; for c in `cut -f1 chrom.sizes`; do cfile=hg18.bed.chromSplit/${tab}.${c}.bed; outFile=hg18.wigVar/${tab}.wigVar; if [ -e $cfile ]; then echo variableStep chrom=${c} span=1 >> $outFile; bedSort $cfile stdout | awk 'BEGIN{FS="\t"}{print $2+1, $4;}' | awk -f noDupe.awk >> $outFile; echo Added $cfile to $outFile >> the.log; fi; done; echo DONE with $tab >> the.log; wigEncode $outFile hg18.wigVar/${tab}.wig hg18.wigVar/${tab}.wib >> the.log; gzip $outFile done cd hg18.wigVar/ mkdir -p /cluster/data/hg18/bed/affyTxnPhase3/wib for f in *.wib; do echo copying $f...; cp $f /cluster/data/hg18/bed/affyTxnPhase3/wib/; done pushd /gbdb/hg18/wib ln -s /cluster/data/hg18/bed/affyTxnPhase3/wib/* . popd mkdir -p /cluster/data/hg18/bed/affyTxnPhase3/downloads cp *.wigVar.gz /cluster/data/hg18/bed/affyTxnPhase3/downloads mkdir -p /usr/local/apache/htdocs/goldenPath/hg18/affyTxnPhase3 pushd /usr/local/apache/htdocs/goldenPath/hg18/affyTxnPhase3 ln -s /cluster/data/hg18/bed/affyTxnPhase3/downloads/* . for f in *Strand*; do mv $f sRNA.$f; done for f in affyTxnPhase3*; do mv $f lRNA.$f; done ######################################################################### # Blastz Marmoset calJac1 (DONE - 2007-11-09 - Hiram) ## this is not necessary - already done by Kate in October ssh kkstore06 screen # use screen to control this job mkdir /cluster/data/hg18/bed/blastzCalJac1.2007-11-09 cd /cluster/data/hg18/bed/blastzCalJac1.2007-11-09 cat << '_EOF_' > DEF # Human vs marmoset BLASTZ_M=50 # TARGET: Human Hg18 SEQ1_DIR=/cluster/bluearc/scratch/data/hg18/nib SEQ1_LEN=/cluster/data/hg18/chrom.sizes SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY: Marmoset calJac1 SEQ2_DIR=/cluster/bluearc/scratch/data/calJac1/calJac1.2bit SEQ2_LEN=/cluster/data/calJac1/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 BASE=/cluster/data/hg18/bed/blastzCalJac1.2007-11-09 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs time nice -n +19 doBlastzChainNet.pl -verbose=2 `pwd`/DEF \ -chainMinScore=3000 -chainLinearGap=medium \ -bigClusterHub=pk > blastz.log 2>&1 & # real 542m2.359s # Completed: 230805 of 230805 jobs # CPU time in finished jobs: 7279638s 121327.30m 2022.12h 84.26d 0.231 y # IO & Wait Time: 831303s 13855.05m 230.92h 9.62d 0.026 y # Average job time: 35s 0.59m 0.01h 0.00d # Longest finished job: 972s 16.20m 0.27h 0.01d # Submission to last job: 20572s 342.87m 5.71h 0.24d cat fb.hg18.chainCalJac1Link.txt # 2236493373 bases of 2881515245 (77.615%) in intersection ########################################################################### # LIFT RM ALIGN FILES, MAKE PER-CHROM DOWNLOADS (DONE 12/7/07 angie) # Lifting of .align files is now automated by doRepeatMasker.pl, but we # got a user request for .align files from this pre-automation db. ssh kkstore02 cd /cluster/data/hg18 mkdir downloads/RMalign foreach c (?{,?} ?{,?}_*hap?) 
echo linking/lifting to contigs of $c:t foreach ctgdir ($c/N[TC]_??????) set nt = $ctgdir:t if (! -f $ctgdir/$nt.fa.align) then pushd $ctgdir liftRMAlign.pl $nt.lft > $nt.fa.align popd endif ln -s $nt/$nt.fa.align $c/ end set chr = chr$c:t if (-e $c/lift/ordered.lft && ! -z $c/lift/ordered.lft) then echo lifting contigs to chr$c liftRMAlign.pl $c/lift/ordered.lft \ | gzip -c > downloads/RMalign/$chr.fa.align.gz endif if (-e $c/lift/random.lft && ! -z $c/lift/random.lft) then echo lifting contigs to chr${c}_random liftRMAlign.pl $c/lift/random.lft \ | gzip -c > downloads/RMalign/${chr}_random.fa.align.gz endif end md5sum downloads/RMalign/*.gz > downloads/RMalign/md5sum.txt ssh hgwdev ln -s /cluster/data/hg18/downloads/RMalign \ /usr/local/apache/htdocs/goldenPath/hg18/ ######################################################################### # ADD LINKS TO GENETESTS ON hgGene DETAILS PAGE (DONE 12/12/07 Fan) ssh hgwdev cd /cluster/store11/gs.19/build36/bed mkdir geneTests cd geneTests # paste the 3 cols gene list from GeneTest web site into file geneTests.lis cut -f 1 geneTests.lis >j1 cut -f 2 geneTests.lis >j2 cut -f 3 geneTests.lis >j3 cat j1 j2 j3 |sort -u >geneTests.tab rm j1 j2 j3 hgsql hg18 -e 'drop table geneTests' hgsql hg18 < ~/src/hg/lib/geneTests.sql hgsql hg18 -e 'load data local infile "geneTests.tab" into table geneTests ignore 1 lines' # the list is independent of hg18, so load it into hg17 too. hgsql hg17 -e 'drop table geneTests' hgsql hg17 < ~/src/hg/lib/geneTests.sql hgsql hg17 -e 'load data local infile "geneTests.tab" into table geneTests ignore 1 lines' ########################################################################### # ADD SeattleSNPs PGA GENES ON hgGene DETAILS PAGE. (DONE, Fan, 12/13/07). cd /cluster/store12/snp mkdir pga cd pga # download data from SeattleSNPs wget --timestamping http://pga.gs.washington.edu/data.tar.gz gzip -d *.gz tar -xvf *.tar # create SeattleSNPs PGA gene list cut -f 1 FinishedGenes.txt >j1 cut -f 2 FinishedGenes.txt >j2 cat j1 j2 |sort -u >pga.tab rm j1 j2 # load the data into the pga table. hgsql hg18 -e 'drop table pga' hgsql hg18 < ~/src/hg/lib/pga.sql hgsql hg18 -e 'load data local infile "pga.tab" into table pga' ########################################################################### # Reload CCDS (2007-12-12 markd) # import ccds database as described in ccds.txt set db=hg18 # create and load ccdsGene and ccdsInfo tables from imported database /cluster/data/genbank/bin/x86_64/ccdsMkTables -loadDb ccds ${db} ccdsInfo ccdsGene # ccdsKgMap /cluster/data/genbank/bin/x86_64/mkCcdsGeneMap -db=${db} -loadDb ccdsGene knownGene ccdsKgMap # build initial version of ccdsMgcMap table, updated by nightly genbank update /cluster/data/genbank/bin/x86_64/mkCcdsGeneMap -db=${db} -loadDb ccdsGene mgcGenes ccdsMgcMap checkTableCoords ${db} -verbose=2 ccdsGene # update all.jointer to include ${db} in ccdsDb joinerCheck -database=${db} -identifier=ccdsGeneId ~/compbio/genbank/kent/src/hg/makeDb/schema/all.joiner # request push of ccdsGene ccdsInfo ccdsKgMap ccdsMgcMap ############################################################################ # dbSNP BUILD 128 (DONE 1/22/08 angie) # updated snp128ExceptionDesc (tweaked wording) 3/7/08 # 8/7/08: Regenerated snp128.sql with only those enum/set values that are # actually used (except always keep unknown, the default) and reloaded snp128. # No data change -- just the sql field definitions for enums and sets. 
# QA NOTE: used sudo mytouch on the snp128 table to reset the timestamp to # .2008-01-22 00:00:00 (was .2008-08-07 16:08:27 after Angie's re-load) in # order to keep joinerCheck happy and avoid confusion. (8/8/08 brooke) # Set up build directory ssh kkstore06 mkdir -p /cluster/store3/dbSNP128/{human,shared} ln -s /cluster/store3/dbSNP128 /cluster/data/dbSNP/128 # Get field encodings -- if there are changes or additions to the # encoding of the corresponding fields, you might need to update # snpNcbiToUcsc, hgTracks, hgc and hgTrackUi (see also # hg/lib/snp125Ui.c). cd /cluster/data/dbSNP/128/shared set ftpSnpDb = ftp://ftp.ncbi.nih.gov/snp/organisms/human_9606/database wg $ftpSnpDb/shared_data/LocTypeCode.bcp.gz wg $ftpSnpDb/shared_data/SnpClassCode.bcp.gz wg $ftpSnpDb/shared_data/SnpFunctionCode.bcp.gz wg $ftpSnpDb/shared_data/SnpValidationCode.bcp.gz # Here is another source -- it is not as up-to-date as the above, but # our encodings (enums and sets in snp128.sql) are named more similar # to those in the 2005 ASN: # ftp://ftp.ncbi.nih.gov/snp/specs/docsum_2005.asn ########################## DOWNLOAD ############################# cd /cluster/data/dbSNP/128/human mkdir data schema rs_fasta # Get data from NCBI (anonymous FTP) wget ftp://ftp.ncbi.nih.gov/snp/00readme.txt cd /cluster/data/dbSNP/128/human/data alias wg wget --timestamping set ftpSnpDb = ftp://ftp.ncbi.nih.gov/snp/organisms/human_9606/database # ContigLoc table has coords, orientation, loc_type, and refNCBI allele wg $ftpSnpDb/organism_data/b128_SNPContigLoc_36_2.bcp.gz wg $ftpSnpDb/organism_data/b128_SNPContigLocusId_36_2.bcp.gz wg $ftpSnpDb/organism_data/b128_ContigInfo_36_2.bcp.gz # MapInfo has alignment weights wg $ftpSnpDb/organism_data/b128_SNPMapInfo_36_2.bcp.gz # SNP has univar_id, validation status and heterozygosity wg $ftpSnpDb/organism_data/SNP.bcp.gz # Get schema cd /cluster/data/dbSNP/128/human/schema wg $ftpSnpDb/organism_schema/human_9606_table.sql.gz # Get fasta files # using headers of fasta files for molType, class, observed cd /cluster/data/dbSNP/128/human/rs_fasta wg ftp://ftp.ncbi.nih.gov/snp/organisms/human_9606/rs_fasta/\*.gz ########################## LOAD NCBI TABLES ############################# # Simplify names of data files -- strip version & extras to get # local canonical table names. cd /cluster/data/dbSNP/128/human/data foreach f (*.bcp.gz) set new = `echo $f \ | sed -e 's/^b128_SNP//; s/^b128_//; s/_36_2//; s/.bcp//;'` mv $f $new echo $new end # Extract just the tables that we need from the NCBI msSQL table # creation file, and get CREATE statements from # human_9606_table.sql for our 5 tables cd /cluster/data/dbSNP/128/human/schema zcat human_9606_table.sql.gz \ | perl -we '$/ = "\nGO\n\n\n\n"; \ while (<>) { \ next unless /^CREATE TABLE \[(b128_(SNP)?)?(ContigInfo|ContigLoc|ContigLocusId|MapInfo|SNP)(_36_2)?\]/; \ s/b128_(SNP)?//; s/_36_2//; \ s/[\[\]]//g; s/GO\n\n\n/;/; s/smalldatetime/datetime/g; \ s/ON PRIMARY//g; s/COLLATE//g; s/Latin1_General_BIN//g; \ s/IDENTITY (1, 1) NOT NULL /NOT NULL AUTO_INCREMENT, PRIMARY KEY (id)/g; \ s/nvarchar/varchar/g; s/set quoted/--set quoted/g; \ s/(image|varchar\s+\(\d+\))/BLOB/g; \ print; \ }' \ > table.sql # load on kolossus or a small cluster machine (mysql5 is OK for this). 
ssh kolossus hgsql '' -e 'create database hg18snp128' cd /cluster/data/dbSNP/128/human/schema hgsql hg18snp128 < table.sql cd ../data foreach t (ContigInfo ContigLoc ContigLocusId MapInfo SNP) zcat $t.gz \ | perl -wpe 's/(\d\d:\d\d:\d\d)\.0/$1/g;' \ | hgLoadSqlTab -oldTable hg18snp128 $t placeholder stdin end # There were some warnings (many cleared up by the perl substitution) # but no rows were dropped. I eyeballed a few examples, seemed OK. foreach t (ContigInfo ContigLoc ContigLocusId MapInfo SNP) echo -n "${t}:\t" hgsql -N -B hg18snp128 -e 'select count(*) from '$t end #ContigInfo: 7067 #ContigLoc: 24685256 #ContigLocusId: 13129868 #MapInfo: 24132236 #SNP: 11833664 # these counts (except for MapInfo which has ~doubled) are # slightly down from 126. MapInfo has a lot of alternate assembly # mappings, esp. the celera assembly; maybe that's new? # load hg18.ctgPos into dbSnpHumanBuild128, compare contig list between # ctgPos and ContigInfo # NOTE FOR NEXT TIME: instead of going through mysql, just make a # tab-sep dump file of ctgPos. ssh hgwdev hgsql hg18 -N -B -e '"select * from ctgPos;"' \ | hgLoadSqlTab hg18snp128 ctgPos ~/kent/src/hg/lib/ctgPos.sql stdin hgsql hg18snp128 -N -B -e 'select contig from ctgPos;' | sort > /tmp/1 # Note: we used to look for group_term = "ref_assembly", but that leaves # behind some contigs that we include. So use a list of group_label: hgsql hg18snp128 -NBe 'select distinct(group_label) from ContigInfo' # --> ("reference", "c5_H2", "c6_COX", "c6_QBL", "c22_H2") hgsql hg18snp128 -N -B -e 'select contig_acc from ContigInfo \ where group_label in \ ("reference", "c5_H2", "c6_COX", "c6_QBL", "c22_H2");' | sort > /tmp/2 diff /tmp/1 /tmp/2 # No diff. #################### EXTRACT INFO FROM NCBI TABLES #################### mkdir -p /scratch/snp/128/human cd /scratch/snp/128/human # Fields of the SNP table and their NCBI source table/file: # chrom ContigLoc / contigInfo / liftUp # chromStart ContigLoc / liftUp; check vs phys_pos_from # chromEnd ContigLoc / liftUp # name rs + numeric ID that joins all the other sources # score 0 # strand ContigLoc.orientation # refNCBI ContigLoc.allele # refUCSC ContigLoc.allele if insertion, othw. from genomic # observed fasta headers # molType fasta headers # class fasta headers # valid SNP # avHet SNP # avHetSE SNP # func ContigLocusId # locType ContigLoc # weight MapInfo time hgsql hg18snp128 -e \ 'alter table ContigLoc add index (ctg_id); \ alter table ContigInfo add index (ctg_id);' #kolossus load was already 1.0. #0.001u 0.002s 4:04.73 0.0% 0+0k 0+0io 0pf+0w time hgsql hg18snp128 -e \ 'alter table ContigInfo add index (group_label(9));' #0.001u 0.001s 0:00.07 0.0% 0+0k 0+0io 0pf+0w # Make sure there are no orient != 0 contigs among those selected. hgsql hg18snp128 -NBe \ 'select count(*) from ContigInfo where orient != 0 and \ group_label in ("reference", "c5_H2", "c6_COX", "c6_QBL", "c22_H2");' #0 # For joining files by shared column, we need a unique identifier in # that shared column. snp_id is not unique -- the same rsID can appear # in both the reference assembly and on one of the others e.g. c6_COX. # So concatenate the assembly identifier and snp_id to get hopefully # unique label. 
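# The combined keys come out looking like "reference.3000001" or
# "c6_COX.10035195" (hypothetical rs numbers here, just to show the format).
# Once ucscContigLoc.txt is built below, a quick way to confirm that only the
# expected assembly labels appear in the keys (a sketch):
#   cut -d. -f1 ucscContigLoc.txt | sort -u
# should list only: c22_H2, c5_H2, c6_COX, c6_QBL, reference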
time hgsql hg18snp128 -NBe \ 'select concat(ContigInfo.group_label, ".", snp_id), \ ContigInfo.contig_acc, asn_from, asn_to, \ loc_type, orientation, allele, phys_pos_from \ from ContigLoc, ContigInfo \ where ContigLoc.ctg_id = ContigInfo.ctg_id and ContigInfo.group_label \ in ("reference", "c5_H2", "c6_COX", "c6_QBL", "c22_H2");' \ | sort \ > ucscContigLoc.txt # no time output because of the pipe... took 4 minutes (load was 3 or 4). # Make sure these IDs are unique. wc -l ucscContigLoc.txt #12275300 ucscContigLoc.txt awk '{print $1;}' ucscContigLoc.txt | uniq | wc -l #11863799 # Doh! Find non-unique IDs: awk 'prev == $1 {print;} {prev = $1;}' ucscContigLoc.txt | head grep ^c5_H2.10035195 ucscContigLoc.txt #c5_H2.10035195 NT_113801 639954 639954 2 0 G 69605321 #c5_H2.10035195 NT_113801 660407 660407 2 0 G 69625774 #c5_H2.10035195 NT_113801 911780 911780 2 1 C 69877147 # OK, they can be duplicated within the same contig. See if we can # get by with anchoring everything to ucscContigLoc.txt. But everybody # else better have unique IDs! # SNP -> valid, avHet, avHetSE # SNP has only snp_id as identifier, nothing relating to assembly. hgsql hg18snp128 -NBe \ 'select snp_id, validation_status, avg_heterozygosity, het_se \ from SNP;' \ | sort \ > ucscSNP.txt # Check ID uniqueness: wc -l ucscSNP.txt #11833664 ucscSNP.txt awk '{print $1;}' ucscSNP.txt | uniq | wc -l #11833664 # ContigLocusId -> func # ContigLocusId has only snp_id as an identifier (it gives one # example contig if the SNP is on multiple contigs). # The sort options and awk are to convert multiple entries with different # function classes for the same SNP into one entry per SNP with a list # of function classes. hgsql hg18snp128 -NBe \ 'select snp_id, fxn_class from ContigLocusId;' \ | sort -u -k1,1 -k2,2n \ | awk '{if (prevId == $1) { prevFunc = prevFunc $2 ","; } \ else { if (prevId) {print prevId "\t" prevFunc;} \ prevFunc = $2 ","; }} \ {prevId = $1;} \ END {print prevId "\t" prevFunc;}' \ > ucscFunc.txt # Check ID uniqueness: wc -l ucscFunc.txt #4676589 ucscFunc.txt awk '{print $1;}' ucscFunc.txt | sort -u | wc -l #4676589 # MapInfo -> weight # MapInfo needs assembly+snp_ids in order to have unique IDs. time hgsql hg18snp128 -e \ 'alter table MapInfo add index (assembly(9));' #0.000u 0.004s 2:22.64 0.0% 0+0k 0+0io 0pf+0w hgsql hg18snp128 -NBe \ 'select concat(assembly, ".", snp_id), weight \ from MapInfo where assembly \ in ("reference", "c5_H2", "c6_COX", "c6_QBL", "c22_H2");' \ | sort \ > weight.txt # ~1 minute # Check ID uniqueness: wc -l weight.txt #11863799 weight.txt awk '{print $1;}' weight.txt | uniq | wc -l #11863799 awk '{print $2;}' weight.txt | sort -n | uniq -c # 47454 0 #11621954 1 # 91766 2 # 100142 3 # 2483 10 # SNPs w/weight 0 and 10 will be discarded later. # fasta headers -> observed, molType, class zcat /cluster/data/dbSNP/128/human/rs_fasta/rs_ch*.fas.gz \ | grep '^>gnl' \ | perl -wpe 's/^\S+rs(\d+) .*mol="(\w+)"\|class=(\d+)\|alleles="([^"]+)"\|build.*/$1\t$4\t$2\t$3/ || die "Parse error line $.:\n$_\n\t";' \ | sort \ > ucscGnl.txt # ~4 minutes wc -l ucscGnl.txt #11833664 ucscGnl.txt awk '{print $1;}' ucscGnl.txt | uniq | wc -l #11833664 ############### JOIN NCBI COLUMNS TO GET UCSC SNP COLUMNS ################ # Join files by ID. Start with ContigLoc and MapInfo because they # share the concatenated assembly+snp_id IDs. 
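# Before joining, a quick look at how the two key sets line up (a sketch;
# both files are already sorted on the assembly.snp_id key):
awk '{print $1;}' ucscContigLoc.txt | uniq > /tmp/contigLoc.keys
awk '{print $1;}' weight.txt | uniq > /tmp/weight.keys
comm -3 /tmp/contigLoc.keys /tmp/weight.keys | wc -l
# 0 here means ContigLoc and MapInfo cover exactly the same keys; the join
# below double-checks this anyway via the MISSING sentinel.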
time join -a 1 -e MISSING -t ' ' ucscContigLoc.txt weight.txt \ > ucscCL+w.txt #25.408u 3.551s 0:29.26 98.9% 0+0k 0+0io 0pf+0w wc -l ucscCL+w.txt #12275300 ucscCL+w.txt # Same as ucscContigLoc.txt above, good. # Any missing weights? grep MISSING ucscCL+w.txt | head # No output, good. # Join the files with SNP-only IDs. time join -e MISSING -t ' ' ucscGnl.txt ucscSNP.txt \ > ucscG+S.txt #16.805u 1.996s 0:19.04 98.6% 0+0k 0+0io 0pf+0w wc -l ucscG+S.txt #11833664 ucscG+S.txt # Same as ucscSNP.txt and ucscGnl.txt above. grep MISSING ucscG+S.txt | wc -l #0 time join -a 1 -e MISSING -o 1.1,1.2,1.3,1.4,1.5,1.6,1.7,2.2 \ -t ' ' ucscG+S.txt ucscFunc.txt \ > ucscG+S+F.txt #17.656u 2.318s 0:20.10 99.3% 0+0k 0+0io 0pf+0w wc -l ucscG+S+F.txt #11833664 ucscG+S+F.txt grep MISSING ucscG+S+F.txt | wc -l #7157075 # Not surprising -- ucscFunc.txt has only 4676589 lines. expr 11833664 - 4676589 #7157075 # Convert assembly+snp_id's to just snp_id (sorted) for final join. perl -wpe 's/^\S+\.(\d+)/$1/;' ucscCL+w.txt \ | sort > ucscCL+w.snp_id.txt awk '{print $1;}' ucscCL+w.snp_id.txt | uniq | wc -l #11727742 # Interesting... which snp_ids are missing from ContigLoc? awk '{print $1;}' ucscCL+w.snp_id.txt | uniq > /tmp/1 awk '{print $1;}' ucscGnl.txt | uniq > /tmp/2 comm -13 /tmp/1 /tmp/2 > notInContigLoc.txt comm -23 /tmp/1 /tmp/2 > notInSNP.txt wc -l notIn*.txt #105994 notInContigLoc.txt # 72 notInSNP.txt expr 11833664 + 72 - 105994 #11727742 # Final join -- treat ContigLoc as authoritative (since it has coords). # Arrange columns in same order as in the SNP table, with extras for # checking at the end (phys_pos_from). # chr chrS chrE name strand refN obs molT cls val aH aHSE fxn locT wt ... time join -a 1 -e MISSING -t ' ' \ -o '1.2 1.3 1.4 1.1 1.6 1.7 2.2 2.3 2.4 2.5 2.6 2.7 2.8 1.5 1.9 1.8' \ ucscCL+w.snp_id.txt ucscG+S+F.txt \ > ucscNcbiSnp.ctg.txt #38.497u 5.536s 2:08.18 34.3% 0+0k 0+0io 0pf+0w wc -l ucscNcbiSnp.ctg.txt #12275300 ucscNcbiSnp.ctg.txt grep MISSING ucscNcbiSnp.ctg.txt | awk '{print $4;}' | uniq | wc -l #7058898 # a bit less than the 7157075 missing FUNC's above -- some overlap with # notInContigLoc would explain. # Lift the map contig coordinates to chrom coordinates (~2m); time liftUp ucscNcbiSnp.bed \ /cluster/data/hg18/jkStuff/liftContigs.lft warn \ ucscNcbiSnp.ctg.txt #98.038u 5.974s 1:45.65 98.4% 0+0k 0+0io 5pf+0w wc -l ucscNcbiSnp.bed #12275300 ucscNcbiSnp.bed # At this point, move back from /scratch to /cluster/data. nice gzip ucscNcbiSnp.bed cp -p ucscNcbiSnp.bed.gz /cluster/data/dbSNP/128/human/ # Drum roll please... translate NCBI's encoding into UCSC's, and # perform a bunch of checks. This is where developer involvement # is most likely as NCBI extends the encodings used in dbSNP. cd /cluster/data/dbSNP/128/human/ gunzip ucscNcbiSnp.bed.gz # Re-ran this command 8/7/08 to get new snp128.sql that includes # only those enum/set values that are actually used. No other output # files changed. time snpNcbiToUcsc ucscNcbiSnp.bed /cluster/data/hg18/hg18.2bit \ snp128 #spaces stripped from observed: #chr12 5963395 5963395 rs41402545 #count of snps with weight 0 = 59123 #count of snps with weight 1 = 11654498 #count of snps with weight 2 = 191647 #count of snps with weight 3 = 335214 #count of snps with weight 10 = 34818 #Skipped 167 snp mappings due to errors -- see snp128Errors.bed #176.712u 17.466s 3:34.82 90.3% 0+0k 0+0io 0pf+0w # The 167 errors are all for SNPs for which we don't have fasta, # so we also don't have observed, class, or molType. 
I spot-checked # a few, and they have been deleted from dbSNP. Nothing to show, # so we skip those 167 -- nothing catastrophic. Watch out for new # types of errors reported, though: awk -F"\t" '{print $5;}' snp128Errors.bed | sort -u | wc -l #1 wc -l snp* # 12181192 snp128.bed # 22 snp128.sql # 167 snp128Errors.bed # 18 snp128ExceptionDesc.tab # 1013020 snp128Exceptions.bed # Make one big fasta file. (note: snp126 skipped chrUn... but it's small # compared to chr1, chr2 etc.) # It's a monster: 14G! Can we split by hashing rsId? zcat rs_fasta/rs_ch*.fas.gz \ | perl -wpe 's/^>gnl\|dbSNP\|(rs\d+) .*/>$1/ || ! /^>/ || die;' \ > snp128.fa # Check for duplicates. grep ^\>rs snp128.fa | sort > /scratch/tmp/seqHeaders wc -l /scratch/tmp/seqHeaders #11833664 /scratch/tmp/seqHeaders uniq /scratch/tmp/seqHeaders | wc -l #11833664 # Use hgLoadSeq to generate .tab output for sequence file offsets, # and keep only the columns that we need: acc and file_offset. # Index it and translate to snpSeq table format. time hgLoadSeq -test placeholder snp128.fa #107.137u 37.140s 2:39.16 90.6% 0+0k 0+0io 0pf+0w cut -f 2,6 seq.tab > snp128Seq.tab rm seq.tab ssh hgwdev # Load up main track tables. cd /cluster/data/dbSNP/128/human # Re-ran this command 8/7/08 to get new snp128.sql that includes # only those enum/set values that are actually used. No data values # changed. Removed -noSort because Brooke had spotted some entries # sorted by chromEnd instead of chromStart. time nice hgLoadBed -tab -onServer -tmpDir=/data/tmp \ hg18 snp128 -sqlTable=snp128.sql snp128.bed #78.060u 13.298s 7:32.71 20.1% 0+0k 0+0io 0pf+0w sed -e 's/snp125/snp128/' ~/kent/src/hg/lib/snp125Exceptions.sql \ > snp128Exceptions.sql time nice hgLoadBed -tab -onServer -tmpDir=/data/tmp \ hg18 snp128Exceptions -sqlTable=snp128Exceptions.sql \ snp128Exceptions.bed #5.915u 0.492s 0:28.69 22.3% 0+0k 0+0io 0pf+0w sed -e 's/snp125/snp128/' ~/kent/src/hg/lib/snp125ExceptionDesc.sql \ > snp128ExceptionDesc.sql # 3/7/08: reloaded snp128ExceptionDesc (tweaked wording) hgLoadSqlTab hg18 snp128ExceptionDesc snp128ExceptionDesc.sql \ snp128ExceptionDesc.tab # Load up sequences. sed -e 's/snpSeq/snp128Seq/' ~/kent/src/hg/lib/snpSeq.sql \ > snp128Seq.sql mkdir -p /gbdb/hg18/snp ln -s /cluster/data/dbSNP/128/human/snp128.fa /gbdb/hg18/snp/snp128.fa time nice hgLoadSqlTab hg18 snp128Seq snp128Seq.sql snp128Seq.tab #0.001u 0.000s 2:31.19 0.0% 0+0k 0+0io 0pf+0w # Put in a link where one would expect to find the track build dir... ln -s /cluster/data/dbSNP/128/human /cluster/data/hg18/bed/snp128 ####################################################################### # SNPMASKED SEQUENCE FOR SNP128 (DONE 2/1/08 angie) ssh kolossus mkdir /cluster/data/hg18/snp128Mask cd /cluster/data/hg18/snp128Mask # Identify rsIds with various problems -- we will exclude those. # MultipleAlignments is kinda broad because anything that maps on # both chrN and chrN_foo_hap1 will be excluded... similarly, extra # matches on chrN_random might disqualify good matches on chrN. # Well, erring on the side of caution is good. 
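# For the record, a quick tally of how many of each exception class there are
# before deciding what to exclude (a sketch; column 5 of the exceptions bed
# is the exception name, as used in the awk below):
awk -F"\t" '{print $5;}' /cluster/data/dbSNP/128/human/snp128Exceptions.bed \
    | sort | uniq -c | sort -rn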
awk '$5 ~ /^MultipleAlignments|ObservedTooLong|ObservedWrongFormat|ObservedMismatch|MixedObserved$/ {print $4;}' \ /cluster/data/dbSNP/128/human/snp128Exceptions.bed \ | sort -u \ > snp128ExcludeRsIds.txt time grep -vFwf snp128ExcludeRsIds.txt \ /cluster/data/dbSNP/128/human/snp128.bed \ > snp128Cleaned.bed #100.027u 11.779s 2:09.61 86.2% 0+0k 0+0io 0pf+0w # Substitutions: mkdir substitutions snpMaskSingle snp128Cleaned.bed /cluster/data/hg18/hg18.2bit stdout \ | faSplit byname stdin substitutions/ #-- 79 warnings about differing observed at same base positions #-- (66 distinct positions) -- send to NCBI. snp-admin@ncbi.nlm.nih.gov # Also this warning about total size -- just means that some chroms # didn't have any SNPS that survived the stringent filtering. #Masked 9146694 snps in 9146642 out of 3091528550 genomic bases #/cluster/data/hg18/hg18.2bit has 3107677273 total bases, but the total number of bases in sequences for which we masked snps is 3091528550 (difference is 16148723) # Make sure that sizes are identical, first diffs are normal -> IUPAC, # and first diffs' case is preserved: foreach f (substitutions/chr*.fa) faCmp -softMask $f ../[1-9MXY]*/$f:t |& grep -v "that differ" end #(output OK) foreach f (substitutions/chr*.fa) echo $f:t:r mv $f $f:r.subst.fa gzip $f:r.subst.fa end # Insertions: mkdir insertions snpMaskAddInsertions snp128Cleaned.bed /cluster/data/hg18/hg18.2bit stdout \ | faSplit byname stdin insertions/ #Added 1332737 snps totaling 2372942 bases to 3085151178 genomic bases #/cluster/data/hg18/hg18.2bit has 3107677273 total bases, but the total number of bases in sequences for which we masked snps is 3085151178 (difference is 22526095) # Again, that just means that some chroms didn't have filtered SNPs. # Make sure that all sizes have increased relative to original: foreach f (insertions/chr*.fa) faCmp -softMask $f ../[1-9MXY]*/$f:t |& grep -v "that differ" \ |& perl -we '$_=<>; \ if (/^\w+ in \S+ has (\d+) bases. \w+ in \S+ has (\d+) bases/) { \ if ($1 > $2) {print "OK: ins size $1 > $2\n";} \ else {die "ERROR: ins size $1 <= $2\n";} \ } else {die $_;}' end #(output OK) foreach f (insertions/chr*.fa) mv $f $f:r.ins.fa gzip $f:r.ins.fa end # Deletions: mkdir deletions snpMaskCutDeletions snp128Cleaned.bed /cluster/data/hg18/hg18.2bit stdout \ | faSplit byname stdin deletions/ #Cut 661637 snps totaling 1248873 bases from 3085167749 genomic bases #/cluster/data/hg18/hg18.2bit has 3107677273 total bases, but the total number of bases in sequences for which we masked snps is 3085167749 (difference is 22509524) # Again, that just means that some chroms didn't have filtered SNPs. # Make sure that all sizes have decreased relative to original: foreach f (deletions/chr*.fa) faCmp -softMask $f ../[1-9MXY]*/$f:t |& grep -v "that differ" \ |& perl -we '$_=<>; \ if (/^\w+ in \S+ has (\d+) bases. \w+ in \S+ has (\d+) bases/) { \ if ($1 < $2) {print "OK: del size $1 < $2\n";} \ else {die "ERROR: del size $1 >= $2\n";} \ } else {die $_;}' end #(output OK) foreach f (deletions/chr*.fa) mv $f $f:r.del.fa gzip $f:r.del.fa end # Clean up and prepare for download: gzip snp128Cleaned.bed foreach d (substitutions insertions deletions) pushd $d md5sum *.gz > md5sum.txt popd end # Make a README.txt in each subdir. # Create download links on hgwdev. # NOTE: I am going to start by offering only the substitutions. # If we get any user requests, then maybe we can put the insertions # and deletions out there. 
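# Before linking anything under goldenPath, one last integrity check of the
# gzipped downloads doesn't hurt (a sketch; md5sum -c prints a line per file,
# so only failures show up here):
foreach d (substitutions insertions deletions)
  pushd $d
  md5sum -c md5sum.txt | grep -v ": OK"
  popd
end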
ssh hgwdev mkdir /usr/local/apache/htdocs/goldenPath/hg18/snp128Mask ln -s /cluster/data/hg18/snp128Mask/substitutions/* \ /usr/local/apache/htdocs/goldenPath/hg18/snp128Mask/ ## If there is user demand for ins & del, then start over with an empty ## goldenPath/snp128Mask and do this: ## foreach type (substitutions insertions deletions) ## mkdir /usr/local/apache/htdocs/goldenPath/hg18/snp128Mask/$type ## ln -s /cluster/data/hg18/snp128Mask/$type/* \ ## /usr/local/apache/htdocs/goldenPath/hg18/snp128Mask/$type/ ## end ####################################################################### # ORTHOLOGOUS ALLELES IN CHIMP AND MACAQUE FOR SNP128 (DONE 2/8/08 angie) # REDONE 2/29/08 (upcase ortho alleles) ssh kolossus mkdir /cluster/data/hg18/bed/snp128Ortho cd /cluster/data/hg18/bed/snp128Ortho # Following Heather's lead in snp126orthos, filter SNPs to to keep # only those with class=single, length=1, chrom!~random; # Exclude those with exceptions MultipleAlignments, # SingleClassTriAllelic or SingleClassQuadAllelic. # Unlike snp masking, we do not filter for weight -- don't know why. awk '$5 ~ /^MultipleAlignments|SingleClassTriAllelic|SingleClassQuadAllelic/ {print $4;}' \ /cluster/data/dbSNP/128/human/snp128Exceptions.bed \ | sort -u \ > snp128ExcludeIds.txt awk '$3-$2 == 1 && $1 !~ /_random/ && $11 == "single" {print;}' \ /cluster/data/dbSNP/128/human/snp128.bed \ | grep -vFwf snp128ExcludeIds.txt \ > snp128Simple.bed # took ~3 minutes wc -l snp128Simple.bed #9133704 snp128Simple.bed # This is the analog of db table snp126simple. # Glom all human info that we need for the final table onto the # name, to sneak it through liftOver: rsId|chr|start|end|obs|ref|strand awk 'BEGIN{OFS="\t";} \ {print $1, $2, $3, \ $4 "|" $1 "|" $2 "|" $3 "|" $9 "|" $8 "|" $6, \ 0, $6;}' \ snp128Simple.bed > snp128ForLiftOver.bed # 2/29/08 -- re-ran from this point on to regenerate cleaned up # cluster run results (oops) and then force ortho alleles to upper # case, for consistency with dbSNP formatting. # Map coords to chimp using liftOver. # I don't know why chimp took so much longer than macaque... the # chimp .over has fewer chains and fewer bytes than the macaque .over. mkdir run.liftOChimp cd run.liftOChimp mkdir split out splitFile ../snp128ForLiftOver.bed 25000 split/chunk cp /dev/null jobList foreach f (split/chunk*) echo liftOver $f \ /cluster/data/hg18/bed/liftOver/hg18ToPanTro2.over.chain.gz \ \{check out exists out/panTro2.$f:t.bed\} out/hg18.$f:t.unmapped \ >> jobList end ssh pk cd /cluster/data/hg18/bed/snp128Ortho/run.liftOChimp para make jobList #Completed: 366 of 366 jobs #CPU time in finished jobs: 71660s 1194.33m 19.91h 0.83d 0.002 y #IO & Wait Time: 5377s 89.62m 1.49h 0.06d 0.000 y #Average job time: 210s 3.51m 0.06h 0.00d #Longest finished job: 518s 8.63m 0.14h 0.01d #Submission to last job: 518s 8.63m 0.14h 0.01d # Map coords to macaque using liftOver. mkdir ../run.liftOMac cd ../run.liftOMac mkdir out ln -s ../run.liftOChimp/split . 
cp /dev/null jobList foreach f (split/chunk*) echo liftOver $f \ /cluster/data/hg18/bed/liftOver/hg18ToRheMac2.over.chain.gz \ \{check out exists out/rheMac2.$f:t.bed\} out/hg18.$f:t.unmapped \ >> jobList end para make jobList #Completed: 366 of 366 jobs #CPU time in finished jobs: 5663s 94.38m 1.57h 0.07d 0.000 y #IO & Wait Time: 12066s 201.10m 3.35h 0.14d 0.000 y #Average job time: 48s 0.81m 0.01h 0.00d #Longest finished job: 102s 1.70m 0.03h 0.00d #Submission to last job: 102s 1.70m 0.03h 0.00d # Average job time was 54s with 50000 chunks, but those made chimp # jobs run too long. ssh kolossus cd /cluster/data/hg18/bed/snp128Ortho # Here is a script that looks up the base value in the ortho species # and swizzles columns to prepare for the joining and re-swizzling # of both ortho species' columns into the final product. If it is # used more than once, should be checked in, perhaps in hg/snp/snpLoad. cat > getOrthoSeq.pl <<'_EOF_' #!/usr/bin/env perl # Dig up orthologous alleles and swizzle columns so the glommed name that # includes human position info etc. is first. It will be used as a key for # joining up multiple other-species' ortho data. Also swizzle columns so # that the remaining columns are in order of appearance in the final result, # snp128OrthoPanTro2RheMac2. Upcase ortho alleles for consistency w/dbSNP. use warnings; use strict; my $twoBitFName = shift @ARGV || die "usage: getOrthoSeq.pl orthoDb.2bit [file(s)]\n"; sub getOChrSeq($$) { # Slurp in fasta sequence using twoBitToFa. my ($twoBitFName, $oChr) = @_; open(P, "twoBitToFa -noMask $twoBitFName -seq=$oChr stdout |") || die "Can't open pipe from twoBitToFa $twoBitFName -seq=$oChr: $!\n";

<P> =~ /^>\w+/ || die "Doesn't look like we got fasta -- first line is this:\n$_"; # From man perlfaq5: trick to slurp entire contents: my $c = 0; my $seq = do { local $/; my $data = <P>

; $c = ($data =~ s/\n//g); $data; }; close(P); return $seq; } my %rc = ( "a" => "t", "c" => "g", "g" => "c", "t" => "a", "A" => "T", "C" => "G", "G" => "C", "T" => "A", ); sub revComp($) { # Reverse-complement fasta input. (Pass through non-agtc chars.) my ($seq) = @_; my $rcSeq = reverse $seq; for (my $i = 0; $i < length($rcSeq); $i++) { my $base = substr($rcSeq, $i, 1); my $cBase = $rc{$base} || $base; substr($rcSeq, $i, 1, $cBase); } return $rcSeq; } my $prevOChr; my ($oChrSeq, $oChrSize); while (<>) { chomp; my ($oChr, $oStart, $oEnd, $nameGlom, undef, $oStrand) = split; if (! defined $prevOChr || $oChr ne $prevOChr) { $oChrSeq = &getOChrSeq($twoBitFName, $oChr); $oChrSize = length($oChrSeq); } die "Coords out of range, input line $.: $oEnd > $oChr size $oChrSize\n\t" if ($oEnd > $oChrSize); my $oAllele = substr($oChrSeq, $oStart, $oEnd - $oStart); $oAllele = &revComp($oAllele) if ($oStrand eq "-"); print join("\t", $nameGlom, $oChr, $oStart, $oEnd, $oAllele, $oStrand) . "\n"; $prevOChr = $oChr; } '_EOF_' # << emacs chmod a+x getOrthoSeq.pl # Concatenate the chimp results, sorting by chimp pos in order to # efficiently access 2bit sequence in ./getOrthoSeq. The output of # that is then sorted by the glommed human info field, so that we # can use join to combine chimp and macaque results in the next step. sort -k1,1 -k2n,2n run.liftOChimp/out/panTro2.chunk*.bed \ | ./getOrthoSeq.pl /cluster/data/panTro2/panTro2.2bit \ | sort > panTro2.orthoGlom.txt # ditto for macaque: sort -k1,1 -k2n,2n run.liftOMac/out/rheMac2.chunk*.bed \ | ./getOrthoSeq.pl /cluster/data/rheMac2/rheMac2.2bit \ | sort > rheMac2.orthoGlom.txt # The whole pipeline takes ~4-6 minutes each. wc -l panTro2.orthoGlom.txt rheMac2.orthoGlom.txt # 8549323 panTro2.orthoGlom.txt # 7324851 rheMac2.orthoGlom.txt # Use the glommed name field as a key to join up chimp and macaque # allele data. Include glommed name from both files because if only # file 2 has a line for the key in 2.1, then 1.1 is empty. Then plop # in the orthoGlom files from each file, which are in the same order # as the chimp and macaque columns of snp128OrthoPanTro2RheMac2. join -o '1.1 2.1 1.2 1.3 1.4 1.5 1.6 2.2 2.3 2.4 2.5 2.6' \ -a 1 -a 2 -e 0 \ panTro2.orthoGlom.txt rheMac2.orthoGlom.txt \ | perl -wpe 'chomp; \ ($glom1, $glom2, $o1Chr, $o1Start, $o1End, $o1Al, $o1Strand, \ $o2Chr, $o2Start, $o2End, $o2Al, $o2Strand) = split; \ $glomKey = ($glom1 ne "0") ? $glom1 : $glom2; \ ($rsId, $hChr, $hStart, $hEnd, $hObs, $hAl, $hStrand) = \ split(/\|/, $glomKey); \ $o1Chr =~ s/^0$/?/; $o2Chr =~ s/^0$/?/; \ $o1Al =~ s/^0$/?/; $o2Al =~ s/^0$/?/; \ $o1Strand =~ s/^0$/?/; $o2Strand =~ s/^0$/?/; \ print join("\t", $hChr, $hStart, $hEnd, $rsId, $hObs, $hAl, $hStrand, \ $o1Chr, $o1Start, $o1End, $o1Al, $o1Strand, \ $o2Chr, $o2Start, $o2End, $o2Al, $o2Strand) . "\n"; \ s/^.*$//;' \ | sort -k1,1 -k2n,2n > snp128OrthoPanTro2RheMac2.bed # took ~5 minutes. 
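# A couple of quick spot-checks on the joined file before loading (a sketch):
# every row should have the full 17 columns, and no row should have "?" for
# both chimp and macaque (a row only exists when at least one species lifted).
awk -F"\t" 'NF != 17' snp128OrthoPanTro2RheMac2.bed | head
awk -F"\t" '$8 == "?" && $13 == "?"' snp128OrthoPanTro2RheMac2.bed | wc -l
# expect no output from the first check and 0 from the second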
wc -l snp128OrthoPanTro2RheMac2.bed #8770301 snp128OrthoPanTro2RheMac2.bed ssh hgwdev cd /cluster/data/hg18/bed/snp128Ortho sed -e 's/snpOrthoPanTroRheMac/snp128OrthoPanTro2RheMac2/' \ ~/kent/src/hg/lib/snpOrthoPanTroRheMac.sql \ > snp128OrthoPanTro2RheMac2.sql time nice hgLoadBed -tab -onServer -tmpDir=/data/tmp \ hg18 snp128OrthoPanTro2RheMac2 -sqlTable=snp128OrthoPanTro2RheMac2.sql \ snp128OrthoPanTro2RheMac2.bed #Loaded 8770301 elements of size 17 #52.659u 8.528s 5:18.68 19.1% 0+0k 0+0io 0pf+0w # Cleanup on fileserver: cd /cluster/data/hg18/bed/snp128Ortho nice gzip snp128Simple.bed snp128ExcludeIds.txt snp128ForLiftOver.bed rm -r run*/split *.orthoGlom.txt ####################################################################### # COMPARE SNP128 TO SNP126 (DONE 2/7/08 angie) # First, do a featureBits venn, on some machine other than hgwdev. # I can't find the file from which snp126 was loaded... but kkr5u00 # has an hg18snp126 database with a snp126 that is a few hours newer, # but apparently the same as, hgwdev's hg18.snp126... so use that # (had to add gap tables too): ssh kkr5u00 time featureBits hg18snp126 snp126 #12451939 bases of 2881515245 (0.432%) in intersection #57.274u 15.283s 1:20.56 90.0% 0+0k 0+0io 0pf+0w # Now make sure we have a file copy of snp126 in case we need it in # the future: hgsql hg18snp126 -NBe 'select * from snp126' \ | cut -f 2-18 \ > /cluster/data/dbSNP/126/human/snp126.bed rsync /cluster/data/dbSNP/128/human/snp128.bed /scratch/tmp/ time featureBits hg18 /scratch/tmp/snp128.bed #12387071 bases of 2881515245 (0.430%) in intersection #636.834u 47.039s 11:24.02 99.9% 0+0k 0+0io 0pf+0w # OK, db is a lot faster! # I am not worried about the drop -- spot-checking, I have seen some # dropped rsIds and some that used to have multiple mappings but now # have only one mapping -- an improvement. pushd /cluster/data/dbSNP/128/human hgLoadBed -tab -noSort -onServer -tmpDir=/scratch/tmp \ hg18snp126 snp128 -sqlTable=snp128.sql snp128.bed popd # How many covered bases in common? time featureBits hg18snp126 snp126 snp128 #11576806 bases of 2881515245 (0.402%) in intersection #114.365u 26.671s 3:15.55 72.1% 0+0k 0+0io 0pf+0w # Base coverage Venn counts: # snp126 snp128 !snp126 !snp128 # snp126 12451939 11576806 0 875133 # snp128 11576806 12387071 810265 0 # Do the same for SNPs (rs* records as opposed to bases): hgsql hg18snp126 -NBe 'select name from snp126' \ | sort -u > /scratch/tmp/1 hgsql hg18snp126 -NBe 'select name from snp128' \ | sort -u > /scratch/tmp/2 wc -l /scratch/tmp/[12] # 11647909 /scratch/tmp/1 # 11677826 /scratch/tmp/2 comm -12 /scratch/tmp/[12] | wc -l #11531282 cd /cluster/data/dbSNP/128/human comm -23 /scratch/tmp/[12] \ > /cluster/data/dbSNP/128/human/ids.inSnp126Not128.txt comm -13 /scratch/tmp/[12] \ > /cluster/data/dbSNP/128/human/ids.inSnp128Not126.txt # rsId Venn counts: # snp126 snp128 !snp126 !snp128 # snp126 11647909 11531282 0 116627 # snp128 11531282 11677826 146544 0 # Interesting that snp128 has more new rsIds but fewer new bases. # It has been 2 versions since 126... also, when spot-checking # exceptions I noticed that a lot of deletion SNPs used to be # mapped to the appropriate span in 126, but in 128 were mapped to # a single base and had some kind of range*tion locType... not an # improvement. But that kind of observation best falls out of an # examination of exception cases... and that is what will be # useful for us to report to NCBI. 
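# Sanity arithmetic on the Venn cells above (a sketch) -- each "only in one
# version" cell is just that version's total minus the intersection:
expr 12451939 - 11576806
#875133   bases only in snp126
expr 12387071 - 11576806
#810265   bases only in snp128
expr 11647909 - 11531282
#116627   rsIds only in snp126
expr 11677826 - 11531282
#146544   rsIds only in snp128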
############################################################################ # BLASTZ SELF chain minScore=2000 (DONE - 2007-12-19 - Hiram) ssh kkstore02 screen # use screen to manage this job mkdir /cluster/data/hg18/bed/blastzSelf.2007-12-17 cd /cluster/data/hg18/bed/blastzSelf.2007-12-17 cat << '_EOF_' > DEF # human vs human BLASTZ_M=400 # TARGET: Human Hg18 SEQ1_DIR=/san/sanvol1/scratch/hg18/selfNib SEQ1_LEN=/san/sanvol1/scratch/hg18/self.sizes SEQ1_CHUNK=10000000 SEQ1_LAP=10000 SEQ1_IN_CONTIGS=0 # QUERY: Human Hg18 SEQ2_DIR=/san/sanvol1/scratch/hg18/selfNib SEQ2_LEN=/san/sanvol1/scratch/hg18/self.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 SEQ2_IN_CONTIGS=0 BASE=/cluster/data/hg18/bed/blastzSelf.2006-01-17 TMPDIR=/scratch/tmp '_EOF_' # happy emacs cd /cluster/data/hg18/bed/blastzSelf.2007-12-17 time nice -n +19 $HOME/kent/src/hg/utils/automation/doBlastzChainNet.pl \ `pwd`/DEF -verbose=2 -chainMinScore=2000 -chainLinearGap=medium \ -stop=net -smallClusterHub=memk -bigClusterHub=pk > do.log 2>&1 & # real 640m37.637s ## crafted a special loadUp.csh to avoid haplotypes and randoms, # and load with normScore ssh hgwdev cd /cluster/data/hg18/bed/blastzSelf.2007-12-17/axtChain time nice -n +19 ./loadUp.csh >loadUp.out 2>&1 # real 24m51.669s cd /cluster/data/hg18/bed/blastzSelf.2007-12-17 time nice -n +19 featureBits hg18 chainSelf2KLink \ -noRandom -noHap > fb.hg18.chainSelf2KLink.txt 2>&1 & # real 11m30.010s cat fb.hg18.chainSelf2KLink.txt # 346885376 bases of 2858034764 (12.137%) in intersection time nice -n +19 $HOME/kent/src/hg/utils/automation/doBlastzChainNet.pl \ `pwd`/DEF -verbose=2 -chainMinScore=2000 -chainLinearGap=medium \ -continue=download \ -stop=download -smallClusterHub=memk -bigClusterHub=pk \ > download.log 2>&1 & ############################################################################ # RE-BUILD GAD TRACK (Done, 1/16/08, Fan) # During previous build, all.txt was corrupted during receiving file from # email. mkdir /cluster/store12/gad080116 rm /cluster/data/gad ln -s /cluster/store12/gad080116 /cluster/data/gad cd /cluster/data/gad # Receive "all.txt" from GAD # contact person: Garner, John (NIH/NIA/IRP) [F] [garnerjr@mail.nih.gov] hgsql hg18 -e 'drop table gadAll' hgsql hg18 <~/src/hg/lib/gadAll.sql hgsql hg18 -e 'load data local infile "all.txt" into table gadAll ignore 3 lines' # create gad table gadPos hg18 j18.tmp cat j18.tmp |sort -u >hg18.gad.tab # removed 1 record from hg18.gad.tab that has multiple words in geneSymbol # field. # use -nobin option to ensure display order is according to genomic position hgLoadBed -nobin hg18 gad hg18.gad.tab rm j18.tmp ####################################################################### # BLASTZ/CHAIN/NET Lamprey petMar1 (DONE - 2008-01-29 - Hiram) # with contigs for Lamprey ssh kkstore02 screen # use screen to control this job mkdir /cluster/data/hg18/bed/blastzPetMar1.2008-01-29 cd /cluster/data/hg18/bed/blastzPetMar1.2008-01-29 cat << '_EOF_' > DEF # Human vs. 
Lamprey # using the "close" genome alignment parameters # see also: http://genomewiki.ucsc.edu/index.php/Mm9_multiple_alignment BLASTZ_Y=3400 BLASTZ_L=6000 BLASTZ_K=2200 BLASTZ_M=50 BLASTZ_Q=/cluster/data/blastz/HoxD55.q # TARGET: Human - WindowMasker sequence SEQ1_DIR=/san/sanvol1/scratch/hg18/hg18.sdTrf.2bit SEQ1_LEN=/cluster/data/hg18/chrom.sizes SEQ1_CHUNK=10000000 SEQ1_LAP=10000 SEQ1_LIMIT=1 # QUERY: Lamprey petMar1 SEQ2_DIR=/cluster/bluearc/scratch/data/petMar1/petMar1.2bit SEQ2_LEN=/cluster/data/petMar1/chrom.sizes SEQ2_CHUNK=20000000 SEQ2_LIMIT=300 SEQ2_LAP=0 BASE=/cluster/data/hg18/bed/blastzPetMar1.2008-01-29 TMPDIR=/scratch/tmp '_EOF_' # << this line keeps emacs coloring happy time doBlastzChainNet.pl -verbose=2 `pwd`/DEF \ -chainMinScore=5000 -chainLinearGap=loose \ -tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \ -bigClusterHub=pk > do.log 2>&1 & # real 414m33.533s cat fb.hg18.chainPetMar1Link.txt # 36042598 bases of 2881515245 (1.251%) in intersection # That is OK, now for the swap: mkdir /cluster/data/petMar1/bed/blastz.hg18.swap cd /cluster/data/petMar1/bed/blastz.hg18.swap time doBlastzChainNet.pl -verbose=2 -swap \ /cluster/data/hg18/bed/blastzPetMar1.2008-01-29/DEF \ -chainMinScore=5000 -chainLinearGap=loose \ -tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \ -bigClusterHub=pk > swap.log 2>&1 & # real 60m1.928s cat fb.petMar1.chainHg18Link.txt # 26751073 bases of 831696438 (3.216%) in intersection ####################################################################### ################### # Build recip-best alignments with calJac1 (DONE 2008-01-25 braney) cd /cluster/data/hg18/bed ln -s blastz.calJac1.2007-10-07 blastz.calJac1 cd blastz.calJac1 screen /cluster/bin/scripts/doRecipBest.pl hg18 calJac1 ################### # Build syntenic net for orang (DONE 2008-01-25 braney) cd /cluster/data/hg18/bed/blastz.ponAbe2 screen /cluster/bin/scripts/doBlastzChainNet.pl -syntenicNet -continue syntenicNet -stop syntenicNet `pwd`/DEF 2>&1 | tee syntenic.out ######################################################################### ## Primate Multiz (Working ## ssh hgwdev mkdir /cluster/data/hg18/bed/multizPrimate cd /cluster/data/hg18/bed/multizPrimate # take the 30-way tree from mm9 and eliminate genomes not in # this alignment # rearrange to get hg18 on the top of the graph # paste this tree into the on-line phyloGif tool: # http://genome.ucsc.edu/cgi-bin/phyloGif # to create the image for the tree diagram /cluster/bin/phast/tree_doctor --prune-all-but Human_hg18,Mouse_mm9,Chimp_panTro2,Orangutan_ponAbe2,Rhesus_rheMac2,Marmoset_calJac1,Bushbaby_otoGar1,TreeShrew_tupBel1,Rat_rn4,Dog_canFam2 /cluster/data/mm9/bed/multiz30way/mm9OnTop.fullNames.nh > primate.fullNames.nh # looks something like this: (((Mouse_mm9:0.076274,Rat_rn4:0.084383):0.249544,((((((Human_hg18:0.005873,Chimp _panTro2:0.007668):0.013037,Orangutan_ponAbe2:0.020000):0.013037,Rhesus_rheMac2: 0.031973):0.036500,Marmoset_calJac1:0.070000):0.036500,Bushbaby_otoGar1:0.151185 ):0.015682,TreeShrew_tupBel1:0.162844):0.006272):0.019763,Dog_canFam2:0.187963); # rearrange to get human at the top: # this leaves us with: cat << _EOF_ > hg18.primate.nh ((((((((Human_hg18:0.005873,Chimp_panTro2:0.007668):0.013037,Orangutan_ponAbe2:0.020000):0.013037,Rhesus_rheMac2:0.031973):0.036500,Marmoset_calJac1:0.070000):0.036500,Bushbaby_otoGar1:0.151185):0.015682,TreeShrew_tupBel1:0.162844):0.006272,(Mouse_mm9:0.076274,Rat_rn4:0.084383):0.249544):0.019763,Dog_canFam2:0.187963); _EOF_ # << happy emacs # create a species list 
from that file: sed -e 's/[()]//g; s/ /\n/g; s/,/\n/g' hg18.primate.nh \ | sed -e "s/[ \t]*//g; /^[ \t]$/d; /^$/d" | sort -u \ | sed -e "s/.*_//; s/:.*//" | sort > species.list # create a stripped down nh file for use in autoMZ run echo \ `sed 's/[a-zA-Z0-9]*_//g; s/:0.[0-9]*//g; s/[,;]/ /g' hg18.primate.nh \ | sed -e "s/ / /g"` > tree.primate.nh # that looks like, as a single line: # ((((((((hg18 panTro2) ponAbe2) rheMac2) calJac1) otoGar1) tupBel1) (mm9 rn4)) canFam2) # verify all blastz's exists cat << '_EOF_' > listMafs.csh #!/bin/csh -fe cd /cluster/data/hg18/bed/multizPrimate foreach db (`cat species.list`) set bdir = /cluster/data/hg18/bed/blastz.$db if (-e $bdir/mafRBestNet/chr1.maf.gz) then echo "$db mafRBestNet" else if (-e $bdir/mafSynNet/chr1.maf.gz) then echo "$db mafSynNet" else if (-e $bdir/mafNet/chr1.maf.gz) then echo "$db mafNet" else echo "$db mafs not found" endif end '_EOF_' # << happy emacs chmod +x ./listMafs.csh # see what it says, the "mafs not found" should only show up on hg18 ./listMafs.csh # calJac1 mafRBestNet # canFam2 mafSynNet # hg18 mafNet # mm9 mafSynNet # otoGar1 mafRBestNet # panTro2 mafSynNet # ponAbe2 mafSynNet # rheMac2 mafSynNet # rn4 mafSynNet # tupBel1 mafRBestNet /cluster/bin/phast/all_dists hg18.primate.nh > Primate.distances.txt grep -i hg18 Primate.distances.txt | sort -k3,3n # Human_hg18 Chimp_panTro2 0.013541 # Human_hg18 Orangutan_ponAbe2 0.038910 # Human_hg18 Rhesus_rheMac2 0.063920 # Human_hg18 Marmoset_calJac1 0.138447 # Human_hg18 Bushbaby_otoGar1 0.256132 # Human_hg18 TreeShrew_tupBel1 0.283473 # Human_hg18 Dog_canFam2 0.334627 # Human_hg18 Mouse_mm9 0.452719 # Human_hg18 Rat_rn4 0.460828 # copy net mafs to cluster-friendly storage, splitting chroms # into 50MB chunks to improve run-time # NOTE: splitting will be different for scaffold-based reference asemblies ssh hgwdev mkdir /cluster/data/hg18/bed/multizPrimate/run.split cd /cluster/data/hg18/bed/multizPrimate/run.split # this works by examining the rmsk table for likely repeat areas # that won't be used in blastz mafSplitPos hg18 50 mafSplit.bed ssh kki cd /cluster/data/hg18/bed/multizPrimate/run.split cat << '_EOF_' > doSplit.csh #!/bin/csh -ef set targDb = "hg18" set db = $1 set sdir = /san/sanvol1/scratch/$targDb/BRsplitStrictMafNet mkdir -p $sdir if (-e $sdir/$db) then echo "directory $sdir/$db already exists -- remove and retry" exit 1 endif set bdir = /cluster/data/$targDb/bed/blastz.$db if (! -e $bdir) then echo "directory $bdir not found" exit 1 endif mkdir -p $sdir/$db if (-e $bdir/mafRBestNet) then set mdir = $bdir/mafRBestNet else if (-e $bdir/mafSynNet) then set mdir = $bdir/mafSynNet else if (-e $bdir/mafNet) then set mdir = $bdir/mafNet else echo "$bdir maf dir not found" exit 1 endif echo $mdir foreach f ($mdir/*) set c = $f:t:r:r echo " $c" nice mafSplit mafSplit.bed $sdir/$db/ $f end echo "gzipping $sdir/$db mafs" nice gzip $sdir/$db/* endif echo $mdir > $db.done '_EOF_' # << happy emacs chmod +x doSplit.csh grep -v hg18 ../species.list > split.list cat << '_EOF_' > template #LOOP doSplit.csh $(path1) {check out line+ $(path1).done} #ENDLOOP '_EOF_' gensub2 split.list single template jobList para create jobList # start these gently, this is a good load on the san filesystem para -maxPush=3 push # wait a while, verify these are running OK para push # let that run to a couple completions, a few minutes, then again: para try # etc ... 
# Completed: 9 of 9 jobs # CPU time in finished jobs: 9090s 151.50m 2.52h 0.11d 0.000 y # IO & Wait Time: 3093s 51.55m 0.86h 0.04d 0.000 y # Average job time: 1354s 22.56m 0.38h 0.02d # Longest finished job: 2134s 35.57m 0.59h 0.02d # Submission to last job: 2153s 35.88m 0.60h 0.02d # ready for the multiz run ssh pk cd /cluster/data/hg18/bed/multizPrimate # actually, the result directory here should be maf.split instead of maf mkdir -p maf run cd run mkdir penn # use latest penn utilities P=/cluster/bin/penn/multiz.v11.2007-03-19/multiz-tba cp -p $P/{autoMZ,multiz,maf_project} penn # list chrom chunks, any db dir will do; better would be for the # splitter to generate this file # We temporarily use __ instead of . to delimit chunk in filename # so we can use $(root) to get basename find /san/sanvol1/scratch/hg18/BRsplitStrictMafNet -type f \ | while read F; do basename $F; done \ | sed -e 's/.maf.gz//' -e 's/\./__/' | sort -u > chromChunks.list wc -l chromChunks.list # 93 chromChunks.list cat > autoMultiz.csh << '_EOF_' #!/bin/csh -ef set db = hg18 set c = $1 set maf = $2 set run = `pwd` set tmp = /scratch/tmp/$db/multiz.$c set pairs = /san/sanvol1/scratch/$db/BRsplitStrictMafNet rm -fr $tmp mkdir -p $tmp cp ../tree.primate.nh ../species.list $tmp pushd $tmp foreach s (`cat species.list`) set c2 = `echo $c | sed 's/__/./'` set in = $pairs/$s/$c2.maf set out = $db.$s.sing.maf if ($s == hg18) then continue endif if (-e $in.gz) then zcat $in.gz > $out else if (-e $in) then cp $in $out else echo "##maf version=1 scoring=autoMZ" > $out endif end set path = ($run/penn $path); rehash $run/penn/autoMZ + T=$tmp E=$db "`cat tree.primate.nh`" $db.*.sing.maf $c.maf popd cp $tmp/$c.maf $maf rm -fr $tmp '_EOF_' # << happy emacs chmod +x autoMultiz.csh cat << '_EOF_' > template #LOOP ./autoMultiz.csh $(root1) {check out line+ /cluster/data/hg18/bed/multizPrimate/maf/$(root1).maf} #ENDLOOP '_EOF_' # << emacs gensub2 chromChunks.list single template jobList para create jobList # Completed: 93 of 93 jobs # CPU time in finished jobs: 302126s 5035.43m 83.92h 3.50d 0.010 y # IO & Wait Time: 3499s 58.32m 0.97h 0.04d 0.000 y # Average job time: 3286s 54.77m 0.91h 0.04d # Longest finished job: 6972s 116.20m 1.94h 0.08d # Submission to last job: 7052s 117.53m 1.96h 0.08d # put the split maf results back together into single chroms ssh kkstore02 cd /cluster/data/hg18/bed/multizPrimate # here is where the result directory maf should have already been maf.split mv maf maf.split mkdir maf # going to sort out the redundant header garbage to leave a cleaner maf for C in `ls maf.split | sed -e "s#__.*##" | sort -u` do echo ${C} head -q -n 1 maf.split/${C}__*.maf | sort -u > maf/${C}.maf grep -h "^#" maf.split/${C}__*.maf | egrep -v "maf version=1|eof maf" | \ sed -e "s#_MZ_[^ ]* # #g; s#__[0-9]##g" | sort -u >> maf/${C}.maf grep -h -v "^#" maf.split/${C}__*.maf >> maf/${C}.maf tail -q -n 1 maf.split/${C}__*.maf | sort -u >> maf/${C}.maf done # load tables for a look ssh hgwdev mkdir -p /gbdb/hg18/multizPrimate/maf ln -s /cluster/data/hg18/bed/multizPrimate/maf/*.maf \ /gbdb/hg18/multizPrimate/maf # this generates a large 1 Gb multizPrimate.tab file in the directory # where it is running. Best to run this over in scratch. 
cd /scratch/tmp time nice -n +19 hgLoadMaf \ -pathPrefix=/gbdb/hg18/multizPrimate/maf hg18 multizPrimate # Loaded 12531777 mafs in 49 files from /gbdb/hg18/multizPrimate/maf # real 8m44.516s # load summary table time nice -n +19 cat /gbdb/hg18/multizPrimate/maf/*.maf \ | hgLoadMafSummary hg18 -minSize=30000 -mergeGap=1500 \ -maxSize=200000 multizPrimateSummary stdin # Created 1417364 summary blocks from 29928557 components # and 6981421 mafs from stdin # real 21m35.057s # Gap Annotation # prepare bed files with gap info ssh kkstore02 mkdir /cluster/data/hg18/bed/multizPrimate/anno cd /cluster/data/hg18/bed/multizPrimate/anno mkdir maf run # these actually already all exist from previous multiple alignments for DB in `cat ../species.list` do CDIR="/cluster/data/${DB}" if [ ! -f ${CDIR}/${DB}.N.bed ]; then echo "creating ${DB}.N.bed" echo twoBitInfo -nBed ${CDIR}/${DB}.2bit ${CDIR}/${DB}.N.bed else ls -og ${CDIR}/${DB}.N.bed fi done cd run rm -f nBeds sizes for DB in `grep -v hg18 ../../species.list` do echo "${DB} " ln -s /cluster/data/${DB}/${DB}.N.bed ${DB}.bed echo ${DB}.bed >> nBeds ln -s /cluster/data/${DB}/chrom.sizes ${DB}.len echo ${DB}.len >> sizes done ssh kki cd /cluster/data/hg18/bed/multizPrimate/anno/run cat << '_EOF_' > doAnno.csh #!/bin/csh -ef set dir = /cluster/data/hg18/bed/multizPrimate set c = $1 cat $dir/maf/${c}.maf | \ nice mafAddIRows -nBeds=nBeds stdin /cluster/data/hg18/hg18.2bit $2 '_EOF_' # << happy emacs chmod +x doAnno.csh cat << '_EOF_' > template #LOOP ./doAnno.csh $(root1) {check out line+ /cluster/data/hg18/bed/multizPrimate/anno/maf/$(root1).maf} #ENDLOOP '_EOF_' # << happy emacs cut -f1 /cluster/data/hg18/chrom.sizes > chrom.list gensub2 chrom.list single template jobList para create jobList para try ... check ... push ... etc. 
# Completed: 49 of 49 jobs # CPU time in finished jobs: 10782s 179.71m 3.00h 0.12d 0.000 y # IO & Wait Time: 3380s 56.33m 0.94h 0.04d 0.000 y # Average job time: 289s 4.82m 0.08h 0.00d # Longest finished job: 751s 12.52m 0.21h 0.01d # Submission to last job: 1479s 24.65m 0.41h 0.02d ssh hgwdev cd /cluster/data/hg18/bed/multizPrimate/anno mkdir -p /gbdb/hg18/multizPrimate/anno/maf ln -s /cluster/data/hg18/bed/multizPrimate/anno/maf/*.maf \ /gbdb/hg18/multizPrimate/anno/maf # by loading this into the table multizPrimate, it will replace the # previously loaded table with the unannotated mafs # huge temp files are made, do them on local disk cd /scratch/tmp time nice -n +19 hgLoadMaf -pathPrefix=/gbdb/hg18/multizPrimate/anno/maf \ hg18 multizPrimate # Loaded 7331265 mafs in 55 files from /gbdb/hg18/multizPrimate/anno/maf # real 8m31.092s cat /cluster/data/hg18/chrom.sizes | \ awk '{if ($2 > 1000000) { print $1 }}' | while read C do echo /gbdb/hg18/multizPrimate/anno/maf/$C.maf done | xargs cat | \ hgLoadMafSummary hg18 -minSize=30000 -mergeGap=1500 \ -maxSize=200000 multizPrimateSummary stdin # Created 1621960 summary blocks from 75794119 components and 12601786 # mafs from stdin # remove the multizPrimate*.tab files in this /scratch/tmp directory rm multizPrimate* ####### ################################################################################ # RE-SEQUENCING TRACE DOWNLOAD (DONE 2008-01-25, Andy) ssh kolossus bash cd /san/sanVol1/scratch/andy mkdir traces cd traces/ cat < "EOF" > getOldTraces.sh #!/bin/bash echo Retrieving sequences before Jan 2008 echo Starting at `date` # Query the database and figure out the total number of pages needed count=`./query_tracedb "query count species_code='HOMO SAPIENS' and strategy='Re-Sequencing' and load_date<'1/1/2008'"` pages=$(( (count/40000) + ((count % 40000) > 0) )) echo echo Total of $count sequences and $pages pages to retrieve echo for ((page=0; page < pages; page++)); do pagenum=`printf "%03d" $((page+1))` ./query_tracedb "query page_size 40000 page_number $page binary species_code='HOMO SAPIENS' and strategy='Re-Sequencing' and load_date<'1/1/2008'" > page.bin echo -n "Retrieving page $((page+1)) of $pages compressed fasta... " (echo -n "retrieve_gz fasta 0b"; cat page.bin ) | ./query_tracedb > page-${pagenum}.fa.gz echo "done at `date +%T`" echo -n "Retrieving page $((page+1)) of $pages compressed quality file... " (echo -n "retrieve_gz quality 0b"; cat page.bin ) | ./query_tracedb > page-${pagenum}.qa.gz echo "done at `date +%T`" echo -n "Retrieving page $((page+1)) of $pages xml file... " (echo -n "retrieve xml_info 0b"; cat page.bin ) | ./query_tracedb > page-${pagenum}.xml gzip page-${pagenum}.xml echo "done at `date +%T`" rm page.bin done echo echo All done at `date`! EOF chmod +x getOldTraces.sh screen ./getOldTraces.sh > download.log # detach screen # tail -f download.log #Retrieving sequences before Jan 2008 #Starting at Wed Jan 23 11:47:04 PST 2008 # #Total of 13978657 sequences and 350 pages to retrieve # #Retrieving page 1 of 350 compressed fasta... done at 11:48:40 #Retrieving page 1 of 350 compressed quality file... done at 11:49:10 #Retrieving page 1 of 350 xml file... done at 11:51:05 #Retrieving page 2 of 350 compressed fasta... done at 11:52:40 #Retrieving page 2 of 350 compressed quality file... done at 11:53:10 # ... #Retrieving page 350 of 350 compressed quality file... done at 07:07:08 #Retrieving page 350 of 350 xml file... done at 07:08:16 # #All done at Fri Jan 25 07:08:16 PST 2008! 
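# Before aligning, check that all 350 pages really arrived in triplicate and
# that the compressed files are intact (a sketch):
ls page-???.fa.gz page-???.qa.gz page-???.xml.gz | wc -l
#1050   (350 pages x 3 files each)
gzip -t page-*.gz
# silent when everything is intact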
################################################################################ # RE-SEQUENCING TRACE ALIGNMENT TO HG18 (DONE 2008-01-31, Andy) ssh kkr12u22 cd /san/sanVol1/scratch/andy/traces mkdir run cd run/ ls -1 /scratch/hg/hg18/nib/* | grep -v hap > nib.lst ls -1 /san/sanVol1/scratch/andy/traces/page-*.fa.gz > traces.lst cat < "EOF" > gsub #LOOP ./doBlat.sh {check in exists $(path1)} $(path2) {check out line+ $(root2)/$(root1).$(root2).maf} #ENDLOOP cat < "EOF" > doBlat.sh #!/bin/bash thisDir=`pwd -P` fa=`basename $1` nib=$2 f=${fa%.fa.gz} n=`basename $2` n=${n%.nib} name=${f}.${n} out=${name}.maf mkdir -p /scratch/tmp/andy/$name mkdir -p $n pushd /scratch/tmp/andy/$name cp $1 . blat -minMatch=12 -ooc=/scratch/hg/hg18/11.ooc -out=maf $nib $fa $out cp $out ${thisDir}/$n popd rm -rf /scratch/tmp/andy/$name EOF chmod +x doBlat.sh ssh pk cd /san/sanVol1/scratch/andy/traces/run gensub2 traces.lst nib.lst gsub spec sed 's/\.fa\.c/.c/' spec > tmp; mv tmp spec para create spec para try, push, check para time #15750 jobs in batch #100 jobs (including everybody's) in Parasol queue. #Checking finished jobs #Completed: 15750 of 15750 jobs #CPU time in finished jobs: 385991s 6433.19m 107.22h 4.47d 0.012 y #IO & Wait Time: 47866s 797.76m 13.30h 0.55d 0.002 y #Average job time: 28s 0.46m 0.01h 0.00d #Longest running job: 0s 0.00m 0.00h 0.00d #Longest finished job: 186s 3.10m 0.05h 0.00d #Submission to last job: 1551s 25.85m 0.43h 0.02d # Cat all the alignments ssh hgwdev cd /san/sanVol1/scratch/andy/traces/run head -n1 chrY/page-112.chrY.maf > maf.header for ((i=0; i < 350; i++)); do echo page $((i+1)) pagenum=`printf "%03d" $((i+1))` prefix=page-$pagenum newfile=cat/${prefix}.maf cp maf.header $newfile for f in `find . -name "${prefix}*"`; do tail +2 $f | sed 's/gnl|ti|//' >> $newfile done done ############################################################################ # Reload CCDS (2008-02-01 markd) # import ccds database as described in ccds.txt set db=hg18 # create and load ccdsGene and ccdsInfo tables from imported database /cluster/data/genbank/bin/x86_64/ccdsMkTables -loadDb ccds ${db} ccdsInfo ccdsGene # ccdsKgMap /cluster/data/genbank/bin/x86_64/mkCcdsGeneMap -db=${db} -loadDb ccdsGene knownGene ccdsKgMap checkTableCoords ${db} -verbose=2 ccdsGene # update all.jointer to include ${db} in ccdsDb joinerCheck -database=${db} -identifier=ccdsGeneId ~/compbio/genbank/kent/src/hg/makeDb/schema/all.joiner # request push of ccdsGene ccdsInfo ccdsKgMap # << emacs ############################################################################# # phastCons multizPrimage ## (DONE - 2008-02-11 braney ) # split mafs into 10M chunks and generate sufficient statistics # files for # phastCons ssh kki mkdir /cluster/data/hg18/bed/multizPrimate/msa.split mkdir -p /san/sanvol1/scratch/hg18/multizPrimate/cons/ss cd /cluster/data/hg18/bed/multizPrimate # just use primates cat << '_EOF_' > primates.list hg18 panTro2 ponAbe2 rheMac2 calJac1 otoGar1 '_EOF_' cd /cluster/data/hg18/bed/multizPrimate/msa.split zcat /san/sanvol1/braney/multizPrimate/chr1.maf.gz | \ perl -wpe 's/^s ([^.]+\.[^. 
]+)\.\S+/s $1/' | \ mafOrder stdin /cluster/data/hg18/bed/multizPrimate/primates.list chr1.maf twoBitToFa -seq=chr1 /scratch/data/hg18/hg18.2bit chr1.fa /cluster/bin/phast/$MACHTYPE/msa_split chr1.maf -i MAF -M chr1.fa \ -o SS -r chr1 -w 300000000,0 -I 1000 -B 5000 time nice -n +19 /cluster/bin/phast.2007-05-04/phyloFit -i SS \ chr1.1-247249719.ss --tree \ "(((((hg18,panTro2),ponAbe2),rheMac2),calJac1),otoGar1)" \ --out-root starting-tree rm chr1.maf chr1.fa chr1.1-247249719.ss mkdir -p /san/sanvol1/scratch/hg18/multizPrimate/cons/estimate cp msa.split/starting-tree.mod /san/sanvol1/scratch/hg18/multizPrimate/cons/estimate cat << '_EOF_' > doSplit.csh #!/bin/csh -ef set MAFS = /san/sanvol1/braney/multizPrimate set WINDOWS = /san/sanvol1/scratch/hg18/multizPrimate/cons/ss pushd $WINDOWS set c = $1 rm -fr $c mkdir $c twoBitToFa -seq=$c /scratch/data/hg18/hg18.2bit /scratch/tmp/hg18.$c.fa set TMP = /scratch/BR.$c.maf zcat $MAFS/$c.maf.gz | perl -wpe 's/^s ([^.]+\.[^. ]+)\.\S+/s $1/' | \ mafOrder stdin /cluster/data/hg18/bed/multizPrimate/primates.list $TMP /cluster/bin/phast/$MACHTYPE/msa_split $TMP \ -i MAF \ -M /scratch/tmp/hg18.$c.fa \ -o SS -r $c/$c -w 10000000,0 -I 1000 -B 5000 rm -f $TMP /scratch/tmp/hg18.$c.fa popd date >> $c.done '_EOF_' # << happy emacs chmod +x doSplit.csh cat << '_EOF_' > template #LOOP doSplit.csh $(root1) {check out line+ $(root1).done} #ENDLOOP '_EOF_' # << happy emacs # do the easy ones first to see some immediate results ls -1S -r ../anno/maf | sed -e "s/.maf//" > maf.list gensub2 maf.list single template jobList para create jobList para try ... check ... etc # Completed: 49 of 49 jobs # CPU time in finished jobs: 3520s 58.66m 0.98h 0.04d 0.000 y # IO & Wait Time: 1200s 20.00m 0.33h 0.01d 0.000 y # Average job time: 96s 1.61m 0.03h 0.00d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 464s 7.73m 0.13h 0.01d # Submission to last job: 723s 12.05m 0.20h 0.01d # XXXX Estimates were attempted, not really very useful, instead, as seen # below, merely take the cons and noncons trees from the mouse 30-way # Estimate phastCons parameters # see also: # http://compgen.bscb.cornell.edu/~acs/phastCons-HOWTO.html # Create a list of .ss files over 3,000,000 in length # this is almost everything cd /san/sanvol1/scratch/hg18/multizPrimate/cons/ss ls -1l chr*/chr*.ss | egrep -v "_hap|chrUn|random" | \ awk '$5 > 3000000 {print $9;}' > ../tuningRun.list # Set up parasol directory to calculate trees on these 50 regions ssh pk mkdir /cluster/data/hg18/bed/multizPrimate/treeRun2 cd /cluster/data/hg18/bed/multizPrimate/treeRun2 mkdir tree log most # Tuning this loop should come back to here to recalculate # Create script that calls phastCons with right arguments cat > makeTree.csh << '_EOF_' #!/bin/csh -fe set SAN="/san/sanvol1/scratch/hg18/multizPrimate/cons" set SS=$1 set C=$1:h set F=$1:t set tmpDir="/scratch/tmp/pA2_$2" rm -fr $tmpDir mkdir $tmpDir mkdir -p log/${C} tree/${C} most/${C} cp -p $SAN/ss/$1 $tmpDir/$F cp -p $SAN/estimate/starting-tree.mod $tmpDir pushd $tmpDir /cluster/bin/phast/$MACHTYPE/phastCons $F starting-tree.mod \ --gc 0.355 --nrates 1,1 --no-post-probs --ignore-missing \ --expected-length 45 --target-coverage 0.3 --most-conserved $F.most \ --quiet --log $F.log --estimate-trees $F.tree popd cp -p $tmpDir/$F.log log/$C cp -p $tmpDir/$F.most most/$C cp -p $tmpDir/$F.tree.*cons.mod tree/$C rm -fr $tmpDir '_EOF_' # << happy emacs chmod a+x makeTree.csh # Create gensub file cat > template << '_EOF_' #LOOP makeTree.csh $(path1) $(num1) #ENDLOOP 
'_EOF_' # << happy emacs # Make cluster job and run it scp -p braney@pk:/san/sanvol1/scratch/hg18/multizPrimate/cons/tuningRun.list . gensub2 tuningRun.list single template jobList para create jobList para try/push/check/etc # Completed: 310 of 310 jobs # CPU time in finished jobs: 226767s 3779.45m 62.99h 2.62d 0.007 y # IO & Wait Time: 1224s 20.40m 0.34h 0.01d 0.000 y # Average job time: 735s 12.26m 0.20h 0.01d # Longest finished job: 908s 15.13m 0.25h 0.01d # Submission to last job: 4948s 82.47m 1.37h 0.06d # Now combine parameter estimates. We can average the .mod files # using phyloBoot. This must be done separately for the conserved # and nonconserved models ls -1 tree/chr*/*.cons.mod > cons.list /cluster/bin/phast/$MACHTYPE/phyloBoot --read-mods '*cons.list' \ --output-average ave.cons.mod > cons_summary.txt ls -1 tree/chr*/*.noncons.mod > noncons.list /cluster/bin/phast/$MACHTYPE/phyloBoot --read-mods '*noncons.list' \ --output-average ave.noncons.mod > noncons_summary.txt sort -k1,1 -k2,2n most/chr*/*.most > mostConserved.bed wc -l mostConserved.bed # 1192414 mostConserved.bed # measuring entropy # consEntopy # ave.cons.mod ave.noncons.mod --NH 9.78 /cluster/bin/phast/$MACHTYPE/consEntropy .3 45 \ ave.cons.mod ave.noncons.mod # Transition parameters: gamma=0.300000, omega=45.000000, mu=0.022222, # nu=0.009524 # Relative entropy: H=0.141789 bits/site # Expected min. length: L_min=98.721504 sites # Expected max. length: L_max=62.917932 sites # Phylogenetic information threshold: PIT=L_min*H=13.997639 bits ssh hgwdev featureBits -noRandom -noHap hg18 `pwd`/mostConserved.bed # 372348946 bases of 2858034764 (13.028%) in intersection ssh hgwdev featureBits -noRandom -noHap -enrichment hg18 genscan:cds \ `pwd`/mostConserved.bed # genscan:cds 1.927%, # mostConserved.bed 13.028%, # both 0.300%, cover 15.57%, enrich 1.20x # Estimates could be made, but more correctly, take the 30-way # .mod file, and re-use it here. ssh hgwdev cd /cluster/data/hg18/bed/multizPrimate # cp -p /cluster/data/mm9/bed/multiz30way/mm9.30way.mod . # add up the C and G: grep BACKGROUND treeRun2/ave.noncons.mod | awk '{printf "%0.3f\n", $3 + $4;}' # 0.355 # This 0.355 is used in the --gc argument below # Run phastCons # This job is I/O intensive in its output files, thus it is all # working over in /scratch/tmp/ ssh pk mkdir -p /cluster/data/hg18/bed/multizPrimate/cons/run.cons cd /cluster/data/hg18/bed/multizPrimate/cons/run.cons # there are going to be several different phastCons runs using # this same script. They trigger off of the current working directory # $cwd:t which is the "grp" in this script. It is one of: # all gliers placentals cat << '_EOF_' > doPhast.csh #!/bin/csh -fe set PHASTBIN = /cluster/bin/phast.2007-05-04 set c = $1 set f = $2 set len = $3 set cov = $4 set rho = $5 set grp = $cwd:t set tmp = /scratch/tmp/$f set cons = /cluster/data/hg18/bed/multizPrimate/cons mkdir -p $tmp set san = /san/sanvol1/scratch/hg18/multizPrimate/cons cp -p $cons/$grp/*.mod . 
cp -p $san/ss/$c/$f.ss $cons/$grp/*.mod $tmp pushd $tmp > /dev/null $PHASTBIN/phastCons $f.ss ave.cons.mod,ave.noncons.mod \ --expected-length $len --target-coverage $cov --quiet \ --seqname $c --idpref $c --most-conserved $f.bed --score > $f.pp # $PHASTBIN/phastCons $f.ss $grp.mod \ # --rho $rho --expected-length $len --target-coverage $cov --quiet \ # --seqname $c --idpref $c --most-conserved $f.bed --score > $f.pp endif popd > /dev/null mkdir -p $san/$grp/pp/$c $san/$grp/bed/$c sleep 4 touch $san/$grp/pp/$c $san/$grp/bed/$c rm -f $san/$grp/pp/$c/$f.pp rm -f $san/$grp/bed/$c/$f.bed mv $tmp/$f.pp $san/$grp/pp/$c mv $tmp/$f.bed $san/$grp/bed/$c rm -fr $tmp '_EOF_' # << happy emacs chmod a+x doPhast.csh cat << '_EOF_' > template #LOOP ../doPhast.csh $(root1) $(file1) 45 .3 .31 {check out line+ /san/sanvol1/scratch/hg18/multizPrimate/cons/all/bed/$(root1)/$(file1).bed} #ENDLOOP '_EOF_' # << happy emacs # Create parasol batch and run it pushd /san/sanvol1/scratch/hg18/multizPrimate/cons ls -1 ss/chr*/chr*.ss | sed 's/.ss$//' > \ /cluster/data/hg18/bed/multizPrimate/cons/ss.list popd # run for all species cd .. mkdir -p all run.cons/all cd all # /cluster/bin/phast.new/tree_doctor ../../mm9.30way.mod \ # --prune-all-but=hg18,hg18,panTro2,rheMac2,calJac1,mm9,monDom4,ornAna1 \ # > all.mod cd ../run.cons/all # root1 == chrom name, file1 == ss file name without .ss suffix # Create template file for "all" run cat << '_EOF_' > template #LOOP ../doPhast.csh $(root1) $(file1) 45 .3 .31 {check out line+ /san/sanvol1/scratch/hg18/multizPrimate/cons/all/bed/$(root1)/$(file1).bed} #ENDLOOP '_EOF_' # << happy emacs gensub2 ../../ss.list single template jobList para create jobList para try ... check ... push ... etc. # crashed jobs are OK methinks since we're checking output in # bed file instead of pp file # Completed: 332 of 337 jobs # Crashed: 5 jobs # CPU time in finished jobs: 11572s 192.86m 3.21h 0.13d 0.000 y # IO & Wait Time: 3189s 53.15m 0.89h 0.04d 0.000 y # Average job time: 44s 0.74m 0.01h 0.00d # Longest finished job: 60s 1.00m 0.02h 0.00d # Submission to last job: 564s 9.40m 0.16h 0.01d # create Most Conserved track ssh kolossus cd /san/sanvol1/scratch/hg18/multizPrimate/cons/all time nice -n +19 cat bed/*/chr*.bed | sort -k1,1 -k2,2n | \ awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", $1, $2, $3, $5, $5;}' | \ /cluster/bin/scripts/lodToBedScore /dev/stdin > mostConserved.bed # ~ 1 minute cp -p mostConserved.bed /cluster/data/hg18/bed/multizPrimate/cons/all # load into database ssh hgwdev cd /cluster/data/hg18/bed/multizPrimate/cons/all time nice -n +19 hgLoadBed hg18 phastConsElementsPrimate mostConserved.bed # Loaded 1431934 elements of size 5 # Try for 5% overall cov, and 70% CDS cov featureBits hg18 phastConsElementsPrimate # 460640890 bases of 2881515245 (15.986%) in intersection # Create merged posterier probability file and wiggle track data files # currently doesn't matter where this is performed, the san is the same # network distance from all machines. # sort by chromName, chromStart so that items are in numerical order # for wigEncode cd /san/sanvol1/scratch/hg18/multizPrimate/cons/all cat << '_EOF_' > gzipAscii.sh #!/bin/sh TOP=`pwd` export TOP mkdir -p phastConsPrimateScores for D in pp/chr* do C=${D/pp\/} out=phastConsPrimateScores/${C}.data.gz echo "${D} > ${C}.data.gz" ls $D/*.pp | sort -n -t\. 
-k2 | xargs cat | \ gzip > ${out} done '_EOF_' # << happy emacs chmod +x gzipAscii.sh time nice -n +19 ./gzipAscii.sh # real 47m46.099s # copy the phastCons8wayScores to: # /cluster/data/hg18/bed/multizPrimate/downloads/phastCons8way/phastConsScores # for hgdownload downloads # Create merged posterier probability file and wiggle track data files # currently doesn't matter where this is performed, the san is the same # network distance from all machines. cd /san/sanvol1/scratch/hg18/multizPrimate/cons/all time nice -n +19 ls phastConsPrimateScores/*.data.gz | xargs zcat \ | wigEncode -noOverlap stdin phastConsPrimate.wig phastConsPrimate.wib # Converted stdin, upper limit 1.00, lower limit 0.00 # real 30m18.821s time nice -n +19 cp -p *.wi? /cluster/data/hg18/bed/multizPrimate/cons/all # real 1m26.426s # Load gbdb and database with wiggle. ssh hgwdev cd /cluster/data/hg18/bed/multizPrimate/cons/all ln -s `pwd`/phastConsPrimate.wib /gbdb/hg18/multizPrimate/phastConsPrimate.wib time nice -n +19 hgLoadWiggle -pathPrefix=/gbdb/hg18/multizPrimate hg18 \ phastConsPrimate phastConsPrimate.wig # real 0m53.686s # Create histogram to get an overview of all the data ssh hgwdev cd /cluster/data/hg18/bed/multizPrimate/cons/all time nice -n +19 hgWiggle -doHistogram \ -hBinSize=0.001 -hBinCount=1000 -hMinVal=0.0 -verbose=2 \ -db=hg18 phastConsPrimate > histogram.data 2>&1 # real 5m10.426s # create plot of histogram: cat << '_EOF_' | gnuplot > histo.png set terminal png small color x000000 xffffff xc000ff x66ff66 xffff00 x00ffff set size 1.4, 0.8 set key left box set grid noxtics set grid ytics set title " Human Histogram phastConsPrimate track" set xlabel " phastConsPrimate score" set ylabel " Relative Frequency" set y2label " Cumulative Relative Frequency (CRF)" set y2range [0:1] set y2tics set yrange [0:0.02] plot "histogram.data" using 2:5 title " RelFreq" with impulses, \ "histogram.data" using 2:7 axes x1y2 title " CRF" with lines '_EOF_' # << happy emacs display histo.png & ############################################################################# ## Annotate multizPrimate multiple alignment with gene annotations ## (DONE - 2008-02-11 braney ) # Gene frames ## survey all genomes to see what type of gene track to use ssh hgwdev mkdir /cluster/data/hg18/bed/multizPrimate/frames cd /cluster/data/hg18/bed/multizPrimate/frames # dbs: eriEur1, cavPor2, sorAra1 do not exist, can not look at them cat << '_EOF_' > showGenes.csh #!/bin/csh -fe foreach db (`cat ../species.list`) echo -n "${db}: " echo -n "Tables: " set tables = `hgsql $db -N -e "show tables like '%Gene%'"` foreach table ($tables) if ($table == "ensGene" || $table == "refGene" || $table == "mgcGenes" || \ $table == "knownGene") then set count = `hgsql $db -N -e "select count(*) from $table"` echo -n "${table}: ${count}, " endif end set orgName = `hgsql hgcentraltest -N -e \ "select scientificName from dbDb where name='$db'"` set orgId = `hgsql hg18 -N -e \ "select id from organism where name='$orgName'"` if ($orgId == "") then echo "Mrnas: 0" else set count = `hgsql hg18 -N -e "select count(*) from gbCdnaInfo where organism=$orgId"` echo "Mrnas: ${count}" endif end '_EOF_' # << happy emacs chmod +x ./showGenes.csh # given this output, manually sorted for this display: # calJac1: Tables: Mrnas: 3558 # canFam2: Tables: ensGene: 25568, refGene: 864, Mrnas: 367629 # hg18: Tables: ensGene: 43569, knownGene: 56722, mgcGenes: 28497, refGene: # 26066, Mrnas: 8354195 # mm9: Tables: ensGene: 43795, knownGene: 49409, mgcGenes: 22368, refGene: # 21395, 
# Mrnas: 5093221
# otoGar1: Tables: Mrnas: 0
# panTro2: Tables: ensGene: 32852, mgcGenes: 4, refGene: 26344, Mrnas: 6346
# ponAbe2: Tables: Mrnas: 0
# rheMac2: Tables: ensGene: 38561, refGene: 445, Mrnas: 61770
# rn4: Tables: ensGene: 33745, knownGene: 8202, mgcGenes: 5704, refGene: 14498,
# Mrnas: 872209
# tupBel1: Tables: Mrnas: 2364
# use knownGene for hg18, mm9
# use ensGene for rn4, canFam2, panTro2, rheMac2
# use Mrnas for calJac1, ponAbe2
# no annotations for
# tupBel1, otoGar1
mkdir genes
# knownGene
for DB in hg18 mm9
do
    hgsql -N -e "select name,chrom,strand,txStart,txEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds from knownGene" ${DB} \
    | genePredSingleCover stdin stdout | gzip -2c \
    > /scratch/tmp/${DB}.tmp.gz
    mv /scratch/tmp/${DB}.tmp.gz genes/$DB.gp.gz
    echo "${DB} done"
done
# ensGene
for DB in rn4 canFam2 panTro2 rheMac2
do
    hgsql -N -e "select name,chrom,strand,txStart,txEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds from ensGene" ${DB} \
    | genePredSingleCover stdin stdout | gzip -2c \
    > /scratch/tmp/${DB}.tmp.gz
    mv /scratch/tmp/${DB}.tmp.gz genes/$DB.gp.gz
    echo "${DB} done"
done
# and finally, using the mrna tables
for DB in calJac1 ponAbe2
do
    tmpExt=`mktemp temp.XXXXXX`
    tmpMrnaCds=${DB}.mrna-cds.${tmpExt}
    tmpMrna=${DB}.mrna.${tmpExt}
    tmpCds=${DB}.cds.${tmpExt}
    hgsql -N -e 'select all_mrna.qName,cds.name,all_mrna.* \
        from all_mrna,gbCdnaInfo,cds \
        where (all_mrna.qName = gbCdnaInfo.acc) and \
        (gbCdnaInfo.cds != 0) and (gbCdnaInfo.cds = cds.id)' \
        $DB > ${tmpMrnaCds}
    cut -f 1-2 ${tmpMrnaCds} > ${tmpCds}
    cut -f 4-100 ${tmpMrnaCds} > ${tmpMrna}
    mrnaToGene -cdsFile=${tmpCds} -smallInsertSize=8 -quiet ${tmpMrna} stdout | \
        genePredSingleCover stdin stdout | gzip -2c > /scratch/tmp/$DB.tmp.gz
    rm ${tmpMrnaCds} ${tmpMrna} ${tmpCds}
    mv /scratch/tmp/$DB.tmp.gz genes/$DB.gp.gz
    rm -f $tmpExt
    echo "${DB} done"
done
ssh kkstore06
cd /cluster/data/hg18/bed/multizPrimate/frames
time (cat ../anno/maf/*.maf | nice -n +19 genePredToMafFrames hg18 stdin stdout rn4 genes/rn4.gp.gz mm9 genes/mm9.gp.gz hg18 genes/hg18.gp.gz rheMac2 genes/rheMac2.gp.gz ponAbe2 genes/ponAbe2.gp.gz panTro2 genes/panTro2.gp.gz canFam2 genes/canFam2.gp.gz calJac1 genes/calJac1.gp.gz | gzip > multizPrimate.mafFrames.gz) > frames.log 2>&1
# see what it looks like in terms of number of annotations per DB:
zcat multizPrimate.mafFrames.gz | cut -f4 | sort | uniq -c | sort -n
#   2732 calJac1
# 190927 hg18
# 195671 panTro2
# 208637 rheMac2
# 230764 mm9
# 231026 rn4
# 248086 canFam2
# load the resulting file
ssh hgwdev
cd /cluster/data/hg18/bed/multizPrimate/frames
time nice -n +19 hgLoadMafFrames hg18 multizPrimateFrames \
    multizPrimate.mafFrames.gz
# real 1m1.893s
# enable the trackDb entries:
# frames multizPrimateFrames
# irows on
#############################################################################
## Add CTD data (DONE - 2008-02-22, updated 2008-03-07, Fan )
mkdir /cluster/store11/gs.19/build36/bed/ctd021508
cd /cluster/store11/gs.19/build36/bed/ctd021508
# Download chem_gene_ixns.tsv from CTD site, http://ctd.mdibl.org/downloads/.
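# Optional sanity check (not part of the original build log): confirm the
# download is tab-separated and non-empty before loading, and compare its
# line count against the table row count after the load below.
#   head -1 chem_gene_ixns.tsv
#   wc -l chem_gene_ixns.tsv
#   hgsql ctd -e 'select count(*) from chem_gene_ixns'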
hgsql hg18 -e 'create database ctd' hgsql ctd < ~/kent/src/hg/lib/chem_gene_ixns.sql hgsql ctd -e 'load data local infile "chem_gene_ixns.tsv" into table chem_gene_ixns' # create sorted data hgsql hg18 -N -e \ 'select x.geneSymbol, ChemicalId, count(distinct Interaction), ChemicalName from kgXref x, ctd.chem_gene_ixns c where x.geneSymbol=c.GeneSymbol group by x.geneSymbol, ChemicalId'|\ sort -k 1,1 -k 3,3nr -k 4,4 >ctdSorted.tab hgsql hgFixed < ~/kent/src/hg/lib/ctdSorted.sql hgsql hgFixed -e 'load data local infile "ctdSorted.tab" into table ctdSorted' ############################################################################# # CREATE huge TABLE FOR HuGE LINK (DONE 3/6/08, Fan) # Get HuGEgeneList.txt (list of HuGE genes from HuGE collaborator). mkdir /cluster/store11/gs.19/build36/bed/HuGE cd /cluster/store11/gs.19/build36/bed/HuGE # put the file there. cp HuGEgeneList.txt huge.tab # get rid of header lines and blank lines at the end. vi huge.tab hgsql hg17 < ~/kent/src/hg/lib/huge.sql hgsql hg18 < ~/kent/src/hg/lib/huge.sql hgsql hg17 -e 'load data local infile "huge.tab" into table huge' hgsql hg18 -e 'load data local infile "huge.tab" into table huge' ############################################################################# ############################################################################# # ULTRACONSERVED TRACKS (LIFT FROM HG17) (DONE 2008-03-10, Andy) ssh hgwdev cd /cluster/data/hg18/bed mkdir ultras cd ultras/ echo "select chrom,chromStart,chromEnd,name from uc16" \ | hgsql hg17 | tail +2 > uc16Hg17.bed echo "select chrom,chromStart,chromEnd,name from ux16" \ | hgsql hg17 | tail +2 > ux16Hg17.bed liftOver uc16Hg17.bed /gbdb/hg17/liftOver/hg17ToHg18.over.chain.gz \ uc16Hg18.bed uc16Hg18.unmapped liftOver ux16Hg17.bed /gbdb/hg17/liftOver/hg17ToHg18.over.chain.gz \ ux16Hg18.bed ux16Hg18.unmapped hgLoadBed hg18 uc16 uc16Hg18.bed hgLoadBed hg18 ux16 ux16Hg18.bed ############################################################################# # TAJIMA'S D (LIFTOVER FROM HG17) (DONE 3/17/08 angie) ssh hgwdev mkdir /cluster/data/hg18/bed/tajdLiftOver cd /cluster/data/hg18/bed/tajdLiftOver # The submitted hg17 bedGraph custom tracks had 1-based start coords, # so correct; also, the tajdSnp* tables used a sql command to set # the rs names, so get the data from SQL not file: set loChain = /cluster/data/hg17/bed/liftOver/hg17ToHg18.over.chain.gz foreach pop (Ad Ed Xd) zcat /cluster/data/hg17/bed/tajdpoly/20050603/hg17.tajd$pop.bedGraph.gz \ | awk '{print $1 "\t" $2-1 "\t" $3 "\t" $4}' \ | liftOver stdin -minMatch=0.5 \ $loChain hg18.tajd$pop.bedGraph hg17.tajd$pop.unmapped hgsql hg17 -NBe "select chrom,chromStart,chromEnd,name from tajdSnp$pop" \ | liftOver stdin \ $loChain hg18.tajdSnp$pop.bed hg17.tajdSnp$pop.unmapped end foreach pop (Ad Ed Xd) hgLoadBed hg18 tajdSnp$pop hg18.tajdSnp$pop.bed hgLoadBed -bedGraph=4 hg18 tajd$pop hg18.tajd$pop.bedGraph end # The hg17 build had some fancy sql to find items overlapping with gaps, # awk'd to make sql to delete those items. Use featureBits to find: foreach pop (Ad Ed Xd) featureBits hg18 -countGaps tajdSnp$pop gap -bed=tajdSnp$pop.gap.bed featureBits hg18 -countGaps tajd$pop gap -bed=tajd$pop.gap.bed end wc -l *.gap.bed # 8 tajdAd.gap.bed # 8 tajdEd.gap.bed # 0 tajdSnpAd.gap.bed # 0 tajdSnpEd.gap.bed # 0 tajdSnpXd.gap.bed # 8 tajdXd.gap.bed diff tajdAd.gap.bed tajdEd.gap.bed diff tajdAd.gap.bed tajdXd.gap.bed # No output from either diff -- same ranges. 
awk '{print $3 - $2;}' tajdAd.gap.bed
#2605
#5000
#5000
#1000
#1199
#1359
#5000
#4100
# Actually, I disagree with removing the items that overlap those.
# As the description page says, each 10kb region is really the center
# of a 100kb window.  Those windows will overlap gaps -- and if the
# center 10k of a window happens to overlap a gap, the whole window is
# no worse than a window that overlaps a gap 1/3 of the way in instead
# of 1/2.
#############################################################################
# ADD ALLEN BRAIN CORTEX LINK (DONE, 2/12/08, Fan)
mkdir -p /cluster/store11/gs.19/build36/bed/allenBrain
cd /cluster/store11/gs.19/build36/bed/allenBrain
# save list of genes from Allen Brain into file allenBrainGene.tab
hgsql hg18 < ~/src/hg/lib/allenBrainGene.sql
hgsql hg18 -e \
    'load data local infile "allenBrainGene.tab" into table allenBrainGene'
#############################################################################
# BLASTZ/CHAIN/NET equCab2 (DONE - 2008-04-10 - larrym)
ssh kkstore04
screen # use screen to control this multi-day job
mkdir /cluster/data/hg18/bed/blastz.equCab2.2008-04-10
cd /cluster/data/hg18/bed/blastz.equCab2.2008-04-10
cat << '_EOF_' > DEF
# Human vs. Horse
BLASTZ_M=50
# TARGET: Human Hg18
SEQ1_DIR=/scratch/data/hg18/nib
SEQ1_LEN=/cluster/data/hg18/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Horse
SEQ2_DIR=/scratch/data/equCab2/equCab2.2bit
SEQ2_LEN=/cluster/data/equCab2/chrom.sizes
SEQ2_CTGDIR=/san/sanvol1/scratch/equCab2/equCab2.UnScaffolds.2bit
SEQ2_CTGLEN=/san/sanvol1/scratch/equCab2/equCab2.UnScaffolds.sizes
SEQ2_LIFT=/cluster/data/equCab2/jkStuff/equCab2.chrUn.lift
SEQ2_CHUNK=20000000
SEQ2_LIMIT=100
SEQ2_LAP=0
BASE=/cluster/data/hg18/bed/blastz.equCab2.2008-04-10
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
time doBlastzChainNet.pl `pwd`/DEF \
    -verbose=2 -bigClusterHub=pk \
    -chainMinScore=3000 -chainLinearGap=medium \
    -blastzOutRoot /cluster/bluearc/equCab2/blastz.hg18 >>& do.log &
# failed so had to rerun stuff manually then, continue thus:
time doBlastzChainNet.pl `pwd`/DEF \
    -verbose=2 -bigClusterHub=pk -syntenicNet -continue=load \
    -chainMinScore=3000 -chainLinearGap=medium \
    -blastzOutRoot /cluster/bluearc/equCab2/blastz.hg18 >>& do.log &
0.157u 0.084s 1:21:15.25 0.0% 0+0k 0+0io 0pf+0w
ln -s blastz.equCab2.2008-04-10 /cluster/data/hg18/bed/blastz.equCab2
featureBits hg18 -chrom=chr1 chainEquCab2Link
# 133103986 bases of 224999719 (59.157%) in intersection
cd /cluster/data/hg18/bed/blastz.equCab2.2008-04-10
cat fb.hg18.chainEquCab2Link.txt
# 1647122438 bases of 2881515245 (57.162%) in intersection
# re-running with the fixed UnScaffolds business and the fixed chr27:
mkdir /hive/data/genomes/hg18/bed/blastzEquCab2.2008-12-01
cd /hive/data/genomes/hg18/bed/blastzEquCab2.2008-12-01
cat << '_EOF_' > DEF
# Human vs.
Horse BLASTZ=blastz BLASTZ_M=50 # TARGET: Human Hg18 SEQ1_DIR=/scratch/data/hg18/nib SEQ1_LEN=/scratch/data/hg18/chrom.sizes SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY: Horse SEQ2_DIR=/scratch/data/equCab2/equCab2.2bit SEQ2_LEN=/scratch/data/equCab2/chrom.sizes SEQ2_CTGDIR=/hive/data/genomes/equCab2/equCab2.UnScaffolds.2bit SEQ2_CTGLEN=/hive/data/genomes/equCab2/equCab2.UnScaffolds.sizes SEQ2_LIFT=/hive/data/genomes/equCab2/jkStuff/equCab2.chrUn.lift SEQ2_CHUNK=20000000 SEQ2_LIMIT=100 SEQ2_LAP=0 BASE=/hive/data/genomes/hg18/bed/blastzEquCab2.2008-12-01 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs time nice -n +19 doBlastzChainNet.pl `pwd`/DEF \ -verbose=2 -workhorse=hgwdev -smallClusterHub=pk -bigClusterHub=pk \ -chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 # broken chain step for chr19, ran manually all day long on swarm, then time nice -n +19 doBlastzChainNet.pl `pwd`/DEF \ -continue=chainMerge -verbose=2 -workhorse=hgwdev \ -stop=net -smallClusterHub=pk -bigClusterHub=pk \ -chainMinScore=3000 -chainLinearGap=medium > chainMerge.log 2>&1 XXX - running Tue Dec 2 15:42:18 PST 2008 time nice -n +19 doBlastzChainNet.pl `pwd`/DEF \ -continue=syntenicNet -syntenicNet -verbose=2 -workhorse=hgwdev \ -stop=syntenicNet -smallClusterHub=pk -bigClusterHub=pk \ -debug -chainMinScore=3000 -chainLinearGap=medium > syntenicNet.log 2>&1 ############################################################################# # MAKE PCR TARGET FOR UCSC GENES (DONE 4/18/08 angie - UPDATED 11/4/08) ssh hgwdev mkdir /cluster/data/hg18/bed/mrnaPcr cd /cluster/data/hg18/bed/mrnaPcr # First, get consistent FA and PSL for UCSC Genes. # Initially I tried to use files from /cluster/data/hg18/bed/ucsc.10/: # subColumn 10 /cluster/data/hg18/bed/ucsc.10/rnaToGenome.psl # /cluster/data/hg18/bed/ucsc.10/txToAcc.tab ucscGenes.hg18.psl # /cluster/data/hg18/bed/ucsc.10/ucscGenes.fa # But the psl was not from exactly the same seq's as in the fa. # Jim's suggestion: use sequenceForBed to get genomic-translated # sequences, and then genePredToFakePsl. sequenceToBed must be # run on hgwdev. genePredToBed /cluster/data/hg18/bed/ucsc.11/ucscGenes.gp > ucscGenes.bed hgsql hg18 -NBe 'select kgId,geneSymbol from kgXref' \ | perl -wpe 's/^(\S+)\t(\S+)/$1\t${1}__$2/ || die;' \ > idSub.txt subColumn 4 ucscGenes.bed idSub.txt ucscGenesIdSubbed.bed sequenceForBed -keepName -db=hg18 -bedIn=ucscGenesIdSubbed.bed \ -fastaOut=stdout \ | faToTwoBit stdin kgTargetSeq.2bit cut -f 1-10 /cluster/data/hg18/bed/ucsc.11/ucscGenes.gp \ | genePredToFakePsl hg18 stdin kgTargetAli.psl /dev/null # Load up the UCSC Genes target PSL table and put 2bit in /gbdb:: cd /cluster/data/hg18/bed/mrnaPcr hgLoadPsl hg18 kgTargetAli.psl mkdir /gbdb/hg18/targetDb ln -s /cluster/data/hg18/bed/mrnaPcr/kgTargetSeq.2bit /gbdb/hg18/targetDb/ # Ask cluster-admin to start an untranslated, -stepSize=5 gfServer on # /gbdb/hg18/targetDb/kgTargetSeq.2bit . 
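# For reference only: the invocation follows the same pattern as the
# hand-started gfServer for mrnaTargetSeq in the next section (any flags
# beyond -stepSize=5 are assumed); host and port match the blatServers
# row inserted below:
#   gfServer -stepSize=5 start blat13 17799 /gbdb/hg18/targetDb/kgTargetSeq.2bit &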
ssh hgwdev # Add records to hgcentraltest blatServers and targetDb: hgsql hgcentraltest -e \ 'INSERT into blatServers values ("hg18KgNov08", "blat13", 17799, 0, 1);' hgsql hgcentraltest -e \ 'INSERT into targetDb values("hg18KgNov08", "UCSC Genes", \ "hg18", "kgTargetAli", "", "", \ "/gbdb/hg18/targetDb/kgTargetSeq.2bit", 1, now(), "");' ############################################################################# # MAKE PCR TARGET FOR SNAPSHOT OF ALL_MRNA (DONE 4/18/08 angie) ssh hgwdev # Load up native mRNA target tables: hgsql hg18 -NBe 'select qName from all_mrna' \ | sort -u > mrnaAccs.txt $HOME/kent/src/hg/makeDb/genbank/bin/$MACHTYPE/gbGetSeqs \ -gbRoot=/gbdb/genbank -accFile=mrnaAccs.txt \ -db=hg18 -native genbank mrna mrnaTargetSeq.fa faToTwoBit mrnaTargetSeq.fa mrnaTargetSeq.2bit ln -s /cluster/data/hg18/bed/mrnaPcr/mrnaTargetSeq.2bit \ /gbdb/hg18/targetDb/ hgsql hg18 -e ' \ create table mrnaTargetAli select * from all_mrna; \ alter table mrnaTargetAli add index (tName,bin); \ alter table mrnaTargetAli add index (qName);' rm *.tab ssh kolossus # Start up gfServer for mrnaTargetSeq: cd /cluster/data/hg18/bed/mrnaPcr faToTwoBit mrnaTargetSeq.fa mrnaTargetSeq.2bit gfServer -stepSize=5 -canStop start localhost 17991 mrnaTargetSeq.2bit & ssh hgwdev # Add records to hgcentraltest blatServers and targetDb: hgsql hgcentraltest -e \ 'INSERT into blatServers values ("hg18MrnaApr08", "kolossus", 17991, 0, 1);' hgsql hgcentraltest -e \ 'INSERT into targetDb values("hg18MrnaApr08", "Human mRNAs", \ "hg18", "mrnaTargetAli", "", "", \ "/gbdb/hg18/targetDb/mrnaTargetSeq.2bit", 2, now(), "");' ############################################################################# # Reload CCDS from CCDS.20080502 dump (2008-05-03 markd) # import ccds database as described in ccds.txt set db=hg18 set ncbiBld=36.3 # create and load ccdsGene and ccdsInfo tables from imported database /cluster/data/genbank/bin/x86_64/ccdsMkTables -loadDb ccds ${db} ${ncbiBld} ccdsInfo ccdsGene # ccdsKgMap /cluster/data/genbank/bin/x86_64/mkCcdsGeneMap -db=${db} -loadDb ccdsGene knownGene ccdsKgMap checkTableCoords ${db} -verbose=2 ccdsGene # update all.jointer to include ${db} in ccdsDb joinerCheck -database=${db} -identifier=ccdsGeneId ~/compbio/genbank/kent/src/hg/makeDb/schema/all.joiner # request push of ccdsGene ccdsInfo ccdsKgMap # << emacs ############################################################################ # update vega genes to version 31 (v49 of Ensembl genes) # (DONE - 2008-05-15 - Hiram) mkdir /cluster/data/hg18/bed/vega31_49 cd /cluster/data/hg18/bed/vega31_49 wget --timestamping \ "ftp://ftp.sanger.ac.uk/pub/vega/human/gtf_file.gz" wget --timestamping \ "ftp://ftp.sanger.ac.uk/pub/vega/human/CHANGELOG.gz" wget --timestamping \ "ftp://ftp.sanger.ac.uk/pub/vega/human/catalog.txt" wget --timestamping \ "ftp://ftp.sanger.ac.uk/pub/vega/human/pep/Homo_sapiens.VEGA.apr.pep.tot.fa.gz" # processing similar to the same processing for Ensembl genes, # from /cluster/data/hg18/bed/ensGene.49/process/doProcess.csh zcat gtf_file.gz \ | sed -e "s/^\([0-9XY][0-9]*\)/chr\1/; s/^MT/chrM/; s/^Un/chrUn/" \ | liftUp -type=.gtf stdout \ /cluster/data/hg18/jkStuff/ensGene.haplotype.lift carry stdin \ | gzip > allGenes.gtf.gz gtfToGenePred -infoOut=infoOut.txt -genePredExt allGenes.gtf.gz stdout \ | gzip > hg18.allGenes.gp.gz /cluster/home/hiram/kent/src/hg/utils/automation/extractGtf.pl \ infoOut.txt > ensGtp.tab genePredCheck -db=hg18 hg18.allGenes.gp.gz # checked: 62418 failed: 0 zcat allGenes.gtf.gz | grep -i pseudo > 
pseudo.gtf zcat allGenes.gtf.gz | grep -v -i pseudo > not.pseudo.gtf gtfToGenePred -genePredExt pseudo.gtf pseudo.gp gtfToGenePred -genePredExt not.pseudo.gtf not.pseudo.gp genePredCheck -db=hg18 pseudo.gp # checked: 5747 failed: 0 genePredCheck -db=hg18 not.pseudo.gp # checked: 56671 failed: 0 hgLoadGenePred -genePredExt hg18 vegaGene not.pseudo.gp hgLoadGenePred -genePredExt hg18 vegaPseudoGene pseudo.gp ############################################################################ # DGV V10 (DATABASE OF GENOMIC VARIANTS) (DONE 11/10/10 angie - color change 2/22/11 #2917) # DGV V9 done 3/26/10 # DGV V8 done 8/12/09 (changed color of inverted 11/05/09 kuhn) # DGV V7 done 3/11/09 # DGV V6 thin regions dropped 2/23/09 # DGV V6 with useless thin regions done 11/12/08 # DGV V5 done 7/16/08 # DGV V4 done 5/9/08 # 11-04-2009 color change from brown to magenta: # old color # 6553700 Inversion (100,0,100) # new: # 13107400 Inversion (200,0,200) # 2/22/11 color change (Bug #2917): swap blue and red; green -> brown # Old DGV format is obsolete; see the following section. ####################################################################### # DGV BETA (DATABASE OF GENOMIC VARIANTS) (DONE 2/11/13 angie) # DGV has changed their data format, and for the time being the data are # served by a beta web site, http://dgvbeta.tcag.ca/ ; in time that will # replace their current site. set today = `date +%y%m%d` mkdir -p /hive/data/genomes/hg18/bed/dgv/$today cd /hive/data/genomes/hg18/bed/dgv/$today wget http://dgvbeta.tcag.ca/dgv/docs/NCBI36_hg18_2012-11-23.txt head -1 NCBI36_hg18*.txt #variantaccession chr start end varianttype variantsubtype reference pubmedid method platform mergeid mergedorsample frequency samplesize cohortdescription genes # It's more complicated than Gain/Loss/Complex or Inversion now (+ stray commas): cut -f 5,6 NCBI36_hg18*.txt | sort | uniq -c | head -100 # 20156 CNV # 1304 CNV "" # 27098 CNV CNV # 2988 CNV Complex # 187319 CNV Deletion # 17673 CNV Duplication # 123436 CNV Gain # 4170 CNV Gain+Loss # 27382 CNV Insertion # 479784 CNV Loss # 280 OTHER # 31 OTHER "" # 44 OTHER Complex # 2519 OTHER Inversion # 663 OTHER Tandem duplication # 1 varianttype variantsubtype # shuffle fields into bed9+ w/itemRgb set purple = "200,0,200" set red = "200,0,0" set blue = "0,0,200" set brown = "139,69,19" tail -n +2 NCBI36_hg18*.txt \ | perl -wpe 'chomp; \ s/""//; \ ($id, $chr, $start, $end, $varType, $varSubType, $ref, $pmid, $method, $platform, \ undef, undef, undef, $sampleSize, $sampleDesc, $genes) = split("\t"); \ $start-- unless ($start == 0); \ $landmark = $genes; \ $landmark =~ s/,/, /g; \ $varSubType =~ s/^,//; $varSubType =~ s/,$//; \ $varTypeOut = "$varType ($varSubType)"; \ $ref =~ s/_/ /g; \ $method =~ s/_/ /g; $method =~ s/,/, /g; \ $sample = $sampleDesc; \ $sample .= " (sample size: $sampleSize)" if ($sampleSize); \ $method .= " ($platform)" if ($platform && $platform ne "Not Provided"); \ $rgb = "0,0,0"; \ if ($varType eq "CNV") { \ if ($varSubType eq "Gain" || $varSubType eq "Insertion" || $varSubType eq "Duplication") {\ $rgb = "'$blue'"; \ } elsif ($varSubType eq "Loss" ||$varSubType eq "Deletion") { \ $rgb = "'$red'"; \ } elsif ($varSubType eq "") { \ $varTypeOut = $varType; \ } else { \ $rgb = "'$brown'"; \ } \ } elsif ($varType eq "OTHER") { \ if ($varSubType eq "Inversion") { \ $rgb = "'$purple'"; \ } elsif ($varSubType eq "Tandem Duplication") { \ $rgb = "'$blue'"; \ } else { \ $varTypeOut = $varType; \ } \ } \ $_ = join("\t", "chr$chr", $start, $end, $id, 0, "+", \ $start, 
$start, $rgb, $landmark, $varTypeOut, \ $ref, $pmid, $method, $sample) . "\n";' \ > dgv.bed hgLoadBed hg18 dgv dgv.bed \ -sqlTable=$HOME/kent/src/hg/lib/dgv.sql -renameSqlTable -tab #Read 894847 elements of size 15 from dgv.bed ############################################################################ # AGILENT CGH PROBES (AND MM8, RN4) (Done 2008-05-13, Andy) ssh hgwdev bash cd /cluster/data/hg18/bed mkdir agilentProbes cd agilentProbes/ cp /usr/local/apache/htdocs/donna/Agilent/Agilent_Human_CGH.zip . # (agilent-provided zips) # what a pain... this zipfile isn't unzippable using linux unzip. # Bob's windows machine didn't do it either. Finally got it using the # mac in Erich and Victoria's office. Extracting creates a directory # called "Agilent_Human_CGH Folder" cp Agilent_Human_CGH\ Folder/* . rmdir Agilent_Human_CGH\ Folder/ tail +3 014693_D_UCSCTrack_20070820.txt | bedSort stdin stdout > agilent244a.bed tail +3 014698_D_UCSCTrack_20070820.txt | bedSort stdin stdout > agilent105a.bed tail +3 014950_D_UCSCTrack_20070820.txt | bedSort stdin stdout > agilent44k.bed for bed in *.bed; do hgLoadBed hg18 ${bed%.bed}{,.bed}; done cd /cluster/data/mm8/bed mkdir agilentCgh cd agilentCgh/ cp /usr/local/apache/htdocs/donna/Agilent/Agilent_Mouse_CGH.zip . # (same crap as before with the zip file) cp Agilent_Mouse_CGH\ Folder/* . rmdir Agilent_Mouse_CGH\ Folder/ tail +3 014695_D_UCSCTrack_20070820.txt | bedSort stdin stdout > agilentCgh244a.bed tail +3 014699_D_UCSCTrack_20070820.txt | bedSort stdin stdout > agilentCgh105a.bed tail +3 015028_D_UCSCTrack_20070820.txt | bedSort stdin stdout > agilentCgh44k.bed for bed in *.bed; do hgLoadBed mm8 ${bed%.bed}{,.bed}; done cd /cluster/data/rn4/bed mkdir agilentCgh cd agilentCgh/ cp /usr/local/apache/htdocs/donna/Agilent/Agilent_Rat_CGH.zip . # (yep, again) cp Agilent_Rat_CGH\ Folder/* . 
rmdir Agilent_Rat_CGH\ Folder/ tail +3 015223_D_UCSCTrack_20070820.txt | bedSort stdin stdout > agilentCgh244a.bed tail +3 015235_D_UCSCTrack_20070820.txt | bedSort stdin stdout > agilentCgh105a.bed for bed in *.bed; do hgLoadBed rn4 ${bed%.bed}{,.bed}; done ############################################################################ # AGILENT HUMAN SUREPRINT G3 ARRAY PROBESETS (DONE 2008-12-09, Andy) ssh hgwdev cd /hive/data/hg18/bed/agilentProbes wget --timestamping --user=microarray --password= \ "ftp://ftp.agilent.com/restricted/UCSC_BED_FILES/*" zcat 021365_D_UCSCTrack_20081204.txt.gz | tail +3 | hgLoadBed hg18 agilentCnv2x400k stdin zcat 021529_D_UCSCTrack_20081204.txt.gz | tail +3 | hgLoadBed hg18 agilentCgh1x1m stdin zcat 021850_D_UCSCTrack_20081204.txt.gz | tail +3 | hgLoadBed hg18 agilentCgh2x400k stdin zcat 021924_D_UCSCTrack_20081204.txt.gz | tail +3 | hgLoadBed hg18 agilentCgh8x60k stdin zcat 022060_D_UCSCTrack_20081204.txt.gz | tail +3 | hgLoadBed hg18 agilentCgh4x180k stdin ############################################################################ # TWO MORE AGILENT HUMAN ARRAYS (DONE, 2009-07-28 Andy) ssh hgwdev cd /hive/data/hg18/bed/agilentProbes wget --timestamping --user=microarray --password= \ "ftp://ftp.agilent.com/restricted/UCSC_BED_FILES/*" tail -n +3 022837_D_UCSCTrack_20090331.txt | hgLoadBed hg18 agilentCnv2x105k stdin tail -n +3 023642_D_BED_20090528.bed | \ awk 'BEGIN{FS="\t";OFS="\t"}{print $0, "1000", "+";}' | \ hgLoadBed hg18 agilentHdd1x1m stdin ############################################################################ # TRANSMAP vertebrate.2008-05-20 build (2008-05-24 markd) vertebrate-wide transMap alignments were built Tracks are created and loaded by a single Makefile. This is available from: svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2008-05-20 see doc/builds.txt for specific details. ############################################################################ ############################################################################ # ILLUMINA WG-6 PROBES (2008-06-13 Andy) # Download the Platform file from GEO here: # http://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GPL6884 # Click on "Download full table" ssh hgwdev bash cd /san/sanVol1/scratch/andy mkdir illumina cd illumina/ cp ~/GPL6884-5803.txt . 
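# Optional check (not in the original log): show the first line kept by the
# "tail +31" commands below, to confirm the column layout they assume
# (field 1 = probe ID, field 11 = GI of the source RNA, field 18 = probe sequence):
#   head -31 GPL6884-5803.txt | tail -1 | cut -f1,11,18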
# Collect GIs for all the RNAs # First download/install Biopython wget http://biopython.org/DIST/biopython-1.45.tar.gz tar xfz biopython-1.45.tar.gz mkdir biopythonLibs cd biopython-1.45/ python setup.py install --home=/san/sanVol1/scratch/andy/illumina/biopythonLibs export PYTHONPATH=/san/sanVol1/scratch/andy/illumina/biopythonLibs # Now get the RNAs mkdir getRna grabbed cd getRna/ tail +31 ../GPL6884-5803.txt | cut -f11 | sort | uniq > gis.txt wc -l gis.txt # 43338 gis.txt split -d -l 100 -a 3 gis.txt gis- rm gis.txt cat < "EOF" > getSeqs.py import Bio from Bio import EUtils from Bio.EUtils import HistoryClient gis = open('gis.txt', 'r').readlines() for i in range(len(gis)): gis[i] = gis[i].rstrip('\n') ids = EUtils.DBIds('nucleotide', gis) client = HistoryClient.HistoryClient() result = client.post(ids) print result.efetch(retmode="text", rettype="fasta").read() EOF # << emacs cat < "EOF" > getSeqs.sh #!/bin/bash for gi in gis-*; do numGot="0"; attempt="1"; while [ $numGot -lt 100 ]; do echo Getting $gi attempt $attempt; cp $gi gis.txt; fa=${gi}.fa python getSeqs.py > $fa numGot=`grep '>' $fa | wc -l`; if [ $numGot = 100 ]; then echo Got all for $gi mv $fa ../grabbed/; rm $gi else rm $fa; sleep 10; fi attempt=$((attempt+1)); done sleep 5; done EOF # << emacs chmod +x getSeqs.sh ./getSeqs.sh # there's a fair bit that retries the download over and over but eventually it # gets to the last one, which doesn't have 100 lines, so I run the python # program on that on by itself. cat ../grabbed/* > probeRna.fa rm -rf ../grabbed/ cd ../ # Now blat RNA to genome mkdir -p blatRna/{splits,out} cd blatRna/ faSplit sequence ../getRna/probeRNA.fa 400 splits/rna- ls -1 splits/* > splits.lst cat < "EOF" > runBlat.sh #!/bin/bash cd -P . fa=`basename $1` chr=`basename $2 .nib` split=`basename $1 .fa` out=${split}.${chr}.psl nibDir=/scratch/hg/hg18/bothMaskedNibs tmpDir=/scratch/tmp/$out mkdir $tmpDir pushd $tmpDir oldDir=`dirs +1` cp ${oldDir}/$1 . blat -noHead -ooc=/scratch/hg/hg18/11.ooc -out=psl ${nibDir}/$2 $fa $out mkdir -p ${oldDir}/out/${chr} cp $out ${oldDir}/out/${chr}/ popd rm -rf $tmpDir EOF # << emacs chmod +x runblat.sh cat < "EOF" > gsub #LOOP ./runBlat.sh {check in line+ $(path1)} $(path2) {check out exists out/$(root2)/$(root1).$(root2).psl} #ENDLOOP EOF # << emacs ls -1 /cluster/data/hg18/nib > nib.lst ssh pk cd /san/sanVol1/scratch/andy/illumina/blatRna gensub2 splits.lst nib.lst gsub spec para create spec para try para push para time #17820 jobs in batch #34457 jobs (including everybody's) in Parasol queue. #Checking finished jobs #Completed: 17820 of 17820 jobs #CPU time in finished jobs: 84196s 1403.26m 23.39h 0.97d 0.003 y #IO & Wait Time: 48448s 807.47m 13.46h 0.56d 0.002 y #Average job time: 7s 0.12m 0.00h 0.00d #Longest running job: 0s 0.00m 0.00h 0.00d #Longest finished job: 270s 4.50m 0.07h 0.00d #Submission to last job: 1515s 25.25m 0.42h 0.02d exit; # back to hgwdev mkdir /tmp/andy pslSort -nohead dirs allSorted.psl /tmp/andy out/* rmdir /tmp/andy pslReps -singleHit allSorted.psl single.ps{l,r} # Blat probes against the RNAs cd ../ mkdir -p blatProbes/out cd blatProbes/ ln -s ../blatRna/splits . ln -s ../blatRna/splits.lst . ln -s ../blatRna/single.psl . 
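# The per-chunk script below (invoked as ./probeBlat.sh by the gsub template
# that follows it) places each probe on the genome in two steps: blat the
# probe against its own RNA, then project that alignment through the
# RNA-to-genome alignment (single.psl) with pslMap.  In outline, for a
# single probe/RNA pair:
#   blat -noHead rna.fa probe.fa probeOnRna.psl
#   pslMap probeOnRna.psl rnaOnGenome.psl probeOnGenome.psl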
tail +31 ../GPL6884-5803.txt | cut -f1,11,18 | \ awk '{printf("%s\tgi|%s\t%s\n", $1, $2, $3);}' > probes.tab cat << "EOF" > #!/bin/bash faFile=`basename $1`; pslFile=${faFile%.fa}.psl probeFile=$2; rnaOnGenomePsl=$3; tmpDir=/scratch/andy/`date +"%T" | tr ':' '_'`.$$ mkdir -p $tmpDir cp $1 $2 $3 $tmpDir pushd $tmpDir for id in `grep '>' $faFile | sed 's/^>//'`; do # make probe fa echo $id awk '{if ($2 == "'"$id"'") printf(">%s\n%s\n", $1, $3);}' $probeFile \ > probe.fa # extract single RNA fa faOneRecord $faFile $id > rna.fa blat -noHead rna.fa probe.fa probeOnRna.psl awk 'BEGIN{FS="\t";OFS="\t";}{if ($10 == "'"$id"'") print;}' \ $rnaOnGenomePsl > rnaOnGenome.psl if [ `find . -size '0b' -type f | wc -l` == 0 ]; then pslMap probeOnRna.psl rnaOnGenome.psl probeOnGenome.psl cat probeOnGenome.psl >> $pslFile fi done popd cp $tmpDir/$pslFile $4 rm -rf $tmpDir EOF # << emacs cat << "EOF" > gsub #LOOP ./probeBlat.sh {check in line+ $(path1)} probes.tab single.psl {check out exists out/$(root1).psl} #ENDLOOP EOF # << emacs ssh pk cd /san/sanVol1/scratch/andy/illumina/blatProbes gensub2 splits.lst single gsub spec para create spec para try para push para time #396 jobs in batch #41977 jobs (including everybody's) in Parasol queue. #Checking finished jobs #Completed: 396 of 396 jobs #CPU time in finished jobs: 11101s 185.02m 3.08h 0.13d 0.000 y #IO & Wait Time: 1361s 22.68m 0.38h 0.02d 0.000 y #Average job time: 31s 0.52m 0.01h 0.00d #Longest running job: 0s 0.00m 0.00h 0.00d #Longest finished job: 121s 2.02m 0.03h 0.00d #Submission to last job: 271s 4.52m 0.08h 0.00d exit # back to hgwdev mkdir /tmp/andy pslSort -nohead dirs sorted.psl /tmp/andy out # Load stuff up pslToBed sorted.psl sorted.bed cd ../ mkdir tables cd tables/ cp ../blatProbes/sorted.{psl,bed} . hgLoadPsl -table=illuminaProbesAlign hg18 sorted.psl hgLoadBed hg18 illuminaProbes sorted.bed cat << "EOF" > CREATE TABLE illuminaProbesSeq ( id varchar(40) NOT NULL, seq varchar(55) NOT NULL, PRIMARY KEY (id) ) TYPE=MyISAM; EOF # << emacs cut -f1,3 ../blatProbes/probes.tab > illuminaProbesSeq.tab hgLoadSqlTab hg18 illuminaProbesSeq{,.sql,.tab} ############################################################################ # dbSNP BUILD 129 (DONE 6/24/08 angie) # 8/6/08: Regenerated snp129.sql with only those enum/set values that are # actually used (except always keep unknown, the default) and reloaded snp129. # No data change -- just the sql field definitions for enums and sets. # 8/7/08: Swapped molType values cDNA <--> genomic in snp129 because they # were swapped in the fasta headers. # QA NOTE: used sudo mytouch to change timestamps on all downstream snp129 # tables (snp129Exceptions, snp129ExceptionDesc, snp129OrthoPt2Pa2Rm2, # snp129Seq) to .2008-08-08 00:00:00 to avoid unwarranted joinerCheck # time discrepancy errors. (8/8/08, brooke) # Set up build directory mkdir -p /cluster/store3/dbSNP129/{human,shared} ln -s /cluster/store3/dbSNP129 /cluster/data/dbSNP/129 # Get field encodings -- if there are changes or additions to the # encoding of the corresponding fields, you might need to update # snpNcbiToUcsc, hgTracks, hgc and hgTrackUi (see also # hg/lib/snp125Ui.c). 
cd /cluster/data/dbSNP/129/shared alias wg wget --timestamping set ftpSnpDb = ftp://ftp.ncbi.nih.gov/snp/organisms/human_9606/database wg $ftpSnpDb/shared_data/LocTypeCode.bcp.gz wg $ftpSnpDb/shared_data/SnpClassCode.bcp.gz wg $ftpSnpDb/shared_data/SnpFunctionCode.bcp.gz wg $ftpSnpDb/shared_data/SnpValidationCode.bcp.gz # Here is another source -- it is not as up-to-date as the above, but # our encodings (enums and sets in snp129.sql) are named more similar # to those in the 2005 ASN: # ftp://ftp.ncbi.nih.gov/snp/specs/docsum_2005.asn ########################## DOWNLOAD ############################# cd /cluster/data/dbSNP/129/human mkdir data schema rs_fasta # Get data from NCBI (anonymous FTP) wg ftp://ftp.ncbi.nih.gov/snp/00readme.txt cd /cluster/data/dbSNP/129/human/data # ContigLoc table has coords, orientation, loc_type, and refNCBI allele wg $ftpSnpDb/organism_data/b129_SNPContigLoc_36_3.bcp.gz wg $ftpSnpDb/organism_data/b129_SNPContigLocusId_36_3.bcp.gz wg $ftpSnpDb/organism_data/b129_ContigInfo_36_3.bcp.gz # MapInfo has alignment weights wg $ftpSnpDb/organism_data/b129_SNPMapInfo_36_3.bcp.gz # SNP has univar_id, validation status and heterozygosity wg $ftpSnpDb/organism_data/SNP.bcp.gz # Get schema cd /cluster/data/dbSNP/129/human/schema wg $ftpSnpDb/organism_schema/human_9606_table.sql.gz # Get fasta files # using headers of fasta files for molType, class, observed cd /cluster/data/dbSNP/129/human/rs_fasta wg ftp://ftp.ncbi.nih.gov/snp/organisms/human_9606/rs_fasta/\*.gz ########################## LOAD NCBI TABLES ############################# # Simplify names of data files -- strip version & extras to get # local canonical table names. cd /cluster/data/dbSNP/129/human/data foreach f (*.bcp.gz) set new = `echo $f \ | sed -e 's/^b129_SNP//; s/^b129_//; s/_36_3//; s/.bcp//;'` mv $f $new echo $new end # Extract just the tables that we need from the NCBI msSQL table # creation file, and get CREATE statements from # human_9606_table.sql for our 5 tables cd /cluster/data/dbSNP/129/human/schema zcat human_9606_table.sql.gz \ | perl -we '$/ = "\nGO\n\n\n"; \ while (<>) { \ next unless /^CREATE TABLE \[(b129_(SNP)?)?(ContigInfo|ContigLoc|ContigLocusId|MapInfo|SNP)(_36_3)?\]/; \ s/b129_(SNP)?//; s/_36_3//; \ s/[\[\]]//g; s/GO\n\n/;/; s/smalldatetime/datetime/g; \ s/ON PRIMARY//g; s/COLLATE//g; s/Latin1_General_BIN//g; \ s/IDENTITY (1, 1) NOT NULL /NOT NULL AUTO_INCREMENT, PRIMARY KEY (id)/g; \ s/nvarchar/varchar/g; s/set quoted/--set quoted/g; \ s/(image|varchar\s+\(\d+\))/BLOB/g; \ print; \ }' \ > table.sql # load on kolossus or a small cluster machine (mysql5 is OK for this; # in fact it's better than 4 because it has 'show warnings'). 
ssh kkr3u00 hgsql '' -e 'create database hg18snp129' cd /cluster/data/dbSNP/129/human/schema hgsql hg18snp129 < table.sql cd ../data # Avoid wasting space by excluding mappings to non-reference contigs: foreach t (ContigInfo MapInfo) zcat $t.gz \ | egrep -vw '(Celera|HuRef|CRA_TCAGchr7v2)' \ | perl -wpe 's/(\d\d:\d\d:\d\d)\.0/$1/g;' \ | hgLoadSqlTab -oldTable hg18snp129 $t placeholder stdin end # Compare contig list between our ctgPos and reference contigs in # ContigInfo: ssh hgwdev-10 hgsql hg18 -N -B -e '"select contig from ctgPos;"' \ | sort > /tmp/1 hgsql hg18snp129 -NBe 'select distinct(group_label) from ContigInfo' # --> reference, c5_H2, c6_COX, c6_QBL, c22_H2, DR53 # (HuRef, Celera, CRA_TCAGchr7v2 grepped out above) hgsql hg18snp129 -N -B -e 'select contig_acc from ContigInfo \ where group_label in \ ("reference", "c5_H2", "c6_COX", "c6_QBL", "c22_H2");' | sort > /tmp/2 diff /tmp/1 /tmp/2 # No diff. # Make sure there are no orient != 0 contigs among those selected. hgsql hg18snp129 -NBe \ 'select count(*) from ContigInfo where orient != 0 and \ group_label in ("reference", "c5_H2", "c6_COX", "c6_QBL", "c22_H2");' #0 # ContigLoc is huge, and we want just the reference contig mappings. # So, based on the reference & haplo ctg_id values in ContigInfo, # filter to get just the mappings for those contigs: zcat ContigLoc.gz \ | awk '$3 <= 377 || $3 == 7015' \ | perl -wpe 's/(\d\d:\d\d:\d\d)\.0/$1/g;' \ | hgLoadSqlTab -oldTable hg18snp129 ContigLoc placeholder stdin foreach t (ContigLocusId SNP) zcat $t.gz \ | perl -wpe 's/(\d\d:\d\d:\d\d)\.0/$1/g;' \ | hgLoadSqlTab -oldTable hg18snp129 $t placeholder stdin end # There were some warnings (many cleared up by the perl substitution) # but no rows were dropped. 'show warnings' after a manual 'load data' # complains about missing values (OK when e.g. position is not known). foreach t (ContigInfo ContigLoc ContigLocusId MapInfo SNP) echo -n "${t}:\t" hgsql -N -B hg18snp129 -e 'select count(*) from '$t end #ContigInfo: 379 #ContigLoc: 15835019 (before filtering: 46913472) #ContigLocusId: 25496815 #MapInfo: 14845535 (before filtering: 44627804) #SNP: 14708770 #################### EXTRACT INFO FROM NCBI TABLES #################### mkdir -p /scratch/snp/129/human cd /scratch/snp/129/human time hgsql hg18snp129 -e \ 'alter table ContigLoc add index (ctg_id); \ alter table ContigInfo add index (ctg_id);' #0.002u 0.002s 2:14.79 0.0% 0+0k 0+0io 1pf+0w # was ~12m on a run without trimming ContigLoc! time hgsql hg18snp129 -e \ 'alter table ContigInfo add index (group_label(9));' #0.005u 0.000s 0:00.16 0.0% 0+0k 0+0io 1pf+0w # For joining files by shared column, we need a unique identifier in # that shared column. snp_id is not unique -- the same rsID can appear # in both the reference assembly and on one of the others e.g. c6_COX. # So concatenate the assembly identifier and snp_id to get hopefully # unique label. time hgsql hg18snp129 -NBe \ 'select concat(ContigInfo.group_label, ".", snp_id), \ ContigInfo.contig_acc, asn_from, asn_to, \ loc_type, orientation, allele, phys_pos_from \ from ContigLoc, ContigInfo \ where ContigLoc.ctg_id = ContigInfo.ctg_id and ContigInfo.group_label \ in ("reference", "c5_H2", "c6_COX", "c6_QBL", "c22_H2");' \ | sort \ > ucscContigLoc.txt # no time output because of the pipe... took 5 minutes. # Are these IDs unique? wc -l ucscContigLoc.txt #15835019 ucscContigLoc.txt awk '{print $1;}' ucscContigLoc.txt | uniq | wc -l #14791529 # Nope. 
Find non-unique IDs: awk 'prev == $1 {print;} {prev = $1;}' ucscContigLoc.txt | head grep ^c5_H2.10035195 ucscContigLoc.txt #c5_H2.10035195 NT_113801 639954 639954 2 0 G 69605321 #c5_H2.10035195 NT_113801 660407 660407 2 0 G 69625774 #c5_H2.10035195 NT_113801 911780 911780 2 1 C 69877147 #c5_H2.10035195 NT_113801 933061 933061 2 1 C 69898428 # OK, they can be duplicated within the same contig. See if we can # get by with anchoring everything to ucscContigLoc.txt. But everybody # else better have unique IDs! # SNP -> valid, avHet, avHetSE # SNP has only snp_id as identifier, nothing relating to assembly. hgsql hg18snp129 -NBe \ 'select snp_id, validation_status, avg_heterozygosity, het_se \ from SNP;' \ | sort \ > ucscSNP.txt # Check ID uniqueness: wc -l ucscSNP.txt #14708770 ucscSNP.txt awk '{print $1;}' ucscSNP.txt | uniq | wc -l #14708770 # ContigLocusId -> func # ContigLocusId has only snp_id as an identifier (it gives one # example contig if the SNP is on multiple contigs). # The sort options and awk are to convert multiple entries with different # function classes for the same SNP into one entry per SNP with a list # of function classes. hgsql hg18snp129 -NBe \ 'select snp_id, fxn_class from ContigLocusId;' \ | sort -u -k1,1 -k2,2n \ | awk '{if (prevId == $1) { prevFunc = prevFunc $2 ","; } \ else { if (prevId) {print prevId "\t" prevFunc;} \ prevFunc = $2 ","; }} \ {prevId = $1;} \ END {print prevId "\t" prevFunc;}' \ > ucscFunc.txt # Check ID uniqueness: wc -l ucscFunc.txt #6136008 ucscFunc.txt awk '{print $1;}' ucscFunc.txt | sort -u | wc -l #6136008 # MapInfo -> weight # MapInfo needs assembly+snp_ids in order to have unique IDs. time hgsql hg18snp129 -e \ 'alter table MapInfo add index (assembly(9));' #0.003u 0.003s 3:40.29 0.0% 0+0k 0+0io 1pf+0w hgsql hg18snp129 -NBe \ 'select concat(assembly, ".", snp_id), weight \ from MapInfo where assembly \ in ("reference", "c5_H2", "c6_COX", "c6_QBL", "c22_H2");' \ | sort \ > weight.txt # ~1 minute # Check ID uniqueness: wc -l weight.txt #14791529 weight.txt awk '{print $1;}' weight.txt | uniq | wc -l #14791529 awk '{print $2;}' weight.txt | sort -n | uniq -c # 40910 0 #14326127 1 # 157402 2 # 256608 3 # 10482 10 # SNPs w/weight 0 and 10 will be discarded later. # fasta headers -> observed, molType, class zcat /cluster/data/dbSNP/129/human/rs_fasta/rs_ch*.fas.gz \ | grep '^>gnl' \ | perl -wpe 's/^\S+rs(\d+) .*mol="(\w+)"\|class=(\d+)\|alleles="([^"]+)"\|build.*/$1\t$4\t$2\t$3/ || die "Parse error line $.:\n$_\n\t";' \ | sort \ > ucscGnl.txt # ~5m wc -l ucscGnl.txt #14708630 ucscGnl.txt awk '{print $1;}' ucscGnl.txt | uniq | wc -l #14708630 ############### JOIN NCBI COLUMNS TO GET UCSC SNP COLUMNS ################ # Join files by ID. Start with ContigLoc and MapInfo because they # share the concatenated assembly+snp_id IDs. time join -a 1 -e MISSING -t ' ' ucscContigLoc.txt weight.txt \ > ucscCL+w.txt #28.334u 4.730s 1:43.47 31.9% 0+0k 0+0io 0pf+0w wc -l ucscCL+w.txt #15835019 ucscCL+w.txt # Same as ucscContigLoc.txt above, good. # Any missing weights? grep MISSING ucscCL+w.txt | head # No output, good. # Join the files with SNP-only IDs. time join -e MISSING -t ' ' ucscGnl.txt ucscSNP.txt \ > ucscG+S.txt #17.375u 2.127s 0:47.40 41.1% 0+0k 0+0io 0pf+0w wc -l ucscG+S.txt #14708630 ucscG+S.txt # Same as ucscGnl.txt -- somewhat less than ucscSNP.txt (14708770)... 
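# (Without -a, join keeps only rs IDs present in both inputs, so the 140
# IDs that are in ucscSNP.txt but missing from the fasta headers are
# simply dropped here.)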
grep MISSING ucscG+S.txt | wc -l #0 time join -a 1 -e MISSING -o 1.1,1.2,1.3,1.4,1.5,1.6,1.7,2.2 \ -t ' ' ucscG+S.txt ucscFunc.txt \ > ucscG+S+F.txt #18.612u 2.334s 0:50.30 41.6% 0+0k 0+0io 0pf+0w wc -l ucscG+S+F.txt #14708630 ucscG+S+F.txt grep MISSING ucscG+S+F.txt | wc -l #8572703 # Not surprising -- ucscFunc.txt has only 6136008 lines. expr 14708630 - 6136008 #8572622 # Not an exact match like in 128, but not too far off. # Convert assembly+snp_id's to just snp_id (sorted) for final join. perl -wpe 's/^\S+\.(\d+)/$1/;' ucscCL+w.txt \ | sort > ucscCL+w.snp_id.txt awk '{print $1;}' ucscCL+w.snp_id.txt | uniq | wc -l #14626025 # Interesting... which snp_ids are missing from ContigLoc? # (note: don't use sort -n | comm, it needs alphabetical sort!) awk '{print $1;}' ucscCL+w.snp_id.txt | sort -u > /tmp/1 awk '{print $1;}' ucscGnl.txt | sort -u > /tmp/2 comm -13 /tmp/1 /tmp/2 > notInContigLoc.txt comm -23 /tmp/1 /tmp/2 > notInSNP.txt wc -l notIn*.txt # 83043 notInContigLoc.txt # 438 notInSNP.txt # notInContigLoc could simply mean that they weren't mapped, which is OK. # notInSNP is more concerning. #Not deleted!: 52789237, 55664014, 61749732, #Invalid (not retired): 63751714, 63751902 # -- sent email to snp-admin at ncbi. # Final join -- treat ContigLoc as authoritative (since it has coords). # Arrange columns in same order as in the SNP table, with extras for # checking at the end (phys_pos_from). # chr chrS chrE name strand refN obs molT cls val aH aHSE fxn locT wt ... time join -a 1 -e MISSING -t ' ' \ -o '1.2 1.3 1.4 1.1 1.6 1.7 2.2 2.3 2.4 2.5 2.6 2.7 2.8 1.5 1.9 1.8' \ ucscCL+w.snp_id.txt ucscG+S+F.txt \ > ucscNcbiSnp.ctg.txt #41.204u 6.274s 1:05.99 71.9% 0+0k 0+0io 0pf+0w wc -l ucscNcbiSnp.ctg.txt #15835019 ucscNcbiSnp.ctg.txt grep MISSING ucscNcbiSnp.ctg.txt | awk '{print $4;}' | uniq | wc -l #8495168 # Lift the map contig coordinates to chrom coordinates (~2m); time liftUp ucscNcbiSnp.bed \ /cluster/data/hg18/jkStuff/liftContigs.lft warn \ ucscNcbiSnp.ctg.txt #123.952u 7.587s 2:22.24 92.4% 0+0k 0+0io 5pf+0w wc -l ucscNcbiSnp.bed #15835019 ucscNcbiSnp.bed # At this point, move back from /scratch to /cluster/data. nice gzip ucscNcbiSnp.bed cp -p ucscNcbiSnp.bed.gz /cluster/data/dbSNP/129/human/ cp -p notIn* /cluster/data/dbSNP/129/human/ # Drum roll please... translate NCBI's encoding into UCSC's, and # perform a bunch of checks. This is where developer involvement # is most likely as NCBI extends the encodings used in dbSNP. cd /cluster/data/dbSNP/129/human/ gunzip ucscNcbiSnp.bed.gz # Re-ran this command 8/6/08 to get new snp129.sql that includes # only those enum/set values that are actually used. No other output # files changed. 
time snpNcbiToUcsc ucscNcbiSnp.bed /cluster/data/hg18/hg18.2bit \ snp129 # 8/7/08: added the awk command to unswap the molType values that # were swapped in dbSNP 129 fasta headers: # DO NOT USE THIS COMMAND NEXT TIME UNLESS NECESSARY AGAIN: awk 'BEGIN{OFS="\t";} \ {if ($8 == "genomic") {$8 = "cDNA";} \ else if ($8 == "cDNA") {$8 = "genomic";} \ print;}' ucscNcbiSnp.bed \ | snpNcbiToUcsc stdin /cluster/data/hg18/hg18.2bit snp129 #spaces stripped from observed: #chr12 5963395 5963395 rs41402545 #count of snps with weight 0 = 63507 #count of snps with weight 1 = 14375595 #count of snps with weight 2 = 325745 #count of snps with weight 3 = 924499 #count of snps with weight 10 = 145673 #Skipped 493 snp mappings due to errors -- see snp129Errors.bed #210.328u 10.793s 4:04.99 90.2% 0+0k 0+0io 0pf+0w # More skipped snps than in 128, but same reason: cut -f 5 snp129Errors.bed | sort | uniq -c # 493 Missing observed value (deleted SNP?). cut -f 4 snp129Errors.bed | sort -u | sed -e 's/^rs//' > errIds.txt comm -13 notInSNP.txt errIds.txt | wc -l #0 # So those are a subset of the notInSNP.txt ids, good. wc -l snp* # 15625346 snp129.bed # 22 snp129.sql # 493 snp129Errors.bed # 18 snp129ExceptionDesc.tab # 2673142 snp129Exceptions.bed # Make one big fasta file. # It's a monster: 16G! Can we split by hashing rsId? # NOTE FOR NEXT TIME: do this on the fileserver! zcat rs_fasta/rs_ch*.fas.gz \ | perl -wpe 's/^>gnl\|dbSNP\|(rs\d+) .*/>$1/ || ! /^>/ || die;' \ > snp129.fa # Check for duplicates. grep ^\>rs snp129.fa | sort > /scratch/tmp/seqHeaders wc -l /scratch/tmp/seqHeaders #14708630 /scratch/tmp/seqHeaders uniq /scratch/tmp/seqHeaders | wc -l #14708630 # Use hgLoadSeq to generate .tab output for sequence file offsets, # and keep only the columns that we need: acc and file_offset. # Index it and translate to snpSeq table format. time hgLoadSeq -test placeholder snp129.fa #114.516u 37.585s 3:13.58 78.5% 0+0k 0+0io 6pf+0w cut -f 2,6 seq.tab > snp129Seq.tab rm seq.tab ssh hgwdev # Load up main track tables. cd /cluster/data/dbSNP/129/human # Re-ran this command 8/6/08 to get new snp129.sql that includes # only those enum/set values that are actually used. No data values # changed. Removed -noSort because Brooke had spotted some entries # sorted by chromEnd instead of chromStart. # Re-ran 8/7/08 to pick up corrected molType column in snp129.bed. time nice hgLoadBed -tab -onServer -tmpDir=/data/tmp \ hg18 snp129 -sqlTable=snp129.sql snp129.bed #100.406u 22.673s 9:44.17 21.0% 0+0k 0+0io 0pf+0w sed -e 's/snp125/snp129/' ~/kent/src/hg/lib/snp125Exceptions.sql \ > snp129Exceptions.sql time nice hgLoadBed -tab -onServer -tmpDir=/data/tmp \ hg18 snp129Exceptions -sqlTable=snp129Exceptions.sql \ snp129Exceptions.bed #13.125u 1.383s 1:15.39 19.2% 0+0k 0+0io 0pf+0w sed -e 's/snp125/snp129/' ~/kent/src/hg/lib/snp125ExceptionDesc.sql \ > snp129ExceptionDesc.sql hgLoadSqlTab hg18 snp129ExceptionDesc snp129ExceptionDesc.sql \ snp129ExceptionDesc.tab # Load up sequences. sed -e 's/snpSeq/snp129Seq/' ~/kent/src/hg/lib/snpSeq.sql \ > snp129Seq.sql mkdir -p /gbdb/hg18/snp ln -s /cluster/data/dbSNP/129/human/snp129.fa /gbdb/hg18/snp/snp129.fa time nice hgLoadSqlTab hg18 snp129Seq snp129Seq.sql snp129Seq.tab #0.007u 0.006s 3:06.83 0.0% 0+0k 0+0io 0pf+0w # Put in a link where one would expect to find the track build dir... 
ln -s /cluster/data/dbSNP/129/human /cluster/data/hg18/bed/snp129 # Look at the breakdown of exception categories: ssh kkr3u00 cd /cluster/data/dbSNP/129/human cut -f 5 snp129Exceptions.bed | sort | uniq -c | sort -nr #1580567 MultipleAlignments # 628933 ObservedMismatch # 387233 SingleClassLongerSpan # 31425 SingleClassTriAllelic # 13247 ObservedTooLong # 11095 FlankMismatchGenomeShorter # 10365 SingleClassZeroSpan # 3345 SingleClassQuadAllelic # 3310 FlankMismatchGenomeLonger # 1397 DuplicateObserved # 1250 MixedObserved # 547 NamedDeletionZeroSpan # 296 FlankMismatchGenomeEqual # 93 ObservedContainsIupac # 35 NamedInsertionNonzeroSpan # 3 RefAlleleMismatch # 1 ObservedWrongFormat ####################################################################### # SNPMASKED SEQUENCE FOR SNP129 (DONE 7/1/08 angie) ssh kolossus mkdir /cluster/data/hg18/snp129Mask cd /cluster/data/hg18/snp129Mask # Identify rsIds with various problems -- we will exclude those. # MultipleAlignments is kinda broad because anything that maps on # both chrN and chrN_foo_hap1 will be excluded... similarly, extra # matches on chrN_random might disqualify good matches on chrN. # Well, erring on the side of caution is good. awk '$5 ~ /^MultipleAlignments|ObservedTooLong|ObservedWrongFormat|ObservedMismatch|MixedObserved$/ {print $4;}' \ /cluster/data/dbSNP/129/human/snp129Exceptions.bed \ | sort -u \ > snp129ExcludeRsIds.txt time grep -vFwf snp129ExcludeRsIds.txt \ /cluster/data/dbSNP/129/human/snp129.bed \ > snp129Cleaned.bed #154.384u 12.550s 3:09.01 88.3% 0+0k 0+0io 0pf+0w # Substitutions: mkdir substitutions snpMaskSingle snp129Cleaned.bed /cluster/data/hg18/hg18.2bit stdout \ | faSplit byname stdin substitutions/ # Also this warning about total size -- just means that some chroms # didn't have any SNPS that survived the stringent filtering. #-- 113 warnings about differing observed at same base positions #-- (113 distinct positions). saved as diffObserved.txt. #-- Spot-checking, I see a case (chr1|1476801|1476802) where two SNPs #-- should have been merged -- their flanking sequences were just from #-- diff. strands. In another case (chr9|10122961|10122962), one of #-- the mappings looks like an insertion instead of a substitution but #-- the SNP's class is single, and one genomic base is mapped. #-- IMO not serious to bother dbSNP about, they want to get on w/130. #Masked 10637395 snps in 10637306 out of 3091528550 genomic bases #/cluster/data/hg18/hg18.2bit has 3107677273 total bases, but the total number of bases in sequences for which we masked snps is 3091528550 (difference is 16148723) # Make sure that sizes are identical, first diffs are normal -> IUPAC, # and first diffs' case is preserved: foreach f (substitutions/chr*.fa) faCmp -softMask $f ../[1-9MXY]*/$f:t |& grep -v "that differ" end #(output OK) foreach f (substitutions/chr*.fa) echo $f:t:r mv $f $f:r.subst.fa gzip $f:r.subst.fa end # Insertions: mkdir insertions snpMaskAddInsertions snp129Cleaned.bed /cluster/data/hg18/hg18.2bit stdout \ | faSplit byname stdin insertions/ #Added 1617522 snps totaling 3251578 bases to 3085167749 genomic bases #/cluster/data/hg18/hg18.2bit has 3107677273 total bases, but the total number of bases in sequences for which we masked snps is 3085167749 (difference is 22509524) # Again, that just means that some chroms didn't have filtered SNPs. 
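# (The ../[1-9MXY]*/ glob in these faCmp checks appears to pick up the
# original per-chromosome fasta files in the /cluster/data/hg18/<chrom>/
# build directories, so each masked file is compared against the unmasked
# chromosome it was derived from.)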
# Make sure that all sizes have increased relative to original: foreach f (insertions/chr*.fa) faCmp -softMask $f ../[1-9MXY]*/$f:t |& grep -v "that differ" \ |& perl -we '$_=<>; \ if (/^\w+ in \S+ has (\d+) bases. \w+ in \S+ has (\d+) bases/) { \ if ($1 > $2) {print "OK: ins size $1 > $2\n";} \ else {die "ERROR: ins size $1 <= $2\n";} \ } else {die $_;}' end #(output OK) foreach f (insertions/chr*.fa) mv $f $f:r.ins.fa gzip $f:r.ins.fa end # Deletions: mkdir deletions snpMaskCutDeletions snp129Cleaned.bed /cluster/data/hg18/hg18.2bit stdout \ | faSplit byname stdin deletions/ #Cut 1046324 snps totaling 2173708 bases from 3085167749 genomic bases #/cluster/data/hg18/hg18.2bit has 3107677273 total bases, but the total number of bases in sequences for which we masked snps is 3085167749 (difference is 22509524) # Again, that just means that some chroms didn't have filtered SNPs. # Make sure that all sizes have decreased relative to original: foreach f (deletions/chr*.fa) faCmp -softMask $f ../[1-9MXY]*/$f:t |& grep -v "that differ" \ |& perl -we '$_=<>; \ if (/^\w+ in \S+ has (\d+) bases. \w+ in \S+ has (\d+) bases/) { \ if ($1 < $2) {print "OK: del size $1 < $2\n";} \ else {die "ERROR: del size $1 >= $2\n";} \ } else {die $_;}' end #(output OK) foreach f (deletions/chr*.fa) mv $f $f:r.del.fa gzip $f:r.del.fa end # Clean up and prepare for download: gzip snp129Cleaned.bed foreach d (substitutions insertions deletions) pushd $d md5sum *.gz > md5sum.txt popd end # Make a README.txt in each subdir. # Create download links on hgwdev. # NOTE: Currently we offer only the substitutions. # If we get any user requests, then maybe we can put the insertions # and deletions out there. ssh hgwdev mkdir /usr/local/apache/htdocs/goldenPath/hg18/snp129Mask ln -s /cluster/data/hg18/snp129Mask/substitutions/* \ /usr/local/apache/htdocs/goldenPath/hg18/snp129Mask/ ## If there is user demand for ins & del, then start over with an empty ## goldenPath/snp129Mask and do this: ## foreach type (substitutions insertions deletions) ## mkdir /usr/local/apache/htdocs/goldenPath/hg18/snp129Mask/$type ## ln -s /cluster/data/hg18/snp129Mask/$type/* \ ## /usr/local/apache/htdocs/goldenPath/hg18/snp129Mask/$type/ ## end ####################################################################### # ORTHOLOGOUS ALLELES IN CHIMP AND MACAQUE FOR SNP129 (DONE 7/2/08 angie) ssh kolossus mkdir /cluster/data/hg18/bed/snp129Ortho cd /cluster/data/hg18/bed/snp129Ortho # Following Heather's lead in snp126orthos, filter SNPs to to keep # only those with class=single, length=1, chrom!~random; # Exclude those with exceptions MultipleAlignments, # SingleClassTriAllelic or SingleClassQuadAllelic. # Unlike snp masking, we do not filter for weight -- don't know why. awk '$5 ~ /^MultipleAlignments|SingleClassTriAllelic|SingleClassQuadAllelic/ {print $4;}' \ /cluster/data/dbSNP/129/human/snp129Exceptions.bed \ | sort -u \ > snp129ExcludeIds.txt awk '$3-$2 == 1 && $1 !~ /_random/ && $11 == "single" {print;}' \ /cluster/data/dbSNP/129/human/snp129.bed \ | grep -vFwf snp129ExcludeIds.txt \ > snp129Simple.bed # took ~3 minutes wc -l snp129Simple.bed #10633840 snp129Simple.bed # Glom all human info that we need for the final table onto the # name, to sneak it through liftOver: rsId|chr|start|end|obs|ref|strand awk 'BEGIN{OFS="\t";} \ {print $1, $2, $3, \ $4 "|" $1 "|" $2 "|" $3 "|" $9 "|" $8 "|" $6, \ 0, $6;}' \ snp129Simple.bed > snp129ForLiftOver.bed # Map coords to chimp using liftOver. # I don't know why chimp took so much longer than macaque... 
the # chimp .over has fewer chains and fewer bytes than the macaque .over. mkdir run.liftOChimp cd run.liftOChimp mkdir split out splitFile ../snp129ForLiftOver.bed 25000 split/chunk cp /dev/null jobList foreach f (split/chunk*) echo liftOver $f \ /cluster/data/hg18/bed/liftOver/hg18ToPanTro2.over.chain.gz \ \{check out exists out/panTro2.$f:t.bed\} out/hg18.$f:t.unmapped \ >> jobList end ssh pk cd /cluster/data/hg18/bed/snp129Ortho/run.liftOChimp para make jobList #Completed: 426 of 426 jobs #CPU time in finished jobs: 83616s 1393.60m 23.23h 0.97d 0.003 y #IO & Wait Time: 1501s 25.02m 0.42h 0.02d 0.000 y #Average job time: 200s 3.33m 0.06h 0.00d #Longest running job: 0s 0.00m 0.00h 0.00d #Longest finished job: 574s 9.57m 0.16h 0.01d #Submission to last job: 939s 15.65m 0.26h 0.01d # Map coords to orangutan using liftOver. mkdir ../run.liftOPon cd ../run.liftOPon mkdir out ln -s ../run.liftOChimp/split . cp /dev/null jobList foreach f (split/chunk*) echo liftOver $f \ /cluster/data/hg18/bed/liftOver/hg18ToPonAbe2.over.chain.gz \ \{check out exists out/ponAbe2.$f:t.bed\} out/hg18.$f:t.unmapped \ >> jobList end para make jobList #Completed: 426 of 426 jobs #CPU time in finished jobs: 171875s 2864.58m 47.74h 1.99d 0.005 y #IO & Wait Time: 1767s 29.45m 0.49h 0.02d 0.000 y #Average job time: 408s 6.79m 0.11h 0.00d #Longest running job: 0s 0.00m 0.00h 0.00d #Longest finished job: 1268s 21.13m 0.35h 0.01d #Submission to last job: 1743s 29.05m 0.48h 0.02d # Map coords to macaque using liftOver. mkdir ../run.liftOMac cd ../run.liftOMac mkdir out ln -s ../run.liftOChimp/split . cp /dev/null jobList foreach f (split/chunk*) echo liftOver $f \ /cluster/data/hg18/bed/liftOver/hg18ToRheMac2.over.chain.gz \ \{check out exists out/rheMac2.$f:t.bed\} out/hg18.$f:t.unmapped \ >> jobList end para make jobList #Completed: 426 of 426 jobs #CPU time in finished jobs: 6356s 105.93m 1.77h 0.07d 0.000 y #IO & Wait Time: 1812s 30.21m 0.50h 0.02d 0.000 y #Average job time: 19s 0.32m 0.01h 0.00d #Longest running job: 0s 0.00m 0.00h 0.00d #Longest finished job: 51s 0.85m 0.01h 0.00d #Submission to last job: 221s 3.68m 0.06h 0.00d ssh kolossus cd /cluster/data/hg18/bed/snp129Ortho # Note: the formerly inlined script getOrthoSeq.pl has been checked in # as kent/src/hg/snp/snpLoad/getOrthoSeq.pl. # Concatenate the chimp results, sorting by chimp pos in order to # efficiently access 2bit sequence in getOrthoSeq. The output of # that is then sorted by the glommed human info field, so that we # can use join to combine chimp and macaque results in the next step. # Ditto for macaque and orangutan. sort -k1,1 -k2n,2n run.liftOChimp/out/panTro2.chunk*.bed \ | ~/kent/src/hg/snp/snpLoad/getOrthoSeq.pl /cluster/data/panTro2/panTro2.2bit \ | sort > panTro2.orthoGlom.txt sort -k1,1 -k2n,2n run.liftOPon/out/ponAbe2.chunk*.bed \ | ~/kent/src/hg/snp/snpLoad/getOrthoSeq.pl /cluster/data/ponAbe2/ponAbe2.2bit \ | sort > ponAbe2.orthoGlom.txt sort -k1,1 -k2n,2n run.liftOMac/out/rheMac2.chunk*.bed \ | ~/kent/src/hg/snp/snpLoad/getOrthoSeq.pl /cluster/data/rheMac2/rheMac2.2bit \ | sort > rheMac2.orthoGlom.txt # The whole pipeline takes ~5-7 minutes each. wc -l panTro2.orthoGlom.txt ponAbe2.orthoGlom.txt rheMac2.orthoGlom.txt # 9909458 panTro2.orthoGlom.txt # 9597270 ponAbe2.orthoGlom.txt # 8467866 rheMac2.orthoGlom.txt # Use the glommed name field as a key to join up chimp and macaque # allele data. Include glommed name from both files because if only # file 2 has a line for the key in 2.1, then 1.1 is empty. 
Then plop # in the orthoGlom fields from each file, which are in the same order # as the chimp and macaque columns of snp129OrthoPanTro2RheMac2. join -o '1.1 2.1 1.2 1.3 1.4 1.5 1.6 2.2 2.3 2.4 2.5 2.6' \ -a 1 -a 2 -e '?' \ panTro2.orthoGlom.txt ponAbe2.orthoGlom.txt \ | awk '{if ($1 != "?") { print $1, $3,$4,$5,$6,$7,$8,$9,$10,$11,$12; } \ else { print $2, $3,$4,$5,$6,$7,$8,$9,$10,$11,$12; }}' \ > tmp.txt join -o '1.1 2.1 1.2 1.3 1.4 1.5 1.6 1.7 1.8 1.9 1.10 1.11 2.2 2.3 2.4 2.5 2.6' \ -a 1 -a 2 -e '?' \ tmp.txt rheMac2.orthoGlom.txt \ | perl -wpe 'chomp; \ ($glom12, $glom3, $o1Chr, $o1Start, $o1End, $o1Al, $o1Strand, \ $o2Chr, $o2Start, $o2End, $o2Al, $o2Strand, \ $o3Chr, $o3Start, $o3End, $o3Al, $o3Strand) = split; \ $glomKey = ($glom12 ne "?") ? $glom12 : $glom3; \ ($rsId, $hChr, $hStart, $hEnd, $hObs, $hAl, $hStrand) = \ split(/\|/, $glomKey); \ $o1Start =~ s/^\?$/0/; $o2Start =~ s/^\?$/0/; $o3Start =~ s/^\?$/0/; \ $o1End =~ s/^\?$/0/; $o2End =~ s/^\?$/0/; $o3End =~ s/^\?$/0/; \ print join("\t", $hChr, $hStart, $hEnd, $rsId, $hObs, $hAl, $hStrand, \ $o1Chr, $o1Start, $o1End, $o1Al, $o1Strand, \ $o2Chr, $o2Start, $o2End, $o2Al, $o2Strand, \ $o3Chr, $o3Start, $o3End, $o3Al, $o3Strand) . "\n"; \ s/^.*$//;' \ | sort -k1,1 -k2n,2n > snp129OrthoPt2Pa2Rm2.bed # took ~6 minutes. wc -l snp129OrthoPt2Pa2Rm2.bed #10325827 snp129OrthoPt2Pa2Rm2.bed ssh hgwdev cd /cluster/data/hg18/bed/snp129Ortho time nice hgLoadBed -tab -onServer -tmpDir=/data/tmp -renameSqlTable \ -sqlTable=$HOME/kent/src/hg/lib/snpOrthoPanPonRhe.sql \ hg18 snp129OrthoPt2Pa2Rm2 snp129OrthoPt2Pa2Rm2.bed #Loaded 10325827 elements of size 22 #73.396u 10.864s 10:14.76 13.7% 0+0k 0+0io 0pf+0w # Cleanup on fileserver: cd /cluster/data/hg18/bed/snp129Ortho nice gzip snp129Simple.bed snp129ExcludeIds.txt snp129ForLiftOver.bed rm -r run*/split tmp.txt *.orthoGlom.txt ############################################################################ # dbSNP BUILD 130 (UPDATED 8/18/09 angie) # Originally done 5/22/09. # Functional annotations restricted by mapping position 7/7. # dbSNP corrections applied to func field 8/18. # Set up build directory mkdir -p /hive/data/outside/dbSNP/130/{human,shared} # Get field encodings -- if there are changes or additions to the # encoding of the corresponding fields, you might need to update # snpNcbiToUcsc, hgTracks, hgc and hgTrackUi (see also # hg/lib/snp125Ui.c). 
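# A quick way to notice such encoding changes (a sketch, not part of the build
# proper; assumes the build-129 downloads are still on disk, e.g. under
# /cluster/data/dbSNP/129/shared) is to diff the freshly fetched encoding
# files below against the previous build's copies:
## foreach t (LocTypeCode SnpClassCode SnpFunctionCode SnpValidationCode)
##   zcat /cluster/data/dbSNP/129/shared/$t.bcp.gz > /tmp/$t.129
##   zcat /hive/data/outside/dbSNP/130/shared/$t.bcp.gz | diff /tmp/$t.129 - \
##     && echo "$t unchanged"
## end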
cd /hive/data/outside/dbSNP/130/shared alias wg wget --timestamping set ftpSnpDb = ftp://ftp.ncbi.nih.gov/snp/organisms/human_9606/database wg $ftpSnpDb/shared_data/LocTypeCode.bcp.gz wg $ftpSnpDb/shared_data/SnpClassCode.bcp.gz wg $ftpSnpDb/shared_data/SnpFunctionCode.bcp.gz wg $ftpSnpDb/shared_data/SnpValidationCode.bcp.gz # Here is another source -- it is not as up-to-date as the above, but # our encodings (enums and sets in snp130.sql) are named more similar # to those in the 2005 ASN: # ftp://ftp.ncbi.nih.gov/snp/specs/docsum_2005.asn ########################## DOWNLOAD ############################# cd /hive/data/outside/dbSNP/130/human mkdir data schema rs_fasta # Get data from NCBI (anonymous FTP) wg ftp://ftp.ncbi.nih.gov/snp/00readme.txt cd /hive/data/outside/dbSNP/130/human/data # ContigLoc table has coords, orientation, loc_type, and refNCBI allele wg $ftpSnpDb/organism_data/b130_SNPContigLoc_36_3.bcp.gz wg $ftpSnpDb/organism_data/b130_SNPContigLocusId_36_3.bcp.gz wg $ftpSnpDb/organism_data/b130_ContigInfo_36_3.bcp.gz # MapInfo has alignment weights wg $ftpSnpDb/organism_data/b130_SNPMapInfo_36_3.bcp.gz # SNP has univar_id, validation status and heterozygosity wg $ftpSnpDb/organism_data/SNP.bcp.gz # Get schema cd /hive/data/outside/dbSNP/130/human/schema wg $ftpSnpDb/organism_schema/human_9606_table.sql.gz wg $ftpSnpDb/shared_schema/dbSNP_main_table.sql.gz # Get fasta files # using headers of fasta files for molType, class, observed cd /hive/data/outside/dbSNP/130/human/rs_fasta wg ftp://ftp.ncbi.nih.gov/snp/organisms/human_9606/rs_fasta/\*.gz # Get 1000 Genomes IDs (unfortunately not in validation field as Sol suggested) cd /hive/data/outside/dbSNP/130/human/data wg -O 1000Genomes_README ftp://ftp.ncbi.nih.gov/snp/organisms/human_9606/1000Genomes/ReadMe.txt wg ftp://ftp.ncbi.nih.gov/snp/organisms/human_9606/1000Genomes/B130_1000G_RsClusterReport.txt.gz zcat B130_1000G_RsClusterReport.txt.gz | wc -l #7512342 # Make a uniquified list of only the numeric portion of the assigned rs IDs: zcat B130_1000G_RsClusterReport.txt.gz \ | cut -d, -f 3 | sed -e 's/^rs//' \ | sort -nu > 1000GenomesRsIds.txt wc -l 1000GenomesRsIds.txt #5611085 1000GenomesRsIds.txt ########################## LOAD NCBI TABLES ############################# # Simplify names of data files -- strip version & extras to get # local canonical table names. 
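# For example, the sed expression below maps the downloaded names like so
# (illustrative; derived from the substitutions in the foreach below):
#   b130_SNPContigLoc_36_3.bcp.gz      -> ContigLoc.gz
#   b130_SNPContigLocusId_36_3.bcp.gz  -> ContigLocusId.gz
#   b130_ContigInfo_36_3.bcp.gz        -> ContigInfo.gz
#   b130_SNPMapInfo_36_3.bcp.gz        -> MapInfo.gz
#   SNP.bcp.gz                         -> SNP.gz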
cd /hive/data/outside/dbSNP/130/human/data foreach f (*.bcp.gz) set new = `echo $f \ | sed -e 's/^b130_SNP//; s/^b130_//; s/_36_3//; s/.bcp//;'` mv $f $new echo $new end cd /hive/data/outside/dbSNP/130/human/schema zcat human_9606_table.sql.gz \ | perl -we '$/ = "\nGO\n\n\n"; \ while (<>) { \ next unless /^CREATE TABLE \[(b130_(SNP)?)?(ContigInfo|ContigLoc|ContigLocusId|MapInfo|SNP)(_36_3)?\]/; \ s/b130_(SNP)?//; s/_36_3//; \ s/[\[\]]//g; s/GO\n\n/;/; s/smalldatetime/datetime/g; \ s/ON PRIMARY//g; s/COLLATE//g; s/Latin1_General_BIN//g; \ s/IDENTITY (1, 1) NOT NULL /NOT NULL AUTO_INCREMENT, PRIMARY KEY (id)/g; \ s/nvarchar/varchar/g; s/set quoted/--set quoted/g; \ s/(image|varchar\s+\(\d+\))/BLOB/g; \ print; \ }' \ > table.sql # load on hgwdev (kolossus disk almost full, no more small cluster mysql5's): hgsql '' -e 'create database hg18snp130' cd /hive/data/outside/dbSNP/130/human/schema hgsql hg18snp130 < table.sql cd ../data # Avoid wasting space by excluding mappings to non-reference contigs: foreach t (ContigInfo MapInfo) zcat $t.gz \ | egrep -vw '(Celera|HuRef|CRA_TCAGchr7v2)' \ | perl -wpe 's/(\d\d:\d\d:\d\d)\.0/$1/g;' \ | hgLoadSqlTab -oldTable hg18snp130 $t placeholder stdin end #load of ContigInfo did not go as planned: 379 record(s), 0 row(s) skipped, 88 warning(s) loading /dev/stdin # Checked ContigInfo visually, looks OK. # Compare contig list between our ctgPos and reference contigs in # ContigInfo: ssh hgwdev-10 hgsql hg18 -N -B -e '"select contig from ctgPos;"' \ | sort > /tmp/1 hgsql hg18snp130 -NBe 'select distinct(group_label) from ContigInfo' # --> reference, c5_H2, c6_COX, c6_QBL, c22_H2, DR53 # (HuRef, Celera, CRA_TCAGchr7v2 grepped out above) hgsql hg18snp130 -N -B -e 'select contig_acc from ContigInfo \ where group_label in \ ("reference", "c5_H2", "c6_COX", "c6_QBL", "c22_H2");' | sort > /tmp/2 diff /tmp/1 /tmp/2 # No diff. # Make sure there are no orient != 0 contigs among those selected. hgsql hg18snp130 -NBe \ 'select count(*) from ContigInfo where orient != 0 and \ group_label in ("reference", "c5_H2", "c6_COX", "c6_QBL", "c22_H2");' #0 # ContigLoc is huge, and we want just the reference contig mappings. # So, based on the reference & haplo ctg_id values in ContigInfo, # filter to get just the mappings for those contigs: zcat ContigLoc.gz \ | awk '$3 <= 377 || $3 == 7015' \ | perl -wpe 's/(\d\d:\d\d:\d\d)\.0/$1/g;' \ | hgLoadSqlTab -oldTable hg18snp130 ContigLoc placeholder stdin zcat SNP.gz \ | perl -wpe 's/(\d\d:\d\d:\d\d)\.0/$1/g;' \ | hgLoadSqlTab -oldTable hg18snp130 SNP placeholder stdin zcat ContigLocusId.gz \ | perl -wpe 's/(\d\d:\d\d:\d\d)\.0/$1/g;' \ | hgLoadSqlTab -oldTable hg18snp130 ContigLocusId placeholder stdin # There were some warnings (many cleared up by the perl substitution) # but no rows were dropped. In mysql5, 'show warnings' after a manual 'load data' # complains about missing values (OK when e.g. position is not known). foreach t (ContigInfo ContigLoc ContigLocusId MapInfo SNP) echo -n "${t}:\t" hgsql -N -B hg18snp130 -e 'select count(*) from '$t end #ContigInfo: 379 #ContigLoc: 19189750 #ContigLocusId: 11790054 #MapInfo: 17928700 #SNP: 17804034 #################### EXTRACT INFO FROM NCBI TABLES #################### # Glom each SNP's function codes together and load up a new hg18Snp130 table. # Also extract NCBI's annotations of coding SNPs' effects on translation. # We extract ContigLocusId info only for reference assembly mapping. 
# Some SNP's functional annotations are for an alternate assembly, so we will # have no NCBI functional annotations to display for those (but our own are # available). cd /hive/data/outside/dbSNP/130/human hgsql hg18snp130 -NBe 'select snp_id, ci.contig_acc, asn_from, asn_to, mrna_acc, \ fxn_class, reading_frame, allele, residue, codon, cli.ctg_id \ from ContigLocusId as cli, ContigInfo as ci \ where cli.ctg_id = ci.ctg_id and \ group_label in ("reference", "c5_H2", "c6_COX", "c6_QBL", "c22_H2")' \ > ncbiFuncAnnotations.txt # Ignore function code 8 (cds-reference, just means that some allele matches reference) # and glom functions for each SNP id: cut -f 1-4,6,11 ncbiFuncAnnotations.txt \ | sort -u -k1n,1n -k6n,6n -k3n,3n -k5n,5n \ | perl -we 'while (<>) { chomp; \ ($id, undef, $s, $e, $f, $c) = split; \ if (defined $prevId && $id == $prevId && $c == $prevC && $s == $prevS) { \ $prevFunc .= "$f," unless ($f == 8); \ } else { \ print "$prevId\t$prevC\t$prevS\t$prevE\t$prevFunc\n" if (defined $prevId); \ $prevFunc = ($f == 8) ? "" : "$f,"; \ } \ ($prevId, $prevC, $prevS, $prevE) = ($id, $c, $s, $e); \ } \ print "$prevId\t$prevC\t$prevS\t$prevE\t$prevFunc\n"' \ > ucscFunc.txt wc -l ucscFunc.txt #7035685 ucscFunc.txt cat > ucscFunc.sql < ncbiFuncInsertions.ctg.bed wc -l ncbiFuncInsertions.ctg.bed #1089086 ncbiFuncInsertions.ctg.bed # Extract observed allele, molType and snp class from FASTA headers gnl zcat /hive/data/outside/dbSNP/130/human/rs_fasta/rs_ch*.fas.gz \ | grep '^>gnl' \ | perl -wpe 's/^\S+rs(\d+) .*mol="(\w+)"\|class=(\d+)\|alleles="([^"]+)"\|build.*/$1\t$4\t$2\t$3/ || die "Parse error line $.:\n$_\n\t";' \ | sort -n \ > ucscGnl.txt #407.555u 57.499s 4:32.89 170.4% 0+0k 0+0io 0pf+0w wc -l ucscGnl.txt #17804034 ucscGnl.txt cut -f 1 ucscGnl.txt | uniq | wc -l #17804034 cat > ucscGnl.sql < ucscNcbiSnp.ctg.bed #on a not-so busy hgwdev: 80.735u 36.958s 8:54.76 22.0% 0+0k 0+0io 0pf+0w #on a busy hgwdev: 78.753u 41.304s 30:19.77 6.5% 0+0k 0+0io 0pf+0w #on hgwdev with giant chains loading in parallel: # 78.213u 33.826s 58:16.41 3.2% 0+0k 0+0io 0pf+0w wc -l ucscNcbiSnp.ctg.bed #19189750 ucscNcbiSnp.ctg.bed liftUp ucscNcbiSnp.bed \ /hive/data/genomes/hg18/jkStuff/liftContigs.lft warn \ ucscNcbiSnp.ctg.bed #119.644u 8.992s 2:36.67 82.1% 0+0k 0+0io 3pf+0w # Drum roll please... translate NCBI's encoding into UCSC's, and # perform a bunch of checks. This is where developer involvement # is most likely as NCBI extends the encodings used in dbSNP. cd /hive/data/outside/dbSNP/130/human/ snpNcbiToUcsc ucscNcbiSnp.bed /cluster/data/hg18/hg18.2bit \ -1000GenomesRsIds=data/1000GenomesRsIds.txt snp130 #spaces stripped from observed: #chr12 5963395 5963395 rs41402545 #Line 8106609 of ucscNcbiSnp.bed: Encountered something that doesn't fit observedMixedFormat: GCAACTTCA #count of snps with weight 0 = 74828 #count of snps with weight 1 = 17254041 #count of snps with weight 2 = 389501 #count of snps with weight 3 = 1189989 #count of snps with weight 10 = 281391 #Found no errors. #163.878u 10.302s 3:33.84 81.4% 0+0k 0+0io 0pf+0w wc -l snp* # 18833531 snp130.bed # 22 snp130.sql # 0 snp130Errors.bed # 18 snp130ExceptionDesc.tab # 2631563 snp130Exceptions.bed # More SNPs but 0 errors and a bit fewer exceptions that snp129, cool! # Make one big fasta file. # It's a monster: 18G! Can we split by hashing rsId? zcat rs_fasta/rs_ch*.fas.gz \ | perl -wpe 's/^>gnl\|dbSNP\|(rs\d+) .*/>$1/ || ! /^>/ || die;' \ > snp130.fa # Check for duplicates. 
grep ^\>rs snp130.fa | sort > /scratch/tmp/seqHeaders wc -l /scratch/tmp/seqHeaders #17804034 /scratch/tmp/seqHeaders uniq /scratch/tmp/seqHeaders | wc -l #17804034 # Use hgLoadSeq to generate .tab output for sequence file offsets, # and keep only the columns that we need: acc and file_offset. # Index it and translate to snpSeq table format. time hgLoadSeq -test placeholder snp130.fa #107.748u 24.338s 6:58.50 31.5% 0+0k 0+0io 0pf+0w cut -f 2,6 seq.tab > snp130Seq.tab rm seq.tab # Load up main track tables. cd /hive/data/outside/dbSNP/130/human time nice hgLoadBed -tab -onServer -tmpDir=/data/tmp -allowStartEqualEnd \ hg18 snp130 -sqlTable=snp130.sql snp130.bed #Loaded 18833531 elements of size 17 #114.088u 12.924s 12:54.18 16.4% 0+0k 0+0io 0pf+0w time nice hgLoadBed -tab -onServer -tmpDir=/data/tmp -allowStartEqualEnd \ hg18 snp130Exceptions -sqlTable=$HOME/kent/src/hg/lib/snp125Exceptions.sql -renameSqlTable \ snp130Exceptions.bed #15.255u 1.257s 1:11.11 23.2% 0+0k 0+0io 0pf+0w hgLoadSqlTab hg18 snp130ExceptionDesc ~/kent/src/hg/lib/snp125ExceptionDesc.sql \ snp130ExceptionDesc.tab # Load up sequences. mkdir -p /gbdb/hg18/snp ln -s /hive/data/outside/dbSNP/130/human/snp130.fa /gbdb/hg18/snp/snp130.fa time nice hgLoadSqlTab hg18 snp130Seq ~/kent/src/hg/lib/snpSeq.sql snp130Seq.tab #0.005u 0.002s 6:02.78 0.0% 0+0k 0+0io 0pf+0w # Put in a link where one would expect to find the track build dir... ln -s /hive/data/outside/dbSNP/130/human /cluster/data/hg18/bed/snp130 # Look at the breakdown of exception categories: cd /hive/data/outside/dbSNP/130/human cut -f 5 snp130Exceptions.bed | sort | uniq -c | sort -nr #1960737 MultipleAlignments # 519222 ObservedMismatch # 38444 ObservedTooLong # 32069 SingleClassTriAllelic # 26351 FlankMismatchGenomeShorter # 19089 SingleClassLongerSpan # 15441 SingleClassZeroSpan # 6583 FlankMismatchGenomeLonger # 4108 DuplicateObserved # 3627 SingleClassQuadAllelic # 3473 MixedObserved # 1369 NamedDeletionZeroSpan # 547 FlankMismatchGenomeEqual # 355 NamedInsertionNonzeroSpan # 136 ObservedContainsIupac # 8 ObservedWrongFormat # 4 RefAlleleMismatch #TODO: go through those above and send some bug reports to dbSNP. # 8/18/09: dbSNP announced a correction to some functional class # annotations (- strand mRNA -> swapped near-gene-3 and near-gene-5). cd /hive/data/outside/dbSNP/130/human # This is a list of affected rs IDs, genes, old funcs and new funcs: wget ftp://ftp.ncbi.nih.gov/snp/organisms/human_9606/database/organism_data/b130_update/b130_SNPContigLocusId_36_3_functionClass_13_15_fix.txt wc -l b130_SNPContigLocusId_36_3_functionClass_13_15_fix.txt #163147 b130_SNPContigLocusId_36_3_functionClass_13_15_fix.txt # The first 19 lines are the header. 
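# (A sketch of a sanity check before running the update: the perl below assumes
# comma-separated records whose 1st, 4th and 6th fields are the rs ID, the old
# function code and the new function code -- eyeball a few data lines first.)
## sed -n '20,25p' b130_SNPContigLocusId_36_3_functionClass_13_15_fix.txt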
# Use the info in that file to make a series of sql update commands: tail -n +20 b130_SNPContigLocusId_36_3_functionClass_13_15_fix.txt \ | perl -we '$fns[6]="intron"; $fns[13]="near-gene-3"; $fns[15]="near-gene-5"; \ $fns[41]="nonsense"; $fns[42]="missense"; \ $fns[53]="untranslated-3"; $fns[55]="untranslated-5"; \ while (<>) { \ ($rs,undef,undef,$old,undef,$new) = split(","); \ $oldF = $fns[$old]; $newF = $fns[$new]; die if (!(defined $oldF && defined $newF)); \ print "UPDATE snp130 set func=(REPLACE(func,\"$oldF\",\"$newF\")) where name=\"rs$rs\";\n"; \ }' \ > snp130.func_13_15_fix.sql wc -l snp130.func_13_15_fix.sql #163128 snp130.func_13_15_fix.sql hgsql hg18 < snp130.func_13_15_fix.sql # The number of rows changed has to be smaller because some of those replacements # are for annotations relative to a different assembly; we have func=unknown for # those. E.g. rs437678. ####################################################################### # ORTHOLOGOUS ALLELES IN CHIMP AND MACAQUE FOR SNP130 (DONE 5/26/09 angie) # Originally done 5/15; reloaded 5/26 after making sure no coords had changed, # reloaded 7/7/09 to bump timestamp mkdir /hive/data/genomes/hg18/bed/snp130Ortho cd /hive/data/genomes/hg18/bed/snp130Ortho # Following Heather's lead in snp126orthos, filter SNPs to to keep # only those with class=single, length=1, chrom!~random; # Exclude those with exceptions MultipleAlignments, # SingleClassTriAllelic or SingleClassQuadAllelic. # Unlike snp masking, we do not filter for weight -- don't know why. awk '$5 ~ /^MultipleAlignments|SingleClassTriAllelic|SingleClassQuadAllelic/ {print $4;}' \ /hive/data/outside/dbSNP/130/human/snp130Exceptions.bed \ | sort -u \ > snp130ExcludeIds.txt awk '$3-$2 == 1 && $1 !~ /_random/ && $11 == "single" {print;}' \ /hive/data/outside/dbSNP/130/human/snp130.bed \ | grep -vFwf snp130ExcludeIds.txt \ > snp130Simple.bed #182.396u 12.388s 2:10.30 149.4% 0+0k 0+0io 0pf+0w wc -l snp130Simple.bed #12141377 snp130Simple.bed # Glom all human info that we need for the final table onto the # name, to sneak it through liftOver: rsId|chr|start|end|obs|ref|strand awk 'BEGIN{OFS="\t";} \ {print $1, $2, $3, \ $4 "|" $1 "|" $2 "|" $3 "|" $9 "|" $8 "|" $6, \ 0, $6;}' \ snp130Simple.bed > snp130ForLiftOver.bed # Map coords to chimp using liftOver. # I don't know why chimp took so much longer than macaque... the # chimp .over has fewer chains and fewer bytes than the macaque .over. mkdir run.liftOChimp cd run.liftOChimp mkdir split out splitFile ../snp130ForLiftOver.bed 25000 split/chunk cp /dev/null jobList foreach f (split/chunk*) echo liftOver $f \ /hive/data/genomes/hg18/bed/liftOver/hg18ToPanTro2.over.chain.gz \ \{check out exists out/panTro2.$f:t.bed\} out/hg18.$f:t.unmapped \ >> jobList end ssh pk cd /hive/data/genomes/hg18/bed/snp130Ortho/run.liftOChimp para make jobList #Completed: 486 of 486 jobs #CPU time in finished jobs: 76679s 1277.99m 21.30h 0.89d 0.002 y #IO & Wait Time: 1828s 30.46m 0.51h 0.02d 0.000 y #Average job time: 162s 2.69m 0.04h 0.00d #Longest finished job: 486s 8.10m 0.14h 0.01d #Submission to last job: 513s 8.55m 0.14h 0.01d # Map coords to orangutan using liftOver. mkdir ../run.liftOPon cd ../run.liftOPon mkdir out ln -s ../run.liftOChimp/split . 
cp /dev/null jobList foreach f (split/chunk*) echo liftOver $f \ /hive/data/genomes/hg18/bed/liftOver/hg18ToPonAbe2.over.chain.gz \ \{check out exists out/ponAbe2.$f:t.bed\} out/hg18.$f:t.unmapped \ >> jobList end para make jobList #Completed: 486 of 486 jobs #CPU time in finished jobs: 165378s 2756.31m 45.94h 1.91d 0.005 y #IO & Wait Time: 2614s 43.56m 0.73h 0.03d 0.000 y #Average job time: 346s 5.76m 0.10h 0.00d #Longest finished job: 1017s 16.95m 0.28h 0.01d #Submission to last job: 1051s 17.52m 0.29h 0.01d # Map coords to macaque using liftOver. mkdir ../run.liftOMac cd ../run.liftOMac mkdir out ln -s ../run.liftOChimp/split . cp /dev/null jobList foreach f (split/chunk*) echo liftOver $f \ /hive/data/genomes/hg18/bed/liftOver/hg18ToRheMac2.over.chain.gz \ \{check out exists out/rheMac2.$f:t.bed\} out/hg18.$f:t.unmapped \ >> jobList end para make jobList #Completed: 486 of 486 jobs #CPU time in finished jobs: 4068s 67.80m 1.13h 0.05d 0.000 y #IO & Wait Time: 1944s 32.40m 0.54h 0.02d 0.000 y #Average job time: 12s 0.21m 0.00h 0.00d #Longest finished job: 38s 0.63m 0.01h 0.00d #Submission to last job: 126s 2.10m 0.04h 0.00d cd /hive/data/genomes/hg18/bed/snp130Ortho # Concatenate the chimp results, sorting by chimp pos in order to # efficiently access 2bit sequence in getOrthoSeq. The output of # that is then sorted by the glommed human info field, so that we # can use join to combine chimp and macaque results in the next step. # Ditto for macaque and orangutan. Each command pipe takes ~5 minutes: sort -k1,1 -k2n,2n run.liftOChimp/out/panTro2.chunk*.bed \ | ~/kent/src/hg/snp/snpLoad/getOrthoSeq.pl /cluster/data/panTro2/panTro2.2bit \ | sort > panTro2.orthoGlom.txt sort -k1,1 -k2n,2n run.liftOPon/out/ponAbe2.chunk*.bed \ | ~/kent/src/hg/snp/snpLoad/getOrthoSeq.pl /cluster/data/ponAbe2/ponAbe2.2bit \ | sort > ponAbe2.orthoGlom.txt sort -k1,1 -k2n,2n run.liftOMac/out/rheMac2.chunk*.bed \ | ~/kent/src/hg/snp/snpLoad/getOrthoSeq.pl /cluster/data/rheMac2/rheMac2.2bit \ | sort > rheMac2.orthoGlom.txt wc -l panTro2.orthoGlom.txt ponAbe2.orthoGlom.txt rheMac2.orthoGlom.txt # 11318466 panTro2.orthoGlom.txt # 10976821 ponAbe2.orthoGlom.txt # 9702063 rheMac2.orthoGlom.txt # Use the glommed name field as a key to join up chimp and macaque # allele data. Include glommed name from both files because if only # file 2 has a line for the key in 2.1, then 1.1 is empty. Then plop # in the orthoGlom fields from each file, which are in the same order # as the chimp and macaque columns of snp130OrthoPanTro2RheMac2. join -o '1.1 2.1 1.2 1.3 1.4 1.5 1.6 2.2 2.3 2.4 2.5 2.6' \ -a 1 -a 2 -e '?' \ panTro2.orthoGlom.txt ponAbe2.orthoGlom.txt \ | awk '{if ($1 != "?") { print $1, $3,$4,$5,$6,$7,$8,$9,$10,$11,$12; } \ else { print $2, $3,$4,$5,$6,$7,$8,$9,$10,$11,$12; }}' \ > tmp.txt join -o '1.1 2.1 1.2 1.3 1.4 1.5 1.6 1.7 1.8 1.9 1.10 1.11 2.2 2.3 2.4 2.5 2.6' \ -a 1 -a 2 -e '?' \ tmp.txt rheMac2.orthoGlom.txt \ | perl -wpe 'chomp; \ ($glom12, $glom3, $o1Chr, $o1Start, $o1End, $o1Al, $o1Strand, \ $o2Chr, $o2Start, $o2End, $o2Al, $o2Strand, \ $o3Chr, $o3Start, $o3End, $o3Al, $o3Strand) = split; \ $glomKey = ($glom12 ne "?") ? 
$glom12 : $glom3; \ ($rsId, $hChr, $hStart, $hEnd, $hObs, $hAl, $hStrand) = \ split(/\|/, $glomKey); \ $o1Start =~ s/^\?$/0/; $o2Start =~ s/^\?$/0/; $o3Start =~ s/^\?$/0/; \ $o1End =~ s/^\?$/0/; $o2End =~ s/^\?$/0/; $o3End =~ s/^\?$/0/; \ print join("\t", $hChr, $hStart, $hEnd, $rsId, $hObs, $hAl, $hStrand, \ $o1Chr, $o1Start, $o1End, $o1Al, $o1Strand, \ $o2Chr, $o2Start, $o2End, $o2Al, $o2Strand, \ $o3Chr, $o3Start, $o3End, $o3Al, $o3Strand) . "\n"; \ s/^.*$//;' \ | sort -k1,1 -k2n,2n > snp130OrthoPt2Pa2Rm2.bed #300.357u 31.419s 4:33.00 121.5% 0+0k 0+0io 0pf+0w wc -l snp130OrthoPt2Pa2Rm2.bed #11797184 snp130OrthoPt2Pa2Rm2.bed cd /hive/data/genomes/hg18/bed/snp130Ortho hgLoadBed -tab -onServer -tmpDir=/data/tmp -renameSqlTable \ -sqlTable=$HOME/kent/src/hg/lib/snpOrthoPanPonRhe.sql \ hg18 snp130OrthoPt2Pa2Rm2 snp130OrthoPt2Pa2Rm2.bed #Loaded 11797184 elements of size 22 #83.624u 9.627s 10:19.26 15.0% 0+0k 0+0io 0pf+0w # Cleanup fileserver: cd /hive/data/genomes/hg18/bed/snp130Ortho nice gzip snp130Simple.bed snp130ExcludeIds.txt snp130ForLiftOver.bed rm -r run*/split tmp.txt *.orthoGlom.txt ####################################################################### # DBSNP CODING ANNOTATIONS (DONE 10/12/10 angie) # Updated 10/12/10 - redone w/corrected genome coords (Redmine Track #1249) # Updated 7/7/09 - redone w/snp130, using mapping locations of dbSNP's func. annos # originally done 6/2/09 cd /hive/data/outside/dbSNP/130/human # ncbiFuncAnnotations.txt has NCBI coords: 0-based, fully closed. # For anything except an insertion (0 bases between flanks), # we need to add 1 to the end coord. For an insertion, we need # to add 1 to the start coord. Make a hash of the insertion IDs, # then look up each ID in ncbiFuncAnnotations.txt to tell which # transform to apply. # Note: sort -u with the keys below is too restrictive -- we need full line uniq. perl -we 'open($IDS, "ncbiFuncInsertions.ctg.bed") || die "ids: $!"; \ while (<$IDS>) { chomp; $ids{$_} = 1; } \ close($IDS); \ while (<>) { \ chomp; @w = split("\t"); # id, ctg, start, end, ... \ next unless ($w[7] || $w[8] || $w[9]); \ $bed4 = join("\t", $w[1], $w[2], $w[3], $w[0]); \ if (exists $ids{$bed4} && $w[3] == $w[2]+1) { \ $w[2]++; # 2-base insertions: increment start coord \ } else { \ $w[3]++; # increment end coord to get half-open \ } \ print join("\t", @w) . "\n"; \ }' ncbiFuncAnnotations.txt \ | sort -k1n,1n -k2,2 -k3n,3n -k5,5 -k6n,6n \ | uniq \ > ncbiCodingAnnotations.txt wc -l ncbiCodingAnnotations.txt #576726 ncbiCodingAnnotations.txt # How many & what kinds of function types? cut -f 6 ncbiCodingAnnotations.txt \ | sort -n | uniq -c # 107963 3 (coding-synon) # 276197 8 (cds-reference) # 4664 41 (nonsense) # 146908 42 (missense) # 40994 44 (frameshift) # Does everybody have a reference annotation? awk '$6 == 8 {print $1 "\t" $5;}' ncbiCodingAnnotations.txt | uniq > tmp1 awk '$6 != 8 {print $1 "\t" $5;}' ncbiCodingAnnotations.txt | uniq > tmp2 wc -l tmp1 tmp2 # 276113 tmp1 # 279647 tmp2 # Doh! not everybody. So hgTracks will sometimes have to process ref itself... 
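# To put a number on that (a sketch, not run as part of the build): count the
# {snp, transcript} pairs that have non-reference annotations but no
# cds-reference row:
## sort -u tmp1 > tmp1.srt ; sort -u tmp2 > tmp2.srt
## comm -13 tmp1.srt tmp2.srt | wc -l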
# Gather up multiple annotation lines into one line per {snp, gene, frame}: perl -e 'while (<>) { chomp; \ my ($rsId, $ctg, $s, $e, $txId, $fxn, $frm, $nt, $aa, $codon) = split("\t"); \ if (defined $lastRs && \ ($lastRs != $rsId || $lastCtg ne $ctg || $lastS != $s || \ $lastTx ne $txId || $lastFrm ne $frm)) { \ if (defined $refRow) { \ $fxns = "$refRow->[0],$fxns"; $nts = "$refRow->[1],$nts"; \ $aas = "$refRow->[2],$aas"; $codons = "$refRow->[3],$codons"; \ } \ print "$lastCtg\t$lastS\t$lastE\trs$lastRs\t$lastTx\t$lastFrm\t" . \ "$count\t$fxns\t$nts\t$codons\t$aas\n"; \ $refRow = undef; @rows = (); ($count, $fxns, $nts, $codons, $aas) = (); \ } \ ($lastRs, $lastCtg, $lastS, $lastE, $lastTx, $lastFrm) = \ ($rsId, $ctg, $s, $e, $txId, $frm); \ $count++; \ if ($fxn == 8) { \ $refRow = [$fxn, $nt, $aa, $codon]; \ } else { \ $fxns .= "$fxn,"; $nts .= "$nt,"; $aas .= "$aa,"; $codons .= "$codon,"; \ } \ } \ if (defined $refRow) { \ $fxns = "$refRow->[0],$fxns"; $nts = "$refRow->[1],$nts"; \ $aas = "$refRow->[2],$aas"; $codons = "$refRow->[3],$codons"; \ } \ print "$lastCtg\t$lastS\t$lastE\trs$lastRs\t$lastTx\t$lastFrm\t" . \ "$count\t$fxns\t$nts\t$codons\t$aas\n";' \ ncbiCodingAnnotations.txt \ > snp130CodingDbSnp.ctg.txt liftUp snp130CodingDbSnp.bed \ /hive/data/genomes/hg18/jkStuff/liftContigs.lft warn snp130CodingDbSnp.ctg.txt hgLoadBed hg18 snp130CodingDbSnp -sqlTable=$HOME/kent/src/hg/lib/snp125Coding.sql \ -renameSqlTable -tab -notItemRgb -allowStartEqualEnd \ snp130CodingDbSnp.bed #Loaded 279815 elements of size 11 ####################################################################### # SNPMASKED SEQUENCE FOR SNP130 (DONE 7/10/09 angie) mkdir /hive/data/genomes/hg18/snp130Mask cd /hive/data/genomes/hg18/snp130Mask # Identify rsIds with various problems -- we will exclude those. # MultipleAlignments is kinda broad because anything that maps on # both chrN and chrN_foo_hap1 will be excluded... similarly, extra # matches on chrN_random might disqualify good matches on chrN. # Well, erring on the side of caution is good. awk '$5 ~ /^MultipleAlignments|ObservedTooLong|ObservedWrongFormat|ObservedMismatch|MixedObserved$/ {print $4;}' \ /hive/data/outside/dbSNP/130/human/snp130Exceptions.bed \ | sort -u \ > snp130ExcludeRsIds.txt time grep -vFwf snp130ExcludeRsIds.txt \ /hive/data/outside/dbSNP/130/human/snp130.bed \ > snp130Cleaned.bed #185.202u 4.847s 3:22.55 93.8% 0+0k 0+0io 0pf+0w # Substitutions: mkdir substitutions snpMaskSingle snp130Cleaned.bed /hive/data/genomes/hg18/hg18.2bit stdout \ | faSplit byname stdin substitutions/ #Masked 12142171 snps in 12141860 out of 3091592211 genomic bases #/hive/data/genomes/hg18/hg18.2bit has 3107677273 total bases, but the total number of bases in sequences for which we masked snps is 3091592211 (difference is 16085062) #94.376u 16.038s 3:10.37 57.9% 0+0k 0+0io 0pf+0w # Check that 16085062 is the total #bases in sequences with nothing in snp130Cleaned: cut -f 1 snp130Cleaned.bed | uniq > /tmp/1 grep -vwf /tmp/1 ../chrom.sizes grep -vwf /tmp/1 ../chrom.sizes \ | awk 'BEGIN {TOTAL = 0 ; } {TOTAL += $2 ; } END {printf "%d\n", TOTAL ; }' #16085062 # 338 warnings about differing observed strings at same base position -- # saved as diffObserved.txt. #TODO: send list to dbSNP. 
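# For reference, the substitution masking writes the IUPAC ambiguity code for
# the observed alleles at each SNP position (e.g. Y for C/T, S for C/G, R for
# A/G), preserving the original upper/lower case -- that is what the faCmp
# "diffs" below are expected to show.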
# Make sure that sizes are identical, first diffs are normal -> IUPAC, # and first diffs' case is preserved: foreach f (substitutions/chr*.fa) faCmp -softMask $f ../[1-9MXY]*/$f:t |& grep -v "that differ" end #chr1 in substitutions/chr1.fa differs from chr1 at ../1/chr1.fa at base 491 (y != c) #chr10 in substitutions/chr10.fa differs from chr10 at ../10/chr10.fa at base 55877 (s != c) #... #(output OK -- ambiguous bases replacing [agct] at SNP positions) foreach f (substitutions/chr*.fa) echo $f:t:r mv $f $f:r.subst.fa gzip $f:r.subst.fa end # Insertions: mkdir insertions snpMaskAddInsertions snp130Cleaned.bed /hive/data/genomes/hg18/hg18.2bit stdout \ | faSplit byname stdin insertions/ #Added 2464798 snps totaling 5891837 bases to 3085167749 genomic bases #/hive/data/genomes/hg18/hg18.2bit has 3107677273 total bases, but the total number of bases in sequences for which we masked snps is 3085167749 (difference is 22509524) #99.269u 17.928s 3:31.80 55.3% 0+0k 0+0io 1pf+0w # Again, that just means that some chroms didn't have filtered SNPs. # Make sure that all sizes have increased relative to original: foreach f (insertions/chr*.fa) faCmp -softMask $f ../[1-9MXY]*/$f:t |& grep -v "that differ" \ |& perl -we '$_=<>; \ if (/^\w+ in \S+ has (\d+) bases. \w+ in \S+ has (\d+) bases/) { \ if ($1 > $2) {print "OK: ins size $1 > $2\n";} \ else {die "ERROR: ins size $1 <= $2\n";} \ } else {die $_;}' end #OK: ins size 247711739 > 247249719 #OK: ins size 135642480 > 135374737 #... #(output OK -- new sizes > old) foreach f (insertions/chr*.fa) mv $f $f:r.ins.fa gzip $f:r.ins.fa end # Deletions: mkdir deletions snpMaskCutDeletions snp130Cleaned.bed /hive/data/genomes/hg18/hg18.2bit stdout \ | faSplit byname stdin deletions/ #Cut 1514798 snps totaling 3554896 bases from 3086962619 genomic bases #/hive/data/genomes/hg18/hg18.2bit has 3107677273 total bases, but the total number of bases in sequences for which we masked snps is 3086962619 (difference is 20714654) #103.312u 31.094s 3:56.12 56.9% 0+0k 0+0io 1pf+0w # Again, that just means that some chroms didn't have filtered SNPs. # Make sure that all sizes have decreased relative to original: foreach f (deletions/chr*.fa) faCmp -softMask $f ../[1-9MXY]*/$f:t |& grep -v "that differ" \ |& perl -we '$_=<>; \ if (/^\w+ in \S+ has (\d+) bases. \w+ in \S+ has (\d+) bases/) { \ if ($1 < $2) {print "OK: del size $1 < $2\n";} \ else {die "ERROR: del size $1 >= $2\n";} \ } else {die $_;}' end #OK: del size 246960459 < 247249719 #OK: del size 135214654 < 135374737 #... #(output OK -- del sizes < old) foreach f (deletions/chr*.fa) mv $f $f:r.del.fa gzip $f:r.del.fa end # Clean up and prepare for download: gzip snp130Cleaned.bed foreach d (substitutions insertions deletions) pushd $d md5sum *.gz > md5sum.txt cp ../../snp129Mask/$d/README.txt . popd end # Edit the README.txt in each subdir. # Create download links on hgwdev. # NOTE: Currently we offer only the substitutions. # If we get any user requests, then maybe we can put the insertions # and deletions out there. 
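# Optional sanity check before creating the download links (a sketch): confirm
# that each subdir's md5sum.txt still matches its *.gz files.
## foreach d (substitutions insertions deletions)
##   ( cd $d ; md5sum -c md5sum.txt )
## end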
mkdir /usr/local/apache/htdocs/goldenPath/hg18/snp130Mask
ln -s /hive/data/genomes/hg18/snp130Mask/substitutions/* \
  /usr/local/apache/htdocs/goldenPath/hg18/snp130Mask/
## If there is user demand for ins & del, then start over with an empty
## goldenPath/snp130Mask and do this:
## foreach type (substitutions insertions deletions)
##   mkdir /usr/local/apache/htdocs/goldenPath/hg18/snp130Mask/$type
##   ln -s /hive/data/genomes/hg18/snp130Mask/$type/* \
##     /usr/local/apache/htdocs/goldenPath/hg18/snp130Mask/$type/
## end
############################################################################
# TRANSMAP vertebrate.2008-06-07 build (2008-06-30 markd)

vertebrate-wide transMap alignments were built.  Tracks are created and
loaded by a single Makefile.  This is available from:
   svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2008-06-30

see doc/builds.txt for specific details.
############################################################################
############################################################################
# Nuclear Lamina (2008-06-16 mikep)
# "Domain organization of human chromosomes revealed by mapping of nuclear lamina interactions"
# We received these files from authors of Guelen et al. Nature 2008
# doi:10.1038/nature06947
ssh hgwdev
mkdir /cluster/data/hg18/bed/nuclearLamina
cd /cluster/data/hg18/bed/nuclearLamina/
mv /var/ftp/encode/LADs_080513.bed.bz2 .
mv /var/ftp/encode/LaminB1_080513.wig.bz2 .
mv /var/ftp/encode/LaminB1_LAD.md5sum .
# to check the md5sum we need to unzip it to its original name,
# done on the NFS host for this directory
df -h .
# Filesystem            Size  Used Avail Use% Mounted on
# kkstore02-10:/export/cluster/store11
#                       1.8T  1.7T   94G  95% /cluster/store11
ssh kkstore02-10
cd /cluster/data/hg18/bed/nuclearLamina/
# check they are not too big to unzip, look ok
ll -h L*bz2
# -rw-r--r--  1 mikep protein  13K Jun 10 00:58 LADs_080513.bed.bz2
# -rw-r--r--  1 mikep protein  16M Jun 10 01:02 LaminB1_080513.wig.bz2
bunzip2 -dk L*bz2
md5sum -c LaminB1_LAD.md5sum
# all ok
# LADs_080513.bed: OK
# LaminB1_080513.wig: OK
# Description files were received via email and copied directly to this dir.
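# A quick look at what was delivered (the .bed file starts with a two-line
# custom-track header, which is why it is loaded with "tail +3" below):
## head -3 LADs_080513.bed
## head -5 LaminB1_080513.wig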
# Needed to convert from mac to unix due to ^M chars: mac2unix L*.html # Checked files looked OK, needed to remove HTML tags such as: DOCTYPE vi L*.html # Now find the min/max/avg range of values from the wiggle file egrep "^[0-9]" LaminB1_080513.wig |ave -col=2 stdin # Q1 -0.509000 # median -0.000000 # Q3 0.514000 # average -0.041192 # min -6.602000 # max 5.678000 # count 2909178 # total -119833.701411 # standard deviation 1.037038 # Now load the tracks on hgwdev ssh hgwdev cd /cluster/data/hg18/bed/nuclearLamina/ # First two lines are custom track header tail +3 LADs_080513.bed | hgLoadBed hg18 laminB1Lads stdin # wigEncode the .wig and .wib files from the supplied wig ascii file, and symlink the .wib file from /gbdb wigEncode LaminB1_080513.wig laminB1.wig laminB1.wib ln -s /cluster/data/hg18/bed/nuclearLamina/laminB1.wib /gbdb/hg18/wib/ # Converted LaminB1_080513.wig, upper limit 5.68, lower limit -6.60 hgLoadWiggle hg18 laminB1 laminB1.wig rm bed.tab wiggle.tab ## Create the track definitions in hg18, copy them over, (these are my paths) and do make ## Make entries for: bed = "track laminB1Lads" wiggle = "track laminB1" ssh hgwdev # vi /cluster/home/mikep/kent/src/hg/makeDb/trackDb/human/hg18/trackDb.ra # cp /cluster/data/hg18/bed/nuclearLamina/laminB1.html /cluster/home/mikep/kent/src/hg/makeDb/trackDb/human/hg18/ # cp /cluster/data/hg18/bed/nuclearLamina/laminB1Lads.html /cluster/home/mikep/kent/src/hg/makeDb/trackDb/human/hg18/ # cp /cluster/data/hg18/bed/nuclearLamina/laminB1Super.html /cluster/home/mikep/kent/src/hg/makeDb/trackDb/human/hg18/ # cp /cluster/data/hg18/bed/nuclearLamina/laminB1Super.gif /cluster/home/mikep/browser/images/ # cd /cluster/home/mikep/kent/src/hg/makeDb/trackDb # make # Add wig ascii track (+readme) to goldenPath so it can be downloaded mkdir /data/apache/htdocs/goldenPath/hg18/nuclearLamina cp /cluster/data/hg18/bed/nuclearLamina/LaminB1_080513.wig.bz2 /data/apache/htdocs/goldenPath/hg18/nuclearLamina/hg18.laminB1.txt.bz2 cp /cluster/data/hg18/bed/nuclearLamina/goldenPath.README.txt /data/apache/htdocs/goldenPath/hg18/nuclearLamina/README.txt # Add both tracks to all.joiner under section: tablesIgnored $hg ############################################################################ ##### Positively Selected Genes (Pos Sel Genes) (braney - DONE - 2008-07-07) # get SQL data (mammalPsq.sql) from Adam Siepel # and Tomas Vinar (acs4@cornell.edu) hgsql hg18 < mammalPsg.sql echo "alter table mammalPsg add index (chrom(7));" | hgsql hg18 #################################################################### # UPDATE UNIGENE/SAGE TRACK (DONE - 2008-08-09 Fan) # Create the uniGene alignments # Download of the latest UniGene version is now automated by a # cron job -- see /cluster/home/angie/crontab , # /cluster/home/angie/unigeneVers/unigene.csh . # If hgwdev gets rebooted, that needs to be restarted... maybe there's # a more stable place to set up that cron job. ssh hgwdev cd /cluster/store11/gs.19/build36/bed cd uniGene mkdir old mv * old set Version = 214 zcat /cluster/store7/uniGene/uniGene.$Version/Hs.seq.uniq.gz|\ sed -e "s#>.*/ug=#>#; s# /len.*##;" > Hs.seq.uniq.simpleHeader.fa ssh pk set Version = 214 mv /san/sanvol1/scratch/hg18/uniGene /san/sanvol1/scratch/hg18/uniGene.old mkdir /san/sanvol1/scratch/hg18/uniGene/ cd /san/sanvol1/scratch/hg18/uniGene/ cp -p /cluster/store11/gs.19/build36/bed/uniGene/Hs.seq.uniq.simpleHeader.fa . 
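# Spot-check that the simplified headers really are bare UniGene cluster IDs
# before clustering (the sed above should have reduced each header to
# something like ">Hs.NNNNN"; exact original header layout may vary by
# UniGene release):
## head -1 Hs.seq.uniq.simpleHeader.fa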
ls -1 /san/sanvol1/scratch/hg18/nib/*.nib > genome.lst
ls -1S \
  /cluster/store11/gs.19/build36/bed/uniGene/Hs.seq.uniq.simpleHeader.fa \
  > uniGene.lst
cat << '_EOF_' > template.sub
#LOOP
/cluster/bin/x86_64/blat -repeats=lower -minIdentity=95 -ooc=/san/sanvol1/scratch/hg18/11.ooc $(path1) $(path2) {check out line+ psl/$(root1)_$(root2).psl}
#ENDLOOP
'_EOF_'
gensub2 genome.lst uniGene.lst template.sub para.spec
para create para.spec
mkdir psl
para try
para check
para push
# Completed: 49 of 49 jobs
# CPU time in finished jobs: 59778s 996.30m 16.60h 0.69d 0.002 y
# IO & Wait Time: 208s 3.47m 0.06h 0.00d 0.000 y
# Average job time: 1224s 20.40m 0.34h 0.01d
# Longest finished job: 4549s 75.82m 1.26h 0.05d
# Submission to last job: 4653s 77.55m 1.29h 0.05d
# Estimated complete: 0s 0.00m 0.00h 0.00d
pslSort dirs raw.psl tmp psl >& pslSort.log
cat raw.psl |\
  pslReps -minCover=0.2 -sizeMatters -minAli=0.965 -nearTop=0.002 \
  stdin hg18.uniGene.pslReps.psl /dev/null
# Processed 553470 alignments
gzip raw.psl
gzip Hs.seq.uniq.simpleHeader.fa
ssh hgwdev
cd /cluster/store11/gs.19/build36/bed/uniGene
cp -p /san/sanvol1/scratch/hg18/uniGene/hg18.uniGene.pslReps.psl .
hgLoadPsl -table=uniGene_3 hg18 hg18.uniGene.pslReps.psl
# load the sequence with -replace option
hgLoadSeq -replace hg18 /gbdb/hg18/uniGene/Hs.seq.uniq.simpleHeader.fa
#############################################################################
# BLASTZ/CHAIN/NET dipOrd1 (DONE - 2008-10-22 - Hiram)
screen # use screen to control this multi-day job
mkdir /hive/data/genomes/hg18/bed/blastzDipOrd1.2008-10-21
cd /hive/data/genomes/hg18/bed/blastzDipOrd1.2008-10-21
cat << '_EOF_' > DEF
# Human vs. Kangaroo rat
BLASTZ_M=50
BLASTZ=lastz
# TARGET: Human Hg18
SEQ1_DIR=/scratch/data/hg18/nib
SEQ1_LEN=/scratch/data/hg18/chrom.sizes
SEQ1_CHUNK=200000000
SEQ1_LAP=10000
# QUERY: Kangaroo rat
SEQ2_DIR=/scratch/data/dipOrd1/dipOrd1.2bit
SEQ2_LEN=/scratch/data/dipOrd1/chrom.sizes
SEQ2_CHUNK=20000000
SEQ2_LIMIT=50
SEQ2_LAP=0
BASE=/hive/data/genomes/hg18/bed/blastzDipOrd1.2008-10-21
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
time doBlastzChainNet.pl -verbose=2 \
  `pwd`/DEF \
  -bigClusterHub=swarm \
  -chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 &
# real 881m33.829s
cat fb.hg18.chainDipOrd1Link.txt
# 786126212 bases of 2881515245 (27.282%) in intersection
# slight difficulty with the makeMd5sum.csh script, fixed in the source
# and completed the copy of the liftOver file, then continuing,
# with -syntenicNet:
time doBlastzChainNet.pl -verbose=2 \
  `pwd`/DEF \
  -syntenicNet -continue=cleanup -bigClusterHub=swarm \
  -chainMinScore=3000 -chainLinearGap=medium > synNet.log 2>&1 &
# real 86m15.646s
cd /cluster/data/hg18/bed/blastzDipOrd1.2008-10-21
time nice -n +19 doRecipBest.pl hg18 dipOrd1 > rbest.log 2>&1 &
# real 327m0.719s
#############################################################################
# BLASTZ/CHAIN/NET pteVam1 (DONE - 2008-10-21,29 - Hiram)
screen # use screen to control this multi-day job
mkdir /hive/data/genomes/hg18/bed/blastzPteVam1.2008-10-21
cd /hive/data/genomes/hg18/bed/blastzPteVam1.2008-10-21
cat << '_EOF_' > DEF
# Human vs.
Megabat BLASTZ_M=50 BLASTZ=lastz # TARGET: Human Hg18 SEQ1_DIR=/scratch/data/hg18/nib SEQ1_LEN=/scratch/data/hg18/chrom.sizes SEQ1_CHUNK=200000000 SEQ1_LAP=10000 # QUERY: Megabat SEQ2_DIR=/scratch/data/pteVam1/pteVam1.2bit SEQ2_LEN=/scratch/data/pteVam1/chrom.sizes SEQ2_CHUNK=20000000 SEQ2_LIMIT=50 SEQ2_LAP=0 BASE=/hive/data/genomes/hg18/bed/blastzPteVam1.2008-10-21 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs time doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -bigClusterHub=pk \ -chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 & # real 595m14.168s # some crashed jobs, finish the batch on pk manually, then, continuing: time doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -continue=cat -bigClusterHub=pk \ -chainMinScore=3000 -chainLinearGap=medium > cat.log 2>&1 & # real 151m54.924s cat fb.hg18.chainPteVam1Link.txt # 1311133709 bases of 2881515245 (45.502%) in intersection cd /cluster/data/hg18/bed/blastzPteVam1.2008-10-21 time nice -n +19 doRecipBest.pl hg18 pteVam1 > rbest.log 2>&1 & # finish manually due to problems: # real 286m25.330s doRecipBest.pl -continue=download hg18 pteVam1 > rbestDownload.log 2>&1 ############################################################################# # BLASTZ/CHAIN/NET turTru1 (DONE - 2008-10-22 - Hiram) screen # use screen to control this multi-day job mkdir /hive/data/genomes/hg18/bed/blastzTurTru1.2008-10-21 cd /hive/data/genomes/hg18/bed/blastzTurTru1.2008-10-21 cat << '_EOF_' > DEF # Human vs. Dolphin BLASTZ_M=50 BLASTZ=lastz # TARGET: Human Hg18 SEQ1_DIR=/scratch/data/hg18/nib SEQ1_LEN=/scratch/data/hg18/chrom.sizes SEQ1_CHUNK=200000000 SEQ1_LAP=10000 # QUERY: Dolphin SEQ2_DIR=/scratch/data/turTru1/turTru1.2bit SEQ2_LEN=/scratch/data/turTru1/chrom.sizes SEQ2_CHUNK=20000000 SEQ2_LIMIT=50 SEQ2_LAP=0 BASE=/hive/data/genomes/hg18/bed/blastzTurTru1.2008-10-21 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs time doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 & # real 702m54.490s cat fb.hg18.chainTurTru1Link.txt # 1398587431 bases of 2881515245 (48.537%) in intersection # slight difficulty with the makeMd5sum.csh script, fixed in the source # and completed the copy of the liftOver file, then continuing, # with -syntenicNet: cd /cluster/data/hg18/bed/blastzTurTru1.2008-10-21 time doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -syntenicNet -continue=cleanup -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > synNet.log 2>&1 & # real 74m4.276s time nice -n +19 doRecipBest.pl hg18 turTru1 > rbest.log 2>&1 & # real 275m19.714s ############################################################################# # BLASTZ/CHAIN/NET tarSyr1 (DONE - 2008-10-21,29 - Hiram) screen # use screen to control this multi-day job mkdir /hive/data/genomes/hg18/bed/blastzTarSyr1.2008-10-21 cd /hive/data/genomes/hg18/bed/blastzTarSyr1.2008-10-21 cat << '_EOF_' > DEF # Human vs. 
Tarsier BLASTZ_M=50 BLASTZ=lastz # TARGET: Human Hg18 SEQ1_DIR=/scratch/data/hg18/nib SEQ1_LEN=/scratch/data/hg18/chrom.sizes SEQ1_CHUNK=200000000 SEQ1_LAP=10000 # QUERY: Tarsier SEQ2_DIR=/scratch/data/tarSyr1/tarSyr1.2bit SEQ2_LEN=/scratch/data/tarSyr1/chrom.sizes SEQ2_CHUNK=20000000 SEQ2_LIMIT=50 SEQ2_LAP=0 BASE=/hive/data/genomes/hg18/bed/blastzTarSyr1.2008-10-21 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs time doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -bigClusterHub=pk \ -chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 & # real 1518m42.776s # recovered the batch on pk, then continuing: time doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -continue=cat -syntenicNet -bigClusterHub=pk \ -chainMinScore=3000 -chainLinearGap=medium > cat.log 2>&1 & # real 526m45.582s cat fb.hg18.chainTarSyr1Link.txt # 1383104827 bases of 2881515245 (47.999%) in intersection cd /cluster/data/hg18/bed/blastzTarSyr1.2008-10-21 time nice -n +19 doRecipBest.pl hg18 tarSyr1 > rbest.log 2>&1 & # failed, finishing manually # real 155m48.855s doRecipBest.pl -continue=download hg18 tarSyr1 > rbest.log 2>&1 ############################################################################# # BLASTZ/CHAIN/NET proCap1 (DONE - 2008-10-22,29 - Hiram) screen # use screen to control this multi-day job mkdir /hive/data/genomes/hg18/bed/blastzProCap1.2008-10-22 cd /hive/data/genomes/hg18/bed/blastzProCap1.2008-10-22 cat << '_EOF_' > DEF # Human vs. Rock Hyrax BLASTZ_M=50 BLASTZ=lastz # TARGET: Human Hg18 SEQ1_DIR=/scratch/data/hg18/nib SEQ1_LEN=/scratch/data/hg18/chrom.sizes SEQ1_CHUNK=200000000 SEQ1_LAP=10000 # QUERY: Rock Hyrax SEQ2_DIR=/scratch/data/proCap1/proCap1.2bit SEQ2_LEN=/scratch/data/proCap1/chrom.sizes SEQ2_CHUNK=20000000 SEQ2_LIMIT=100 SEQ2_LAP=0 BASE=/hive/data/genomes/hg18/bed/blastzProCap1.2008-10-22 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs time doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 & # real 1654m44.904s # finish lastz batch manually after script difficulties, then continuing: time doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -continue=cat -syntenicNet -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > cat.log 2>&1 & # real 227m41.045s cat fb.hg18.chainProCap1Link.txt # 891406629 bases of 2881515245 (30.935%) in intersection cd /cluster/data/hg18/bed/blastzProCap1.2008-10-22 time nice -n +19 doRecipBest.pl hg18 proCap1 > rbest.log 2>&1 & # real 232m9.789s # failed # running the last couple of commands to finish this off # real 561m51.171s doRecipBest.pl -continue=download hg18 proCap1 > rbestDownload.log 2>&1 ############################################################################# # BLASTZ/CHAIN/NET choHof1 (DONE - 2008-10-22,28 - Hiram) screen # use screen to control this multi-day job mkdir /hive/data/genomes/hg18/bed/blastzChoHof1.2008-10-22 cd /hive/data/genomes/hg18/bed/blastzChoHof1.2008-10-22 cat << '_EOF_' > DEF # Human vs. 
Sloth BLASTZ_M=50 BLASTZ=lastz # TARGET: Human Hg18 SEQ1_DIR=/scratch/data/hg18/nib SEQ1_LEN=/scratch/data/hg18/chrom.sizes SEQ1_CHUNK=200000000 SEQ1_LAP=10000 # QUERY: Sloth SEQ2_DIR=/scratch/data/choHof1/choHof1.2bit SEQ2_LEN=/scratch/data/choHof1/chrom.sizes SEQ2_CHUNK=20000000 SEQ2_LIMIT=100 SEQ2_LAP=0 BASE=/hive/data/genomes/hg18/bed/blastzChoHof1.2008-10-22 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs time doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 & # real 1649m6.606s # finish lastz batch manually after script difficulties, then continuing: time doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -continue=cat -syntenicNet -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > cat.log 2>&1 & # real 276m1.827s cat fb.hg18.chainChoHof1Link.txt # 993065598 bases of 2881515245 (34.463%) in intersection cd /cluster/data/hg18/bed/blastz.choHof1.2008-10-22 time nice -n +19 doRecipBest.pl hg18 choHof1 > rbest.log 2>&1 & # real 900m50.222s ############################################################################# # BLASTZ/CHAIN/NET dasNov2 (DONE - 2008-10-22,29 - Hiram) screen # use screen to control this multi-day job mkdir /hive/data/genomes/hg18/bed/blastzDasNov2.2008-10-22 cd /hive/data/genomes/hg18/bed/blastzDasNov2.2008-10-22 cat << '_EOF_' > DEF # Human vs. Armadillo BLASTZ_M=50 BLASTZ=lastz # TARGET: Human Hg18 SEQ1_DIR=/scratch/data/hg18/nib SEQ1_LEN=/scratch/data/hg18/chrom.sizes SEQ1_CHUNK=200000000 SEQ1_LAP=10000 # QUERY: Armadillo SEQ2_DIR=/scratch/data/dasNov2/dasNov2.2bit SEQ2_LEN=/scratch/data/dasNov2/chrom.sizes SEQ2_CHUNK=20000000 SEQ2_LIMIT=100 SEQ2_LAP=0 BASE=/hive/data/genomes/hg18/bed/blastzDasNov2.2008-10-22 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs time doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 & # real 1664m4.331s # finish this batch manually after some code troubles, then: time doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -continue=cat -syntenicNet -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > cat.log 2>&1 & # real 230m4.513s # something broke during chainSplit, try that manuallyo nice -n +19 chainSplit \ /hive/data/genomes/hg18/bed/blastzDasNov2.2008-10-22/axtChain/chain \ /hive/data/genomes/hg18/bed/blastzDasNov2.2008-10-22/axtChain/hg18.dasNov2.all.chain.gz # no problem with that, continuing: time doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -continue=net -syntenicNet -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > net.log 2>&1 & # real 206m54.072s cd /cluster/data/hg18/bed/blastzDasNov2.2008-10-22 time nice -n +19 doRecipBest.pl hg18 dasNov2 > rbest.log 2>&1 & # failed, finishing manually: # real 680m1.703s # the following takes an instant: doRecipBest.pl -continue=download hg18 dasNov2 \ > rbestDownload.log 2>&1 & ############################################################################# # BLASTZ/CHAIN/NET loxAfr2 (DONE - 2008-10-22,29 - Hiram) screen # use screen to control this multi-day job mkdir /hive/data/genomes/hg18/bed/blastzLoxAfr2.2008-10-22 cd /hive/data/genomes/hg18/bed/blastzLoxAfr2.2008-10-22 cat << '_EOF_' > DEF # Human vs. 
Elephant BLASTZ_M=50 BLASTZ=lastz # TARGET: Human Hg18 SEQ1_DIR=/scratch/data/hg18/nib SEQ1_LEN=/scratch/data/hg18/chrom.sizes SEQ1_CHUNK=200000000 SEQ1_LAP=10000 # QUERY: Elephant SEQ2_DIR=/scratch/data/loxAfr2/loxAfr2.2bit SEQ2_LEN=/scratch/data/loxAfr2/chrom.sizes SEQ2_CHUNK=20000000 SEQ2_LIMIT=100 SEQ2_LAP=0 BASE=/hive/data/genomes/hg18/bed/blastzLoxAfr2.2008-10-22 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs time doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 & # 1580m26.439s # problems with batch do to scriping errors, finishing the batch # manually time doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -continue=cat -syntenicNet -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > cat.log 2>&1 & # real 264m46.272s cat fb.hg18.chainLoxAfr2Link.txt # 1014404239 bases of 2881515245 (35.204%) in intersection cd /cluster/data/hg18/bed/blastz.loxAfr2.2008-10-22 time nice -n +19 doRecipBest.pl hg18 loxAfr2 > rbest.log 2>&1 & # real 622m17.655s ############################################################################# # BLASTZ/CHAIN/NET vicPac1 (DONE - 2008-10-28,29 - Hiram) screen # use screen to control this multi-day job mkdir /hive/data/genomes/hg18/bed/blastzVicPac1.2008-10-28 cd /hive/data/genomes/hg18/bed/blastzVicPac1.2008-10-28 cat << '_EOF_' > DEF # Human vs. Alpaca BLASTZ_M=50 BLASTZ=lastz # TARGET: Human Hg18 SEQ1_DIR=/scratch/data/hg18/nib SEQ1_LEN=/scratch/data/hg18/chrom.sizes SEQ1_CHUNK=200000000 SEQ1_LAP=10000 # QUERY: Alpaca SEQ2_DIR=/scratch/data/vicPac1/vicPac1.2bit SEQ2_LEN=/scratch/data/vicPac1/chrom.sizes SEQ2_CHUNK=20000000 SEQ2_LIMIT=100 SEQ2_LAP=0 BASE=/hive/data/genomes/hg18/bed/blastzVicPac1.2008-10-28 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs time doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -bigClusterHub=swarm -syntenicNet \ -chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 & # real 488m36.288s cat fb.hg18.chainVicPac1Link.txt # 1139088501 bases of 2881515245 (39.531%) in intersection cd /cluster/data/hg18/bed/blastzVicPac1.2008-10-28 time nice -n +19 doRecipBest.pl hg18 vicPac1 > rbest.log 2>&1 & # real 380m17.963s ############################################################################# # BLASTZ/CHAIN/NET Gorilla gorGor1 (DONE - 2008-11-04,05 - Hiram) screen # use screen to control this multi-day job mkdir /hive/data/genomes/hg18/bed/blastzGorGor1.2008-11-04 cd /hive/data/genomes/hg18/bed/blastzGorGor1.2008-11-04 cat << '_EOF_' > DEF # Human vs. 
Alpaca BLASTZ_M=50 BLASTZ=lastz # TARGET: Human Hg18 SEQ1_DIR=/scratch/data/hg18/nib SEQ1_LEN=/scratch/data/hg18/chrom.sizes SEQ1_CHUNK=100000000 SEQ1_LAP=10000 # QUERY: Alpaca SEQ2_DIR=/scratch/data/gorGor1/gorGor1.2bit SEQ2_LEN=/scratch/data/gorGor1/chrom.sizes SEQ2_CHUNK=20000000 SEQ2_LIMIT=284 SEQ2_LAP=0 BASE=/hive/data/genomes/hg18/bed/blastzGorGor1.2008-11-04 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs time doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -bigClusterHub=swarm -syntenicNet \ -chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 & # real 644m45.816s cat fb.hg18.chainGorGor1Link.txt # 1778801556 bases of 2881515245 (61.731%) in intersection cd /cluster/data/hg18/bed/blastzGorGor1.2008-11-04 time nice -n +19 doRecipBest.pl hg18 gorGor1 > rbest.log 2>&1 & # real 171m42.585s # failed, need to finish manually cd /hive/data/genomes/hg18/bed/blastz.gorGor1/axtChain # alter the doRecipBest.csh script to finiRecipBest.csh and run: time ./finiRecipBest.csh > finiRecipBest.log 2>&1 # real 1177m37.534s # then, continuing: doRecipBest.pl -continue=download hg18 gorGor1 ############################################################################# # BLASTZ/CHAIN/NET ochPri2 (DONE braney 2008-07-30) ssh kkstore02 screen # use screen to control this multi-day job mkdir /cluster/data/hg18/bed/blastz.ochPri2.2008-07-29 cd /cluster/data/hg18/bed/blastz.ochPri2.2008-07-29 cat << _EOF_ > DEF # Human vs. Pika BLASTZ_M=50 BLASTZ=/cluster/home/braney/bin/x86_64/lastz # TARGET: Human Hg18 SEQ1_DIR=/scratch/data/hg18/nib SEQ1_LEN=/cluster/data/hg18/chrom.sizes SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY: Pika SEQ2_DIR=/san/sanvol1/scratch/ochPri2/ochPri2.2bit SEQ2_LEN=/san/sanvol1/scratch/ochPri2/chrom.sizes SEQ2_CHUNK=20000000 SEQ2_LIMIT=100 SEQ2_LAP=0 BASE=/cluster/data/hg18/bed/blastz.ochPri2.2008-07-29 TMPDIR=/scratch/tmp _EOF_ # << happy emacs time doBlastzChainNet.pl `pwd`/DEF \ -verbose=2 -bigClusterHub=pk \ -chainMinScore=3000 -chainLinearGap=medium \ -blastzOutRoot /cluster/bluearc/ochPri2/blastz.hg18 > do.log 2>&1 & # Completed: 654120 of 654120 jobs # CPU time in finished jobs: 14082913s 234715.22m 3911.92h 163.00d 0.447 y # IO & Wait Time: 2257180s 37619.67m 626.99h 26.12d 0.072 y # Average job time: 25s 0.42m 0.01h 0.00d # Longest finished job: 292s 4.87m 0.08h 0.00d # Submission to last job: 59396s 989.93m 16.50h 0.69d nice doBlastzChainNet.pl `pwd`/DEF \ -verbose=2 -bigClusterHub=pk -syntenicNet -continue=cat \ -chainMinScore=3000 -chainLinearGap=medium \ -blastzOutRoot /cluster/bluearc/ochPri2/blastz.hg18 > do2.log 2>&1 & # memk cluster couldn't find san for chainRun, ran on pk nice doBlastzChainNet.pl `pwd`/DEF \ -verbose=2 -bigClusterHub=pk -syntenicNet -continue=chainMerge \ -chainMinScore=3000 -chainLinearGap=medium \ -blastzOutRoot /cluster/bluearc/ochPri2/blastz.hg18 > do3.log 2>&1 & ln -s `pwd`/blastz.ochPri2.2008-07-29 /cluster/data/hg18/bed/blastz.ochPri2 featureBits hg18 chainOchPri2Link # 806073890 bases of 2881515245 (27.974%) in intersection cd /cluster/data/hg18/bed/blastz.ochPri2.2008-07-29 time nice -n +19 /cluster/bin/scripts/doRecipBest.pl hg18 ochPri2 > rbest.log 2>&1 & ############################################################################# # BLASTZ/CHAIN/NET myoLuc1 (DONE braney 2008-08-02) ssh kkstore02 screen # use screen to control this multi-day job mkdir /cluster/data/hg18/bed/blastz.myoLuc1.2008-07-31 cd /cluster/data/hg18/bed/blastz.myoLuc1.2008-07-31 cat << _EOF_ > DEF # Human vs. 
Microbat BLASTZ_M=50 BLASTZ_T=2 BLASTZ=/cluster/home/braney/bin/x86_64/lastz # TARGET: Human Hg18 (whole chroms) SEQ1_DIR=/scratch/data/hg18/nib SEQ1_LEN=/cluster/data/hg18/chrom.sizes SEQ1_CHUNK=200000000 SEQ1_LAP=0 # QUERY: Microbat SEQ2_DIR=/san/sanvol1/scratch/myoLuc1/myoLuc1.2bit SEQ2_LEN=/san/sanvol1/scratch/myoLuc1/chrom.sizes SEQ2_CHUNK=20000000 SEQ2_LIMIT=100 SEQ2_LAP=0 BASE=/cluster/data/hg18/bed/blastz.myoLuc1.2008-07-31 TMPDIR=/scratch/tmp _EOF_ # << happy emacs time doBlastzChainNet.pl `pwd`/DEF \ -verbose=2 -bigClusterHub=pk \ -chainMinScore=3000 -chainLinearGap=medium \ -blastzOutRoot /cluster/bluearc/myoLuc1/blastz.hg18 > do.log 2>&1 & # Completed: 98879 of 99144 jobs # Crashed: 56 jobs # Other count: 209 jobs # CPU time in finished jobs: 2327505s 38791.75m 646.53h 26.94d 0.074 y # IO & Wait Time: 340164s 5669.40m 94.49h 3.94d 0.011 y # Average job time: 27s 0.45m 0.01h 0.00d # Longest finished job: 1034s 17.23m 0.29h 0.01d # Submission to last job: 56968s 949.47m 15.82h 0.66d # do remaining jobs on kolossus nice doBlastzChainNet.pl `pwd`/DEF \ -verbose=2 -bigClusterHub=pk -syntenicNet -continue=cat \ -chainMinScore=3000 -chainLinearGap=medium \ -blastzOutRoot /cluster/bluearc/myoLuc1/blastz.hg18 > do2.log 2>&1 & ln -s `pwd`/blastz.myoLuc1.2008-07-31 /cluster/data/hg18/bed/blastz.myoLuc1 featureBits hg18 chainMyoLuc1Link # 952177725 bases of 2881515245 (33.044%) in intersection cd /cluster/data/hg18/bed/blastz.myoLuc1.2008-07-31 time nice -n +19 /cluster/bin/scripts/doRecipBest.pl hg18 myoLuc1 > rbest.log 2>&1 & ############################################################################# # BLASTZ/CHAIN/NET loxAfr2 (not done) ssh kkstore02 screen # use screen to control this multi-day job mkdir /cluster/data/hg18/bed/blastz.loxAfr2.2008-08-01 cd /cluster/data/hg18/bed/blastz.loxAfr2.2008-08-01 cat << _EOF_ > DEF # Human vs. 
Elephant BLASTZ_M=50 BLASTZ=/cluster/home/braney/bin/x86_64/lastz # TARGET: Human Hg18 (whole chroms) SEQ1_DIR=/scratch/data/hg18/nib SEQ1_LEN=/cluster/data/hg18/chrom.sizes SEQ1_CHUNK=200000000 SEQ1_LAP=0 # QUERY: Elephant SEQ2_DIR=/san/sanvol1/scratch/loxAfr2/loxAfr2.2bit SEQ2_LEN=/san/sanvol1/scratch/loxAfr2/chrom.sizes SEQ2_CHUNK=20000000 SEQ2_LIMIT=100 SEQ2_LAP=0 BASE=/cluster/data/hg18/bed/blastz.loxAfr2.2008-08-01 TMPDIR=/scratch/tmp _EOF_ # << happy emacs time doBlastzChainNet.pl `pwd`/DEF \ -verbose=2 -bigClusterHub=pk \ -chainMinScore=3000 -chainLinearGap=medium \ -blastzOutRoot /cluster/bluearc/loxAfr2/blastz.hg18 > do.log 2>&1 & # had to run some jobs on memk nice doBlastzChainNet.pl `pwd`/DEF \ -verbose=2 -bigClusterHub=pk -syntenicNet -continue=cat \ -chainMinScore=3000 -chainLinearGap=medium \ -blastzOutRoot /cluster/bluearc/loxAfr2/blastz.hg18 > do2.log 2>&1 & # netChainSubset barfed with memory error (skipped over chains) nice doBlastzChainNet.pl `pwd`/DEF \ -verbose=2 -bigClusterHub=pk -syntenicNet -continue=load \ -chainMinScore=3000 -chainLinearGap=medium \ -blastzOutRoot /cluster/bluearc/loxAfr2/blastz.hg18 > do3.log 2>&1 & ln -s `pwd`/blastz.loxAfr2.2008-08-01 /cluster/data/hg18/bed/blastz.loxAfr2 featureBits hg18 chainLoxAfr2Link # 1025499138 bases of 2881515245 (35.589%) in intersection cd /cluster/data/hg18/bed/blastz.loxAfr2.2008-08-01 time nice -n +19 /cluster/bin/scripts/doRecipBest.pl hg18 loxAfr2 > rbest.log 2>&1 & ############################################################################# # BUILD snpArrayIllumina1M SUB-TRACK (DONE 8/4/08, Fan) # Received raw data file Illumina_Human1M-duoV3_SNPlist_Strand_Location.csv # from Illumina, Luana Galver (lgalver at illumina.com). mkdir -p /cluster/store11/gs.19/build36/bed/snp/illumina/1M cd /cluster/store11/gs.19/build36/bed/snp/illumina/1M cat Illumina_Human1M-duoV3_SNPlist_Strand_Location.csv |\ sed -e 's/,/\t/g' >1M.tab hgsql hg18 < ~src/hg/lib/snpArrayIllumina1MRaw.sql hgsql hg18 -e 'load data local infile "1M.tab" into table snpArrayIllumina1MRaw' ~/src/hg/snp/snpLoad/illuminaLookup1M hg18 snpArrayIllumina1MRaw snp129 # The illuminaLookup1M generate two files: # # illuminaLookup1M.out contains all Illumina 1M probes found in snp129 # illuminaLookup1M.err contains all Illumina 1M probes not found in snp129 mv illuminaLookup.out illuminaLookup1Ma.out cut -f 1 illuminaLookup.err >j.1 cat j.1 |sed -e 's/chrMt/chrM/' |\ sed -e 's/XY/X/' >j.chr cut -f 2-5 illuminaLookup.err >j.2 cut -f 6 illuminaLookup.err >j.3 cat j.3 |sed -e 's/F/+/' |sed -e 's/R/-/' >j.strand cut -f 7 illuminaLookup.err |sed -e "s/\[//" |sed -e "s/\]//" >j.observed paste j.chr j.2 j.strand j.observed >illuminaLookup1Mb.out # combine two parts cat illuminaLookup1Ma.out illuminaLookup1Mb.out >snpArrayIllumina1M.tab # load the table hgLoadBed hg18 snpArrayIllumina1M snpArrayIllumina1M.tab -tab -sqlTable=snpArrayIllumina1M.sql ############################################################################# # BLASTZ/CHAIN/NET micMur1 (DONE braney 2008-08-04 ) ssh kkstore02 screen # use screen to control this multi-day job mkdir /cluster/data/hg18/bed/blastz.micMur1.2008-08-03 cd /cluster/data/hg18/bed/blastz.micMur1.2008-08-03 cat << _EOF_ > DEF # Human vs. 
Mouse lemur BLASTZ_M=50 BLASTZ_T=2 BLASTZ=/cluster/home/braney/bin/x86_64/lastz # TARGET: Human Hg18 (whole chroms) SEQ1_DIR=/scratch/data/hg18/nib SEQ1_LEN=/cluster/data/hg18/chrom.sizes SEQ1_CHUNK=200000000 SEQ1_LAP=0 # QUERY: Mouse lemur SEQ2_DIR=/san/sanvol1/scratch/micMur1/micMur1.2bit SEQ2_LEN=/san/sanvol1/scratch/micMur1/chrom.sizes SEQ2_CHUNK=20000000 SEQ2_LIMIT=100 SEQ2_LAP=0 BASE=/cluster/data/hg18/bed/blastz.micMur1.2008-08-03 TMPDIR=/scratch/tmp _EOF_ # << happy emacs time doBlastzChainNet.pl `pwd`/DEF \ -verbose=2 -bigClusterHub=pk \ -chainMinScore=3000 -chainLinearGap=medium \ -blastzOutRoot /cluster/bluearc/micMur1/blastz.hg18 > do.log 2>&1 & # did remaining jobs on memk nice doBlastzChainNet.pl `pwd`/DEF \ -verbose=2 -bigClusterHub=pk -syntenicNet -continue=cat \ -chainMinScore=3000 -chainLinearGap=medium \ -blastzOutRoot /cluster/bluearc/micMur1/blastz.hg18 > do2.log 2>&1 & ssh hgwdev cd /cluster/data/hg18/bed ln -s `pwd`/blastz.micMur1.2008-08-03 /cluster/data/hg18/bed/blastz.micMur1 featureBits hg18 chainMicMur1Link # 1338330504 bases of 2881515245 (46.445%) in intersection ssh kkstore02 cd /cluster/data/hg18/bed/blastz.micMur1.2008-08-03 time nice -n +19 /cluster/bin/scripts/doRecipBest.pl hg18 micMur1 > rbest.log 2>&1 & ############################################################################# # BLASTZ/CHAIN/NET speTri1 (DONE braney 2008-08-05) ssh kkstore02 screen # use screen to control this multi-day job mkdir /cluster/data/hg18/bed/blastz.speTri1.2008-08-04 cd /cluster/data/hg18/bed/blastz.speTri1.2008-08-04 cat << _EOF_ > DEF # Human vs. Squirrel BLASTZ_M=50 BLASTZ=/cluster/home/braney/bin/x86_64/lastz # TARGET: Human Hg18 (whole chroms) SEQ1_DIR=/scratch/data/hg18/nib SEQ1_LEN=/cluster/data/hg18/chrom.sizes SEQ1_CHUNK=200000000 SEQ1_LAP=0 # QUERY: Squirrel SEQ2_DIR=/san/sanvol1/scratch/speTri1/speTri1.2bit SEQ2_LEN=/san/sanvol1/scratch/speTri1/chrom.sizes SEQ2_CHUNK=20000000 SEQ2_LIMIT=100 SEQ2_LAP=0 BASE=/cluster/data/hg18/bed/blastz.speTri1.2008-08-04 TMPDIR=/scratch/tmp _EOF_ # << happy emacs time doBlastzChainNet.pl `pwd`/DEF \ -verbose=2 -bigClusterHub=pk \ -chainMinScore=3000 -chainLinearGap=medium \ -blastzOutRoot /cluster/bluearc/speTri1/blastz.hg18 > do.log 2>&1 & # did crashed jobs on memk nice doBlastzChainNet.pl `pwd`/DEF \ -verbose=2 -bigClusterHub=pk -syntenicNet -continue=cat \ -chainMinScore=3000 -chainLinearGap=medium \ -blastzOutRoot /cluster/bluearc/speTri1/blastz.hg18 > do2.log 2>&1 & # had to run netChains.csh by hand due to PATH problem nice doBlastzChainNet.pl `pwd`/DEF \ -verbose=2 -bigClusterHub=pk -syntenicNet -continue=load \ -chainMinScore=3000 -chainLinearGap=medium \ -blastzOutRoot /cluster/bluearc/speTri1/blastz.hg18 > do3.log 2>&1 & ssh hgwdev cd /cluster/data/hg18/bed ln -s `pwd`/blastz.speTri1.2008-08-04 /cluster/data/hg18/bed/blastz.speTri1 featureBits hg18 chainSpeTri1Link # 1032377454 bases of 2881515245 (35.828%) in intersection ssh kkstore02 cd /cluster/data/hg18/bed/blastz.speTri1.2008-08-04 nice -n +19 /cluster/bin/scripts/doRecipBest.pl hg18 speTri1 > rbest.log 2>&1 & ####################################################### ## 44-way multiz (braney working.... mkdir /cluster/data/hg18/bed/multiz44way cd /cluster/data/hg18/bed/multiz44way cp /cluster/data/mm9/bed/multiz30way/mm9.guess.30way.nh . 
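    # A minimal sketch (not part of the original run log) for listing the leaf
    # names in the copied Newick tree before it is edited and re-rooted; it only
    # assumes the leaves are written as "name:branchLength".
    sed -e 's/[(),;]/\n/g' mm9.guess.30way.nh | sed -e 's/:.*$//' \
	| grep -v '^ *$' | sort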
# get mammal tree from Michele Clamp (clamp.nh)
# that I re-rooted

#######################################################
# UW nucleosome occupancy predictions (2008-08-13 markd)
#  update due to chr3 being truncated (2009-05-12 markd)
# contact William Stafford Noble

# obtain data:
    mkdir -p /cluster/data/hg18/bed/uwNucOcc
    cd /cluster/data/hg18/bed/uwNucOcc
    wget http://USER:PASS@noble.gs.washington.edu/~noble/proj/dennis/results/2008-08-11/ucsc.tgz
    mkdir wig
    cd wig
    tar -zxf ../ucsc.tgz
    cd ..
    rm ucsc.tgz

# encode and load wiggles
    ssh kkstore02
    cd /cluster/data/hg18/bed/uwNucOcc/wib
    zcat ../wig/a375/a375.chr*.wig.gz|wigEncode stdin uwNucOccA375.wig uwNucOccA375.wib
    # Converted stdin, upper limit 9.88, lower limit -5.19
    zcat ../wig/dennis/dennis.chr*.wig.gz|wigEncode stdin uwNucOccDennis.wig uwNucOccDennis.wib
    # Converted stdin, upper limit 8.26, lower limit -9.68
    zcat ../wig/mec/mec.chr*.wig.gz|wigEncode stdin uwNucOccMec.wig uwNucOccMec.wib
    # Converted stdin, upper limit 5.05, lower limit -9.86

# link-n-load
    ssh hgwdev
    cd /cluster/data/hg18/bed/uwNucOcc/wib
    ln -s /cluster/data/hg18/bed/uwNucOcc/wib/uwNucOccA375.wib /gbdb/hg18/wib
    hgLoadWiggle -pathPrefix=/gbdb/hg18/wib hg18 uwNucOccA375 uwNucOccA375.wig
    ln -s /cluster/data/hg18/bed/uwNucOcc/wib/uwNucOccDennis.wib /gbdb/hg18/wib
    hgLoadWiggle -pathPrefix=/gbdb/hg18/wib hg18 uwNucOccDennis uwNucOccDennis.wig
    ln -s /cluster/data/hg18/bed/uwNucOcc/wib/uwNucOccMec.wib /gbdb/hg18/wib
    hgLoadWiggle -pathPrefix=/gbdb/hg18/wib hg18 uwNucOccMec uwNucOccMec.wig
    rm wiggle.tab

# noble lab supplied update due to chr3 being truncated (2009-05-12 markd)
    cd /cluster/data/hg18/bed/uwNucOcc
    mkdir bad
    mv wig/*/*.chr3.hg18.wig.gz bad/
    mv wib bad/
    wget http://USER:PASS@noble.gs.washington.edu/~wnoble/proj/dennis/results/2008-08-11/a375/a375.chr3.hg18.wig.gz
    wget http://USER:PASS@noble.gs.washington.edu/~wnoble/proj/dennis/results/2008-08-11/mec/mec.chr3.hg18.wig.gz
    wget http://USER:PASS@noble.gs.washington.edu/~wnoble/proj/dennis/results/2008-08-11/dennis/dennis.chr3.hg18.wig.gz
    mv dennis.chr3.hg18.wig.gz wig/dennis/
    mv mec.chr3.hg18.wig.gz wig/mec/
    mv a375.chr3.hg18.wig.gz wig/a375/

    cd /cluster/data/hg18/bed/uwNucOcc/wib
    zcat ../wig/a375/a375.chr*.wig.gz|wigEncode stdin uwNucOccA375.wig uwNucOccA375.wib
    zcat ../wig/dennis/dennis.chr*.wig.gz|wigEncode stdin uwNucOccDennis.wig uwNucOccDennis.wib
    zcat ../wig/mec/mec.chr*.wig.gz|wigEncode stdin uwNucOccMec.wig uwNucOccMec.wib
    cd ..
    hgLoadWiggle -pathPrefix=/gbdb/hg18/wib hg18 uwNucOccA375 uwNucOccA375.wig
    hgLoadWiggle -pathPrefix=/gbdb/hg18/wib hg18 uwNucOccDennis uwNucOccDennis.wig
    hgLoadWiggle -pathPrefix=/gbdb/hg18/wib hg18 uwNucOccMec uwNucOccMec.wig

#########################################################################
# BLASTZ/CHAIN/NET oryLat2 (DONE - 2008-08-19,25 - Hiram)
    ssh kkstore02
    screen # use a screen to manage this longish running job
    mkdir /cluster/data/hg18/bed/blastzOryLat2.2008-08-19
    cd /cluster/data/hg18/bed/blastzOryLat2.2008-08-19
    cat << '_EOF_' > DEF
# Human vs.
Medaka BLASTZ=/cluster/bin/penn/x86_64/lastz # typical parameters for a genome that is distant from human BLASTZ_Y=3400 BLASTZ_L=6000 BLASTZ_K=2200 BLASTZ_M=50 BLASTZ_Q=/scratch/data/blastz/HoxD55.q # TARGET: Human hg18, randoms complete, as they are, no contig confusion SEQ1_DIR=/scratch/data/hg18/hg18.2bit SEQ1_LEN=/scratch/data/hg18/chrom.sizes SEQ1_CHUNK=10000000 SEQ1_LAP=10000 SEQ1_LIMIT=1 # QUERY: Medaka oryLat2 (40M chunks covers the largest chroms in one gulp) SEQ2_DIR=/scratch/data/oryLat2/oryLat2.2bit SEQ2_LEN=/scratch/data/oryLat2/chrom.sizes SEQ2_CHUNK=40000000 SEQ2_LIMIT=200 SEQ2_LAP=0 BASE=/cluster/data/hg18/bed/blastzOryLat2.2008-08-19 TMPDIR=/scratch/tmp '_EOF_' # << this line keeps emacs coloring happy time doBlastzChainNet.pl `pwd`/DEF \ -chainMinScore=5000 -chainLinearGap=loose \ -qRepeats=windowmaskerSdust \ -bigClusterHub=pk -verbose=2 > do.log 2>&1 & cat fb.hg18.chainOryLat2Link.txt # 52713428 bases of 2881515245 (1.829%) in intersection cd /cluster/data/hg18/bed ln -s blastzOryLat2.2008-08-19 blastz.oryLat2 # That is OK, now for the swap: mkdir /cluster/data/oryLat2/bed/blastz.hg18.swap cd /cluster/data/oryLat2/bed/blastz.hg18.swap time doBlastzChainNet.pl -verbose=2 -swap \ /cluster/data/hg18/bed/blastzOryLat2.2008-08-19/DEF \ -chainMinScore=5000 -chainLinearGap=loose \ -qRepeats=windowmaskerSdust \ -bigClusterHub=pk > swap.log 2>&1 & # real 17m9.675s cat fb.oryLat2.chainHg18Link.txt # 46961822 bases of 700386597 (6.705%) in intersection ######################################################################### # BLASTZ/CHAIN/NET TAEGUT1 (DONE braney 2008-09-10) ssh swarm screen mkdir /cluster/data/hg18/bed/blastz.taeGut1.2008-09-09 cd /cluster/data/hg18/bed/blastz.taeGut1.2008-09-09 cat << _EOF_ > DEF # human vs. zebra finch BLASTZ_M=50 # Copied settings from human vs galGal3 BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=10000 BLASTZ_K=2200 BLASTZ_Q=/cluster/data/blastz/HoxD55.q # TARGET: Human hg18 SEQ1_DIR=/scratch/data/hg18/hg18.2bit # SEQ1_SMSK=/hive/data/genomes/hg18/linSpecRep/notInChicken SEQ1_LEN=/scratch/data/hg18/chrom.sizes # one chrom at a time SEQ1_CHUNK=200000000 SEQ1_LAP=0 # QUERY: Zebra finch taeGut1 SEQ2_DIR=/scratch/data/taeGut1/taeGut1.2bit SEQ2_LEN=/scratch/data/taeGut1/chrom.sizes # SEQ2_DIR=/hive/data/genomes/taeGut1/taeGut1.2bit # SEQ2_LEN=/hive/data/genomes/taeGut1/chrom.sizes SEQ2_CTGDIR=/hive/data/genomes/taeGut1/taeGut1.blastz.2bit SEQ2_CTGLEN=/hive/data/genomes/taeGut1/taeGut1.blastz.sizes SEQ2_LIFT=/hive/data/genomes/taeGut1/jkStuff/liftAll.lft SEQ2_CHUNK=20000000 SEQ2_LAP=0 SEQ2_LIMIT=100 BASE=/hive/data/genomes/hg18/bed/blastz.taeGut1.2008-09-09 _EOF_ # << emacs doBlastzChainNet.pl -syntenicNet \ -bigClusterHub=swarm -chainMinScore=5000 -chainLinearGap=loose \ -smallClusterHub=swarm DEF -workhorse=swarm \ -qRepeats=windowmaskerSdust > do.log 2>&1 # Completed: 14910 of 14910 jobs # CPU time in finished jobs: 2744737s 45745.62m 762.43h 31.77d 0.087 y # IO & Wait Time: 1493361s 24889.34m 414.82h 17.28d 0.047 y # Average job time: 284s 4.74m 0.08h 0.00d # Longest finished job: 3678s 61.30m 1.02h 0.04d # Submission to last job: 6687s 111.45m 1.86h 0.08d cd /cluster/data/hg18/bed rm -f blastz.taeGut1 ln -s blastz.taeGut1.2008-09-09 /cluster/data/hg18/bed/blastz.taeGut1 ################################################################ # HUMAN FETAL BRAIN EXON ARRAYS (YALE) (Andy) ssh hgwdev bash mkdir /hive/data/genomes/hg18/bed/yaleMicroarrays cd /hive/data/genomes/hg18/bed/yaleMicroarrays cp /var/ftp/encode/Sestan_fetal_brain_exon_arrays.rar . 
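# Optional sanity check (a sketch, not recorded in this log): list the archive
# contents before extracting, assuming the same "rar" binary used below
# supports the "l" (list) command.
rar l Sestan_fetal_brain_exon_arrays.rar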
rar e Sestan_fetal_brain_exon_arrays.rar tail +2 18_19_21_23_full_SLR_converted.txt | grep -v "\-\-\-" > sestanBrainAtlas.bed hgLoadBed hg18 sestanBrainAtlas{,.bed} # just a little array name organization head -n1 18_19_21_23_full_SLR_converted.txt | \ sed 's/.*expNames=\"//;s/\"\ name=.*//;s/\.CEL//g' | \ tr ',' '\n' | sed '/^$/d' | grep -n '' | tr ':' '\t' | \ awk 'BEGIN{OFS="\t";}{$1=$1 - 1; print;}' \ > arrays.txt awk 'BEGIN{OFS="\t";}{print $1, $2, $2, "n/a", "n/a", "n/a", "103", "n/a,n/a,"$2",";}' \ arrays.txt > sestanBrainAtlasExps.tab ln -s ~/kent/src/hg/lib/expRecord.sql sestanBrainAtlasExps.sql hgLoadSqlTab hgFixed sestanBrainAtlasExps{,.sql,.tab} # Removed some of the arrays... the manual way # something's weird tr '\r' '\n' < sestanBrainAtlas.bed | sed '/^$/d; s/$/,/' > ses.bed cut -f1-14 ses.bed | \ awk 'BEGIN{FS="\t";OFS="\t"}{$2 = $2 - 1; $13 = $13 - 8; print;}' | \ sed 's/95,96,97,98,99,100,101,102//' > ses14.bed cut -f15 ses.bed | cut -d',' -f1-74,77-92,99- > ses15.bed paste ses14.bed ses15.bed > newSestan.bed hgLoadBed hg18 sestanBrainAtlas newSestan.bed ln -s ~/kent/src/hg/makeDb/hgCgiData/Human/microarrayGroups.ra grep -A5 sestanBrainAtlasAll microarrayGroups.ra | grep "^names" | sed 's/names //' | tr ',' '\n' | sed '/^$/d' > namesCol.txt grep -A5 sestanBrainAtlasAll microarrayGroups.ra | grep "^expIds" | sed 's/expIds //' | tr ',' '\n' | sed '/^$/d' >expCol.txt paste expCol.txt namesCol.txt > arrays.txt awk 'BEGIN{OFS="\t";}{print $1, $2, $2, "n/a", "n/a", "n/a", "103", "n/a,n/a,"$2",";}' \ arrays.txt > sestanBrainAtlasExps.tab hgLoadSqlTab hgFixed sestanBrainAtlasExps{,.sql,.tab} ssh kolossus ################################################################ # HUMAN TISSUE EXON ARRAYS (Melissa Cline, cline@biology.ucsc.edu, 10/14/08) # # AffyExonTissue Step 1: download exon array coordinate data from Affy # and extract coordinates Download HuEx-1_0-st-v2 Annotations, Full, # Hg18/Build 36 gff. The file is available at # http://www.affymetrix.com/support/technical/byproduct.affx?product=huexon-st # and the download requires logging in to NetAffx (free, registration # required) Uncompress the GFF files. 
Parse out key fields with the # script below, generating hg18.affy.exon.coords.tab # #--------- #!/usr/bin/env perl =pod =head1 NAME parseGffArrayData.pl =head1 SYNOPSYS cat *gff |parseGffArrayData.pl > array.coords.tab =head1 DESCRIPTION Parses probeset coordinates out of the Affy design data =cut { use strict; use Getopt::Long; use GFF; use GFF::GeneFeature; use FileHandle; print "chr\tstart\tend\tID\tscore\tstrand\n"; while (my $line = <>) { chomp; my @tokens = split /\s/, $line; if ($tokens[2] eq "probeset") { my $gffFeature = new GFF::GeneFeature; my $gffData = $gffFeature->new_from_line($line); my $probesetId = $gffData->group_value('probeset_id'); my $probesetLevel = $gffData->group_value('level'); my $bounded = $gffData->group_value('bounded'); my $cds = $gffData->group_value('cds'); my $score; if ($probesetLevel eq "core") { $score = 900; } elsif ($probesetLevel eq "extended") { $score = 500; } else { $score = 200; } if ($bounded) { $score -= 200; } if ($cds) { $score += 100; } if ($score < 100) { $score = 100; } print($gffData->seqname(), "\t", $gffData->start(), "\t", $gffData->end(), "\t", $probesetId, "\t$score\t", $gffData->strand(), "\n"); } } } #------- # # AffyExonTissue Step 2: download tissue data from Affy, generate bed15 file # # Download Human Exon 1.0 ST APT results from # http://www.affymetrix.com/support/technical/sample_data/exon_array_data.affx # (requires free registration and login, as above) # Uncompress, and get rid of the undesired tissue mixture columns. cut -f 1-34 \ < apt-probeset-summarize-results-exon/quant-norm.pm-gcbg.plier.summary.txt \ > quant-norm.pm-gcbg.plier.nomix.summary.txt # # Generate a bed15 file using the command below, and script below that. # For the purposes of generating a track, ignore the first line. # arrayToBed15.py \ --coordinates hg18.affy.exon.coords.tab \ --plier quant-norm.pm-gcbg.plier.nomix.summary.txt \ --name "humanExon" \ --groups "breast,breast,breast,cerebellum,cerebellum,cerebellum,heart,heart,heart,kidney,kidney,kidney,liver,liver,liver,muscle,muscle,muscle,pancreas,pancreas,pancreas,prostate,prostate,prostate,spleen,spleen,spleen,testes,testes,testes,thyroid,thyroid,thyroid" \ |tail -n +2 > human.exon.headless.bed15 #--- #!/usr/bin/python from optparse import OptionParser import math import re # # get the genomic probeset coordinates # def parseProbesetCoordinates(coordinatesFilename): """Build a dictionary of coordinates from a tab-delmited file""" coordinateData = {} coordinatesFileHandle = open(coordinatesFilename) coordinatesFileHandle.readline() # skip the header line for line in coordinatesFileHandle: line = line.rstrip(); tokens = line.split('\t') id = tokens[3] coordinateData[id] = tokens; return(coordinateData) def median(numbers): """Sort the input list and return the middle element.""" nn = len(numbers) copy = numbers[:] # So that "numbers" keeps its original order copy.sort() if nn & 1: # There is an odd number of elements return copy[nn // 2] else: return (copy[nn // 2] + copy[nn // 2 - 1]) / 2 def medianOfMedians(experimentNames, experimentValues): """Given replicated values, find the median of the replicate medians.""" # Create a dictionary to sort the values by experiment set replicates = {} # # Group the epxeriments into replicate sets by experiment names. # This assumes that experiments in the same replicate set have the # same name. 
# for ii in range(0,len(experimentNames)-1): if not replicates.has_key(experimentNames[ii]): replicates[experimentNames[ii]] = [experimentValues[ii]] else : replicates[experimentNames[ii]].append(experimentValues[ii]) # Make a list containing the median value of each replicate set. medians = list() for replicateSet in replicates.keys() : values = replicates[replicateSet] thisMedian = median(values) medians.append(thisMedian) # Now get the median value of the median list medianValue = median(medians) return(medianValue) def printHeaderData(experimentList, trackName): """Print a header line for a bed15 file""" expNames = ",".join(experimentList) print "track type=\"array\" expScale=3 expStep=0.5 ", print " name=\"" + str(trackName) + "\"", \ " description=\"Microarray custom track\"", print " expNames=" "\"" + expNames + "\"" def printPlierResults(resultsLine, experimentGroups, probesetCoordinates): """median-center a line of expression results, print in bed15 format""" background = 10 plierResultsLine = line.split('\t') probesetId = plierResultsLine[0] del plierResultsLine[0] if probesetCoordinates.has_key(probesetId): coordinates = probesetCoordinates[probesetId] # # Given coordinate data (chr start end ID score strand) # and given experimental data (ID exp1 exp2 exp3 ... expN) # Print as follows: # 1. Basic bedfile stuff: chromosome, start, end, name, score, # strand, thickStart (=start), thickEnd (=end), 0, blocks (=1), # blocklengths (=end-start+1,), blockstarts (=0,) # start = int(coordinates[1]) - 1 end = int(coordinates[2]) length = end - start print str(coordinates[0]) + "\t" + str(start) + "\t" \ + str(end) + "\t" + str(probesetId) + "\t", \ coordinates[4], "\t", coordinates[5], "\t", start, "\t", \ end, "\t0\t1\t", \ str(length) + ",\t", "0,\t", # # Continue with microarray-specific stuff: # - experiment count # - comma-separated list of experiment IDs (0 .. 
experimentCount) # - comma-separated list of experiment scores (log(result)-log(median)) # experimentCount = len(plierResultsLine) experimentValues = list() for value in plierResultsLine: experimentValues.append(float(value)) medianValue = medianOfMedians(experimentGroups, experimentValues) logMedian = math.log(medianValue+background) valuesStrings = list() for thisValue in experimentValues: thisScore = math.log(thisValue+background) - logMedian valuesStrings.append(str(thisScore)) experimentScoreString = ",".join(valuesStrings) ids = list() for ii in range(0, experimentCount): ids.append(str(ii)) experimentIdString = ",".join(ids) print experimentCount, "\t", experimentIdString, "\t", \ experimentScoreString return parser = OptionParser() parser.add_option("--coordinates", dest="coordinatesFile") parser.add_option("--plier", dest="plierResultsFile") parser.add_option("--name", dest="trackName") parser.add_option("--groups", dest="experimentGroups") (parameters, args) = parser.parse_args() experimentGroups = parameters.experimentGroups.split(",") probesetCoordinates = parseProbesetCoordinates(parameters.coordinatesFile) plierResults = open(parameters.plierResultsFile) for line in plierResults: line = line.rstrip() if (re.search("^#", line)) : continue elif (re.search("^probeset_id", line)) : printHeaderData(experimentGroups, parameters.trackName) else : printPlierResults(line, experimentGroups, probesetCoordinates) #--- # # AffyExonTissue Step 3: set up a browser track from the bed15 file # created offline: trackDb.affyExonTissues.ra, # affyExonTissues.html, # microarrayGroups.affyExonTissues.ra # cat $KENT/src/hg/makeDb/trackDb/human/trackDb.ra trackDb.affyExonTissues.ra \ > trackDb.new.ra cp trackDb.new.ra $KENT/src/hg/makeDb/trackDb/human/trackDb.ra cp affyExonTissues.html $KENT/src/hg/makeDb/trackDb/human cat $KENT/src/hg/makeDb/hgCgiData/Human/microarrayGroups.ra \ microarrayGroups.affyExonTissues.ra > microarrayGroups.new.ra hgLoadBed hg18 affyExonTissues human.exon.headless.bed15 cd $KENT/src/hg/makeDb/trackDb make update DBS="hg17 hg18" cd $KENT/src make -j8 cgi >& ~/make.j8.cgi.errout # # AffyExonTissue Step 4: load the appropriate fields into hgFixed # grep -A5 affyExonTissuesAll microarrayGroups.ra | grep "^names" \ | sed 's/names //' | tr ',' '\n' | sed '/^$/d' | sed 's/^\s\+//' > n.txt grep -A5 affyExonTissuesAll microarrayGroups.ra | grep "^expIds" \ | sed 's/expIds //' | tr ',' '\n' | sed '/^$/d' | sed 's/^\s\+//' > e.txt paste e.txt n.txt > a.txt awk 'BEGIN{OFS="\t";} {print $1, $2, $2, "n/a", "n/a", "n/a", "33", "n/a,n/a,"$2",";}' a.txt \ > exps.tab ln -s ../../../lib/expRecord.sql hgLoadSqlTab hgFixed affyMouseExonTissuesAllExps expRecord.sql exps.tab rm a.txt n.txt e.txt exps.tab ############ ######################################################################## ## AFFY ALL EXON PROBESETS (HG18/MM9/RN4) (DONE 2009-01-29, Andy) ## 1. Log into Affymetrix netaffx site. ## 2. 
Use Firefox add-on "Export Cookies" to save a file called cookies.txt ssh hgwdev grep affymetrix.com cookies.txt > affycookies.txt wget --load-cookies affycookies.txt http://www.affymetrix.com/Auth/analysis/downloads/na27/wtexon/HuEx-1_0-st-v2.na27.hg18.probeset.csv.zip wget --load-cookies affycookies.txt http://www.affymetrix.com/Auth/analysis/downloads/na27/wtexon/MoEx-1_0-st-v1.na27.mm9.probeset.csv.zip wget --load-cookies affycookies.txt http://www.affymetrix.com/Auth/analysis/downloads/na27/wtexon/RaEx-1_0-st-v1.na27.rn4.probeset.csv.zip rm affycookies.txt for z in *.zip; do unzip $z; done rm *.zip ln -s HuEx-1_0-st-v2.na27.hg18.probeset.csv hg18.csv ln -s RaEx-1_0-st-v1.na27.rn4.probeset.csv rn4.csv ln -s MoEx-1_0-st-v1.na27.mm9.probeset.csv mm9.csv for csv in {hg18,mm9,rn4}.csv; do bed=${csv%.csv}.bed sed '1,20d' $csv | tr ',' '\t' | sed 's/\"//g' | cut -f1-5,16 \ | grep -v "\-\-\-" \ | awk 'BEGIN{FS="\t";OFS="\t";}{if ($6 == "core") score = 1000; else if ($6 == "extended") score = 700; else if ($6 == "full") score = 300; else score = 100; name = $1"|"$6; print $2, $4-1, $5, name, score, $3}' \ | bedSort stdin $bed done for db in hg18 mm9 rn4; do hgLoadBed $db affyAllExonProbes $db.bed; done rm hg18.csv mm9.csv rn4.csv gzip *.bed *.csv mkdir -p /hive/data/genomes/{hg18,mm9,rn4}/bed/affyAllExonProbes mv HuEx-1_0-st-v2.na27.* hg18.bed.gz /hive/data/genomes/hg18/bed/affyAllExonProbes/ mv MoEx-1_0-st-v1.na27.* mm9.bed.gz /hive/data/genomes/mm9/bed/affyAllExonProbes/ mv * /hive/data/genomes/rn4/bed/affyAllExonProbes/ ## forgot mm8 (see mm8.txt for that one) ################################################ # SPLIT EXPRESSION & REGULATION GROUPS # (2008-09-09 kate) echo "insert into grp (name, label, priority, defaultIsClosed) values ('expression', 'Expression', 4.5, 1)" | hgsql hg18 echo "update grp set label='Regulation' where name='regulation'" | hgsql hg18 ############################################################################ # KIDD/EICHLER DISCORDANT CLONE ENDS (DONE 9/16/08 angie) ssh hgwdev mkdir /cluster/data/hg18/bed/kiddEichlerDiscordant cd /cluster/data/hg18/bed/kiddEichlerDiscordant foreach i (ABC7 ABC8 ABC9 ABC10 ABC11 ABC12 ABC13 ABC14 G248) wget --user=uuuu --password=ppppppp \ http://eichlerlab.gs.washington.edu/kiddj/hg18_fosmidmap/$i.bestdiscordant.sorted.gz end # Load the tracks (translate bacEndPairs-inspired format to bed12): foreach f (*.gz) set track = `echo $f:r:r:r \ | perl -wpe 's/^([AG])(\w+)$/kiddEichlerDisc$1\L$2/ || die;'` if ($status != 0) break echo $track zcat $f \ | perl -wpe 'if (/^chrom\s+chromStart/) {s/^.*\n$//; next;} \ my ($c, $s, $e, $n, $sc, $st, $bSt, $bSz, undef, $t) = split; \ @bSts = split(",", $bSt); @bSzs = split(",", $bSz); \ if ($t =~ /^transchrm_/) { \ @bSts = (0); @bSzs = ($e - $s); \ } elsif ($t =~ /^OEA_/) { \ die "\nERROR: bSts[0] $bSts[0] != s $s\n" if ($bSts[0] != $s); \ $bSzs[0]--; \ $bE = $bSts[0] + $bSzs[0]; \ die "bE $bE != e $e\n" if ($bE != $e); \ $bSts[0] -= $s; \ } elsif ($#bSts == 1) { \ if ($bSts[0] > $bSts[1]) { \ # warn "Swapping $n ($bSts[0] > $bSts[1])\n"; \ $tmp = $bSts[0]; $bSts[0] = $bSts[1]; $bSts[1] = $tmp; \ $tmp = $bSzs[0]; $bSzs[0] = $bSzs[1]; $bSzs[1] = $tmp; \ } \ if ($bSts[0] != $s) { \ die "\nERROR: n=$n,$t: bSts[0]=$bSts[0] but s=$s\n\t"; \ } \ $bSzs[0]--; $bSzs[1]--; \ $bE0 = $bSts[0] + $bSzs[0]; \ $bE1 = $bSts[1] + $bSzs[1]; \ $bE = $bE0 > $bE1 ? 
$bE0 : $bE1; \ if ($bE != $e) { \ warn "n=$n,$t: bE0=$bE0, bE1=$bE1, bE=$bE, e=$e\n"; \ if ($bE1 > $e) { \ warn "n=$n,$t: tweaking bSzs[1] (clip to chromEnd)\n"; \ $bSzs[1] = $e - $bSts[1]; \ } \ } \ $bSts[0] -= $s; $bSts[1] -= $s; \ } else { die "t is $t but \$#bSts is $#bSts"; } \ $bSt = join(",", @bSts) . ","; $bSz = join(",", @bSzs) . ","; \ $rgb = ($t =~ /^deletion/) ? "224,0,0" : \ ($t =~ /^insertion/) ? "0,0,224" : \ ($t =~ /^inversion/) ? "0,224,0" : \ ($t =~ /^OEA/) ? "240,160,64" : "0,0,0"; \ $_ = join("\t", $c, $s, $e, "$n,$t", int($sc+0.5), $st, $s, $e, $rgb, \ scalar(@bSzs), $bSz, $bSt) . "\n";' \ | hgLoadBed -tab hg18 $track stdin checkTableCoords hg18 $track end # Tons of overlapping block and blockEnd[n-1]!=end warnings from # checkTableCoords -- but these are discordant mappings, so we # expect those. Make sure there aren't any other types of errors: foreach f (*.gz) set track = `echo $f:r:r:r \ checkTableCoords hg18 $track |& egrep -v 'overlapping|!= end'` end # No output, good. # Get clone ID -> NCBI acc mapping (same as for hg17; redownloaded to # make sure). mkdir /cluster/data/hg18/bed/kiddEichlerDiscordant/cloneIds cd /cluster/data/hg18/bed/kiddEichlerDiscordant/cloneIds # Saved off emailed file from Jeff Kidd to clones_used_3nov.txt.accessions; # get trace archive trace names for end reads: foreach n (7 9 10 11 12 13 14) wget http://hgsv.washington.edu/general/download/clone_mapping/ABC$n/ABC$n.conversion.gz end # ABC8 has _a and _b files: wget http://hgsv.washington.edu/general/download/clone_mapping/ABC8/ABC8_a.conversion.gz wget http://hgsv.washington.edu/general/download/clone_mapping/ABC8/ABC8_b.conversion.gz # That file is not available for G248. gunzip *.gz # Combine the relevant data from the .conversion files; keep only those # IDs that are used in the tracks. zcat ../[AG]*.gz \ | cut -f 4 \ | egrep -v '^(#chrom|track|name)' \ | sed -e 's/,.*//' \ | sort -u > discIds.txt grep -h -v ^163722_163722- *.conversion \ | perl -wpe 's/^OurClone.*\n// || s/^\d+_(HUMAN|\d+_).*\n$// || \ s/^(\d+_)?(ABC|G)(\d+)(_\d\d?)?(_\d\d?)?_0*(\d+?_[A-Z]\d\d?)\.(F|FORWARD|R|REVERSE)\.\d+\t(\w+)$/$2$3_$6\t$7\t$8/ || \ warn "Parse line $.:\n$_";' \ | sort > allEnds.tab grep -wFf discIds.txt allEnds.tab > discEnds.txt wc -l discIds.txt allEnds.tab discEnds.txt # 352330 discIds.txt # 17490847 allEnds.tab # 781513 discEnds.txt # discEnds.txt has 2 lines (forward & reverse) for most of its ids... # ideally we would see 2*(352330) lines in discEnds.txt. # Get a list of which discordant clone IDs don't have ends in *.conv*: cut -f 1 allEnds.tab | uniq > all.tmp comm -23 discIds.txt all.tmp > discNotInConv.txt wc -l discNotInConv.txt #41853 discNotInConv.txt cat > combine.pl <<'_EOF_' #!/usr/bin/perl -w use strict; my ($cloneFile, $endsFile) = @ARGV; open(CLONES, $cloneFile) || die "Can't open $cloneFile: $!\n"; my %idInfo; while() { (s/^(\d+_)?(ABC|G)(\d+)(_\d\d?)?(_\d\d?)?_0*(\d+?_[A-Z]\d\d?)\t(\w+)$/$2$3_$6\t$7/ && m/^(\w+)\t(\w+)/) || \ m/^(G248\w+)\t(\w+)$/ || die "Parse line $.:$_"; my ($id, $acc) = ($1, $2); $idInfo{$id}->[0] = $acc; } close(CLONES); open(ENDS, $endsFile) || die "Can't open $endsFile: $!\n"; while () { chomp; my ($id, $dir, $traceName) = split("\t"); if ($dir =~ /^F/) { $idInfo{$id}->[1] = $traceName; } elsif ($dir =~ /^R/) { $idInfo{$id}->[2] = $traceName; } else { die "What is this \$dir: $dir ?\n"; } } close(ENDS); foreach my $id (sort keys %idInfo) { my $infoRef = $idInfo{$id}; $infoRef->[0] = '' if (! defined $infoRef->[0]); $infoRef->[1] = 0 if (! 
defined $infoRef->[1]); $infoRef->[2] = 0 if (! defined $infoRef->[2]); print join("\t", $id, @{$infoRef}) . "\n"; } '_EOF_' # << emacs chmod a+x combine.pl combine.pl clones_used_3nov.txt.accessions discEnds.txt \ | sort > kiddEichlerToNcbi.txt # Load table: hgLoadSqlTab hg18 kiddEichlerToNcbi \ $HOME/kent/src/hg/lib/kiddEichlerToNcbi.sql kiddEichlerToNcbi.txt # Add to makeDb/schema/all.joiner, then check: runJoiner.csh hg18 kiddEichlerToNcbi $HOME/kent/src/hg/makeDb/schema ############################################################################ # hgPal downloads 28way refGene, knownGene, knownCanonical ssh hgwdev screen bash rm -rf /cluster/data/hg18/bed/multiz28way/pal mkdir /cluster/data/hg18/bed/multiz28way/pal cd /cluster/data/hg18/bed/multiz28way/pal cat > order.lst < ppredAA/$j.ppredAA.fa.gz" echo "mafGene -chrom=$j -noTrans $db $mz $gp order.lst stdout | \ gzip -c > ppredNuc/$j.ppredNuc.fa.gz" echo "mafGene -chrom=$j -exons -noTrans $db $mz $gp order.lst stdout | \ gzip -c > exonNuc/$j.exonNuc.fa.gz" echo "mafGene -chrom=$j -exons $db $mz $gp order.lst stdout | \ gzip -c > exonAA/$j.exonAA.fa.gz" done > $gp.jobs time sh -x $gp.jobs > $gp.jobs.log 2>&1 & sleep 1 tail -f $gp.jobs.log # real 232m24.611s # user 13m59.669s # sys 5m5.601s zcat exonAA/*.gz | gzip -c > $gp.$mz.exonAA.fa.gz zcat exonNuc/*.gz | gzip -c > $gp.$mz.exonNuc.fa.gz zcat ppredAA/*.gz | gzip -c > $gp.$mz.ppredAA.fa.gz zcat ppredNuc/*.gz | gzip -c > $gp.$mz.ppredNuc.fa.gz rm -rf exonAA exonNuc ppredAA ppredNuc # we're only distributing exons at the moment mz=multiz28way gp=refGene db=hg18 pd=/usr/local/apache/htdocs/goldenPath/$db/$mz ln -s `pwd`/$gp.$mz.exonAA.fa.gz $pd/$gp.exonAA.fa.gz ln -s `pwd`/$gp.$mz.exonNuc.fa.gz $pd/$gp.exonNuc.fa.gz mz=multiz28way gp=knownGene db=hg18 mkdir exonAA exonNuc ppredAA ppredNuc for j in `sort -nk 2 /cluster/data/$db/chrom.sizes | awk '{print $1}'` do echo "date" echo "mafGene -chrom=$j $db $mz $gp order.lst stdout | \ gzip -c > ppredAA/$j.ppredAA.fa.gz" echo "mafGene -chrom=$j -noTrans $db $mz $gp order.lst stdout | \ gzip -c > ppredNuc/$j.ppredNuc.fa.gz" echo "mafGene -chrom=$j -exons -noTrans $db $mz $gp order.lst stdout | \ gzip -c > exonNuc/$j.exonNuc.fa.gz" echo "mafGene -chrom=$j -exons $db $mz $gp order.lst stdout | \ gzip -c > exonAA/$j.exonAA.fa.gz" done > $gp.$mz.jobs time sh -x $gp.$mz.jobs > $gp.$mz.job.log 2>&1 & sleep 1 tail -f $gp.$mz.job.log # real 248m39.293s # user 23m30.788s # sys 8m2.714s zcat exonAA/c*.gz | gzip -c > $gp.$mz.exonAA.fa.gz zcat exonNuc/c*.gz | gzip -c > $gp.$mz.exonNuc.fa.gz zcat ppredAA/c*.gz | gzip -c > $gp.$mz.ppredAA.fa.gz zcat ppredNuc/c*.gz | gzip -c > $gp.$mz.ppredNuc.fa.gz rm -rf exonAA exonNuc ppredAA ppredNuc pd=/usr/local/apache/htdocs/goldenPath/$db/$mz ln -s `pwd`/$gp.$mz.exonAA.fa.gz $pd/$gp.exonAA.fa.gz ln -s `pwd`/$gp.$mz.exonNuc.fa.gz $pd/$gp.exonNuc.fa.gz # now do the canonical set cd /cluster/data/hg18/bed/multiz28way/pal mz=multiz28way gp=knownCanonical db=hg18 for j in `awk '{print $1}' /cluster/data/hg18/chrom.sizes` do echo "select chrom, chromStart, chromEnd, transcript from knownCanonical where chrom='$j'" | hgsql $db | tail -n +2 > $j.known.bed done mkdir exonAA exonNuc ppredAA ppredNuc for j in `sort -nk 2 /cluster/data/$db/chrom.sizes | awk '{print $1}'` do echo "date" echo "mafGene -geneBeds=$j.known.bed $db $mz knownGene order.lst stdout | \ gzip -c > ppredAA/$j.ppredAA.fa.gz" echo "mafGene -geneBeds=$j.known.bed -noTrans $db $mz knownGene order.lst stdout | \ gzip -c > ppredNuc/$j.ppredNuc.fa.gz" echo 
"mafGene -geneBeds=$j.known.bed -exons -noTrans $db $mz knownGene order.lst stdout | \ gzip -c > exonNuc/$j.exonNuc.fa.gz" echo "mafGene -geneBeds=$j.known.bed -exons $db $mz knownGene order.lst stdout | \ gzip -c > exonAA/$j.exonAA.fa.gz" done > $gp.$mz.jobs time sh -x $gp.$mz.jobs > $gp.$mz.job.log 2>&1 & sleep 1 tail -f $gp.$mz.job.log # real 216m41.700s # user 10m22.016s # sys 4m6.917s rm *.known.bed zcat exonAA/c*.gz | gzip -c > $gp.$mz.exonAA.fa.gz zcat exonNuc/c*.gz | gzip -c > $gp.$mz.exonNuc.fa.gz zcat ppredAA/c*.gz | gzip -c > $gp.$mz.ppredAA.fa.gz zcat ppredNuc/c*.gz | gzip -c > $gp.$mz.ppredNuc.fa.gz rm -rf exonAA exonNuc ppredAA ppredNuc pd=/usr/local/apache/htdocs/goldenPath/$db/$mz ln -s `pwd`/$gp.$mz.exonAA.fa.gz $pd/$gp.exonAA.fa.gz ln -s `pwd`/$gp.$mz.exonNuc.fa.gz $pd/$gp.exonNuc.fa.gz ######################################################################### ################################################ # AUTOMATE UPSTREAM FILE CREATION (2008-10-15 markd) update genbank.conf: hg18.upstreamGeneTbl = refGene hg18.upstreamMaf = multiz28way /hive/data/genomes/hg18/bed/multiz28way/species.lst ######################################################################### # BarskiChIPseq tracks Begun: 2008-09-19 Finished: 2008-09-22 Tim # Barski, et al 2007 Paper - High-Resolution Mapping of Histone Modifications in the Human Genome # Solexa high-throughput sequencing: ChIPseq data # http://dir.nhlbi.nih.gov/Papers/lmi/epigenomes/hgtcell.html ssh hgwdev mkdir /hive/data/genomes/hg18/bed/Barski2007/lab cd /hive/data/genomes/hg18/bed/Barski2007/lab wget http://dir.nhlbi.nih.gov/Papers/lmi/epigenomes/data/H3K4me1.bed wget http://dir.nhlbi.nih.gov/Papers/lmi/epigenomes/data/H3K4me2.bed wget http://dir.nhlbi.nih.gov/Papers/lmi/epigenomes/data/H3K4me3.bed wget http://dir.nhlbi.nih.gov/Papers/lmi/epigenomes/data/H3K9me1.bed wget http://dir.nhlbi.nih.gov/Papers/lmi/epigenomes/data/H3K9me2.bed wget http://dir.nhlbi.nih.gov/Papers/lmi/epigenomes/data/H3K9me3.bed wget http://dir.nhlbi.nih.gov/Papers/lmi/epigenomes/data/H3K27me1.bed wget http://dir.nhlbi.nih.gov/Papers/lmi/epigenomes/data/H3K27me2.bed wget http://dir.nhlbi.nih.gov/Papers/lmi/epigenomes/data/H3K27me3.bed wget http://dir.nhlbi.nih.gov/Papers/lmi/epigenomes/data/H3K36me1.bed wget http://dir.nhlbi.nih.gov/Papers/lmi/epigenomes/data/H3K36me3.bed wget http://dir.nhlbi.nih.gov/Papers/lmi/epigenomes/data/H3K79me1.bed wget http://dir.nhlbi.nih.gov/Papers/lmi/epigenomes/data/H3K79me2.bed wget http://dir.nhlbi.nih.gov/Papers/lmi/epigenomes/data/H3K79me3.bed wget http://dir.nhlbi.nih.gov/Papers/lmi/epigenomes/data/H3R2me1.bed wget http://dir.nhlbi.nih.gov/Papers/lmi/epigenomes/data/H3R2me2.bed wget http://dir.nhlbi.nih.gov/Papers/lmi/epigenomes/data/H4K20me1.bed wget http://dir.nhlbi.nih.gov/Papers/lmi/epigenomes/data/H4K20me3.bed wget http://dir.nhlbi.nih.gov/Papers/lmi/epigenomes/data/H4R3me2.bed wget http://dir.nhlbi.nih.gov/Papers/lmi/epigenomes/data/H2BK5me1.bed wget http://dir.nhlbi.nih.gov/Papers/lmi/epigenomes/data/H2AZ.bed wget http://dir.nhlbi.nih.gov/Papers/lmi/epigenomes/data/PolII.bed wget http://dir.nhlbi.nih.gov/Papers/lmi/epigenomes/data/CTCF.bed wget http://dir.nhlbi.nih.gov/Papers/lmi/epigenomes/data/H3K4me1.vstep.gz wget http://dir.nhlbi.nih.gov/Papers/lmi/epigenomes/data/H3K4me2.vstep.gz wget http://dir.nhlbi.nih.gov/Papers/lmi/epigenomes/data/H3K4me3.vstep.gz wget http://dir.nhlbi.nih.gov/Papers/lmi/epigenomes/data/H3K9me1.vstep.gz wget http://dir.nhlbi.nih.gov/Papers/lmi/epigenomes/data/H3K9me2.vstep.gz wget 
http://dir.nhlbi.nih.gov/Papers/lmi/epigenomes/data/H3K9me3.vstep.gz wget http://dir.nhlbi.nih.gov/Papers/lmi/epigenomes/data/H3K27me1.vstep.gz wget http://dir.nhlbi.nih.gov/Papers/lmi/epigenomes/data/H3K27me2.vstep.gz wget http://dir.nhlbi.nih.gov/Papers/lmi/epigenomes/data/H3K27me3.vstep.gz wget http://dir.nhlbi.nih.gov/Papers/lmi/epigenomes/data/H3K36me1.vstep.gz wget http://dir.nhlbi.nih.gov/Papers/lmi/epigenomes/data/H3K36me3.vstep.gz wget http://dir.nhlbi.nih.gov/Papers/lmi/epigenomes/data/H3K79me1.vstep.gz wget http://dir.nhlbi.nih.gov/Papers/lmi/epigenomes/data/H3K79me2.vstep.gz wget http://dir.nhlbi.nih.gov/Papers/lmi/epigenomes/data/H3K79me3.vstep.gz wget http://dir.nhlbi.nih.gov/Papers/lmi/epigenomes/data/H3R2me1.vstep.gz wget http://dir.nhlbi.nih.gov/Papers/lmi/epigenomes/data/H3R2me2.vstep.gz wget http://dir.nhlbi.nih.gov/Papers/lmi/epigenomes/data/H4K20me1.vstep.gz wget http://dir.nhlbi.nih.gov/Papers/lmi/epigenomes/data/H4K20me3.vstep.gz wget http://dir.nhlbi.nih.gov/Papers/lmi/epigenomes/data/H4R3me2.vstep.gz wget http://dir.nhlbi.nih.gov/Papers/lmi/epigenomes/data/H2BK5me1.vstep.gz wget http://dir.nhlbi.nih.gov/Papers/lmi/epigenomes/data/H2AZ.vstep.gz wget http://dir.nhlbi.nih.gov/Papers/lmi/epigenomes/data/PolII.vstep.gz wget http://dir.nhlbi.nih.gov/Papers/lmi/epigenomes/data/CTCF.vstep.gz gunzip *.gz mv H3K4me1.vstep H3K4me1.wig mv H3K4me2.vstep H3K4me2.wig mv H3K4me3.vstep H3K4me3.wig mv H3K9me1.vstep H3K9me1.wig mv H3K9me2.vstep H3K9me2.wig mv H3K9me3.vstep H3K9me3.wig mv H3K27me1.vstep H3K27me1.wig mv H3K27me2.vstep H3K27me2.wig mv H3K27me3.vstep H3K27me3.wig mv H3K36me1.vstep H3K36me1.wig mv H3K36me3.vstep H3K36me3.wig mv H3K79me1.vstep H3K79me1.wig mv H3K79me2.vstep H3K79me2.wig mv H3K79me3.vstep H3K79me3.wig mv H3R2me1.vstep H3R2me1.wig mv H3R2me2.vstep H3R2me2.wig mv H4K20me1.vstep H4K20me1.wig mv H4K20me3.vstep H4K20me3.wig mv H4R3me2.vstep H4R3me2.wig mv H2BK5me1.vstep H2BK5me1.wig mv H2AZ.vstep H2AZ.wig mv PolII.vstep PolII.wig mv CTCF.vstep CTCF.wig head -1 H3K4me1.vstep | sed -e "s/\"CTCF/\"BarskiChIPseqH3K4me1/g" > barskiChIPseqH3K4me1.wigVar head -1 H3K4me2.vstep | sed -e "s/\"CTCF/\"BarskiChIPseqH3K4me2/g" > barskiChIPseqH3K4me2.wigVar head -1 H3K4me3.vstep | sed -e "s/\"CTCF/\"BarskiChIPseqH3K4me3/g" > barskiChIPseqH3K4me3.wigVar head -1 H3K9me1.vstep | sed -e "s/\"CTCF/\"BarskiChIPseqH3K9me1/g" > barskiChIPseqH3K9me1.wigVar head -1 H3K9me2.vstep | sed -e "s/\"CTCF/\"BarskiChIPseqH3K9me2/g" > barskiChIPseqH3K9me2.wigVar head -1 H3K9me3.vstep | sed -e "s/\"CTCF/\"BarskiChIPseqH3K9me3/g" > barskiChIPseqH3K9me3.wigVar head -1 H3K27me1.vstep | sed -e "s/\"CTCF/\"BarskiChIPseqH3K27me1/g" > barskiChIPseqH3K27me1.wigVar head -1 H3K27me2.vstep | sed -e "s/\"CTCF/\"BarskiChIPseqH3K27me2/g" > barskiChIPseqH3K27me2.wigVar head -1 H3K27me3.vstep | sed -e "s/\"CTCF/\"BarskiChIPseqH3K27me3/g" > barskiChIPseqH3K27me3.wigVar head -1 H3K36me1.vstep | sed -e "s/\"CTCF/\"BarskiChIPseqH3K36me1/g" > barskiChIPseqH3K36me1.wigVar head -1 H3K36me3.vstep | sed -e "s/\"CTCF/\"BarskiChIPseqH3K36me3/g" > barskiChIPseqH3K36me3.wigVar head -1 H3K79me1.vstep | sed -e "s/\"CTCF/\"BarskiChIPseqH3K79me1/g" > barskiChIPseqH3K79me1.wigVar head -1 H3K79me2.vstep | sed -e "s/\"CTCF/\"BarskiChIPseqH3K79me2/g" > barskiChIPseqH3K79me2.wigVar head -1 H3K79me3.vstep | sed -e "s/\"CTCF/\"BarskiChIPseqH3K79me3/g" > barskiChIPseqH3K79me3.wigVar head -1 H3R2me1.vstep | sed -e "s/\"CTCF/\"BarskiChIPseqH3R2me1/g" > barskiChIPseqH3R2me1.wigVar head -1 H3R2me2.vstep | sed -e 
"s/\"CTCF/\"BarskiChIPseqH3R2me2/g" > barskiChIPseqH3R2me2.wigVar head -1 H4K20me1.vstep | sed -e "s/\"CTCF/\"BarskiChIPseqH4K20me1/g" > barskiChIPseqH4K20me1.wigVar head -1 H4K20me3.vstep | sed -e "s/\"CTCF/\"BarskiChIPseqH4K20me3/g" > barskiChIPseqH4K20me3.wigVar head -1 H4R3me2.vstep | sed -e "s/\"CTCF/\"BarskiChIPseqH4R3me2/g" > barskiChIPseqH4R3me2.wigVar head -1 H2BK5me1.vstep | sed -e "s/\"CTCF/\"BarskiChIPseqH2BK5me1/g" > barskiChIPseqH2BK5me1.wigVar head -1 H2AZ.vstep | sed -e "s/\"CTCF/\"BarskiChIPseqH2AZ/g" > barskiChIPseqH2AZ.wigVar head -1 PolII.vstep | sed -e "s/\"CTCF/\"BarskiChIPseqPolII/g" > barskiChIPseqPolII.wigVar head -1 CTCF.vstep | sed -e "s/\"CTCF/\"BarskiChIPseqCTCF/g" > barskiChIPseqCTCF.wigVar tail --lines=+2 H3K4me1.vstep >> barskiChIPseqH3K4me1.wigVar tail --lines=+2 H3K4me2.vstep >> barskiChIPseqH3K4me2.wigVar tail --lines=+2 H3K4me3.vstep >> barskiChIPseqH3K4me3.wigVar tail --lines=+2 H3K9me1.vstep >> barskiChIPseqH3K9me1.wigVar tail --lines=+2 H3K9me2.vstep >> barskiChIPseqH3K9me2.wigVar tail --lines=+2 H3K9me3.vstep >> barskiChIPseqH3K9me3.wigVar tail --lines=+2 H3K27me1.vstep >> barskiChIPseqH3K27me1.wigVar tail --lines=+2 H3K27me2.vstep >> barskiChIPseqH3K27me2.wigVar tail --lines=+2 H3K27me3.vstep >> barskiChIPseqH3K27me3.wigVar tail --lines=+2 H3K36me1.vstep >> barskiChIPseqH3K36me1.wigVar tail --lines=+2 H3K36me3.vstep >> barskiChIPseqH3K36me3.wigVar tail --lines=+2 H3K79me1.vstep >> barskiChIPseqH3K79me1.wigVar tail --lines=+2 H3K79me2.vstep >> barskiChIPseqH3K79me2.wigVar tail --lines=+2 H3K79me3.vstep >> barskiChIPseqH3K79me3.wigVar tail --lines=+2 H3R2me1.vstep >> barskiChIPseqH3R2me1.wigVar tail --lines=+2 H3R2me2.vstep >> barskiChIPseqH3R2me2.wigVar tail --lines=+2 H4K20me1.vstep >> barskiChIPseqH4K20me1.wigVar tail --lines=+2 H4K20me3.vstep >> barskiChIPseqH4K20me3.wigVar tail --lines=+2 H4R3me2.vstep >> barskiChIPseqH4R3me2.wigVar tail --lines=+2 H2BK5me1.vstep >> barskiChIPseqH2BK5me1.wigVar tail --lines=+2 H2AZ.vstep >> barskiChIPseqH2AZ.wigVar tail --lines=+2 PolII.vstep >> barskiChIPseqPolII.wigVar tail --lines=+2 CTCF.vstep >> barskiChIPseqCTCF.wigVar mkdir ../signal mv *.wigVar ../signal gzip * mkdir ../tags mv H3K4me1.bed ../tags/barskiChIPseqH3K4me1.bed mv H3K4me2.bed ../tags/barskiChIPseqH3K4me2.bed mv H3K4me3.bed ../tags/barskiChIPseqH3K4me3.bed mv H3K9me1.bed ../tags/barskiChIPseqH3K9me1.bed mv H3K9me2.bed ../tags/barskiChIPseqH3K9me2.bed mv H3K9me3.bed ../tags/barskiChIPseqH3K9me3.bed mv H3K27me1.bed ../tags/barskiChIPseqH3K27me1.bed mv H3K27me2.bed ../tags/barskiChIPseqH3K27me2.bed mv H3K27me3.bed ../tags/barskiChIPseqH3K27me3.bed mv H3K36me1.bed ../tags/barskiChIPseqH3K36me1.bed mv H3K36me3.bed ../tags/barskiChIPseqH3K36me3.bed mv H3K79me1.bed ../tags/barskiChIPseqH3K79me1.bed mv H3K79me2.bed ../tags/barskiChIPseqH3K79me2.bed mv H3K79me3.bed ../tags/barskiChIPseqH3K79me3.bed mv H3R2me1.bed ../tags/barskiChIPseqH3R2me1.bed mv H3R2me2.bed ../tags/barskiChIPseqH3R2me2.bed mv H4K20me1.bed ../tags/barskiChIPseqH4K20me1.bed mv H4K20me3.bed ../tags/barskiChIPseqH4K20me3.bed mv H4R3me2.bed ../tags/barskiChIPseqH4R3me2.bed mv H2BK5me1.bed ../tags/barskiChIPseqH2BK5me1.bed mv H2AZ.bed ../tags/barskiChIPseqH2AZ.bed mv PolII.bed ../tags/barskiChIPseqPolII.bed mv CTCF.bed ../tags/barskiChIPseqCTCF.bed cd .. cd ../signal cat > makeWig.sh << \_EOF_ #!/bin/bash genDir=/gbdb/hg18/barskiChIPseq mkdir \${genDir} for file in *.wigVar do base=\${file%.wigVar} echo "Loading \${file} to \${base}..." 
    time nice -n +19 wigEncode \${base}.wigVar \${base}.wig \${base}.wib
    time nice -n +19 hgLoadWiggle -pathPrefix=\${genDir} hg18 \${base} \${base}.wig
    ln -sf `pwd`/\${base}.wib \${genDir}/\${base}.wib
done
_EOF_
chmod 755 makeWig.sh
./makeWig.sh &

# ................ Got to here
# ................ Got to here
# ................ Got to here
# ................ Got to here

# .............. I have not loaded the tags !!!
# time nice -n +19 hgLoadBed hg18 barskiChIPseqH3K4me1  BarskiChIPseqH3K4me1.bed
# time nice -n +19 hgLoadBed hg18 barskiChIPseqH3K4me2  BarskiChIPseqH3K4me2.bed
# time nice -n +19 hgLoadBed hg18 barskiChIPseqH3K4me3  BarskiChIPseqH3K4me3.bed
# time nice -n +19 hgLoadBed hg18 barskiChIPseqH3K9me1  BarskiChIPseqH3K9me1.bed
# time nice -n +19 hgLoadBed hg18 barskiChIPseqH3K9me2  BarskiChIPseqH3K9me2.bed
# time nice -n +19 hgLoadBed hg18 barskiChIPseqH3K9me3  BarskiChIPseqH3K9me3.bed
# time nice -n +19 hgLoadBed hg18 barskiChIPseqH3K27me1 BarskiChIPseqH3K27me1.bed
# time nice -n +19 hgLoadBed hg18 barskiChIPseqH3K27me2 BarskiChIPseqH3K27me2.bed
# time nice -n +19 hgLoadBed hg18 barskiChIPseqH3K27me3 BarskiChIPseqH3K27me3.bed
# time nice -n +19 hgLoadBed hg18 barskiChIPseqH3K36me1 BarskiChIPseqH3K36me1.bed
# time nice -n +19 hgLoadBed hg18 barskiChIPseqH3K36me3 BarskiChIPseqH3K36me3.bed
# time nice -n +19 hgLoadBed hg18 barskiChIPseqH3K79me1 BarskiChIPseqH3K79me1.bed
# time nice -n +19 hgLoadBed hg18 barskiChIPseqH3K79me2 BarskiChIPseqH3K79me2.bed
# time nice -n +19 hgLoadBed hg18 barskiChIPseqH3K79me3 BarskiChIPseqH3K79me3.bed
# time nice -n +19 hgLoadBed hg18 barskiChIPseqH3R2me1  BarskiChIPseqH3R2me1.bed
# time nice -n +19 hgLoadBed hg18 barskiChIPseqH3R2me2  BarskiChIPseqH3R2me2.bed
# time nice -n +19 hgLoadBed hg18 barskiChIPseqH4K20me1 BarskiChIPseqH4K20me1.bed
# time nice -n +19 hgLoadBed hg18 barskiChIPseqH4K20me3 BarskiChIPseqH4K20me3.bed
# time nice -n +19 hgLoadBed hg18 barskiChIPseqH4R3me2  BarskiChIPseqH4R3me2.bed
# time nice -n +19 hgLoadBed hg18 barskiChIPseqH2BK5me1 BarskiChIPseqH2BK5me1.bed
# time nice -n +19 hgLoadBed hg18 barskiChIPseqH2AZ     BarskiChIPseqH2AZ.bed
# time nice -n +19 hgLoadBed hg18 barskiChIPseqPolII    BarskiChIPseqPolII.bed
# time nice -n +19 hgLoadBed hg18 barskiChIPseqCTCF     BarskiChIPseqCTCF.bed
# .............. I have not loaded the tags !!!
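# If the tag beds are loaded later, a minimal loop sketch (not run as part of
# this build) along the lines of the signal loading above; it assumes the
# per-mark bed files sit uncompressed in ../tags under the barskiChIPseq*
# names given to them by the mv commands.
cd ../tags
for bed in barskiChIPseq*.bed
do
    table=${bed%.bed}
    time nice -n +19 hgLoadBed hg18 ${table} ${bed}
done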
######################################################################### ## 44-Way Multiz (DONE - 2008-11-10,15 - Hiram) ssh hgwdev mkdir /hive/data/genomes/hg18/bed/multiz44way cd /hive/data/genomes/hg18/bed/multiz44way # starting with the 44way tree that Brian made earlier: cp -p ../multiz44way.2008-08-06/44way.db.nh ./44way.nh sed -e "s/oryLat1/hg18/; s/danRer4/danRer5/; s/oryLat1/oryLat2/" \ /cluster/data/oryLat1/bed/multiz44way/44way.nh > 44way.nh # this file looks like: cat << '_EOF_' > 44way.nh (((tetraodon_tetNig1:0.199381,fugu_fr2:0.239894):0.2, (stickleback_gasAcu1:0.2,medaka_hg18:0.2):0.2):0.292961, zebrafish_danRer5:0.782561); '_EOF_' # << happy emacs # Use this specification in the phyloGif tool: # http://genome.ucsc.edu/cgi-bin/phyloGif # to obtain a gif image for htdocs/images/phylo/hg18_44way.gif /cluster/bin/phast/all_dists 44way.nh > 44way.distances.txt # Use this output to create the table below, with this perl script: cat << '_EOF_' > sizeStats.pl #!/usr/bin/env perl use strict; use warnings; open (FH, "grep -y hg18 44way.distances.txt | sort -k3,3n|") or die "can not read 44way.distances.txt"; my $count = 0; while (my $line = ) { chomp $line; my ($hg18, $D, $dist) = split('\s+', $line); my $chain = "chain" . ucfirst($D); my $B="/hive/data/genomes/hg18/bed/blastz.$D/fb.hg18." . $chain . "Link.txt"; my $chainLinkMeasure = `awk '{print \$5}' ${B} 2> /dev/null | sed -e "s/(//; s/)//"`; chomp $chainLinkMeasure; $chainLinkMeasure = 0.0 if (length($chainLinkMeasure) < 1); $chainLinkMeasure =~ s/\%//; my $orgName= `hgsql -N -e "select organism from dbDb where name='$D';" hgcentraltest`; chomp $orgName; if (length($orgName) < 1) { $orgName="N/A"; } ++$count; printf "# %02d %.4f - %s %s\t(%% %.3f)\n", $count, $dist, $orgName, $D, $chainLinkMeasure } close (FH); '_EOF_' # << happy emacs chmod +x ./sizeStats.pl ./sizeStats.pl # # If you can fill in all the numbers in this table, you are ready for # the multiple alignment procedure # # featureBits chainLink measures # chainOryLat1Link chain linearGap # distance on hg18 on other minScore # 01 0.0092 - Chimp panTro2 (% 94.888) # 02 0.0267 - Gorilla gorGor1 (% 61.731) # 03 0.0467 - Orangutan ponAbe2 (% 92.892) # 04 0.0667 - Marmoset calJac1 (% 78.351) # 05 0.0783 - Rhesus rheMac2 (% 85.552) # 06 0.1767 - Tarsier tarSyr1 (% 47.999) # 07 0.2448 - Mouse lemur micMur1 (% 46.445) # 08 0.3061 - Bushbaby otoGar1 (% 44.638) # 09 0.3367 - Rabbit oryCun1 (% 34.015) # 10 0.3507 - TreeShrew tupBel1 (% 37.348) # 11 0.3567 - Squirrel speTri1 (% 35.828) # 12 0.4067 - Guinea Pig cavPor3 (% 43.971) # 13 0.4067 - Alpaca vicPac1 (% 39.531) # 14 0.4098 - Megabat pteVam1 (% 45.502) # 15 0.4099 - Microbat myoLuc1 (% 33.044) # 16 0.4154 - Cat felCat3 (% 35.888) # 17 0.4293 - Elephant loxAfr2 (% 35.204) # 18 0.4314 - Dog canFam2 (% 52.915) # 19 0.4317 - Mouse mm9 (% 35.201) # 20 0.4362 - Rat rn4 (% 32.893) # 21 0.4367 - Pika ochPri2 (% 27.974) # 22 0.4639 - Horse equCab2 (% 57.162) # 23 0.4693 - Rock hyrax proCap1 (% 30.935) # 24 0.4767 - Dolphin turTru1 (% 48.537) # 25 0.5067 - Kangaroo rat dipOrd1 (% 27.282) # 26 0.5187 - Armadillo dasNov2 (% 33.663) # 27 0.5191 - Cow bosTau4 (% 46.689) # 28 0.5298 - hedgehog eriEur1 (% 19.622) # 29 0.5399 - Sloth choHof1 (% 34.463) # 30 0.5605 - Shrew sorAra1 (% 20.056) # 31 0.5815 - Tenrec echTel1 (% 23.645) # 32 0.7309 - Opossum monDom4 (% 12.385) # 33 0.9870 - Platypus ornAna1 (% 7.870) # 34 1.0313 - Zebra finch taeGut1 (% 3.503) # 35 1.0436 - Lamprey petMar1 (% 1.251) # 36 1.1013 - Chicken galGal3 (% 3.589) # 37 
1.2253 - Lizard anoCar1 (% 4.774) # 38 1.5473 - X. tropicalis xenTro2 (% 2.623) # 39 1.8337 - Stickleback gasAcu1 (% 1.923) # 40 1.8482 - Zebrafish danRer5 (% 2.565) # 41 1.8721 - Tetraodon tetNig1 (% 2.001) # 42 1.9077 - Fugu fr2 (% 1.766) # 43 2.0215 - Medaka oryLat2 (% 1.829) # create species list and stripped down tree for autoMZ sed 's/[a-z][a-z]*_//g; s/:[0-9\.][0-9\.]*//g; s/;//; /^ *$/d' \ 44way.nh > tmp.nh echo `cat tmp.nh` > tree-commas.nh echo `cat tree-commas.nh` | sed 's/ //g; s/,/ /g' > tree.nh sed 's/[()]//g; s/,/ /g' tree.nh > species.list cd /hive/data/genomes/hg18/bed/multiz44way # bash shell syntax here ... export H=/hive/data/genomes/hg18/bed mkdir mafLinks for G in `sed -e "s/hg18 //" species.list` do mkdir mafLinks/$G if [ -s ${H}/blastz.${G}/mafRBestNet/chr1.maf.gz ]; then echo "$G - recipBest" ln -s ${H}/blastz.$G/mafRBestNet/*.maf.gz ./mafLinks/$G else if [ -s ${H}/blastz.${G}/mafSynNet/chr1.maf.gz ]; then echo "$G - synNet" ln -s ${H}/blastz.$G/mafSynNet/*.maf.gz ./mafLinks/$G else if [ -s ${H}/blastz.${G}/mafNet/chr1.maf.gz ]; then echo "$G - mafNet" ln -s ${H}/blastz.$G/mafNet/*.maf.gz ./mafLinks/$G else echo "missing directory blastz.${G}/*Net" fi fi fi done # need to split these things up into smaller pieces for # efficient kluster run. Using the new hive architecture. ssh hgwdev cd /hive/data/genomes/hg18/bed/multiz44way mkdir mafSplit # mafSplitPos splits on repeat areas that will not have any chains mafSplitPos -minGap=50000 hg18 10 mafSplit.bed for G in `sed -e "s/hg18 //" species.list` do echo -n "working ${G} ..." rm -fr mafSplit/${G} mkdir mafSplit/${G} cd mafSplit/${G} mafSplit ../../mafSplit.bed hg18_ ../../mafLinks/${G}/chr*.maf.gz \ -verbose=2 cd /hive/data/genomes/hg18/bed/multiz44way echo " done" done # create a run-time list of files to operate on, not all file names # exist for all assemblies cd mafSplit for D in * do cd "${D}" find . -type f cd .. 
done | sort -u | sed -e "s#./##" > ../44-way.split.list wc -l ../44-way.split.list # 267 ../44-way.split.list # the autoMultiz cluster run ssh swarm cd /hive/data/genomes/hg18/bed/multiz44way/ mkdir splitRun cd splitRun mkdir maf run cd run mkdir penn cp -p /cluster/bin/penn/multiz.v11.2007-03-19/multiz-tba/multiz penn cp -p /cluster/bin/penn/multiz.v11.2007-03-19/multiz-tba/maf_project penn cp -p /cluster/bin/penn/multiz.v11.2007-03-19/multiz-tba/autoMZ penn # set the db and pairs directories here cat > autoMultiz.csh << '_EOF_' #!/bin/csh -ef set db = hg18 set c = $1 set result = $2 set run = `pwd` set tmp = $run/tmp/$db/multiz.$c set pairs = /hive/data/genomes/hg18/bed/multiz44way/mafSplit /bin/rm -fr $tmp /bin/mkdir -p $tmp /bin/cp -p ../../tree.nh ../../species.list $tmp pushd $tmp foreach s (`sed -e "s/ $db//" species.list`) set in = $pairs/$s/$c.maf set out = $db.$s.sing.maf if (-e $in.gz) then /bin/zcat $in.gz > $out else if (-e $in) then ln -s $in $out else echo "##maf version=1 scoring=autoMZ" > $out endif end set path = ($run/penn $path); rehash $run/penn/autoMZ + T=$tmp E=$db "`cat tree.nh`" $db.*.sing.maf $c.maf popd /bin/rm -f $result /bin/cp -p $tmp/$c.maf $result /bin/rm -fr $tmp /bin/rmdir --ignore-fail-on-non-empty $run/tmp/$db /bin/rmdir --ignore-fail-on-non-empty $run/tmp '_EOF_' # << happy emacs chmod +x autoMultiz.csh cat << '_EOF_' > template #LOOP ./autoMultiz.csh $(root1) {check out line+ /hive/data/genomes/hg18/bed/multiz44way/splitRun/maf/$(root1).maf} #ENDLOOP '_EOF_' # << happy emacs gensub2 ../../44-way.split.list single template jobList para create jobList # initial run experience suggest some of the big jobs reach 8 Gb # of memory usage, so, tell parasol to limit the number of jobs per # node to avoid thrashing para -ram=6g push # 8 jobs were finished manually on hgwdev, kolossus and memk nodes XXXX - running 2008-11-12 - Wed Nov 12 15:29:39 PST 2008 # Completed: 792 of 792 jobs # CPU time in finished jobs: 5423s 90.38m 1.51h 0.06d 0.000 y # IO & Wait Time: 138287s 2304.79m 38.41h 1.60d 0.004 y # Average job time: 181s 3.02m 0.05h 0.00d # Longest finished job: 404s 6.73m 0.11h 0.00d # Submission to last job: 436s 7.27m 0.12h 0.01d # Estimated complete: 0s 0.00m 0.00h 0.00d # put the split maf results back together into a single maf file # eliminate duplicate comments ssh hgwdev cd /hive/data/genomes/hg18/bed/multiz44way/splitRun mkdir ../maf # the sed edits take out partitioning name information from the comments # so the multiple parts will condense to smaller number of lines # this takes almost 2 hours of time, resulting in a bit over 150 Gb, # almost all chrom files over 1 Gb, up to almost 10 Gb for chr2 # HOWEVER, this is actually not necessary to maintain these comments, # they are lost during the mafAddIRows ls maf | sed -e "s/hg18_//; s/\..*//" | sort -u | while read C do echo "========== $C ==============" rm -f ../maf/${C}.maf.gz head -q -n 1 maf/hg18_${C}.*.maf | sort -u > ../maf/${C}.maf grep -h "^#" maf/hg18_${C}.*.maf | egrep -v "maf version=1|eof maf" | \ sed -e "s#${C}.[0-9][0-9]*#${C}#g; s#_MZ_[^ ]* # #g;" \ | sort -u >> ../maf/${C}.maf grep -h -v "^#" `ls maf/hg18_${C}.*.maf | sort -t. -k2,2n` \ >> ../maf/${C}.maf tail -q -n 1 maf/hg18_${C}.*.maf | sort -u >> ../maf/${C}.maf done # load tables for a look ssh hgwdev mkdir -p /gbdb/hg18/multiz44way/maf cd /hive/data/genomes/hg18/bed/multiz44way/maf ln -s `pwd`/*.maf /gbdb/hg18/multiz44way/maf # this generates an immense multiz44way.tab file in the directory # where it is running. 
Best to run this over in scratch. cd /data/tmp time nice -n +19 hgLoadMaf \ -pathPrefix=/gbdb/hg18/multiz44way/maf hg18 multiz44way # real 1m10.380s # Loaded 1366931 mafs in 1 files from /gbdb/hg18/multiz44way/maf # load summary table time nice -n +19 cat /gbdb/hg18/multiz44way/maf/*.maf \ | hgLoadMafSummary hg18 -minSize=30000 -mergeGap=1500 \ -maxSize=200000 multiz44waySummary stdin # real 2m39.822 # Created 353577 summary blocks from 2852890 components and 1197504 mafs # from stdin # Gap Annotation # prepare bed files with gap info mkdir /hive/data/genomes/hg18/bed/multiz44way/anno cd /hive/data/genomes/hg18/bed/multiz44way/anno mkdir maf run # most of these will already exist from previous multiple alignments # remove the echo from in front of the twoBitInfo command to get them # to run if this loop appears to be correct for DB in `cat ../species.list` do CDIR="/hive/data/genomes/${DB}" if [ ! -f ${CDIR}/${DB}.N.bed ]; then echo "creating ${DB}.N.bed" echo twoBitInfo -nBed ${CDIR}/${DB}.2bit ${CDIR}/${DB}.N.bed else ls -og ${CDIR}/${DB}.N.bed fi done cd run rm -f nBeds sizes for DB in `sed -e "s/hg18 //" ../../species.list` do echo "${DB} " ln -s /hive/data/genomes/${DB}/${DB}.N.bed ${DB}.bed echo ${DB}.bed >> nBeds ln -s /hive/data/genomes/${DB}/chrom.sizes ${DB}.len echo ${DB}.len >> sizes done # the annotation step requires large memory, run on memk nodes ssh memk cd /hive/data/genomes/hg18/bed/multiz44way/anno/run ls ../../maf | sed -e "s/.maf//" > chr.list cat << '_EOF_' > template #LOOP ./anno.csh $(root1) {check out line+ ../maf/$(root1).maf} #ENDLOOP '_EOF_' # << happy emacs cat << '_EOF_' > anno.csh #!/bin/csh -fe set inMaf = ../../maf/$1.maf set outMaf = ../maf/$1.maf rm -f $outMaf mafAddIRows -nBeds=nBeds $inMaf /hive/data/genomes/hg18/hg18.2bit $outMaf '_EOF_' # << happy emacs chmod +x anno.csh gensub2 chr.list single template jobList para create jobList # specify lots of ram to get one job per node para -ram=30g push ssh hgwdev rm -fr /gbdb/hg18/multiz44way/maf mkdir /gbdb/hg18/multiz44way/maf cd /hive/data/genomes/hg18/bed/multiz44way/anno/maf ln -s `pwd`/*.maf /gbdb/hg18/multiz44way/maf/ # by loading this into the table multiz44way, it will replace the # previously loaded table with the unannotated mafs # huge temp files are made, do them on local disk cd /data/tmp time nice -n +19 hgLoadMaf \ -pathPrefix=/gbdb/hg18/multiz44way/maf hg18 multiz44way # with final set of quality annotated files: # Loaded 33320838 mafs in 49 files from /gbdb/hg18/multiz44way/maf # real 91m46.889s # running on Irow annotated mafs Fri Nov 28 00:28:09 PST 2008 # Loaded 33320675 mafs in 49 files from /gbdb/hg18/multiz44way/maf # real 236m15.279s # running on bare bones mafs Thu Nov 27 19:29:44 PST 2008 # Loaded 33273351 mafs in 49 files from /gbdb/hg18/multiz44way/maf # real 198m55.761s - while swarm busy with rebalancing # from before the fixed multiz: # Loaded 35154852 mafs in 49 files from /gbdb/hg18/multiz44way/maf # real 71m5.594s time nice -n +19 cat /gbdb/hg18/multiz44way/maf/*.maf \ | hgLoadMafSummary hg18 -minSize=30000 -mergeGap=1500 \ -maxSize=200000 multiz44waySummary stdin # with the quality annotated mafs, and mem interference on hgwdev: # Created 8514381 summary blocks from 600504256 components \ # and 33320838 mafs from stdin # real 169m56.936s # with the Irow annotations after the multiz fix: # Created 8514380 summary blocks from 600499937 # components and 33298894 mafs from stdin # real 184m42.893s # user 70m44.431s # sys 8m7.970s # Created 8514078 summary blocks from 604683213 
components # and 35125649 mafs from stdin # real 130m55.115s # user 71m37.409s # sys 8m5.110s # by loading this into the table multiz44waySummary, it will replace # the previously loaded table with the unannotated mafs # remove the multiz44way*.tab files in this /data/tmp directory # -rw-rw-r-- 1 1949221892 Nov 15 14:04 multiz44way.tab # -rw-rw-r-- 1 417994189 Nov 15 20:57 multiz44waySummary.tab wc -l multiz44way*.tab # 33964377 multiz44way.tab # 8514078 multiz44waySummary.tab # 42478455 total rm multiz44way*.tab # create some downloads mkdir -p /hive/data/genomes/hg18/bed/multiz44way/download/maf cd /hive/data/genomes/hg18/bed/multiz44way/download/maf time cp -p ../../anno/maf/chr*.maf . # real 72m46.514s # user 0m1.293s # sys 5m15.981s time gzip --rsyncable *.maf time gzip --rsyncable *.maf # real 185m37.884s # user 179m51.161s # sys 3m48.016s time md5sum *.gz > md5sum.txt # real 3m59.009s # user 1m19.338s # sys 0m18.976s ############################################################################# ## Annotate 44-way multiple alignment with gene annotations ## (DONE - 2008-12-08,23 - Hiram) # Gene frames ## survey all genomes to see what type of gene track to use ssh hgwdev mkdir /hive/data/genomes/hg18/bed/multiz44way/frames cd /hive/data/genomes/hg18/bed/multiz44way/frames # cat << '_EOF_' > showGenes.csh #!/bin/csh -fe foreach db (`cat ../species.list`) echo -n "${db}: " set tables = `hgsql $db -N -e "show tables like '%Gene%'"` foreach table ($tables) if ($table == "ensGene" || $table == "refGene" || $table == "mgcGenes" || \ $table == "knownGene") then set count = `hgsql $db -N -e "select count(*) from $table"` echo -n "${table}: ${count}, " endif end set orgName = `hgsql hgcentraltest -N -e \ "select scientificName from dbDb where name='$db'"` set orgId = `hgsql hg18 -N -e \ "select id from organism where name='$orgName'"` if ($orgId == "") then echo "Mrnas: 0" else set count = `hgsql hg18 -N -e "select count(*) from gbCdnaInfo where organism=$orgId"` echo "Mrnas: ${count}" endif end '_EOF_' # << happy emacs chmod +x ./showGenes.csh # rearrange that output to create four sections: # 1. knownGenes for hg18, mm9 # 2. ensGene for almost everything else # 3. Mrnas for taeGut1, anoCar1, petMar1, calJac1 # 4. 
nothing for loxAfr2, dasNov2, choHof1 mkdir genes # knownGene for DB in hg18 mm9 do hgsql -N -e "select name,chrom,strand,txStart,txEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds from knownGene" ${DB} \ | genePredSingleCover stdin stdout | gzip -2c \ > /scratch/tmp/${DB}.tmp.gz mv /scratch/tmp/${DB}.tmp.gz genes/$DB.gp.gz echo "${DB} done" done # ensGene for DB in bosTau4 canFam2 cavPor3 danRer5 dipOrd1 echTel1 equCab2 \ eriEur1 felCat3 fr2 galGal3 gasAcu1 gorGor1 micMur1 monDom4 myoLuc1 \ ochPri2 ornAna1 oryCun1 oryLat2 otoGar1 panTro2 ponAbe2 proCap1 \ pteVam1 rheMac2 rn4 sorAra1 speTri1 tarSyr1 tetNig1 tupBel1 \ turTru1 vicPac1 xenTro2 do hgsql -N -e "select name,chrom,strand,txStart,txEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds from ensGene" ${DB} \ | genePredSingleCover stdin stdout | gzip -2c \ > /scratch/tmp/${DB}.tmp.gz mv /scratch/tmp/${DB}.tmp.gz genes/$DB.gp.gz echo "${DB} done" done # use Mrnas for taeGut1 anoCar1 petMar1 calJac1 for DB in taeGut1 anoCar1 petMar1 calJac1 do tmpExt=`mktemp temp.XXXXXX` tmpMrnaCds=${DB}.mrna-cds.${tmpExt} tmpMrna=${DB}.mrna.${tmpExt} tmpCds=${DB}.cds.${tmpExt} hgsql -N -e 'select all_mrna.qName,cds.name,all_mrna.* \ from all_mrna,gbCdnaInfo,cds \ where (all_mrna.qName = gbCdnaInfo.acc) and \ (gbCdnaInfo.cds != 0) and (gbCdnaInfo.cds = cds.id)' \ $DB > ${tmpMrnaCds} cut -f 1-2 ${tmpMrnaCds} > ${tmpCds} cut -f 4-100 ${tmpMrnaCds} > ${tmpMrna} mrnaToGene -cdsFile=${tmpCds} -smallInsertSize=8 -quiet ${tmpMrna} stdout | \ genePredSingleCover stdin stdout | gzip -2c > /scratch/tmp/$DB.tmp.gz rm ${tmpMrnaCds} ${tmpMrna} ${tmpCds} mv /scratch/tmp/$DB.tmp.gz genes/$DB.gp.gz rm -f $tmpExt echo "${DB} done" done # leaving out loxAfr2, dasNov2, choHof1 since no gene preds there # Create this command with this script: cat << '_EOF_' > mkCmd.sh #!/bin/sh echo "time (cat ../quals/maf/*.maf | nice -n +19 genePredToMafFrames hg18 stdin stdout \\" if [ ! -s genes/mm9.gp.gz ]; then echo "missing genes/mm9.gp.gz" exit 255 fi echo "mm9 genes/mm9.gp.gz \\" for D in `sort ensGene.list` do if [ ! -s genes/${D}.gp.gz ]; then echo "missing genes/${D}.gp.gz" exit 255 fi echo -n "${D} genes/${D}.gp.gz " done echo "\\" for D in `sort mrna.list` do if [ ! 
-s genes/${D}.gp.gz ]; then echo "missing genes/${D}.gp.gz" exit 255 fi echo -n "${D} genes/${D}.gp.gz " done echo "\\" echo " | gzip > multiz44way.mafFrames.gz) > frames.log 2>&1" '_EOF_' # << happy emacs chmod +x ./mkCmd.sh time (cat ../quals/maf/*.maf | nice -n +19 genePredToMafFrames hg18 stdin stdout \ mm9 genes/mm9.gp.gz \ bosTau4 genes/bosTau4.gp.gz canFam2 genes/canFam2.gp.gz cavPor3 genes/cavPor3.gp.gz danRer5 genes/danRer5.gp.gz dipOrd1 genes/dipOrd1.gp.gz echTel1 genes/echTel1.gp.gz equCab2 genes/equCab2.gp.gz eriEur1 genes/eriEur1.gp.gz felCat3 genes/felCat3.gp.gz fr2 genes/fr2.gp.gz galGal3 genes/galGal3.gp.gz gasAcu1 genes/gasAcu1.gp.gz micMur1 genes/micMur1.gp.gz monDom4 genes/monDom4.gp.gz myoLuc1 genes/myoLuc1.gp.gz ochPri2 genes/ochPri2.gp.gz ornAna1 genes/ornAna1.gp.gz oryCun1 genes/oryCun1.gp.gz oryLat2 genes/oryLat2.gp.gz otoGar1 genes/otoGar1.gp.gz panTro2 genes/panTro2.gp.gz ponAbe2 genes/ponAbe2.gp.gz proCap1 genes/proCap1.gp.gz pteVam1 genes/pteVam1.gp.gz rheMac2 genes/rheMac2.gp.gz rn4 genes/rn4.gp.gz sorAra1 genes/sorAra1.gp.gz speTri1 genes/speTri1.gp.gz tarSyr1 genes/tarSyr1.gp.gz tetNig1 genes/tetNig1.gp.gz tupBel1 genes/tupBel1.gp.gz turTru1 genes/turTru1.gp.gz vicPac1 genes/vicPac1.gp.gz xenTro2 genes/xenTro2.gp.gz \ anoCar1 genes/anoCar1.gp.gz calJac1 genes/calJac1.gp.gz petMar1 genes/petMar1.gp.gz taeGut1 genes/taeGut1.gp.gz \ | gzip > multiz44way.mafFrames.gz) > frames.log 2>&1 # that doesn't work on any 32 Gb computer, requires much more memory # turn it into a kluster job ssh swarm cd /hive/data/genomes/hg18/bed/multiz44way/frames cat << '_EOF_' > runOne #!/bin/csh -fe set C = $1 set G = $2 cat ../quals/maf/${C}.maf | genePredToMafFrames hg18 stdin stdout \ ${G} genes/${G}.gp.gz | gzip > parts/${C}.${G}.mafFrames.gz '_EOF_' # << happy emacs chmod +x runOne ls ../quals/maf | sed -e "s/.maf//" > chr.list ls genes | sed -e "s/.gp.gz//" | grep -v hg18 > gene.list cat << '_EOF_' > template #LOOP runOne $(root1) $(root2) {check out exists+ parts/$(root1).$(root2).mafFrames.gz} #ENDLOOP '_EOF_' # << happy emacs mkdir parts gensub2 chr.list gene.list template jobList para -ram=8g create jobList para try ... check ... 
push # Completed: 1911 of 1911 jobs # CPU time in finished jobs: 126751s 2112.52m 35.21h 1.47d 0.004 y # IO & Wait Time: 2573543s 42892.38m 714.87h 29.79d 0.082 y # Average job time: 1413s 23.55m 0.39h 0.02d # Longest finished job: 6490s 108.17m 1.80h 0.08d # Submission to last job: 11310s 188.50m 3.14h 0.13d # see what it looks like in terms of number of annotations per DB: find ./parts -type f | while read F do zcat ${F} done | cut -f4 | sort | uniq -c | sort -n 165 anoCar1 2807 calJac1 3306 taeGut1 5416 petMar1 141256 tarSyr1 142346 vicPac1 163854 sorAra1 164475 galGal3 174150 felCat3 178531 oryCun1 178744 ornAna1 179511 turTru1 190622 eriEur1 191477 tupBel1 197338 panTro2 198063 speTri1 199541 micMur1 207391 ponAbe2 208629 rheMac2 208850 otoGar1 212751 myoLuc1 212857 dipOrd1 213343 proCap1 214972 echTel1 216367 monDom4 220724 ochPri2 223159 equCab2 227928 bosTau4 231351 cavPor3 231553 pteVam1 233980 mm9 234268 rn4 249016 canFam2 258191 xenTro2 315098 danRer5 365824 oryLat2 387739 fr2 423941 gasAcu1 549846 tetNig1 # load the resulting file ssh hgwdev cd /cluster/data/hg18/bed/multiz44way/frames find ./parts -type f | while read F do zcat ${F} done | sort -k1,1 -k2,2n | hgLoadMafFrames hg18 multiz44wayFrames stdin find ./parts -type f | while read F do zcat ${F} done | sort -k1,1 -k2,2n > multiz44wayFrames.bed featureBits -countGaps hg18 multiz44wayFrames.bed # 62315198 bases of 3107677273 (2.005%) in intersection featureBits -countGaps hg18 multiz28wayFrames # 48236360 bases of 3107677273 (1.552%) in intersection # enable the trackDb entries: # frames multiz44wayFrames # irows on # appears to work OK ######################################################################### # Phylogenetic tree from 44-way (2008-12-06 kate) # Extract 4-fold degenerate sites based on # of RefSeq Reviewed, coding ssh pk cd /hive/data/genomes/hg18/bed/multiz44way mkdir 4d cd 4d hgsql hg18 -Ne \ "select * from refGene,refSeqStatus where refGene.name=refSeqStatus.mrnaAcc and refSeqStatus.status='Reviewed' and mol='mRNA'" | cut -f 2-20 > refSeqReviewed.gp wc -l refSeqReviewed.gp #12684 refSeqReviewed.gp genePredSingleCover refSeqReviewed.gp stdout | sort > refSeqReviewedNR.gp wc -l refSeqReviewedNR.gp #7365 refSeqReviewedNR.gp mkdir run cd run # chopped up mafs version # run on swarm with -ram=8g cat > 4d.csh << 'EOF' set infile = $1 set outfile = $2 set c = `echo $1 | sed 's/^.*hg18_\(chr[^.][^.]*\).*.maf/\1/'` echo $c cd /scratch/tmp # 'clean' maf perl -wpe 's/^s ([^.]+)\.\S+/s $1/' $infile > $c.maf awk -v C=$c '$2 == C {print}' /cluster/data/hg18/bed/multiz44way/4d/refSeqReviewedNR.gp > $c.gp set PHASTBIN=/cluster/bin/phast.2008-11-30 $PHASTBIN/msa_view --4d --features $c.gp -i MAF $c.maf -o SS > $c.ss $PHASTBIN/msa_view -i SS --tuple-size 1 $c.ss > $outfile #rm -f $c.gp $c.maf $c.ss 'EOF' # whole chrom mafs version, using new version of # uses memory-efficient version of phast, from Melissa Hubisz at Cornell (mjhubisz@gmail.com) cat > 4d.csh << 'EOF' set c = $1 set infile = $2 set outfile = $3 echo $c cd /scratch/tmp # 'clean' maf perl -wpe 's/^s ([^.]+)\.\S+/s $1/' $infile > $c.maf awk -v C=$c '$2 == C {print}' /cluster/data/hg18/bed/multiz44way/4d/refSeqReviewedNR.gp > $c.gp set PHASTBIN=/cluster/bin/phast.2008-12-18 $PHASTBIN/msa_view --4d --features --do-cats 3 $c.gp -i MAF $c.maf -o SS > $c.ss $PHASTBIN/msa_view -i SS --tuple-size 1 $c.ss > $outfile rm -f $c.gp $c.maf $c.ss 'EOF' ls -1S /hive/data/genomes/hg18/bed/multiz44way/maf/*.maf | \ grep -v random | grep -v chrM | grep -v hap > in.lst cat << 
'EOF' > template #LOOP csh 4d.csh $(root1) {check in line+ $(path1)} {check out line+ /cluster/data/hg18/bed/multiz44way/4d/mfa/$(root1).mfa} #ENDLOOP 'EOF' # << this line makes emacs coloring happy cat << 'EOF' > template #LOOP csh 4d.csh $(root1) {check in line+ $(path1)} {check out line+ /cluster/data/hg18/bed/multiz44way/4d/mfa2/$(root1).mfa} #ENDLOOP 'EOF' # << this line makes emacs coloring happy gensub2 in.lst single template stdout | tac > jobList rm -fr /cluster/data/hg18/bed/multiz44way/4d/mfa mkdir /cluster/data/hg18/bed/multiz44way/4d/mfa para create jobList para try para check para push # combine mfa files cd .. sed -e "s/ /,/g" ../species.list > species.lst /cluster/bin/phast/msa_view --aggregate `cat species.lst` mfa/*.mfa | \ sed s/"> "/">"/ > 4d.all.mfa sed -e 's/,monDom4.*//' species.lst > placentals.lst /cluster/bin/phast/msa_view --aggregate `cat placentals.lst` mfa/*.mfa | \ sed s/"> "/">"/ > 4d.placentals.mfa # use phyloFit to create tree model (output is phyloFit.mod) set PHASTBIN=/cluster/bin/phast.2008-12-18 $PHASTBIN/phyloFit --EM --precision MED --msa-format FASTA --subst-mod REV --tree ../tree-commas.nh 4d.all.mfa # started at 5:50pm # ended at 7:27 => ~90 min on swarm mv phyloFit.mod phyloFit.all.mod grep TREE phyloFit.all.mod | sed 's/TREE\:\ //' > tree_4d.44way.nh $PHASTBIN/tree_doctor \ --prune=monDom4,ornAna1,taeGut1,petMar1,galGal3,anoCar1,xenTro2,gasAcu1,danRer5,tetNig1,fr2,oryLat2 \ tree_4d.44way.nh > tree_4d.44way.placental.nh # chrX-only for placental subset (requested by 2X project) set PHASTBIN=/cluster/bin/phast.2008-12-18 $PHASTBIN/phyloFit --EM --precision MED --msa-format FASTA --subst-mod REV --tree ../tree-commas.nh --out-root 4d.chrX mfa/chrX.mfa ############################################################################# # phastCons 44-way (DONE - 2008-12-23 - 2009-01-02 - Hiram) # split 44way mafs into 10M chunks and generate sufficient statistics # files for # phastCons ssh memk mkdir -p /hive/data/genomes/hg18/bed/multiz44way/cons/msa.split mkdir /hive/data/genomes/hg18/bed/multiz44way/cons/ss cd /hive/data/genomes/hg18/bed/multiz44way/cons/msa.split cat << '_EOF_' > doSplit.csh #!/bin/csh -ef set c = $1 set MAF = /hive/data/genomes/hg18/bed/multiz44way/maf/$c.maf set WINDOWS = /hive/data/genomes/hg18/bed/multiz44way/cons/ss/$c rm -fr $WINDOWS mkdir $WINDOWS pushd $WINDOWS > /dev/null twoBitToFa -seq=$c /hive/data/genomes/hg18/hg18.2bit hg18.$c.fa /cluster/bin/phast/$MACHTYPE/msa_split $MAF -i MAF \ -M hg18.$c.fa -o SS -r $WINDOWS/$c -w 10000000,0 -I 1000 -B 5000 rm -f hg18.$c.fa popd > /dev/null date >> $c.done '_EOF_' # << happy emacs chmod +x doSplit.csh cat << '_EOF_' > template #LOOP doSplit.csh $(root1) {check out line+ $(root1).done} #ENDLOOP '_EOF_' # << happy emacs # do the easy ones first to see some immediate results ls -1S -r ../maf | sed -e "s/.maf//" > maf.list gensub2 maf.list single template jobList para -ram=32g create jobList para try ... check ... etc # this takes a really long time. memk was down to 2 usable # machines - got it finished manually on a combination of hgwdevnew CPUs # and other machines # Estimate phastCons parameters # experimented with this as a parasol job on hgwdevnew to try a number # of SS files. 
With a command of: /cluster/bin/phast/x86_64/phyloFit -i SS ${SS} \ --tree "(((((((((((((((((hg18,panTro2),gorGor1),ponAbe2),rheMac2),calJac1),tarSyr1),(micMur1,otoGar1)),tupBel1),(((((mm9,rn4),dipOrd1),cavPor3),speTri1),(oryCun1,ochPri2))),(((vicPac1,(turTru1,bosTau4)),((equCab2,(felCat3,canFam2)),(myoLuc1,pteVam1))),(eriEur1,sorAra1))),(((loxAfr2,proCap1),echTel1),(dasNov2,choHof1))),monDom4),ornAna1),((galGal3,taeGut1),anoCar1)),xenTro2),(((tetNig1,fr2),(gasAcu1,oryLat2)),danRer5)),petMar1)" \ --out-root=$OUT/starting_tree # running over the input files ../ss/*/*.ss results to #.../genomes/hg18/bed/multiz44way/cons/startingTree/result/*/starting-tree.mod # add up the C and G: find ./result -type f | xargs ls -rt | while read F do D=`dirname $F` echo -n `basename $D`" - " grep BACKGROUND ${F} | awk '{printf "%0.3f\n", $3 + $4;}' done # counting number of species seen in the maf file: find ./result -type f | xargs ls -rt | while read F do D=`dirname $F` echo -n `basename $D`" - " grep TREE $F | sed -e \ "s/TREE: //; s/(//g; s/)//g; s/[0-9].[0-9][0-9][0-9][0-9][0-9][0-9]//g; s/://g" | tr ',' '\n' | wc -l done # Run phastCons # This job is I/O intensive in its output files, thus it is all # working over in /scratch/tmp/ ssh swarm mkdir -p /hive/data/genomes/hg18/bed/multiz44way/cons/run.cons cd /hive/data/genomes/hg18/bed/multiz44way/cons/run.cons # there are going to be several different phastCons runs using # this same script. They trigger off of the current working directory # $cwd:t which is the "grp" in this script. It is one of: # all euarchontogliers placentals cat << '_EOF_' > doPhast.csh #!/bin/csh -fe set PHASTBIN = /cluster/bin/phast/x86_64 set c = $1 set f = $2 set len = $3 set cov = $4 set rho = $5 set grp = $cwd:t set cons = /hive/data/genomes/hg18/bed/multiz44way/cons set tmp = $cons/tmp/$f mkdir -p $tmp set ssSrc = $cons if (-s $cons/$grp/$grp.non-inf) then ln -s $cons/$grp/$grp.mod $tmp ln -s $cons/$grp/$grp.non-inf $tmp ln -s $ssSrc/ss/$c/$f.ss $tmp ln -s $cons/$grp/$grp.mod $tmp ln -s $cons/$grp/$grp.non-inf $tmp else ln -s $ssSrc/ss/$c/$f.ss $tmp ln -s $cons/$grp/$grp.mod $tmp endif pushd $tmp > /dev/null if (-s $grp.non-inf) then $PHASTBIN/phastCons $f.ss $grp.mod \ --rho $rho --expected-length $len --target-coverage $cov --quiet \ --not-informative `cat $grp.non-inf` \ --seqname $c --idpref $c --most-conserved $f.bed --score > $f.pp else $PHASTBIN/phastCons $f.ss $grp.mod \ --rho $rho --expected-length $len --target-coverage $cov --quiet \ --seqname $c --idpref $c --most-conserved $f.bed --score > $f.pp endif popd > /dev/null mkdir -p pp/$c bed/$c sleep 4 touch pp/$c bed/$c rm -f pp/$c/$f.pp rm -f bed/$c/$f.bed mv $tmp/$f.pp pp/$c mv $tmp/$f.bed bed/$c rm -fr $tmp '_EOF_' # << happy emacs chmod a+x doPhast.csh # this template will serve for all runs # root1 == chrom name, file1 == ss file name without .ss suffix cat << '_EOF_' > template #LOOP ../run.cons/doPhast.csh $(root1) $(file1) 45 0.3 0.3 {check out line+ bed/$(root1)/$(file1).bed} #ENDLOOP '_EOF_' # << happy emacs # Create parasol batch and run it ls -1 ../ss/chr*/chr*.ss | sed 's/.ss$//' > ss.list # run for all species cd /hive/data/genomes/hg18/bed/multiz44way/cons mkdir -p all cd all # Using Kate's .mod tree cp -p ../../4d/44way.all.mod ./all.mod gensub2 ../run.cons/ss.list single ../run.cons/template jobList para -ram=8g create jobList para try ... check ... push ... etc. 
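    # optional sanity check (a sketch, not in the original run log; assumes the
    # pp/ and bed/ output directories that doPhast.csh creates in this batch dir)
    # every chunk in ss.list should have produced one .pp and one .bed file:
    wc -l ../run.cons/ss.list
    find pp -type f -name '*.pp' | wc -l
    find bed -type f -name '*.bed' | wc -l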
XXX - running Tue Jan 13 22:19:21 PST 2009 # Completed: 322 of 322 jobs # CPU time in finished jobs: 47406s 790.10m 13.17h 0.55d 0.002 y # IO & Wait Time: 29902s 498.37m 8.31h 0.35d 0.001 y # Average job time: 240s 4.00m 0.07h 0.00d # Longest finished job: 354s 5.90m 0.10h 0.00d # Submission to last job: 536s 8.93m 0.15h 0.01d # create Most Conserved track cd /hive/data/genomes/hg18/bed/multiz44way/cons cat bed/*/chr*.bed | sort -k1,1 -k2,2n | \ awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", $1, $2, $3, $5, $5;}' | \ /cluster/bin/scripts/lodToBedScore /dev/stdin > mostConserved.bed # ~ 1 minute # load into database ssh hgwdev cd /hive/data/genomes/hg18/bed/multiz44way/cons/all time nice -n +19 hgLoadBed hg18 phastConsElements44way mostConserved.bed # Loaded 4878296 elements of size 5 # real 2m3.414s # Try for 5% overall cov, and 70% CDS cov # --rho 0.3 --expected-length 45 --target-coverage 0.3 featureBits hg18 -enrichment refGene:cds phastConsElements44way # refGene:cds 1.144%, mostConserved.bed 4.973%, # both 0.854%, cover 74.62%, enrich 15.01x # --rho .31 --expected-length 45 --target-coverage .3 # refGene:cds 1.144%, phastConsElements44way 4.706%, # both 0.824%, cover 72.07%, enrich 15.31x # --rho 0.3 --expected-length 45 --target-coverage 0.3 featureBits hg18 -enrichment knownGene:cds phastConsElements44way # knownGene:cds 1.205%, mostConserved.bed 4.973%, # both 0.874%, cover 72.55%, enrich 14.59x # --rho .31 --expected-length 45 --target-coverage .3 # knownGene:cds 1.205%, phastConsElements44way 4.706%, # both 0.844%, cover 70.05%, enrich 14.88x featureBits hg18 -enrichment refGene:cds phastConsElements28way # refGene:cds 1.144%, phastConsElements28way 4.920%, # both 0.858%, cover 74.96%, enrich 15.24x featureBits hg18 -enrichment knownGene:cds phastConsElements28way # knownGene:cds 1.205%, phastConsElements28way 4.920%, # both 0.878%, cover 72.88%, enrich 14.81x # Create merged posterier probability file and wiggle track data files cd /hive/data/genomes/hg18/bed/multiz44way/cons/all cat << '_EOF_' > gzipAscii.sh #!/bin/sh TOP=`pwd` export TOP mkdir -p downloads for D in pp/chr* do C=${D/pp\/} out=downloads/${C}.phastCons44way.wigFix.gz echo "${D} > ${C}.phastCons44way.wigFix.gz" ls $D/*.pp | sort -n -t\. -k2 | xargs cat | \ gzip > ${out} done '_EOF_' # << happy emacs chmod +x gzipAscii.sh time nice -n +19 ./gzipAscii.sh # real 30m7.228s # encode those files into wiggle data zcat downloads/*.wigFix.gz \ | wigEncode stdin phastCons44way.wig phastCons44way.wib # Converted stdin, upper limit 1.00, lower limit 0.00 # real 22m54.291s # Load gbdb and database with wiggle. 
ssh hgwdev cd /hive/data/genomes/hg18/bed/multiz44way/cons/all ln -s `pwd`/phastCons44way.wib /gbdb/hg18/multiz44way/phastCons44way.wib time nice -n +19 hgLoadWiggle -pathPrefix=/gbdb/hg18/multiz44way hg18 \ phastCons44way phastCons44way.wig # real 1m13.681s # Create histogram to get an overview of all the data ssh hgwdev cd /hive/data/genomes/hg18/bed/multiz44way/cons/all time nice -n +19 hgWiggle -doHistogram \ -hBinSize=0.001 -hBinCount=1000 -hMinVal=0.0 -verbose=2 \ -db=hg18 phastCons44way > histogram.data 2>&1 # real 8m6.841s # create plot of histogram: cat << '_EOF_' | gnuplot > histo.png set terminal png small color x000000 xffffff xc000ff x66ff66 xffff00 x00ffff set size 1.4, 0.8 set key left box set grid noxtics set grid ytics set title " Human Hg18 Histogram phastCons44way track" set xlabel " phastCons44way score" set ylabel " Relative Frequency" set y2label " Cumulative Relative Frequency (CRF)" set y2range [0:1] set y2tics set yrange [0:0.02] plot "histogram.data" using 2:5 title " RelFreq" with impulses, \ "histogram.data" using 2:7 axes x1y2 title " CRF" with lines '_EOF_' # << happy emacs display histo.png & ######################################################################## ### Create a phastCons data set for Primates # setup primates-only run ssh swarm mkdir /hive/data/genomes/hg18/bed/multiz44way/cons/primates cd /hive/data/genomes/hg18/bed/multiz44way/cons/primates # primates-only: exclude all but these for phastCons tree: /cluster/bin/phast/x86_64/tree_doctor ../all/all.mod \ --prune-all-but=hg18,panTro2,gorGor1,ponAbe2,rheMac2,calJac1,tarSyr1,micMur1,otoGar1 \ > primates.mod # and place the removed ones in the non-inf file so phastCons will # truly ignore them: echo "tupBel1,mm9,rn4,dipOrd1,cavPor3,speTri1,oryCun1,ochPri2,vicPac1,turTru1,bosTau4,equCab2,felCat3,canFam2,myoLuc1,pteVam1,eriEur1,sorAra1,loxAfr2,proCap1,echTel1,dasNov2,choHof1,monDom4,ornAna1,galGal3,taeGut1,anoCar1,xenTro2,tetNig1,fr2,gasAcu1,oryLat2,danRer5,petMar1" \ > primates.non-inf gensub2 ../run.cons/ss.list single ../run.cons/template jobList para -ram=8g create jobList para try ... check ... push ... etc. # bed/chr18_random/chr18_random.1-4262.bed is empty # bed/chr19_random/chr19_random.1-301858.bed is empty # bed/chr21/chr21.1-10000000.bed is empty # bed/chrM/chrM.1-16571.bed is empty # the jobs that fail have messages like this: # bed/chrM/chrM.1-16571.bed is empty # WARNING: No match for name "tupBel1" in alignment. # WARNING: No match for name "sorAra1" in alignment. 
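    # (sketch, not part of the original log) the crashed jobs are the chunks where
    # none of the primate sequences align; assuming they leave zero-length output,
    # they can be listed with:
    find bed -type f -size 0 -name '*.bed'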
# Completed: 318 of 322 jobs
# Crashed: 4 jobs
# CPU time in finished jobs: 20253s 337.54m 5.63h 0.23d 0.001 y
# IO & Wait Time: 33093s 551.56m 9.19h 0.38d 0.001 y
# Average job time: 168s 2.80m 0.05h 0.00d
# Longest finished job: 249s 4.15m 0.07h 0.00d
# Submission to last job: 282s 4.70m 0.08h 0.00d

    # create Most Conserved track
    cd /hive/data/genomes/hg18/bed/multiz44way/cons/primates
    cat bed/*/chr*.bed | sort -k1,1 -k2,2n | \
        awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", $1, $2, $3, $5, $5;}' | \
        /cluster/bin/scripts/lodToBedScore /dev/stdin > mostConserved.bed

    # load into database
    ssh hgwdev
    cd /hive/data/genomes/hg18/bed/multiz44way/cons/primates
    time nice -n +19 hgLoadBed hg18 phastConsElements44wayPrimates \
        mostConserved.bed
    # Loaded 808218 elements of size 5
    # real 0m16.817s

    # verify coverage
    featureBits hg18 phastConsElements44wayPrimates
    # 113268574 bases of 2881515245 (3.931%) in intersection
    # --rho 0.3 --expected-length 45 --target-coverage 0.3
    featureBits hg18 -enrichment refGene:cds phastConsElements44wayPrimates
    # refGene:cds 1.144%, phastConsElements44wayPrimates 4.222%,
    # both 0.756%, cover 66.07%, enrich 15.65x
    featureBits hg18 -enrichment knownGene:cds phastConsElements44wayPrimates
    # knownGene:cds 1.205%, phastConsElements44wayPrimates 4.222%,
    # both 0.769%, cover 63.84%, enrich 15.12x

    # Create the downloads .pp files, from which the phastCons wiggle data
    # is calculated
    # sort by chromName, chromStart so that items are in numerical order
    # for wigEncode
    cd /hive/data/genomes/hg18/bed/multiz44way/cons/primates
    mkdir downloads
    cat << '_EOF_' > gzipAscii.sh
#!/bin/sh
for D in pp/chr*
do
    C=${D/pp\//}
    ls $D/*.pp | sort -n -t\. -k2 | xargs cat | gzip -c \
        > downloads/${C}.primates.wigFix.gz
    echo $D $C
done
'_EOF_'
    # << happy emacs
    time nice -n +19 ./gzipAscii.sh
    # real 36m13.492s

    # Create merged posterior probability file and wiggle track data files
    zcat downloads/chr*.wigFix.gz \
        | wigEncode stdin phastCons44wayPrimates.wig phastCons44wayPrimates.wib
    # Converted stdin, upper limit 1.00, lower limit 0.00
    # real 24m15.688s

    ## load table with wiggle data
    ssh hgwdev
    cd /hive/data/genomes/hg18/bed/multiz44way/cons/primates
    ln -s `pwd`/phastCons44wayPrimates.wib \
        /gbdb/hg18/multiz44way/phastCons44wayPrimates.wib
    time nice -n +19 hgLoadWiggle -pathPrefix=/gbdb/hg18/multiz44way hg18 \
        phastCons44wayPrimates phastCons44wayPrimates.wig
    # real 0m48.942s

    # Create histogram to get an overview of all the data
    time nice -n +19 hgWiggle -doHistogram \
        -hBinSize=0.001 -hBinCount=1000 -hMinVal=0.0 -verbose=2 \
        -db=hg18 phastCons44wayPrimates > histogram.data 2>&1
    # real 5m50.154s

    # create plot of histogram:
    cat << '_EOF_' | gnuplot > histo.png
set terminal png small color \
    x000000 xffffff xc000ff x66ff66 xffff00 x00ffff xff0000
set size 1.4, 0.8
set key left box
set grid noxtics
set grid ytics
set title " Human Hg18 Histogram phastCons44wayPrimates track"
set xlabel " phastCons44wayPrimates score"
set ylabel " Relative Frequency"
set y2label " Cumulative Relative Frequency (CRF)"
set y2range [0:1]
set y2tics
set yrange [0:0.02]
plot "histogram.data" using 2:5 title " RelFreq" with impulses, \
     "histogram.data" using 2:7 axes x1y2 title " CRF" with lines
'_EOF_'
    # << happy emacs
    display histo.png &

########################################################################
### Create a phastCons data set for Euarchontoglires

    # setup euarchontoglires-only run
    ssh swarm
    cd /hive/data/genomes/hg18/bed/multiz44way/cons
    mkdir euarchontoglires
    cd euarchontoglires
    # euarchontoglires-only: exclude
all but these for phastCons tree: /cluster/bin/phast/x86_64/tree_doctor ../all/all.mod \ --prune-all-but=hg18,panTro2,gorGor1,ponAbe2,rheMac2,calJac1,tarSyr1,micMur1,otoGar1,tupBel1,mm9,rn4,dipOrd1,cavPor3,speTri1,oryCun1,ochPri2 \ > euarchontoglires.mod # and place the removed ones in the non-inf file so phastCons will # truly ignore them: echo "vicPac1,turTru1,bosTau4,equCab2,felCat3,canFam2,myoLuc1,pteVam1,eriEur1,sorAra1,loxAfr2,proCap1,echTel1,dasNov2,choHof1,monDom4,ornAna1,galGal3,taeGut1,anoCar1,xenTro2,tetNig1,fr2,gasAcu1,oryLat2,danRer5,petMar1" \ > euarchontoglires.non-inf gensub2 ../run.cons/ss.list single ../run.cons/template jobList para -ram=8g create jobList para try ... check ... push ... etc. # Two of these jobs fail to produce any output in the bed file: # I believe this is because there is a missing sequence in these files # compared to the ones specified in euarchontoglires.mod: # bed/chr18_random/chr18_random.1-4262.bed is empty # bed/chr19_random/chr19_random.1-301858.bed is empty # Completed: 320 of 322 jobs # Crashed: 2 jobs # CPU time in finished jobs: 25869s 431.14m 7.19h 0.30d 0.001 y # IO & Wait Time: 34404s 573.41m 9.56h 0.40d 0.001 y # Average job time: 188s 3.14m 0.05h 0.00d # Longest finished job: 272s 4.53m 0.08h 0.00d # Submission to last job: 309s 5.15m 0.09h 0.00d # create Most Conserved track cd /hive/data/genomes/hg18/bed/multiz44way/cons/euarchontoglires cat bed/*/chr*.bed | sort -k1,1 -k2,2n | \ awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", $1, $2, $3, $5, $5;}' | \ /cluster/bin/scripts/lodToBedScore /dev/stdin > mostConserved.bed # load into database ssh hgwdev cd /hive/data/genomes/hg18/bed/multiz44way/cons/euarchontoglires time nice -n +19 hgLoadBed hg18 phastConsElements44wayEuarch \ mostConserved.bed # Loaded 1623656 elements of size 5 # real 4m15.125s # verify coverage featureBits hg18 phastConsElements44wayEuarch # 109221588 bases of 2881515245 (3.790%) in intersection # --rho 0.3 --expected-length 45 --target-coverage 0.3 featureBits hg18 -enrichment refGene:cds phastConsElements44wayEuarch # refGene:cds 1.144%, mostConserved.bed 3.696%, # both 0.822%, cover 71.87%, enrich 19.45x # --rho 0.31 --expected-length 45 --target-coverage 0.3 # refGene:cds 1.144%, phastConsElements44wayEuarch 3.790%, # both 0.822%, cover 71.79%, enrich 18.94x # --rho 0.3 --expected-length 45 --target-coverage 0.3 featureBits hg18 -enrichment knownGene:cds phastConsElements44wayEuarch # knownGene:cds 1.205%, mostConserved.bed 3.696%, # both 0.839%, cover 69.59%, enrich 18.83x # --rho 0.31 --expected-length 45 --target-coverage 0.3 # knownGene:cds 1.205%, phastConsElements44wayEuarch 3.790%, # both 0.838%, cover 69.51%, enrich 18.34x # Create the downloads .pp files, from which the phastCons wiggle data # is calculated # sort by chromName, chromStart so that items are in numerical order # for wigEncode cd /hive/data/genomes/hg18/bed/multiz44way/cons/euarchontoglires mkdir downloads cat << '_EOF_' > gzipAscii.sh #!/bin/sh for D in pp/chr* do C=${D/pp\//} ls $D/*.pp | sort -n -t\. 
-k2 | xargs cat | gzip -c \
        > downloads/${C}.euarchontoglires.wigFix.gz
    echo $D $C
done
'_EOF_'
    # << happy emacs
    time nice -n +19 ./gzipAscii.sh
    # real 26m54.263s

    # Create merged posterior probability file and wiggle track data files
    zcat downloads/chr*.wigFix.gz \
        | wigEncode stdin phastCons44wayEuarch.wig phastCons44wayEuarch.wib
    # Converted stdin, upper limit 1.00, lower limit 0.00
    # real 18m15.693s

    ## load table with wiggle data
    ssh hgwdev
    cd /hive/data/genomes/hg18/bed/multiz44way/cons/euarchontoglires
    ln -s `pwd`/phastCons44wayEuarch.wib \
        /gbdb/hg18/multiz44way/phastCons44wayEuarch.wib
    time nice -n +19 hgLoadWiggle -pathPrefix=/gbdb/hg18/multiz44way hg18 \
        phastCons44wayEuarch phastCons44wayEuarch.wig
    # real 0m57.590s

    # Create histogram to get an overview of all the data
    time nice -n +19 hgWiggle -doHistogram \
        -hBinSize=0.001 -hBinCount=1000 -hMinVal=0.0 -verbose=2 \
        -db=hg18 phastCons44wayEuarch > histogram.data 2>&1
    # real 6m37.512s

    # create plot of histogram:
    cat << '_EOF_' | gnuplot > histo.png
set terminal png small color \
    x000000 xffffff xc000ff x66ff66 xffff00 x00ffff xff0000
set size 1.4, 0.8
set key left box
set grid noxtics
set grid ytics
set title " Human Hg18 Histogram phastCons44wayEuarch track"
set xlabel " phastCons44wayEuarch score"
set ylabel " Relative Frequency"
set y2label " Cumulative Relative Frequency (CRF)"
set y2range [0:1]
set y2tics
set yrange [0:0.02]
plot "histogram.data" using 2:5 title " RelFreq" with impulses, \
     "histogram.data" using 2:7 axes x1y2 title " CRF" with lines
'_EOF_'
    # << happy emacs
    display histo.png &

########################################################################
### Create a phastCons data set for Placentals

    # setup placental-only run
    ssh swarm
    mkdir /hive/data/genomes/hg18/bed/multiz44way/cons/placental
    cd /hive/data/genomes/hg18/bed/multiz44way/cons/placental
    # placental-only: exclude all but these for phastCons tree:
    /cluster/bin/phast/x86_64/tree_doctor ../all/all.mod \
        --prune-all-but=hg18,panTro2,gorGor1,ponAbe2,rheMac2,calJac1,tarSyr1,micMur1,otoGar1,tupBel1,mm9,rn4,dipOrd1,cavPor3,speTri1,oryCun1,ochPri2,vicPac1,turTru1,bosTau4,equCab2,felCat3,canFam2,myoLuc1,pteVam1,eriEur1,sorAra1,loxAfr2,proCap1,echTel1,dasNov2,choHof1 \
        > placental.mod
    # and place the removed ones in the non-inf file so phastCons will
    # truly ignore them:
    echo "monDom4,ornAna1,galGal3,taeGut1,anoCar1,xenTro2,tetNig1,fr2,gasAcu1,oryLat2,danRer5,petMar1" \
        > placental.non-inf
    gensub2 ../run.cons/ss.list single ../run.cons/template jobList
    para -ram=8g create jobList
    para try ... check ... push ... etc.
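    # (hedged sanity check, a sketch not in the original log) the species kept
    # in placental.mod plus the names placed in placental.non-inf should
    # account for all 44 assemblies:
    tr ',' '\n' < placental.non-inf | wc -l
    # 12 excluded here; 32 names in the --prune-all-but list above; 32 + 12 = 44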
# Two of these jobs fail to produce any output: # bed/chr18_random/chr18_random.1-4262.bed is empty # bed/chr19_random/chr19_random.1-301858.bed is empty # Completed: 320 of 322 jobs # Crashed: 2 jobs # CPU time in finished jobs: 38258s 637.63m 10.63h 0.44d 0.001 y # IO & Wait Time: 34704s 578.40m 9.64h 0.40d 0.001 y # Average job time: 228s 3.80m 0.06h 0.00d # Longest finished job: 313s 5.22m 0.09h 0.00d # Submission to last job: 1030s 17.17m 0.29h 0.01d # create Most Conserved track cat bed/*/chr*.bed | sort -k1,1 -k2,2n | \ awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", $1, $2, $3, $5, $5;}' | \ /cluster/bin/scripts/lodToBedScore /dev/stdin > mostConserved.bed # load into database ssh hgwdev cd /hive/data/genomes/hg18/bed/multiz44way/cons/placental time nice -n +19 hgLoadBed hg18 phastConsElements44wayPlacental \ mostConserved.bed # Loaded 3962527 elements of size 5 # real 3m28.564s # verify coverage featureBits hg18 phastConsElements44wayPlacental # 119635433 bases of 2881515245 (4.152%) in intersection # --rho 0.3 --expected-length 45 --target-coverage 0.3 featureBits hg18 -enrichment refGene:cds phastConsElements44wayPlacental # refGene:cds 1.144%, phastConsElements44wayPlacental 4.329%, # both 0.840%, cover 73.41%, enrich 16.96x featureBits hg18 -enrichment knownGene:cds phastConsElements44wayPlacental # knownGene:cds 1.205%, phastConsElements44wayPlacental 4.329%, # both 0.858%, cover 71.17%, enrich 16.44x # Create the downloads .pp files, from which the phastCons wiggle data # is calculated # sort by chromName, chromStart so that items are in numerical order # for wigEncode cd /hive/data/genomes/hg18/bed/multiz44way/cons/placental mkdir downloads cat << '_EOF_' > gzipAscii.sh #!/bin/sh for D in pp/chr* do C=${D/pp\//} ls $D/*.pp | sort -n -t\. -k2 | xargs cat | gzip -c \ > downloads/${C}.placental.wigFix.gz echo $D $C done done '_EOF_' # << happy emacs time nice -n +19 ./gzipAscii.sh # real 22m12.762s # Create merged posterier probability file and wiggle track data files zcat downloads/chr*.wigFix.gz \ | wigEncode stdin phastCons44wayPlacental.wig \ phastCons44wayPlacental.wib # Converted stdin, upper limit 1.00, lower limit 0.00 # real 37m20.176s ## load table with wiggle data ssh hgwdev cd /hive/data/genomes/hg18/bed/multiz44way/cons/placental ln -s `pwd`/phastCons44wayPlacental.wib \ /gbdb/hg18/multiz44way/phastCons44wayPlacental.wib time nice -n +19 hgLoadWiggle -pathPrefix=/gbdb/hg18/multiz44way hg18 \ phastCons44wayPlacental phastCons44wayPlacental.wig # real 1m16.900s # Create histogram to get an overview of all the data time nice -n +19 hgWiggle -doHistogram \ -hBinSize=0.001 -hBinCount=1000 -hMinVal=0.0 -verbose=2 \ -db=hg18 phastCons44wayPlacental > histogram.data 2>&1 # real 8m15.623s # create plot of histogram: cat << '_EOF_' | gnuplot > histo.png set terminal png small color x000000 xffffff xc000ff x66ff66 xffff00 x00ffff set size 1.4, 0.8 set key left box set grid noxtics set grid ytics set title " Human Hg18 Histogram phastCons44wayPlacental track" set xlabel " phastCons44wayPlacental score" set ylabel " Relative Frequency" set y2label " Cumulative Relative Frequency (CRF)" set y2range [0:1] set y2tics set yrange [0:0.02] plot "histogram.data" using 2:5 title " RelFreq" with impulses, \ "histogram.data" using 2:7 axes x1y2 title " CRF" with lines '_EOF_' # << happy emacs display histo.png & ######################################################################### # Update phastCons44way tables from Adam (DONE - 2009-05-22 - Hiram) mkdir 
/hive/data/genomes/hg18/bed/multiz44way/chrX.phastCons cd /hive/data/genomes/hg18/bed/multiz44way/chrX.phastCons mkdir primates cd primates wget --timestamping \ ftp://siepellab:siepellab@ftp.biotech.cornell.edu/2x/phastCons/primates/* cd .. mkdir placental cd placental wget --timestamping \ ftp://siepellab:siepellab@ftp.biotech.cornell.edu/2x/phastCons/placental/* cd .. mkdir all cd all wget --timestamping \ ftp://siepellab:siepellab@ftp.biotech.cornell.edu/2x/phastCons/all/* zcat all/*.wigFix.gz \ | wigEncode stdin phastCons44way_v2.wig phastCons44way_v2.wib zcat primates/*.wigFix.gz \ | wigEncode stdin phastCons44wayPrimates_v2.wig phastCons44wayPrimates_v2.wib zcat placental/*.wigFix.gz \ | wigEncode stdin phastCons44wayPlacental_v2.wig phastCons44wayPlacental_v2.wib ln -s `pwd`/*.wib /gbdb/hg18/multiz44way time nice -n +19 hgLoadWiggle -pathPrefix=/gbdb/hg18/multiz44way hg18 \ phastCons44way_v2 phastCons44way_v2.wig # real 0m43.022s time nice -n +19 hgLoadWiggle -pathPrefix=/gbdb/hg18/multiz44way hg18 \ phastCons44wayPrimates_v2 phastCons44wayPrimates_v2.wig # real 0m43.660s time nice -n +19 hgLoadWiggle -pathPrefix=/gbdb/hg18/multiz44way hg18 \ phastCons44wayPlacental_v2 phastCons44wayPlacental_v2.wig # real 0m44.607s time nice -n +19 hgLoadBed hg18 phastConsElements44way_v2 \ all/mostConserved.bed # Loaded 4779670 elements of size 5 # real 2m10.975s time nice -n +19 hgLoadBed hg18 phastConsElements44wayPrimates_v2 \ primates/mostConserved.bed # Loaded 785075 elements of size 5 # real 0m21.619s time nice -n +19 hgLoadBed hg18 phastConsElements44wayPlacental_v2 \ placental/mostConserved.bed # Loaded 3862854 elements of size 5 # real 1m41.223s # Create histogram to get an overview of all the data time nice -n +19 hgWiggle -doHistogram \ -hBinSize=0.001 -hBinCount=1000 -hMinVal=0.0 -verbose=2 \ -db=hg18 phastCons44wayPlacental_v2 > placental.histogram.data 2>&1 time nice -n +19 hgWiggle -doHistogram \ -hBinSize=0.001 -hBinCount=1000 -hMinVal=0.0 -verbose=2 \ -db=hg18 phastCons44wayPrimates_v2 > primates.histogram.data 2>&1 time nice -n +19 hgWiggle -doHistogram \ -hBinSize=0.001 -hBinCount=1000 -hMinVal=0.0 -verbose=2 \ -db=hg18 phastCons44way_v2 > vertebrate.histogram.data 2>&1 cat << '_EOF_' | gnuplot > placental.histo.png set terminal png small color x000000 xffffff xc000ff x66ff66 xffff00 x00ffff set size 1.4, 0.8 set key left box set grid noxtics set grid ytics set title " Human Hg18 Histogram phastCons44wayPlacental_v2 track" set xlabel " phastCons44wayPlacental_v2 score" set ylabel " Relative Frequency" set y2label " Cumulative Relative Frequency (CRF)" set y2range [0:1] set y2tics set yrange [0:0.02] plot "placental.histogram.data" using 2:5 title " RelFreq" with impulses, \ "placental.histogram.data" using 2:7 axes x1y2 title " CRF" with lines '_EOF_' # << happy emacs display placental.histo.png & cat << '_EOF_' | gnuplot > primates.histo.png set terminal png small color x000000 xffffff xc000ff x66ff66 xffff00 x00ffff set size 1.4, 0.8 set key left box set grid noxtics set grid ytics set title " Human Hg18 Histogram phastCons44wayPrimates_v2 track" set xlabel " phastCons44wayPrimates_v2 score" set ylabel " Relative Frequency" set y2label " Cumulative Relative Frequency (CRF)" set y2range [0:1] set y2tics set yrange [0:0.02] plot "primates.histogram.data" using 2:5 title " RelFreq" with impulses, \ "primates.histogram.data" using 2:7 axes x1y2 title " CRF" with lines '_EOF_' # << happy emacs display primates.histo.png & cat << '_EOF_' | gnuplot > vertebrate.histo.png set 
terminal png small color x000000 xffffff xc000ff x66ff66 xffff00 x00ffff set size 1.4, 0.8 set key left box set grid noxtics set grid ytics set title " Human Hg18 Histogram phastCons44way_v2 track" set xlabel " phastCons44way_v2 score" set ylabel " Relative Frequency" set y2label " Cumulative Relative Frequency (CRF)" set y2range [0:1] set y2tics set yrange [0:0.02] plot "vertebrate.histogram.data" using 2:5 title " RelFreq" with impulses, \ "vertebrate.histogram.data" using 2:7 axes x1y2 title " CRF" with lines '_EOF_' # << happy emacs display placental.histo.png & ######################################################################### # phyloP conservation for 44-way (2009-01-05 kate) # # Vertebrate, Placental # Also doing Euarchontoglire, since Hiram did # # Using newer scoring method LRT (replaces SPH), based # on scoring method experiments, above (compared to SCORE method). # Using phast from Adam's student Melissa Hubisz, with fixes needed for LRT scoring # Will replace with version from CVS if/when these fixes are integrated # PHAST version is 0.9.9.9b # split SS files into 1M chunks (tried 10M used for phastCons, and these # took 5hrs/chunk w/ LRT scoring) ssh swarm cd /cluster/data/hg18/bed/multiz44way mkdir consPhyloP cd consPhyloP mkdir ss run.split cd run.split cat << 'EOF' > doSplit.csh set c = $1 set d = /cluster/data/hg18/bed/multiz44way set in = $d/cons/ss set out = $d/consPhyloP/ss set PHASTBIN = /cluster/bin/phast.2008-12-18 @ i=0 foreach f (`ls $in/$c/*.ss | sort -n -t\. -k2`) @ i++ mkdir -p $out/$c/$i $PHASTBIN/msa_split $f -i SS -o SS \ -r $out/$c/$i/$c.$i -w 1000000,0 -I 1000 -B 5000 end echo "Done" >> $out/$c.done 'EOF' # << happy emacs set d = /cluster/data/hg18/bed/multiz44way/consPhyloP set JOBS = $d/run.split/jobList rm -f $JOBS touch $JOBS foreach c (`awk '{print $1}' /cluster/data/hg18/chrom.sizes`) echo "csh doSplit.csh $c {check out line+ $d/ss/$c.done}" >> $JOBS end para create jobList # 49 jobs para try para check para push para time # run phyloP with score=LRT ssh swarm cd /cluster/data/hg18/bed/multiz44way/consPhyloP mkdir run.phyloP cd run.phyloP # Adjust model file base composition background and rate matrix to be # representative of whole-genome (.41 -- as was done for ENCODE) # using utility, 'modFreqs' from PHAST package set PHASTBIN = /cluster/bin/phast.2008-12-18 set gc = `grep BACKGROUND /cluster/data/hg18/bed/multiz17way/cons/elliotsEncode.mod | awk '{printf "%0.3f\n", $3 + $4}'` echo $gc # .410 # NOTE: this corresponds well to Hiram's GC values from his phyloFit runs # on the 44-way ss files $PHASTBIN/modFreqs ../../4d/phyloFit.all.mod $gc > ../../4d/44way.all.mod # repeat for chrX only tree cd /cluster/data/hg18/bed/multiz44way/4d $PHASTBIN/modFreqs 4d.chrX.mod $gc > 44way.chrX.mod ln -s `pwd`/44way.chrX.mod /usr/local/apache/golenPath/hg18/phastCons44way cat > doPhyloP.csh << 'EOF' set f = $1 set out = $2 set c = $f:r:r set n = $f:r:e set tmp = /scratch/tmp/$f rm -fr $tmp mkdir -p $tmp cp -p /cluster/data/hg18/bed/multiz44way/consPhyloP/ss/$c/$n/$f.ss $tmp cp -p tree.mod $tmp pushd $tmp > /dev/null set PHASTBIN = /cluster/bin/phast.2008-12-18 $PHASTBIN/phyloP --method LRT --mode CONACC --wig-scores --chrom $c \ -i SS tree.mod $f.ss > $f.wig popd > /dev/null mkdir -p $out:h mv $tmp/$f.wig $out rm -fr $tmp 'EOF' # Create list of chunks pushd /cluster/data/hg18/bed/multiz44way/consPhyloP/ss ls chr*/*/chr*.*.ss | sed -e 's/.ss$//' -e 's/^\.\///' > \ /cluster/data/hg18/bed/multiz44way/consPhyloP/run.phyloP/in.list popd > /dev/null # need to fill 
in chr8, neglected in main run pushd /cluster/data/hg18/bed/multiz44way/consPhyloP/ss ls chr8/*/chr*.*.ss | sed -e 's/.ss$//' -e 's/^\.\///' > \ /cluster/data/hg18/bed/multiz44way/consPhyloP/run.phyloP/in.chr8.list popd > /dev/null # Create template file # file1 == $chr/$chunk/file name without .ss suffix cat > template << 'EOF' #LOOP csh ../doPhyloP.csh $(file1) {check out line+ wig/$(dir1)/$(file1).wig} #ENDLOOP 'EOF' # setup run for all species mkdir all cd all cp ../../../4d/44way.all.mod tree.mod rm -fr wig mkdir wig # << happy emacs gensub2 ../in.list single ../template jobList # 2823 jobs para create jobList para try para check para push para time #Completed: 2823 of 2823 jobs #CPU time in finished jobs: 4691641s 78194.02m 1303.23h 54.30d 0.149 y #IO & Wait Time: 171343s 2855.71m 47.60h 1.98d 0.005 y #Average job time: 1723s 28.71m 0.48h 0.02d #Longest finished job: 2451s 40.85m 0.68h 0.03d #Submission to last job: 6055s 100.92m 1.68h 0.07d ssh hgwdev cd /cluster/data/hg18/bed/multiz44way/consPhyloP/run.phyloP # check for clean dir here -- chr* will match garbage if it's there cat > listWig.csh << 'EOF' foreach c (`ls -d chr*`) foreach d (`ls -d $c/[1-9]* | sort -t/ -k2 -n`) ls -1 $d/*.wig | sort -n -t\. -k3 end end 'EOF' cd all/wig csh ../../listWig.csh | xargs cat | nice wigEncode stdin phyloP44wayAll.wig phyloP44wayAll.wib # Reloaded to include chr8 (2008-01-15 kate) #Converted stdin, upper limit 7.13, lower limit -15.41 # Load gbdb and database with wiggle. ln -s \ /cluster/data/hg18/bed/multiz44way/consPhyloP/run.phyloP/all/wig/phyloP44wayAll.wib \ /gbdb/hg18/multiz44way/phyloP44wayAll.wib hgLoadWiggle -pathPrefix=/gbdb/hg18/multiz44way hg18 phyloP44wayAll phyloP44wayAll.wig # placental-only: exclude all but these: cd /cluster/data/hg18/bed/multiz44way/4d set PHASTBIN = /cluster/bin/phast.2008-12-18 $PHASTBIN/tree_doctor 44way.all.mod \ --prune-all-but=hg18,panTro2,gorGor1,ponAbe2,rheMac2,calJac1,tarSyr1,\ micMur1,otoGar1,tupBel1,mm9,rn4,dipOrd1,cavPor3,speTri1,oryCun1,ochPri2,\ vicPac1,turTru1,bosTau4,equCab2,felCat3,canFam2,myoLuc1,pteVam1,eriEur1,\ sorAra1,loxAfr2,proCap1,echTel1,dasNov2,choHof1 \ > 44way.placental.mod cd ../consPhyloP/run.phyloP mkdir placental cd placental cp ../../../4d/44way.placental.mod tree.mod mkdir wig gensub2 ../in.list single ../template jobList # 2823 jobs para create jobList para try para check para push para time #Completed: 2823 of 2823 jobs #CPU time in finished jobs: 3358003s 55966.71m 932.78h 38.87d 0.106 y #IO & Wait Time: 142664s 2377.74m 39.63h 1.65d 0.005 y #Average job time: 1240s 20.67m 0.34h 0.01d #Longest finished job: 1781s 29.68m 0.49h 0.02d #Submission to last job: 4383s 73.05m 1.22h 0.05d # load wiggle ssh hgwdev cd /cluster/data/hg18/bed/multiz44way/consPhyloP/run.phyloP/placental/wig csh ../../listWig.csh | xargs cat | nice wigEncode stdin phyloP44wayPlacMammal.wig phyloP44wayPlacMammal.wib #Converted stdin, upper limit 3.46, lower limit -14.42 # Load gbdb and database with wiggle. 
ln -s \ /cluster/data/hg18/bed/multiz44way/consPhyloP/run.phyloP/placental/wig/phyloP44wayPlacMammal.wib \ /gbdb/hg18/multiz44way/phyloP44wayPlacMammal.wib hgLoadWiggle -pathPrefix=/gbdb/hg18/multiz44way hg18 phyloP44wayPlacMammal phyloP44wayPlacMammal.wig cd /cluster/data/hg18/bed/multiz44way/4d set PHASTBIN = /cluster/bin/phast.2008-12-18 $PHASTBIN/tree_doctor 44way.all.mod \ --prune-all-but=hg18,panTro2,gorGor1,ponAbe2,rheMac2,calJac1,tarSyr1,micMur1,otoGar1,tupBel1,mm9,rn4,dipOrd1,cavPor3,speTri1,oryCun1,ochPri2 \ > 44way.euarchontoglires.mod # euarchontoglires only: exclude all but these: cd ../consPhyloP/run.phyloP mkdir euarch cd euarch cp ../../../4d/44way.euarchontoglires.mod tree.mod mkdir wig gensub2 ../in.list single ../template jobList # 2823 jobs para create jobList para try para check para push para time #Completed: 2823 of 2823 jobs #CPU time in finished jobs: 1646910s 27448.49m 457.47h 19.06d 0.052 y #IO & Wait Time: 94310s 1571.84m 26.20h 1.09d 0.003 y #Average job time: 617s 10.28m 0.17h 0.01d #Longest finished job: 901s 15.02m 0.25h 0.01d #Submission to last job: 2127s 35.45m 0.59h 0.02d # process results and load wiggle ssh hgwdev cd /cluster/data/hg18/bed/multiz44way/consPhyloP/run.phyloP/euarch/wig csh ../../listWig.csh | xargs cat | nice wigEncode stdin phyloP44wayEuarch.wig phyloP44wayEuarch.wib #Converted stdin, upper limit 2.03, lower limit -9.78 ln -s \ /cluster/data/hg18/bed/multiz44way/consPhyloP/run.phyloP/euarch/wig/phyloP44wayEuarch.wib \ /gbdb/hg18/multiz44way/phyloP44wayEuarch.wib hgLoadWiggle -pathPrefix=/gbdb/hg18/multiz44way hg18 phyloP44wayEuarch phyloP44wayEuarch.wig # primates only: exclude all but these: cd /cluster/data/hg18/bed/multiz44way/4d set PHASTBIN = /cluster/bin/phast.2008-12-18 $PHASTBIN/tree_doctor 44way.all.mod \ --prune-all-but=hg18,panTro2,gorGor1,ponAbe2,rheMac2,calJac1,tarSyr1,micMur1,otoGar1 \ > 44way.primate.mod cd ../consPhyloP/run.phyloP mkdir primate cd primate cp ../../../4d/44way.primate.mod tree.mod mkdir wig gensub2 ../in.list single ../template jobList para create jobList # 2823 jobs para try para check para push # quick! para time #Completed: 2823 of 2823 jobs #CPU time in finished jobs: 895998s 14933.30m 248.89h 10.37d 0.028 y #IO & Wait Time: 66654s 1110.90m 18.52h 0.77d 0.002 y #Average job time: 341s 5.68m 0.09h 0.00d #Longest finished job: 503s 8.38m 0.14h 0.01d #Submission to last job: 1190s 19.83m 0.33h 0.01d # process results and load wiggle ssh hgwdev cd /cluster/data/hg18/bed/multiz44way/consPhyloP/run.phyloP/primate/wig csh ../../listWig.csh | xargs cat | nice wigEncode stdin phyloP44wayPrimate.wig phyloP44wayPrimate.wib #Converted stdin, upper limit 0.99, lower limit -8.17 ln -s /cluster/data/hg18/bed/multiz44way/consPhyloP/run.phyloP/primate/wig/phyloP44wayPrimate.wib /gbdb/hg18/multiz44way/phyloP44wayPrimate.wib hgLoadWiggle -pathPrefix=/gbdb/hg18/multiz44way hg18 phyloP44wayPrimate phyloP44wayPrimate.wig # get stats cd run.phyloP/all hgWiggle -db=hg18 -verbose=2 -doStats phyloP44wayAll > stats.out hgWiggle -db=hg18 -chr=chr20 -rawDataOut phyloP44wayAll | textHistogram -real stdin -minVal=-20 -maxBinCount=30 >&! histo.out cd ../placental hgWiggle -db=hg18 -verbose=2 -doStats phyloP44wayPlacMammal > stats.out hgWiggle -db=hg18 -chr=chr20 -rawDataOut phyloP44wayPlacMammal | textHistogram -real stdin -minVal=-20 -maxBinCount=30 >&! 
histo.out
cd ../euarch
hgWiggle -db=hg18 -verbose=2 -doStats phyloP44wayEuarch > stats.out
hgWiggle -db=hg18 -chr=chr20 -rawDataOut phyloP44wayEuarch | textHistogram -real stdin -minVal=-20 -maxBinCount=30 >&! histo.out
cd ../primate
hgWiggle -db=hg18 -verbose=2 -doStats phyloP44wayPrimate > stats.out
hgWiggle -db=hg18 -chr=chr20 -rawDataOut phyloP44wayPrimate | textHistogram -real stdin -minVal=-20 -maxBinCount=30 >&! histo.out

# Downloads
cd /cluster/data/hg18/bed/multiz44way/consPhyloP/run.phyloP
cat > listWigsByChrom.csh << 'EOF'
set c = $1
foreach d (`ls -d $c/[1-9]* | sort -t/ -k2 -n`)
    ls -1 $d/*.wig | sort -n -t\. -k3
end
'EOF'
cat > downloads.csh << 'EOF'
mkdir ../downloads
foreach c (`ls -d chr*`)
    echo $c
    csh ../../listWigsByChrom.csh $c > ../downloads/$c.lst
    csh ../../listWigsByChrom.csh $c | xargs cat | gzip -c > ../downloads/$c.$1.wigFix.gz
end
cd ../downloads
md5sum *.wigFix.gz > md5sum.txt
'EOF'
cd all/wig
csh ../../downloads.csh phyloP44way >&! downloads.log &
cd ../../placental/wig
csh ../../downloads.csh phyloP44way.placental >&! downloads.log &
cd ../../primate/wig
csh ../../downloads.csh phyloP44way.primate >&! downloads.log &

# create web downloads dir and add symlinks to files
cd ../../
mkdir downloads
cp /usr/local/apache/htdocs/goldenPath/hg18/phastCons44way/README.txt downloads
# edit
cd /usr/local/apache/htdocs/goldenPath/hg18/
mkdir phyloP44way
cd phyloP44way
ln -s /cluster/data/hg18/bed/multiz44way/consPhyloP/run.phyloP/downloads/README.txt .
mkdir vertebrate
ln -s /cluster/data/hg18/bed/multiz44way/consPhyloP/run.phyloP/all/downloads/{*.gz,md5sum.txt} vertebrate
mkdir placentalMammals
ln -s /cluster/data/hg18/bed/multiz44way/consPhyloP/run.phyloP/placental/downloads/{*.gz,md5sum.txt} placentalMammals
mkdir primates
ln -s /cluster/data/hg18/bed/multiz44way/consPhyloP/run.phyloP/primate/downloads/{*.gz,md5sum.txt} primates

# Lineage-specific runs
# uses --subtree option of phyloP
# name ancestor nodes
cd /cluster/data/hg18/bed/multiz44way/4d
set PHASTBIN = /cluster/bin/phast.2008-12-18
$PHASTBIN/tree_doctor 44way.all.mod --name-ancestors >44way.all-ancestors.mod
cd ../consPhyloP/run.phyloP
# built new PHAST package with fix from Adam for --subtree problems
sed -e 's/phyloP/phyloP --subtree=$3/' -e 's/phast.2008-12-18/phast.2009-01-26/' doPhyloP.csh > doPhyloPSubtree.csh
# visually inspect shell script
cat > template.subtree << 'EOF'
#LOOP
csh ../doPhyloPSubtree.csh $(file1) {check out line+ wig/$(dir1)/$(file1).wig} SUBTREE
#ENDLOOP
'EOF'

# primate lineage-specific
cd /cluster/data/hg18/bed/multiz44way/consPhyloP/run.phyloP
mkdir primate-ls
cd primate-ls
cp ../../../4d/44way.all-ancestors.mod tree.mod
mkdir wig
sed 's/SUBTREE/hg18-micMur1/' ../template.subtree > template.ls
gensub2 ../in.list single template.ls jobList
para create jobList
# 2823 jobs
para try
para check
para push
para time
#CPU time in finished jobs: 4949300s 82488.33m 1374.81h 57.28d 0.157 y
#IO & Wait Time: 143956s 2399.27m 39.99h 1.67d 0.005 y
#Average job time: 1805s 30.08m 0.50h 0.02d
#Longest finished job: 2780s 46.33m 0.77h 0.03d
#Submission to last job: 6447s 107.45m 1.79h 0.07d

# process results and load wiggle
ssh hgwdev
cd /cluster/data/hg18/bed/multiz44way/consPhyloP/run.phyloP/primate-ls/wig
csh ../../listWig.csh | xargs cat | nice wigEncode stdin phyloP44wayPrimateLs.wig phyloP44wayPrimateLs.wib
#Converted stdin, upper limit 3.91, lower limit -9.28
ln -s /cluster/data/hg18/bed/multiz44way/consPhyloP/run.phyloP/primate-ls/wig/phyloP44wayPrimateLs.wib /gbdb/hg18/multiz44way/phyloP44wayPrimateLs.wib
hgLoadWiggle -pathPrefix=/gbdb/hg18/multiz44way hg18 phyloP44wayPrimateLs phyloP44wayPrimateLs.wig

# glire lineage-specific
cd /cluster/data/hg18/bed/multiz44way/consPhyloP/run.phyloP
mkdir glire-ls
cd glire-ls
cp ../../../4d/44way.all-ancestors.mod tree.mod
mkdir wig
sed 's/SUBTREE/mm9-oryCun1/' ../template.subtree > template.ls
gensub2 ../in.list single template.ls jobList
para create jobList
# 2823 jobs
para try
para check
para push
para time
#CPU time in finished jobs: 5173192s 86219.87m 1437.00h 59.87d 0.164 y
#IO & Wait Time: 145615s 2426.91m 40.45h 1.69d 0.005 y
#Average job time: 1884s 31.40m 0.52h 0.02d
#Longest finished job: 2721s 45.35m 0.76h 0.03d
#Submission to last job: 6883s 114.72m 1.91h 0.08d

# process results and load wiggle
ssh hgwdev
cd /cluster/data/hg18/bed/multiz44way/consPhyloP/run.phyloP/glire-ls/wig
csh ../../listWig.csh | xargs cat | nice wigEncode stdin phyloP44wayGlireLs.wig phyloP44wayGlireLs.wib
#Converted stdin, upper limit 5.95, lower limit -6.99
ln -s /cluster/data/hg18/bed/multiz44way/consPhyloP/run.phyloP/glire-ls/wig/phyloP44wayGlireLs.wib /gbdb/hg18/multiz44way/phyloP44wayGlireLs.wib
hgLoadWiggle -pathPrefix=/gbdb/hg18/multiz44way hg18 phyloP44wayGlireLs phyloP44wayGlireLs.wig

#########################################################################
# Update phyloP44way tables from Adam Siepel, Melissa Hubisz at Cornell
# This version uses a different neutral tree model for chrX
# and will replace the original version as default view on the Conservation track
# ( 2009-06-30 kate)
    mkdir /hive/data/genomes/hg18/bed/multiz44way/chrX.phyloP
    cd /hive/data/genomes/hg18/bed/multiz44way/chrX.phyloP
    mkdir primates
    cd primates
    wget --timestamping ftp://siepellab:siepellab@ftp.biotech.cornell.edu/2x/phyloP/44way/primates/\*
    cd ..
    mkdir placental
    cd placental
    wget --timestamping ftp://siepellab:siepellab@ftp.biotech.cornell.edu/2x/phyloP/44way/placental/\*
    cd ..
    mkdir all
    cd all
    wget --timestamping ftp://siepellab:siepellab@ftp.biotech.cornell.edu/2x/phyloP/44way/all/\*
    cd ..
    zcat all/*.wigFix.gz | wigEncode stdin phyloP44way_v2.wig phyloP44way_v2.wib
    zcat primates/*.wigFix.gz | wigEncode stdin phyloP44wayPrimates_v2.wig phyloP44wayPrimates_v2.wib
    zcat placental/*.wigFix.gz | wigEncode stdin phyloP44wayPlacental_v2.wig phyloP44wayPlacental_v2.wib
    ln -s `pwd`/*.wib /gbdb/hg18/multiz44way
    time nice -n +19 hgLoadWiggle -pathPrefix=/gbdb/hg18/multiz44way hg18 phyloP44way_v2 phyloP44way_v2.wig
    time nice -n +19 hgLoadWiggle -pathPrefix=/gbdb/hg18/multiz44way hg18 phyloP44wayPrimates_v2 phyloP44wayPrimates_v2.wig
    time nice -n +19 hgLoadWiggle -pathPrefix=/gbdb/hg18/multiz44way hg18 phyloP44wayPlacental_v2 phyloP44wayPlacental_v2.wig

# Lineage specific phyloP
# These updated tables will appear in the Lineage Cons track
    mkdir glires-ls
    cd glires-ls
    wget --timestamping ftp://siepellab:siepellab@ftp.biotech.cornell.edu/2x/phyloP/44way/glires-ls/\*
    cd ..
    mkdir primates-ls
    cd primates-ls
    wget --timestamping ftp://siepellab:siepellab@ftp.biotech.cornell.edu/2x/phyloP/44way/primates-ls/\*
    cd ..
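    # optional integrity check before encoding (a sketch; only the directory
    # names above are from the build, everything else is assumed):
    for D in glires-ls primates-ls
    do
        echo "${D}: `ls ${D}/*.wigFix.gz | wc -l` wigFix files"
        gzip -t ${D}/*.wigFix.gz || echo "corrupt download in ${D}"
    done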
zcat glires-ls/*.wigFix.gz | wigEncode stdin phyloP44wayGliresLs_v2.wig phyloP44wayGliresLs_v2.wib zcat primates-ls/*.wigFix.gz | wigEncode stdin phyloP44wayPrimatesLs_v2.wig phyloP44wayPrimatesLs_v2.wib ln -s `pwd`/phyloP44wayGliresLs_v2.wib /gbdb/hg18/multiz44way nice -n +19 hgLoadWiggle -pathPrefix=/gbdb/hg18/multiz44way hg18 phyloP44wayGliresLs_v2 phyloP44wayGliresLs_v2.wig ln -s `pwd`/phyloP44wayPrimatesLs_v2.wib /gbdb/hg18/multiz44way nice -n +19 hgLoadWiggle -pathPrefix=/gbdb/hg18/multiz44way hg18 phyloP44wayPrimatesLs_v2 phyloP44wayPrimatesLs_v2.wig ###################################################################### # downloads for 44-way (DONE - 2009-01-09 - Hiram) mkdir -p /hive/data/genomes/hg18/bed/multiz44way/downloads/maf cd /hive/data/genomes/hg18/bed/multiz44way/downloads/maf # bash script #!/bin/sh for S in 1000 2000 5000 do echo "making upstream${S}.maf" featureBits hg18 refGene:upstream:${S} -fa=/dev/null -bed=stdout \ | perl -wpe 's/_up[^\t]+/\t0/' | sort -k1,1 -k2,2n \ | /cluster/bin/$MACHTYPE/mafFrags hg18 multiz44way \ stdin stdout \ -orgs=/hive/data/genomes/hg18/bed/multiz44way/species.list \ | gzip -c > upstream${S}.maf.gz echo "done upstream${S}.maf.gz" done cd /usr/local/apache/htdocs/goldenPath/hg18/multiz44way/maf ln -s /hive/data/genomes/hg18/bed/multiz44way/downloads/maf/up*.gz . md5sum up*.gz >> md5sum.txt mkdir /usr/local/apache/htdocs/goldenPath/hg18/phastCons44way cd /usr/local/apache/htdocs/goldenPath/hg18/phastCons44way mkdir placentalMammals primates vertebrate cd vertebrate ln -s /hive/data/genomes/hg18/bed/multiz44way/cons/all/downloads/* . cd ../placentalMammals ln -s /hive/data/genomes/hg18/bed/multiz44way/cons/placental/downloads/* . cd ../primates ln -s /hive/data/genomes/hg18/bed/multiz44way/cons/primates/downloads/* . cd .. ln -s /hive/data/genomes/hg18/bed/multiz44way/cons/all/all.mod \ vertebrate.mod ln -s /hive/data/genomes/hg18/bed/multiz44way/cons/primates/primates.mod . ln -s /hive/data/genomes/hg18/bed/multiz44way/cons/placental/placental.mod \ ./placentalMammals.mod ln -s \ /hive/data/genomes/hg18/bed/multiz44way/downloads/phastCons44way/README.txt . 
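    # (sketch, not in the original log) before filling out the pushQ entry
    # below, confirm none of the download symlinks just created are dangling:
    find /usr/local/apache/htdocs/goldenPath/hg18/phastCons44way \
        /usr/local/apache/htdocs/goldenPath/hg18/multiz44way \
        -type l ! -exec test -e {} \; -print
    # no output expected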
# pushQ MySQL tables: phastCons44way, phastCons44wayPlacental, phastCons44wayPrimates, multiz44way, multiz44wayFrames, multiz44waySummary, phastConsElements44way, phastConsElements44wayPlacental, phastConsElements44wayPrimates, phyloP44wayAll, phyloP44wayPlacMammal, phyloP44wayPrimate # pushQ files: /gbdb/hg18/multiz44way/maf/* /gbdb/hg18/multiz44way/phastCons44way.wib /gbdb/hg18/multiz44way/phastCons44wayPlacental.wib /gbdb/hg18/multiz44way/phastCons44wayPrimates.wib /gbdb/hg18/multiz44way/phyloP44wayAll.wib /gbdb/hg18/multiz44way/phyloP44wayPlacMammal.wib /gbdb/hg18/multiz44way/phyloP44wayPrimate.wib /usr/local/apache/htdocs/goldenPath/hg18/phastCons44way/vertebrate/* /usr/local/apache/htdocs/goldenPath/hg18/phastCons44way/primates/* /usr/local/apache/htdocs/goldenPath/hg18/phastCons44way/placentalMammals/* /usr/local/apache/htdocs/goldenPath/hg18/phastCons44way/*.mod /usr/local/apache/htdocs/goldenPath/hg18/phastCons44way/README.txt /usr/local/apache/htdocs/goldenPath/hg18/multiz44way/maf/* /usr/local/apache/htdocs/goldenPath/hg18/multiz44way/alignments/ /usr/local/apache/htdocs/goldenPath/hg18/multiz44way/*.nh /usr/local/apache/htdocs/goldenPath/hg18/multiz44way/README.txt /usr/local/apache/htdocs/goldenPath/hg18/phyloP44way/vertebrate/* /usr/local/apache/htdocs/goldenPath/hg18/phyloP44way/placentalMammals/* /usr/local/apache/htdocs/goldenPath/hg18/phyloP44way/primate/* # MySQL tables: 5,624,932,756 = 5,364 Mb # gbdb files: 271,318,361,985 = 258,749 Mb # apache htdocs: 58,767,852,372 = 56,045 Mb # Total 335,711,147,113 = 320,159 Mb # An extra set of error corrected MAF's from the Siepel lab: mkdir /hive/data/genomes/hg18/bed/multiz44way/errorCorrectedMafs wget --timestamping \ "ftp://siepellab:XXXXXX@ftp.biotech.cornell.edu/2x/maf-ec/*" # not showing the password here on purpose # verify md5sums: md5sum *.maf.gz > md5sum.here diff md5sum.txt md5sum.here # no difference rm md5sum.here mkdir \ /usr/local/apache/htdocs/goldenPath/hg18/multiz44way/SiepelLabCorrectedMafs cd \ /usr/local/apache/htdocs/goldenPath/hg18/multiz44way/SiepelLabCorrectedMafs ln -s /hive/data/genomes/hg18/bed/multiz44way/errorCorrectedMafs/* . ######################################################################### # Create Syntenic and Recip Best net files to load into tracks to view # on the browser to see what was used during the multiple alignment cd /hive/data/genomes/hg18/bed/blastz.gorGor1/axtChain netClass -verbose=0 -noAr hg18.gorGor1.rbest.net.gz hg18 gorGor1 stdout \ | gzip -c > netRBestGorGor1.net.gz hgLoadNet hg18 netRBestGorGor1 netRBestGorGor1.net.gz cd /hive/data/genomes/hg18/bed/blastz.ponAbe2/axtChain hgLoadNet hg18 netSyntenyPonAbe2 hg18.ponAbe2.syn.net.gz cd /hive/data/genomes/hg18/bed/blastz.calJac1/axtChain netClass -verbose=0 -noAr hg18.calJac1.rbest.net.gz hg18 calJac1 stdout \ | gzip -c > netRBestCalJac1.net.gz hgLoadNet hg18 netRBestCalJac1 netRBestCalJac1.net.gz cd /hive/data/genomes/hg18/bed/blastz.tarSyr1/axtChain netClass -verbose=0 -noAr hg18.tarSyr1.rbest.net.gz hg18 tarSyr1 stdout \ | gzip -c > netRBestTarSyr1.net.gz hgLoadNet hg18 netRBestTarSyr1 netRBestTarSyr1.net.gz ######################################################################### # EIO/JCVI NAS TRACK (2008-11-25 Fan) # Contact: Gaetano Gargiulo [gaetano.gargiulo@ifom-ieo-campus.it] cd /hive/data/genomes/hg18/bed mkdir eioJcviNAS cd eioJcviNAS # receive the doc and two bed files and put them there. 
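    # (sketch) quick sanity check of the received files before loading; the two
    # file names follow the hgLoadBed commands below:
    for F in HG18_NAS_CD34_neg.bed HG18_NAS_CD34_pos.bed
    do
        echo "${F}: `grep -vc description ${F}` data lines"
        head -2 ${F}
    done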
fgrep -v description HG18_NAS_CD34_neg.bed| \ cut -f 1-3 |hgLoadBed -noBin hg18 eioJcviNASNeg stdin checkTableCoords -table=eioJcviNASNeg hg18 fgrep -v description HG18_NAS_CD34_pos.bed| \ cut -f 1-3 |hgLoadBed -noBin hg18 eioJcviNASPos stdin checkTableCoords -table=eioJcviNASPos hg18 # Create the description file, eioJcviNAS.html, according to # the latest doc file from Gaetano. # # Add the two composite sub-tracks to human/hg18/trackDb.ra. ######################################################################### # hgPal downloads (DONE braney 2008-12-07) # FASTA from 44way for refGene, knownGene, knownCanonical ssh hgwdev screen bash rm -rf /cluster/data/hg18/bed/multiz44way/pal mkdir /cluster/data/hg18/bed/multiz44way/pal cd /cluster/data/hg18/bed/multiz44way/pal echo hg18 | cat - /cluster/data/hg18/bed/multiz44way/ordered.list > order.lst mz=multiz44way gp=refGene db=hg18 mkdir exonAA exonNuc ppredAA ppredNuc for j in `sort -nk 2 /cluster/data/$db/chrom.sizes | awk '{print $1}'` do echo "date" echo "mafGene -chrom=$j $db $mz $gp order.lst stdout | \ gzip -c > ppredAA/$j.ppredAA.fa.gz" echo "mafGene -chrom=$j -noTrans $db $mz $gp order.lst stdout | \ gzip -c > ppredNuc/$j.ppredNuc.fa.gz" echo "mafGene -chrom=$j -exons -noTrans $db $mz $gp order.lst stdout | \ gzip -c > exonNuc/$j.exonNuc.fa.gz" echo "mafGene -chrom=$j -exons $db $mz $gp order.lst stdout | \ gzip -c > exonAA/$j.exonAA.fa.gz" done > $gp.jobs time sh -x $gp.jobs > $gp.jobs.log 2>&1 & sleep 1 tail -f $gp.jobs.log # real 525m57.376s # user 25m36.072s # sys 7m41.565s ssh kolossus mz=multiz44way gp=refGene db=hg18 zcat exonAA/*.gz | gzip -c > $gp.$mz.exonAA.fa.gz zcat exonNuc/*.gz | gzip -c > $gp.$mz.exonNuc.fa.gz zcat ppredAA/*.gz | gzip -c > $gp.$mz.ppredAA.fa.gz zcat ppredNuc/*.gz | gzip -c > $gp.$mz.ppredNuc.fa.gz rm -rf exonAA exonNuc ppredAA ppredNuc # we're only distributing exons at the moment mz=multiz44way gp=refGene db=hg18 pd=/usr/local/apache/htdocs/goldenPath/$db/$mz/alignments mkdir -p $pd ln -s `pwd`/$gp.$mz.exonAA.fa.gz $pd/$gp.exonAA.fa.gz ln -s `pwd`/$gp.$mz.exonNuc.fa.gz $pd/$gp.exonNuc.fa.gz mz=multiz44way gp=knownGene db=hg18 mkdir exonAA exonNuc ppredAA ppredNuc for j in `sort -nk 2 /cluster/data/$db/chrom.sizes | awk '{print $1}'` do echo "date" echo "mafGene -chrom=$j $db $mz $gp order.lst stdout | \ gzip -c > ppredAA/$j.ppredAA.fa.gz" echo "mafGene -chrom=$j -noTrans $db $mz $gp order.lst stdout | \ gzip -c > ppredNuc/$j.ppredNuc.fa.gz" echo "mafGene -chrom=$j -exons -noTrans $db $mz $gp order.lst stdout | \ gzip -c > exonNuc/$j.exonNuc.fa.gz" echo "mafGene -chrom=$j -exons $db $mz $gp order.lst stdout | \ gzip -c > exonAA/$j.exonAA.fa.gz" done > $gp.$mz.jobs time sh -x $gp.$mz.jobs > $gp.$mz.job.log 2>&1 & sleep 1 tail -f $gp.$mz.job.log # real 442m46.735s # user 43m3.060s # sys 10m45.635s mz=multiz44way gp=knownGene db=hg18 zcat exonAA/c*.gz | gzip -c > $gp.$mz.exonAA.fa.gz zcat exonNuc/c*.gz | gzip -c > $gp.$mz.exonNuc.fa.gz zcat ppredAA/c*.gz | gzip -c > $gp.$mz.ppredAA.fa.gz zcat ppredNuc/c*.gz | gzip -c > $gp.$mz.ppredNuc.fa.gz rm -rf exonAA exonNuc ppredAA ppredNuc mz=multiz44way gp=knownGene db=hg18 pd=/usr/local/apache/htdocs/goldenPath/$db/$mz/alignments mkdir -p $pd ln -s `pwd`/$gp.$mz.exonAA.fa.gz $pd/$gp.exonAA.fa.gz ln -s `pwd`/$gp.$mz.exonNuc.fa.gz $pd/$gp.exonNuc.fa.gz # now do the canonical set cd /cluster/data/hg18/bed/multiz44way/pal mz=multiz44way gp=knownCanonical db=hg18 for j in `awk '{print $1}' /cluster/data/hg18/chrom.sizes` do echo "select chrom, chromStart,
chromEnd, transcript from knownCanonical where chrom='$j'" | hgsql $db | tail -n +2 > $j.known.bed done mkdir exonAA exonNuc ppredAA ppredNuc for j in `sort -nk 2 /cluster/data/$db/chrom.sizes | awk '{print $1}'` do echo "date" echo "mafGene -geneBeds=$j.known.bed $db $mz knownGene order.lst stdout | \ gzip -c > ppredAA/$j.ppredAA.fa.gz" echo "mafGene -geneBeds=$j.known.bed -noTrans $db $mz knownGene order.lst stdout | \ gzip -c > ppredNuc/$j.ppredNuc.fa.gz" echo "mafGene -geneBeds=$j.known.bed -exons -noTrans $db $mz knownGene order.lst stdout | \ gzip -c > exonNuc/$j.exonNuc.fa.gz" echo "mafGene -geneBeds=$j.known.bed -exons $db $mz knownGene order.lst stdout | \ gzip -c > exonAA/$j.exonAA.fa.gz" done > $gp.$mz.jobs time sh -x $gp.$mz.jobs > $gp.$mz.job.log 2>&1 & sleep 1 tail -f $gp.$mz.job.log # real 326m12.849s # user 17m40.850s # sys 3m59.648s rm *.known.bed mz=multiz44way gp=knownCanonical db=hg18 zcat exonAA/c*.gz | gzip -c > $gp.$mz.exonAA.fa.gz zcat exonNuc/c*.gz | gzip -c > $gp.$mz.exonNuc.fa.gz zcat ppredAA/c*.gz | gzip -c > $gp.$mz.ppredAA.fa.gz zcat ppredNuc/c*.gz | gzip -c > $gp.$mz.ppredNuc.fa.gz rm -rf exonAA exonNuc ppredAA ppredNuc mz=multiz44way gp=knownCanonical db=hg18 pd=/usr/local/apache/htdocs/goldenPath/$db/$mz/alignments mkdir -p $pd ln -s `pwd`/$gp.$mz.exonAA.fa.gz $pd/$gp.exonAA.fa.gz ln -s `pwd`/$gp.$mz.exonNuc.fa.gz $pd/$gp.exonNuc.fa.gz ######################################################################### # BUILD OMIM RELATED GENES TRACK (complete rebuild, 10/13/09 JK) ssh hgwdev cd /hive/data/genomes/gs.19/build36/bed mkdir omimGene cd omimGene # download the file morbidmap and genemap from OMIM mkdir omim cd omim wget --timestamping ftp://ftp.ncbi.nih.gov/repository/OMIM/morbidmap wget --timestamping ftp://ftp.ncbi.nih.gov/repository/OMIM/genemap cat genemap|sed -e 's/|/\t/g' > genemap.tab autoSql ~/src/hg/lib/omimGeneMap.as x cat x.sql |sed -e 's/PRIMARY KEY(numbering)/KEY(omimId)/' >omimGeneMap.sql hgLoadSqlTab -warn hg18 omimGeneMap omimGeneMap.sql genemap.tab # got warning on 3 records, just ignore them # Warning: load of omimGeneMap did not go as planned: 12216 record(s), 0 row(s) rm x.c x.h cd .. cat omim/morbidmap|sed -e 's/|/\t/g' > mobidmap.tab autoSql ~/src/hg/lib/omimMorbidMap.as x cat x.sql |sed -e 's/PRIMARY KEY(description)/KEY(omimId)/' >omimMorbidMap.sql hgLoadSqlTab -warn hg18 omimMorbidMap omimMorbidMap.sql mobidmap.tab # get all UCSC genes (from the knownGene table) that cross-reference to a RefSeq gene # that has a non-empty OMIM ID according to the refLink table. And use OMIM ID as # the gene name for this new table. Please note the alignId field still holds the KG ID. hgsql hg18 -N -e \ 'select omimId, kg.* from knownGene kg, knownToRefSeq kr, refLink l where omimId != 0 and mrnaAcc=kr.value and kg.name=kr.name ' \ |cut -f 1,3-13 >o1.tab # collect more OMIM related genes via the MIM external DB links from UniProt hgsql hg18 -N -e \ 'select extAC, kg.* from knownGene kg, kgXref k, proteome.spXref2 p where spId=p.accession and extDB="MIM" and kg.name=kgId ' \ |cut -f 1,3-13 >o2.tab # concatenate the above two gene sets and remove duplications. 
cat o1.tab o2.tab |sort -u >o3.tab # load the result into a temp table, fanO3 hgLoadSqlTab hg18 fanO3 ~/src/hg/lib/knownGene.sql o3.tab # while holding onto the OMIM ID, get the canonical gene (via the knownGene, knowIsoforms, # and knownCanonical tables) that represent a cluster which contains # initial OMIM gene in the fanO3 table hgsql hg18 -N -e \ 'select f3.name, kg.* from fanO3 f3, knownGene kg, knownCanonical c, knownIsoforms i where f3.alignId=i.transcript and kg.name=c.transcript and c.clusterId=i.clusterId'\ > o4.tab # first column is the OMIM ID cut -f 1 o4.tab >j1.tmp # col 3-13 is the gene structure of the canonical KG cut -f 3-13 o4.tab >j2.tmp # stitch them together and remove duplicates, load the result into fanO4 table paste j1.tmp j2.tmp |sort -u >fanO4.tab hgLoadSqlTab hg18 fanO4 ~/src/hg/lib/knownGene.sql fanO4.tab # finally sort the table and create bed 4 file and load it as the omimGene table hgsql hg18 -N -e 'select chrom, txStart, txEnd, name from fanO4 order by chrom, txStart, txEnd' |sort -u >omimGene.bed hgLoadBed hg18 omimGene omimGene.bed # create and load the omimToKnownCanonical table. hgsql hg18 -N -e 'select name, alignId from fanO4 order by name'\ > omimToKnownCanonical.tab hgLoadSqlTab hg18 omimToKnownCanonical \ ~/src/hg/lib/omimToKnownCanonical.sql omimToKnownCanonical.tab # The following clean up could be done. # hgsql hg18 -e 'drop table fanO3' # hgsql hg18 -e 'drop table fanO4' # rm j*.tmp # rm o1.tab o2.tab o3.tab o4.tab ############################################################################# # fox2ClipSeq from Gene Yeo (DONE - 2009-01-08 - Hiram) mkdir /hive/data/genomes/hg18/bed/fox2ClipSeq cd /hive/data/genomes/hg18/bed/fox2ClipSeq # lift the hg17 data to here liftOver -bedPlus=9 \ /hive/data/genomes/hg17/bed/fox2ClipSeq/forwardStrand.bed.gz \ /usr/local/apache/htdocs/goldenPath/hg17/liftOver/hg17ToHg18.over.chain.gz \ stdout forwardStrand.unMapped | gzip -c > forwardStrand.bed.gz liftOver -bedPlus=9 \ /hive/data/genomes/hg17/bed/fox2ClipSeq/reverseStrand.bed.gz \ /usr/local/apache/htdocs/goldenPath/hg17/liftOver/hg17ToHg18.over.chain.gz \ stdout reverseStrand.unMapped | gzip -c > reverseStrand.bed.gz # turn into wiggle density plot zcat forwardStrand.bed.gz | bedItemOverlapCount hg18 stdin \ | wigEncode stdin fox2ClipSeqDensityForwardStrand.wig \ fox2ClipSeqDensityForwardStrand.wib # Converted stdin, upper limit 2401.00, lower limit 1.00 zcat reverseStrand.bed.gz | bedItemOverlapCount hg18 stdin \ | wigEncode stdin fox2ClipSeqDensityReverseStrand.wig \ fox2ClipSeqDensityReverseStrand.wib # Converted stdin, upper limit 1406.00, lower limit 1.00 # and load tables zcat forwardStrand.bed.gz reverseStrand.bed.gz \ | hgLoadBed hg18 fox2ClipSeq stdin # Loaded 4418298 elements of size 9 ln -s `pwd`/*.wib /gbdb/hg18/wib hgLoadWiggle hg18 fox2ClipSeqDensityForwardStrand \ fox2ClipSeqDensityForwardStrand.wig hgLoadWiggle hg18 fox2ClipSeqDensityReverseStrand \ fox2ClipSeqDensityReverseStrand.wig # add composite track definitions to makeDb/trackDb/human/trackDb.ra ############################################################################# # REPEATMASKER - LATEST VERSION, 3.2.7 (DONE 1/30/09 rhubley and angie) # Robert Hubley ran the new and improved version (3.2.7) of RepeatMasker # but politely deferred to staff to load the results: mkdir /hive/data/genomes/hg18/bed/RMRunRMH cd /hive/data/genomes/hg18/bed/RMRunRMH doRepeatMasker.pl -stop mask -buildDir `pwd` hg18 # see do.log, cat.log # Angie loaded with new table name, chr*_rmskRM327. 
Used -debug to # make scripts, edited those. cd /hive/data/genomes/hg18/bed/RMRunRMH doRepeatMasker.pl -debug \ -continue install -buildDir `pwd` hg18 # Edit doLoad.csh: change table names: rmsk -> rmskRM327, # nestedRepeats -> nestedRepeatsRM327 ./doLoad.csh >& load.log & tail -f load.log # Edit doSplit.csh: change -ending to .RM327.fa.out ./doSplit.csh >& split.log & tail -f split.log doRepeatMasker.pl -continue cleanup -buildDir `pwd` \ -fileServer hgwdev hg18 >& cleanup.log & tail -f cleanup.log # Compare coverage to original RepeatMasker run: featureBits hg18 rmskRM327 #1457032101 bases of 2881515245 (50.565%) in intersection featureBits hg18 rmsk #1406290513 bases of 2881515245 (48.804%) in intersection # Wow, Arian got his 50%! :) # Compare Alu counts, since that is supposed to be an area of improvement: grep SINE/Alu hg18.fa.out | wc -l #1186885 ls /hive/data/genomes/hg18/?{,?}{,_*_hap[12]}/chr[0-9XYM]{,[0-9]}{,_random,*_hap[12]}.fa.out \ | uniq | xargs grep SINE/Alu | wc -l #1189976 # A decrease... weird. OK, breaking it down chrom-by-chrom, the _random's # have fewer and the regular chrom's have more Alu's. Sounds OK to me :) featureBits hg18 rmsk \!rmskRM327 #12318974 bases of 2881515245 (0.428%) in intersection featureBits hg18 rmskRM327 \!rmsk #63060562 bases of 2881515245 (2.188%) in intersection # hgTables: 49,804 rmskRM327 items (4,805,535 bases) have no overlap with rmsk # Added download file 2/5/09: cd /hive/data/genomes/hg18 zip -j bigZips/chromOut.RM3.2.7.zip */chr*.RM327.fa.out ln -s /hive/data/genomes/hg18/bigZips/chromOut.RM3.2.7.zip \ /usr/local/apache/htdocs/goldenPath/hg18/bigZips/ ############################################################################# # GENOME VARIANTS - adding AK1, Saqqaq(Eskimo), Quake, Tutu, + Bushmen # also adding phenotype information for those from PSU # Mar 8, 2010 Load from exports from PSU Browser. Merge needed code changes from PSU Browser for phenotype. ############################################################################# # GENOME VARIANTS - 1000 GENOMES (DONE 1/7/2009 giardine, adapted from an email to angie) # December release from 1000 Genomes: SNP calls on four of the 6 high-cov # individuals: a CEU trio and a YRI daughter. 
# see ftp://ftp-trace.ncbi.nih.gov/1000genomes/release/2008_12/README_December2008_release cd /hive/data/genomes/hg18/bed/pgSnp/ cat > trio2pg.pl <<'EOF' #!/usr/bin/perl -w use strict; #split out individual SNPs from trio file #format:chr loc ref alleles snp.Q av.max.map.Q depth.cov NA12891 NA12891.Q NA12892 NA12892.Q NA12878 NA12878.Q hwe maf tdt display my $ac = shift @ARGV; #allele column, zero based if (!$ac) { print "Usage: trio2pg.pl alleleColumn# < infile > outfile\n"; exit; } while (<>) { chomp; my @f = split(/\t/); if ($f[0] eq 'chr') { next; } $f[$ac] =~ s/([ATGC])\/\1/$1/; if ($f[$ac] eq uc($f[2])) { next; } #reference allele only print "chr$f[0]\t", ($f[1]-1), "\t$f[1]\t$f[$ac]\t"; my $c = ($f[$ac] =~ tr/\//\//) + 1; my $s = $f[$ac+1]; if ($s !~ /\//) { for (my $i = 1; $c > $i; $i++) { $s .= ",$f[$ac+1]"; } }else { $s =~ s/\//,/g; if ($c == 1) { $s =~ s/,.*//; } } my $n = "0"; for (my $i = 1; $c > $i; $i++) { $n .= ",0"; } #allele count print "$c\t$n\t$s\n"; } exit; 'EOF' # << emacs chmod a+x trio2pg.pl #convert to pgSnp set relDir = /hive/data/outside/1000genomes/ncbi/ftp-trace.ncbi.nih.gov/1000genomes/release/2008_12/ zcat $relDir/CEU.trio.dec.with.x.with.rs.calls.gz | trio2pg.pl 7 > NA12891.pgSnp zcat $relDir/CEU.trio.dec.with.x.with.rs.calls.gz | trio2pg.pl 9 > NA12892.pgSnp zcat $relDir/CEU.trio.dec.with.x.with.rs.calls.gz | trio2pg.pl 11 > NA12878.pgSnp zcat $relDir/YRI.child.dec.intersect.calls.gz | trio2pg.pl 7 > NA19240.pgSnp #gff for indels does not give nts, can't put in pgSnp format # 9/25/09: use samtools pileup to add base counts back in to those files. cat > addCounts.pl <<'_EOF_' #!/usr/bin/env perl use warnings; use strict; my $sample = shift @ARGV; my $bamTemplate = shift @ARGV; if (! (defined $sample && defined $bamTemplate)) { die "Usage: $0 sampleId bamTemplate [pgSnpFile]\n"; } my $prevChr; my ($bamFile, $PLUP); while (<>) { my ($chr, $s, $e, $alleles, $aCount, $baseCounts, $quals) = split("\t"); # New chrom? open pipe from samtools pileup: if (!defined $prevChr || $prevChr ne $chr) { close ($PLUP) if (defined $PLUP); (my $c = $chr) =~ s/^chr//; ($bamFile = $bamTemplate) =~ s/__S__/$sample/g; $bamFile =~ s/__C__/$c/; if (-e $bamFile) { my $pileupPipe = "samtools pileup $bamFile |"; warn "Opening '$pileupPipe'\n"; open($PLUP, $pileupPipe) || die "Can't open pipe '$pileupPipe': $!\n"; } else { warn "bamFile '$bamFile' does not exist"; $PLUP = undef; } } # Fast-forward to pileup line corresponding to this pgSnp line: if (defined $PLUP) { my ($pc, $ps, undef, $depth, $bases); do { ($pc, $ps, undef, $depth, $bases) = split("\t", <$PLUP>); if (defined $pc) { die "Unexpected chrom '$pc' (!~ '$chr') in $bamFile" if ("chr$pc" ne $chr); $ps--; } else { $ps = $s+1; close($PLUP); $PLUP = undef; } } while ($ps < $s); if (defined $pc && $ps == $s) { $bases =~ s/\^.//g; $bases =~ s/\$//g; # ignore begin/end-of-read markers while ($bases =~ /[-+](\d+)\w+/) { # ignore indels my $count = $1; $bases =~ s/[-+]$count\w{$count}//; } die "length of $bases (" . length($bases) . ") != $depth" if (length($bases) != $depth); $bases =~ tr/acgtn/ACGTN/; my @origBaseCounts = split(',', $baseCounts); $baseCounts = ""; foreach my $al (split("/", $alleles)) { my $alCt = ($bases =~ s/$al//g) + shift @origBaseCounts; $baseCounts .= ',' if ($baseCounts ne ""); $baseCounts .= $alCt; } #warn "Leftover bases: $bases ($alleles)" if (length($bases) > 10); # Sometimes the allele is given as homozygous but there are many other # copies of some other base detected...? 
And sometimes lots of "*" # characters, not described on http://samtools.sourceforge.net/pileup.shtml } # end if we found the pileup line for this pgSnp line } # end if there is a $bamFile for this template and chrom. print join("\t", $chr, $s, $e, $alleles, $aCount, $baseCounts, $quals); $prevChr = $chr; } '_EOF_' # << emacs chmod a+x addCounts.pl foreach f (NA*.pgSNP) set s = $f:r cat $f \ | ./addCounts.pl $s \ /hive/data/outside/1000genomes/ncbi/ftp-trace.ncbi.nih.gov/1000genomes/__S__/alignment/__S__.chrom__C__.SLX.maq.SRP000032.2009_07.bam \ | ./addCounts.pl $s \ /hive/data/outside/1000genomes/ncbi/ftp-trace.ncbi.nih.gov/1000genomes/__S__/alignment/__S__.chrom__C__.SOLID.corona.SRP000032.2009_08.bam \ | ./addCounts.pl $s \ /hive/data/outside/1000genomes/ncbi/ftp-trace.ncbi.nih.gov/1000genomes/__S__/alignment/__S__.chrom__C__.454.ssaha.SRP000032.2009_07.bam \ > $f.counts end # NA12878 and NA19240 have all 3 platforms; just SLX.maq for NA12891, NA12892 hgLoadBed hg18 pgNA12878 NA12878.pgSnp.counts \ -sqlTable=$HOME/kent/src/hg/lib/pgSnp.sql -renameSqlTable -tab #Loaded 3049749 elements of size 7 hgLoadBed hg18 pgNA12891 NA12891.pgSnp.counts \ -sqlTable=$HOME/kent/src/hg/lib/pgSnp.sql -renameSqlTable -tab #Loaded 2968312 elements of size 7 hgLoadBed hg18 pgNA12892 NA12892.pgSnp.counts \ -sqlTable=$HOME/kent/src/hg/lib/pgSnp.sql -renameSqlTable -tab #Loaded 2972120 elements of size 7 hgLoadBed hg18 pgNA19240 NA19240.pgSnp.counts \ -sqlTable=$HOME/kent/src/hg/lib/pgSnp.sql -renameSqlTable -tab #Loaded 3586490 elements of size 7 ############################################################################# # GENOME VARIANTS - (DONE 1/7/09 giardine, adapted by angie from pgSnp/README) # File pgVenter.bed placed in /hive/data/genomes/hg18/bed/pgSnp/ by # Belinda. cd /hive/data/genomes/hg18/bed/pgSnp/ grep "^chr" pgVenter.bed | sort -k1,1 -k2,2n \ | hgLoadBed hg18 pgVenter stdin \ -noSort -sqlTable=$HOME/kent/src/hg/lib/pgSnp.sql -renameSqlTable -tab # 3/11/09: fetching this file because I think it's the original data (angie) wget ftp://ftp.jcvi.org/pub/data/huref/HuRef.InternalHuRef-NCBI.gff ############################################################################# # GENOME VARIANTS - YRI NA18507 (DONE 1/9/07 giardine, adapted by angie from pgSnp/README) # SNP calls made by Aakrosh Ratan at PSU. # Files pgYri{2,3}.txt placed in /hive/data/genomes/hg18/bed/pgSnp/ by # Belinda. # yoruban snp calls (using solid software instead of maq) # Loaded 11/4/08 according to hg18.history, but table status says created # 1/7/09: cd /hive/data/genomes/hg18/bed/pgSnp/ grep "^chr" pgYri2.txt | sort -k1,1 -k2,2n \ | hgLoadBed hg18 pgYoruban2 stdin \ -noSort -sqlTable=$HOME/kent/src/hg/lib/pgSnp.sql -renameSqlTable -tab #Another yoruban SNP set, same individual, Solexa reads, includes indels # Loaded 11/7/08 according to hg18.history, but table status says created # 1/7/09: grep "^chr" pgYri3.txt | sort -k1,1 -k2,2n \ | hgLoadBed hg18 pgYoruban3 stdin \ -noSort -sqlTable=$HOME/kent/src/hg/lib/pgSnp.sql -renameSqlTable -tab ############################################################################# # GENOME VARIANTS - YH (DONE 2/24/09 giardine, adapted by angie from pgSnp/README) #Asian individual (YH1) from Nature paper #http://yh.genomics.org.cn/index.jsp # File pgSnpYh.txt placed in /hive/data/genomes/hg18/bed/pgSnp/ by # Belinda. 
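# (added orientation note, not from the original log)  The pgSnp files loaded
# throughout these GENOME VARIANTS sections are, roughly, 7-column BED-like
# rows: chrom, chromStart, chromEnd, observed alleles (e.g. "A/C"), allele
# count, comma-separated per-allele counts/frequencies, comma-separated
# per-allele quality scores.  A made-up example row:
#   chr1    10433   10434   A/C     2       12,7    90,85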
cd /hive/data/genomes/hg18/bed/pgSnp/ grep "^chr" pgSnpYh.txt | sort -k1,1 -k2,2n \ | hgLoadBed hg18 pgYh1 stdin \ -noSort -sqlTable=$HOME/kent/src/hg/lib/pgSnp.sql -renameSqlTable -tab # 3/11/09: fetching this file because I think it's the original data (angie) wget -O "yhsnp_add.gff" \ 'http://yh.genomics.org.cn/do.downServlet?file=data/snps/yhsnp_add.gff' ############################################################################# # GENOME VARIANTS - KOREF (DONE 9/17/09 angie) # Korean individual (Seong-Jin Kim) from Genome Research paper cd /hive/data/genomes/hg18/bed/pgSnp/ # Download Belinda's file from PSU, use same table name (pgSjk) as on # http://main.genome-browser.bx.psu.edu/ : wget http://www.bx.psu.edu/~giardine/tests/tmp/koref.sub.pgSnp hgLoadBed hg18 pgSjk koref.sub.pgSnp \ -sqlTable=$HOME/kent/src/hg/lib/pgSnp.sql -renameSqlTable -tab #Loaded 3439107 elements of size 7 # Downloading because I think it's the original data: wget ftp://ftp.kobic.kr/pub/KOBIC-KoreanGenome/genetic_variations/KOREF-solexa-snp-X30_Q40d4D100.gff ############################################################################# # Initial import of LSSNP data for SNP and hgGene linking (2009-02-02 markd) ############################################################################# # dump and load LSSNP databases from Johns Hopkins. This will be automated # soon. # download dump into tmp directory LSSNP; must load on bugle as the # database is mysql 5 ssh bugle hgsql -e 'create database LSSNP' cat LSSNP/*.sql |hgsql LSSNP hgsqlimport LSSNP `pwd`/LSSNP/*.txt ssh hgwdev hgLsSnpPdbLoad fetch bugle:LSSNP lsSnpPdb.tab hgLsSnpPdbLoad load hg18 lsSnpPdb lsSnpPdb.tab ############################################################################# ############################################################################# # HGDP GEOGRAPHIC SNP MAPS (DONE 2/5/09 angie - UPDATED 9/15/10) # Project data downloaded and parsed in /hive/data/outside/hgdpGeo, # see makeDb/doc/hgdpGeo.txt. mkdir /hive/data/genomes/hg18/bed/hgdpGeo cd /hive/data/genomes/hg18/bed/hgdpGeo # Make an rsId-sorted snp coords file for joining with the hgdpGeo data. grep -Fwf /hive/data/outside/hgdpGeo/rsIDs.lst \ ../snp129/snp129.bed \ | awk 'BEGIN{OFS="\t";} {print $4, $1, $2, $3;}' \ | sort > snp129Coords.txt wc -l snp129Coords.txt #660280 snp129Coords.txt # How many distinct SNPs in there? 
(compare to 657000 from HGDP): cut -f 1 snp129Coords.txt |uniq | wc -l #656496 # Join files to make a track table: join -e ERROR -t' ' -o 1.2,1.3,1.4,1.1,2.2,2.3,2.4 \ snp129Coords.txt /hive/data/outside/hgdpGeo/hgdpGeoCoordless.txt \ | sed -re 's/([AGTC])\*/\1/' \ | sort -k1,1 -k2n,2n \ > hgdpGeo.tab wc -l hgdpGeo.tab #660280 hgdpGeo.tab grep ERROR hgdpGeo.tab | wc -l #0 hgLoadBed hg18 hgdpGeo hgdpGeo.tab \ -sqlTable=$HOME/kent/src/hg/lib/hgdpGeo.sql #Loaded 660280 elements of size 7 # Correcting strand and remapping to snp130 9/15/10: mkdir /hive/data/genomes/hg18/bed/hgdpGeo/100915 cd /hive/data/genomes/hg18/bed/hgdpGeo/100915 grep -Fwf /hive/data/outside/hgdpGeo/rsIDs.lst \ ../../snp130/snp130.bed \ | awk 'BEGIN{OFS="\t";} {print $4, $1, $2, $3, $8;}' \ | sort > snp130CoordsAndRef.txt cut -f 1 snp130CoordsAndRef.txt | uniq | wc -l #656484 join -e ERROR -t' ' -o 1.2,1.3,1.4,1.1,2.2,2.3,2.4,1.5 \ snp130CoordsAndRef.txt /hive/data/outside/hgdpGeo/hgdpGeoCoordless.txt \ | sed -re 's/([AGTC])\*/\1/' \ | sort -k1,1 -k2n,2n \ > hgdpGeo.fixme wc -l hgdpGeo.fixme #660265 hgdpGeo.fixme # Use the snp130 reference allele to detect when we need to rev-comp # the alleles to match the + strand. Also, throw out SNPs for which # the ref allele is multi-base -- it's questionable whether we're giving # the right coords (some funny things happen with dbSNP's clustering...): cat > fixAlleles.pl <<'_EOF_' #!/usr/bin/env perl use warnings; use strict; my %rc = ('A' => 'T', 'C' => 'G', 'G' => 'C', 'T' => 'A'); while (<>) { chomp; my ($c, $s, $e, $rs, $ancAl, $derAl, $freqs, $ref) = split; next unless ($ref =~ /^[ACGT]$/); if ($ancAl ne $ref && $derAl ne $ref) { $ancAl = $rc{$ancAl}; $derAl = $rc{$derAl}; } print join("\t", $c, $s, $e, $rs, $ancAl, $derAl, $freqs) . "\n"; } '_EOF_' # << emacs chmod a+x fixAlleles.pl ./fixAlleles.pl hgdpGeo.fixme > hgdpGeo.tab wc -l hgdpGeo.tab #660221 hgdpGeo.tab hgLoadBed hg18 hgdpGeo hgdpGeo.tab \ -sqlTable=$HOME/kent/src/hg/lib/hgdpGeo.sql #Loaded 660219 elements of size 7 ############################################################################# # HGDP HETEROZYGOSITY (DONE 2/12/09 angie, except for Bantu 3/12/09) mkdir /hive/data/genomes/hg18/bed/hgdpHzy cd /hive/data/genomes/hg18/bed/hgdpHzy foreach continent (african americas easia european mideast oceania sasia) wget --timestamping http://hgdp.uchicago.edu/data/hzy/$continent.gff.gz end wget --timestamping http://hgdp.uchicago.edu/data/hzy/allbantu.hzy.gff.gz foreach continent (african allbantu americas easia european mideast oceania sasia) set bedGraph = `echo $continent \ | sed -re 's/can$/ca/; s/pean$/pe/; s/asia/Asia/; s/allbantu/bantu/; \ s/(.*)/hgdpHzy\u\1.bedGraph/'` echo $bedGraph zcat $continent.gff.gz \ | awk 'BEGIN{OFS="\t";} {print $1, ($4-1), $5, $6;}' \ > $bedGraph end # 3/12/09: All of the original files' coords were intervals between SNPs, # but the Bantu file had SNP coordinates, and one more line per chrom than # the others. So (after getting OK from Joe) I am going to transform the # Bantu SNP coords to intervals like the others. perl -we 'while (<>) { \ chomp; ($c, $s, undef, $h) = split; \ if (defined $lastC) { \ if ($lastC eq $c) { \ print "$c\t$lastS\t$s\t$lastH\n"; \ } # Discarding last SNP on each chrom \ } \ ($lastC, $lastS, $lastH) = ($c, $s, $h); \ }' \ hgdpHzyBantu.bedGraph > tmp mv tmp hgdpHzyBantu.bedGraph # Using bedGraph, not wig, because there are only 640k datapoints and # some are over the 10Mbase wiggle item size limit. 
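# (illustrative check, not in the original log)  To see how many items would
# have exceeded the wiggle item size limit:
awk '$3 - $2 > 10000000' hgdpHzy*.bedGraph | wc -l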
foreach f (*.bedGraph) hgLoadBed hg18 $f:r $f -bedGraph=4 end # All have same size: #Loaded 640676 elements of size 4 ############################################################################# # HGDP FST (DONE 2/12/09 angie) mkdir /hive/data/genomes/hg18/bed/hgdpFst cd /hive/data/genomes/hg18/bed/hgdpFst wget --timestamping \ http://hgdp.uchicago.edu/data/FST/autosomal_illuminasnps7_pval.gff.gz zcat autosomal_illuminasnps7_pval.gff.gz \ | awk 'BEGIN{OFS="\t";} {print $1, ($4-1), $5, $6;}' \ > hgdpFst.bedGraph hgLoadBed hg18 hgdpFst hgdpFst.bedGraph -bedGraph=4 #Loaded 640676 elements of size 4 ############################################################################# # HGDP IHS (DONE 2/13/09 angie) mkdir /hive/data/genomes/hg18/bed/hgdpIhs cd /hive/data/genomes/hg18/bed/hgdpIhs foreach continent (Bantu Americas E.Asia European MiddleEast Oceania S.Asian) wget --timestamping \ http://hgdp.uchicago.edu/data/iHS/smoothed$continent.iHS.gff.gz set bedGraph = `echo $continent \ | sed -re 's/pean$/pe/; s/\.Asian?/Asia/; \ s/MiddleEast/Mideast/; s/(.*)/hgdpIhs\1.bedGraph/'` echo $bedGraph zcat smoothed$continent.iHS.gff.gz \ | sed -e 's/^chr23/chrX/' \ | awk 'BEGIN{OFS="\t";} {print $1, ($4-1), $5, $6;}' \ > $bedGraph end foreach f (*.bedGraph) hgLoadBed hg18 $f:r $f -bedGraph=4 end #Reading hgdpIhsBantu.bedGraph #Loaded 540438 elements of size 4 #Reading hgdpIhsAmericas.bedGraph #Loaded 422167 elements of size 4 #Reading hgdpIhsEAsia.bedGraph #Loaded 487801 elements of size 4 #Reading hgdpIhsEurope.bedGraph #Loaded 543875 elements of size 4 #Reading hgdpIhsMideast.bedGraph #Loaded 552277 elements of size 4 #Reading hgdpIhsOceania.bedGraph #Loaded 425340 elements of size 4 #Reading hgdpIhsSAsia.bedGraph #Loaded 550231 elements of size 4 ############################################################################# # HGDP XP-EHH (DONE 2/12/09 angie) mkdir /hive/data/genomes/hg18/bed/hgdpXpehh cd /hive/data/genomes/hg18/bed/hgdpXpehh foreach continent (Bantu Americas E.Asia Europe Mideast Oceania S.Asia) wget --timestamping \ http://hgdp.uchicago.edu/data/XPEHH/$continent.xpehh.forbrowser.gff.gz set bedGraph = `echo $continent \ | sed -re 's/\.Asia?/Asia/; s/(.*)/hgdpXpehh\1.bedGraph/'` echo $bedGraph zcat $continent.xpehh.forbrowser.gff.gz \ | awk 'BEGIN{OFS="\t";} {print $1, ($4-1), $5, $6;}' \ > $bedGraph end foreach f (*.bedGraph) hgLoadBed hg18 $f:r $f -bedGraph=4 end #Reading hgdpXpehhBantu.bedGraph #Loaded 636680 elements of size 4 #Reading hgdpXpehhAmericas.bedGraph #Loaded 636143 elements of size 4 #Reading hgdpXpehhEAsia.bedGraph #Loaded 635799 elements of size 4 #Reading hgdpXpehhEurope.bedGraph #Loaded 636680 elements of size 4 #Reading hgdpXpehhMideast.bedGraph #Loaded 636849 elements of size 4 #Reading hgdpXpehhOceania.bedGraph #Loaded 637418 elements of size 4 #Reading hgdpXpehhSAsia.bedGraph #Loaded 636773 elements of size 4 ############################################################################# # LIFTOVER TO Hg19 (DONE - 2009-03-06 - Hiram ) mkdir /hive/data/genomes/hg18/bed/blat.hg19.2009-03-06 cd /hive/data/genomes/hg18/bed/blat.hg19.2009-03-06 # -debug run to create run dir, preview scripts... 
doSameSpeciesLiftOver.pl -debug hg18 hg19 # Real run: time nice -n +19 doSameSpeciesLiftOver.pl -verbose=2 \ -bigClusterHub=pk -dbHost=hgwdev -workhorse=hgwdev \ hg18 hg19 > do.log 2>&1 # real 85m8.064s ############################################################################# # HAPMAP REL22 RECOMBINATION RATES (PHASE II) (DONE 2/24/09 angie) mkdir -p /hive/data/outside/hapmap/recombination/2008-03_rel22_B36/rates cd /hive/data/outside/hapmap/recombination/2008-03_rel22_B36/ wget --timestamping \ ftp://ftp.hapmap.org/pub/hapmap/public/recombination/2008-03_rel22_B36/00README.txt cd rates wget --timestamping \ ftp://ftp.hapmap.org/pub/hapmap/public/recombination/2008-03_rel22_B36/rates/\* # Make bedGraph-formatted files. mkdir -p /hive/data/genomes/hg18/bed/hapmap/recombination/2008-03_rel22_B36 cd /hive/data/genomes/hg18/bed/hapmap/recombination/2008-03_rel22_B36 cp /dev/null hapmapRecombRate.bed foreach f (/hive/data/outside/hapmap/recombination/2008-03_rel22_B36/rates/*.txt) set chr = `echo $f:t:r | sed -e 's/^.*chr/chr/; s/_b36.*//;'` echo $f $chr perl -wpe 's/^position .*\n// && next; \ m/^(\d+) (\d+\.?\d*) .*/ || die $_; $end=$1; $rate=$2; \ $start=$end-100 unless (defined $start); \ $_ = "'$chr'\t$start\t$end\t$rate\n"; $start = $end;' \ $f >> hapmapRecombRate.bedGraph end # Some items are over the 10Mbase wiggle item size limit, so use bedGraph. time hgLoadBed hg18 hapmapRecombRate hapmapRecombRate.bedGraph -bedGraph=4 #Loaded 3281323 elements of size 4 #14.688u 1.796s 0:31.99 51.4% 0+0k 0+0io 0pf+0w # There are >3M items... try bigWig! :) wigToBigWig hapmapRecombRate.bedGraph /hive/data/genomes/hg18/chrom.sizes \ hapmapRecombRate.bw ln -s `pwd`/hapmapRecombRate.bw /gbdb/hg18/bbi/ hgsql hg18 -e 'drop table if exists hapmapRecombRateBW; \ create table hapmapRecombRateBW (fileName varchar(255) not null); \ insert into hapmapRecombRateBW values ("/gbdb/hg18/bbi/hapmapRecombRate.bw");' ############################################################################# # HAPMAP REL27 GENOTYPES (MERGED PHASE II+III) (DONE 2/25/09 angie) # First, download release to /hive/data/outside... mkdir -p /hive/data/outside/hapmap/genotypes/2009-02_phaseII+III/{excluded,forward} cd /hive/data/outside/hapmap/genotypes/2009-02_phaseII+III wget --timestamping \ ftp://ftp.hapmap.org/pub/hapmap/public/genotypes/2009-02_phaseII+III/00README.txt cd excluded wget --timestamping \ ftp://ftp.hapmap.org/pub/hapmap/public/genotypes/2009-02_phaseII+III/excluded/\* cd ../forward wget --timestamping \ ftp://ftp.hapmap.org/pub/hapmap/public/genotypes/2009-02_phaseII+III/forward/\* # This directory's README refers to the README from the # phaseIII-only 2009_01, which gives the file format and explains # the population codes: wget --timestamping -o 00README_2009-01_phaseIII.txt \ ftp://ftp.hapmap.org/pub/hapmap/public/genotypes/2009-01_phaseIII/00README.txt # For details page... this is Coriell's NHGRI panel (all HapMap except # CEPH): http://ccr.coriell.org/Sections/Collections/NHGRI/?SsId=11 # http://www.broad.mit.edu/mpg/hapmap3/ # Broad, BCM and Sanger have a nice phase3 writeup. Here is Broad's # copy: http://www.broad.mit.edu/mpg/hapmap3/ # Now translate those into hapmapSnps* tables. # NOTE FOR NEXT TIME: make this a cluster job. It takes ~half hour each pop! # Could run the script on each downloaded file as a separate job, and then # concatenate results (or just feed chr*_$pop to hgLoadBed). 
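# (hypothetical sketch of the per-file cluster approach suggested in the NOTE
# above; the wrapper script name oneHapmapPop.csh is made up -- it would just
# apply the perl conversion below to one downloaded file.  The serial run
# that was actually done follows.)
#   cd /hive/data/genomes/hg18/bed/hapmap/genotypes/2009-02_phaseII+III
#   cp /dev/null jobList
#   foreach f (/hive/data/outside/hapmap/genotypes/2009-02_phaseII+III/forward/genotypes_chr*_r27_nr.b36_fwd.txt.gz)
#     echo "./oneHapmapPop.csh $f {check out exists out/$f:t:r:r.bed}" >> jobList
#   end
#   para make jobList
#   # then concatenate the out/ chunks per population into hapmapSnps$pop.bed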
mkdir -p /hive/data/genomes/hg18/bed/hapmap/genotypes/2009-02_phaseII+III cd /hive/data/genomes/hg18/bed/hapmap/genotypes/2009-02_phaseII+III set sourceDir = /hive/data/outside/hapmap/genotypes/2009-02_phaseII+III/forward foreach pop (ASW CEU CHB CHD GIH JPT LWK MEX MKK TSI YRI) echo $pop zcat $sourceDir/genotypes_chr*_${pop}_r27_nr.b36_fwd.txt.gz \ | perl -wpe 'chomp; \ if (/^rs# alleles c\w+ pos s\w+ a\w+# c\w+ protLSID assayLSID panelLSID QCcode NA/) { \ $_ = ""; # skip header lines \ } elsif (s/^(rs\d+) ([ACGT])\/([ACGT]) (chr\w+) (\d+) \+ ncbi_[bB]?36 .* QC\+ //) { \ ($rsId, $obs1, $obs2, $chr, $end) = ($1, $2, $3, $4, $5); \ %compl = (A=>"T", C=>"G", G=>"C", T=>"A"); \ %hom = (); %het = (); \ # NOTE: one trouble-maker (other pop files have A/C with AC genotypes): \ if ($rsId eq "rs7059622" && "'$pop'" eq "YRI") { warn "Tweaking YRI rs7059622.\n"; } \ foreach my $al (split()) { \ next if ($al eq "NN"); \ $al =~ /^([ACGT])([ACGT])$/ || die "Unrecognized allele string $al"; \ ($a1, $a2) = ($1, $2); \ # NOTE: one trouble-maker (other pop files have A/C with AC genotypes): \ if ($rsId eq "rs7059622" && "'$pop'" eq "YRI") \ { $a1 = $compl{$a1}; $a2 = $compl{$a2}; } \ # The error that the trouble-maker triggered: \ if (($a1 !~ /^[$obs1$obs2]$/) || ($a2 !~ /^[$obs1$obs2]$/)) \ { die "$rsId (${chr}_'$pop'): obs $obs1/$obs2 !~ $a1$a2!\n\t"; } \ if ($a1 eq $a2) { $hom{$a1}++; } else { $het{$a1}++; $het{$a2}++; } \ } \ $start = $end - 1; \ $hom1 = $hom{$obs1} || 0; $hom2 = $hom{$obs2} || 0; \ $het = $het{$obs1} || 0; $het2 = $het{$obs2} || 0; \ $score = (1000 * (2*$hom2 + $het) / (2*($hom1 + $hom2 + $het))); \ if ($score >= 500) { $score = 1000 - $score; } \ $score = int($score + 0.5); \ if ($het != $het2) { die "het{$obs1} ($het{$obs1}) != het{$obs2} ($het{$obs2})"; } \ $_ = "$chr\t$start\t$end\t$rsId\t$score\t+\t$obs1/$obs2\t$obs1\t$hom1\t$obs2\t$hom2\t$het\n"; \ } else { \ die "Unrecognized format:\n$_\n\t"; \ }' > hapmapSnps$pop.bed end wc -l hapmapSnps*.bed # 1561453 hapmapSnpsASW.bed # 4030774 hapmapSnpsCEU.bed # 4052336 hapmapSnpsCHB.bed # 1306196 hapmapSnpsCHD.bed # 1407877 hapmapSnpsGIH.bed # 4052423 hapmapSnpsJPT.bed # 1529764 hapmapSnpsLWK.bed # 1410265 hapmapSnpsMEX.bed # 1537638 hapmapSnpsMKK.bed # 1419921 hapmapSnpsTSI.bed # 3984356 hapmapSnpsYRI.bed foreach pop (ASW CEU CHB CHD GIH JPT LWK MEX MKK TSI YRI) hgLoadBed hg18 hapmapSnps$pop hapmapSnps$pop.bed -renameSqlTable \ -sqlTable=$HOME/kent/src/hg/lib/hapmapSnps.sql end #Reading hapmapSnpsASW.bed #Loaded 1561453 elements of size 12 #Reading hapmapSnpsCEU.bed #Loaded 4030774 elements of size 12 #Reading hapmapSnpsCHB.bed #Loaded 4052336 elements of size 12 #Reading hapmapSnpsCHD.bed #Loaded 1306196 elements of size 12 #Reading hapmapSnpsGIH.bed #Loaded 1407877 elements of size 12 #Reading hapmapSnpsJPT.bed #Loaded 4052423 elements of size 12 #Reading hapmapSnpsLWK.bed #Loaded 1529764 elements of size 12 #Reading hapmapSnpsMEX.bed #Loaded 1410265 elements of size 12 #Reading hapmapSnpsMKK.bed #Loaded 1537638 elements of size 12 #Reading hapmapSnpsTSI.bed #Loaded 1419921 elements of size 12 #Reading hapmapSnpsYRI.bed #Loaded 3984356 elements of size 12 rm bed.tab; nice gzip *.bed ############################################################################# # HAPMAP REL27 ORTHOLOGOUS ALLELES (DONE 3/4/09 angie) # Similar procedure to snp129Ortho, but we make one table per species # because they are independent subtracks of HapMap SNPs. 
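# (added illustration, not from the original log)  The awk step below packs
# the human-side fields into the BED name so they ride through liftOver
# untouched; e.g. a (made-up) hapmapSnps row
#   chr1  10433  10434  rs12345  500  +  A/C  A  30  C  25  10
# becomes the liftOver input row
#   chr1  10433  10434  rs12345|chr1|10433|10434|A/C|+  0  +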
cd /hive/data/genomes/hg18/bed/hapmap/genotypes/2009-02_phaseII+III # Glom all human info that we need for the final table onto the # name, to sneak it through liftOver: rsId|chr|start|end|obs|strand awk 'BEGIN{OFS="\t";} \ {print $1, $2, $3, \ $4 "|" $1 "|" $2 "|" $3 "|" $7 "|" $6, \ 0, $6;}' \ hapmapSnps???.bed \ | sort -u -k1,1 -k2n,2n \ > hapmapSnpsForLiftOver.bed wc -l hapmapSnpsForLiftOver.bed #4165831 hapmapSnpsCombined.bed # Orthologous allele locations: mkdir run.liftOChimp cd run.liftOChimp mkdir split out splitFile ../hapmapSnpsForLiftOver.bed 25000 split/chunk cp /dev/null jobList foreach f (split/chunk*) echo liftOver $f \ /hive/data/genomes/hg18/bed/liftOver/hg18ToPanTro2.over.chain.gz \ \{check out exists out/panTro2.$f:t.bed\} out/hg18.$f:t.unmapped \ >> jobList end ssh pk cd /hive/data/genomes/hg18/bed/hapmap/genotypes/2009-02_phaseII+III/run.liftOChimp para make jobList #Completed: 167 of 167 jobs #CPU time in finished jobs: 31364s 522.74m 8.71h 0.36d 0.001 y #IO & Wait Time: 800s 13.33m 0.22h 0.01d 0.000 y #Average job time: 193s 3.21m 0.05h 0.00d #Longest finished job: 431s 7.18m 0.12h 0.00d #Submission to last job: 442s 7.37m 0.12h 0.01d mkdir ../run.liftOMac cd ../run.liftOMac mkdir out ln -s ../run.liftOChimp/split . cp /dev/null jobList foreach f (split/chunk*) echo liftOver $f \ /hive/data/genomes/hg18/bed/liftOver/hg18ToRheMac2.over.chain.gz \ \{check out exists out/rheMac2.$f:t.bed\} out/hg18.$f:t.unmapped \ >> jobList end para make jobList #Completed: 167 of 167 jobs #CPU time in finished jobs: 2482s 41.36m 0.69h 0.03d 0.000 y #IO & Wait Time: 1361s 22.69m 0.38h 0.02d 0.000 y #Average job time: 23s 0.38m 0.01h 0.00d #Longest finished job: 33s 0.55m 0.01h 0.00d #Submission to last job: 97s 1.62m 0.03h 0.00d # Concatenate the liftOver results, sorting by ortho pos in order to # efficiently access 2bit sequence in getOrthoSeq. The output of # that is swizzled so that a glom of ortho coords is the first column, # and then we sort by that for joining with base quality info. # Ditto for macaque. ~5 minutes per species: cd /hive/data/genomes/hg18/bed/hapmap/genotypes/2009-02_phaseII+III sort -k1,1 -k2n,2n run.liftOChimp/out/panTro2.chunk*.bed \ | ~/kent/src/hg/snp/snpLoad/getOrthoSeq.pl /hive/data/genomes/panTro2/panTro2.2bit \ | awk 'BEGIN{OFS="\t";} {print $2 ":" $3 ":" $4, $5, $6, $1;}' \ | sort > panTro2.orthoGlom.txt sort -k1,1 -k2n,2n run.liftOMac/out/rheMac2.chunk*.bed \ | ~/kent/src/hg/snp/snpLoad/getOrthoSeq.pl /hive/data/genomes/rheMac2/rheMac2.2bit \ | awk 'BEGIN{OFS="\t";} {print $2 ":" $3 ":" $4, $5, $6, $1;}' \ | sort > rheMac2.orthoGlom.txt wc -l panTro2.orthoGlom.txt rheMac2.orthoGlom.txt # 4057739 panTro2.orthoGlom.txt # 3750076 rheMac2.orthoGlom.txt # Get base qualities -- ~12-16min per species. 
cut -f 1 panTro2.orthoGlom.txt | sed -e 's/:/\t/g' \ | hgWiggle -db=panTro2 -lift=1 -doAscii -bedFile=stdin quality \ | varStepToBedGraph.pl stdin \ | awk 'BEGIN{OFS="\t";} {print $1 ":" $2 ":" $3, int($4+0.5);}' \ | sort > panTro2.baseQuals.txt #Processed 4003968 lines input, 4003685 data lines, 47 variable step declarations cut -f 1 rheMac2.orthoGlom.txt | sed -e 's/:/\t/g' \ | hgWiggle -db=rheMac2 -lift=1 -doAscii -bedFile=stdin quality \ | varStepToBedGraph.pl stdin \ | awk 'BEGIN{OFS="\t";} {print $1 ":" $2 ":" $3, int($4+0.5);}' \ | sort > rheMac2.baseQuals.txt #Processed 3749772 lines input, 3749645 data lines, 21 variable step declarations # Join the allele-glom with the base qual-glom and swizzle columns into # the right order for a hapmapAllelesOrtho table. join -a 1 -e 0 panTro2.orthoGlom.txt panTro2.baseQuals.txt \ | perl -wpe 'chomp; ($oG, $oA, $oStr, $hG, $bQ) = split; \ ($oC, $oS, $oE) = split(":", $oG); \ ($rs, $hC, $hS, $hE, $hO, $hStr) = split(/\|/, $hG); \ unless (defined $bQ) { \ if ($oC =~ /^chr(21|Y|Y_random)$/) { $bQ = 98; } # per panTro2 quality track desc \ elsif ($oC eq "chrM") { $bQ = 0; } \ else { die "missing qual for $oC: $_\n\t"; } } \ $_ = "$hC\t$hS\t$hE\t$rs\t$bQ\t$hStr\t\t$hO\t$oC\t$oS\t$oE\t$oStr\t$oA\n";' \ | sort -k1,1 -k2n,2n \ > hapmapAllelesChimp.bed wc -l hapmapAllelesChimp.bed #4057739 hapmapAllelesChimp.bed join -a 1 -e 0 rheMac2.orthoGlom.txt rheMac2.baseQuals.txt \ | perl -wpe 'chomp; ($oG, $oA, $oStr, $hG, $bQ) = split; \ ($oC, $oS, $oE) = split(":", $oG); \ ($rs, $hC, $hS, $hE, $hO, $hStr) = split(/\|/, $hG); \ unless (defined $bQ) { die "missing qual for $oC: $_\n\t"; } \ $_ = "$hC\t$hS\t$hE\t$rs\t$bQ\t$hStr\t\t$hO\t$oC\t$oS\t$oE\t$oStr\t$oA\n";' \ | sort -k1,1 -k2n,2n \ > hapmapAllelesMacaque.bed wc -l hapmapAllelesMacaque.bed #3750076 hapmapAllelesMacaque.bed # Load tables. cd /hive/data/genomes/hg18/bed/hapmap/genotypes/2009-02_phaseII+III hgLoadBed hg18 hapmapAllelesChimp hapmapAllelesChimp.bed \ -tab -renameSqlTable -sqlTable=$HOME/kent/src/hg/lib/hapmapAllelesOrtho.sql #Loaded 4057739 elements of size 13 hgLoadBed hg18 hapmapAllelesMacaque hapmapAllelesMacaque.bed \ -tab -renameSqlTable -sqlTable=$HOME/kent/src/hg/lib/hapmapAllelesOrtho.sql ############################################################################# # HAPMAP REL27 SUMMARY FOR HGTRACKS FILTERING (DONE 3/5/09 angie) cd /hive/data/genomes/hg18/bed/hapmap/genotypes/2009-02_phaseII+III time hapmapPhaseIIISummary . #115.244u 5.009s 2:10.08 92.4% 0+0k 0+0io 2pf+0w time hgLoadBed hg18 hapmapPhaseIIISummary hapmapPhaseIIISummary.bed \ -tab -renameSqlTable -sqlTable=$HOME/kent/src/hg/lib/hapmapPhaseIIISummary.sql #Loaded 4166007 elements of size 18 #33.401u 3.275s 1:46.95 34.2% 0+0k 0+0io 0pf+0w ############################################################################# # DOWNLOAD HAPMAP PHASED GENOTYPES (PHASE III) (DONE 2/23/09 angie) mkdir -p /hive/data/outside/hapmap/phasing/2009-02_phaseIII/HapMap3_r2 cd /hive/data/outside/hapmap/phasing/2009-02_phaseIII/HapMap3_r2 wget --timestamping \ ftp://ftp.hapmap.org/pub/hapmap/public/phasing/2009-02_phaseIII/HapMap3_r2/\* foreach pop (ASW CEU CHD GIH JPT+CHB LWK MEX MKK TSI YRI) foreach type (DUOS TRIOS UNRELATED) mkdir -p $pop/$type pushd $pop/$type wget --timestamping \ ftp://ftp.hapmap.org/pub/hapmap/public/phasing/2009-02_phaseIII/HapMap3_r2/$pop/$type/\* popd end end # Looks like phased genotypes are given only for the populations with # family structure: ASW, CEU, MEX, MKK, and YRI. # Next: use these data to make LD tracks. 
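# (hypothetical check, file naming assumed)  One quick way to confirm which
# populations actually shipped phased genotype files, supporting the
# observation above:
#   cd /hive/data/outside/hapmap/phasing/2009-02_phaseIII/HapMap3_r2
#   find . -name '*phased*' | awk -F/ '{print $2}' | sort | uniq -c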
############################################################################# # HAPMAP LD COMPUTED ON PHASED & UNPHASED GENOTYPES (TODO angie) ############################################################################# # GERP Conservation scoring and elements for Ensembl 31-way alignments # From Javier Guerroro # ENCODE-related data (equested by Margulies, for use by ENCODE analysis group) # (2009-03-05 kate) ssh hgwdev cd /cluster/data/hg18/bed mkdir -p ensembl31wayGerp/lab cd ensembl31wayGerp/lab wget -r ftp://ftp.ebi.ac.uk/pub/databases/ensembl/encode/31way_msa/ cd .. bzcat lab/31way_gerp_elements.bed.bz2 | \ tail -n +2 | \ sed 's/31way_gerp_elem_365000000/gerp31./' | \ hgLoadBed hg18 ensembl31wayGerpElements stdin \ -sqlTable=$HOME/kent/src/hg/lib/encode/broadPeak.sql -renameSqlTable # Loaded 1464897 elements of size 9 cat > we.csh << 'EOF' foreach f (lab/*.wig.bz2) echo $f bzcat $f | tail -n +2 | wigEncode stdin temp.wig temp.wib end 'EOF' # << emacs bzcat lab/*.wig.bz2 | tail -n +2 | \ wigEncode stdin ensembl31wayGerpScores.wig ensembl31wayGerpScores.wib # load database mkdir /gbdb/hg18/wib ln -s `pwd`/ensembl31wayGerpScores.wib /gbdb/hg18/wib hgLoadWiggle -pathPrefix=/gbdb/hg18/wib hg18 ensembl31wayGerpScores ensembl31wayGerpScores.wig ############################################################################ # VEGA GENES UPDATE (BUILD 33) (DONE 2008-03-11 Andy) mkdir /cluster/data/hg18/bed/vega33 cd /cluster/data/hg18/bed/vega33 wget --timestamping "ftp://ftp.sanger.ac.uk/pub/vega/human/*" \ "ftp://ftp.sanger.ac.uk/pub/vega/human/pep/*.tot.fa.gz" zcat gtf_file.gz | sed -e "s/^\([0-9XY][0-9]*\)/chr\1/; s/^MT/chrM/; s/^Un/chrUn/" \ | grep "^chr" > nonHaps.gtf zcat gtf_file.gz | sed -e "s/^\([0-9XY][0-9]*\)/chr\1/; s/^MT/chrM/; s/^Un/chrUn/" \ | grep -v "^chr" > haps.gtf awk 'BEGIN{OFS="\t";FS="\t";}{ if ($1 == "c6_COX") { if (($4 >= 28688544) && ($5 <= 33420241)) print; } else if ($1 == "c6_QBL") { if (($4 >= 28885510) && ($5 <= 33451440)) print;}}' haps.gtf > keeptHaps.gtf liftUp -type=.gtf lifted.gtf /cluster/data/hg18/jkStuff/ensGene.haplotype.lift carry keeptHaps.gtf cat nonHaps.gtf lifted.gtf > all.gtf gzip all.gtf rm *.gtf gtfToGenePred -infoOut=infoOut.txt -genePredExt all.gtf.gz stdout | gzip > all.gp.gz /cluster/home/hiram/kent/src/hg/utils/automation/extractGtf.pl infoOut.txt > ensGtp.tab genePredCheck -db=hg18 all.gp.gz #checked: 69859 failed: 0 zcat all.gtf.gz | grep -i pseudo > pseudo.gtf zcat all.gtf.gz | grep -v -i pseudo > not.pseudo.gtf gtfToGenePred -genePredExt pseudo.gtf pseudo.gp gtfToGenePred -genePredExt not.pseudo.gtf not.pseudo.gp genePredCheck -db=hg18 pseudo.gp #checked: 6901 failed: 0 genePredCheck -db=hg18 not.pseudo.gp #checked: 62958 failed: 0 hgLoadGenePred -genePredExt hg18 vegaGene not.pseudo.gp hgLoadGenePred -genePredExt hg18 vegaPseudoGene pseudo.gp ############################################################################# # COVERAGE FOR 1000 GENOMES HIGH-COV INDIVIDS (IN PROGRESS 6/10/09 angie) #TODO: try again now that wigToBigWig is more mem-efficient # also, new alignments have probably become available since then. # wigBedToStep ran out of memory on hgwdev (w/limit of 32G)... 
roll own: cd /hive/data/outside/1000genomes/ncbi/ftp-trace.ncbi.nih.gov/1000genomes foreach s (NA12878 NA12891 NA12892 NA19238 NA19239 NA19240) pushd data/$s/alignment foreach p (454 SLX) echo "==== $s $p ====" ls -1 $s.chrom*.$p.SRP000032.2009_04.bam \ | grep -v chromMT \ | xargs -L 1 samtools pileup \ | perl -pe '($c, $start, undef, $depth) = split; \ if ($c ne $lastC || $start != $lastStart+1) { \ print "fixedStep chrom=chr$c start=$start step=1 span=1\n"; \ } \ $_ = "$depth\n"; \ ($lastC, $lastStart) = ($c, $start);' \ | gzip -c > cov${s}By{$p}.fixedStep.gz echo "" end popd end #TODO # Killing memory -- run separately: | wigToBigWig -clip stdin /hive/data/genomes/hg18/chrom.sizes cov${s}By$p.bw #[bam_pileup] fail to read the header of NA12878.chromY.454.SRP000032.2009_04.bam: non-exisiting file or wrong format. # NA12878.chromY.454.SRP000032.2009_04.bam is an empty file. # Load tables foreach bw (`find /hive/data/outside/1000genomes/ncbi/ftp-trace.ncbi.nih.gov/1000genomes \ -name cov\*.bw`) ln -s $bw /gbdb/hg18/bbi/ hgsql hg18 -e "drop table if exists $bw:t:r; \ create table $bw:t:r (fileName varchar(255) not null); \ insert into $bw:t:r values ('/gbdb/hg18/bbi/$bw:t');" end ############################################################################# # 1000 GENOMES HIGH-COV INDIVIDS READ ALIGNMENTS (DONE 11/30/09 angie) # one-off to test BAM as track type: cd /hive/data/outside/1000genomes/ncbi/ftp-trace.ncbi.nih.gov/1000genomes set testBam = NA12878/alignment/NA12878.chrom22.SRP000032.2009_02.bam ln -s `pwd`/$testBam{,.bai} \ /gbdb/hg18/bbi/ hgsql hg18 -e "drop table if exists bamNA12878; \ create table bamNA12878 (fileName varchar(255) not null); \ insert into bamNA12878 values ('/gbdb/hg18/bbi/$testBam:t');" # 9/14/09: update bamNA12878 to use new seqName column and try samtools' # capability to fetch ftp sparsely: hgsql hg18 -e "drop table if exists bamNA12878; \ create table bamNA12878 (fileName varchar(255) not null, \ seqName varchar(255) not null); \ insert into bamNA12878 values ('ftp://ftp-trace.ncbi.nih.gov/1000genomes/ftp/pilot_data/data/NA12878/alignment/NA12878.chrom21.SLX.maq.SRP000032.2009_07.bam', '21'); \ insert into bamNA12878 values ('/gbdb/hg18/bbi/NA12878.chrom22.SLX.SRP000032.2009_04.bam', '22');" # 11/30/09: Add more remote files: foreach c (1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 X Y) hgsql hg18 -e "insert into bamNA12878 values ('ftp://ftp-trace.ncbi.nih.gov/1000genomes/ftp/pilot_data/data/NA12878/alignment/NA12878.chrom$c.SLX.maq.SRP000032.2009_07.bam', '$c');" end # Add an all-remote NA12891 for testing composite track: hgsql hg18 -e "create table bamNA12891 (fileName varchar(255) not null, \ seqName varchar(255) not null);" foreach c (1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 X Y) hgsql hg18 -e "insert into bamNA12891 values ('ftp://ftp-trace.ncbi.nih.gov/1000genomes/ftp/pilot_data/data/NA12891/alignment/NA12891.chrom$c.SLX.maq.SRP000032.2009_07.bam', '$c');" end ############################################################################## # UCSC to Ensembl chr name mapping (DONE - 2009-05-08 - Hiram) mkdir /hive/data/genomes/hg18/bed/ucscToEnsembl cd /hive/data/genomes/hg18/bed/ucscToEnsembl awk '{printf "%s\t%s\n", $4, $2}' ../../jkStuff/ensGene.haplotype.lift \ > ucscToEnsembl.tab cat << '_EOF_' > ucscToEnsembl.sql # UCSC to Ensembl chr name translation CREATE TABLE ucscToEnsembl ( ucsc varchar(255) not null, # UCSC chromosome name ensembl varchar(255) not null, # Ensembl chromosome name #Indices PRIMARY KEY(ucsc(21)) ); 
'_EOF_' hgsql hg18 < ucscToEnsembl.sql hgsql hg18 \ -e 'LOAD DATA LOCAL INFILE "ucscToEnsembl.tab" INTO TABLE ucscToEnsembl' awk '{printf "%s\t%d\n", $2, -$1}' ../../jkStuff/ensGene.haplotype.lift \ > ensemblLift.tab cat << '_EOF_' > ensemblLift.sql # UCSC offset to Ensembl coordinates CREATE TABLE ensemblLift ( chrom varchar(255) not null, # Ensembl chromosome name offset int unsigned not null, # offset to add to UCSC position #Indices PRIMARY KEY(chrom(6)) ); '_EOF_' hgsql hg18 < ensemblLift.sql hgsql hg18 \ -e 'LOAD DATA LOCAL INFILE "ensemblLift.tab" INTO TABLE ensemblLift' ############################################################################## # FOX2 CLUSTERS (DONE 2009-04-08, Andy) cp cluster.combine.bed /hive/data/genomes/hg18/bed/fox2ClipSeq ## (got the data as an attachment from Gene Yeo) cd /hive/data/genomes/hg18/bed/fox2ClipSeq grep chr cluster.combine.bed | cut -f1-4 | \ bedSort stdin fox2ClipClusters.hg17.bed liftOver fox2ClipClusters.hg17.bed \ /gbdb/hg17/liftOver/hg17ToHg18.over.chain.gz \ fox2ClipClusters.bed unmapped.bed hgLoadBed hg18 fox2ClipClusters{,.bed} ############################################################################## # RE-BUILD sno/miRNA TRACK (DONE, 2009-06-11 - 2009-06-13, hartera) # The data in this track is out of date so update the track. mkdir -p /hive/data/genomes/hg18/bed/wgRna-2009-06-11 cd /hive/data/genomes/hg18/bed/wgRna-2009-06-11 # Download GFF file of latest miRNA annotations from miRBase at the # Wellcome Trust Sanger Institute (WTSI). This is Release 13.0 (March # 2009) wget --timestamping \ ftp://ftp.sanger.ac.uk/pub/mirbase/sequences/CURRENT/genomes/hsa.gff # Re-format, need to add "chr" to the beginning of each line. sed -e 's/^/chr/' hsa.gff > hsMirBaseFormat.gff # Remove extra "chr" in comment lines perl -pi.bak -e 's/chr#/#/' hsMirBaseFormat.gff # Change chrMT to chrM perl -pi.bak -e 's/chrMT/chrM/' hsMirBaseFormat.gff # Remove all but ID name in last field sed -e 's/\";//g' hsMirBaseFormat.gff | sed -e 's/ID=\"//g' \ | sed -e 's/ACC=\"MI[0-9]*\s//' > hsMirBaseFormatIdOnly.gff # use score 960 for + strand and 480 for - strand. This will show # up black on the track for + strand and grey for - strand. # Starts appear to be 1-based when compared to miRNAs in current track # and those in Ensembl. # Confirmed with Sam Griffith-Jones (one of the authors of miRBase, # sam.griffith-jones@manchester.ac.uk) that these GFF coordinates # are 1-based. # Also add thickStart and thickEnd columns and "miRNA" for type. awk 'BEGIN {FS="\t"} {OFS="\t"} \ {if ($0 !~ /#/ && $7 == "+") \ print $1, $4-1, $5, $9, 960, $7, 0, 0, "miRNA"; \ else if ($0 !~ /#/ && $7 == "-") \ print $1, $4-1, $5, $9, 480, $7, 0, 0, "miRNA";}' \ hsMirBaseFormatIdOnly.gff > hsMirBaseFormatIdOnly.bed # 2009-06-12 # snoRNAs are from snoRNABase at http://www-snorna.biotoul.fr/ # Download coordinates for hg18 from # http://www-snorna.biotoul.fr/coordinates.php # This is version 3 of the database. # save as tab-separated file: snoRNABaseVersion3Coords.txt and remove # first and last lines. perl -pi.bak -e 's/\"//g' snoRNABaseVersion3Coords.txt # Reformat to BED format with thickStart and thickEnd set to 0.
awk 'BEGIN {FS="\t"} {OFS="\t"} \ {if ($4 == "+") \ print $1, $2-1, $3, $5, 960, $4, 0, 0,$6; \ else if ($4 == "-") \ print $1, $2-1, $3, $5, 480, $4, 0, 0,$6;}' \ snoRNABaseVersion3Coords.txt > snoRNABaseVersion3Coords.bed # Merge the miRNA and snoRNA files together cat hsMirBaseFormatIdOnly.bed snoRNABaseVersion3Coords.bed \ > wgRna20090611.bed # Load into separate table rather than overwriting wgRna cp -p /cluster/home/hartera/src/hg/lib/wgRna.sql wgRnaJun09.sql perl -pi.bak -e 's/TABLE wgRna/TABLE wgRnaJun09/' wgRnaJun09.sql hgLoadBed -sqlTable=wgRnaJun09.sql hg18 wgRnaJun09 wgRna20090611.bed # Reading wgRna20090611.bed # Loaded 1120 elements of size 9 # Sorted # Creating table definition for wgRnaJun09 # Saving bed.tab # Loading hg18 # Clean up rm *.bak hgsql -e 'select count(*) from wgRna;' hg18 # 1059 # for miRNAs: 685 (676 unique names) # and others: 374 including 21 scaRNA hgsql -e 'select count(*) from wgRnaJun09;' hg18 # 1120 # for miRNAs: 718 (705 unique) # and others: 402 including 21 scaRNA # 2009-06-13 # Renamed the old wgRna track to wgRnaOld and renamed the new wgRnaJun09 # track to wgRna. Will keep the old track around for a while until # new track checked and QA'd. hgsql -e 'alter table wgRna rename wgRnaOld;' hg18 hgsql -e 'alter table wgRnaJun09 rename wgRna;' hg18 ################## ## Uniqueness Track: Step one (courtesy of John Castle, Rosetta) ## Make oligos of length XX # Perl one-liner to make a batch file # I've included the perl files CNV_makereads2.pl (simply uses substr on a chromosome) and fastagrep.pl (to remove sequences with Ns # The files chr$x.fa are the individual chromosomes perl -e 'for ($i = 1;$i<= 25; $i++) {$x = $i; if ($i == 23) {$x = 'X';} if ($i == 24) {$x = 'Y';} if ($i == 25) {$x = 'M';} print "~/DTcode/CNV_makereads2.pl 100 /info/genome/Projects/721/ref/chr$x.fa | fastagrep.pl -v n > chr$x.fa\n";}' > batch_chr_get #!/usr/bin/perl -w #--------------------------------------------------------------------- # C O P Y R I G H T N O T I C E #--------------------------------------------------------------------- # Copyright (c) 2001 Rosetta Inpharmatics, Inc. # 12040 115th Avenue NE, Kirkland, WA 98034-6900 # All Rights Reserved. Reproduction, adaptation, or # translation without prior written permission of # Rosetta Inpharmatics, Inc. is prohibited. #--------------------------------------------------------------------- # CNV_makereads.pl # $Id: hg18.txt,v 1.422 2010/06/02 23:00:02 angie Exp $ #use lib ('/home/castlej/perl/','/home/castlej/OSDTools/','/home/castlej/DTcode/'); #use strict; my $oligo_length = $ARGV[0]; my $file = $ARGV[1]; open(IN,$file); $/ = "\n>";# change input line separator to '>' to suck up FASTA sequences while ($line= ) { $line =~ s/^>//m; # remove '>' from end of $line $line =~ s/>$//m; # remove Unigene lines starting with '#' $line =~ s/\n\#.*$//m; # get sequence id $line =~ /^\s*(\S+).*([^\0]*)/; $id = $1; $seq = $2; $seq =~ s/\n//g; } if ($id =~ /(chr\S+)\.nib/) { $chr = $1; } elsif ($id =~ /(chr\S+)/) { $chr = $1; } for ($i = 0; $i $chr:$i-$j\n$a\n"; } #!/usr/bin/perl -w #--------------------------------------------------------------------- # C O P Y R I G H T N O T I C E #--------------------------------------------------------------------- # Copyright (c) 2000,2001,2002 Rosetta Inpharmatics, Inc. # 12040 115th Avenue NE, Kirkland, WA 98034-6900 # All Rights Reserved. Reproduction, adaptation, or # translation without prior written permission of # Rosetta Inpharmatics, Inc. is prohibited. 
#--------------------------------------------------------------------- # # $Id: hg18.txt,v 1.422 2010/06/02 23:00:02 angie Exp $ # # finds selected sequences in FASTA by regex matching in defline or sequence use strict; my( $option, $regex, @regexes, %tofind, $exceptflag, $key, $value, $line, ); $exceptflag = 0; unless (scalar(@ARGV)) { print "\nUsage: $0 [OPTION] PATTERN [FASTAFILE]\n"; print "$0 finds sequences by pattern matching in FASTA format data\n\n"; exit; } while ((scalar(@ARGV)) && ($ARGV[0] =~ /^-(\w+)/)) { $option = $1; shift(@ARGV); if ($option =~ /v/) { # user wants sequences NOT matching regex(es) $exceptflag = 1; } if ($option =~ /s/) { # regex on command line push(@regexes, shift(@ARGV)); } if ($option =~ /f/) { # user wants list of regexes from file open(INHANDLE, "<$ARGV[0]") || die "$0: error, can't open regex list file $ARGV[0]\n"; while (defined($regex = )) { chomp $regex; push(@regexes, $regex); } shift(@ARGV); } } if (scalar(@regexes) < 1) { push(@regexes, shift(@ARGV)); } $/ = "\n>"; # change input line separator to suck up FASTA sequences SEQUENCE: while (defined($line = <>)) { # remove '>' from start of first $line $line =~ s/^>//m; # stick '>' back on all $lines $line = '>'.$line; # remove '>' from end of $line $line =~ s/>$//m; # remove Unigene lines starting with '#' $line =~ s/\n\#.*$//m; foreach $regex (@regexes) { if ($line =~ /$regex/) { unless ($exceptflag) { print $line; } next SEQUENCE; } } if ($exceptflag) { print $line; } } # Submit batch file to cluster (we use LSF), each line is a submission perl -ne 'chomp; $a = "bsub -q short64 \"$_\"\n"; system($a);' batch_chr_get #################### # Uniqueness Step two # I've used an older version of BWA. The newer version from sourceforge outputs a binary file which then must be converted to a text file # HG18 is the human genome # I could include banything_2GBNew.pl but it is simply a cluster "chunk and submit" code # Method 1 perl -e 'for ($i =1;$i<= 25; $i++) {$x = $i; if ($i == 23) {$x = 'X';} elsif ($i == 24) {$x = 'Y';} elsif ($i == 25) {$x = 'M';} print "banything_2GbNew2.pl -a /ifs65/dtap/bin/bwa/bwa-0.2.0/bwa -z 1000000 -in chr$x.fa -o chr$x.bwa -stdout chr$x.bwa -pre \"aln -o 0 /info/dtap/projects/1057_CNV/HG18/HG18 \" -suf \" \" \n";}' >! batch_banything chmod +777 batch_banything batch_banything # Method 2 perl -e 'for ($i =1;$i<= 25; $i++) {$x = $i; if ($i == 23) {$x = 'X';} elsif ($i == 24) {$x = 'Y';} elsif ($i == 25) {$x = 'M';} print "/ifs65/dtap/bin/bwa/bwa-0.2.0/bwa aln -o 0 /info/dtap/projects/1057_CNV/HG18/HG18 chr$x.fa > chr$x.bwa\n"}' >! 
chmod +777 batch_banything
perl -ne 'chomp; $a = "bsub -q long64 \"$_\"\n"; system($a);' batch_banything

#####################
# Uniqueness Step three
# I ran this one-liner from a higher level directory
perl -e '$pwd = `pwd`; chomp($pwd); @a = `ls`; foreach $dir (@a) {chomp ($dir); unless ($dir =~ /(\d+)mer_2nd/) {next;}; @b = `ls $dir/*fa.bwa`; foreach $file (@b) {chomp($file); $f = "$pwd/$file"; $f =~ /^(\S+chr[^\.]+)\.*/; $e = $1; print "~/DTcode/CNV_parseBWA_wiggle.pl 100 1 $f\* > $e.quality.100.wiggle\n";}}' > batch_wiggle
# Submit batch file to cluster (we use LSF), each line is a submission
perl -ne 'chomp; $a = "bsub -q long64 \"$_\"\n"; system($a);' batch_wiggle

#!/usr/bin/perl -w
# John Castle
# May 19, 2009
# $Cap       a maximum value to clip data with
# $Use_score whether to output the uniqueness score or the number of hits
# @FilesIn   the BWA text output files to scan
# ** NOTE ** The newer BWA algorithm outputs a binary file that is then made
# into a text file using BWA again. However, the text file output has a
# slightly different format so the parsing will need to change.
($Cap, $Use_score, @FilesIn) = @ARGV;
if ($FilesIn[0] =~ /\.gz/) { open(IN,"gzip -dc $FilesIn[0] |") }
else { open(IN,$FilesIn[0]); }
#### Description
@a = split("\t", <IN>);
$a[6] =~ /(\d+)/;
$len = $1;
close(IN);
### Wiggle header text
if ($Use_score == 0) {
    print "track type=wiggle_0 name=\"Alignment scores of $len\mer as\" description=\"Unique $len mer alignments\" color=100,50,150 gridDefault=on yLineOnOff=on visibility=full maxHeightPixels=40:40:12\n";
} else {
    print "track type=wiggle_0 name=\"$len\mer alignment scores\" description=\"$len\mer alignment scores from BWA/MAQ, where 37 indicates a unique alignment\" color=100,50,150 gridDefault=on yLineOnOff=on visibility=full maxHeightPixels=40:40:12\n";
}
### Parse through file(s)
foreach $file (@FilesIn) {
    if ($file =~ /\.gz/) { open(IN,"gzip -dc $file |"); }
    else { open(IN,$file); }
    @a = split("\t", <IN>);
    $a[0] =~ /(chr\S+):(\d+)/;
    $Chr = $1;
    $start = $2;
    $score = $a[5];
    $hits = $a[11];
    if ($hits > $Cap) {$hits = $Cap;}
    if ($Use_score == 1) {$value = $score;} else {$value = $hits;}
    while (<IN>) {
        # Make wiggle track, with start and end coordinates for same scoring regions
        @a = split("\t",$_);
        if ($#a < 15) { next; }
        $a[0] =~ /(chr\S+):(\d+)/;
        $chr = $1;
        $pos = $2;
        $score = $a[5];
        $hits = $a[11];
        if ($hits > $Cap) {$hits = $Cap;}
        if ($Use_score == 1) {$x = $score; } else {$x = $hits;}
        if ($x != $value) {
            print "$Chr\t$start\t$pos\t$value\n";
            $Chr = $chr;
            $value = $x;
            $start = $pos;
        }
    }
    print "$Chr\t$start\t$pos\t$value\n";
    close(IN);
}

############################################################################
# Re-Run equCab2 alignment (DONE - 2009-06-29,07-02 - Hiram)
mkdir /hive/data/genomes/hg18/bed/lastzEquCab2.2009-06-29
cd /hive/data/genomes/hg18/bed/lastzEquCab2.2009-06-29
cat << '_EOF_' > DEF
# Human vs.
Horse BLASTZ_M=50 # TARGET: Human hg18 SEQ1_DIR=/scratch/data/hg18/bothMaskedNibs SEQ1_LEN=/scratch/data/hg18/chrom.sizes SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY: Horse SEQ2_DIR=/scratch/data/equCab2/equCab2.2bit SEQ2_LEN=/scratch/data/equCab2/chrom.sizes SEQ2_CTGDIR=/hive/data/genomes/equCab2/equCab2.UnScaffolds.2bit SEQ2_CTGLEN=/hive/data/genomes/equCab2/equCab2.UnScaffolds.sizes SEQ2_LIFT=/hive/data/genomes/equCab2/jkStuff/equCab2.chrUn.lift SEQ2_CHUNK=20000000 SEQ2_LIMIT=100 SEQ2_LAP=0 BASE=/hive/data/genomes/hg18/bed/lastzEquCab2.2009-06-29 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs time doBlastzChainNet.pl `pwd`/DEF \ -noLoadChainSplit -verbose=2 -bigClusterHub=swarm \ -workhorse=hgwdev \ -chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 & # real 582m47.015s # failed due to power failure - Mon Jun 29 23:32:54 PDT 2009 time doBlastzChainNet.pl `pwd`/DEF \ -noLoadChainSplit -verbose=2 -bigClusterHub=swarm \ -continue=chainRun -workhorse=hgwdev \ -chainMinScore=3000 -chainLinearGap=medium > chainRun.log 2>&1 & # real 430m13.886s cat fb.hg18.chainEquCab2Link.txt # 1647122438 bases of 2881515245 (57.162%) in intersection mkdir /hive/data/genomes/equCab2/bed/blastz.hg18.swap cd /hive/data/genomes/equCab2/bed/blastz.hg18.swap time doBlastzChainNet.pl \ /hive/data/genomes/hg18/bed/lastzEquCab2.2009-06-29/DEF \ -noLoadChainSplit -verbose=2 -bigClusterHub=swarm \ -swap -workhorse=hgwdev \ -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 & # real 238m42.004s cat fb.equCab2.chainHg18Link.txt # 1622340736 bases of 2428790173 (66.796%) in intersection ############################################################################ # Fantom Cage 4 Track (2009-07-16) cd /projects/compbiousr/sugnet/projects/cage-20090428 mkdir data cd data # Get the Human tags from Riken's download site. 
wget -r -l 3 http://fantom.gsc.riken.jp/4/download/Tables/human/CAGE/mapping/ # Apparently time series with hours at: # 4,5,6,8,10,11,15,21,22,27,28,33,34,35,37,40,42,43,45,47,48,49,51,52,53,57,59,61,62,63,64,65,69,73,74,91,92,93,h95 ctrls, i02, i03 # Goto the data directory cd /projects/compbiousr/sugnet/projects/cage-20090428/data/fantom.gsc.riken.jp/4/download/Tables/human/CAGE/mapping/ # Unzip data for bz in `ls *.bz2`; do \ echo "Unzipping $bz"; \ bunzip2 $bz; \ done # From column headers it looks like the values of interest are: # 0 = id # 1 = library_count # 2 = edit_string # 3 = chrom # 4 = strand # 5 = start # 6 = end # Pull the raw scores into a single file cat h*_mapping.tbl.txt | grep -v '^#' | grep -v 'library_count' | grep 'chr' | perl -ne '$l=$_; @w = split /\t/, $l; print "$w[3]\t$w[5]\t$w[6]\t$w[0]\t$w[1]\t$w[4]\n";' > all.wscores.bed cat << '_EOF_' > toBed.pl #!/usr/bin/perl $prefix = shift(@ARGV); $prefix =~ s/h/H/g; while($l = <>) { if(!($l=~ /^\#/) && !($l=~/^id/)) { chomp($l); @w = split /\t/, $l; $score = 100 * $w[1]; if($score > 1000) { $score = 1000; } $name = $prefix; $size = $w[6] - $w[5]; print "$w[3]\t$w[5]\t$w[6]\t$prefix\t$score\t$w[4]\t$w[5]\t$w[6]\t0\t1\t$size,\t0,\n"; } } '_EOF_' # << happy emacs chmod 755 toBed.pl # Make the top level bed track for f in `ls *mapping.tbl.txt`; do root=`basename $f .txt`; prefix=`basename $f _mapping.tbl.txt`; bed=$root.bed; echo "Reading from $f into $bed with prefix $prefix"; toBed.pl $prefix < $f > $bed; done; # Call program in stats mode to generate summary statistics about how many reads there are in a sliding window around # sites with tags cageSingleTrack -input=all.wscores.bed -forward=all.forward.plaw.scores -reverse=all.reverse.plaw.scores -stats-only # Grab every 100th record to make a bite (byte?) 
sized chunk for R cat all.forward.plaw.scores | perl -e '$c = 0; while($l=<>) { if($c++ % 100 == 0) { print "$l"; } }' > sample.txt # Some R code to fit a power law model and get coefficient via log/log line fit d = read.table('sample.txt'); # Grab all the data less than 200 counts (81% of data) as that is where the model really fits dd = d$V4[d$V4 < 200] # Use hist command to find counts at each bucket size h = hist(dd, 200, plot=F) # Take the logs y = log10(h$counts) x = log10(h$breaks[1:198]) # Fit a robust line library(MASS) r = rlm(y~x) # Call: # rlm(formula = y ~ x) # Converged in 5 iterations # # Coefficients: #(Intercept) x # 3.987744 -1.196954 # Visually note that the data fits a power law nicely plot(log10(h$breaks[1:198]),log10(h$counts), xlab="Log10 Tags In Window", ylab="Log10 Number of Times Occuring", main="Distribution of CAGE Tags in Sliding 35bp Window") abline(r) # Using the coefficient learned above predict the posterior probability of seeing this observation cageSingleTrack -input=all.wscores.bed -forward=all.forward.plaw.bg2 -reverse=all.reverse.plaw.bg2 -alpha=1.196954 -xmax=198 # Load up the bed graph tracks hgLoadBed -bedGraph=4 hg18 FantomCageForwardPowerLawGraph all.forward.plaw.bg2 hgLoadBed -bedGraph=4 hg18 FantomCageReversePowerLawGraph all.reverse.plaw.bg2 ############################################################################ # ENCODE PHASED GENOTYPES for NA12878 (DONE 7/22/09 angie) mkdir /hive/data/genomes/hg18/bed/phasedGenotypesNA12878 cd /hive/data/genomes/hg18/bed/phasedGenotypesNA12878 wget http://illumina-mac.stanford.edu/NA12878_Reference_Genome/code/CEU.trio.dec.with.x.with.rs.calls wget http://illumina-mac.stanford.edu/NA12878_Reference_Genome/code/{Makefile,PhaseSNPs.pm} #TODO: strip homozyg-same-as-reference SNPs from CEU.trio, then make: make NA12878_SNPs_Phased.bed perl -wpe '/^(\w+)\t(\d+)\t(\d+)\t([ACGT])\/([ACGT])\t([MP\/HA]+)$/ || die "parse\n$_\t"; \ ($c, $s, $e, $a1, $a2, $t) = ($1, $2, $3, $4, $5, $6); \ if ($t eq "M/P") { \ $_ = "$c\t$s\t$e\tM:$a1\n" . "$c\t$s\t$e\tP:$a2\n"; \ } elsif ($t eq "P/M") { \ $_ = "$c\t$s\t$e\tM:$a2\n" . "$c\t$s\t$e\tP:$a1\n"; \ } elsif ($t eq "H") { \ $_ = "$c\t$s\t$e\t$a1\n"; \ } elsif ($t eq "A") { \ $_ = "$c\t$s\t$e\tA:$a1/$a2\n"; \ } else { die "unrec type $t"; } \ ' NA12878_SNPs_Phased.bed \ > phasedGenotypesNA12878.bed hgLoadBed -noNameIx hg18 phasedGenotypesNA12878 phasedGenotypesNA12878.bed #Loaded 5469032 elements of size 4 ############################################################################ # TRANSMAP vertebrate.2009-07-01 build (2009-07-21 markd) vertebrate-wide transMap alignments were built Tracks are created and loaded by a single Makefile. This is available from: svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2009-07-01 see doc/builds.txt for specific details. 
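A minimal sketch of pulling that tagged tree for inspection (assuming read access
to the compbio svn repository; the local directory name below is arbitrary):
    svn co svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2009-07-01 \
        transMap.vertebrate.2009-07-01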
############################################################################ # rnaBinding RNA Binding Proteins (2009-07-28 markd) # contributor: Jeremy Sanford # sfrs1Input BED table: # need to drop color, as it's in the wrong column # skip header tawk 'NR>1{print $1,$2,$3,$4,$5,$6}' Input_sequence_blocks.bed | hgLoadBed hg18 sfrs1Input stdin # sfrs1Clip BED table: # skip header tawk 'NR>1{print $1,$2,$3,$4,$5,$6}' SFRS1_CLIP_sequence_blocks.bed | hgLoadBed hg18 sfrs1Clip stdin # SFRS1_consensus_sites.wig tawk 'NR>1' SFRS1_consensus_sites.wig | wigEncode stdin sfrs1ConsensusSites.wig sfrs1ConsensusSites.wib # Converted stdin, upper limit 11.63, lower limit -28.64 hgLoadWiggle -pathPrefix=/gbdb/hg18/wib hg18 sfrs1ConsensusSites sfrs1ConsensusSites.wig ln -s $(pwd -P)/sfrs1ConsensusSites.wib /gbdb/hg18/wib/ ############################################################################ # VEGA GENES UPDATE TO BUILD 35 (DONE, 2009-08-04 - 2009-09-09, hartera) # Needs updating as the current version is build 33. # Download the human VEGA Genes posted on ftp site on 2009-03-31 # 2009-08-03 (hartera) - Added code to register track handler for # vegaGeneComposite. # 2009-08-15 - 2009-08-16 (hartera) - Added code to allow use of radio buttons # on the configuratio page for the track item labels. Modified code so it # can be shared with Ensembl to create the links to Vega transcript, gene # and protein reports on the details pages. # 2009-08-22 - Finished code for adding Vega report URLs to the details pages. # Loaded the vegaGtp table. # 2009-09-01 - 2009-09-02 (hartera). Loaded a vegaPep table for the protein # sequence link on the details pages. # 2009-09-04 Re-load all tables as some reverted to the older version during # mySQL 5 upgrade. # 2009-09-08 - 2009-09-09 Code change to change message on details page when # no protein is available and change to trackDb to make vegaGene items a # darker blue colour. Reloaded vegaPep after removing proteins whose # transcripts are not in vegaGtp to make all.joiner happy. mkdir /hive/data/genomes/hg18/bed/vega35 cd /hive/data/genomes/hg18/bed/vega35 wget --timestamping "ftp://ftp.sanger.ac.uk/pub/vega/human/*" \ "ftp://ftp.sanger.ac.uk/pub/vega/human/pep/*.tot.fa.gz" zcat gtf_file.gz | sed -e "s/^\([0-9XY][0-9]*\)/chr\1/; s/^MT/chrM/; s/^Un/chrUn/" \ | grep "^chr" > nonHaps.gtf zcat gtf_file.gz | sed -e "s/^\([0-9XY][0-9]*\)/chr\1/; s/^MT/chrM/; s/^Un/chrUn/" \ | grep -v "^chr" > haps.gtf awk 'BEGIN{OFS="\t";FS="\t";}{ if ($1 == "c6_COX") { if (($4 >= 28688544) && ($5 <= 33420241)) print; } else if ($1 == "c6_QBL") { if (($4 >= 28885510) && ($5 <= 33451440)) print;}}' haps.gtf > keptHaps.gtf liftUp -type=.gtf lifted.gtf /cluster/data/hg18/jkStuff/ensGene.haplotype.lift carry keptHaps.gtf cat nonHaps.gtf lifted.gtf > all.gtf # Do this to create the infoOut.txt file and extract the extra information gtfToGenePred -infoOut=infoOut.txt -genePredExt all.gtf stdout | gzip > tempAll.gp.gz ~/kent/src/hg/utils/automation/extractGtf.pl infoOut.txt > vegaGtp.tab # Change the gene name to have the gene_id label so that this is in the # name2 field of the extended genePred table. This can then be displayed # at the track item label. 
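# For example, an attribute pair like (hypothetical values):
#   gene_id "OTTHUMG00000012345"; gene_name "ABC1";
# becomes, after the two substitutions below:
#   other_gene_id "OTTHUMG00000012345"; gene_id "ABC1";
# so gtfToGenePred -genePredExt carries the gene symbol through to name2.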
perl -pi.bak -e 's/gene_id/other_gene_id/' all.gtf perl -pi.bak -e 's/gene_name/gene_id/' all.gtf gzip all.gtf rm *.gtf tempAll.gp.gz # create genePred files for loading into database gtfToGenePred -genePredExt all.gtf.gz stdout | gzip > all.gp.gz genePredCheck -db=hg18 all.gp.gz # checked: 81244 failed: 0 zcat all.gtf.gz | grep -i pseudo > pseudo.gtf zcat all.gtf.gz | grep -v -i pseudo > not.pseudo.gtf gtfToGenePred -genePredExt pseudo.gtf pseudo.gp gtfToGenePred -genePredExt not.pseudo.gtf not.pseudo.gp genePredCheck -db=hg18 pseudo.gp # checked: 8331 failed: 0 genePredCheck -db=hg18 not.pseudo.gp # checked: 72913 failed: 0 hgLoadGenePred -genePredExt hg18 vegaGene not.pseudo.gp hgLoadGenePred -genePredExt hg18 vegaPseudoGene pseudo.gp # Added code to src/hg/hgTracks/simpleTracks.c to register a track handler # for vegaGeneComposite that is now used for this data. This used # vegaGeneMethods to display the name2 field (gene) as the item label in # the track. ############################################################################ # EPO ANCESTRAL REGIONS (DONE 8/5/09 angie) # Use Aspera client to download 1000Genomes' Enredo-Pecan-Ortheus # four-catarrhini ancestral-tree calls for genome regions, as well as # their annotated fasta (requested by Sol Katzman): cd /hive/data/outside/1000genomes/ncbi/ftp-trace.ncbi.nih.gov/1000genomes/ set asperaInst = /opt/aspera/connect set ascp = $asperaInst/bin/ascp set aKey = $asperaInst/etc/asperaweb_id_dsa.putty set aOpts = "-i $aKey -QTr -l300M -q" set server = anonftp@ftp-private.ncbi.nlm.nih.gov set aliDir = technical/reference/ancestral_alignments mkdir -p $aliDir cd $aliDir foreach f (MD5SUM README README.ancestral_alignments summary.txt \ $ascp $aOpts $server\:/1000genomes/ftp/$aliDir/$f . end foreach c (1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 X Y) $ascp $aOpts $server\:/1000genomes/ftp/$aliDir/human_ancestor$c.bed $ascp $aOpts $server\:/1000genomes/ftp/$aliDir/human_ancestor$c.fa.bz2 end chmod 444 * # Check md5sums: perl -wpe 'chomp; ($expSum, $f) = split; $actSum = `md5sum $f`; $actSum =~ s/ .*\n//; \ $_ = ""; \ if ($expSum ne $actSum) { warn "MISMATCH: $f exp=$expSum, actual=$actSum\n"; } \ else {print "$f OK\n";}' MD5SUM # Shortcut requested by Sol: ln -s `pwd` /hive/data/outside/ancestral.epo.hg18 # Load up the regions: mkdir /hive/data/genomes/hg18/bed/epoAncestralRegions cd /hive/data/genomes/hg18/bed/epoAncestralRegions set aliPath = /hive/data/outside/1000genomes/ncbi/ftp-trace.ncbi.nih.gov/1000genomes/$aliDir sed -e 's/^/chr/' $aliPath/human_ancestor_*.bed > epoAncestralRegions.bed hgLoadBed hg18 epoAncestralRegions epoAncestralRegions.bed -tab -allowStartEqualEnd #Loaded 10195 elements of size 4 featureBits hg18 epoAncestralRegions #2778857014 bases of 2881515245 (96.437%) in intersection featureBits hg18 -countGaps gap epoAncestralRegions #6232933 bases of 3107677273 (0.201%) in intersection # 2009-08-16 (hartera) # ensGtp table definition is in ~/kent/src/hg/lib/ensGtp.sql # There is an index on the protein field so it can not be NULL. # If there is no protein, the gene name is given. # Added code to hgTracks.c and hgTrackUi.c to allow the use of # radio buttons on the track configuratioin page to select the # gene name, accession or both to be displayed in the track. # The gene name is displayed by default. # Added code to hgc.c so that Ensembl and Vega can share code to # create links on the details pages to the Vega reports for transcript, # gene and protein through these IDs. 
Created new function # printEnsemblOrVegaCustomUrl(). # 2009-08-22 (hartera) # Create a vegaGtp table using the vegaGtp.tab file above. Use ensGtp.sql # to create the table. vegaGtp associates geneId/transcriptId/proteinId # for the links to Vega reports from the details page. If there is no # protein ID because the transcript is noncoding, the gene name is used # instead. This field can not be NULL in the table as there is an index # on it. cd /hive/data/genomes/hg18/bed/vega35 cp ~/kent/src/hg/lib/ensGtp.sql . # One of the gene names is long for a noncoding gene so it does not fit # in the protein ID field so change the protein field in ensGtp.sql # to allow 40 chars instead of 20 and re-load the table. hgsql -e 'drop table vegaGtp;' hg18 hgLoadSqlTab hg18 vegaGtp ensGtp.sql vegaGtp.tab # Loaded succesfully # Added code to hgc.c to use printEnsemblOrVegaCustomUrl() in # doVegaGene() to add the links to Vega reports on the details pages. # Code was added so that there is no protein sequence link on the details # page if it there is none available e.g. noncoding. # 2009-09-01 - 2009-09-02 (hartera) # Coding genes are displaying the message that there is no protein # prediction available. Need to add a vegaPep table. cd /hive/data/genomes/hg18/bed/vega35 # from the Ensembl process: zcat Homo_sapiens.VEGA.mar.pep.tot.fa.gz \ | sed -e 's/^>.* Transcript:/>/;' | gzip > vegaPep.txt.gz zcat vegaPep.txt.gz \ | ~/kent/src/utils/faToTab/faToTab.pl /dev/null /dev/stdin \ | sed -e '/^$/d; s/*$//' | sort > vegaPep.hg18.fa.tab # Load table hgPepPred hg18 tab vegaPep vegaPep.hg18.fa.tab # Add vegaPep to the trackDb.ra entry for the vegaGeneComposite track # in the type line for src/hg/makeDb/trackDb/human/hg18/trackDb.ra. # Check that the vegaPep table looks ok and then check protein-coding and # noncoding transcript details pages for protein links. # 2009-09-04, hartera # Re-load tables after upgrade to mySQL 5 as they had reverted back to # tables with the previous Vega dataset. cd /hive/data/genomes/hg18/bed/vega35 hgsql -e 'drop table vegaGene;' hg18 hgsql -e 'drop table vegaPseudoGene;' hg18 hgLoadGenePred -genePredExt hg18 vegaGene not.pseudo.gp hgLoadGenePred -genePredExt hg18 vegaPseudoGene pseudo.gp hgsql -e 'drop table vegaGtp;' hg18 hgLoadSqlTab hg18 vegaGtp ensGtp.sql vegaGtp.tab hgsql -e 'drop table vegaPep;' hg18 hgPepPred hg18 tab vegaPep vegaPep.hg18.fa.tab # 2009-09-08 (hartera). Changed message in code for details page when no # protein sequence is available to be more explanatory. "Non-protein # coding gene or gene fragment, no protein prediction available." Changed # the colouring for the vegaGene subtrack to be darker blue so there is # more of a contrast between vegaGene and vegaPseudoGene subtracks. # 2009-09-09 (hartera) - re-loaded vegaPep table with only those proteins # that have a transcript ID in vegaGtp. # all.joiner is complaining as there are about 23,000 extra proteins in # vegaPep that do not have transcripts in vegaGtp. Decided to remove these # and e-mailed the HAVANA group to ask about the discrepancy. 
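# The ID bookkeeping below confirms the overlap with sorted ID lists and comm;
# an equivalent filter could also be written without SQL by joining the peptide
# tab file against the transcripts that have an OTTHUMP protein in vegaGtp
# (a sketch only, bash syntax, output file name arbitrary; not what was run):
#   awk -F'\t' '$3 ~ /^OTTHUMP/ {print $2}' vegaGtp.tab | sort -u > txWithProt.ids
#   sort -k1,1 vegaPep.hg18.fa.tab | join -t $'\t' txWithProt.ids - > vegaPepFiltered.tab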
cd /hive/data/genomes/hg18/bed/vega35 awk '{print $2}' vegaGtp.tab | sort | uniq > vegaGtp.tx.ids awk '{print $1}' vegaPep.hg18.fa.tab | sort | uniq > vegaPep.tx.ids wc -l *.tx.ids # 81244 vegaGtp.tx.ids # 60003 vegaPep.tx.ids # Number of transcripts that have a protein ID: hgsql -Ne 'select transcript from vegaGtp where protein like "OTTHUMP%";' \ hg18 | sort | uniq > vegaGtpWithProt.tx.ids wc -l vegaGtpWithProt.tx.ids # 36747 vegaGtpWithProt.tx.ids # find those that are common to both. comm -12 vegaGtp.tx.ids vegaPep.tx.ids > pepandGtp.tx.ids wc -l pepandGtp.tx.ids # 36747 pepandGtp.tx.ids comm -12 pepandGtp.tx.ids vegaGtpWithProt.tx.ids | wc -l # 36747 # Therefore all the vegaGtp transcripts with a protein ID are in the # protein FASTA file. hgsql -Ne 'select * from vegaPep as p, vegaGtp as g where g.protein \ like "OTTHUMP%" and p.name = g.transcript;' hg18 \ > vegaPepOnlyInGtp.hg18.fa.tab wc -l vegaPepOnlyInGtp.hg18.fa.tab # 36747 vegaPepOnlyInGtp.hg18.fa.tab hgsql -e 'drop table vegaPep;' hg18 hgPepPred hg18 tab vegaPep vegaPepOnlyInGtp.hg18.fa.tab ############################################################################# # lsSnpPdb: import of LS-SNP/PDB data for SNP 130 (2009-02-02 markd) # down load from JHU ssh genbank sudo su - genbank cd /cluster/data/genbank ./bin/lsSnpPdbDownloadStep hg18 # load into hgwdev database ssh hgwdev cd /cluster/data/genbank ./bin/lsSnpPdbDbLoadStep hg18 # once this has been QAed, will auto-update from genbank scripts ############################################################################# # BURGE LAB DATA MAPPED WITH GEMMAPPER. PROVIDED BY THOMAS DERRIEN FROM RODERIC # GUIGO'S LAB AT CRG. (E-MAIL: thomas.derrien@crg.es) # (hartera, 2009-09-13 - 2009-09-16, DONE) # 2009-12-14, hartera. Set cdsStart = cdsEnd = 0. Moved track data directory to # /hive/data/genomes/hg18/bed. # 2010-01-04, hartera. Change the data to BED format and re-loaded tables. BED # is more appropriate for this data type. # The data is too dense in places (feedback from QA) so it would be more # appropriate to have a Signal track as for the ENCODE RNA-seq data tracks. # 2010-02-09, hartera. Create bedGraph Signal subtracks for each tissue/cell # using reads/per million mapped reads as the data value. # 2010-02-17, hartera. Updated trackDb.ra entry to include views. # 2010-05-15 and 2010-05-16, hartera. Re-created the Signal subtracks using # the -bed12 option of bedItemOverlapCount so that blocks are used. 
mkdir /hive/groups/gencode/browser/hg18/burgeRnaSeqGemMapperAlign cd /hive/groups/gencode/browser/hg18/burgeRnaSeqGemMapperAlign wget --timestamping "http://genome.imim.es/~tderrien/Gencode/For_Rachel/GFF_Burge_32bp_Gencode2b/GSM325476_brain_HCT168_hg18.fa.GEM_index.hg18.gencode_data.rel2b.length_32.gff.gz" wget --timestamping "http://genome.imim.es/~tderrien/Gencode/For_Rachel/GFF_Burge_32bp_Gencode2b/GSM325477_liver_HCT169_hg18.fa.GEM_index.hg18.gencode_data.rel2b.length_32.gff.gz" wget --timestamping "http://genome.imim.es/~tderrien/Gencode/For_Rachel/GFF_Burge_32bp_Gencode2b/GSM325478_heart_HCT170_hg18.fa.GEM_index.hg18.gencode_data.rel2b.length_32.gff.gz" wget --timestamping "http://genome.imim.es/~tderrien/Gencode/For_Rachel/GFF_Burge_32bp_Gencode2b/GSM325479_skelMuscle_HCT171_hg18.fa.GEM_index.hg18.gencode_data.rel2b.length_32.gff.gz" wget --timestamping "http://genome.imim.es/~tderrien/Gencode/For_Rachel/GFF_Burge_32bp_Gencode2b/GSM325480_colon_hg18.fa.GEM_index.hg18.gencode_data.rel2b.length_32.gff.gz" wget --timestamping "http://genome.imim.es/~tderrien/Gencode/For_Rachel/GFF_Burge_32bp_Gencode2b/GSM325481_adipose_hg18.fa.GEM_index.hg18.gencode_data.rel2b.length_32.gff.gz" wget --timestamping "http://genome.imim.es/~tderrien/Gencode/For_Rachel/GFF_Burge_32bp_Gencode2b/GSM325482_testes_hg18.fa.GEM_index.hg18.gencode_data.rel2b.length_32.gff.gz" wget --timestamping "http://genome.imim.es/~tderrien/Gencode/For_Rachel/GFF_Burge_32bp_Gencode2b/GSM325483_lymphNode_hg18.fa.GEM_index.hg18.gencode_data.rel2b.length_32.gff.gz" wget --timestamping "http://genome.imim.es/~tderrien/Gencode/For_Rachel/GFF_Burge_32bp_Gencode2b/GSM325484_HCT204_bt474_.fa.GEM_index.hg18.gencode_data.rel2b.length_32.gff.gz" wget --timestamping "http://genome.imim.es/~tderrien/Gencode/For_Rachel/GFF_Burge_32bp_Gencode2b/GSM325485_HCT205_HME_.fa.GEM_index.hg18.gencode_data.rel2b.length_32.gff.gz" wget --timestamping "http://genome.imim.es/~tderrien/Gencode/For_Rachel/GFF_Burge_32bp_Gencode2b/GSM325486_HCT202_s2468_.fa.GEM_index.hg18.gencode_data.rel2b.length_32.gff.gz" wget --timestamping "http://genome.imim.es/~tderrien/Gencode/For_Rachel/GFF_Burge_32bp_Gencode2b/GSM325487_HCT203_s2468.fa.GEM_index.hg18.gencode_data.rel2b.length_32.gff.gz" wget --timestamping "http://genome.imim.es/~tderrien/Gencode/For_Rachel/GFF_Burge_32bp_Gencode2b/GSM325488_HCT206_s2468_.fa.GEM_index.hg18.gencode_data.rel2b.length_32.gff.gz" wget --timestamping "http://genome.imim.es/~tderrien/Gencode/For_Rachel/GFF_Burge_32bp_Gencode2b/GSM325489_HCT207_s2468_.fa.GEM_index.hg18.gencode_data.rel2b.length_32.gff.gz" cd /hive/groups/gencode/browser/hg18/burgeRnaSeqGemMapperAlign # Load this data into tables for hg18. # Unzip the files: gunzip *.gff.gz # Create a file with the list of file names and tissues. ls *.gff > burgeDataFiles.txt GSM325486_HCT202_s2468 breast GSM325487_HCT203_s2468 MCF-7 GSM325488_HCT206_s2468 MB435 GSM325489_HCT207_s2468 T47D # Did not map these two as they are not 32 bp. GSM325490_brain_s1368 MAQC mixed human brain tissue/cell lines GSM325491_UHR_s247 MAQC_UHR mixed human cell lines # Edit the file above to add a tab separation between file name and tissue # name. Then remove the "read_name: " from the last field in each # file otherwise it gets included in the name and load the data into hg18. 
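# For example, a GFF attribute column ending in (hypothetical read id):
#   read_name: HWI-EAS229_1:2:1:12:345
# should be reduced to just:
#   HWI-EAS229_1:2:1:12:345
# so that only the read identifier is used as the item name when loading.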
# Write a script to do this: cat << '_EOF_' > formatAndLoadData #!/bin/bash -e # Assign variables # Tab-separated file of file names and tissue/cell line names DATAFILES=$1 # track name used as prefix for subtracks TRACK=$2 # database DATABASE=$3 cat $DATAFILES | while read file tissue; do subTrack=`echo $TRACK$tissue` echo $subTrack sed -e 's/read_name:\s//' $file > ${subTrack}.gff ldHgGene -exon=read $DATABASE ${subTrack} ${subTrack}.gff done '_EOF_' # << emacs chmod +x formatAndLoadData ./formatAndLoadData burgeDataFiles.txt burgeRnaSeqGemMapperAlign hg18 \ > load.log & # Added a trackDb entry in # ccds/trunk/gencode/browser/trackDb/human/hg18/trackDb.ra # 2009-12-14, Need to change cdsStart = cdsEnd = 0 in the table as this # data should have no CDS defined. Currently cdsStart = cdsEnd = txEnd. cd /hive/groups/gencode/browser/hg18/burgeRnaSeqGemMapperAlign hgsql -Ne 'show tables like "burge%";' hg18 > burgeTables foreach t (`cat burgeTables`) echo $t hgsql -e "update $t set cdsStart = 0;" hg18 hgsql -e "update $t set cdsEnd = 0;" hg18 end # Then move data to directory in hg18 genome bed directory cd /hive/data/genomes/hg18/bed mv /hive/groups/gencode/browser/hg18/burgeRnaSeqGemMapperAlign ./ # 2010-01-04 Change the data to BED format. For genePred format, # there is always a track configuration added for colouring tracks by # genomic codons which does not make sense for this data. Also, BED is # more appropriate for this data type. cd /hive/data/genomes/hg18/bed/burgeRnaSeqGemMapperAlign # Convert gff to genePred and then genePred to BED, drop old table and # then load database with BED format data. Need to fix the cdStart and # cdsEnd fields to be 0. foreach f (`ls burgeRnaSeqGemMapperAlign*.gff`) echo $f >> bed.log set g=$f:r echo $g ldHgGene -exon=read -nobin -out=${g}.gp hg18 $g $f >>& bed.log awk 'BEGIN {FS="\t"} {OFS="\t"} {print $1,$2,$3,$4,$5,0,0,$8,$9,$10}' \ ${g}.gp > ${g}Fixed.gp genePredToBed ${g}Fixed.gp > ${g}.bed echo "Dropping table $g" hgsql -e "drop table ${g};" hg18 hgLoadBed hg18 $g ${g}.bed >>& bed.log end # Changed track type in trackDb/human/trackDb.ra to bed 12 and # then did make alpha in trackDb directory. # 2010-02-17 # trackDb.ra entry in trackDb/human was updated to include views for the # Raw Signal and Alignment subtracks. # 2010-05-15 and 2010-05-16. Add Signal tracks so it is easier to view the # data in regions where there is a high density of reads. cd /hive/data/genomes/hg18/bed/burgeRnaSeqGemMapperAlign # Use bedItemOverlapCount to get counts of overlapping items for each base. # Need to sort the bed files and then get the number of reads mapped for # that tissue. Divide the counts by the number of million mapped reads to # get reads per million mapped reads as the data value. # Re-make the subtracks using the -bed12 option so that blocks are used # instead of just the first three fields of the BED file as is the default. rm *.count *.bedGraph foreach f (`ls *.bed`) echo $f set g=$f:r sort ${f} | bedItemOverlapCount -bed12 hg18 stdin > ${f}.count set size=`hgsql -Ne "select count(distinct name) from ${g};" hg18` awk -v size=${size} 'BEGIN {OFS="\t"} {print $1,$2,$3,($4 / (size/1000000));}' ${f}.count > ${g}.bedGraph end # Load the bedGraph tables into the database as Raw Signal tracks. 
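# (Each bedGraph value computed above is reads per million mapped reads: the
# per-base overlap count divided by (number of distinct mapped reads / 1e6).
# For example, hypothetically, 12 overlapping reads in a library of 4,000,000
# distinct mapped reads gives 12 / (4000000/1000000) = 3.0.)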
foreach f (`ls *.bedGraph`) echo $f set g=$f:r hgsql -e "drop table ${g}AllRawSignal;" hg18 hgLoadBed -bedGraph=4 hg18 ${g}AllRawSignal $f >>& load.log end ############################################################################ # TRANSMAP vertebrate.2009-09-13 build (2009-09-20 markd) vertebrate-wide transMap alignments were built Tracks are created and loaded by a single Makefile. This is available from: svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2009-09-13 see doc/builds.txt for specific details. ############################################################################ # ADD LINK TO GENENETWORK (DONE. 11/06/09 Fan). # Received geneNetwork ID list file, GN_human_RefSeq.txt, for hg18 from GeneNetwork, Zhou Xiaodong [xiaodong.zhou@gmail.com]. ssh hgwdev mkdir -p /cluster/data/hg18/bed/geneNetwork cd /cluster/data/hg18/bed/geneNetwork hgsql hg18 < ~/src/hg/lib/geneNetworkId.sql hgsql hg18 -e \ 'load data local infile "GN_human_RefSeq.txt" into table geneNetworkId' ######################################################################### # BUILD snpArrayIllumina HumanCytoSNP-12 SUB-TRACK (DONE 12/4/09, Fan) # Received raw data file HumanCytoSNP-12_forUCSC.csv # from Illumina, Jennifer L. Stone Ph.D., jstone@illumina.com # mkdir -p /hive/data/genomes/gs.19/build36/bed/snp/illumina/120309 # cd /hive/data/genomes/gs.19/build36/bed/snp/illumina/120309 cat HumanCytoSNP-12_forUCSC.csv |\ sed -e 's/,/\t/g' >HumanCytoSNP.tab hgsql hg18 -e 'drop table snpArrayIlluminaHumanCytoSNP_12Raw' hgsql hg18 < ~/src/hg/lib/snpArrayIlluminaHumanCytoSNP_12Raw.sql hgsql hg18 -e 'load data local infile "HumanCytoSNP.tab" into table snpArrayIlluminaHumanCytoSNP_12Raw ignore 1 lines' ~/src/hg/snp/snpLoad/illuminaLookup1M hg18 snpArrayIlluminaHumanCytoSNP_12Raw snp130 # The illuminaLookup1M generate two files: # # illuminaLookup.out contains all probes found in snp130 # illuminaLookup.err contains all probes not found in snp130 mv illuminaLookup.out illuminaLookupHumanCytoSNP_12a.out cut -f 1 illuminaLookup.err >j.1 cat j.1 |sed -e 's/chrMt/chrM/' |\ sed -e 's/XY/X/' >j.chr cut -f 2-5 illuminaLookup.err >j.2 cut -f 6 illuminaLookup.err >j.3 cat j.3 |sed -e 's/F/+/' |sed -e 's/R/-/' >j.strand cut -f 7 illuminaLookup.err |sed -e "s/\[//" |sed -e "s/\]//" >j.observed paste j.chr j.2 j.strand j.observed >illuminaLookupHumanCytoSNP_12b.out # combine two parts cat illuminaLookupHumanCytoSNP_12a.out illuminaLookupHumanCytoSNP_12b.out >snpArrayIlluminaHumanCytoSNP_12.tab # load the table hgLoadBed -allowStartEqualEnd hg18 snpArrayIlluminaHumanCytoSNP_12 snpArrayIlluminaHumanCytoSNP_12.tab -tab -sqlTable=snpArrayIlluminaHumanCytoSNP_12.sql ############################################################################# # BUILD snpArrayIllumina Human660W-Quad SUB-TRACK (DONE 12/9/09, Fan) # Received raw data file Human660W.ucsc.csv # from Illumina, Jennifer L. 
Stone Ph.D., jstone@illumina.com mkdir -p /hive/data/genomes/gs.19/build36/bed/snp/illumina/120809 cd /hive/data/genomes/gs.19/build36/bed/snp/illumina/120809 cat Human660W.ucsc.csv|\ sed -e 's/,/\t/g' >Human660W.tab hgsql hg18 -e 'drop table snpArrayIlluminaHuman660W_QuadRaw' hgsql hg18 < ~/src/hg/lib/snpArrayIlluminaHuman660W_QuadRaw.sql hgsql hg18 -e 'load data local infile "Human660W.tab" into table snpArrayIlluminaHuman660W_QuadRaw ignore 1 lines' ~/src/hg/snp/snpLoad/illuminaLookup1M hg18 snpArrayIlluminaHuman660W_QuadRaw snp130 # The illuminaLookup1M generate two files: # # illuminaLookup.out contains all probes found in snp130 # illuminaLookup.err contains all probes not found in snp130 mv illuminaLookup.out illuminaLookupHuman660W_Quada.out cut -f 1 illuminaLookup.err >j.1 cat j.1 |sed -e 's/chrMt/chrM/' |\ sed -e 's/XY/X/' >j.chr cut -f 2-5 illuminaLookup.err >j.2 cut -f 6 illuminaLookup.err >j.3 cat j.3 |sed -e 's/F/+/' |sed -e 's/R/-/' >j.strand cut -f 7 illuminaLookup.err |sed -e "s/\[//" |sed -e "s/\]//" >j.observed paste j.chr j.2 j.strand j.observed >illuminaLookupHuman660W_Quadb.out # combine two parts cat illuminaLookupHuman660W_Quada.out illuminaLookupHuman660W_Quadb.out >snpArrayIlluminaHuman660W_Quad.tab # load the table hgLoadBed -allowStartEqualEnd hg18 snpArrayIlluminaHuman660W_Quad snpArrayIlluminaHuman660W_Quad.tab -tab -sqlTable=snpArrayIlluminaHuman660W_Quad.sql ############################################################################# # BUILD snpArrayIllumina Human Omni1-Quad SUB-TRACK (DONE 12/9/09, Fan) # Received raw data file Omni.ucsc.txt # from Illumina, Jennifer L. Stone Ph.D., jstone@illumina.com # mkdir -p /hive/data/genomes/gs.19/build36/bed/snp/illumina/120309 # cd /hive/data/genomes/gs.19/build36/bed/snp/illumina/120309 cat Omni.ucsc.txt |\ sed -e 's/,/\t/g' >HumanOmni1.tab hgsql hg18 -e 'drop table snpArrayIlluminaHumanOmni1_QuadRaw' hgsql hg18 < ~/src/hg/lib/snpArrayIlluminaHumanOmni1_QuadRaw.sql hgsql hg18 -e 'load data local infile "HumanOmni1.tab" into table snpArrayIlluminaHumanOmni1_QuadRaw ignore 1 lines' ~/src/hg/snp/snpLoad/illuminaLookup1M hg18 snpArrayIlluminaHumanOmni1_QuadRaw snp130 # The illuminaLookup1M generate two files: # # illuminaLookup.out contains all probes found in snp130 # illuminaLookup.err contains all probes not found in snp130 mv illuminaLookup.out illuminaLookupHumanOmni1_Quada.out cut -f 1 illuminaLookup.err >j.1 cat j.1 |sed -e 's/chrMt/chrM/' |\ sed -e 's/XY/X/' >j.chr cut -f 2-5 illuminaLookup.err >j.2 cut -f 6 illuminaLookup.err >j.3 cat j.3 |sed -e 's/F/+/' |sed -e 's/R/-/' >j.strand cut -f 7 illuminaLookup.err |sed -e "s/\[//" |sed -e "s/\]//" >j.observed paste j.chr j.2 j.strand j.observed >illuminaLookupHumanOmni1_Quadb.out # combine two parts cat illuminaLookupHumanOmni1_Quada.out illuminaLookupHumanOmni1_Quadb.out >snpArrayIlluminaHumanOmni1_Quad.tab # load the table hgLoadBed -allowStartEqualEnd hg18 snpArrayIlluminaHumanOmni1_Quad snpArrayIlluminaHumanOmni1_Quad.tab -tab -sqlTable=snpArrayIlluminaHumanOmni1_Quad.sql ############################################################################# # NHGRI GWAS CATALOG (DONE 2/4/13 angie) # 2013 updates: 2/4 # 2012 updates: 12/10, 10/4, 8/1, 6/4, 4/4, 2/21 (remove extra whitespace, translate non-ASCII to html), 2/6 # Updated 12/7/11, 11/2/11, 10/3/11, 9/2/11, 8/1/11, 6/9/11, 4/1/11, 3/1/11, 2/1/11 # Updated 12/7/10, 11/1/10, 10/6/10, 9/1/10, 8/2/10, 6/2/10, 5/12/10, 4/1/10, 3/1/10 # Originally done 1/19/10 # Area of possible future improvement: for SNPs that 
can't be mapped via our SNP track, # could some of them be obsolete IDs that have been merged into current IDs? mkdir /hive/data/genomes/hg18/bed/gwasCatalog cd /hive/data/genomes/hg18/bed/gwasCatalog # Done once, don't need to redo: cut -f 1-4 ../snp130/snp130.bed \ | sort -k4,4 \ > snp130Coords.bed set today = `date +%y%m%d` mkdir /hive/data/genomes/hg18/bed/gwasCatalog/$today cd /hive/data/genomes/hg18/bed/gwasCatalog/$today wget http://www.genome.gov/admin/gwascatalog.txt head -1 gwascatalog.txt | sed -re 's/\t/\n/g' # Compare to original column headers -- some additions in June 2011 (2nd column): # 1 1 Date Added to Catalog # 2 2 PubMedID # 3 3 First Author # 4 4 Date # 5 5 Journal # 6 6 Link # 7 7 Study # 8 8 Disease/Trait # 9 9 Initial Sample Size # 10 10 Replication Sample Size # 11 11 Region # 12 Chr_id # 13 Chr_pos # 12 14 Reported Gene(s) # 15 Mapped Gene # 16 Upstream_gene_id # 17 Downstream_gene_id # 18 Snp_gene_ids # 19 Upstream_gene_distance # 20 Downstream_gene_distance # 13 21 Strongest SNP-Risk Allele # 14 22 SNPs # 23 Merged # 24 Snp_id_current # 25 Context # 26 Intergenic # 15 27 Risk Allele Frequency # 16 28 p-Value # 29 Pvalue_mlog # 17 30 p-Value (text) # 18 31 OR or beta # 19 32 95% CI (text) # 20 33 Platform [SNPs passing QC] # 21 34 CNV # Original columns of interest: pretty much all except for Date Added to the Catalog, # and Link which can be generated from PubMedID. Watch out for these: # * Some rows don't name a SNP ("" or "NR") -- in that case, skip. # * Risk allele is not always just a number, may have desc # * Missing data may be "", "NR", "NS" or "Pending" # June 2011 new columns: ignore for now; make new table format if user demand # Use SNPs (comma-sep list) to map to genome coords, and strongest SNP-Risk Allele # as bed 4+ name. perl -MEncode -we 'while (<>) { \ next if (/^\s*$/); \ s/\r$//; \ @w = split("\t"); \ next if ($w[21] !~ /^rs\d+/); \ if ($w[3] =~ /^(\d+)\/(\d+)\/(\d+)$/) { # transform to mysql DATE \ ($month, $day, $year) = ($1, $2, $3); \ $w[3] = "$year-$month-$day"; \ } else { die "Cant parse date ($w[3])\t" } \ $w[21] =~ s/ //g; \ my @snps = split(",", $w[21]); \ # discard columns (use descending order): \ foreach $i (28, 25, 24, 23, 22, 21, 19, 18, 17, 16, 15, 14, 12, 11, 5, 0) { \ splice(@w, $i, 1); \ } \ # trim leading/trailing spaces if any; \ # convert the Unicode in titles to HTML because non-ASCII gives Galaxy trouble. \ foreach $i (0 .. $#w) { \ $w[$i] =~ s/^\s*//; $w[$i] =~ s/\s*$//; \ # ugh, clean out non-utf8 stuff before decoding utf8 into unicode: \ $w[$i] =~ s/\226/-/g; $w[$i] =~ s/\327/x/g; $w[$i] =~ s/\317\?/τ/g; \ $w[$i] =~ s/\342\?\?/1<\/sub>/g; $w[$i] =~ s/\347/c/g; \ $w[$i] =~ s/\351/e/g; $w[$i] =~ s/\353/e/g; \ $w[$i] = decode_utf8($w[$i], Encode::FB_CROAK); \ @chars = split(//, $w[$i]); \ $w[$i] = ""; \ foreach $c (@chars) { \ if (ord($c) > 127) { \ $c = sprintf "&#%d;", ord($c); \ } \ $w[$i] .= $c; \ } \ } \ foreach $s (@snps) { \ print join("\t", $s, @w) . 
"\n"; \ } \ }' \ gwascatalog.txt \ | sort > noCoords.tab join -t " " -1 4 ../snp130Coords.bed noCoords.tab \ -o 1.1,1.2,1.3,1.4,2.2,2.3,2.4,2.5,2.6,2.7,2.8,2.9,2.10,2.11,2.12,2.13,2.14,2.15,2.16,2.17,2.18,2.19 \ | sort -k1,1 -k2n,2n \ > gwasCatalog.bed hgLoadBed hg18 gwasCatalog gwasCatalog.bed \ -tab -sqlTable=$HOME/kent/src/hg/lib/gwasCatalog.sql -notItemRgb -allowStartEqualEnd #Read 10796 elements of size 22 from gwasCatalog.bed # For David: find examples of risk alleles for which dbSNP observed # alleles are complementary (A/T or C/G) -- how do we know what strand the # risk allele is on?? -- asked corresp. author Teri Manolio. Info is not # always available in the original publication, so sadly there is not always # a way to resolve these. GWAS catalog folks aren't going to modify their # database to add a column for these cases. hgsql hg18 -NBe 'select snp.name,gc.riskAllele,snp.strand,snp.refNcbi,snp.observed \ from gwasCatalog as gc, snp130 as snp \ where gc.riskAllele rlike "^rs[0-9]+-[ACGT]" and \ gc.name = snp.name and snp.observed in ("C/G", "A/T") \ order by gc.name;' > ambigStrand.txt wc -l ambigStrand.txt #689 ambigStrand.txt ############################################################################# # CRG MAPABILITY (2010-01-19 - 2010-01-28, hartera, DONE) # Data was provided by Thomas Derrien (thomas.derrien.crg.es) and Paolo Ribeca # from the Guigo lab at the Center for Genomic Regulation (CRG) in Barcelona. # Data was produced using their GEM mapper aligner taking sliding k-mers # window of the human genome that were mapped back onto the genome with up # to 2mismatches. For each window, a mappability score is computed # S = 1/(nb of match_found) and the BigWig index was created according to # this score. # 2010-01-26 Loaded tables and added data to /gbdb/ # 2010-01-28 Changed the table names to have wgEncode prefix for consistency. # Added trackDb entry for the subtracks to the ENCODE Mapability track entry. # 2010-03-16 - 2010-03-18. Added metadata to trackDb for the subtracks and# # added downloads for the bigWig data files. # 2010-04-28 Received new data from Thomas Derrien. Downloaded data and # added it to /gbdb/. A bug was found in a library used by bedGraphToBigWig # so sent a new binary to data providers and they re-created the bigWig files. # 2010-05-12. Updated downloads for the new data files. mkdir -p /hive/data/genomes/hg18/bed/crgMapability cd /hive/data/genomes/hg18/bed/crgMapability cat << 'EOF' > temp #!/bin/tcsh -ef http://genome.imim.es/~tderrien/UCSC_Tracks/ALL_mappablity_hg18_H.sapiens.genome.hg18.main.mappability-36.bw.bz2 http://genome.imim.es/~tderrien/UCSC_Tracks/ALL_mappablity_hg18_H.sapiens.genome.hg18.main.mappability-40.bw.bz2 http://genome.imim.es/~tderrien/UCSC_Tracks/ALL_mappablity_hg18_H.sapiens.genome.hg18.main.mappability-50.bw.bz2 http://genome.imim.es/~tderrien/UCSC_Tracks/ALL_mappablity_hg18_H.sapiens.genome.hg18.main.mappability-75.bw.bz2 http://genome.imim.es/~tderrien/UCSC_Tracks/ALL_mappablity_hg18_H.sapiens.genome.hg18.main.mappability-100.bw.bz2 'EOF' awk '{if ($0 ~ /#/) print; else print "wget --timestamping \"" $0 "\"";}' \ temp > download.csh rm temp chmod +x download.csh ./download.csh >& download.log & # Add the data to /gbdb/ and load the file names into tables (2010-01-26) cd /hive/data/genomes/hg18/bed/crgMapability bunzip2 *.bz2 # Add data to gbdb mkdir -p /gbdb/hg18/bbi/ # Symlink files with names as crgMapabilityAlignXmer.bw to /gbdb/hg18/bbi # and load file name into a table - one per dataset. 
Each table # represents a subtrack. foreach f (`ls *.bw`) echo $f set g=`echo $f | cut -d "-" -f2` set num=`echo $g | cut -d "." -f1` set mer=`echo "${num}mer"` set nf=`echo "crgMapabilityAlign${mer}.bw"` echo $nf ln -s `pwd`/${f} /gbdb/hg18/bbi/${nf} hgsql hg18 -e "drop table if exists crgMapabilityAlign${mer}; \ create table crgMapabilityAlign${mer} (fileName varchar(255) not null); \ insert into crgMapabilityAlign${mer} values ('/gbdb/hg18/bbi/${nf}');" end # 2010-01-28. # Renamed the tables to have a wgEncode prefix for consistency. cd /hive/data/genomes/hg18/bed/crgMapability hgsql -Ne 'show tables like "crg%";' hg18 > tables.txt foreach t (`cat tables.txt`) set g=`echo $t | sed -e 's/c/C/'` hgsql -e "alter table ${t} rename enc${g};" hg19 end # Added a trackDb entry for this subtrack of the ENCODE Mapability # track in kent/src/hg/makeDb/trackDb/human/hg18/trackDb.wgEncode.ra # use bigWigInfo to check min and max values. # 2010-03-16 - 2010-03-18 # Added metadata to the trackDb entries for the subtracks and # added downloads for these data files. cd /usr/local/apache/htdocs/goldenPath/hg18/encodeDCC/wgEncodeMapability cp -p /gbdb/hg18/bbi/crg*.bw gzip crg*.bw # Edited the preamble.html in # hg18/encodeDCC/wgEncodeMapability/ to include the CRG dataset. # Run encodeDownloadsPage.pl to generate the index page for downloads. # It does not capture all the information probably because the subtrack # name is different to the downloads name so change the file names and # re-load the tables and make the downloads. cd /hive/data/genomes/hg18/bed/crgMapability foreach f (`ls *.bw`) echo $f set g=`echo $f | cut -d "-" -f2` set num=`echo $g | cut -d "." -f1` set mer=`echo "${num}mer"` set of=`echo "crgMapabilityAlign${mer}.bw"` set nf=`echo "wgEncodeCrgMapabilityAlign${mer}.bw"` echo $nf rm /gbdb/hg18/bbi/${of} ln -s `pwd`/${f} /gbdb/hg18/bbi/${nf} hgsql hg18 -e "drop table if exists wgEncodeCrgMapabilityAlign${mer}; \ create table wgEncodeCrgMapabilityAlign${mer} (fileName varchar(255) not null); \ insert into wgEncodeCrgMapabilityAlign${mer} values ('/gbdb/hg18/bbi/${nf}');" end cd /usr/local/apache/htdocs/goldenPath/hg18/encodeDCC/wgEncodeMapability rm crg* cp -p /gbdb/hg18/bbi/wgEncodeCrg*.bw . gzip wgEncodeCrg*.bw # Then run encodeDownloadsPages.pl /cluster/home/hartera/bin/encodeDownloadsPage.pl -checksum \ -preamble=preamble.html index.html . # Downloaded and added new bigWig files to /gbdb/hg18/bbi # (2010-04-28, hartera). New files were created as there was a bug # in the older version of bedGraphToBigWig. cd /hive/data/genomes/hg18/bed/crgMapability rm temp download.csh download.log cat << 'EOF' > temp #!/bin/tcsh -ef http://genome.crg.es/~tderrien/UCSC_Tracks/H.sapiens.genome.hg18.main.mappability-100.bw.bz2 http://genome.crg.es/~tderrien/UCSC_Tracks/H.sapiens.genome.hg18.main.mappability-36.bw.bz2 http://genome.crg.es/~tderrien/UCSC_Tracks/H.sapiens.genome.hg18.main.mappability-40.bw.bz2 http://genome.crg.es/~tderrien/UCSC_Tracks/H.sapiens.genome.hg18.main.mappability-50.bw.bz2 http://genome.crg.es/~tderrien/UCSC_Tracks/H.sapiens.genome.hg18.main.mappability-75.bw.bz2 'EOF' awk '{if ($0 ~ /#/) print; else print "wget --timestamping \"" $0 "\"";}' \ temp > download.csh rm temp chmod +x download.csh ./download.csh >& download.log & # Add data to /gbdb/. The file names in /gbdb/ are the same as before # so the tables do not need to be reloaded. 
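# (A quick optional spot check once the symlinks below are refreshed, e.g.:
#    bigWigInfo /gbdb/hg18/bbi/wgEncodeCrgMapabilityAlign36mer.bw
#  min should be just above 0 and max should be 1.0, since each value is
#  1/(number of genomic matches) for the k-mer starting at that base.)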
cd /hive/data/genomes/hg18/bed/crgMapability bunzip2 *.bz2 foreach f (`ls *.bw`) echo $f set g=`echo $f | cut -d "-" -f2` set num=`echo $g | cut -d "." -f1` set mer=`echo "${num}mer"` set nf=`echo "wgEncodeCrgMapabilityAlign${mer}.bw"` echo $nf rm /gbdb/hg18/bbi/${nf} ln -s `pwd`/${f} /gbdb/hg18/bbi/${nf} end # 2010-05-12 # Updated downloads for the new data files. cd /usr/local/apache/htdocs/goldenPath/hg18/encodeDCC/wgEncodeMapability rm wgEncodeCrg* cp -p /gbdb/hg18/bbi/wgEncodeCrg*.bw . gzip wgEncodeCrg*.bw rm md5sum.txt # Run encodeDownloadsPage.pl to generate the index page for downloads # and generate new md5sum.txt file for the data. encodeDownloadsPage.pl -checksum -db=hg19 index.html ##################################################################### # tRNAs track (2010-03-12, Fan RE-BUILT) # ssh hgwdev cd /hive/data/genomes/gs.19/build36/bed mkdir tRNAs cd tRNAs # Get data files from /projects/lowelab/users/lowe/Browser/vertebrates/ cp -p /projects/lowelab/users/lowe/Browser/vertebrates/hg18-tRNAs.bed . cp -p \ /projects/lowelab/users/lowe/Browser/vertebrates/hg18_tRNAs_images.tar . hgsql hg18 -e 'drop table if exists tRNAs' hgLoadBed -tab hg18 tRNAs hg18-tRNAs.bed -sqlTable=$HOME/kent/src/hg/lib/tRNAs.sql mkdir gif cd gif tar -xvf ../hg18_tRNAs_images.tar mv images/*.gif . rm -rf images mkdir /hive/data/gbdb/hg18/RNA-img rm /hive/data/gbdb/hg18/RNA-img/* cp -p * /hive/data/gbdb/hg18/RNA-img ##################################################################### # PAR track (2010-02-18, markd DONE) cd /hive/data/genomes/hg18/bed/par/ # create hg18.par using the documented coordinates hgPar hg18 hg18.par par ##################################################################### # H-INVITATIONAL GENE ANNOTATION DATABASE (Working 2010-0226, chin) #http://h-invitational.jp/hinv/ahg-db/index.jsp # Create knownGene table to reference HINV gene ID's # for link on knownGenes details page # Also, create an HINV gene track # download CDNA file H-InvDB_7.0 (Feb 26, 2010) -- got release # from downloads page). # ftp://ftp.ddbj.nig.ac.jp/mirror_database/hinv/jbirc_ff/annotation/ mkdir /cluster/data/hinv/H-InvDB_7.0 cd /cluster/data/hinv/H-InvDB_7.0 wget --timestamp \ ftp://ftp.ddbj.nig.ac.jp/mirror_database/hinv/jbirc_ff/annotation/FCDNA.gz # HH-Inv(7.0) mkdir /hive/data/genomes/hg18/bed/hinv7.0 cd /hive/data/genomes/hg18/bed/hinv7.0 cat << '_EOF_' > hinvToBed.pl #!/usr/bin/env perl use strict; use warnings; my $chr=""; my $start=""; my $end=""; my %accNoDups; my $invId = ""; my $invIdVer = ""; my $accNo = ""; my $strand = ""; my $cai = 0; open (FH, "zcat FCDNA.gz|") or die "can not zcat FCDNA.gz"; while (my $line = ) { my ($id, $tag, $rest) = split('\s+', $line, 3); if ($line =~ m/^CDNA_H-INVITATIONAL-ID:/ ) { $invId = $tag; } elsif ($line =~ m/^CDNA_H-INVITATIONAL-ID-VERSION:/ ) { $invIdVer = $tag; } elsif ($line =~ m/^CDNA_CHROMOSOME-NUMBER:/ ) { $chr = $tag; } elsif ($line =~ m/^CDNA_STRAND:/ ) { $strand = $tag; } elsif ($line =~ m/^PREDICTED-ORF_CAI:/ ) { $cai = int($tag * 1000); } elsif ($line =~ m/^CDNA_START:/ ) { $start = $tag; } elsif ($line =~ m/^CDNA_END:/ ) { $end = $tag; } elsif ($line =~ m/^CDNA_ACCESSION-NO:/ ) { $accNo = $tag; } elsif ($line =~ m/CDNA_CLUSTER-ID:/ ) { if (length($accNo) > 0) { next if ($chr eq "UM"); if (length($start) < 1 || length($end) < 1) { printf STDERR "no start,end ? chr%s\t%s\n", $chr, $invIdVer; } else { die "have accession but no ID ?" 
if (length($invId) < 1); $invIdVer =~ s/\.[0-9]+$//; printf "chr%s\t%d\t%d\t%s\t%d\t%s\n", $chr, $start, $end, $invIdVer, $cai, $strand; } } $accNo = ""; $invId = ""; $invIdVer = ""; $chr = ""; $start = ""; $end = ""; $cai = 0; $strand = ""; } } close (FH); '_EOF_' # << happy emacs ln -s /hive/data/outside/hinv/H-InvDB_7.0/FCDNA.gz . chmod +x hinvToBed.pl time ./hinvToBed.pl | grep -v chr6_hla_hap | sort -k1.1 -k2.2n > hinv7.0.bed # zcat: FCDNA.gz: decompression OK, trailing garbage ignored # real 3m1.060s # user 3m14.142s # sys 0m10.961s # verify the new table does not exist hgsql -e "show tables" hg18 | grep -i hinv hgLoadBed -verbose=2 hg18 HInvGeneMrnaBed hinv7.0.bed # Reading hinv7.0.bed # Loaded 217721 elements of size 4 hgsql -e "show tables" hg18 | grep -i hinv # HInv # HInvGeneMrna # HInvGeneMrnaBed # knownToHInv # knownXToHInv # check the coverage featureBits hg18 HInvGeneMrnaBed # 1350541623 bases of 2881515245 (46.869%) in intersection # exon only featureBits hg18 HInvGeneMrna # 82136473 bases of 2881515245 (2.850%) in intersection # measure exon and intron to compare hgsql -N -e "select tName, tStart, tEnd, qName, strand from HInvGeneMrna;" \ hg18 > hinvGeneMrna.bed # 988629029 bases of 2881515245 (34.309%) in intersection featureBits hg18 HInvGeneMrnaBed -countGaps gap # 4523138 bases of 3107677273 (0.146%) in intersection # stop here pending answer for seraching with newest version id ######################################################################### # UPDATE snpArrayIllumina HumanCytoSNP-12 SUB-TRACK (DONE 3/23/10, Fan) # Received raw data file HumanCytoSNP-12_v2_1_forUCSC.csv # from Illumina, Jennifer L. Stone Ph.D., jstone@illumina.com mkdir -p /hive/data/genomes/gs.19/build36/bed/snp/illumina/032210 cd /hive/data/genomes/gs.19/build36/bed/snp/illumina/032210 cat HumanCytoSNP-12_v2_1_forUCSC.csv |\ sed -e 's/,/\t/g' >HumanCytoSNP.tab hgsql hg18 -e 'drop table snpArrayIlluminaHumanCytoSNP_12Raw' hgsql hg18 < ~/src/hg/lib/snpArrayIlluminaHumanCytoSNP_12Raw.sql hgsql hg18 -e 'load data local infile "HumanCytoSNP.tab" into table snpArrayIlluminaHumanCytoSNP_12Raw ignore 1 lines' ~/src/hg/snp/snpLoad/illuminaLookup1M hg18 snpArrayIlluminaHumanCytoSNP_12Raw snp130 # The illuminaLookup1M generate two files: # # illuminaLookup.out contains all probes found in snp130 # illuminaLookup.err contains all probes not found in snp130 mv illuminaLookup.out illuminaLookupHumanCytoSNP_12a.out cut -f 1 illuminaLookup.err >j.1 cat j.1 |sed -e 's/chrMt/chrM/' |\ sed -e 's/XY/X/' >j.chr cut -f 2-5 illuminaLookup.err >j.2 cut -f 6 illuminaLookup.err >j.3 cat j.3 |sed -e 's/F/+/' |sed -e 's/R/-/' >j.strand cut -f 7 illuminaLookup.err |sed -e "s/\[//" |sed -e "s/\]//" >j.observed paste j.chr j.2 j.strand j.observed >illuminaLookupHumanCytoSNP_12b.out # combine two parts cat illuminaLookupHumanCytoSNP_12a.out illuminaLookupHumanCytoSNP_12b.out >snpArrayIlluminaHumanCytoSNP_12.tab # load the table hgLoadBed -allowStartEqualEnd hg18 snpArrayIlluminaHumanCytoSNP_12 snpArrayIlluminaHumanCytoSNP_12.tab -tab -sqlTable=/cluster/home/fanhsu/scratch/tip201/kent/src/hg/lib/snpArrayIlluminaHumanCytoSNP_12.sql ############################################################################# # ucscRetro track (2010-04-12, baertsch DONE) mkdir -p /hive/users/baertsch/retro/hg18 cd /hive/users/baertsch/retro/hg18 wget http://compbio.soe.ucsc.edu/retrogene/retroFinder-1.16.tar.gz tar xvfz retroFinder-1.16.tar.gz cd retroFinder-1.16/src/pslPseudo make cd ../../.. 
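# Per the note further below, ./retroFinder-1.16/scripts should be on PATH
# before the pipeline scripts are run; a minimal sketch in bash:
#   export PATH=`pwd`/retroFinder-1.16/scripts:$PATH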
cat << '_EOF_' > DEF RETRO_OPTIONS="-verbose=4 -minAli=0.98 -nearTop=0.005 " DB=hg18 SCORETHRESH=550 LOGNAME=baertsch GENOMENAME='Homo sapiens' GBDB=hg MRNABASE=/hive/data/genomes/$DB/bed/mrnaBlastz/ TMPMRNA=/hive/users/baertsch/mrnaBlastz/$DB TMPEST=/hive/users/baertsch/est/$DB EST=all_est SPLICED_EST=intronEst SPLIT_EST=0 SPLIT_SPLICED_EST=1 SCRIPT=/hive/users/baertsch/retro/hg18/retroFinder-1.16/scripts GENOME=/hive/data/genomes/ RETRODIR=$GENOME/$DB/bed/retro BASE=/hive/users/baertsch/retro OUTDIR=/hive/users/baertsch/retro/$DB/ RESULT=$OUTDIR/result LOG=$OUTDIR/log OUT=$OUTDIR/out OVERLAPDIR=$OUTDIR/run.o VERSION=5 TABLE=ucscRetroInfo$VERSION ALIGN=ucscRetroAli$VERSION LOCAL=/scratch/data/$DB TWOBIT=$LOCAL/$DB.2bit NIB=$LOCAL/nib RMSK=/hive/data/genomes/$DB/linSpecRep/ NET1=netMm8 NET2=netCanFam2 NET3=netRheMac2 GENE1=knownGene GENE2=refGene GENE3=mgcGenes CLUSTER=swarm SPECIES="hg18 mm9 rheMac2" ROOTDIR="/cluster/home/baertsch/public_html" EXPDIR=exp GENEPFAM=knownGene PFAM=knownToPfam PFAMIDFIELD=name PFAMDOMAIN=value ARRAY=gnfAtlas2 AFFYPROBE="affyU133A,affyGnf1h" ARRAYMEDIAN=hgFixed.gnfHumanAtlas2Median ARRAYRATIO=hgFixed.gnfHumanAtlas2AllRatio ARRAYABS=hgFixed.gnfHumanAtlas2All ARRAYEXP=hgFixed.gnfHumanAtlas2MedianExps ARRAYEXPALL=hgFixed.gnfHumanAtlas2AllExps ARRAYLOOKUP=knownToGnfAtlas2 ARRAYPSLS="/hive/data/genomes/hg18/bed/geneAtlas2/affyU133A.psl /hive/data/genom es/hg18/bed/geneAtlas2/affyGnf1h.psl" ALTSPLICE=altGraphX SPLITBYAGE=splitRetrosByAge PDB=proteins090821 '_EOF_' # << happy emacs #add ./retroFinder-1.16/scripts to PATH retroFinder-1.16/scripts/filterMrna.sh DEF retroFinder-1.16/scripts/filterEst.sh DEF nohup retroFinder-1.16/scripts/ucscRetroStep1.sh DEF #check cluster job nohup retroFinder-1.16/scripts/ucscRetroStep2.sh DEF nohup retroFinder-1.16/scripts/ucscRetroStep3.sh DEF #check cluster job nohup retroFinder-1.16/scripts/ucscRetroStep4.sh DEF nohup retroFinder-1.16/scripts/ucscRetroStep5.sh DEF # Load the track nohup retroFinder-1.16/scripts/ucscRetroStep6.sh DEF #add ucscRetroAli to trackDb.ra ############################################################################# # NEANDERTAL TRACKS (DONE 5/6/10 angie) # Ed Green and Hernan Burbano contributed data for several tracks # in advance of the publication in Science of the Neandertal genome # sequence and analysis. These tracks were built on a private, # access-controlled server (genome-nt) and then transferred to hgwdev # and quickly pushed to hgwbeta and RR when the embargo lifted. # Full descriptions are in a separate file, hg18.nt.txt. # Track tables, in case anybody searches for them in here: # H-C Coding Diffs: ntHumChimpCodingDiff # Sel Swp Scan (S): ntSssZScorePMVar # 5% Lowest S: ntSssTop5p # S SNPs: ntSssSnps # Cand. Gene Flow: ntOoaHaplo # Neandertal Cntgs: bamAll bamFeld1 bamMez1 bamSid1253 bamVi33dot16 bamVi33dot25 bamVi33dot26 # Neandertal Seq: bamSLFeld1 bamSLMez1 bamSLSid1253 bamSLVi33dot16 bamSLVi33dot25 bamSLVi33dot26 # Modern Human Seq: bamMMS4 bamMMS5 bamMMS6 bamMMS7 bamMMS8 # Neandertal Mito: ntMito ############################################################################# # BUILD DECIPHER TRACK (DONE, 6/1/10, Fan) ssh hgwdev # Received raw DECIPHER data file, daa_28-05-10_ucsc.txt, # from Manuel Corpas [mc10@sanger.ac.uk] and place it under # /hive/data/outside/decipher/ cd /hive/data/genomes/gs.19/build36/bed mkdir decipher cd decipher cp -p /hive/data/outside/decipher/daa_28-05-10_ucsc.txt . 
hgsql hg18 -e 'drop table decipherRaw' hgsql hg18 < ~/src/hg/lib/decipherRaw.sql # load into decipherRaw table hgsql hg18 -e \ 'load data local infile "daa_28-05-10_ucsc.txt" into table decipherRaw ignore 1 lines' # construct the bed file, decipher.bed hgsql hg18 -N -e 'select "chr", chr, start-1, end, id from decipherRaw ' |\ sed -e 's/chr\t/chr/' |sort -u>j.tmp # fix some out of range of entries cat j.tmp|sed -e 's/243000000/242951149/' |\ sed -e 's/115090019/114142980/' >decipher.bed rm j.tmp # load the bed track. hgLoadBed hg18 decipher decipher.bed # create knownToDecipher table. hgMapToGene -all hg18 decipher knownGene knownToDecipher ############################################################################# # UPDATE KEGG TABLES (DONE, Fan, 6/18/10) mkdir -p /hive/data/genomes/hg18/bed/pathways/kegg cd /hive/data/genomes/hg18/bed/pathways/kegg wget --timestamping ftp://ftp.genome.jp/pub/kegg/pathway/map_title.tab cat map_title.tab | sed -e 's/\t/\thsa\t/' > j.tmp cut -f 2 j.tmp >j.hsa cut -f 1,3 j.tmp >j.1 paste j.hsa j.1 |sed -e 's/\t//' > keggMapDesc.tab rm j.hsa j.1 rm j.tmp hgsql hg18 -e 'drop table keggMapDesc' hgsql hg18 < ~/kent/src/hg/lib/keggMapDesc.sql hgsql hg18 -e 'load data local infile "keggMapDesc.tab" into table keggMapDesc' wget --timestamping ftp://ftp.genome.jp/pub/kegg/genes/organisms/hsa/hsa_pathway.list cat hsa_pathway.list| sed -e 's/path://'|sed -e 's/:/\t/' > j.tmp hgsql hg18 -e 'drop table keggPathway' hgsql hg18 < ~/kent/src/hg/lib/keggPathway.sql hgsql hg18 -e 'load data local infile "j.tmp" into table keggPathway' hgsql hg18 -N -e \ 'select name, locusID, mapID from keggPathway p, knownToLocusLink l where p.locusID=l.value' \ >keggPathway.tab hgsql hg18 -e 'delete from keggPathway' hgsql hg18 -e 'load data local infile "keggPathway.tab" into table keggPathway' rm j.tmp ############################################################################# # Add KEGG column to hg18 Gene Sorter (Done, Fan, 6/18/2010) mkdir -p /hive/data/genomes/hg18/bed/geneSorter cd /hive/data/genomes/hg18/bed/geneSorter hgsql hg18 -N -e 'select kgId, mapID, mapID, "+", locusID from keggPathway' |sort -u|sed -e 's/\t+\t/+/' > knownToKeggEntrez.tab hgsql hg18 -e 'drop table knownToKeggEntrez' hgsql hg18 < ~/kent/src/hg/lib/knownToKeggEntrez.sql hgsql hg18 -e 'load data local infile "knownToKeggEntrez.tab" into table knownToKeggEntrez' ############################################################################# # Add Human RNA-editing track hg18 (Done, galt, 7/12/2010) # DARNED=DAtabase of RNa EDiting #http://darned.ucc.ie/ #University College Cork mkdir -p /hive/data/genomes/hg18/bed/darned cd /hive/data/genomes/hg18/bed/darned # create go.csh to download and compose allChroms.bed ./go.csh hgLoadBed hg18 darned allChroms.bed # at human, level # added darned.html # added trackDb.ra entry ############################################################################# # REFINE DECIPHER DETAILS PAGE (DONE, Fan, 7/13/10) # ssh hgwdev cd /hive/data/genomes/gs.19/build36/bed/decipher hgsql hg18 -N -e \ 'select d.* from knownToDecipher d, knownCanonical c where c.transcript=d.name' >knownCanonToDecipher.tab hgsql hg18 -e "drop table knownCanonToDecipher" hgsql hg18 < ~/src/hg/lib/knownCanonToDecipher.sql hgsql hg18 -e 'load data local infile "knownCanonToDecipher.tab" into table knownCanonToDecipher' ############################################################################# # Got UCSF Brain Methyl data from Ting already loaded. 
7/2010 ############################################################################# # LIFTOVER TO Hg19 (RE-DONE - 2010-07-26 - Hiram ) # preserving the previous 10K liftOver files mkdir /hive/data/genomes/hg18/bed/liftOver10K cd /hive/data/genomes/hg18/bed/liftOver10K ln -s ../blat.hg19.2009-03-06/hg18ToHg19.over.chain.gz . # this liftOver is a 5000 size chunk mkdir /hive/data/genomes/hg18/bed/blat.hg19.2010-07-26 cd /hive/data/genomes/hg18/bed/blat.hg19.2010-07-26 # -debug run to create run dir, preview scripts... # verifies files can be found doSameSpeciesLiftOver.pl -debug hg18 hg19 # Real run: time nice -n +19 doSameSpeciesLiftOver.pl -verbose=2 \ -bigClusterHub=pk -dbHost=hgwdev -workhorse=hgwdev \ hg18 hg19 > do.log 2>&1 & # real 67m51.597s # checking liftOver accuracy mkdir /hive/data/genomes/hg18/bed/blat.hg19.2010-07-26/refGene cd /hive/data/genomes/hg18/bed/blat.hg19.2010-07-26/refGene hgsql -N -e "select * from refGene;" hg18 | cut -f2- > refGene.hg18.gp wc -l refGene.hg18.gp # 36766 # the 5K block size lift over chain liftOver -genePred refGene.hg18.gp ../hg18ToHg19.over.chain.gz \ refGene.hg18ToHg19.5K.lift.gp refGene.hg18ToHg19.5K.unMapped.gp wc -l refGene.hg18ToHg19.5K.unMapped.gp # 440 # the 10K block size lift over chain liftOver -genePred refGene.hg18.gp \ ../../liftOver10K/hg18ToHg19.over.chain.gz \ refGene.hg18ToHg19.10K.lift.gp refGene.hg18ToHg19.10K.unMapped.gp wc -l refGene.hg18ToHg19.10K.unMapped.gp # 430 # construct custom track of chain files. # the 5K block size lift over chain chainToPsl ../hg18ToHg19.over.chain.gz \ /hive/data/genomes/hg18/chrom.sizes \ /hive/data/genomes/hg19/chrom.sizes \ /hive/data/genomes/hg18/hg18.2bit \ /hive/data/genomes/hg19/hg19.2bit stdout \ | pslToBed stdin hg18ToHg19.5K.bed # the 10K block size lift over chain chainToPsl ../../liftOver10K/hg18ToHg19.over.chain.gz \ /hive/data/genomes/hg18/chrom.sizes \ /hive/data/genomes/hg19/chrom.sizes \ /hive/data/genomes/hg18/hg18.2bit \ /hive/data/genomes/hg19/hg19.2bit stdout \ | pslToBed stdin hg18ToHg19.10K.bed grep -v "^#" refGene.hg18ToHg19.5K.unMapped.gp \ | awk '{print $1}' | sort -u > 5K.genes.unMapped grep -v "^#" refGene.hg18ToHg19.10K.unMapped.gp \ | awk '{print $1}' | sort -u > 10K.genes.unMapped # do just the exons all by themselves: featureBits hg18 refGene:exon -bed=hg18.refGene.exons.bed liftOver hg18.refGene.exons.bed ../hg18ToHg19.over.chain.gz \ hg18ToHg19.refGene.exons.lifted.5K.bed \ hg18ToHg19.refGene.exons.5K.unMapped liftOver hg18.refGene.exons.bed \ ../../liftOver10K/hg18ToHg19.over.chain.gz \ hg18ToHg19.refGene.exons.lifted.10K.bed \ hg18ToHg19.refGene.exons.10K.unMapped wc -l *.exons.*.unMapped # 284 hg18ToHg19.refGene.exons.10K.unMapped # 260 hg18ToHg19.refGene.exons.5K.unMapped # create custom track showing identical fragments in hg18 and hg19: hgsql -e "show tables;" hg18 | grep _gold | while read T do hgsql -N -e "select frag,fragStart,fragEnd from $T;" hg18 done | sort > hg18.gold.frags.tab hgsql -N -e "select frag,fragStart,fragEnd from gold;" hg19 \ | sort > hg19.gold.frags.tab # most are identical: comm -12 hg18.gold.frags.tab hg19.gold.frags.tab | wc -l # 26436 # unique to hg18: comm -23 hg18.gold.frags.tab hg19.gold.frags.tab | wc -l # 705 # unique to hg19: (includes patch1 fragments) comm -13 hg18.gold.frags.tab hg19.gold.frags.tab | wc -l # 1126 hgsql -e "show tables;" hg18 | grep _gold | while read T do hgsql -N -e "select chrom,chromStart,chromEnd,frag,0,strand from $T;" hg18 done | sort -k4,4 > hg18.gold.bed # construct custom track of fragments in 
hg18 that are not in hg19 comm -23 hg18.gold.frags.tab hg19.gold.frags.tab | sort \ > hg18.unique.frags.tab join -1 4 -2 1 hg18.gold.bed hg18.unique.frags.tab \ | awk '{print $2,$3,$4,$1,$5,$6}' | sort -k1,1 -k2,2n \ > hg18.unique.frags.bed comm -12 hg18.gold.frags.tab hg19.gold.frags.tab | sort \ > hg18.hg19.common.frags.tab join -1 4 -2 1 hg18.gold.bed hg18.hg19.common.frags.tab \ | awk '{print $2,$3,$4,$1,$5,$6}' | sort -k1,1 -k2,2n \ > hg18.hg19.common.frags.bed ############################################################################# # LIFTOVER TO Hg17 (RE-DONE - 2010-07-26 - Hiram ) # preserving the previous 10K liftOver files cd /hive/data/genomes/hg18/bed/liftOver10K ### XXX !!!! **** The liftOver directory is full of the "*Original*" files. # the blat.*.date directories are mere symlinks to ../liftOver # this is bad. Fixup the file in blat.hg17.2009-03-06 so it is the # real file, eliminate the liftOver copy, and construct this symlink: ln -s ../blat.hg17.2009-03-20/hg18ToHg17.over.chain.gz . # this liftOver is a 5000 size chunk mkdir /hive/data/genomes/hg18/bed/blat.hg17.2010-07-26 cd /hive/data/genomes/hg18/bed/blat.hg17.2010-07-26 # -debug run to create run dir, preview scripts... # verifies files can be found doSameSpeciesLiftOver.pl -debug hg18 hg17 # Real run: time nice -n +19 doSameSpeciesLiftOver.pl -verbose=2 \ -bigClusterHub=swarm -dbHost=hgwdev -workhorse=hgwdev \ hg18 hg17 > do.log 2>&1 & # real 74m50.836s ############################################################################ # NUMTS TRACK (DONE 2010-08-09 - Chin) mkdir /cluster/data/hg18/bed/NumtS cd /cluster/data/hg18/bed/NumtS # download raw data from wget http://193.204.182.50/files/all_UCSC_custom_tracks.txt wget http://193.204.182.50/files/NumtS_fragments_extlink.html # split the all_UCSC_custom_tracks.txt into 4 bed files # numtSAssembled.bed, numtS.bed, numtSMitochondrion.bed and # numtSMitochondrionChrPlacement.bed cat all_UCSC_custom_tracks.txt | awk ' /^track name/ {print $_}' / > tracks.list # load the bed files to hg18 hgLoadBed hg18 numtSAssembled numtSAssembled.bed hgLoadBed hg18 numtS numtS.bed hgLoadBed hg18 numtSMitochondrion numtSMitochondrion.bed hgLoadBed hg18 numtSMitochondrionChrPlacement numtSMitochondrionChrPlacement.bed # reload the tracks with data with updated ID (DONE 2011-01-26 Chin) mkdir /cluster/data/hg18/bed/NumtS/2011-01-26 # cp over all new data cd /cluster/data/hg18/bed/NumtS/2011-01-26 # load the bed files to hg18 hgLoadBed hg18 numtSAssembled numtSAssembled.bed hgLoadBed hg18 numtS numtS.bed hgLoadBed hg18 numtSMitochondrion numtSMitochondrion.bed hgLoadBed hg18 numtSMitochondrionChrPlacement numtSMitochondrionChrPlacement.bed # update the ~/kent/src/hg/makeDb/trackDb/human/numtSeq.html with # description_revisedMC ( numtSeq-20110126.html) cp numtSeq-20110126.html \ /cluster/home/chinhli/kent/src/hg/makeDb/trackDb/human/numtSeq.html ############################################################################## # hg18 <-> hg19 difference tracks (WORKING - 2010-09-03 - Hiram) # single instance of documentation for hg18 *and* hg19 tracks mkdir /hive/data/genomes/hg18/bed/liftOverHg19 cd /hive/data/genomes/hg18/bed/liftOverHg19 # not needed, but interesting, collect all the fragment # definitions from the gold tables: hgsql -e "show tables;" hg18 | grep _gold | while read T do hgsql -N -e "select frag,fragStart,fragEnd,strand from $T;" hg18 done | sort > hg18.gold.frags.tab hgsql -N -e "select frag,fragStart,fragEnd,strand from gold;" hg19 \ | sort > hg19.gold.frags.tab # 
construct common and difference listings comm -12 hg18.gold.frags.tab hg19.gold.frags.tab \ > identical.hg18.hg19.frags.tab comm -23 hg18.gold.frags.tab hg19.gold.frags.tab \ > unique.hg18Only.frags.tab comm -13 hg18.gold.frags.tab hg19.gold.frags.tab \ > unique.hg19Only.frags.tab # better yet, get full information about each fragment hgsql -e "show tables;" hg18 | grep _gold | while read T do hgsql -N -e "select chrom,chromStart,chromEnd,ix,type,frag,fragStart,fragEnd,strand from $T;" hg18 done | sort -k6 > hg18.gold.tab hgsql -N -e "select chrom,chromStart,chromEnd,ix,type,frag,fragStart,fragEnd,strand from gold;" hg19 \ | sort -k6 > hg19.gold.tab # construct a single key for each fragment for joining. # the key is frag,fragStart,fragEnd,strand awk '{printf "%s,%d,%d,%s\t%s\t%s\t%s\t%d\t%d\t%d\t%s\n", $6,$7,$8,$9,$6,$9,$1,$2,$3,$4,$5}' hg18.gold.tab | sort \ > hg18.fragKey.tab awk '{printf "%s,%d,%d,%s\t%s\t%s\t%s\t%d\t%d\t%d\t%s\n", $6,$7,$8,$9,$6,$9,$1,$2,$3,$4,$5}' hg19.gold.tab | sort \ > hg19.fragKey.tab # now, by joining those keys, we can get exact identicals, and # the only-in listings as bed files to load as tracks: join hg18.fragKey.tab hg19.fragKey.tab \ | awk '{printf "%s\t%d\t%d\t%s\t1000\t%s\t%d\t%d\t0,0,128\n", $4,$5,$6,$2,$3,$5,$6}' \ | sort -k1,1 -k2,2n > hg18.hg19.identical.bed join hg18.fragKey.tab hg19.fragKey.tab \ | awk '{printf "%s\t%d\t%d\t%s\t1000\t%s\t%d\t%d\t0,0,128\n", $11,$12,$13,$9,$10,$12,$13}' \ | sort -k1,1 -k2,2n > hg19.hg18.identical.bed join -v 1 hg18.fragKey.tab hg19.fragKey.tab \ | awk '{printf "%s\t%d\t%d\t%s\t0\t%s\n", $4,$5,$6,$2,$3}' \ | sort -k1,1 -k2,2n > hg18.only.bed join -v 2 hg18.fragKey.tab hg19.fragKey.tab \ | awk '{printf "%s\t%d\t%d\t%s\t0\t%s\n", $4,$5,$6,$2,$3}' \ | sort -k1,1 -k2,2n > hg19.only.bed hgLoadBed hg18 hg19ContigDiff hg18.only.bed hgLoadBed hg19 hg18ContigDiff hg19.only.bed wc -l hg1?.only.bed # 708 hg18.only.bed # 1131 hg19.only.bed # this leaves the outstanding question of "why" they might be in # the only-in listings. 
# Some contigs may be different versions,
# sometimes different sections of the same contig are used,
# and contigs are dropped from hg18 to hg19, or new contigs added
# to hg19 to fill in gaps from hg18
# Let's see if we can measure some of this:
    awk '{print $4}' hg18.only.bed | sort -u > hg18.only.ids.list
    awk '{print $4}' hg19.only.bed | sort -u > hg19.only.ids.list
    # Looks like 333 identical contigs with different parts used:
    comm -12 hg18.only.ids.list hg19.only.ids.list > differentPortions.list
    wc -l differentPortions.list
    #   333
    # and perhaps 198 = 531-333 of different versions of same contig:
    sed -e "s/\.[0-9]*$//" hg18.only.ids.list | sort -u \
        > hg18.noVersions.ids.list
    sed -e "s/\.[0-9]*$//" hg19.only.ids.list | sort -u \
        > hg19.noVersions.ids.list
    comm -12 hg18.noVersions.ids.list hg19.noVersions.ids.list | wc -l
    #   531
    sed -e "s/\.[0-9]*$//" differentPortions.list | sort -u \
        > differentPortions.noVersions.list
    comm -12 hg18.noVersions.ids.list hg19.noVersions.ids.list | sort -u \
        > noVersions.common.list
    # indeed, 198 contigs of different versions:
    comm -23 noVersions.common.list differentPortions.noVersions.list \
        | sort -u > differentVersions.list
    wc -l differentVersions.list
    #   198
    # dividing up these items:
    cat << '_EOF_' > identifyPortions.pl
#!/usr/bin/env perl

use strict;
use warnings;

my %differentVersions;
my %differentPortions;
open (FH, "<differentVersions.list") or die "can not read differentVersions.list";
while (my $line = <FH>) {
    chomp $line;
    $differentVersions{$line} = 1;
}
close (FH);
open (FH, "differentPortions.list" ) or die "can not read differentPortions.list";
while (my $line = <FH>) {
    chomp $line;
    $differentPortions{$line} = 1;
}
close (FH);
my %hg18Done;
open (DP, ">hg18.differentPortions.bed") or die "can not write to hg18.differentPortions.bed";
open (DV, ">hg18.differentVersions.bed") or die "can not write to hg18.differentVersions.bed";
open (FH, "<hg18.only.bed") or die "can not read hg18.only.bed";
while (my $line = <FH>) {
    chomp $line;
    my ($chr, $start, $end, $acc, $score, $strand) = split('\s+', $line);
    # assume done while $acc is still complete
    $hg18Done{$acc} = 1;
    if (exists($differentPortions{$acc})) {
        printf DP "%s\n", $line;
    } else {
        my $trimAcc = $acc;
        $trimAcc =~ s/\.[0-9]+$//;
        if (exists($differentVersions{$trimAcc})) {
            printf DV "%s\n", $line;
        } else {
            # this one does not match
            $hg18Done{$acc} = 0;
        }
    }
}
close (FH);
close (DV);
close (DP);
open (DR, ">hg18.dropped.bed") or die "can not write to hg18.dropped.bed";
open (FH, "<hg18.only.bed") or die "can not read hg18.only.bed";
while (my $line = <FH>) {
    chomp $line;
    my ($chr, $start, $end, $acc, $score, $strand) = split('\s+', $line);
    if (0 == $hg18Done{$acc}) {
        printf DR "%s\n", $line;
    }
}
close (FH);
close (DR);
my %hg19Done;
open (DP, ">hg19.differentPortions.bed") or die "can not write to hg19.differentPortions.bed";
open (DV, ">hg19.differentVersions.bed") or die "can not write to hg19.differentVersions.bed";
open (FH, "<hg19.only.bed") or die "can not read hg19.only.bed";
while (my $line = <FH>) {
    chomp $line;
    my ($chr, $start, $end, $acc, $score, $strand) = split('\s+', $line);
    # assume done while $acc is still complete
    $hg19Done{$acc} = 1;
    if (exists($differentPortions{$acc})) {
        printf DP "%s\n", $line;
    } else {
        my $trimAcc = $acc;
        $trimAcc =~ s/\.[0-9]+$//;
        if (exists($differentVersions{$trimAcc})) {
            printf DV "%s\n", $line;
        } else {
            # this one does not match
            $hg19Done{$acc} = 0;
        }
    }
}
close (FH);
close (DV);
close (DP);
open (DR, ">hg19.newTo19.bed") or die "can not write to hg19.newTo19.bed";
open (FH, "<hg19.only.bed") or die "can not read hg19.only.bed";
while (my $line = <FH>) {
    chomp $line;
    my ($chr, $start, $end, $acc, $score, $strand) = split('\s+', $line);
    if (0 == $hg19Done{$acc}) {
        printf DR "%s\n", $line;
    }
}
close (FH);
close (DR);
'_EOF_'
    # << happy emacs
    chmod +x identifyPortions.pl
    ./identifyPortions.pl
    # make sure nothing was lost
    sort \
hg18.differentVersions.bed hg18.differentPortions.bed \ hg18.dropped.bed | sum # 50075 28 sort hg18.only.bed | sum # 50075 28 sort hg19.differentVersions.bed hg19.differentPortions.bed \ hg19.newTo19.bed | sum # 36621 45 sort hg19.only.bed | sum # 36621 45 sort -k1,1 -k2,2n hg19.differentVersions.bed hg19.differentPortions.bed \ hg19.newTo19.bed > hg19.itemRgb.bed sort -k1,1 -k2,2n hg18.differentVersions.bed hg18.differentPortions.bed \ hg18.dropped.bed > hg18.itemRgb.bed hgLoadBed hg18 hg18ContigDiff hg18.itemRgb.bed hgLoadBed hg19 hg19ContigDiff hg19.itemRgb.bed ############################################################################## # 1000 GENOMES COVERAGE MASK (DONE 10/1/10 angie) mkdir /hive/data/genomes/hg18/bed/1000GenomesMask cd /hive/data/genomes/hg18/bed/1000GenomesMask wget --timestamping \ ftp://ftp-trace.ncbi.nih.gov/1000genomes/ftp/pilot_data/release/2010_07/low_coverage/other_data/\* cat > pseudoFastaToBed.pl <<'_EOF_' #!/usr/bin/env perl use warnings; use strict; my ($base, $inFile) = @ARGV; die "usage: $0 outputBase input.gz\n" if (! $base || ! $inFile); my ($IN, $OUTD, $OUTM, $OUTU); open($IN, "zcat $inFile |") || die; my $outD = $base . "Depth.bed.gz"; my $outM = $base . "MapQ.bed.gz"; my $outU = $base . "Uncov.bed.gz"; open($OUTD, "| gzip -c > $outD") || die; open($OUTM, "| gzip -c > $outM") || die; open($OUTU, "| gzip -c > $outU") || die; sub printItem { my ($chr, $start, $end, $mask) = @_; return unless $end > $start; if ($mask eq 'D') { print $OUTD join("\t", $chr, $start, $end) . "\n"; } elsif ($mask eq 'M') { print $OUTM join("\t", $chr, $start, $end) . "\n"; } elsif ($mask eq 'B') { print $OUTD join("\t", $chr, $start, $end) . "\n"; print $OUTM join("\t", $chr, $start, $end) . "\n"; } elsif ($mask eq "-") { print $OUTU join("\t", $chr, $start, $end) . 
"\n"; } } sub maskToBed3Subtracks { my ($chr, $seqRef) = @_; $chr =~ s/MT$/M/; $chr =~ s/^([0-9XMY])/chr$1/; my ($start, $end) = (0, 0); my $len = length($$seqRef); my $prevM; while ($end < $len) { my $m = substr $$seqRef, $end, 1; if (defined $prevM && $m ne $prevM) { &printItem($chr, $start, $end, $prevM); $start = $end; } $end++; $prevM = $m; } &printItem($chr, $start, $end, $prevM); } my ($prevChrom, $seq); while (<$IN>) { if (/^>(\S+)/) { my $chrom = $1; if (defined $prevChrom) { &maskToBed3Subtracks($prevChrom, \$seq); } $prevChrom = $chrom; $seq = ""; } elsif (/^([NMDB0-]+)$/) { $seq .= $1; } else { die "Unexpected line format:\n$_\t"; } } &maskToBed3Subtracks($prevChrom, \$seq); close($OUTD); close($OUTM); close($OUTU); '_EOF_' # << emacs chmod a+x pseudoFastaToBed.pl foreach f ({CEU,CHBJPT,YRI}.low_coverage.mask.fa.gz) set pop = `echo $f:r:r:r:r | perl -wpe '$_ = ucfirst lc; s/Chbjpt/ChbJpt/;'` set tBase = "covMask1kGPilotLowCov$pop" echo $tBase ./pseudoFastaToBed.pl $tBase $f end # Use featureBits to merge adjacent regions foreach f (covMask*.bed.gz) echo $f:r:r featureBits hg18 $f -bed=stdout \ | cut -f 1-3 \ | hgLoadBed hg18 $f:r:r stdin end # covMask1kGPilotLowCovCeuDepth # 6718955 bases of 2881515245 (0.233%) in intersection # Loaded 97777 elements of size 3 # covMask1kGPilotLowCovCeuMapQ # 408568477 bases of 2881515245 (14.179%) in intersection # Loaded 4052843 elements of size 3 # covMask1kGPilotLowCovCeuUncov # 3942 bases of 2881515245 (0.000%) in intersection # Loaded 311 elements of size 3 # covMask1kGPilotLowCovChbJptDepth # 12143572 bases of 2881515245 (0.421%) in intersection # Loaded 198277 elements of size 3 # covMask1kGPilotLowCovChbJptMapQ # 429343803 bases of 2881515245 (14.900%) in intersection # Loaded 4198464 elements of size 3 # covMask1kGPilotLowCovChbJptUncov # 50676 bases of 2881515245 (0.002%) in intersection # Loaded 2108 elements of size 3 # covMask1kGPilotLowCovYriDepth # 11875006 bases of 2881515245 (0.412%) in intersection # Loaded 193700 elements of size 3 # covMask1kGPilotLowCovYriMapQ # 454810959 bases of 2881515245 (15.784%) in intersection # Loaded 4338322 elements of size 3 # covMask1kGPilotLowCovYriUncov # 21232 bases of 2881515245 (0.001%) in intersection # Loaded 1255 elements of size 3 # Make some union tables featureBits hg18 -or covMask1kGPilotLowCovCeuDepth \ covMask1kGPilotLowCovChbJptDepth \ covMask1kGPilotLowCovYriDepth \ -bed=stdout \ | cut -f 1-3 | gzip -c > covMask1kGPilotLowCovUnionDepth.bed.gz #14033969 bases of 2881515245 (0.487%) in intersection hgLoadBed hg18 covMask1kGPilotLowCovUnionDepth covMask1kGPilotLowCovUnionDepth.bed.gz #Loaded 232479 elements of size 3 featureBits hg18 -or covMask1kGPilotLowCovCeuMapQ \ covMask1kGPilotLowCovChbJptMapQ \ covMask1kGPilotLowCovYriMapQ \ -bed=stdout \ | cut -f 1-3 | gzip -c > covMask1kGPilotLowCovUnionMapQ.bed.gz #463864561 bases of 2881515245 (16.098%) in intersection hgLoadBed hg18 covMask1kGPilotLowCovUnionMapQ covMask1kGPilotLowCovUnionMapQ.bed.gz #Loaded 4319382 elements of size 3 featureBits hg18 -or covMask1kGPilotLowCovCeuUncov \ covMask1kGPilotLowCovChbJptUncov \ covMask1kGPilotLowCovYriUncov \ -bed=stdout \ | cut -f 1-3 | gzip -c > covMask1kGPilotLowCovUnionUncov.bed.gz #66237 bases of 2881515245 (0.002%) in intersection hgLoadBed hg18 covMask1kGPilotLowCovUnionUncov covMask1kGPilotLowCovUnionUncov.bed.gz #Loaded 3129 elements of size 3 # Make intersection of uncovered bits too: featureBits hg18 covMask1kGPilotLowCovCeuUncov \ covMask1kGPilotLowCovChbJptUncov \ 
covMask1kGPilotLowCovYriUncov \ -bed=stdout \ | cut -f 1-3 | gzip -c > covMask1kGPilotLowCovIntersectionUncov.bed.gz #676 bases of 2881515245 (0.000%) in intersection hgLoadBed hg18 covMask1kGPilotLowCovIntersectionUncov covMask1kGPilotLowCovIntersectionUncov.bed.gz #Loaded 49 elements of size 3 featureBits hg18 -or covMask1kGPilotLowCovCeuDepth covMask1kGPilotLowCovCeuMapQ \ covMask1kGPilotLowCovCeuUncov \ covMask1kGPilotLowCovChbJptDepth covMask1kGPilotLowCovChbJptMapQ \ covMask1kGPilotLowCovChbJptUncov \ covMask1kGPilotLowCovYriDepth covMask1kGPilotLowCovYriMapQ \ covMask1kGPilotLowCovYriUncov \ -bed=stdout \ | cut -f 1-3 | gzip -c > covMask1kGPilotLowCovUnion.bed.gz #467348829 bases of 2881515245 (16.219%) in intersection hgLoadBed hg18 covMask1kGPilotLowCovUnion covMask1kGPilotLowCovUnion.bed.gz #Loaded 4339659 elements of size 3 ######################################################################### # SNP "BAD APPLES" TRACK (IN PROGRESS 10/4/10 angie) cd /hive/data/genomes/gs.19/build36/bed/1000GenomesMask foreach t (Depth MapQ Uncov "") echo $t bedIntersect -tab -aHitAny -allowStartEqualEnd ../snp130/snp130.bed \ covMask1kGPilotLowCovUnion$t.bed.gz snp130BadApplesUnion$t.bed end wc -l snp130BadApplesUnion*.bed # 5240097 snp130BadApplesUnion.bed # 502329 snp130BadApplesUnionDepth.bed # 5150686 snp130BadApplesUnionMapQ.bed # 1491 snp130BadApplesUnionUncov.bed cut -f 4 snp130BadApplesUnion.bed | sort -u | wc -l #4452279 sed -e 's/snp130/snp130BadApples/' /hive/data/outside/dbSNP/130/human/snp130.sql \ > snp130BadApples.sql foreach t (Depth MapQ Uncov "") echo $t hgLoadBed -tab -onServer -tmpDir=/data/tmp -allowStartEqualEnd -renameSqlTable \ hg18 snp130BadApples$t -sqlTable=snp130BadApples.sql snp130BadApplesUnion$t.bed end # Reading snp130BadApplesUnionDepth.bed # Loaded 502329 elements of size 17 # Reading snp130BadApplesUnionMapQ.bed # Loaded 5150686 elements of size 17 # Reading snp130BadApplesUnionUncov.bed # Loaded 1491 elements of size 17 # Reading snp130BadApplesUnion.bed # Loaded 5240097 elements of size 17 hgsql hg18 -e 'rename table snp130BadApples to snp130BadApplesUnion' gzip snp130BadApples*.bed # rs ID lists foreach f (snp130BadApples*.bed.gz) zcat $f | cut -f 4 | sort -u > $f:r:r.rsIDs.txt end wc -l *.txt # 4452279 snp130BadApplesUnion.rsIDs.txt # 392307 snp130BadApplesUnionDepth.rsIDs.txt # 4376753 snp130BadApplesUnionMapQ.rsIDs.txt # 1477 snp130BadApplesUnionUncov.rsIDs.txt # use list of SNPs to port to hg19? Try to port masked regions?? # asked richard and sendu to repeat for hg19/GRCh37. Richard's reply: # -------------------------- # No, we haven't done that. We would be doing it for new call sets I # expect, not for the pilot calls. # We are developing new approaches to variant calling that might well # change the accessibility criteria and # masks. # # I am copying the 1000GP data processing group to remind ourselves that # amongst all the current discussion # about calling, we need to return to how we handle accessibility. 
# -------------------------- ######################################################################### # DENISOVA (ANCIENT HUMAN) (DONE 11/16/10 angie) mkdir /hive/data/genomes/hg18/bed/denisova cd /hive/data/genomes/hg18/bed/denisova # Use username and password emailed by Ed Green 9/30/10 alias wg wget -r --user=xxx --password=xxx wg ftp://cdna.eva.mpg.de/Denisova_sequence_alignments_to_hg18_and_panTro2/\* wg ftp://cdna.eva.mpg.de/Present-day_human_sequence_alignments_to_hg18_and_panTro2/\* wg ftp://cdna.eva.mpg.de/Catalog_of_changes/\* # Ed says we don't need files with 'hcca' or 'hcscca' in the names # (only hg18 or panTro2). find . -name \*h\*cca\* -exec echo rm {} \; # Inspect & execute output of the find command to save a bit of space (11G of 83G). # Combine the two sequence-lib files for Denisova into one bam. First ensure that # the headers are identical: samtools view -H cdna.eva.mpg.de/Denisova_sequence_alignments_to_hg18_and_panTro2/SL3003/SL3003-hg18.bam > h1 samtools view -H cdna.eva.mpg.de/Denisova_sequence_alignments_to_hg18_and_panTro2/SL3004_100122/SL3004-hg18.bam > h2 cmp h1 h2 # No output, and they seem to be sorted by position, good to go: samtools merge SL3003_SL3004_100122-hg18.bam \ cdna.eva.mpg.de/Denisova_sequence_alignments_to_hg18_and_panTro2/SL3003/SL3003-hg18.bam \ cdna.eva.mpg.de/Denisova_sequence_alignments_to_hg18_and_panTro2/SL3004_100122/SL3004-hg18.bam #1023.571u 22.997s 17:34.24 99.2% 0+0k 0+0io 0pf+0w # Build BAM index (.bam.bai) files. samtools index SL3003_SL3004_100122-hg18.bam #94.811u 3.560s 1:51.16 88.4% 0+0k 0+0io 0pf+0w pushd cdna.eva.mpg.de/Present-day_human_sequence_alignments_to_hg18_and_panTro2 foreach f (*.bam) echo $f samtools index $f end popd # Make /gbdb/ links and load database tables mkdir /gbdb/hg18/denisova ln -s `pwd`/SL3003_SL3004_100122-hg18.bam{,.bai} /gbdb/hg18/denisova/ mkdir /gbdb/hg18/denisova/modernHumanSeq find `pwd` -name MMS\*hg18\*.bam\* -exec echo ln -s {} /gbdb/hg18/denisova/modernHumanSeq/ \; # Inspect & execute output of the find command. hgBbiDbLink hg18 bamSLDenisova /gbdb/hg18/denisova/SL3003_SL3004_100122-hg18.bam foreach f (/gbdb/hg18/denisova/modernHumanSeq/MMS*.bam) set track = `echo $f | perl -wpe 's/.*(MMS\d+)_HGDP\d+_(\w+)\..*/bam$1$2/ || die; s/_//;'` echo $track hgBbiDbLink hg18 $track $f end # to see the grp table: hgsql -e "select * from grp order by priority;" hg18 # add new denisova group: hgsql hg18 -e "INSERT INTO grp VALUES ('denisova', 'Denisova Assembly and Analysis', 6.6, 0);" ######################################################################### # BUILD R-DMR TRACK. DONE (Fan 7/23/2010) ssh hgwdev mkdir -p /hive/data/genomes/gs.19/build36/bed/rdmr cd /hive/data/genomes/gs.19/build36/bed/rdmr # download the supplemental data file, ng.471-S2.txt. cp -p ng.471-S2.txt rdmrRaw.tab # remove the header lines at the beginning of rdmrRaw.tab vi rdmrRaw.tab # load rdmrRaw table hgsql hg18 -e 'drop table rdmrRaw' hgsql hg18 < ~/kent/src/hg/lib/rdmrRaw.sql hgsql hg18 -e 'load data local infile "rdmrRaw.tab" into table rdmrRaw' # create rdmr table hgsql hg18 -N -e 'select chrom, chromStart-1, chromEnd, gene from rdmrRaw' >rdmr.tab hgLoadBed hg18 rdmr rdmr.tab ##################################################################### # Create liftOver files to and from: calJac3 <-> hg18 (DONE 2011-01-10 - Chin) screen # use a screen to manage this multi-day job mkdir /hive/data/genomes/hg18/bed/lastzCalJac3.2010-12-20 cd /hive/data/genomes/hg18/bed/lastzCalJac3.2010-12-20 cat << '_EOF_' > DEF # human vs. 
marmoset BLASTZ=lastz # maximum M allowed with lastz is only 254 BLASTZ_M=254 BLASTZ_Q=/scratch/data/blastz/human_chimp.v2.q # and place those items here BLASTZ_O=600 BLASTZ_E=150 # other parameters from panTro2 vs hg18 lastz on advice from Webb BLASTZ_K=4500 BLASTZ_Y=15000 BLASTZ_T=2 # TARGET: Human Hg19 SEQ1_DIR=/scratch/data/hg18/hg18.2bit SEQ1_LEN=/scratch/data/hg18/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 SEQ1_LIMIT=5 # QUERY: Marmoset (calJac3) SEQ2_DIR=/scratch/data/calJac3/calJac3.2bit SEQ2_LEN=/scratch/data/calJac3/chrom.sizes SEQ2_LIMIT=50 SEQ2_CHUNK=10000000 SEQ2_LAP=0 BASE=/hive/data/genomes/hg18/bed/lastzCalJac3.2010-12-20 TMPDIR=/scratch/tmp '_EOF_' # << this line keeps emacs coloring happy time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -syntenicNet \ -chainMinScore=5000 -chainLinearGap=medium \ -workhorse=hgwdev -smallClusterHub=swarm -bigClusterHub=swarm \ -stop net \ > do.log 2>&1 & # real 101m10.634s cd /hive/data/genomes/hg18/bed ln -s lastzCalJac3.2010-12-20 lastz.calJac cd /hive/data/genomes/hg18/bed/lastz.calJac3/axtChain cp hg18.calJac3.over.chain.gz ../../liftOver/. cd /hive/data/genomes/hg18/bed/liftOver mv hg18.calJac3.over.chain.gz hg18ToCalJac3.over.chain.gz cd /usr/local/apache/htdocs-hgdownload/goldenPath/hg18/liftOver ln -s /hive/data/genomes/hg18/bed/liftOver/hg18ToCalJac3.over.chain.gz . md5sum *.gz > md5sum.txt ######################################################################### # ISCA (FORMERLY ISCA RETROSPECTIVE) FROM DBVAR (DONE 5/21/12 angie) # Updated 3/2/12. Updated 5/21/12 to include nstd45 (Curated) and b0b's aggregate/depth subtracks. # Combined submitted+remapped, w/new subcategories likely benign, likely pathogenic, # and removed Retrospective from names 10/18/11. # Split into benign/pathogenic/uncertain subtracks 9/14/11. # Reloaded 4/19/11 to drop the boring aggregate variants (sv; keep ssv). # Originally done 1/31/11 # Redmine: Track #34 (dbVar for human) set today = `date +%Y_%m_%d` mkdir /hive/data/genomes/hg18/bed/isca/$today cd /hive/data/genomes/hg18/bed/isca/$today # Get variants submitted on this assembly, and variants remapped from other assemblies. wget ftp://ftp.ncbi.nlm.nih.gov/pub/dbVar/data/Homo_sapiens/by_study/nstd37_ISCA/gvf/nstd37_ISCA.NCBI36.submitted.all.germline.ucsc.gvf.gz wget ftp://ftp.ncbi.nlm.nih.gov/pub/dbVar/data/Homo_sapiens/by_study/nstd37_ISCA/gvf/nstd37_ISCA.NCBI36.remap.all.germline.ucsc.gvf.gz wget ftp://ftp.ncbi.nlm.nih.gov/pub/dbVar/data/Homo_sapiens/by_study/nstd45_ISCA_curated_dataset/gvf/nstd45_ISCA_curated_dataset.NCBI36.submitted.all.germline.ucsc.gvf.gz # See notes on data contents: http://redmine.soe.ucsc.edu/issues/34#note-34 # and notes on data format plan: http://redmine.soe.ucsc.edu/issues/34#note-36 zcat nstd37_ISCA*.gvf.gz \ | ~/kent/src/hg/utils/automation/gvfToBed8Attrs.pl \ > isca.bed zcat nstd45_ISCA*.gvf.gz \ | ~/kent/src/hg/utils/automation/gvfToBed8Attrs.pl \ > iscaCurated.bed wc -l isca*.bed # 12923 isca.bed # 84 iscaCurated.bed # Split into subtracks by clinical_int value. 
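    # (Hedged aside, not in the original steps.)  The per-class subtracks are
    # selected below with grep -w on the converted bed, so it is worth
    # confirming that the bed rows reproduce the class tallies taken from the
    # GVF itself; counts that disagree would mean rows are silently dropping
    # out of every subtrack:
    grep -c -w Benign isca.bed
    grep -c -w Pathogenic isca.bed
    grep -c -wi 'Uncertain significance' isca.bed
    # these should roughly match the 4304 / 4583 / 3408+464+164 breakdown
    # from the zcat tally just below.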
zcat nstd37_ISCA*.gvf.gz \ | grep ssv | sed -e 's/.*clinical_int=//; s/;.*//;' | sort | uniq -c # 4304 Benign # 4583 Pathogenic # 3408 Uncertain significance # 464 Uncertain significance: likely benign # 164 Uncertain significance: likely pathogenic zcat nstd45_ISCA*.gvf.gz \ | grep ssv | sed -e 's/.*clinical_int=//; s/;.*//;' | sort | uniq -c # 29 Benign # 55 Pathogenic foreach subtrack (Benign Pathogenic) grep -w $subtrack isca.bed > isca$subtrack.bed hgLoadBed -tab -renameSqlTable -sqlTable=$HOME/kent/src/hg/lib/bed8Attrs.sql \ -allowStartEqualEnd hg18 isca$subtrack isca$subtrack.bed grep -w $subtrack iscaCurated.bed > iscaCurated$subtrack.bed hgLoadBed -tab -renameSqlTable -sqlTable=$HOME/kent/src/hg/lib/bed8Attrs.sql \ -allowStartEqualEnd hg18 iscaCurated$subtrack iscaCurated$subtrack.bed end #Read 4304 elements of size 11 from iscaBenign.bed #Read 29 elements of size 11 from iscaCuratedBenign.bed #Read 4583 elements of size 11 from iscaPathogenic.bed #Read 55 elements of size 11 from iscaCuratedPathogenic.bed # The subcategories of Uncertain need a bit more sophisticated treatment: set subtrack = Uncertain grep -w $subtrack isca.bed \ | grep -vi 'Uncertain Significance: likely' \ > isca$subtrack.bed hgLoadBed -tab -renameSqlTable -sqlTable=$HOME/kent/src/hg/lib/bed8Attrs.sql \ -allowStartEqualEnd hg18 isca$subtrack isca$subtrack.bed #Read 3408 elements of size 11 from iscaUncertain.bed foreach unc (benign pathogenic) set subtrack = Likely`perl -we 'print ucfirst("'$unc'");'` grep -wi "Uncertain Significance: likely $unc" isca.bed \ > isca$subtrack.bed hgLoadBed -tab -renameSqlTable -sqlTable=$HOME/kent/src/hg/lib/bed8Attrs.sql \ -allowStartEqualEnd hg18 isca$subtrack isca$subtrack.bed end #Read 464 elements of size 11 from iscaLikelyBenign.bed #Read 164 elements of size 11 from iscaLikelyPathogenic.bed # Add b0b's aggregate/depth subtracks. # make bedGraphs hgsql -N -e "SELECT chrom, chromStart, chromEnd FROM iscaPathogenic \ WHERE attrVals LIKE '%number_gain%'" hg18 | sort \ | bedItemOverlapCount hg18 stdin > iscaPathGain.bedGraph hgsql -N -e "SELECT chrom, chromStart, chromEnd FROM iscaPathogenic \ WHERE attrVals LIKE '%number_loss%'" hg18 | sort \ | bedItemOverlapCount hg18 stdin > iscaPathLoss.bedGraph # load tables hgLoadBed -bedGraph=4 hg18 iscaPathGainCum iscaPathGain.bedGraph #Read 1997 elements of size 4 from iscaPathGain.bedGraph hgLoadBed -bedGraph=4 hg18 iscaPathLossCum iscaPathLoss.bedGraph #Read 3570 elements of size 4 from iscaPathLoss.bedGraph ############################################################################# # BUILD B CELL RNA-SEQ TRACKS (DONE, 3/29/11, Fan) ssh hgwdev mkdir -p /hive/data/genomes/gs.19/build36/bed/Bcell cd /hive/data/genomes/gs.19/build36/bed/Bcell # Get data from custom track site wget --timestamping http://genomicsweb1.med.upenn.edu/ucsc/bcelltranscriptometracks.txt wget --timestamping http://genomicsweb1.med.upenn.edu/ucsc/800m_junctions.bed wget --timestamping http://genomicsweb1.med.upenn.edu/ucsc/800m.bw ln -s `pwd`/800m.bw /gbdb/hg18/bbi/ceuBcellRNASeqBW.bw hgsql hg18 -e 'drop table if exists ceuBcellRNASeqBW; \ create table ceuBcellRNASeqBW (fileName varchar(255) not null); \ insert into ceuBcellRNASeqBW values ("/gbdb/hg18/bbi/ceuBcellRNASeqBW.bw");' fgrep -v track 800m_junctions.bed >ceuBcellRNASeq.bed hgLoadBed hg18 ceuBcellRNASeq ceuBcellRNASeq.bed ############################################################################# # CREATE .PNG PICTURE FILES OF EVOFOLD RNA STRUCTURES. 
(DONE, 4/29/2011, Fan) ssh hgwdev mkdir /hive/data/genomes/hg18/bed/evofold/doEvoFold cd /hive/data/genomes/hg18/bed/evofold/doEvoFold # Creaet sub-directories to store .png files (total of about 47.5 K of them) sparated by chromosomes. mkdir -p evoFold/chr1 mkdir -p evoFold/chr10 mkdir -p evoFold/chr11 mkdir -p evoFold/chr12 mkdir -p evoFold/chr13 mkdir -p evoFold/chr14 mkdir -p evoFold/chr15 mkdir -p evoFold/chr16 mkdir -p evoFold/chr17 mkdir -p evoFold/chr18 mkdir -p evoFold/chr19 mkdir -p evoFold/chr2 mkdir -p evoFold/chr20 mkdir -p evoFold/chr21 mkdir -p evoFold/chr22 mkdir -p evoFold/chr3 mkdir -p evoFold/chr4 mkdir -p evoFold/chr5 mkdir -p evoFold/chr6 mkdir -p evoFold/chr7 mkdir -p evoFold/chr8 mkdir -p evoFold/chr9 mkdir -p evoFold/chrM mkdir -p evoFold/chrX mkdir -p evoFold/chrY # get latest verion of the .jar file of VARNA wget --timestamping http://varna.lri.fr/bin/VARNAv3-7.jar # Create Java command line files echo 'doEvoFold hg18 do$1 $1' >do1Chrom chmod +x do1Chrom do1Chrom chr1 do1Chrom chr10 do1Chrom chr11 do1Chrom chr12 do1Chrom chr13 do1Chrom chr14 do1Chrom chr15 do1Chrom chr16 do1Chrom chr17 do1Chrom chr18 do1Chrom chr19 do1Chrom chr2 do1Chrom chr20 do1Chrom chr21 do1Chrom chr22 do1Chrom chr3 do1Chrom chr4 do1Chrom chr5 do1Chrom chr6 do1Chrom chr7 do1Chrom chr8 do1Chrom chr9 do1Chrom chrM do1Chrom chrX do1Chrom chrY # run the dochrXX command files in small batches with '&' to exploit multiple CPU # wait an hour for each batch to finish so that we don't suck in too much computational resources. dochr1 & dochr2 & dochr3 & dochr4 & dochr5 & sleep 3600 dochr6 & dochr7 & dochr8 & dochr9 & dochr10 & sleep 3600 dochr11 & dochr12 & dochr13 & dochr14 & dochr15 & sleep 3600 dochr16 & dochr17 & dochr18 & dochr19 & dochr20 & sleep 3600 dochr21 & dochr22 & dochrX & dochrY & dochrM & # check the resulting .png files # create a simple script file, check1, with the following 3 lines: echo $1 hgsql hg18 -N -e "select count(*) from evofold where chrom='${1}'" ls evoFold/$1/*.png|wc chmod +x check1 # create another script file, checkAll, with the following lines: check1 chr1 check1 chr10 check1 chr11 check1 chr12 check1 chr13 check1 chr14 check1 chr15 check1 chr16 check1 chr17 check1 chr18 check1 chr19 check1 chr2 check1 chr20 check1 chr21 check1 chr22 check1 chr3 check1 chr4 check1 chr5 check1 chr6 check1 chr7 check1 chr8 check1 chr9 check1 chrM check1 chrX check1 chrY chmod +x checkAll checkAll >j.check # examing the resuls in j.check to make sure things are OK. # create symbolic links ln -s /hive/data/genomes/gs.19/build36/bed/evofold/doEvoFold/evoFold /gbdb/hg18/evoFold ln -s /gbdb/hg18/evoFold /usr/local/apache/htdocs/evoFold/hg18 ############################################################################# # BUILD evoCpg TRACK (DONE, Fan, 5/23/11) ssh hgwdev cd /hive/data/genomes/gs.19/build36/bed mkdir evoCpg cd evoCpg # put data file, weizmann_evo_cgi.bed, here. cat weizmann_evo_cgi.bed|grep -v track >evoCpg.bed hgLoadBed hg18 evoCpg evoCpg.bed # create kent/src/hg/makeDb/trackDb/human/hg18/evoCpg.html. 
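    # (Optional hedged check, not part of the original steps.)  A quick
    # featureBits comparison of the new evoCpg items against the standard CpG
    # island annotation gives a feel for how much of this evolutionarily
    # defined set overlaps the existing track (assumes cpgIslandExt exists):
    featureBits hg18 evoCpg
    featureBits hg18 evoCpg cpgIslandExt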
# add pushQue record ############################################################################# # BUILD HG18 OMIM RELATED TRACKS (DONE, 6/3/11, Fan) ssh hgwdev cd /hive/data/genomes/hg18/bed mkdir -p omim/05172011 cd omim/05172011 # obtain the following files from OMIM and place them at this subdirectory genemap.txt mim2gene.txt mimAV.txt script1.pl script2.pl cat genemap.txt|sed -e 's/|/\t/g' > genemap.tab hgLoadSqlTab -warn hg18 omimGeneMap ~/kent/src/hg/lib/omimGeneMap.sql genemap.tab # Load mim2gene table hgsql hg18 -e 'drop table mim2gene' hgsql hg18 < ~/kent/src/hg/lib/mim2gene.sql hgsql hg18 -e 'load data local infile "mim2gene.txt" into table mim2gene ignore 1 lines' # build omimGeneSymbol table doOmimGeneSymbols hg18 j.out cat j.out |sort -u >omimGeneSymbol.tab hgLoadSqlTab -warn hg18 omimGeneSymbol ~/kent/src/hg/lib/omimGeneSymbol.sql omimGeneSymbol.tab perl ./script1.pl --gene-map-file=genemap.txt >omimPhenotype.tab hgLoadSqlTab -warn hg18 omimPhenotype ~/kent/src/hg/lib/omimPhenotype.sql omimPhenotype.tab hgsql hg18 -e 'update omimPhenotype set phenotypeClass = -1 where phenotypeClass=0' hgsql hg18 -e 'update omimPhenotype set phenotypeId = -1 where phenotypeId=0' doOmimGene2 hg18 j.tmp cat j.tmp |sort -u > omimGene2.tab hgLoadBed hg18 omimGene2 omimGene2.tab rm j.tmp ############################################################## # build the omimAvSnp track cd /hive/data/genomes/hg18/bed/omim/05172011 mkdir av cd av # get the mimAV.txt data file from OMIM cut -f 1 mimAV.txt >j1 cut -f 2 mimAV.txt >j2 cut -f 3 mimAV.txt >j3 cut -f 4 mimAV.txt >j4 cut -f 5 mimAV.txt >j5 cat j1 |sed -e 's/\./\t/' >j1.2 cat j4 |sed -e 's/,/\t/' >j4-2 cut -f 1 j4-2 >j4.1 cut -f 2 j4-2 >j4.2 paste j1 j1.2 j3 j4 j4.1 j4.2 j5 j2 >omimAv.tab hgsql hg18 -e 'drop table omimAv' hgsql hg18 < ~/src/hg/lib/omimAv.sql hgsql hg18 -e 'load data local infile "omimAv.tab" into table omimAv ignore 1 lines' hgsql hg18 -e 'update omimAv set repl2 = rtrim(ltrim(repl2))' doOmimAv hg18 omimAvRepl.tab 2>j.err hgsql hg18 -e "drop table omimAvRepl" hgsql hg18 < ~/kent/src/hg/lib/omimAvRepl.sql hgsql hg18 -e 'load data local infile "omimAvRepl.tab" into table omimAvRepl' rm j1.2 j1 j2 j3 j4 j4-2 j4.1 j4.2 j5 hgsql hg18 -N -e 'select chrom, chromStart, chromEnd, avId from omimAvRepl r, snp130 s where s.name = dbSnpId order by avId' >omimAvSnp.tab hgLoadBed -allowStartEqualEnd hg18 omimAvSnp omimAvSnp.tab ############################################################## # build the omimLocation track cd /hive/data/genomes/hg18/bed/omim/05172011 mkdir location cd location doOmimLocation hg18 omimLocation.bed 2>j.err hgLoadBed hg18 omimLocation omimLocation.bed # Remove all gene entries in omimGene2 from omimLocation table hgsql hg18 -N -e \ 'delete from omimLocation where name in (select name from omimGene2) ' # Per OMIM request, delete all the gray entries in omimLocation table. 
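    # ("Gray" items are presumably those whose OMIM id has no phenotype class
    # 1-4 in omimPhenotype, which is what leaves them colored gray in the
    # browser.)  The j.* bookkeeping below removes exactly those items; a
    # compact equivalent, assuming the same omimLocation/omimPhenotype schema,
    # would be the single statement sketched here (the j.* route is what was
    # actually run):
    #   hgsql hg18 -e 'delete from omimLocation where name not in \
    #       (select omimId from omimPhenotype where phenotypeClass between 1 and 4)'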
mkdir cleanUpOmimLocation cd cleanUpOmimLocation hgsql hg18 -N -e \ 'select distinct name from omimLocation' |sort -u >j.all hgsql hg18 -N -e \ 'select distinct name from omimLocation, omimPhenotype where name=omimId and phenotypeClass=1' >j.1 hgsql hg18 -N -e \ 'select distinct name from omimLocation, omimPhenotype where name=omimId and phenotypeClass=2' >j.2 hgsql hg18 -N -e \ 'select distinct name from omimLocation, omimPhenotype where name=omimId and phenotypeClass=3' >j.3 hgsql hg18 -N -e \ 'select distinct name from omimLocation, omimPhenotype where name=omimId and phenotypeClass=4' >j.4 cat j.1 j.2 j.3 j.4 |sort -u >j.1234 diff j.all j.1234 |grep "<" |sed -e 's/doall cat << '_EOF_' > do1 hgsql hg18 -e "delete from omimLocation where name='${1}'" '_EOF_' # << emacs ./doall ############################################################################# # adding new decode data (WORKING - 2011-07-26 - Hiram) mkdir /hive/data/outside/decode cd /hive/data/outside/decode wget --timestamping "http://www.decode.com/addendum/Maps.zip" unzip Maps.zip # produces a Maps directory mkdir /hive/data/outside/decode/hg18 cd /hive/data/outside/decode/hg18 # extract the data from the 10Kb bin recombination maps, # constructing bedGraph files for F in female female_carrier female_noncarrier \ male male_carrier male_noncarrier \ sex-averaged sex-averaged_carrier sex-averaged_noncarrier do ls -og ../Maps/${F}.rmap grep -v stdrate ../Maps/${F}.rmap | awk ' { printf "%s\t%d\t%d\t%s\n", $1, $2-5000, $2+5000, $4 }' > ${F}.bedGraph bedGraphToBigWig ${F}.bedGraph /hive/data/genomes/hg18/chrom.sizes \ ${F}.bw done # load the bigWig files into SQL table name friendly tables: for C in female female_carrier female_noncarrier male male_carrier \ male_noncarrier sex-averaged sex-averaged_carrier sex-averaged_noncarrier do N=${C} case ${C} in female) N="Female" ;; female_carrier) N="FemaleCarrier" ;; female_noncarrier) N="FemaleNonCarrier" ;; male) N="Male" ;; male_carrier) N="MaleCarrier" ;; male_noncarrier) N="MaleNonCarrier" ;; sex-averaged) N="SexAveraged" ;; sex-averaged_carrier) N="SexAveragedCarrier" ;; sex-averaged_noncarrier) N="SexAveragedNonCarrier" ;; esac echo $C $N rm -f /gbdb/hg18/decode/${C}.bw /gbdb/hg18/decode/${N}.bw ln -s `pwd`/${C}.bw /gbdb/hg18/decode/${N}.bw hgsql -e "drop table decode${N};" hg18 hgBbiDbLink hg18 decode${N} /gbdb/hg18/decode/${N}.bw done # compute male - female difference awk '{printf "%s_%d_%d\t%s\n", $1, $2, $3, $4}' male.bedGraph \ | sort > ordered.male.txt awk '{printf "%s_%d_%d\t%s\n", $1, $2, $3, $4}' female.bedGraph \ | sort > ordered.female.txt join ordered.male.txt ordered.female.txt > maleFemale.txt awk '{printf "%s\t%.6f\n", $1, $2-$3}' maleFemale.txt \ | sed -e "s/_/\t/g" | sort -k1,1 -k2,2n > maleFemale.bedGraph # and hot spots awk '$4 > 9.99' female.bedGraph > hotSpotFemale.bed awk '$4 > 9.99' male.bedGraph > hotSpotMale.bed hgLoadBed hg18 decodeHotSpotFemale hotSpotFemale.bed hgLoadBed hg18 decodeHotSpotMale hotSpotMale.bed bedGraphToBigWig maleFemale.bedGraph /hive/data/genomes/hg18/chrom.sizes \ MaleFemaleDifference.bw ln -s `pwd`/MaleFemaleDifference.bw /gbdb/hg18/decode/ hgsql -e "drop table decodeMaleFemaleDifference;" hg18 hgBbiDbLink hg18 decodeMaleFemaleDifference /gbdb/hg18/decode/MaleFemaleDifference.bw ############################################################################# # HapMap recombination maps added to deCODE track (DONE - 2011-08-30 - Hiram) mkdir /hive/data/genomes/hg18/bed/hapmap/release24 cd /hive/data/genomes/hg18/bed/hapmap/release24 
wget --timestamping \ https://mathgen.stats.ox.ac.uk/wtccc-software/recombination_rates/genetic_map_b36_CEU.tgz wget --timestamping \ https://mathgen.stats.ox.ac.uk/wtccc-software/recombination_rates/genetic_map_b36_YRI.tgz wget --timestamping \ https://mathgen.stats.ox.ac.uk/wtccc-software/recombination_rates/genetic_map_b36_combined.tgz mkdir CEU cd CEU tar xvzf ../genetic_map_b36_CEU.tgz cd .. mkdir YRI cd YRI tar xvzf ../genetic_map_b36_YRI.tgz cd .. mkdir combined cd combined tar xvzf ../genetic_map_b36_combined.tgz for F in CEU/genetic_map_*.txt do C=`basename $F | sed -e "s/genetic_map_chr//; s/_CEU_b36.txt//"` grep -v "^position" ${F} | awk -v chr="chr${C}" ' BEGIN {prev = 0} { printf "%s\t%d\t%d\t%s\n", chr, prev, $1, $2; prev = $1 } ' done | sort -k1,1 -k2,2n > hapMapRelease24CEURecombMap.bedGraph for F in YRI/genetic_map_*.txt do C=`basename $F | sed -e "s/genetic_map_chr//; s/_YRI_b36.txt//"` grep -v "^position" ${F} | awk -v chr="chr${C}" ' BEGIN {prev = 0} { printf "%s\t%d\t%d\t%s\n", chr, prev, $1, $2; prev = $1 } ' done | sort -k1,1 -k2,2n > hapMapRelease24YRIRecombMap.bedGraph for F in combined/genetic_map_*.txt do C=`basename $F | sed -e "s/genetic_map_chr//; s/_combined_b36.txt//"` grep -v "^position" ${F} | awk -v chr="chr${C}" ' BEGIN {prev = 0} { printf "%s\t%d\t%d\t%s\n", chr, prev, $1, $2; prev = $1 } ' done | sort -k1,1 -k2,2n > hapMapRelease24CombinedRecombMap.bedGraph for F in hapMapRelease24CEURecombMap hapMapRelease24CombinedRecombMap \ hapMapRelease24YRIRecombMap do bedGraphToBigWig -verbose=2 ${F}.bedGraph \ /hive/data/genomes/hg18/chrom.sizes ${F}.bw > ${F}.log 2>&1 done for T in hapMapRelease24CEURecombMap hapMapRelease24CombinedRecombMap \ hapMapRelease24YRIRecombMap do rm -f /gbdb/hg18/decode/${T}.bw ln -s `pwd`/${T}.bw /gbdb/hg18/decode/${T}.bw hgsql -e "drop table ${T};" hg18 hgBbiDbLink hg18 ${T} /gbdb/hg18/decode/${T}.bw done ############################################################################# # HI SEQ DEPTH (DONE 7/15/11 angie) mkdir /hive/data/genomes/hg18/bed/hiSeqDepth cd /hive/data/genomes/hg18/bed/hiSeqDepth foreach cov (001 005 01 05 1) wget --timestamp http://eqtl.uchicago.edu/Masking/seq.cov$cov.bed.gz gunzip -N seq.cov$cov.bed.gz end wc -l seq.cov* # 553 seq.cov001.bed # 1301 seq.cov005.bed # 2187 seq.cov01.bed # 18369 seq.cov05.bed # 34359 seq.cov1.bed foreach cov (001 005 01 05 1) echo seq.cov$cov.bed featureBits -countGaps hg18 seq.cov$cov.bed end #seq.cov001.bed #57409 bases of 3107677273 (0.002%) in intersection #seq.cov005.bed #183848 bases of 3107677273 (0.006%) in intersection #seq.cov01.bed #362423 bases of 3107677273 (0.012%) in intersection #seq.cov05.bed #3462959 bases of 3107677273 (0.111%) in intersection #seq.cov1.bed #Coordinate out of allowed range [0,135374737) for chr10 near line 6826 of seq.cov1.bed # I edited line 6826 of seq.cov1.bed to end with 135374737 not 135374744 featureBits -countGaps hg18 seq.cov$cov.bed #6466376 bases of 3107677273 (0.208%) in intersection # Sanity check: verify that the smaller ones are strict subsets of larger: featureBits hg18 -countGaps seq.cov001.bed \!seq.cov005.bed featureBits hg18 -countGaps seq.cov005.bed \!seq.cov01.bed featureBits hg18 -countGaps seq.cov01.bed \!seq.cov05.bed featureBits hg18 -countGaps seq.cov05.bed \!seq.cov1.bed # Yep, all got 0 bases: #0 bases of 3107677273 (0.000%) in intersection # Hmm, some overlap w/gap track: featureBits hg18 -countGaps seq.cov1.bed gap -bed=gapOverlaps.bed #477 bases of 3107677273 (0.000%) in intersection # Load tables: hgLoadBed 
hg18 hiSeqDepthTopPt1Pct seq.cov001.bed #Loaded 553 elements of size 3 hgLoadBed hg18 hiSeqDepthTopPt5Pct seq.cov005.bed #Loaded 1301 elements of size 3 hgLoadBed hg18 hiSeqDepthTop1Pct seq.cov01.bed #Loaded 2187 elements of size 3 hgLoadBed hg18 hiSeqDepthTop5Pct seq.cov05.bed #Loaded 18369 elements of size 3 hgLoadBed hg18 hiSeqDepthTop10Pct seq.cov1.bed #Loaded 34359 elements of size 3 # Compare with Terry Furey's blacklisted regions for ENCODE # http://encodewiki.ucsc.edu/EncodeDCC/index.php/Blacklist_of_problematic_genomic_regions # http://hgdownload-test.cse.ucsc.edu/goldenPath/hg18/encodeDCC/wgEncodeMapability/wgEncodeDukeRegionsExcluded.bed6.gz featureBits -countGaps hg18 \ /hive/data/genomes/hg18/bed/encodeBlacklist/hg18.wgEncodeDukeRegionsExcluded.bed #10366850 bases of 3107677273 (0.334%) in intersection foreach cov (001 005 01 05 1) featureBits -countGaps hg18 seq.cov$cov.bed -enrichment \ /hive/data/genomes/hg18/bed/encodeBlacklist/hg18.wgEncodeDukeRegionsExcluded.bed end # Watch the coverage of seq*bed by Terry's regions drop as $cov increases: #seq.cov001.bed 0.002%, /hive/data/genomes/hg18/bed/encodeBlacklist/hg18.wgEncodeDukeRegionsExcluded.bed 0.334%, both 0.002%, cover 94.75%, enrich 284.02x #seq.cov005.bed 0.006%, /hive/data/genomes/hg18/bed/encodeBlacklist/hg18.wgEncodeDukeRegionsExcluded.bed 0.334%, both 0.006%, cover 93.43%, enrich 280.06x #seq.cov01.bed 0.012%, /hive/data/genomes/hg18/bed/encodeBlacklist/hg18.wgEncodeDukeRegionsExcluded.bed 0.334%, both 0.010%, cover 87.56%, enrich 262.49x #seq.cov05.bed 0.111%, /hive/data/genomes/hg18/bed/encodeBlacklist/hg18.wgEncodeDukeRegionsExcluded.bed 0.334%, both 0.038%, cover 34.48%, enrich 103.35x #seq.cov1.bed 0.208%, /hive/data/genomes/hg18/bed/encodeBlacklist/hg18.wgEncodeDukeRegionsExcluded.bed 0.334%, both 0.050%, cover 24.02%, enrich 71.99x ############################################################################################ # CREATE TABLES AND .PNG PICTURE FILES OF evofoldV2 RNA STRUCTURES. (DONE, 7/26/2011, Fan) ssh hgwdev mkdir -p /hive/data/genomes/hg18/bed/evofoldV2/doEvoFoldV2 cd /hive/data/genomes/hg18/bed/evofoldV2/doEvoFoldV2 wget http://moma.ki.au.dk/~jsp/upload/evofoldV2.hg18.bed cat ~/kent/src/hg/lib/evofold.sql | \ sed -e "s/evofold/evofoldV2/g" > tmp.sql hgLoadBed –notItemRgb -sqlTable=tmp.sql hg18 evofoldV2 evofoldV2.hg18.bed # Creaet sub-directories to store .png files (total of about 47.5 K of them) sparated by chromosomes. 
mkdir -p evoFoldV2/chr1 mkdir -p evoFoldV2/chr10 mkdir -p evoFoldV2/chr11 mkdir -p evoFoldV2/chr12 mkdir -p evoFoldV2/chr13 mkdir -p evoFoldV2/chr14 mkdir -p evoFoldV2/chr15 mkdir -p evoFoldV2/chr16 mkdir -p evoFoldV2/chr17 mkdir -p evoFoldV2/chr18 mkdir -p evoFoldV2/chr19 mkdir -p evoFoldV2/chr2 mkdir -p evoFoldV2/chr20 mkdir -p evoFoldV2/chr21 mkdir -p evoFoldV2/chr22 mkdir -p evoFoldV2/chr3 mkdir -p evoFoldV2/chr4 mkdir -p evoFoldV2/chr5 mkdir -p evoFoldV2/chr6 mkdir -p evoFoldV2/chr7 mkdir -p evoFoldV2/chr8 mkdir -p evoFoldV2/chr9 mkdir -p evoFoldV2/chrM mkdir -p evoFoldV2/chrX mkdir -p evoFoldV2/chrY # get latest verion of the .jar file of VARNA wget --timestamping http://varna.lri.fr/bin/VARNAv3-7.jar # Create Java command line files echo 'doEvoFoldV2 hg18 do$1 $1' >do1Chrom chmod +x do1Chrom do1Chrom chr1 do1Chrom chr10 do1Chrom chr11 do1Chrom chr12 do1Chrom chr13 do1Chrom chr14 do1Chrom chr15 do1Chrom chr16 do1Chrom chr17 do1Chrom chr18 do1Chrom chr19 do1Chrom chr2 do1Chrom chr20 do1Chrom chr21 do1Chrom chr22 do1Chrom chr3 do1Chrom chr4 do1Chrom chr5 do1Chrom chr6 do1Chrom chr7 do1Chrom chr8 do1Chrom chr9 do1Chrom chrM do1Chrom chrX do1Chrom chrY chmod +x do* # run the dochrXX command files in small batches with '&' to exploit multiple CPU # wait an hour for each batch to finish so that we don't suck in too much computational resources. dochr1 & dochr2 & dochr3 & dochr4 & dochr5 & sleep 3600 dochr6 & dochr7 & dochr8 & dochr9 & dochr10 & sleep 3600 dochr11 & dochr12 & dochr13 & dochr14 & dochr15 & sleep 3600 dochr16 & dochr17 & dochr18 & dochr19 & dochr20 & sleep 3600 dochr21 & dochr22 & dochrX & dochrY & dochrM & # check the resulting .png files # create a simple script file, check1, with the following 3 lines: echo $1 hgsql hg18 -N -e "select count(*) from evofoldV2 where chrom='${1}'" ls evoFoldV2/$1/*.png|wc chmod +x check1 # create another script file, checkAll, with the following lines: check1 chr1 check1 chr10 check1 chr11 check1 chr12 check1 chr13 check1 chr14 check1 chr15 check1 chr16 check1 chr17 check1 chr18 check1 chr19 check1 chr2 check1 chr20 check1 chr21 check1 chr22 check1 chr3 check1 chr4 check1 chr5 check1 chr6 check1 chr7 check1 chr8 check1 chr9 check1 chrM check1 chrX check1 chrY chmod +x checkAll checkAll >j.check # examing the resuls in j.check to make sure things are OK. # create symbolic links mkdir -p /usr/local/apache/htdocs/evoFoldV2 ln -s /hive/data/genomes/gs.19/build36/bed/evofoldV2/doEvoFoldV2/evoFoldV2 /gbdb/hg18/evoFoldV2 ln -s /gbdb/hg18/evoFoldV2 /usr/local/apache/htdocs/evoFoldV2/hg18 ############################################################################ # GENEREVIEWS TRACK (DONE 2011-09-22 - Chin) # This track depends on some tasks completed for hg19, specifically: # # ~/kent/src/hg/lib/geneReviewsBB.sql # ~/kent/src/hg/lib/geneReviewsBed5.as # ~/kent/src/hg/lib/geneReviewsRefGene.sql # ~/kent/src/hg/makeDb/trackDb/human/geneReviews.html # ~/kent/src/utils/geneReviews/addGeneReviewToBed.pl # # and data fetched from NCBI: # /hive/data/outside/ncbi/geneReviews/current/disease_gene_GR.txt # /hive/data/genomes/hg19/bed/geneReviews/grRefGeneData.tab # /hive/data/genomes/hg19/bed/geneReviews/grRefGene.lst # # Refer to GENEREVIEWS TRACK section in hg19.txt for details mkdir /hive/data/genomes/hg18/bed/geneReviews cd /hive/data/genomes/hg18/bed/geneReviews cp /hive/data/outside/ncbi/geneReviews/current/disease_gene_GR.txt . cp /hive/data/genomes/hg19/bed/geneReviews/grRefGeneData.tab . 
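    # (Hedged sanity check, not part of the original steps.)  These inputs are
    # reused from the hg19 build; the mapping itself is assembly-independent,
    # but any gene symbol that does not resolve in hg18's kgXref will just
    # produce an empty temp.in in the per-gene loop further below.  After the
    # grRefGene.lst copy below, the unresolvable symbols can be listed with:
    hgsql hg18 -N -e 'select distinct geneSymbol from kgXref' | sort -u > kgSymbols.txt
    sort -u grRefGene.lst | comm -23 - kgSymbols.txt
    # anything printed here will be skipped (no hg18 coordinates).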
# load RefSeg Gene to geneReview mapping list to hg18 hgLoadSqlTab -warn hg18 geneReviewsRefGene \ $HOME/kent/src/hg/lib/geneReviewsRefGene.sql grRefGeneData.tab # Scanning through 1 files cp /hive/data/genomes/hg19/bed/geneReviews/grRefGene.lst . # for each refGen in grRefGene.lst, create a non-overlapping bed row. cat grRefGene.lst | while read G do echo ${G} hgsql hg18 -N -e \ "SELECT e.chrom,e.txStart,e.txEnd,j.geneSymbol \ FROM knownGene e, kgXref j WHERE e.alignID = j.kgID AND \ j.geneSymbol ='${G}' ORDER BY e.chrom,e.txStart;" > temp.in bedRemoveOverlap temp.in temp.out cat temp.out >> geneReviews.tab done rm temp.* # load the collapsed bed4 file to hg18, hgLoadBed hg18 geneReviews geneReviews.tab # Use addGeneReviewToBed.pl will add the geneReviews detail in html format to # the bed 4 file chmod +x $HOME/kent/src/utils/geneReviews/addGeneReviewToBed.pl # Add geneReview item in html format format as field 5 $HOME/kent/src/utils/geneReviews//addGeneReviewToBed.pl hg18 > hg18.geneReviews.bed5 # Convert to bigBed format /cluster/bin/x86_64/bedToBigBed -bedFields=4 -tabs \ -as=$HOME/kent/src/hg/lib/geneReviewsBed5.as hg18.geneReviews.bed5 \ /hive/data/genomes/hg18/chrom.sizes hg18.geneReviews.bb # upload the bigBed file to genomewiki /cluster/bin/scripts/gwUploadFile hg18.geneReviews.bb hg18.geneReviews.bb # # loading file: hg18.geneReviews.bb # # into Image name: Hg18.geneReviews.bb # # login name: chinhli # # siteUrl: genomewiki.ucsc.edu # # traceBackLimit: 0 # # traceBackLimit: 0 past site.Images # Image info: {u'comment': u'gwUploadFile upload', u'sha1': u'6f5009a367a6b4fdaa2739541680253bd183af12', u'url': u'http://genomewiki.ucsc.edu/images/c/cd/Hg18.geneReviews.bb', u'timestamp': u'2011-09-22T23:28:39Z', u'metadata': None, u'height': 0, u'width': 0, u'user': u'Chinhli', u'descriptionurl': u'http://genomewiki.ucsc.edu/index.php/File:Hg18.geneReviews.bb', u'size': 170249} # Image File:Hg18.geneReviews.bb usage: hgsql hg18 -e "source $HOME/kent/src/hg/lib/geneReviewsBB.sql;" hgsql hg18 -e 'insert into geneReviewsBB values ("http://genomewiki.ucsc.edu/images/c/cd/Hg18.geneReviews.bb")' ############################################################################# 2012-06-25 markd discovered that: /hive/data/genomes/gs.19/build36/bed/blastz.tupBel1/axtChain/hg18.tupBel1.net.gz was not actually compressed mv hg18.tupBel1.net.gz hg18.tupBel1.net gzip hg18.tupBel1.net #############################################################################
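    # (Follow-up sketch, not part of the original fix.)  To look for any other
    # "*.gz" files under the hg18 bed tree that are not actually
    # gzip-compressed, let file(1) report the real content type; anything not
    # showing "gzip compressed" is a candidate for the same mv/gzip treatment:
    find /hive/data/genomes/gs.19/build36/bed -name '*.gz' -type f \
        | xargs file | grep -v 'gzip compressed'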