# CREATE EMPTY DATABASE AND TABLES.
hgsql -e "create database visiGeneNew" mysql
hgsql visiGeneNew < ~/kent/src/hg/visiGene/visiGene.sql
makeTableDescriptions visiGeneNew ~/kent/src/hg/visiGene/visiGene.as

# LOAD PAUL GRAY/MAHONEY LAB DATA.
# Transferred images from Paul Gray's Mac to mine and converted
# his spreadsheet to a tab-separated file, cloning.tab.
cd ~/kent/src/hg/visiGene/vgLoadMahoney
vgLoadMahoney /gbdb/visiGene mm5 cloning.tab clonePcr.bed outDir
cd outDir
visiGeneLoad whole.ra whole.tab /dev/null -database=visiGeneNew
visiGeneLoad slices.ra slices.tab /dev/null -database=visiGeneNew

# LOAD JACKSON LABS DATA.
# First ask Galt to create a local copy of the Jackson labs
# database.  I'm not sure how he did it.
cd ~/kent/src/hg/visiGene/vgLoadJax
vgLoadJax /gbdb/visiGene jackson visiGene
./loadNew

# Update the privateUser fields where we don't have permissions by entering
# this at the mysql prompt.  It marks every jax* submissionSet whose journal
# is 'Nature' or has a name starting with 'Nat ' as privateUser = -1.
# (The trailing ';' is required for the mysql prompt to execute the statement.)
update submissionSet,journal set submissionSet.privateUser=-1
   where (journal.name like 'Nat %' or journal.name = 'Nature')
   and submissionSet.journal = journal.id and submissionSet.name like 'jax%';

# LOAD NIBB IMAGES
# Do this after creating the nibbImageProbe.fa file as described
# in makeXenTro1.doc, and after creating the nibbImageProbes table
# in hg17 as described in makeHg17.doc.  The image files are
# loaded in /cluster/store11/visiGene/offline/nibbFrog.
ssh kolossus
cd /cluster/store11/visiGene/offline
# Parse the nibbFrog image directory into nibbFrog.tab (rejects go to bad.tab).
# The table is named nibbFrog.tab so that it matches the path
# /cluster/store11/visiGene/offline/nibbFrog.tab passed to vgLoadNibb later on.
nibbParseImageDir nibbFrog nibbFrog.tab bad.tab
# Build the 200-pixel and full-size image trees from the parsed table.
nibbPrepImages nibbFrog nibbFrog.tab \
	/cluster/store11/visiGene/gbdb/200/inSitu/XenopusLaevis/nibb \
	/cluster/store11/visiGene/gbdb/full/inSitu/XenopusLaevis/nibb
# Note the nibbPrepImages step is a 2 day process, next time may
# want to run it on the kki cluster.  It does need to be run on a 64
# bit machine because of bugs in the 32 bit image magick convert program.

ssh hgwdev
cd ~/kent/src/hg/visiGene/vgLoadNibb
hgMapToGene hg17 nibbImageProbes knownGene knownToNibbImage

# Now go into the gene sorter on hg17, configure it to just show
# the name, genbank, and NIBB Xenopus columns.  Filter on * in the
# NIBB Xenopus column (which will get rid of rows with no data in that
# column).  Save the text output to names.raw.  Then get rid of names
# that are no more than genbank accessions, keeping columns 1 and 3.
# The result is written to names.tab, the file name handed to vgLoadNibb below:
awk '$1 != $2 {printf("%s\t%s\n", $1, $3);}' names.raw > names.tab

# Now create the .tab and .ra files as so:
vgLoadNibb /cluster/store11/visiGene/offline/nibbFrog \
	/cluster/store11/visiGene/offline/nibbFrog.tab \
	/cluster/data/xenTro1/bed/nibbPics/nibbImageProbes.fa \
	names.tab stage.tab outDir
visiGeneLoad outDir/nibb.ra outDir/nibb.tab /dev/null -database=visiGeneNew


# LOAD GENSAT IMAGES
# This was done with the assistance of Mike Dicuccio at NCBI, 
# dicuccio@ncbi.nlm.nih.gov.  If updating probably it's best to
# get in touch with him and make sure that the ftp site is up to
# date.   

# Download data from NCBI into /cluster/store11/visiGene/offline/gensat
cd /cluster/store11/visiGene/offline
mkdir gensat
cd gensat
mkdir RawData
cd RawData
wget --timeStamping ftp://ftp.ncbi.nih.gov/pub/gensat/RawData/GENSAT-20051120.xml.gz
wget --timeStamping ftp://ftp.ncbi.nih.gov/pub/gensat/RawData/NCBI_Gensat-20051120.dtd

# At this point if the dtd has changed you may need to remake 
# kent/src/hg/visiGene/gensat/lib/gs.c with autoXml.  Once
# this is done then do the download with gensatImageDownload.
# It'll take about 3 days. The results will be in the Institutions dir.
cd /cluster/store11/visiGene/offline/gensat
zcat RawData/GENSAT-20051120.xml.gz | gensatImageDownload . download.log

# Create parasol directory and a list of the jpg files.
ssh kki
cd /cluster/store11/visiGene/offline/gensat
mkdir prepImageRun
# Strip the leading "Institutions/" from each path found, so the list entries
# are relative to the Institutions directory given to vgPrepImage below.
# (The sed s command needs all three delimiters: s/pattern/replacement/.)
find Institutions -name '*.jpg' -print | sed 's/Institutions\///' > prepImageRun/jpg.lst
cd prepImageRun

# Create parasol batch.  The vgPrepImage command is one template line;
# $(path1) is filled in by gensub2 from each entry of jpg.lst.
cat << '_EOF_' > gsub
#LOOP
vgPrepImage /cluster/store11/visiGene/offline/gensat/Institutions /cluster/store11/visiGene/gbdb/200/inSitu/Mouse/gensat /cluster/store11/visiGene/gbdb/full/inSitu/Mouse/gensat $(path1)
#ENDLOOP
'_EOF_'
# << this line makes emacs coloring happy
gensub2 jpg.lst single gsub spec
para make spec

# Note the above procedure would take about 3 days.  I ended up copying the
# data over to /san/sanvol1, and doing it on the pita cluster.  The job
# there just took two hours, with just 100 cpus available.  It took
# an hour to copy the data over, and eight hours to copy it back though,
# and some tweaking.

# MAKE FULL TEXT INDEX
cd /cluster/store11/visiGene/gbdb
vgGetText visiGene.text mm7 hg17
ixIxx visiGene.text visiGene.ix visiGene.ixx


# (Galt 2006-02)
# RSYNC'd from /cluster/store11/visiGene to /san/sanvol1/visiGene
# and moved the /gbdb/visiGene symlink to point to the new location.
# I also had to manually run a script to find symlinks pointing from full/ over to 
# /cluster/store11/offline and remake them to point correctly to /san/sanvol1/visiGene/offline.

# Allen Brain Atlas jp2 image prep (Galt 2006-02-12)
# Create parasol directory and a list of the jp2 files (the list file is still named jpg.lst).
ssh pk
cd /san/sanvol1/visiGene/offline/allenBrain
mkdir prepImageRun
find imageDisk -name '*.jp2' -print | sed 's/imageDisk\///' > prepImageRun/jpg.lst
cd prepImageRun
# Create parasol batch
cat << '_EOF_' > gsub
#LOOP
vgPrepImage /san/sanvol1/visiGene/offline/allenBrain/imageDisk /san/sanvol1/visiGene/gbdb/200/inSitu/Mouse/allenBrain /san/sanvol1/visiGene/gbdb/full/inSitu/Mouse/allenBrain $(path1)
#ENDLOOP
'_EOF_'
# << this line makes emacs coloring happy
gensub2 jpg.lst single gsub spec
para make spec -maxNode=50

[pk:/san/sanvol1/visiGene/offline/allenBrain/prepImageRun> /parasol/bin/para time
11748 jobs in batch
4291 jobs (including everybody's) in Parasol queue.
Checking finished jobs
Completed: 11748 of 11748 jobs
CPU time in finished jobs:     474919s    7915.32m   131.92h    5.50d  0.015 y
IO & Wait Time:               5029116s   83818.60m  1396.98h   58.21d  0.159 y
Average job time:                 469s       7.81m     0.13h    0.01d
Longest running job:                0s       0.00m     0.00h    0.00d
Longest finished job:           41811s     696.85m    11.61h    0.48d
Submission to last job:        172301s    2871.68m    47.86h    1.99d


# -maxNode=50 was needed. 
# Note that because it opens up to 40 output files at the same time, it overwhelms NFS
# when a lot of nodes are running, and it can bring down the SAN.  Because I was nearly
# done when it came back up, I just re-pushed with -maxNode=50 to keep it under control.
# However, in the future, something like this should be done to keep the file access local
# as much as possible.
# Here is the proposed new way:
# -----------------------
# gensub2 template for the proposed scheme: each job runs ./vgPrep.csh with
# $(path1), $(root1) and $(file1) as its three arguments.
cat << '_EOF_' > gsub
#LOOP
./vgPrep.csh $(path1) $(root1) $(file1)
#ENDLOOP
'_EOF_'
# << this line makes emacs coloring happy

# vgPrep.csh -- per-image wrapper that points vgPrepImage's two output
# directories at node-local /scratch/tmp instead of the SAN.
#   $1 = $(path1), $2 = $(root1), $3 = $(file1)  (see the gsub template)
# Steps: make the destination directories on the SAN, copy the source image to
# /scratch/tmp/$3, run vgPrepImage writing into /scratch/tmp/vg200$2 and
# /scratch/tmp/vgfull$2, and copy those results back to the SAN gbdb dirs only
# when vgPrepImage's exit status is 0.  The scratch files are removed in either
# case, and the script exits 1 when vgPrepImage failed.
# NOTE(review): the copy in /scratch/tmp/$3 is not what vgPrepImage is told to
#   read -- its source dir argument is still the imageDisk directory on the SAN.
#   Confirm whether the local copy was meant to be the input.
# NOTE(review): mkdir -p is applied to .../allenBrain/$1, i.e. the whole
#   $(path1) value rather than its parent directory -- confirm this is intended.
cat << '_EOF_' > vgPrep.csh
#!/bin/tcsh
mkdir -p /san/sanvol1/visiGene/gbdb/200/inSitu/Mouse/allenBrain/$1
mkdir -p /san/sanvol1/visiGene/gbdb/full/inSitu/Mouse/allenBrain/$1
cp /san/sanvol1/visiGene/offline/allenBrain/imageDisk/$1 /scratch/tmp/$3
vgPrepImage /san/sanvol1/visiGene/offline/allenBrain/imageDisk /scratch/tmp/vg200$2 /scratch/tmp/vgfull$2 $1
set err = $status
if (! $err ) then
    cp -r /scratch/tmp/vg200$2/* /san/sanvol1/visiGene/gbdb/200/inSitu/Mouse/allenBrain
    cp -r /scratch/tmp/vgfull$2/* /san/sanvol1/visiGene/gbdb/full/inSitu/Mouse/allenBrain
endif
rm -f  /scratch/tmp/$3
rm -fr /scratch/tmp/vg200$2
rm -fr /scratch/tmp/vgfull$2
if ( $err ) then
    exit 1
endif
'_EOF_'
# << this line makes emacs coloring happy

# -----------------------

# ADDED TWO ADDITIONAL ZOOM-OUT LEVELS 5 AND 6:
# Ran /san/sanvol1/offline/level56Run/ cluster job on a list of all files dumped
#  from the visiGene.imageFile table so that we made new zoom out levels 5 and 6
#  for all pictures.  Since it was a special one-time deal, I just used ImageMagick.
# vgPrepImage.c has been modified to do the 2 new zoomout levels so that they
#  will be built automatically in future.

# Ran several checks to make sure no files were missing, fixed any errors.
# Found embedded space in some nibb filenames, found a couple of gensat 
# images which had previously failed to download and redownloaded them ok.
# Found a few missing things and 0 bytes jpgs and re-ran them.  
# It should be pretty clean right now.

# LOAD ALLEN BRAIN DATA
vgLoadAllen \
 /san/sanvol1/visiGene/gbdb/full/inSitu/Mouse/allenBrain \
 /san/sanvol1/visiGene/offline/allenBrain/probesAndData/allen20051021.tab \
 /cluster/data/mm7/bed/allenBrain/allProbes.fa \
 /cluster/data/mm7/bed/allenBrain/allProbes.tab \
 output
#backed-up data in case of trouble:
mkdir /san/sanvol1/visiGene/dump/visiGene.20060220
hgsqldump visiGene -T /san/sanvol1/visiGene/dump/visiGene.20060220
#load into visiGene db
visiGeneLoad -database=visiGene output/aba.ra output/aba.tab /dev/null

# Manually added several researchers names to the contributor and submissionContributor tables
# at the request of Susan Sunkin as well as updating the text for contributor, copyright, acknowledgements.
# I manually also updated aba.ra and vgLoadAllen.c to reflect her changes.  The manual mods
# to contributor which work great in the visiGene search are not currently automatically
# supported, and would thus be lost if we ever nuke it and start fresh.
# At some point, we will probably add an additional field to the .ra structure
# and have visiGeneLoad support it.

# RE-MAKE FULL TEXT INDEX
# (absolute path, so this works regardless of the current directory)
cd ~/kent/src/hg/visiGene/vgGetText
make alpha
# basically does this, but puts it in cgi-bin/visiGeneData/:
#vgGetText visiGene.text mm7 hg17
#ixIxx visiGene.text visiGene.ix visiGene.ixx
# (hgVisiGene cgi v128 now knows about this new location)

############################

# REBUILD WITH NEW vgProbeTrack PROGRAM - AFTER ADDING ALLEN BRAIN (DONE galt 2009-10-12)

# (make a backup of visiGene db and these tables: {mm6,mm7,...}.vgProbes and {hg17,hg18,...}.vgAllProbes
cd /hive/data/inside/visiGene/dump
# (this backup shown is really an example template for the next person who needs to do this)
mkdir visiGene.20060315
cd visiGene.20060315
hgsqldump visiGene -T .
mkdir mm6; hgsqldump mm6 vgProbes -T mm6
mkdir mm7; hgsqldump mm7 vgProbes -T mm7
mkdir mm8; hgsqldump mm8 vgProbes -T mm8
mkdir mm9; hgsqldump mm9 vgProbes -T mm9
mkdir hg17; hgsqldump hg17 vgAllProbes -T hg17
mkdir hg18; hgsqldump hg18 vgAllProbes -T hg18
#(do any others needed that might not be listed here)
#(document the reason for making the backup)
echo 'vgLoadAllenBrain has been run, so making backup of visiGene db and probe tracks before updating, ' > README

# OK, NOW USE vgProbeTrack TO UPDATE

cd ~/kent/src/hg/visiGene/vgProbeTrack

# -sqlPath must be included whenever the vgProbes or vgAllProbes track tables do not yet exist for the db
# so it can find the .sql script to create vgProbes or vgAllProbes tables as needed.
# I happen to know that only AllenBrain was updated since last time, and that is mouse only

# populate vgPrb with any new stuff in visiGene.probe (works for all taxons at once).
vgProbeTrack POP

# find sequence using various methods - given probe seq, primers, bacs, refseq, etc.
#  must specify a specific assembly to use, so just using mm7 since mm8 still in qa.
#  this finds any stuff for the mouse taxon
vgProbeTrack SEQ working mm7  

# create alignments using either refSeqAli or all_mrna or bacEnds or blat.  Took 1.5 hours.  
# alignments are individually tracked per assembly here
# alignment successes go in $db.vgProbes psl track, and whether succeeded or failed,
# it only looks for things that have not already attempted alignment
# the status goes into visiGene.vgPrbAli with .db="mm7"
# because mm7.vgProbes is a new table, to create it we include the -sqlPath so
# it can find the vgProbes.sql script
vgProbeTrack ALI working mm7 -sqlPath=..

# this finds any seq required for mm7.vgProbes track not already in mm7.seq 
# adds the new .fa file in /cluster/data/mm7/bed/visiGene/
# adds a symlink to it in /gbdb/mm7/visiGene/
# and runs hgLoadSeq mm7 /gbdb/mm7/visiGene/vgPrbExt_??????.fa to add it to mm7.seq
vgProbeTrack EXT working mm7

# mm6.vgProbes was already complete from previous probe track creation, 
#  it just needed to catch the new Allen Brain probes and align them.  About 1.5 hours.
vgProbeTrack ALI working mm6
vgProbeTrack EXT working mm6

# hg17.vgAllProbes was pre-existing with all probes, just need to add new allenBrain mouse
# this internally uses pslMap against the mm7 to hg17 liftover chain.gz
# Because it is "Xeno" (from mouse to human), it creates track vgAllProbes,
# and maintains the list of processed alignments in visiGene.vgPrbAliAll.
vgProbeTrack PSLMAP working hg17 mm7  
# updates hg17.seq/extFile similarly to the EXT command, but for All probes.
# just like with EXT, EXTALL puts .fa in /cluster/data/hg17/visiGene
# and symlink in /gbdb/hg17/visiGene and updates using hgLoadSeq.
# if a sequence has already been loaded it will not be loaded again.
vgProbeTrack EXTALL working hg17

# hg18.vgAllProbes never existed before
vgProbeTrack PSLMAP working hg18 mm7  -sqlPath=..
# because the nibb blatz probe track hg18.nibbImageProbes was never done on hg18 
# until just now (see makeHg18.doc), we have to add it for the first time.
# "nibb" is not really a db here, so I manually put in a taxon mapping for it, 
# so it appears as Xenopus laevis 8355, see the source code.
vgProbeTrack REMAP working hg18 nibb nibbImageProbes /gbdb/hg18/nibbImageProbes.fa  
vgProbeTrack EXTALL working hg18

# mm8 is in qa and so it is basically ready to use now.  About 1.5 hours.
vgProbeTrack ALI working mm8  -sqlPath=..
vgProbeTrack EXT working mm8


# RE-MAKE knownToVisiGene tables (see respective makedocs for these)
#knownToVisiGene mm6
#knownToVisiGene mm7
#knownToVisiGene mm8
#knownToVisiGene hg17 -fromProbePsl=vgAllProbes
#knownToVisiGene hg18 -fromProbePsl=vgAllProbes

############################

###  JACKSON UPDATE (done 2006-04-01 galt)  #############

# updated jackson20060328 db on kkr3u00 (see hg/visiGene/jackson/makeJackson.doc)

# Dropped old visiGeneOld db, asked Heather to clone visiGene db to visiGeneOld db,
# and then ran this query to remove the old previous JAX info:
# MULTI-TABLE DELETE:
delete submissionSource, submissionSet, submissionContributor, image,
imageProbe, expressionLevel, imageFile from
submissionSource so,
submissionSet ss,
submissionContributor sc,
image i,
imageProbe ip,
expressionLevel el,
imageFile f
where so.id = 2
and ss.submissionSource = so.id
and sc.submissionSet = ss.id
and i.submissionSet = ss.id
and ip.image = i.id
and el.imageProbe = ip.id
and f.submissionSet = ss.id;

#delete query (get rid of all submissionSource.id=2)
#Query OK, 164717 rows affected (48 min 16.07 sec)

# Workaround for uniProt access from kkr3u00
ssh hgwdev
setenv jdb jackson20060328
cd ~/kent/src/hg/visiGene/vgLoadJax
hgsqldump uniProt taxon commonName -T .
ssh kkr3u00
setenv jdb jackson20060328
cd ~/kent/src/hg/visiGene/vgLoadJax
hgsql mysql -e "create database uniProt"
hgsql uniProt < taxon.sql
hgsql uniProt < commonName.sql
# hgsql uniProt -e 'show tables'
hgsql uniProt -e "load data local infile 'taxon.txt' into table taxon"
hgsql uniProt -e "load data local infile 'commonName.txt' into table commonName"
# hgsql uniProt -e 'show table status\G'
# cleanup
rm taxon.*
rm commonName.*

#update vgLoadJax.c to update the date given in .ra acknowledgements
#recompile vgLoadJax on dev
#run vgLoadJax to create .ra .tab .txt for each submissionSet
ssh kkr3u00
setenv jdb jackson20060328
cd ~/kent/src/hg/visiGene/vgLoadJax
#remove any old data dir
rm -fr visiGene/
# visiGene in line below is just an output dir for the .ra/.tab/.txt files
~/bin/i386/vgLoadJax /san/sanvol1/visiGene/gbdb jackson20060328 visiGene
#ref 32185: missing title from BIB_Refs, ref skipped
#Calculating age from postnatal
#ref 67768: missing title from BIB_Refs, ref skipped
#Calculating age from postnatal month 3
#Calculating age from postnatal
#Calculating age from postnatal
#Calculating age from postnatal month 4
#Calculating age from postnatal month 4
#Calculating age from Not Specified 12.5
#refCount=2970


#ran loadAll to load the updated jax .ra .tab .txt into visiGene db
ssh hgwdev
cd ~/kent/src/hg/visiGene/vgLoadJax
loadAll
#loadAll.output has 1112 lines like
#visiGene/100423.ra

# ran vgGetText to update cgi-bin-galt/visiGeneData/ using visiGene db
cd ~/kent/src/hg/visiGene/vgGetText
make alpha
# output:
#vgGetText /usr/local/apache/cgi-bin/visiGeneData/visiGene.text mm7 hg17
#probe has 19276 rows
#gene has 15173 rows
#imageProbe has 115500 rows

# recompiled hgVisiGene 


############################

# REBUILD PROBETRACK   (DONE galt 2006-04-04)
#    WITH vgProbeTrack PROGRAM - AFTER DOING JAX UPDATE 20060328 

# (make a backup of visiGene db and these tables: {mm6,mm7,...}.vgProbes and {hg17,hg18,...}.vgAllProbes
cd /san/sanvol1/visiGene/dump
# (this backup shown is really an example template for the next person who needs to do this)
mkdir visiGene.20060404
cd visiGene.20060404
hgsqldump visiGene -T .
mkdir mm6; hgsqldump mm6 vgProbes -T mm6
mkdir mm7; hgsqldump mm7 vgProbes -T mm7
mkdir mm8; hgsqldump mm8 vgProbes -T mm8
mkdir hg17; hgsqldump hg17 vgAllProbes -T hg17
mkdir hg18; hgsqldump hg18 vgAllProbes -T hg18
#(do any others needed that might not be listed here)
#(document the reason for making the backup)
echo 'vgLoadJax jackson20060328 has been run, so making backup of visiGene db and probe tracks before updating probeTracks, ' > README

# OK, NOW USE vgProbeTrack TO UPDATE

cd ~/kent/src/hg/visiGene/vgProbeTrack

# Make sure vgProbeTrack program is up to date
make

# -sqlPath must be included whenever the vgProbes or vgAllProbes track tables do not yet exist for the db
# so it can find the .sql script to create vgProbes or vgAllProbes tables as needed.
# I happen to know that only JAX was updated since last time, and that is mouse only

# populate vgPrb with any new stuff in visiGene.probe (works for all taxons at once).
vgProbeTrack POP
#new probe records found = 1285, # new vgPrb records added = 1285
#   most of these are old, but we updated JAX by dropping completely and re-adding
#   so these probes find their way back via sequence identity of probes in vgPrb.sequence

# find sequence using various methods - given probe seq, primers, bacs, refseq, etc.
#  must specify a specific assembly to use, so mm7 is ready to use now, mm8 still in qa.
#  this finds any stuff for the mouse taxon
vgProbeTrack SEQ working mm7

#rc = 0 = count of primers for mrna search for taxon 10090
#rc = 0 = count of primers for genome search for taxon 10090
#bac list read done.
#found seq for 0 bacEndPairs
#rc = 549 = count of refSeq mrna for mm7
#rc = 18 = count of genRef mrna for mm7
#rc = 33 = count of genbank mrna for mm7
#rc = 428 = count of flatRef mrna for mm7
#rc = 0 = count of flatAll mrna for mm7
#rc = 1 = count of linkRef mrna for mm7
#rc = 0 = count of linkAll mrna for mm7
#rc = 1 = count of kgAlRef mrna for mm7
#rc = 37 = count of kgAlAll mrna for mm7


# create alignments using either refSeqAli or all_mrna or bacEnds or blat.  Took 1.5 hours.  
# alignments are individually tracked per assembly here
# alignment successes go in $db.vgProbes psl track, and whether succeeded or failed,
# it only looks for things that have not already attempted alignment
# the status goes into visiGene.vgPrbAli with .db="mm7"
vgProbeTrack ALI working mm7

# this finds any seq required for mm7.vgProbes track not already in mm7.seq 
# adds the new .fa file in /cluster/data/mm7/bed/visiGene/
# adds a symlink to it in /gbdb/mm7/visiGene/
# and runs hgLoadSeq mm7 /gbdb/mm7/visiGene/vgPrbExt_??????.fa to add it to mm7.seq
vgProbeTrack EXT working mm7

# mm6.vgProbes was already complete from previous probe track creation, 
#  it just needed to catch the new JAX probes and align them.  About 1.5 hours.
vgProbeTrack ALI working mm6
vgProbeTrack EXT working mm6

# hg17.vgAllProbes was pre-existing with all probes, just need to add the new JAX mouse probes
# this internally uses pslMap against the mm7 to hg17 liftover chain.gz
# Because it is "Xeno" (from mouse to human), it creates track vgAllProbes,
# and maintains the list of processed alignments in visiGene.vgPrbAliAll.
vgProbeTrack PSLMAP working hg17 mm7  
# updates hg17.seq/extFile similarly to the EXT command, but for All probes.
# just like with EXT, EXTALL puts .fa in /cluster/data/hg17/visiGene
# and symlink in /gbdb/hg17/visiGene and updates using hgLoadSeq.
# if a sequence has already been loaded it will not be loaded again.
vgProbeTrack EXTALL working hg17

# hg18.vgAllProbes existed before
vgProbeTrack PSLMAP working hg18 mm7 
vgProbeTrack EXTALL working hg18

# mm8 is in qa and so it is basically ready to use now.
vgProbeTrack ALI working mm8
vgProbeTrack EXT working mm8


# RE-MAKE knownToVisiGene tables (see respective makedocs for these)
knownToVisiGene mm6
knownToVisiGene mm7
knownToVisiGene mm8
knownToVisiGene hg17 -fromProbePsl=vgAllProbes
knownToVisiGene hg18 -fromProbePsl=vgAllProbes

# update text/index for visiGene
# (absolute path: the current directory at this point is
#  ~/kent/src/hg/visiGene/vgProbeTrack, so a relative hg/visiGene/... would not resolve)
cd ~/kent/src/hg/visiGene/vgGetText
make alpha
#vgGetText /usr/local/apache/cgi-bin/visiGeneData/visiGene.text mm7 hg17
#probe has 19276 rows
#gene has 15173 rows
#imageProbe has 115500 rows


############################
#
# Patch contributors so we can search MGI submission sets 
#  by specifying JAX or MGI in the search box.
#

select id from submissionSource where name = 'MGI';
+----+
| id |
+----+
| 6  |
+----+

# note: we have to double the search word or else the search doesn't work
insert into contributor set name = 'JAX JAX';
insert into contributor set name = 'MGI MGI';
mysql> select * from contributor where name in ('JAX JAX','MGI MGI');
+------+---------+
| id   | name    |
+------+---------+
| 3981 | JAX JAX |
| 3982 | MGI MGI |
+------+---------+

insert into submissionContributor select id, '3981' from submissionSet where submissionSource = 6;
insert into submissionContributor select id, '3982' from submissionSet where submissionSource = 6;

##### ADD SUPPORT FOR ABURL (DONE 2006-04-19 galt)
# I manually updated this, currently needed only by JAX,
# adding antibodySource table that maps abSubmitId to antibody and submissionSource
# and adding field abUrl to submissionSource table.
# The code for vgLoadJax and visiGeneLoad were also updated to support this new 
# link from antibody probe to submissionSource website for further details.
# Since this will be automatically maintained in future, no point in belaboring the makefile.
# This also involved an update to hgVisiGene including passing submissionSource id on
# the url to the primers page so that the external link can be made when it is an antibody.

##### REPLICATED submissionSet.privateUser SETTINGS TO NEW VISIGENE DB (DONE 2006-04-24 galt)
# This was an oversight caused by full removal of all old jax submissionSets 
# when we did the jax 2006-03-28 update.  Since we had lost the privateUser settings,
# I just replicated it from visiGeneOld with a simple query.
update visiGene.submissionSet n, visiGeneOld.submissionSet o set n.privateUser=-1
where o.privateUser=-1 and o.name = n.name;
# currently this is just jax submissionSets for which we have not received permissions to use.


##### ADDED IMAGEFILE-FORWARDING TO COMBINE MAHONEY AND JAX-MAHONEY ANNOTATIONS (DONE 2006-04-26 galt)
# The idea here is that JAX has some useful annotations, but including them made a lot of 
# unnecessary duplication in the system.  Although it wasn't easy, we have come up with
# a method to map the imageFiles from Mahoney to the ones in JAX.  We have made imageFileFwd table
# to store that mapping information, and added code to hgVisiGene to use it.  Wholemount steps
# are manual, while slices steps use hg/visiGene/vgLoadJax/forwardSlices.c I wrote to map them Mah->JAX.
# Additional complications are that JAX combined several slices together into one image
# following a certain pattern.  Luckily for the wholemounts, the original images were not modified by JAX.
# This means that we can get a perfect match Mah->JAX for the wholemounts using md5sum (produced unique values).
# Because both we and JAX imported the Mahoney data/spreadsheet into auto-incremented primary keyfield
# tables, the original order is preserved and allows a surprisingly good mah->jax many-to-one slices mapping.
# I also extended vgLoadJax to be able to find the primers in the PRB_Notes which was useful both 
# for mapping the slices, and because we end up using JAX annotations for the fullCaption() page,
# so that we don't lose primer info. For the remaining fraction where Mahoney never supplied primers
# (actually they have since updated the info, but neither JAX nor we have gotten that yet),
# something over 20%, we have managed to instead just map on gene. This worked surprisingly well,
# and made either correct or very close matches.

# Since previously, vgLoadJax looked for the mahoney set in jax and excluded it, we need to 
# make and import it into visiGene.  I have removed the skipping of mahoney set from the 
# vgLoadJax code (so that next time we update jax, the mahoney set will not be excluded)
# and added a commandline option to do just a single submission set.  I happen to know that
# the mahoney set in jax is jax92242.  THIS STEP WON'T BE NEEDED IN FUTURE.

# the latest jax sybase db conversion is on kkr3u00 because it had space and little use.
ssh kkr3u00
cd ~/kent/src/hg/visiGene/vgLoadJax
# clean out any old subdirectory
rm -fr visiJaxMahoney
# process just the jaxMahoney submissionSet
${HOME}/bin/i386/vgLoadJax -oneSubmissionSet=92242 /san/sanvol1/visiGene/gbdb jackson20060328 visiJaxMahoney

# load it into visiGene db
# (cd to the vgLoadJax directory first: the visiJaxMahoney/ output dir was
#  created relative to it, and a fresh ssh session does not start there)
ssh hgwdev
cd ~/kent/src/hg/visiGene/vgLoadJax
visiGeneLoad visiJaxMahoney/92242.ra  visiJaxMahoney/92242.tab  visiJaxMahoney/92242.txt

# we are going to treat the jax version of Mahoney as "privateUser" 
# in order to suppress it and reduce the duplication of Mahoney images
hgsql visiGene -e 'update submissionSet set privateUser=-1 where name like "jax92242"'

submissionSets:  (for reference)
name       id
-----------------------------------
mahoneyWhole    = 1
mahoneySlices01 = 2
jax92242        = 1820

			  
ssh hgwdev
cd /san/sanvol1/visiGene/gbdb/full/inSitu/Mouse/mahoney/wholeMount
md5sum *.jpg | sed -e 's/  /\t/' | sort > ~/kent/src/hg/visiGene/vgLoadJax/mahoneyWholeMount.md5
#(quick - 1 or 2 minutes only)

#Find the jax-Mahoney images that are WholeMount
ssh hgwdev
cd /san/sanvol1/visiGene/gbdb/full/inSitu/Mouse/jax

# WARNING: change database name and submissionSet id constants in text below if needed!!!
#  jaxMahoney = 1820, bodyPart.id = 1 for name="whole"
# List the distinct image file names of submissionSet 1820 whose specimen has
# bodyPart = 1, then md5sum them (run from the jax image directory) and store
# the result as sorted "md5<TAB>fileName" lines.
# The bodyPart table itself is not listed in the FROM clause: nothing joins to
# it, so including it only multiplies the rows before DISTINCT collapses them.
hgsql visiGene -BN -e 'select distinct imageFile.fileName from imageFile, image, specimen \
where imageFile.submissionSet=1820 and bodyPart=1 \
and image.imageFile=imageFile.id and image.specimen=specimen.id' \
 | xargs md5sum | sed -e 's/  /\t/' | sort > ~/kent/src/hg/visiGene/vgLoadJax/jaxMahoneyWholeMount.md5

cd ~/kent/src/hg/visiGene/vgLoadJax

# verify that they are unique by md5sum:
wc -l *.md5
   1833 jaxMahoneyWholeMount.md5
   1843 mahoneyWholeMount.md5
sort -k 1,1 -u jaxMahoneyWholeMount.md5 | wc -l
   1833
sort -k 1,1 -u mahoneyWholeMount.md5 | wc -l
   1843

hgsql visiGene

create table mahoneyWholeMountMd5 (
    md5 char(32) not null,         # md5 sum of .jpg
    fileName varchar(10) not null, # .jpg fileName
    INDEX(md5),
    INDEX(fileName)
);
load data local infile 'mahoneyWholeMount.md5' into table mahoneyWholeMountMd5;
analyze table mahoneyWholeMountMd5;

create table jaxMahoneyWholeMountMd5 (
    md5 char(32) not null,         # md5 sum of .jpg
    fileName varchar(10) not null, # .jpg fileName
    INDEX(md5),
    INDEX(fileName)
);
load data local infile 'jaxMahoneyWholeMount.md5' into table jaxMahoneyWholeMountMd5;
analyze table jaxMahoneyWholeMountMd5;

# verify that they match uniquely and completely:
select count(*) from mahoneyWholeMountMd5 m, jaxMahoneyWholeMountMd5 j where m.md5 = j.md5;
+----------+
| count(*) |
+----------+
|     1833 |
+----------+

# make forwarding table (NO NEED TO DO IN FUTURE, IS IN visiGene.as,.sql)
CREATE TABLE imageFileFwd (
    fromIf int not null,      # From imageFile
    toIf   int not null,      #   To imageFile
        #Indices
    INDEX(fromIf),
    INDEX(toIf)
);

# WARNING: change submissionSet ids!!!
# find how the mahoney matches to the jaxMahoney
#    (I verified that all filenames are unique in all 3 submissionSets: jaxM, mWhole, mSlices)
insert into imageFileFwd
select mi.id, ji.id from imageFile mi, imageFile ji, mahoneyWholeMountMd5 m, jaxMahoneyWholeMountMd5 j
where m.md5 = j.md5 and mi.fileName=m.fileName and ji.fileName=j.fileName
 and mi.submissionSet=1 and ji.submissionSet=1820;
# Records: 1828

# the wholemounts are now done, so let's do the slices next!


# Cluster Run to do OCR on jaxMahoneySlices (Galt 2006-04-28)
#  if program ocrad is not in /cluster/bin/i386, download and compile it (very easy)
#  ocrad is a gnu program

# Create parasol directory and a list of the jpg files.
ssh hgwdev
cd /san/sanvol1/visiGene/offline/jax
mkdir ocrJaxMahoneyRun
cd ocrJaxMahoneyRun
mkdir output

# make list of jaxMahoneySlice .jpgs
# WARNING: change database and submissionSet ids!!!
hgsql visiGene -BN -e 'select distinct imageFile.fileName from \
imageFile, image, specimen \
where imageFile.submissionSet=1820 and bodyPart<>1 \
and image.imageFile=imageFile.id and image.specimen=specimen.id' \
 > jaxMahoneySlices.list

# Create parasol batch
# Each job runs ./ocrSlices.csh with $(file1) and $(root1) as its two arguments.
cat << '_EOF_' > gsub
#LOOP
./ocrSlices.csh $(file1) $(root1)
#ENDLOOP
'_EOF_'
# << this line makes emacs coloring happy

# ocrSlices.csh -- OCR one jaxMahoney slice image and record which Mahoney
# accession strings appear in it.
#   $1 = $(file1), $2 = $(root1)  (see the gsub template)
# Steps:
#   1. Remove any existing output/$2.map and output/$2.ocr.
#   2. Convert the jax image $1 to output/$2.pgm.
#   3. Run ocrad at thresholds .4 .5 (default) .6 .7 .8 .9, appending all text
#      to output/$2.ocr.
#   4. Map the characters l,O,o,i,I to 1,0,0,1,1, delete '.', and collect the
#      unique strings matching T + 8 digits + "aa".  When $2, taken as a number,
#      is in 7996..8060, the "aa" suffix is not required in the match.
#   5. Write one "$1<TAB>accession" line per match to output/$2.map, or a single
#      "$1<TAB>NO_TEXT" line when nothing matched.
# For the 7996..8060 range the suffix "aa" or "00" is appended according to
# which name exists under the mahoney/slices directory.
# NOTE(review): in that range the "00" test runs after $t may already have had
#   "aa" appended, so it then checks for a name ending in "aa00" -- confirm
#   that the two tests were meant to be alternatives.
cat << '_EOF_' > ocrSlices.csh
#!/bin/tcsh -ef
if ( -e output/$2.map ) then
    rm output/$2.map
endif
if ( -e output/$2.ocr ) then
    rm output/$2.ocr
endif
convert /san/sanvol1/visiGene/gbdb/full/inSitu/Mouse/jax/$1 output/$2.pgm
# There wasn't a single threshold value that worked, so do entire series
/cluster/bin/i386/ocrad --threshold=.4 --charset=ascii output/$2.pgm >> output/$2.ocr
/cluster/bin/i386/ocrad --threshold=.5 --charset=ascii output/$2.pgm >> output/$2.ocr
/cluster/bin/i386/ocrad --charset=ascii output/$2.pgm >> output/$2.ocr
/cluster/bin/i386/ocrad --threshold=.6 --charset=ascii output/$2.pgm >> output/$2.ocr
/cluster/bin/i386/ocrad --threshold=.7 --charset=ascii output/$2.pgm >> output/$2.ocr
/cluster/bin/i386/ocrad --threshold=.8 --charset=ascii output/$2.pgm >> output/$2.ocr
/cluster/bin/i386/ocrad --threshold=.9 --charset=ascii output/$2.pgm >> output/$2.ocr

@ x = $2
# special handling for 7996.jpg thru 8060.jpg, the Accession does not end in "aa" for these.
if ( ($x >= 7996) && ($x <= 8060) ) then
    cat output/$2.ocr | tr lOoiI 10011 | tr -d . | perl -0ne 'print "$1\n" while ( /(T\d{8})/gs )' | sort -u > output/$2.temp
else
    cat output/$2.ocr | tr lOoiI 10011 | tr -d . | perl -0ne 'print "$1\n" while ( /(T\d{8}aa)/gs )' | sort -u > output/$2.temp
endif

set tempTs = ( `cat output/$2.temp` )
if ( $#tempTs > 0 ) then
    foreach t ( $tempTs )
        if ( ($x >= 7996) && ($x <= 8060) ) then  # special handling for these
            if (-e /san/sanvol1/visiGene/gbdb/full/inSitu/Mouse/mahoney/slices/${t}aa) then
                set t = "${t}aa"
            endif
            if (-e /san/sanvol1/visiGene/gbdb/full/inSitu/Mouse/mahoney/slices/${t}00) then
                set t = "${t}00"
            endif
        endif
        echo "$1\t$t" >> output/$2.map
    end
else
    echo "$1\tNO_TEXT" > output/$2.map
endif
rm output/$2.temp

'_EOF_'
# << this line makes emacs coloring happy

chmod a+x ocrSlices.csh

ssh pk
cd /san/sanvol1/visiGene/offline/jax/ocrJaxMahoneyRun

gensub2 jaxMahoneySlices.list single gsub spec
para create spec
para try
para push
para check
para time
#2095 jobs in batch
#292661 jobs (including everybody's) in Parasol queue.
#Checking finished jobs
#Completed: 2095 of 2095 jobs
#CPU time in finished jobs:       1059s      17.65m     0.29h    0.01d  0.000 y
#IO & Wait Time:                  5687s      94.79m     1.58h    0.07d  0.000 y
#Average job time:                   3s       0.05m     0.00h    0.00d
#Longest running job:                0s       0.00m     0.00h    0.00d
#Longest finished job:               7s       0.12m     0.00h    0.00d
#Submission to last job:           134s       2.23m     0.04h    0.00d

cat output/*.map | sort > ~/kent/src/hg/visiGene/vgLoadJax/jaxMahoneySlices.map

--------------------------

ssh hgwdev
cd ~/kent/src/hg/visiGene/vgLoadJax

cat *.map | wc -l
   4066
cat *.map | grep NO_TEXT | wc -l
    1  (turns out to be a jax annotation caption error - missing leading zero in mtf#, ignoring)

hgsql visiGene

create table jaxMahoneySlicesMap (
    jFileName varchar(10) not null, # jaxMahoney .jpg fileName
    mFileName varchar(20) not null  #    mahoney .jpg fileName
);
load data local infile 'jaxMahoneySlices.map' into table jaxMahoneySlicesMap;
update jaxMahoneySlicesMap set mFileName = concat(mFileName,".jpg") where mFileName <> "NO_TEXT";
create index jFileName on jaxMahoneySlicesMap(jFileName);
create index mFileName on jaxMahoneySlicesMap(mFileName);
analyze table jaxMahoneySlicesMap;


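# WARNING: the submissionSet ids in this query (2 for the Mahoney images, 1820 for
# the Jackson images) are specific to this load; look up and change them before re-running!!!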
insert into imageFileFwd
select mi.id, ji.id from imageFile mi, imageFile ji, jaxMahoneySlicesMap map
where ji.fileName=map.jFileName and mi.fileName=map.mFileName
 and mi.submissionSet=2 and ji.submissionSet=1820;
# Records: 3896

# It's looking good.

# clean up
drop table mahoneyWholeMountMd5;
drop table jaxMahoneyWholeMountMd5;
drop table jaxMahoneySlicesMap;

#############################################################
#
# Patch contributors so we can search submission sets
#  by specifying Mahoney in the search box.
#

select id from submissionSource where name like 'Mahoney%';
+----+
| id |
+----+
| 1  |
+----+

# note: we have to double the search word or else the search doesn't work
insert into contributor set name = 'Mahoney mahoney';
mysql> select * from contributor where name in ('Mahoney mahoney');
+------+-----------------+
| id   | name            |
+------+-----------------+
| 3987 | Mahoney mahoney |
+------+-----------------+

insert into submissionContributor select id, '3987' from submissionSet where submissionSource = 1;

############################

###  JACKSON UPDATE (re-done to fix expression data 2006-06-05 galt)  #############

# The expression data was not correctly matching subpanels,
# and the bodyPart was incorrectly displaying just "floor plate" 
# instead of the full part-tree-lineage available in field printName.
#
# We found the solution in vgLoadJax was to NOT use the GXD_Expression
# table at all - apparently it is not necessary as the data is in other tables.
# This meant that we are using GXD_Strength values instead of the old 1/0 for level.
# We made the vgLoadJax code treat these correctly, and tweaked hgVisiGene too.
# And then we also decided to add the expression pattern while we were at it
# since JAX db had it - so added it to vgLoadJax and hgVisiGene.

# previously updated: jackson20060328 db on kkr3u00 
# (see above, and see hg/visiGene/jackson/makeJackson.doc)

# save imageFileFwd data in new form for easy restore:
create table iffKeepThis as 
select a.fileName "fromFN", b.fileName "toFN" from imageFileFwd iff, imageFile a, imageFile b 
where iff.fromIf = a.id and iff.toIf = b.id;

create index fromFN on iffKeepThis(fromFn(10));
create index toFN on iffKeepThis(toFn(10));


# Asked Heather to clone visiGene db to visiGeneBadExpr db,
# and then ran this query to remove the old previous JAX info:
# MULTI-TABLE DELETE:

# CRITICAL: make sure that analyze table has been run on all tables involved,
# otherwise this will run forever.  Don't assume that the cardinality is defined.
# Running analyze table is super quick.

analyze table submissionSource;
analyze table submissionSet;
analyze table submissionContributor;
analyze table image;
analyze table imageFile;
analyze table imageProbe;
analyze table expressionLevel;

delete from submissionSource where name = 'MGI';
# 1 rows

delete submissionSet from submissionSet ss left join submissionSource so on ss.submissionSource=so.id where so.id is null;
# 1113 rows

delete submissionContributor from submissionContributor sc left join submissionSet ss on sc.submissionSet=ss.id where ss.id is null;
# 7926 rows

delete image from image i left join submissionSet ss on i.submissionSet=ss.id where ss.id is null;
# 33816 rows 

delete imageFile from imageFile imf left join submissionSet ss on imf.submissionSet=ss.id where ss.id is null;
# 13854 rows

delete imageProbe from imageProbe ip left join image i on ip.image=i.id where i.id is null;
# 35395 rows

delete expressionLevel from expressionLevel el left join imageProbe ip on el.imageProbe=ip.id where ip.id is null;
# 102293 rows

delete from imageFileFwd;
# 5724 rows 

delete antibodySource from antibodySource abs left join submissionSource so on abs.submissionSource=so.id where so.id is null;
# 745 rows


#recompile vgLoadJax on dev
#run vgLoadJax to create .ra .tab .txt for each submissionSet
ssh kkr3u00
cd ~/kent/src/hg/visiGene/vgLoadJax
#remove any old data dir
rm -fr visiGene/
# visiGene in line below is just an output dir for the .ra/.tab/.txt files
vgLoadJax /san/sanvol1/visiGene/gbdb jackson20060328 visiGene
#refCount=2971

#ran loadAll to load the updated jax .ra .tab .txt into visiGene db
ssh hgwdev
cd ~/kent/src/hg/visiGene/vgLoadJax
loadAll


# deal with parallel Mahoney-in-Jax data

select * from submissionSet where name='jax92242' \G
*************************** 1. row ***************************
              id: 2848
            name: jax92242
     publication: Mouse Brain Organization Revealed Through Direct Genome-Scale TF Expression Analysis

select id,name from submissionSet where name like 'mahoney%';
+----+-----------------+
| id | name            |
+----+-----------------+
|  2 | mahoneySlices01 |
|  1 | mahoneyWhole01  |
+----+-----------------+

# restore the imageFileFwd data saved in iffKeepThis above:
insert into imageFileFwd 
select a.id, b.id from iffKeepThis iff, imageFile a, imageFile b 
where iff.fromFN = a.fileName and iff.toFN = b.fileName
and a.submissionSet in (1,2) and b.submissionSet in (2848);

drop table iffKeepThis;

# Since we had lost the privateUser settings,
# I just replicated it from visiGeneBadExpr backup with a simple query.
update visiGene.submissionSet n, visiGeneBadExpr.submissionSet o set n.privateUser=-1
where o.privateUser=-1 and o.name = n.name;
# currently this is just jax submissionSets for which we have not received permissions to use,
# and the mahoney-in-jax that is suppressed.


# ran vgGetText to update cgi-bin-galt/visiGeneData/ using visiGene db
cd ~/kent/src/hg/visiGene/vgGetText
make alpha

# recompiled hgVisiGene earlier to support new expression level scale, and pattern


# RE-MAKE knownToVisiGene tables (see respective makedocs for these)
knownToVisiGene mm6
knownToVisiGene mm7
knownToVisiGene mm8
knownToVisiGene hg17 -fromProbePsl=vgAllProbes
knownToVisiGene hg18 -fromProbePsl=vgAllProbes

# Patch contributors so we can search MGI submission sets 
#  by specifying JAX or MGI in the search box.
#

select id from submissionSource where name = 'MGI';
+----+
| id |
+----+
| 7  |
+----+

# note: we have to double the search word or else the search doesn't work
# skip adding these two which are already there:
#   insert into contributor set name = 'JAX JAX';
#   insert into contributor set name = 'MGI MGI';
mysql> select * from contributor where name in ('JAX JAX','MGI MGI');
+------+---------+
| id   | name    |
+------+---------+
| 3981 | JAX JAX |
| 3982 | MGI MGI |
+------+---------+

insert into submissionContributor select id, '3981' from submissionSet where submissionSource = 7;
insert into submissionContributor select id, '3982' from submissionSet where submissionSource = 7;

#######################################################
#
#  Received a major update from Susan Sunkin at ABA
#  consisting of 6000 new images (we had 12000 already)
#
#

# Allen Brain Atlas jp2 image prep (Galt 2006-12-12)
# Create parasol directory and a list of the jp2 files (the list is named jpg.lst).
ssh pk
cd /san/sanvol1/visiGene/offline/allenBrain
rm -fr prepImageRun
mkdir prepImageRun
find imageDisk -name '*.jp2' -print | sed 's/imageDisk\///' | grep May_06 > prepImageRun/jpg.lst

cd prepImageRun
# Create parasol batch
cat << '_EOF_' > gsub
#LOOP
vgPrepImage /san/sanvol1/visiGene/offline/allenBrain/imageDisk /san/sanvol1/visiGene/gbdb/200/inSitu/Mouse/allenBrain /san/sanvol1/visiGene/gbdb/full/inSitu/Mouse/allenBrain $(path1)
#ENDLOOP
'_EOF_'
# << this line makes emacs coloring happy
gensub2 jpg.lst single gsub spec
para make spec -maxNode=50

[pk:prepImageRun> /parasol/bin/para time
6317 jobs in batch
266106 jobs (including everybody's) in Parasol queue.
Checking finished jobs
Completed: 6317 of 6317 jobs
CPU time in finished jobs:     267986s    4466.44m    74.44h    3.10d  0.008 y
IO & Wait Time:                368981s    6149.68m   102.49h    4.27d  0.012 y
Average job time:                 101s       1.68m     0.03h    0.00d
Longest running job:                0s       0.00m     0.00h    0.00d
Longest finished job:            1471s      24.52m     0.41h    0.02d
Submission to last job:         43292s     721.53m    12.03h    0.50d


# -maxNode=50 was needed because it opens many output files at the same time - do not overwhelm NFS

# -----------------------

# Allen Brain Atlas update (Galt 2007-02-08)

# see mm6.txt for prep running allenCleanup and allenCollectSeq

# LOAD ALLEN BRAIN DATA
# note mm6,mm7,mm8 all have the same thing since it is for mouse generally
# note make sure the contributors list in vgLoadAllen.c is correct
vgLoadAllen \
 /san/sanvol1/visiGene/gbdb/full/inSitu/Mouse/allenBrain \
 /san/sanvol1/visiGene/offline/allenBrain/probesAndData/allen20061204.tab \
 /cluster/data/mm6/bed/allenBrain/allProbes.fa \
 /cluster/data/mm6/bed/allenBrain/allProbes.tab \
 output
#Got 17913 images
#Got 17913 named probes
#Got 17913 probe sequences

# Did not do this: 
# (instead, I asked Heather to clone entire visiGene db to visiGeneOld)
#backed-up data in case of trouble:
#mkdir /san/sanvol1/visiGene/dump/visiGene.20061220
#hgsqldump visiGene -T /san/sanvol1/visiGene/dump/visiGene.20061220

#restore fileLocation to point to dev
update fileLocation set name =
concat('http://hgwdev.cse.ucsc.edu',substring(name,INSTR(name,'/visiGene/')));
# 14 rows

# clean out the old ABA records before we do a full load
delete from submissionSource where name = 'Allen Brain Atlas (ABA)';
# 1 row

delete submissionSet from submissionSet ss left join submissionSource so on
ss.submissionSource=so.id where so.id is null;
# 1 row

delete submissionContributor from submissionContributor sc left join
submissionSet ss on sc.submissionSet=ss.id where ss.id is null;
# 13 rows

delete image from image i left join submissionSet ss on i.submissionSet=ss.id
where ss.id is null;
# 11736 rows

delete imageFile from imageFile imf left join submissionSet ss on
imf.submissionSet=ss.id where ss.id is null;
# 11736 rows

delete imageProbe from imageProbe ip left join image i on ip.image=i.id where
i.id is null;
# 11737 rows

delete expressionLevel from expressionLevel el left join imageProbe ip on
el.imageProbe=ip.id where ip.id is null;
# 0 rows

delete antibodySource from antibodySource abs left join submissionSource so on
abs.submissionSource=so.id where so.id is null;
# 0 rows


#load into visiGene db
visiGeneLoad -database=visiGene output/aba.ra output/aba.tab /dev/null


# RE-MAKE FULL TEXT INDEX
cd ~/kent/src/hg/visiGene/vgGetText
make alpha
# basically does this, and puts it in cgi-bin/visiGeneData/:
#vgGetText visiGene.text mm8 hg18
#ixIxx visiGene.text visiGene.ix visiGene.ixx

############################

# REBUILD PROBETRACK   (DONE galt 2007-02-15)
#    WITH vgProbeTrack PROGRAM - AFTER DOING Allen Brain Atlas update 2007-02-08

# (make a backup of visiGene db and these tables: {mm6,mm7,...}.vgProbes and {hg17,hg18,...}.vgAllProbes)
cd /san/sanvol1/visiGene/dump
# (this backup shown is really an example template for the next person who needs to do this)
mkdir visiGene.20070215
cd visiGene.20070215
hgsqldump visiGene -T .
mkdir mm6; hgsqldump mm6 vgProbes -T mm6
mkdir mm7; hgsqldump mm7 vgProbes -T mm7
mkdir mm8; hgsqldump mm8 vgProbes -T mm8
mkdir hg17; hgsqldump hg17 vgAllProbes -T hg17
mkdir hg18; hgsqldump hg18 vgAllProbes -T hg18
#(do any others needed that might not be listed here)
#(document the reason for making the backup)
echo 'vgLoadAllen has been run on ABA update 2007-02-08, so making backup of visiGene db and probe tracks before updating probeTracks, ' > README

# OK, NOW USE vgProbeTrack TO UPDATE

cd ~/kent/src/hg/visiGene/vgProbeTrack

# Make sure vgProbeTrack program is up to date
make

# -sqlPath must be included whenever the vgProbes or vgAllProbes track tables do not yet exist for the db
# so it can find the .sql script to create vgProbes or vgAllProbes tables as needed.
# I happen to know that only JAX was updated since last time, and that is mouse only

# populate vgPrb with any new stuff in visiGene.probe (works for all taxons at once).
vgProbeTrack POP
# new probe records found = 7335, # new vgPrb records added = 7314
#   most of these are old, but we updated ABA by dropping completely and re-adding
#   so these probes find their way back via sequence identity of probes in vgPrb.sequence

# find sequence using various methods - given probe seq, primers, bacs, refseq, etc.
#  must specify a specific assembly to use, so mm8 is ready to use now
#  this finds any stuff for the mouse taxon
vgProbeTrack SEQ working mm8

rc = 17 = count of primers for genome search for taxon 10090
rc = 141 = count of primers for mrna search for taxon 10090
bac list read done.
found seq for 0 bacEndPairs
rc = 93 = count of refSeq mrna for mm8
rc = 1 = count of genRef mrna for mm8
rc = 4 = count of genbank mrna for mm8
rc = 19 = count of flatRef mrna for mm8
rc = 0 = count of flatAll mrna for mm8
rc = 0 = count of linkRef mrna for mm8
rc = 0 = count of linkAll mrna for mm8
rc = 1 = count of kgAlRef mrna for mm8
rc = 4 = count of kgAlAll mrna for mm8

# create alignments using either refSeqAli or all_mrna or bacEnds or blat.  Took 1.5 hours.  
# alignments are individually tracked per assembly here
# alignment successes go in $db.vgProbes psl track, and whether succeeded or failed,
# it only looks for things that have not already attempted alignment
# the status goes into visiGene.vgPrbAli with .db="mm8"
vgProbeTrack ALI working mm8

# this finds any seq required for mm8.vgProbes track not already in mm8.seq 
# adds the new .fa file in /cluster/data/mm8/bed/visiGene/
# adds a symlink to it in /gbdb/mm8/visiGene/
# and runs hgLoadSeq mm8 /gbdb/mm8/visiGene/vgPrbExt_??????.fa to add it to mm8.seq
vgProbeTrack EXT working mm8

# mm7.vgProbes was already complete from previous probe track creation,
#  it just needed to catch the new Allen Brain probes and align them.  About 1.5 hours.
vgProbeTrack ALI working mm7
vgProbeTrack EXT working mm7

# mm6.vgProbes was already complete from previous probe track creation, 
#  it just needed to catch the new Allen Brain probes and align them.  About 1.5 hours.
vgProbeTrack ALI working mm6
vgProbeTrack EXT working mm6

# hg18.vgAllProbes was pre-existing with all probes, just need to add new allenBrain mouse
# this internally uses pslMap against the mm8 to hg18 liftover chain.gz
# Because it is "Xeno" (from mouse to human), it creates track vgAllProbes,
# and maintains the list of processed alignments in visiGene.vgPrbAliAll.
vgProbeTrack PSLMAP working hg18 mm8  
# updates hg18.seq/extFile similarly to the EXT command, but for All probes.
# just like with EXT, EXTALL puts .fa in /cluster/data/hg18/visiGene
# and symlink in /gbdb/hg18/visiGene and updates using hgLoadSeq.
# if a sequence has already been loaded it will not be loaded again.
vgProbeTrack EXTALL working hg18

# hg17.vgAllProbes existed before
vgProbeTrack PSLMAP working hg17 mm7 
vgProbeTrack EXTALL working hg17


# RE-MAKE knownToVisiGene tables (see respective makedocs for these)
knownToVisiGene mm6
knownToVisiGene mm7
knownToVisiGene mm8
knownToVisiGene hg17 -fromProbePsl=vgAllProbes
knownToVisiGene hg18 -fromProbePsl=vgAllProbes

# update text/index for visiGene
cd ~/kent/src/hg/visiGene/vgGetText
make alpha
#vgGetText /usr/local/apache/cgi-bin/visiGeneData/visiGene.text mm7 mm8 hg17 hg18
#probe has 26611 rows
#gene has 20413 rows
#imageProbe has 125765 rows

################### (galt 2007-04-20 done)
# FIXED TWO ADDITIONAL ZOOM-OUT LEVELS 5 AND 6:
# Ran the /san/sanvol1/visiGene/offline/level56RunJax/ cluster job on a list of all files needed.
# Somehow about 13000 pictures were missing from the list when we originally made
# zoom-out levels 5 and 6.

cd /san/sanvol1/visiGene/gbdb/full/inSitu/Mouse/jax/

find . -type d > dlist

vi dlist

#remove anything starting with "foo" or "ztest" or "goo", plus the "." entry itself.
#That should leave just valid directories.

cat dlist | sed -e 's/\.\///' > dlist2

[hgwdev:jax> cat level6missing.csh
#!/bin/tcsh
# level6missing.csh - read directory names from stdin, one per line, and echo
# each one for which no file matching <dir>/*_6_000.jpg exists.
# Reading stops at the first empty line (presumably this is also how end of
# input is detected - TODO confirm $< yields an empty string at EOF).
# NOTE(review): *_6_000.jpg looks like the zoom-out level 6 image - confirm.
#
# nonomatch: a glob that matches nothing is left as-is instead of aborting
# with a "No match" error, so the -e test below can simply fail.
set nonomatch
while (1)
    # $< reads one line from standard input
    set i=$<
    if ("$i" == "") then
        break
    endif
    if ( -e $i/*_6_000.jpg) then
        # a level 6 file exists: nothing to report
    else
        # no level 6 file found: report this directory
        echo "$i"
    endif
end

cat dlist2 | level6missing.csh > dlist3

cd /san/sanvol1/visiGene/offline
mkdir level56RunJax
cd level56RunJax
cp ../level56Run/level56.csh .
cp ../level56Run/gsub .
cat gsub
[hgwdev:level56RunJax> cat gsub
#LOOP
./level56.csh $(path1)
#ENDLOOP

cat /san/sanvol1/visiGene/gbdb/full/inSitu/Mouse/jax/dlist3 \
 | gawk '{print "/san/sanvol1/visiGene/gbdb/full/inSitu/Mouse/jax/" $1 ".jpg"}' > jpg.lst

ssh pk
cd /san/sanvol1/visiGene/offline/level56RunJax
gensub2 jpg.lst single gsub spec
para create spec
para try
para push
para time

#Completed: 13235 of 13235 jobs
#CPU time in finished jobs:       3819s      63.65m     1.06h    0.04d  0.000 y
#IO & Wait Time:                 45674s     761.23m    12.69h    0.53d  0.001 y
#Average job time:                   4s       0.06m     0.00h    0.00d
#Longest running job:                0s       0.00m     0.00h    0.00d
#Longest finished job:              21s       0.35m     0.01h    0.00d
#Submission to last job:           560s       9.33m     0.16h    0.01d

# Followup to show that it worked:
#[hgwdev:jax> cat dlist2 | level6missing.csh > dlist3X
#[hgwdev:jax> ll dlist*
#-rw-rw-r--  1 galt protein 117205 Apr 20 13:13 dlist
#-rw-rw-r--  1 galt protein  85191 Apr 20 13:15 dlist2
#-rw-rw-r--  1 galt protein  71495 Apr 20 13:33 dlist3
#-rw-rw-r--  1 galt protein      0 Apr 20 14:25 dlist3X
#
#This shows that all completed (because dlist3X is empty)

#-----------------------------------------------

# Rsync request
#please rsync from /san to hgnfs1:

rsync hgwdev:/san/sanvol1/visiGene/gbdb/full/inSitu/Mouse/jax/ \
 hgnfs1:/export/gbdb2/full/inSitu/Mouse/jax/

#
################### (galt 2007-05-09 done)
# Adding support to vgProbeTrack and knownToVisiGene for
#  the BLATZ'd frog probes to mm8 which Jim did recently.
# knownToVisiGene no longer uses -fromProbePsl option,
#  instead it automatically detects vgProbes and vgAllProbes
#  and uses them in that order if no symbolic matches were found.
# Added a SELFMAP command to vgProbeTrack to migrate any missing
#  self alignments in vgProbes to vgAllProbes 
# Made a backup of visiGene.vg* first:
ssh hgwdev
cd /san/sanvol1/visiGene/dump
mkdir visiGene.20070509
cd visiGene.20070509
hgsqldump visiGene -T .

cd ~/kent/src/hg/visiGene/vgProbeTrack

vgProbeTrack -sqlPath=.. REMAP working mm8 nibb nibbImageProbes /gbdb/mm8/nibbImageProbes.fa
#FYI: Table mm8.vgAllProbes does not exist
#hgPepPred visiGene generic vgRemapTemp /gbdb/mm8/nibbImageProbes.fa
#Processing /gbdb/mm8/nibbImageProbes.fa
#Count of Psls found for reMap: 1379
#cat vgPrbReMap.psl vgAllProbes.psl | sort -u | sort -k 10,10 >vgAllProbesNew.psl
#hgLoadPsl mm8 vgAllProbesNew.psl -table=vgAllProbes
#Processing vgAllProbesNew.psl
#rm vgPrbReMap.psl vgAllProbes.psl vgAllProbesNew.psl

vgProbeTrack SELFMAP working mm8
#Count of nonBac Psls found for pslMap: 24615
#Count of bac Psls found for pslMap: 0
#cat bac.psl nonBac.psl > vgPrbSelfMap.psl
#cat vgPrbSelfMap.psl vgAllProbes.psl | sort -u | sort -k 10,10 >vgAllProbesNew.psl
#hgLoadPsl mm8 vgAllProbesNew.psl -table=vgAllProbes
#Processing vgAllProbesNew.psl
#rm vgPrbSelfMap.psl vgAllProbes.psl vgAllProbesNew.psl

vgProbeTrack EXTALL working mm8
#rc = 981 = count of sequences for vgPrbExt.fa, to use with mm8 track vgAllProbes
#cp vgPrbExt.fa /cluster/data/mm8/bed/visiGene/vgPrbExt_YDGSWH.fa
#ln -s /cluster/data/mm8/bed/visiGene/vgPrbExt_YDGSWH.fa /gbdb/mm8/visiGene/vgPrbExt_YDGSWH.fa
#hgLoadSeq mm8 /gbdb/mm8/visiGene/vgPrbExt_YDGSWH.fa
#981 sequences
#Updating seq table

knownToVisiGene mm8

####################################################

################### (galt 2008-04-04 done)
# Slight name change for NIBB (affected visiGene)
# removed the word "Japanese " from NIBB name in visiGene.submissionSource
# removed same thing from vgLoadNibb.c source code.
# requested push of table hgwdev.visiGene.submissionSource.

################### (galt 2008-08-18 done)
# make downloads for visiGene

ssh hgwdev
cvs co browser   # if you haven't already done it

# edit downloads.html in the browser module to add links to the visiGene download

cvs commit browser/downloads.html

# updating the visiGene downloads

cd /usr/local/apache/htdocs/goldenPath
mkdir visiGene
cd visiGene
mkdir database
cd database


vi README
---------
This directory contains the downloadable tables in the UCSC visiGene
database. This database is shared by the program VisiGene
 http://genome.ucsc.edu/cgi-bin/hgVisiGene
and tracks that incorporate visiGene data, such as the Known Genes tracks.

To see descriptions of the tables in visiGene, visit the Table Browser:
  http://genome.ucsc.edu/cgi-bin/hgTables
select "All Tables" as the group, select visiGene as the database,
and select a table.  Then click the "describe table schema" button.
---------

hgsqldump visiGene -T .

rm vgPrbAli.*
rm vgPrbAliAll.*

sed -i -e 's/hgwdev[.]cse/genome/' fileLocation.txt

gzip *.txt

Do a push-request:
------------
please rsync (with appropriate flags)

hgwdev:/usr/local/apache/htdocs/goldenPath/visiGene/

to

hgdownload:/usr/local/apache2/htdocs/goldenPath/visiGene/

Reason:
 Now users will have an easier time of downloading visiGene database.

------------

Also, the first time only:

update the browser sandbox with the links on downloads.html,
then do a push-request:

Please push downloads.html from dev to hgdownload:

hgwdev:/usr/local/apache/htdocs/downloads.html

to

hgdownload:/usr/local/apache2/htdocs/downloads.html

Reason:
 added the page links for visiGene database download.


################### (galt 2008-09-08 done)
# move visiGene data to hive
ssh hgwdev
mv /san/sanvol1/visiGene /hive/data/inside/visiGene
ln -s /hive/data/inside/visiGene /gbdb/visiGene
# note /usr/local/apache/htdocs/visiGene is still a symlink to /gbdb/visiGene
#
################### (galt 2009-10-14)
# adjustments for mysql5 blob string comparisons 
ssh hgwdev
hgsql visiGene -e 'ALTER table vgPrb modify seq longtext'
# also changed vgProbeTrack.c to make vgRemapTemp.seq be longtext.

################### (galt 2009-10-15)
# Initial vgProbeTrack run for hg19
ssh hgwdev
cd /hive/data/inside/visiGene/dump
mkdir visiGene.20091015
hgsqldump visiGene -T visiGene.20091015
cd ~/kent/src/hg/visiGene/vgProbeTrack
make

vgProbeTrack PSLMAP working hg19 mm9 -sqlPath=.. >& hg19VgptPslmap.log&
# takes a few minutes
#-----------------------------
#Count of nonBac Psls found for pslMap: 24924
#Count of bac Psls found for pslMap: 0
#rc = 24647 = count of sequences for pslMap for taxon 10090
# updates vgPrbAliAll and hg19.vgAllProbes
#mysql> select count(*) from vgPrbAliAll where db='hg19';
#    24886 

vgProbeTrack REMAP working hg19 nibb nibbImageProbes \
  /gbdb/hg19/nibbImageProbes.fa >& hg19VgptRemap.log&
#Count of Psls found for reMap: 1441

vgProbeTrack EXTALL working hg19 >& hg19VgptExtall.log&
#rc = 24539 = count of sequences for vgPrbExt.fa, to use with hg19 track vgAllProbes
#creates /gbdb/hg19/visiGene/vgPrbExt_SJMFAO.fa
#hgLoadSeq hg19 /gbdb/hg19/visiGene/vgPrbExt_SJMFAO.fa
#Warning: load of seq did not go as planned: 24539 record(s), 0 row(s) skipped,
# (these are due to NULL in unused field gb_date, harmless)
# someone will probably add show warnings to hgLoadSeq soon.
#mysql> select count(*) from hg19.vgAllProbes;
#    25931 

cd ~/kent/src/hg/visiGene/knownToVisiGene
make
knownToVisiGene hg19

#################################################
# Fix for the Mahoney Lab submissionSource website being gone.
# The website mahoney.chip.org is dead,
# and it looks like that is permanent.
# Changed visiGene.submissionSource.{setUrl,itemUrl} to ""
# for the Mahoney Lab (id==1).
# Also changed source of hgVisiGene/printCaption.c
# to tolerate the blank string.