# CREATE EMPTY DATABASE AND TABLES. hgsql -e "create database visiGeneNew" mysql hgsql visiGeneNew < ~/kent/src/hg/visiGene/visiGene.sql makeTableDescriptions visiGeneNew ~/kent/src/hg/visiGene/visiGene.as # LOAD PAUL GRAY/MAHONEY LAB DATA. # Transferred images from Paul Gray's Mac to mine and converted # his spreadsheet to a tab-separated file, cloning.tab. cd ~/kent/src/hg/visiGene/vgLoadMahoney vgLoadMahoney /gbdb/visiGene mm5 cloning.tab clonePcr.bed outDir cd outDir visiGeneLoad whole.ra whole.tab /dev/null -database=visiGeneNew visiGeneLoad slices.ra slices.tab /dev/null -database=visiGeneNew # LOAD JACKSON LABS DATA. # First ask Galt to create a local copy of the Jackson labs # database. I'm not sure how he did it. cd ~/kent/src/hg/visiGene/vgLoadJax vgLoadJax /gbdb/visiGene jackson visiGene ./loadNew # Update the privateUser fields where we don't have permissions by entering # this at the mysql prompt. update submissionSet,journal set submissionSet.privateUser=-1 where (journal.name like 'Nat %' or journal.name = 'Nature') and submissionSet.journal = journal.id and submissionSet.name like 'jax%' # LOAD NIBB IMAGES # Do this after creating the nibbImageProbe.fa file as described # in makeXenTro1.doc, and after creating the nibbImageProbes table # in hg17 as described in makeHg17.doc. The image files are # loaded in /cluster/store11/visiGene/offline/nibbFrog. ssh kolossus cd /cluster/store11/visiGene/offline nibbParseImageDir nibbFrog nibFrog.tab bad.tab nibbPrepImages nibbFrog nibFrog.tab \ /cluster/store11/visiGene/gbdb/200/inSitu/XenopusLaevis/nibb \ /cluster/store11/visiGene/gbdb/full/inSitu/XenopusLaevis/nibb # Note the nibbPrepImages step is a 2 day process, next time may # want to run it on the kki cluster. It does need to be run on a 64 # bit machine because of bugs in the 32 bit image magick convert program. 
ssh hgwdev cd ~/kent/src/hg/visiGene/vgLoadNibb hgMapToGene hg17 nibbImageProbes knownGene knownToNibbImage # Now go into the gene sorter on hg17, configure it to just show # the name, genbank, and NIBB Xenopus columns. Filter on * in the # NIBB Xenopus column (which will get rid of rows with no data in that # column). Save the text output to names.raw. Then get rid of names # that are no more than genbank accessions as so: awk '$1 != $2 {printf("%s\t%s\n", $1, $3);}' names.raw > names.txt # Now create the .tab and .ra files as so: vgLoadNibb /cluster/store11/visiGene/offline/nibbFrog \ /cluster/store11/visiGene/offline/nibbFrog.tab \ /cluster/data/xenTro1/bed/nibbPics/nibbImageProbes.fa \ names.tab stage.tab outDir visiGeneLoad outDir/nibb.ra outDir/nibb.tab /dev/null -database=visiGeneNew # LOAD GENSAT IMAGES # This was done with the assistance of Mike Dicuccio at NCBI, # dicuccio@ncbi.nlm.nih.gov. If updating probably it's best to # get in touch with him and make sure that the ftp site is up to # date. # Download data from NCBI into /cluster/store11/visiGene/offline/gensat cd /cluster/store11/visiGene/offline mkdir gensat cd gensat mkdir RawData cd RawData wget --timeStamping ftp://ftp.ncbi.nih.gov/pub/gensat/RawData/GENSAT-20051120.xml.gz wget --timeStamping ftp://ftp.ncbi.nih.gov/pub/gensat/RawData/NCBI_Gensat-20051120.dtd # At this point if the dtd has changed you may need to remake # kent/src/hg/visiGene/gensat/lib/gs.c with autoXml. Once # this is done then do the download with gensatImageDownload. # It'll take about 3 days. The results will be in the Institutions dir. cd /cluster/store11/visiGene/offline/gensat zcat RawData/GENSAT-20051120.xml.gz | gensatImageDownload . download.log # Create parasol directory and a list of the jpg files. 
ssh kki cd /cluster/store11/visiGene/offline/gensat mkdir prepImageRun find Institutions -name '*.jpg' -print | sed 's/Institutions\///' > prepImageRun/jpg.lst cd prepImageRun # Create parasol batch cat << '_EOF_' > gsub #LOOP vgPrepImage /cluster/store11/visiGene/offline/gensat/Institutions /cluster/store11/visiGene/gbdb/200/inSitu/Mouse/gensat /cluster/store11/visiGene/gbdb/full/inSitu/Mouse/gensat $(path1) #ENDLOOP '_EOF_' # << this line makes emacs coloring happy gensub2 jpg.lst single gsub spec para make spec # Note the above procedure would take about 3 days. I ended up copying the # data over to /san/sanvol1, and doing it on the pita cluster. The job # there just took two hours, with just 100 cpus available. It took # an hour to copy the data over, and eight hours to copy it back though, # and some tweaking. # MAKE FULL TEXT INDEX cd /cluster/store11/visiGene/gbdb vgGetText visiGene.text mm7 hg17 ixIxx visiGene.text visiGene.ix visiGene.ixx # (Galt 2006-02) # RSYNC'd from /cluster/store11/visiGene to /san/sanvol1/visiGene # and moved the /gbdb/visiGene symlink to point to the new location. # I also had to manually run a script to find symlinks pointing from full/ over to # /cluster/store11/offline and remake them to point correctly to /san/sanvol1/visiGene/offline. # Allen Brain Atlas jp2 image prep (Galt 2006-02-12) # Create parasol directory and a list of the jp2 files. 
ssh pk cd /san/sanvol1/visiGene/offline/allenBrain mkdir prepImageRun find imageDisk -name '*.jp2' -print | sed 's/imageDisk\///' > prepImageRun/jpg.lst cd prepImageRun # Create parasol batch cat << '_EOF_' > gsub #LOOP vgPrepImage /san/sanvol1/visiGene/offline/allenBrain/imageDisk /san/sanvol1/visiGene/gbdb/200/inSitu/Mouse/allenBrain /san/sanvol1/visiGene/gbdb/full/inSitu/Mouse/allenBrain $(path1) #ENDLOOP '_EOF_' # << this line makes emacs coloring happy gensub2 jpg.lst single gsub spec para make spec -maxNode=50 [pk:/san/sanvol1/visiGene/offline/allenBrain/prepImageRun> /parasol/bin/para time 11748 jobs in batch 4291 jobs (including everybody's) in Parasol queue. Checking finished jobs Completed: 11748 of 11748 jobs CPU time in finished jobs: 474919s 7915.32m 131.92h 5.50d 0.015 y IO & Wait Time: 5029116s 83818.60m 1396.98h 58.21d 0.159 y Average job time: 469s 7.81m 0.13h 0.01d Longest running job: 0s 0.00m 0.00h 0.00d Longest finished job: 41811s 696.85m 11.61h 0.48d Submission to last job: 172301s 2871.68m 47.86h 1.99d # -maxNode=50 was needed. # Note that because it opens up to 40 output files at the same time, it overwhelms NFS # when a lot of nodes are running, it can bring down the SAN. Because I was nearly # done when it came back up, I just re-pushed with -maxNode=50 to keep it under control. # However in the future, something like this should be done to keep the file access local # as much as possible. 
# Here is the proposed new way: # ----------------------- cat << '_EOF_' > gsub #LOOP ./vgPrep.csh $(path1) $(root1) $(file1) #ENDLOOP '_EOF_' # << this line makes emacs coloring happy cat << '_EOF_' > vgPrep.csh #!/bin/tcsh mkdir -p /san/sanvol1/visiGene/gbdb/200/inSitu/Mouse/allenBrain/$1 mkdir -p /san/sanvol1/visiGene/gbdb/full/inSitu/Mouse/allenBrain/$1 cp /san/sanvol1/visiGene/offline/allenBrain/imageDisk/$1 /scratch/tmp/$3 vgPrepImage /san/sanvol1/visiGene/offline/allenBrain/imageDisk /scratch/tmp/vg200$2 /scratch/tmp/vgfull$2 $1 set err = $status if (! $err ) then cp -r /scratch/tmp/vg200$2/* /san/sanvol1/visiGene/gbdb/200/inSitu/Mouse/allenBrain cp -r /scratch/tmp/vgfull$2/* /san/sanvol1/visiGene/gbdb/full/inSitu/Mouse/allenBrain endif rm -f /scratch/tmp/$3 rm -fr /scratch/tmp/vg200$2 rm -fr /scratch/tmp/vgfull$2 if ( $err ) then exit 1 endif '_EOF_' # << this line makes emacs coloring happy # ----------------------- # ADDED TWO ADDITIONAL ZOOM-OUT LEVELS 5 AND 6: # Ran /san/sanvol1/offline/level56Run/ cluster job on a list of all files dumped # from the visiGene.imageFile table so that we made new zoom out levels 5 and 6 # for all pictures. Since it was a special one-time deal, I just used ImageMagick. # vgPrepImage.c has been modified to do the 2 new zoomout levels so that they # will be built automatically in future. # Ran several checks to make sure no files were missing, fixed any errors. # Found embedded space in some nibb filenames, found a couple of gensat # images which had previously failed to download and redownloaded them ok. # Found a few missing things and 0 bytes jpgs and re-ran them. # It should be pretty clean right now. 
# LOAD ALLEN BRAIN DATA vgLoadAllen \ /san/sanvol1/visiGene/gbdb/full/inSitu/Mouse/allenBrain \ /san/sanvol1/visiGene/offline/allenBrain/probesAndData/allen20051021.tab \ /cluster/data/mm7/bed/allenBrain/allProbes.fa \ /cluster/data/mm7/bed/allenBrain/allProbes.tab \ output #backed-up data in case of trouble: mkdir /san/sanvol1/visiGene/dump/visiGene.20060220 hgsqldump visiGene -T /san/sanvol1/visiGene/dump/visiGene.20060220 #load into visiGene db visiGeneLoad -database=visiGene output/aba.ra output/aba.tab /dev/null # Manually added several researchers names to the contributor and submissionContributor tables # at the request of Susan Sunkin as well as updating the text for contributor, copyright, acknowledgements. # I manually also updated aba.ra and vgLoadAllen.c to reflect her changes. The manual mods # to contributor which work great in the visiGene search are not currently automatically # supported, and would thus be lost if we ever nuke it and start fresh. # At some point, we will probably add an additional field to the .ra structure # and have visiGeneLoad support it. # RE-MAKE FULL TEXT INDEX cd hg/visiGene/vgGetText make alpha # basically does this, but puts it in cgi-bin/visiGeneData/: #vgGetText visiGene.text mm7 hg17 #ixIxx visiGene.text visiGene.ix visiGene.ixx # (hgVisiGene cgi v128 now knows about this new location) ############################ # REBUILD WITH NEW vgProbeTrack PROGRAM - AFTER ADDING ALLEN BRAIN (DONE galt 2009-10-12) # (make a backup of visiGene db and these tables: {mm6,mm7,...}.vgProbes and {hg17,hg18,...}.vgAllProbes cd /hive/data/inside/visiGene/dump # (this backup shown is really an example template for the next person who needs to do this) mkdir visiGene.20060315 cd visiGene.20060315 hgsqldump visiGene -T . 
mkdir mm6; hgsqldump mm6 vgProbes -T mm6 mkdir mm7; hgsqldump mm7 vgProbes -T mm7 mkdir mm8; hgsqldump mm8 vgProbes -T mm8 mkdir mm9; hgsqldump mm9 vgProbes -T mm9 mkdir hg17; hgsqldump hg17 vgAllProbes -T hg17 mkdir hg18; hgsqldump hg18 vgAllProbes -T hg18 #(do any others needed that might not be listed here) #(document the reason for making the backup) echo 'vgLoadAllenBrain has been run, so making backup of visiGene db and probe tracks before updating, ' > README # OK, NOW USE vgProbeTrack TO UPDATE cd ~/kent/src/hg/visiGene/vgProbeTrack # -sqlPath must be included whenever the vgProbes or vgAllProbes track tables do not yet exist for the db # so it can find the .sql script to create vgProbes or vgAllProbes tables as needed. # I happen to know that only AllenBrain was updated since last time, and that is mouse only # populate vgPrb with any new stuff in visiGene.probe (works for all taxons at once). vgProbeTrack POP # find sequence using various methods - given probe seq, primers, bacs, refseq, etc. # must specify a specific assembly to use, so just using mm7 since mm8 still in qa. # this finds any stuff for the mouse taxon vgProbeTrack SEQ working mm7 # create alignments using either refSeqAli or all_mrna or bacEnds or blat. Took 1.5 hours. # alignments are individually tracked per assembly here # alignment successes go in $db.vgProbes psl track, and whether succeeded or failed, # it only looks for things that have not already attempted alignment # the status goes into visiGene.vgPrbAli with .db="mm7" # because mm7.vgProbes is a new table, to create it we include the -sqlPath so # it can find the vgProbes.sql script vgProbeTrack ALI working mm7 -sqlPath=.. 
# this finds any seq required for mm7.vgProbes track not already in mm7.seq # adds the new .fa file in /cluster/data/mm7/bed/visiGene/ # adds a symlink to it in /gbdb/mm7/visiGene/ # and runs hgLoadSeq mm7 /gbdb/mm7/visiGene/vgPrbExt_??????.fa to add it to mm7.seq vgProbeTrack EXT working mm7 # mm6.vgProbes was already complete from previous probe track creation, # it just needed to catch the new Allen Brain probes and align them. About 1.5 hours. vgProbeTrack ALI working mm6 vgProbeTrack EXT working mm6 # hg17.vgAllProbes was pre-existing with all probes, just need to add new allenBrain mouse # this internally uses pslMap against the mm7 to hg17 liftover chain.gz # Because it is "Xeno" (from mouse to human), it creates track vgAllProbes, # and maintains the list of processed alignments in visiGene.vgPrbAliAll. vgProbeTrack PSLMAP working hg17 mm7 # updates hg17.seq/extFile similarly to the EXT command, but for All probes. # just like with EXT, EXTALL puts .fa in /cluster/data/hg17/visiGene # and symlink in /gbdb/hg17/visiGene and updates using hgLoadSeq. # if a sequence has already been loaded it will not be loaded again. vgProbeTrack EXTALL working hg17 # hg18.vgAllProbes never existed before vgProbeTrack PSLMAP working hg18 mm7 -sqlPath=.. # because the nibb blatz probe track hg18.nibbImageProbes was never done on hg18 # until just now (see makeHg18.doc), we have to add it for the first time. # "nibb" is not really a db here, so I manually put in a taxon mapping for it, # so it appears as Xenopus laevis 8355, see the source code. vgProbeTrack REMAP working hg18 nibb nibbImageProbes /gbdb/hg18/nibbImageProbes.fa vgProbeTrack EXTALL working hg18 # mm8 is in qa and so it is basically ready to use now. About 1.5 hours. vgProbeTrack ALI working mm8 -sqlPath=.. 
vgProbeTrack EXT working mm8 # RE-MAKE knownToVisiGene tables (see respective makedocs for these) #knownToVisiGene mm6 #knownToVisiGene mm7 #knownToVisiGene mm8 #knownToVisiGene hg17 -fromProbePsl=vgAllProbes #knownToVisiGene hg18 -fromProbePsl=vgAllProbes ############################ ### JACKSON UPDATE (done 2006-04-01 galt) ############# # updated jackson20060328 db on kkr3u00 (see hg/visiGene/jackson/makeJackson.doc) # Dropped old visiGeneOld db, asked Heather to clone visiGene db to visiGeneOld db, # and then ran this query to remove the old previous JAX info: # MULTI-TABLE DELETE: delete submissionSource, submissionSet, submissionContributor, image, imageProbe, expressionLevel, imageFile from submissionSource so, submissionSet ss, submissionContributor sc, image i, imageProbe ip, expressionLevel el, imageFile f where so.id = 2 and ss.submissionSource = so.id and sc.submissionSet = ss.id and i.submissionSet = ss.id and ip.image = i.id and el.imageProbe = ip.id and f.submissionSet = ss.id; #delete query (get rid of all submissionSource.id=2) #Query OK, 164717 rows affected (48 min 16.07 sec) # Workaround for uniProt access from kkr3u00 ssh hgwdev setenv jdb jackson20060328 cd ~/kent/src/hg/visiGene/vgLoadJax hgsqldump uniProt taxon commonName -T . 
ssh kkr3u00 setenv jdb jackson20060328 cd ~/kent/src/hg/visiGene/vgLoadJax hgsql mysql -e "create database uniProt" hgsql uniProt < taxon.sql hgsql uniProt < commonName.sql # hgsql uniProt -e 'show tables' hgsql uniProt -e "load data local infile 'taxon.txt' into table taxon" hgsql uniProt -e "load data local infile 'commonName.txt' into table commonName" # hgsql uniProt -e 'show table status\G' # cleanup rm taxon.* rm commonName.* #update vgLoadJax.c to update the date given in .ra acknowledgements #recompile vgLoadJax on dev #run vgLoadJax to create .ra .tab .txt for each submissionSet ssh kkr3u00 setenv jdb jackson20060328 cd ~/kent/src/hg/visiGene/vgLoadJax #remove any old data dir rm -fr visiGene/ # visiGene in line below is just an output dir for the .ra/.tab/.txt files ~/bin/i386/vgLoadJax /san/sanvol1/visiGene/gbdb jackson20060328 visiGene #ref 32185: missing title from BIB_Refs, ref skipped #Calculating age from postnatal #ref 67768: missing title from BIB_Refs, ref skipped #Calculating age from postnatal month 3 #Calculating age from postnatal #Calculating age from postnatal #Calculating age from postnatal month 4 #Calculating age from postnatal month 4 #Calculating age from Not Specified 12.5 #refCount=2970 #ran loadAll to load the updated jax .ra .tab .txt into visiGene db ssh hgwdev cd ~/kent/src/hg/visiGene/vgLoadJax loadAll #loadAll.output has 1112 lines like #visiGene/100423.ra # ran vgGetText to update cgi-bin-galt/visiGeneData/ using visiGene db cd ~/kent/src/hg/visiGene/vgGetText make alpha # output: #vgGetText /usr/local/apache/cgi-bin/visiGeneData/visiGene.text mm7 hg17 #probe has 19276 rows #gene has 15173 rows #imageProbe has 115500 rows # recompiled hgVisiGene ############################ # REBUILD PROBETRACK (DONE galt 2006-04-04) # WITH vgProbeTrack PROGRAM - AFTER DOING JAX UPDATE 20060328 # (make a backup of visiGene db and these tables: {mm6,mm7,...}.vgProbes and {hg17,hg18,...}.vgAllProbes cd /san/sanvol1/visiGene/dump # (this backup 
shown is really an example template for the next person who needs to do this) mkdir visiGene.20060404 cd visiGene.20060404 hgsqldump visiGene -T . mkdir mm6; hgsqldump mm6 vgProbes -T mm6 mkdir mm7; hgsqldump mm7 vgProbes -T mm7 mkdir mm8; hgsqldump mm8 vgProbes -T mm8 mkdir hg17; hgsqldump hg17 vgAllProbes -T hg17 mkdir hg18; hgsqldump hg18 vgAllProbes -T hg18 #(do any others needed that might not be listed here) #(document the reason for making the backup) echo 'vgLoadJax jackson20060328 has been run, so making backup of visiGene db and probe tracks before updating probeTracks, ' > README # OK, NOW USE vgProbeTrack TO UPDATE cd ~/kent/src/hg/visiGene/vgProbeTrack # Make sure vgProbeTrack program is up to date make # -sqlPath must be included whenever the vgProbes or vgAllProbes track tables do not yet exist for the db # so it can find the .sql script to create vgProbes or vgAllProbes tables as needed. # I happen to know that only JAX was updated since last time, and that is mouse only # populate vgPrb with any new stuff in visiGene.probe (works for all taxons at once). vgProbeTrack POP #new probe records found = 1285, # new vgPrb records added = 1285 # most of these are old, but we updated JAX by dropping completely and re-adding # so these probes find their way back via sequence identity of probes in vgPrb.sequence # find sequence using various methods - given probe seq, primers, bacs, refseq, etc. # must specify a specific assembly to use, so mm7 is ready to use now, mm8 still in qa. # this finds any stuff for the mouse taxon vgProbeTrack SEQ working mm7 #rc = 0 = count of primers for mrna search for taxon 10090 #rc = 0 = count of primers for genome search for taxon 10090 #bac list read done. 
#found seq for 0 bacEndPairs #rc = 549 = count of refSeq mrna for mm7 #rc = 18 = count of genRef mrna for mm7 #rc = 33 = count of genbank mrna for mm7 #rc = 428 = count of flatRef mrna for mm7 #rc = 0 = count of flatAll mrna for mm7 #rc = 1 = count of linkRef mrna for mm7 #rc = 0 = count of linkAll mrna for mm7 #rc = 1 = count of kgAlRef mrna for mm7 #rc = 37 = count of kgAlAll mrna for mm7 # create alignments using either refSeqAli or all_mrna or bacEnds or blat. Took 1.5 hours. # alignments are individually tracked per assembly here # alignment successes go in $db.vgProbes psl track, and whether succeeded or failed, # it only looks for things that have not already attempted alignment # the status goes into visiGene.vgPrbAli with .db="mm7" vgProbeTrack ALI working mm7 # this finds any seq required for mm7.vgProbes track not already in mm7.seq # adds the new .fa file in /cluster/data/mm7/bed/visiGene/ # adds a symlink to it in /gbdb/mm7/visiGene/ # and runs hgLoadSeq mm7 /gbdb/mm7/visiGene/vgPrbExt_??????.fa to add it to mm7.seq vgProbeTrack EXT working mm7 # mm6.vgProbes was already complete from previous probe track creation, # it just needed to catch the new Allen Brain probes and align them. About 1.5 hours. vgProbeTrack ALI working mm6 vgProbeTrack EXT working mm6 # hg17.vgAllProbes was pre-existing with all probes, just need to add new allenBrain mouse # this internally uses pslMap against the mm7 to hg17 liftover chain.gz # Because it is "Xeno" (from mouse to human), it creates track vgAllProbes, # and maintains the list of processed alignments in visiGene.vgPrbAliAll. vgProbeTrack PSLMAP working hg17 mm7 # updates hg17.seq/extFile similarly to the EXT command, but for All probes. # just like with EXT, EXTALL puts .fa in /cluster/data/hg17/visiGene # and symlink in /gbdb/hg17/visiGene and updates using hgLoadSeq. # if a sequence has already been loaded it will not be loaded again. 
vgProbeTrack EXTALL working hg17 # hg18.vgAllProbes existed before vgProbeTrack PSLMAP working hg18 mm7 vgProbeTrack EXTALL working hg18 # mm8 is in qa and so it is basically ready to use now. vgProbeTrack ALI working mm8 vgProbeTrack EXT working mm8 # RE-MAKE knownToVisiGene tables (see respective makedocs for these) knownToVisiGene mm6 knownToVisiGene mm7 knownToVisiGene mm8 knownToVisiGene hg17 -fromProbePsl=vgAllProbes knownToVisiGene hg18 -fromProbePsl=vgAllProbes # update text/index for visiGene cd hg/visiGene/vgGetText make alpha #vgGetText /usr/local/apache/cgi-bin/visiGeneData/visiGene.text mm7 hg17 #probe has 19276 rows #gene has 15173 rows #imageProbe has 115500 rows ############################ # # Patch contributors so we can search MGI submission sets # by specifying JAX or MGI in the search box. # select id from submissionSource where name = 'MGI'; +----+ | id | +----+ | 6 | +----+ # note: we have to double the search word or else the search doesn't work insert into contributor set name = 'JAX JAX'; insert into contributor set name = 'MGI MGI'; mysql> select * from contributor where name in ('JAX JAX','MGI MGI'); +------+---------+ | id | name | +------+---------+ | 3981 | JAX JAX | | 3982 | MGI MGI | +------+---------+ insert into submissionContributor select id, '3981' from submissionSet where submissionSource = 6; insert into submissionContributor select id, '3982' from submissionSet where submissionSource = 6; ##### ADD SUPPORT FOR ABURL (DONE 2006-04-19 galt) # I manually updated this, currently needed only by JAX, # adding antibodySource table that maps abSubmitId to antibody and submissionSource # and adding field abUrl to submissionSource table. # The code for vgLoadJax and visiGeneLoad were also updated to support this new # link from antibody probe to submissionSource website for further details. # Since this will be automatically maintained in future, no point in belaboring the makefile. 
# This also involved an update to hgVisiGene including passing submissionSource id on # the url to the primers page so that the external link can be made when it is an antibody. ##### REPLICATED submissionSet.privateUser SETTINGS TO NEW VISIGENE DB (DONE 2006-04-24 galt) # This was an oversight caused by full removal of all old jax submissionSets # when we did the jax 2006-03-28 update. Since we had lost the privateUser settings, # I just replicated it from visiGeneOld with a simple query. update visiGene.submissionSet n, visiGeneOld.submissionSet o set n.privateUser=-1 where o.privateUser=-1 and o.name = n.name; # currently this is just jax submissionSets for which we have not received permissions to use. ##### ADDED IMAGEFILE-FORWARDING TO COMBINE MAHONEY AND JAX-MAHONEY ANNOTATIONS (DONE 2006-04-26 galt) # The idea here is that JAX has some useful annotations, but including them made a lot of # unnecessary duplication in the system. Although it wasn't easy, we have come up with # a method to map the imageFiles from Mahoney to the ones in JAX. We have made imageFileFwd table # to store that mapping information, and added code to hgVisiGene to use it. Wholemount steps # are manual, while slices steps use hg/visiGene/vgLoadJax/forwardSlices.c I wrote to map them Mah->JAX. # Additional complications are that JAX combined several slices together into one image # following a certain pattern. Luckily for the wholemounts, the original images were not modified by JAX. # This means that we can get a perfect match Mah->JAX for the wholemounts using md5sum (produced unique values). # Because both we and JAX imported the Mahoney data/spreadsheet into auto-incremented primary keyfield # tables, the original order is preserved and allows a surprisingly good mah->jax many-to-one slices mapping. 
# I also extended vgLoadJax to be able to find the primers in the PRB_Notes which was useful both # for mapping the slices, and because we end up using JAX annotations for the fullCaption() page, # so that we don't lose primer info. For the remaining fraction where Mahoney never supplied primers # (actually they have since updated the info, but neither JAX nor we have gotten that yet), # something over 20%, we have managed to instead just map on gene. This worked surprisingly well, # and made either correct or very close matches. # Since previously, vgLoadJax looked for the mahoney set in jax and excluded it, we need to # make and import it into visiGene. I have removed the skipping of mahoney set from the # vgLoadJax code (so that next time we update jax, the mahoney set will not be excluded) # and added a commandline option to do just a single submission set. I happen to know that # the mahoney set in jax is jax92242. THIS STEP WON'T BE NEEDED IN FUTURE. # the latest jax sybase db conversion is on kkr3u00 because it had space and little use. 
ssh kkr3u00 cd ~/kent/src/hg/visiGene/vgLoadJax # clean out any old subdirectory rm -fr visiJaxMahoney # process just the jaxMahoney submissionSet ${HOME}/bin/i386/vgLoadJax -oneSubmissionSet=92242 /san/sanvol1/visiGene/gbdb jackson20060328 visiJaxMahoney # load it into visiGene db ssh hgwdev visiGeneLoad visiJaxMahoney/92242.ra visiJaxMahoney/92242.tab visiJaxMahoney/92242.txt # we are going to treat the jax version of Mahoney as "privateUser" # in order to suppress it and reduce the duplication of Mahoney images hgsql visiGene -e 'update submissionSet set privateUser=-1 where name like "jax92242"' submissionSets: (for reference) name id ----------------------------------- mahoneyWhole = 1 mahoneySlices01 = 2 jax92242 = 1820 ssh hgwdev cd /san/sanvol1/visiGene/gbdb/full/inSitu/Mouse/mahoney/wholeMount md5sum *.jpg | sed -e 's/ /\t/' | sort > ~/kent/src/hg/visiGene/vgLoadJax/mahoneyWholeMount.md5 #(quick - 1 or 2 minutes only) #Find the jax-Mahoney images that are WholeMount ssh hgwdev cd /san/sanvol1/visiGene/gbdb/full/inSitu/Mouse/jax # WARNING: change database name and submissionSet id constants in text below if needed!!! 
# jaxMahoney = 1820, bodyPart.id = 1 for name="whole" hgsql visiGene -BN -e 'select distinct imageFile.fileName from imageFile, image, specimen, bodyPart \ where imageFile.submissionSet=1820 and bodyPart=1 \ and image.imageFile=imageFile.id and image.specimen=specimen.id' \ | xargs md5sum | sed -e 's/ /\t/' | sort > ~/kent/src/hg/visiGene/vgLoadJax/jaxMahoneyWholeMount.md5 cd ~/kent/src/hg/visiGene/vgLoadJax # verify that they are unique by md5sum: wc -l *.md5 1833 jaxMahoneyWholeMount.md5 1843 mahoneyWholeMount.md5 sort -k 1,1 -u jaxMahoneyWholeMount.md5 | wc -l 1833 sort -k 1,1 -u mahoneyWholeMount.md5 | wc -l 1843 hgsql visiGene create table mahoneyWholeMountMd5 ( md5 char(32) not null, # md5 sum of .jpg fileName varchar(10) not null, # .jpg fileName INDEX(md5), INDEX(fileName) ); load data local infile 'mahoneyWholeMount.md5' into table mahoneyWholeMountMd5; analyze table mahoneyWholeMountMd5; create table jaxMahoneyWholeMountMd5 ( md5 char(32) not null, # md5 sum of .jpg fileName varchar(10) not null, # .jpg fileName INDEX(md5), INDEX(fileName) ); load data local infile 'jaxMahoneyWholeMount.md5' into table jaxMahoneyWholeMountMd5; analyze table jaxMahoneyWholeMountMd5; # verify that they match uniquely and completely: select count(*) from mahoneyWholeMountMd5 m, jaxMahoneyWholeMountMd5 j where m.md5 = j.md5; +----------+ | count(*) | +----------+ | 1833 | +----------+ # make forwarding table (NO NEED TO DO IN FUTURE, IS IN visiGene.as,.sql) CREATE TABLE imageFileFwd ( fromIf int not null, # From imageFile toIf int not null, # To imageFile #Indices INDEX(fromIf), INDEX(toIf) ); # WARNING: change submissionSet ids!!! 
# find how the mahoney matches to the jaxMahoney # (I verified that all filenames are unique in all 3 submissionSets: jaxM, mWhole, mSlices) insert into imageFileFwd select mi.id, ji.id from imageFile mi, imageFile ji, mahoneyWholeMountMd5 m, jaxMahoneyWholeMountMd5 j where m.md5 = j.md5 and mi.fileName=m.fileName and ji.fileName=j.fileName and mi.submissionSet=1 and ji.submissionSet=1820; # Records: 1828 # the wholemounts are now done, so let's do the slices next! # Cluster Run to do OCR on jaxMahoneySlices (Galt 2006-04-28) # if program ocrad is not in /cluster/bin/i386, download and compile it (very easy) # ocrad is a gnu program # Create parasol directory and a list of the jpg files. ssh hgwdev cd /san/sanvol1/visiGene/offline/jax mkdir ocrJaxMahoneyRun cd ocrJaxMahoneyRun mkdir output # make list of jaxMahoneySlice .jpgs # WARNING: change database and submissionSet ids!!! hgsql visiGene -BN -e 'select distinct imageFile.fileName from \ imageFile, image, specimen \ where imageFile.submissionSet=1820 and bodyPart<>1 \ and image.imageFile=imageFile.id and image.specimen=specimen.id' \ > jaxMahoneySlices.list # Create parasol batch cat << '_EOF_' > gsub #LOOP ./ocrSlices.csh $(file1) $(root1) #ENDLOOP '_EOF_' # << this line makes emacs coloring happy cat << '_EOF_' > ocrSlices.csh #!/bin/tcsh -ef if ( -e output/$2.map ) then rm output/$2.map endif if ( -e output/$2.ocr ) then rm output/$2.ocr endif convert /san/sanvol1/visiGene/gbdb/full/inSitu/Mouse/jax/$1 output/$2.pgm # There wasn't a single threshold value that worked, so do entire series /cluster/bin/i386/ocrad --threshold=.4 --charset=ascii output/$2.pgm >> output/$2.ocr /cluster/bin/i386/ocrad --threshold=.5 --charset=ascii output/$2.pgm >> output/$2.ocr /cluster/bin/i386/ocrad --charset=ascii output/$2.pgm >> output/$2.ocr /cluster/bin/i386/ocrad --threshold=.6 --charset=ascii output/$2.pgm >> output/$2.ocr /cluster/bin/i386/ocrad --threshold=.7 --charset=ascii output/$2.pgm >> output/$2.ocr 
/cluster/bin/i386/ocrad --threshold=.8 --charset=ascii output/$2.pgm >> output/$2.ocr /cluster/bin/i386/ocrad --threshold=.9 --charset=ascii output/$2.pgm >> output/$2.ocr @ x = $2 # special handling for 7996.jpg thru 8060.jpg, the Accession does not end in "aa" for these. if ( ($x >= 7996) && ($x <= 8060) ) then cat output/$2.ocr | tr lOoiI 10011 | tr -d . | perl -0ne 'print "$1\n" while ( /(T\d{8})/gs )' | sort -u > output/$2.temp else cat output/$2.ocr | tr lOoiI 10011 | tr -d . | perl -0ne 'print "$1\n" while ( /(T\d{8}aa)/gs )' | sort -u > output/$2.temp endif set tempTs = ( `cat output/$2.temp` ) if ( $#tempTs > 0 ) then foreach t ( $tempTs ) if ( ($x >= 7996) && ($x <= 8060) ) then # special handling for these if (-e /san/sanvol1/visiGene/gbdb/full/inSitu/Mouse/mahoney/slices/${t}aa) then set t = "${t}aa" endif if (-e /san/sanvol1/visiGene/gbdb/full/inSitu/Mouse/mahoney/slices/${t}00) then set t = "${t}00" endif endif echo "$1\t$t" >> output/$2.map end else echo "$1\tNO_TEXT" > output/$2.map endif rm output/$2.temp '_EOF_' # << this line makes emacs coloring happy chmod a+x ocrSlices.csh ssh pk cd /san/sanvol1/visiGene/offline/jax/ocrJaxMahoneyRun gensub2 jaxMahoneySlices.list single gsub spec para create spec para try para push para check para time #2095 jobs in batch #292661 jobs (including everybody's) in Parasol queue. 
#Checking finished jobs #Completed: 2095 of 2095 jobs #CPU time in finished jobs: 1059s 17.65m 0.29h 0.01d 0.000 y #IO & Wait Time: 5687s 94.79m 1.58h 0.07d 0.000 y #Average job time: 3s 0.05m 0.00h 0.00d #Longest running job: 0s 0.00m 0.00h 0.00d #Longest finished job: 7s 0.12m 0.00h 0.00d #Submission to last job: 134s 2.23m 0.04h 0.00d cat output/*.map | sort > ~/kent/src/hg/visiGene/vgLoadJax/jaxMahoneySlices.map -------------------------- ssh hgwdev cd ~/kent/src/hg/visiGene/vgLoadJax cat *.map | wc -l 4066 cat *.map | grep NO_TEXT | wc -l 1 (turns out to be a jax annotation caption error - missing leading zero in mtf#, ignoring) hgsql visiGene create table jaxMahoneySlicesMap ( jFileName varchar(10) not null, # jaxMahoney .jpg fileName mFileName varchar(20) not null # mahoney .jpg fileName ); load data local infile 'jaxMahoneySlices.map' into table jaxMahoneySlicesMap; update jaxMahoneySlicesMap set mFileName = concat(mFileName,".jpg") where mFileName <> "NO_TEXT"; create index jFileName on jaxMahoneySlicesMap(jFileName); create index mFileName on jaxMahoneySlicesMap(mFileName); analyze table jaxMahoneySlicesMap; # WARNING: change submissionSet ids!!! insert into imageFileFwd select mi.id, ji.id from imageFile mi, imageFile ji, jaxMahoneySlicesMap map where ji.fileName=map.jFileName and mi.fileName=map.mFileName and mi.submissionSet=2 and ji.submissionSet=1820; # Records: 3896 # It's looking good. # clean up drop table mahoneyWholeMountMd5; drop table jaxMahoneyWholeMountMd5; drop table jaxMahoneySlicesMap; ############################################################# # # Patch contributors so we can search submission sets # by specifying Mahoney in the search box. 
# select id from submissionSource where name like 'Mahoney%'; +----+ | id | +----+ | 1 | +----+ # note: we have to double the search word or else the search doesn't work insert into contributor set name = 'Mahoney mahoney'; mysql> select * from contributor where name in ('Mahoney mahoney'); +------+-----------------+ | id | name | +------+-----------------+ | 3987 | Mahoney mahoney | +------+-----------------+ insert into submissionContributor select id, '3987' from submissionSet where submissionSource = 1; ############################ ### JACKSON UPDATE (re-done to fix expression data 2006-06-05 galt) ############# # The expression data was not correctly matching subpanels, # and the bodyPart was incorrectly displaying just "floor plate" # instead of the full part-tree-lineage available in field printName. # # We found the solution in vgLoadJax was to NOT use the GXD_Expression # table at all - apparently it is not necessary as the data is in other tables. # This meant that we are using GXD_Strength values instead of the old 1/0 for level. # We made the vgLoadJax code treat these correctly, and tweaked hgVisiGene too. # And then we also decided to add the expression pattern while we were at it # since JAX db had it - so added it to vgLoadJax and hgVisiGene. # previously updated: jackson20060328 db on kkr3u00 # (see above, and see hg/visiGene/jackson/makeJackson.doc) # save imageFileFwd data in new form for easy restore: create table iffKeepThis as select a.fileName "fromFN", b.fileName "toFN" from imageFileFwd iff, imageFile a, imageFile b where iff.fromIf = a.id and iff.toIf = b.id; create index fromFN on iffKeepThis(fromFn(10)); create index toFN on iffKeepThis(toFn(10)); # Asked Heather to clone visiGene db to visiGeneBadExpr db, # and then ran this query to remove the old previous JAX info: # MULTI-TABLE DELETE: # CRITICAL! to make sure that analyze table has been run on all tables involved, # otherwise this will run forever. 
Don't assume that the cardinality is defined. # Running analyze table is super quick. analyze table submissionSource; analyze table submissionSet; analyze table submissionContributor; analyze table image; analyze table imageFile; analyze table imageProbe; analyze table expressionLevel; delete from submissionSource where name = 'MGI'; # 1 rows delete submissionSet from submissionSet ss left join submissionSource so on ss.submissionSource=so.id where so.id is null; # 1113 rows delete submissionContributor from submissionContributor sc left join submissionSet ss on sc.submissionSet=ss.id where ss.id is null; # 7926 rows delete image from image i left join submissionSet ss on i.submissionSet=ss.id where ss.id is null; # 33816 rows delete imageFile from imageFile imf left join submissionSet ss on imf.submissionSet=ss.id where ss.id is null; # 13854 rows delete imageProbe from imageProbe ip left join image i on ip.image=i.id where i.id is null; # 35395 rows delete expressionLevel from expressionLevel el left join imageProbe ip on el.imageProbe=ip.id where ip.id is null; # 102293 rows delete from imageFileFwd; # 5724 rows delete antibodySource from antibodySource abs left join submissionSource so on abs.submissionSource=so.id where so.id is null; # 745 rows #recompile vgLoadJax on dev #run vgLoadJax to create .ra .tab .txt for each submissionSet ssh kkr3u00 cd ~/kent/src/hg/visiGene/vgLoadJax #remove any old data dir rm -fr visiGene/ # visiGene in line below is just an output dir for the .ra/.tab/.txt files vgLoadJax /san/sanvol1/visiGene/gbdb jackson20060328 visiGene #refCount=2971 #ran loadAll to load the updated jax .ra .tab .txt into visiGene db ssh hgwdev cd ~/kent/src/hg/visiGene/vgLoadJax loadAll # deal with parallel Mahoney-in-Jax data select * from submissionSet where name='jax92242' \G *************************** 1. 
row *************************** id: 2848 name: jax92242 publication: Mouse Brain Organization Revealed Through Direct Genome-Scale TF Expression Analysis select id,name from submissionSet where name like 'mahoney%'; +----+-----------------+ | id | name | +----+-----------------+ | 2 | mahoneySlices01 | | 1 | mahoneyWhole01 | +----+-----------------+ # restore imageFileFwd from iffKeepThis, mapping the saved file names back to the new imageFile ids: insert into imageFileFwd select a.id, b.id from iffKeepThis iff, imageFile a, imageFile b where iff.fromFN = a.fileName and iff.toFN = b.fileName and a.submissionSet in (1,2) and b.submissionSet in (2848); drop table iffKeepThis; # Since we had lost the privateUser settings, # I just replicated it from visiGeneBadExpr backup with a simple query. update visiGene.submissionSet n, visiGeneBadExpr.submissionSet o set n.privateUser=-1 where o.privateUser=-1 and o.name = n.name; # currently this is just jax submissionSets for which we have not received permissions to use, # and the mahoney-in-jax that is suppressed. # ran vgGetText to update cgi-bin-galt/visiGeneData/ using visiGene db cd ~/kent/src/hg/visiGene/vgGetText make alpha # recompiled hgVisiGene earlier to support new expression level scale, and pattern # RE-MAKE knownToVisiGene tables (see respective makedocs for these) knownToVisiGene mm6 knownToVisiGene mm7 knownToVisiGene mm8 knownToVisiGene hg17 -fromProbePsl=vgAllProbes knownToVisiGene hg18 -fromProbePsl=vgAllProbes # Patch contributors so we can search MGI submission sets # by specifying JAX or MGI in the search box.
# select id from submissionSource where name = 'MGI'; +----+ | id | +----+ | 7 | +----+ # note: we have to double the search word or else the search doesn't work # skip adding these two which are already there: # insert into contributor set name = 'JAX JAX'; # insert into contributor set name = 'MGI MGI'; mysql> select * from contributor where name in ('JAX JAX','MGI MGI'); +------+---------+ | id | name | +------+---------+ | 3981 | JAX JAX | | 3982 | MGI MGI | +------+---------+ insert into submissionContributor select id, '3981' from submissionSet where submissionSource = 7; insert into submissionContributor select id, '3982' from submissionSet where submissionSource = 7; ####################################################### # # Received a major update from Susan Sunkin at ABA # consisting of 6000 new images (we had 12000 already) # # # Allen Brain Atlas jp2 image prep (Galt 2006-12-12) # Create parasol directory and a list of the jpg files. ssh pk cd /san/sanvol1/visiGene/offline/allenBrain rm -fr prepImageRun mkdir prepImageRun find imageDisk -name '*.jp2' -print | sed 's/imageDisk\///' | grep May_06 > prepImageRun/jpg.lst cd prepImageRun # Create parasol batch cat << '_EOF_' > gsub #LOOP vgPrepImage /san/sanvol1/visiGene/offline/allenBrain/imageDisk /san/sanvol1/visiGene/gbdb/200/inSitu/Mouse/allenBrain /san/sanvol1/visiGene/gbdb/full/inSitu/Mouse/allenBrain $(path1) #ENDLOOP '_EOF_' # << this line makes emacs coloring happy gensub2 jpg.lst single gsub spec para make spec -maxNode=50 [pk:prepImageRun> /parasol/bin/para time 6317 jobs in batch 266106 jobs (including everybody's) in Parasol queue. 
Checking finished jobs Completed: 6317 of 6317 jobs CPU time in finished jobs: 267986s 4466.44m 74.44h 3.10d 0.008 y IO & Wait Time: 368981s 6149.68m 102.49h 4.27d 0.012 y Average job time: 101s 1.68m 0.03h 0.00d Longest running job: 0s 0.00m 0.00h 0.00d Longest finished job: 1471s 24.52m 0.41h 0.02d Submission to last job: 43292s 721.53m 12.03h 0.50d # -maxNode=50 was needed because it opens many output files at the same time - do not overwhelm NFS # ----------------------- # Allen Brain Atlas update (Galt 2007-02-08) # see mm6.txt for prep running allenCleanup and allenCollectSeq # LOAD ALLEN BRAIN DATA # note mm6,mm7,mm8 all have the same thing since it is for mouse generally # note make sure the contributors list in vgLoadAllen.c is correct vgLoadAllen \ /san/sanvol1/visiGene/gbdb/full/inSitu/Mouse/allenBrain \ /san/sanvol1/visiGene/offline/allenBrain/probesAndData/allen20061204.tab \ /cluster/data/mm6/bed/allenBrain/allProbes.fa \ /cluster/data/mm6/bed/allenBrain/allProbes.tab \ output #Got 17913 images #Got 17913 named probes #Got 17913 probe sequences # Did not do this: # (instead, I asked Heather to clone entire visiGene db to visiGeneOld) #backed-up data in case of trouble: #mkdir /san/sanvol1/visiGene/dump/visiGene.20061220 #hgsqldump visiGene -T /san/sanvol1/visiGene/dump/visiGene.20061220 #restore fileLocation to point to dev update fileLocation set name = concat('http://hgwdev.cse.ucsc.edu',substring(name,INSTR(name,'/visiGene/'))); # 14 rows # clean out the old ABA records before we do a full load delete from submissionSource where name = 'Allen Brain Atlas (ABA)'; # 1 row delete submissionSet from submissionSet ss left join submissionSource so on ss.submissionSource=so.id where so.id is null; # 1 row delete submissionContributor from submissionContributor sc left join submissionSet ss on sc.submissionSet=ss.id where ss.id is null; # 13 rows delete image from image i left join submissionSet ss on i.submissionSet=ss.id where ss.id is null; # 11736 rows 
delete imageFile from imageFile imf left join submissionSet ss on imf.submissionSet=ss.id where ss.id is null; # 11736 rows delete imageProbe from imageProbe ip left join image i on ip.image=i.id where i.id is null; # 11737 rows delete expressionLevel from expressionLevel el left join imageProbe ip on el.imageProbe=ip.id where ip.id is null; # 0 rows delete antibodySource from antibodySource abs left join submissionSource so on abs.submissionSource=so.id where so.id is null; # 0 rows #load into visiGene db visiGeneLoad -database=visiGene output/aba.ra output/aba.tab /dev/null # RE-MAKE FULL TEXT INDEX cd hg/visiGene/vgGetText make alpha # basically does this, and puts it in cgi-bin/visiGeneData/: #vgGetText visiGene.text mm8 hg18 #ixIxx visiGene.text visiGene.ix visiGene.ixx ############################ # REBUILD PROBETRACK (DONE galt 2007-02-15) # WITH vgProbeTrack PROGRAM - AFTER DOING Allen Brain Atlas update 2007-02-08 # (make a backup of visiGene db and these tables: {mm6,mm7,...}.vgProbes and {hg17,hg18,...}.vgAllProbes) cd /san/sanvol1/visiGene/dump # (this backup shown is really an example template for the next person who needs to do this) mkdir visiGene.20070215 cd visiGene.20070215 hgsqldump visiGene -T .
mkdir mm6; hgsqldump mm6 vgProbes -T mm6 mkdir mm7; hgsqldump mm7 vgProbes -T mm7 mkdir mm8; hgsqldump mm8 vgProbes -T mm8 mkdir hg17; hgsqldump hg17 vgAllProbes -T hg17 mkdir hg18; hgsqldump hg18 vgAllProbes -T hg18 #(do any others needed that might not be listed here) #(document the reason for making the backup) echo 'vgLoadAllen has been run on ABA update 2007-02-08, so making backup of visiGene db and probe tracks before updating probeTracks, ' > README # OK, NOW USE vgProbeTrack TO UPDATE cd ~/kent/src/hg/visiGene/vgProbeTrack # Make sure vgProbeTrack program is up to date make # -sqlPath must be included whenever the vgProbes or vgAllProbes track tables do not yet exist for the db # so it can find the .sql script to create vgProbes or vgAllProbes tables as needed. # I happen to know that only JAX was updated since last time, and that is mouse only # populate vgPrb with any new stuff in visiGene.probe (works for all taxons at once). vgProbeTrack POP # new probe records found = 7335, # new vgPrb records added = 7314 # most of these are old, but we updated ABA by dropping completely and re-adding # so these probes find their way back via sequence identity of probes in vgPrb.sequence # find sequence using various methods - given probe seq, primers, bacs, refseq, etc. # must specify a specific assembly to use, so mm8 is ready to use now # this finds any stuff for the mouse taxon vgProbeTrack SEQ working mm8 rc = 17 = count of primers for genome search for taxon 10090 rc = 141 = count of primers for mrna search for taxon 10090 bac list read done. 
found seq for 0 bacEndPairs rc = 93 = count of refSeq mrna for mm8 rc = 1 = count of genRef mrna for mm8 rc = 4 = count of genbank mrna for mm8 rc = 19 = count of flatRef mrna for mm8 rc = 0 = count of flatAll mrna for mm8 rc = 0 = count of linkRef mrna for mm8 rc = 0 = count of linkAll mrna for mm8 rc = 1 = count of kgAlRef mrna for mm8 rc = 4 = count of kgAlAll mrna for mm8 # create alignments using either refSeqAli or all_mrna or bacEnds or blat. Took 1.5 hours. # alignments are individually tracked per assembly here # alignment successes go in $db.vgProbes psl track, and whether succeeded or failed, # it only looks for things that have not already attempted alignment # the status goes into visiGene.vgPrbAli with .db="mm8" vgProbeTrack ALI working mm8 # this finds any seq required for mm8.vgProbes track not already in mm8.seq # adds the new .fa file in /cluster/data/mm8/bed/visiGene/ # adds a symlink to it in /gbdb/mm8/visiGene/ # and runs hgLoadSeq mm8 /gbdb/mm8/visiGene/vgPrbExt_??????.fa to add it to mm8.seq vgProbeTrack EXT working mm8 # mm7.vgProbes was already complete from previous probe track creation, # it just needed to catch the new Allen Brain probes and align them. About 1.5 hours. vgProbeTrack ALI working mm7 vgProbeTrack EXT working mm7 # mm6.vgProbes was already complete from previous probe track creation, # it just needed to catch the new Allen Brain probes and align them. About 1.5 hours. vgProbeTrack ALI working mm6 vgProbeTrack EXT working mm6 # hg18.vgAllProbes was pre-existing with all probes, just need to add new allenBrain mouse # this internally uses pslMap against the mm8 to hg18 liftover chain.gz # Because it is "Xeno" (from mouse to human), it creates track vgAllProbes, # and maintains the list of processed alignments in visiGene.vgPrbAliAll. vgProbeTrack PSLMAP working hg18 mm8 # updates hg18.seq/extFile similarly to the EXT command, but for All probes.
# just like with EXT, EXTALL puts .fa in /cluster/data/hg18/visiGene # and symlink in /gbdb/hg18/visiGene and updates using hgLoadSeq. # if a sequence has already been loaded it will not be loaded again. vgProbeTrack EXTALL working hg18 # hg17.vgAllProbes existed before vgProbeTrack PSLMAP working hg17 mm7 vgProbeTrack EXTALL working hg17 # RE-MAKE knownToVisiGene tables (see respective makedocs for these) knownToVisiGene mm6 knownToVisiGene mm7 knownToVisiGene mm8 knownToVisiGene hg17 -fromProbePsl=vgAllProbes knownToVisiGene hg18 -fromProbePsl=vgAllProbes # update text/index for visiGene cd hg/visiGene/vgGetText make alpha #vgGetText /usr/local/apache/cgi-bin/visiGeneData/visiGene.text mm7 mm8 hg17 hg18 #probe has 26611 rows #gene has 20413 rows #imageProbe has 125765 rows ################### (galt 2007-04-20 done) # FIXED TWO ADDITIONAL ZOOM-OUT LEVELS 5 AND 6: # Ran /san/sanvol1/offline/level56RunJax/ cluster job on a list of all files needed. # Somehow 13000 pix were missing from the list when we made zoom out levels 5 and 6 # originally cd /san/sanvol1/visiGene/gbdb/full/inSitu/Mouse/jax/ find . -type d > dlist vi dlist #remove anything starting with "foo" or "ztest" or "goo" plus "." #That should leave just valid directories. cat dlist | sed -e 's/\.\///' > dlist2 [hgwdev:jax> cat level6missing.csh #!/bin/tcsh set nonomatch while (1) set i=$< if ("$i" == "") then break endif if ( -e $i/*_6_000.jpg) then else echo "$i" endif end cat dlist2 | level6missing.csh > dlist3 cd /san/sanvol1/visiGene/offline mkdir level56RunJax cd level56RunJax cp ../level56Run/level56.csh . cp ../level56Run/gsub . 
cat gsub [hgwdev:level56RunJax> cat gsub #LOOP ./level56.csh $(path1) #ENDLOOP cat /san/sanvol1/visiGene/gbdb/full/inSitu/Mouse/jax/dlist3 | gawk '{print "/san/sanvol1/visiGene/gbdb/full/inSitu/Mouse/jax/" $1 ".jpg"}' > jpg.lst ssh pk cd /san/sanvol1/visiGene/offline/level56RunJax gensub2 jpg.lst single gsub spec para create spec para try para push para time #Completed: 13235 of 13235 jobs #CPU time in finished jobs: 3819s 63.65m 1.06h 0.04d 0.000 y #IO & Wait Time: 45674s 761.23m 12.69h 0.53d 0.001 y #Average job time: 4s 0.06m 0.00h 0.00d #Longest running job: 0s 0.00m 0.00h 0.00d #Longest finished job: 21s 0.35m 0.01h 0.00d #Submission to last job: 560s 9.33m 0.16h 0.01d # Followup to show that it worked: #[hgwdev:jax> cat dlist2 | level6missing.csh > dlist3X #[hgwdev:jax> ll dlist* #-rw-rw-r-- 1 galt protein 117205 Apr 20 13:13 dlist #-rw-rw-r-- 1 galt protein 85191 Apr 20 13:15 dlist2 #-rw-rw-r-- 1 galt protein 71495 Apr 20 13:33 dlist3 #-rw-rw-r-- 1 galt protein 0 Apr 20 14:25 dlist3X # #This shows that all completed (because dlist3X is empty) #----------------------------------------------- # Rsync request #please rsync from /san to hgnfs1: rsync hgwdev:/san/sanvol1/visiGene/gbdb/full/inSitu/Mouse/jax/ hgnfs1:/export/gbdb2/full/inSitu/Mouse/jax/ # ################### (galt 2007-05-09 done) # Adding support to vgProbeTrack and knownToVisiGene for # the BLATZ'd frog probes to mm8 which Jim did recently. # knownToVisiGene no longer uses -fromProbePsl option, # instead it automatically detects vgProbes and vgAllProbes # and uses them in that order if no symbolic matches were found. # Added a SELFMAP command to vgProbeTrack to migrate any missing # self alignments in vgProbes to vgAllProbes # Made a backup of visiGene.vg* first: ssh hgwdev cd /san/sanvol1/visiGene/dump mkdir visiGene.20070509 cd visiGene.20070509 hgsqldump visiGene -T . cd ~/kent/src/hg/visiGene/vgProbeTrack vgProbeTrack -sqlPath=..
REMAP working mm8 nibb nibbImageProbes /gbdb/mm8/nibbImageProbes.fa #FYI: Table mm8.vgAllProbes does not exist #hgPepPred visiGene generic vgRemapTemp /gbdb/mm8/nibbImageProbes.fa #Processing /gbdb/mm8/nibbImageProbes.fa #Count of Psls found for reMap: 1379 #cat vgPrbReMap.psl vgAllProbes.psl | sort -u | sort -k 10,10 >vgAllProbesNew.psl #hgLoadPsl mm8 vgAllProbesNew.psl -table=vgAllProbes #Processing vgAllProbesNew.psl #rm vgPrbReMap.psl vgAllProbes.psl vgAllProbesNew.psl vgProbeTrack SELFMAP working mm8 #Count of nonBac Psls found for pslMap: 24615 #Count of bac Psls found for pslMap: 0 #cat bac.psl nonBac.psl > vgPrbSelfMap.psl #cat vgPrbSelfMap.psl vgAllProbes.psl | sort -u | sort -k 10,10 > vgAllProbesNew.psl #hgLoadPsl mm8 vgAllProbesNew.psl -table=vgAllProbes #Processing vgAllProbesNew.psl #rm vgPrbSelfMap.psl vgAllProbes.psl vgAllProbesNew.psl vgProbeTrack EXTALL working mm8 #rc = 981 = count of sequences for vgPrbExt.fa, to use with mm8 track vgAllProbes #cp vgPrbExt.fa /cluster/data/mm8/bed/visiGene/vgPrbExt_YDGSWH.fa #ln -s /cluster/data/mm8/bed/visiGene/vgPrbExt_YDGSWH.fa /gbdb/mm8/visiGene/vgPrbExt_YDGSWH.fa #hgLoadSeq mm8 /gbdb/mm8/visiGene/vgPrbExt_YDGSWH.fa #981 sequences #Updating seq table knownToVisiGene mm8 #################################################### ################### (galt 2008-04-04 done) # Slight name change for NIBB (affected visiGene) # removed the word "Japanese " from NIBB name in visiGene.submissionSource # removed same thing from vgLoadNibb.c source code. # requested push of table hgwdev.visiGene.submissionSource.
################### (galt 2008-08-18 done) # make downloads for visiGene ssh hgwdev co browser # if you haven't already done it change browser module, downloads.html to add links to visiGene download cvs commit browser/downloads.html # updating the visiGene downloads cd /usr/local/apache/htdocs/goldenPath mkdir visiGene cd visiGene mkdir database cd database vi README --------- This directory contains the downloadable tables in the UCSC visiGene database. This database is shared by the program VisiGene http://genome.ucsc.edu/cgi-bin/hgVisiGene and tracks that incorporate visiGene data, such as the Known Genes tracks. To see descriptions of the tables in visiGene, visit the Table Browser: http://genome.ucsc.edu/cgi-bin/hgTables select "All Tables" as the group, select visiGene as the database, and select a table. Then click the "describe table schema" button. --------- hgsqldump visiGene -T . rm vgPrbAli.* rm vgPrbAliAll.* sed -i -e 's/hgwdev[.]cse/genome/' fileLocation.txt gzip *.txt Do a push-request: ------------ please rsync (with appropriate flags) hgwdev:/usr/local/apache/htdocs/goldenPath/visiGene/ to hgdownload:/usr/local/apache2/htdocs/goldenPath/visiGene/ Reason: Now users will have an easier time of downloading visiGene database. ------------ also, first time only, do update browser sandbox with links on downloads.html, then do a push-request: Please push downloads.html from dev to hgdownload: hgwdev:/usr/local/apache/htdocs/downloads.html to hgdownload:/usr/local/apache2/htdocs/downloads.html Reason: added the page links for visiGene database download. 
################### (galt 2008-09-08 done) # move visiGene data to hive ssh hgwdev mv /san/sanvol1/visiGene /hive/data/inside/visiGene ln -s /hive/data/inside/visiGene /gbdb/visiGene # note /usr/local/apache/htdocs/visiGene is still a symlink to /gbdb/visiGene # ################### (galt 2009-10-14) # adjustments for mysql5 blob string comparisons ssh hgwdev hgsql visiGene -e 'ALTER table vgPrb modify seq longtext' # also changed vgProbeTrack.c to make vgRemapTemp.seq be longtext. ################### (galt 2009-10-15) # Initial vgProbeTrack run for hg19 ssh hgwdev cd /hive/data/inside/visiGene/dump mkdir visiGene.20091015 hgsqldump visiGene -T visiGene.20091015 cd ~/kent/src/hg/visiGene/vgProbeTrack make vgProbeTrack PSLMAP working hg19 mm9 -sqlPath=.. >& hg19VgptPslmap.log& # takes a few minutes #----------------------------- #Count of nonBac Psls found for pslMap: 24924 #Count of bac Psls found for pslMap: 0 #rc = 24647 = count of sequences for pslMap for taxon 10090 # updates vgPrbAliAll and hg19.vgAllProbes #mysql> select count(*) from vgPrbAliAll where db='hg19'; # 24886 vgProbeTrack REMAP working hg19 nibb nibbImageProbes \ /gbdb/hg19/nibbImageProbes.fa >& hg19VgptRemap.log& #Count of Psls found for reMap: 1441 vgProbeTrack EXTALL working hg19 >& hg19VgptExtall.log& #rc = 24539 = count of sequences for vgPrbExt.fa, to use with hg19 track vgAllProbes #creates /gbdb/hg19/visiGene/vgPrbExt_SJMFAO.fa #hgLoadSeq hg19 /gbdb/hg19/visiGene/vgPrbExt_SJMFAO.fa #Warning: load of seq did not go as planned: 24539 record(s), 0 row(s) skipped, # (these are due to NULL in unused field gb_date, harmless) # someone will probably add show warnings to hgLoadSeq soon. #mysql> select count(*) from hg19.vgAllProbes; # 25931 cd ~/kent/src/hg/visiGene/knownToVisiGene make knownToVisiGene hg19 ################################################# # Fix Mahoney Lab submissionSource website gone. # this website mahoney.chip.org is dead # and it looks like forever.
# Changed visiGene.submissionSource.{setUrl,itemUrl} to "" # for the Mahoney Lab (id==1). # Also changed source of hgVisiGene/printCaption.c # to tolerate the blank string.