#This describes how at least some of the tables in #hgFixed were created. This is a database containing #primarily expression data. There are two main formats: # expRecord.as - This describes the mRNA sources for # a series of microarray experiments # expData.as - This describes the measured value # in either absolute or relative ratio terms of # each gene/probe/target in a series of microarray # experiments. Each expData is associated with # an expRecord, though expDatas sometimes share # the same expRecord. #The Human Affy GNF Expression Atlas 2003 Version: # Create the main expRecord table and the expData table for # the absolute measurements as so: hgGnfMicroarray gnfHumanU95AllExps gnfHumanU95All /projects/compbio/data/microarray/affyGnfHuman/data_public_U95 # Convert these to ratios using the median of medians of non-cancerous # cell types as the denominator as so: cd ~/src/hg/makeDb/hgRatioMicroarray hgRatioMicroarray gnfHumanU95All gnfHumanU95AllRatio -clump=gnfClump.ra # Take the median value over multiple replicants and put in this table: cd ../hgMedianMicroarray hgMedianMicroarray hgFixed gnfHumanU95AllRatio gnfHumanU95AllExps gnfU95Median.ra gnfHumanU95MedianRatio gnfHumanU95MedianExps -minExps=1 # Also make a median version of the absolute measurements hgMedianMicroarray hgFixed gnfHumanU95All gnfHumanU95AllExps gnfU95Median.ra gnfHumanU95Median gnfHumanU95MedianExps -minExps=1 # The Mouse Affy GNF Expression Atlas: # Create the expRecord tables for U74 a/b/c and the expData table for # the absolute measurements: hgGnfMicroarray gnfMouseU74aAllExps gnfMouseU74aAll /projects/compbio/data/microarray/affyGnfMouse/data/data_public_U74 hgGnfMicroarray gnfMouseU74bAllExps gnfMouseU74bAll /projects/compbio/data/microarray/affyGnfMouse/data/U74B_b.txt hgGnfMicroarray gnfMouseU74cAllExps gnfMouseU74cAll /projects/compbio/data/microarray/affyGnfMouse/data/U74C_b.txt # Convert these to ratios using the median of medians of # cell types as the denominator as so: cd 
~/src/hg/makeDb/hgRatioMicroarray hgRatioMicroarray gnfMouseU74aAll gnfMouseU74aAllRatio -clump=gnfMouseU74aClump.ra hgRatioMicroarray gnfMouseU74bAll gnfMouseU74bAllRatio -clump=gnfMouseU74bClump.ra hgRatioMicroarray gnfMouseU74cAll gnfMouseU74cAllRatio -clump=gnfMouseU74cClump.ra # Take the median value over multiple replicants and put in this table: cd ../hgMedianMicroarray hgMedianMicroarray hgFixed gnfMouseU74aAllRatio gnfMouseU74aAllExps gnfMouseU74aMedian.ra gnfMouseU74aMedianRatio gnfMouseU74aMedianExps -minExps=1 hgMedianMicroarray hgFixed gnfMouseU74bAllRatio gnfMouseU74bAllExps gnfMouseU74bMedian.ra gnfMouseU74bMedianRatio gnfMouseU74bMedianExps -minExps=1 hgMedianMicroarray hgFixed gnfMouseU74cAllRatio gnfMouseU74cAllExps gnfMouseU74cMedian.ra gnfMouseU74cMedianRatio gnfMouseU74cMedianExps -minExps=1 # Also make a median version of the absolute measurements hgMedianMicroarray hgFixed gnfMouseU74aAll gnfMouseU74aAllExps gnfMouseU74aMedian.ra gnfMouseU74aMedian gnfMouseU74aMedianExps -minExps=1 hgMedianMicroarray hgFixed gnfMouseU74bAll gnfMouseU74bAllExps gnfMouseU74bMedian.ra gnfMouseU74bMedian gnfMouseU74bMedianExps -minExps=1 hgMedianMicroarray hgFixed gnfMouseU74cAll gnfMouseU74cAllExps gnfMouseU74cMedian.ra gnfMouseU74cMedian gnfMouseU74cMedianExps -minExps=1 #The Human GNF Expression Atlas 2 (2004) # # Create the main expRecord table and the expData table for # the absolute measurements as so: hgGnfMicroarray gnfHumanAtlas2AllExps gnfHumanAtlas2All /projects/compbio/data/microarray/geneAtlas2/human/U133A+GNF1B_101402.AD.txt -chip=U133A+GNF1B # Convert these to ratios using the median of medians of non-cancerous # cell types as the denominator as so: cd ~/src/hg/makeDb/hgRatioMicroarray hgRatioMicroarray gnfHumanAtlas2All gnfHumanAtlas2AllRatio -clump=gnfHumanAtlas2Clumps.ra # Take the median value over multiple replicants and put in this table: cd ../hgMedianMicroarray hgMedianMicroarray hgFixed gnfHumanAtlas2AllRatio gnfHumanAtlas2AllExps 
gnfHumanAtlas2.ra gnfHumanAtlas2MedianRatio gnfHumanAtlas2MedianExps -minExps=1 # Also make a median version of the absolute measurements hgMedianMicroarray hgFixed gnfHumanAtlas2All gnfHumanAtlas2AllExps gnfHumanAtlas2.ra gnfHumanAtlas2Median gnfHumanAtlas2MedianExps -minExps=1 #The Mouse GNF Expression Atlas 2 (2004) # Create the main expRecord table and the expData table for # the absolute measurements as so: hgGnfMicroarray gnfMouseAtlas2AllExps gnfMouseAtlas2All /projects/compbio/data/microarray/geneAtlas2/mouse/GNF1M_20030403.AD.txt -chip=GNF1M # Convert these to ratios using the median of medians of non-cancerous # cell types as the denominator as so: cd ~/src/hg/makeDb/hgRatioMicroarray hgRatioMicroarray gnfMouseAtlas2All gnfMouseAtlas2AllRatio -clump=../hgMedianMicroarray/gnfMouseAtlas2.ra # Take the median value over multiple replicants and put in this table: cd ../hgMedianMicroarray hgMedianMicroarray hgFixed gnfMouseAtlas2AllRatio gnfMouseAtlas2AllExps gnfMouseAtlas2.ra gnfMouseAtlas2MedianRatio gnfMouseAtlas2MedianExps -minExps=1 # Also make a median version of the absolute measurements hgMedianMicroarray hgFixed gnfMouseAtlas2All gnfMouseAtlas2AllExps gnfMouseAtlas2.ra gnfMouseAtlas2Median gnfMouseAtlas2MedianExps -minExps=1 #The Rat GNF Expression Atlas 2 (2004) # Create the main expRecord table and the expData table for # the absolute measurements as so: hgGnfMicroarray gnfRatAtlas2AllExps gnfRatAtlas2All /projects/compbio/data/microarray/geneAtlas2/rat/PivotNoApwithTissues.txt -chip=RG-U34A -ref=http://expression.gnf.org/ratlas # Convert these to ratios using the median of medians of non-cancerous # tissues or cell types (in this case, this is all the tissues) as the # denominator as so: cd ~/src/hg/makeDb/hgRatioMicroarray hgRatioMicroarray gnfRatAtlas2All gnfRatAtlas2AllRatio -clump=gnfRatAtlas2Clumps.ra # Take the median value over multiple replicants and put in this table. 
# Use Clumps.ra file renamed as gnfRatAtlas2.ra as this contains all the # tissues since there are no cancer tissues in this expression data set: cd ../hgMedianMicroarray hgMedianMicroarray hgFixed gnfRatAtlas2AllRatio gnfRatAtlas2AllExps gnfRatAtlas2.ra gnfRatAtlas2MedianRatio gnfRatAtlas2MedianExps -minExps=1 # Also make a median version of the absolute measurements hgMedianMicroarray hgFixed gnfRatAtlas2All gnfRatAtlas2AllExps gnfRatAtlas2.ra gnfRatAtlas2Median gnfRatAtlas2MedianExps -minExps=1 # C. elegans life cycle data from the Kim Lab via the Stanford Microarray Database. cd ~/kent/src/hg/makeDb/hgStanfordMicroarray hgStanfordMicroarray hgFixed kimWormLifeAllRatio kimWormLifeAllExps /projects/compbio/data/microarray/wormLifeCycle/spots -swap '-trimName=(green)' -suppress=green '-trimTissue=(repeat #?)' cd ../hgMedianMicroarray hgMedianMicroarray hgFixed kimWormLifeAllRatio kimWormLifeAllExps kimMed.ra kimWormLifeMedianRatio kimWormLifeMedianExps # D. melanogaster life cycle data from Arbeitman et al 2002 # via the Stanford Microarray Database. cd ~/kent/src/hg/makeDb/hgStanfordMicroarray # absolute: hgStanfordMicroarray -geneField="Systematic name" -dataField=CH2I_MEDIAN \ hgFixed arbFlyLifeAll arbFlyLifeAllExps \ /projects/compbio/data/microarray/flyLifeCycle/spots # ratios: hgStanfordMicroarray -geneField="Systematic name" \ hgFixed arbFlyLifeAllRatio arbFlyLifeAllExps \ /projects/compbio/data/microarray/flyLifeCycle/spots cd ../hgMedianMicroarray echo "select name,id from arbFlyLifeAllExps" | hgsql -N hgFixed \ | sort > arbMed.ra # edit arbMed.ra to collapse the N=1, N=2 lines. # median absolute: hgMedianMicroarray hgFixed arbFlyLifeAll arbFlyLifeAllExps arbMed.ra \ arbFlyLifeMedian arbFlyLifeMedianExps # median ratios: hgMedianMicroarray hgFixed arbFlyLifeAllRatio arbFlyLifeAllExps arbMed.ra \ arbFlyLifeMedianRatio arbFlyLifeMedianExps # cvs add and check in arbMed.ra ########################### # REGENERATING FLY LIFE-CYCLE TABLES. 
(DONE 5/12/2006 ANDY) hgsql hgFixed -e "rename table kimWormLifeAllRatio to kimWormLifeAllRatio_old" hgsql hgFixed -e "rename table kimWormLifeMedianExps to kimWormLifeMedianExps_old" hgsql hgFixed -e "rename table kimWormLifeMedianRatio to kimWormLifeMedianRatio_old" # The scopDes table, which is used by the SuperFamily column in hgNear. mkdir /cluster/store1/scop cd /cluster/store1/scop wget http://scop.mrc-lmb.cam.ac.uk/scop/parse/dir.des.scop.txt_1.63 grep -v '^#' dir.des.scop.txt* > scopDes.txt hgsql hgFixed < ~/kent/src/hg/lib/scopDes.sql echo "load data local infile 'scopDes.txt' into table scopDes;" | hgsql hgFixed # The Yeast Cell Cycle Time Course from Cho RJ et al 1998 cd /cluster/data/sacCer1/download/systematic_results/expression_data hgGnfMicroarray yeastChoCellCycleExps yeastChoCellCycle \ Cho_et_al_full_data.txt -chip=affyYeast \ -chopName=/ \ -url=http://yscdp.stanford.edu/yeast_cell_cycle/cellcycle.html \ -ref=http://www.pnas.org/cgi/content/abstract/95/7/3752 \ -credit=http://yscdp.stanford.edu/yeast_cell_cycle/cellcycle.html cd ~/src/hg/makeDb/hgRatioMicroarray hgRatioMicroarray yeastChoCellCycle yeastChoCellCycleRatio # Mouse expression data by sex on Affy MOE430A arrays from # John Rinn (john.rinn@yale.edu) et al. cd /projects/compbio/data/microarray/rinnEtAl hgGnfMicroarray mouseRinnSexExps mouseRinnSex rinnEtAlSpots.txt \ -chip=MOE430A \ -url=n/a \ -ref=n/a \ -credit=n/a cd ~/kent/src/hg/makeDb/hgRatioMicroarray hgRatioMicroarray mouseRinnSex mouseRinnSexRatio cd ~/kent/src/hg/makeDb/hgMedianMicroarray hgMedianMicroarray hgFixed mouseRinnSex mouseRinnSexExps mouseRinnSex.ra mouseRinnSexMedian mouseRinnSexMedianExps hgMedianMicroarray hgFixed mouseRinnSexRatio mouseRinnSexExps mouseRinnSex.ra mouseRinnSexMedianRatio mouseRinnSexMedianExps # D. melanogaster full euchromatic expression profile (FEEP) -- # Stolc et al. 2004. 
# Loaded up absolute tables directly from files downloaded from # http://genome.med.yale.edu/FEEP/FEEP.html -- # see /projects/compbio/data/microarray/flyFEEP/README . # Extract ratio from absolute: hgRatioMicroarray flyFeepAll flyFeepAllRatio cd ~/kent/src/hg/makeDb/hgMedianMicroarray echo "select description,id from flyFeepAllExps" | hgsql -N hgFixed \ | sort > flyFeepMed.ra # edit flyFeepMed.ra to collapse lines with the same initial character. # median absolute: hgMedianMicroarray hgFixed flyFeepAll flyFeepAllExps flyFeepMed.ra \ flyFeepMedian flyFeepMedianExps # median ratios: hgMedianMicroarray hgFixed flyFeepAllRatio flyFeepAllExps flyFeepMed.ra \ flyFeepMedianRatio flyFeepMedianExps # cvs add and check in flyFeepMed.ra # Human data from Shyamsundar R, et al. (2005) Genome Biol 6(3):R22 mkdir -p /projects/compbio/data/microarray/shyamsundarEtAl cd /projects/compbio/data/microarray/shyamsundarEtAl wget ftp://smd-ftp.stanford.edu/smd/publications/426/3130/exptsetno_3130.tar.gz wget ftp://smd-ftp.stanford.edu/smd/publications/426/3130/exptset_3130.meta tar xfz exptsetno_3130.tar.gz rm exptsetno_3130.tar.gz mkdir spots cat << _EOF_ > cleanXls.awk { if (/^!/) { line = \$0 gsub(/\"|,/, "", line) print line } else print } _EOF_ for file in *.xls; do awk -f cleanXls.awk $file > spots/$file done cd ~/kent/src/hg/makeDb/hgMedianMicroarray # The hgFixed.history doesn't have the errata column echo alter table history add column errata varchar(255) | hgsql hgFixed hgStanfordMicroarray -dataField="Normalized Ch2 Intensity (Median)" \ hgFixed humanNormal humanNormalExps /projects/compbio/data/microarray/shyamsundarEtAl/spots hgStanfordMicroarray -dataField="Log(base2) of R/G Normalized Ratio (Mean)" \ hgFixed humanNormalRatio humanNormalExps /projects/compbio/data/microarray/shyamsundarEtAl/spots echo "select name from humanNormalExps" | hgsql -N hgFixed | awk "{print \"\'\"\$0\"\'\"}" > col1 echo "select id from humanNormalExps" | hgsql -N hgFixed > col2 n=`wc -l < col1` 
for i in `seq 1 $n`; do echo "n/a" >> col1.5; done paste col1 col1.5 col2 | sort | tr '\t' ' ' > humanNormal.ra rm col1 col1.5 col2 # EDIT humanNormal.ra by hand and combine the like tissues hgMedianMicroarray -minExps=1 hgFixed humanNormal humanNormalExps humanNormal.ra \ humanNormalMedian humanNormalMedianExps hgMedianMicroarray -minExps=1 hgFixed humanNormalRatio humanNormalExps humanNormal.ra \ humanNormalMedianRatio humanNormalMedianExps #### HUMAN NORMAL DATA FIXING (10/5/2006 Andy) ssh hgwdev cd /projects/compbio/data/microarray/shyamsundarEtAl mv spots/13729.xls . tail +23 13729.xls | cut -f8,63 > data.txt echo 13729 > arrays.txt for array in spots/*; do echo $array >> arrays.txt tail +23 $array | cut -f63 > newCol.txt paste data.txt newCol.txt > tmp.txt mv tmp.txt data.txt done sed '/^[[:space:]]/d' data.txt > tmp.txt mv tmp.txt data.txt sed 's/spots\///;s/\.xls.*$//' arrays.txt > tmp.txt mv tmp.txt arrays.txt for id in `cat arrays.txt`; do grep $id -B1 exptset_3130.meta | grep Name | sed 's/.*=//;s/\"//g' >> names.txt; done paste arrays.txt names.txt | sort -k2,2 > tmp.txt mv tmp.txt arrays.txt rm names.txt # I changed my mind echo "" | cat - names.txt | tr '\n' '\t' > oneLine.txt cat oneLine.txt data.txt > tmp.txt mv tmp.txt data. # (copy/paste this into columnDb.ra) # Mouse data from Zhang et al., "The functional landscape of mouse gene expression", J Biol. 
# http://hugheslab.med.utoronto.ca/Zhang/ mkdir -p /cluster/store2/microarray ln -s /cluster/store2/microarray /cluster/data/microarray mkdir -p /cluster/data/microarray/zhangEtAl cd /cluster/data/microarray/zhangEtAl wget http://hugheslab.med.utoronto.ca/Zhang/expression_39309_normalized.txt sed 's/\(XM_[0-9]\+\)\.1/\1/' expression_39309_normalized.txt > arrays.txt hgGenericMicroarray hgFixed mouseLandscape arrays.txt wget http://hugheslab.med.utoronto.ca/Zhang/mouse_XM_mRNA_NCBI_2.fa sed 's/^>.*|\(XM.*\)\.1|.*$/>\1/' mouse_XM_mRNA_NCBI_2.fa > xm.fa ssh kk9 cd /santest/scratch mkdir andy cd andy/ cp /cluster/data/microarray/zhangEtAl/xm.fa . ls -1 /panasas/store/mm6/nib/* | grep -v random > chroms.lst cat << _EOF_ > gsub #LOOP blat -ooc=/scratch/hg/h/mouse11.ooc -fine -q=rna -noHead \$(path1) xm.fa xm.\$(root1).psl #ENDLOOP _EOF_ gensub2 chroms.lst single gsub spec para create spec para push para time #Completed: 22 of 22 jobs #CPU time in finished jobs: 36298s 604.96m 10.08h 0.42d 0.001 y #IO & Wait Time: 91s 1.52m 0.03h 0.00d 0.000 y #Average job time: 1654s 27.57m 0.46h 0.02d #Longest running job: 0s 0.00m 0.00h 0.00d #Longest finished job: 2955s 49.25m 0.82h 0.03d #Submission to last job: 2957s 49.28m 0.82h 0.03d cat *.psl > xm.psl ssh hgwdev cd /cluster/data/microarray/zhangEtAl cp /santest/scratch/andy/xm.psl . hgLoadPsl -table=xmMrna mm6 xm.psl hgMapToGene -type=psl -cds mm6 xmMrna knownGene knownToXM echo drop table xmMrna | hgsql mm6 # REBASE 505 (4-28-2005) (Done 5/18/2005 Andy) ssh hgwdev # download files curl -A "Mozilla/4.0" http://rebase.neb.com/rebase/link_gcgenz > rebase.gcg curl -A "Mozilla/4.0" http://rebase.neb.com/rebase/link_gcgref > rebaseRefs.txt # References file tail +15 rebaseRefs.txt | sed 's/ \+/ /g; s/^ //; /^$/d' | cut -f2- -d' ' > c2 tail +15 rebaseRefs.txt | sed 's/ \+/ /g; s/^ //; /^$/d' | cut -f1 -d' ' | sed 's/\.//' > c1 paste c1 c2 | sed '/^$/d' > rebaseRefs.txt rm c1 c2 # Load the cutters table. 
hgCutters hgFixed rebase.gcg # Load the other table. hgsql hgFixed -e "echo delete from rebaseRefs" hgsql hgFixed -e "load data local infile 'rebaseRefs.txt' into table rebaseRefs" # REBASE 603 (3-1-2006) (Done 3-2-2006 Andy) ssh hgwdev # download files curl -A "Mozilla/4.0" http://rebase.neb.com/rebase/link_gcgenz > rebase.gcg curl -A "Mozilla/4.0" http://rebase.neb.com/rebase/link_gcgref > rebaseRefs.txt # References file tail +15 rebaseRefs.txt | sed 's/ \+/ /g; s/^ //; /^$/d' | cut -f2- -d' ' > c2 tail +15 rebaseRefs.txt | sed 's/ \+/ /g; s/^ //; /^$/d' | cut -f1 -d' ' | sed 's/\.//' > c1 paste c1 c2 | sed '/^$/d' > rebaseRefs.txt rm c1 c2 # Load the cutters table. hgCutters hgFixed rebase.gcg # Load the other table. hgsql hgFixed -e "echo delete from rebaseRefs" hgsql hgFixed -e "load data local infile 'rebaseRefs.txt' into table rebaseRefs" # REBASE 902 (2009) (DONE 2009-02-09, Andy) ssh hgwdev mkdir /hive/data/outside/rebase cd /hive/data/outside/rebase tail -n+15 rebaseRefs.txt | sed '/^$/d; s/^\s\+\([[:digit:]]\+\)\.\s\+\ tmp mv tmp rebaseRefs.txt hgCutters hgFixed rebase.gcg hgsql hgFixed -e "delete from rebaseRefs" hgsql hgFixed -e "load data local infile 'rebaseRefs.txt' into table rebaseRefs" mkdir rebase902 mv rebase.gcg rebaseRefs.txt rebase902/ # REBASE 201 (Dec 31, 2011) (DONE 2012-01-24, Andy) ssh hgwdev cd /hive/data/outside/rebase mkdir rebase201 cd rebase201/ # go to http://rebase.neb.com/rebase/rebase.f3.html # save the GET FILE files, then scp them to this dir ln -s link_gcgenz.txt rebase.gcg hgCutters hgFixed rebase.gcg tail -n +15 link_gcgref.txt | sed '/^$/d; s/^\s*\([[:digit:]]\+\)\.\s\+\ rebaseRefs.txt hgsqldump -d --compact hgFixed rebaseRefs | grep -v "^SET" > rebaseRefs.sql hgLoadSqlTab hgFixed rebaseRefs{,.sql,.txt} # GLADSTONE hESC Novartis microarray data. # 1. Download http://www.genmapp.org/temp/humansimpleESC.zip # 2. Manually convert using MS access on Bob's laptop to a tab-delimited text file. # 3. 
Add column names to that file manually. ssh hgwdev mkdir /projects/compbio/data/microarray/gladstone cp hESC.txt /projects/compbio/data/microarray/gladstone cd /projects/compbio/data/microarray/gladstone cut -f1,4,6 hESC.txt | tail +2 | sort -k3,3 -k1,1 > data.1 for tiss in `cut -f3 data.1 | sort | uniq`; do grep $tiss data.1 | cut -f1,2 | sort -k1,1 | cut -f2 > data.${tiss}.1 echo $tiss | cat - data.${tiss}.1 > data.${tiss}.2 done paste data.*.2 > data.2 grep Lung data.1 | cut -f1 | sort > names echo Probe | cat - names | paste - data.2 > data.3 cat << _EOF_ > fixGladstone.sed s/_/ /; s/Embryonicstemcell/Embryonic Stem Cell/; s/Smoothmuscle/Smooth Muscle/; s/Salivarygland/Salivary Gland/; s/Lymphnode/Lymph Node/; s/Bonemarrow/Bone Marrow/; s/Spinalcord/Spinal Cord/; s/Wholebrain/Whole Brain/; s/blood/Blood/; _EOF_ head -n1 data.3 | sed -f fixGladstone.sed > header tail +2 data.3 | cat header - > data.4 mv data.4 generic.hESC.txt rm data.* names header hgGenericMicroarray hgFixed gladHumES generic.hESC.txt hgRatioMicroarray gladHumES gladHumESRatio # GLADSTONE ssh hgwdev cd /projects/compbio/data/microarray/gladstone awk '{if ($3 == $4) print}' hESC.txt > bestQ.hESC.txt cat << _EOF_ | hgsql hgFixed CREATE TABLE gladHumESOtherData ( name varchar(255) not null, # Name of item tissueQ varchar(255) not null, # Name of Q-associated tissue qVal float not null, # Q value hVal float not null, # H value #Indices INDEX(name(8)), INDEX(tissueQ(10)) ); _EOF_ cut -f1,2,5,6 hESC.txt | tail +2 | sort -k1,1 -k3,3n \ | awk '{printf("%s\t%s\t%s\t%s\n", $4, $3, $2, $1)}' \ | uniq -f3 \ | awk '{printf("%s\t%s\t%s\t%s\n", $4, $1, $2, $3)}' \ > gladOther.txt # Fix up the tissue column cut -f2 gladOther.txt > tmp.tiss.1 sed -f fixGladstone.sed tmp.tiss.1 > tmp.tiss.2 cut -f1 gladOther.txt > tmp.names cut -f3- gladOther.txt | paste tmp.names tmp.tiss.2 - \ > tmp.glad mv tmp.glad gladOther.txt rm tmp.* echo "load data local infile 'gladOther.txt' into table gladHumESOtherData" | hgsql 
hgFixed # PRINCETON STEM CELL ARRAYS ssh hgwdev mkdir /projects/compbio/data/microarray/princetonESC cd /projects/compbio/data/microarray/princetonESC for num in i ii iii iv v vi vii; do wget http://stemcell.princeton.edu/affy_cluster_${num}.html grep "td bgcolor=\"#FFFFAA\" align=center class=ssb" affy_cluster_${num}.html | sed 's/.*

\(.*\)<\/td>/\1/' > names grep "" affy_cluster_${num}.html | sed 's/.*right>\(.*\) <\/td>.*$/\1/' | colify 9 /dev/stdin > data paste names data >> tmp.txt rm names data affy_cluster_${num}.html done echo "~Bone Marrow RhoLo~Bone Marrow RhoHi~Bone Marrow Sca-~Bone Marrow Lin+~Fetal Liver Sca+~Fetal Liver Sca-~Fetal Liver Lin+~Neural Stem Cells~Embryonic Stem Cells" | tr '~' '\t' | cat - tmp.txt > princeton.txt rm tmp.txt # QA push cghNci60Exps on 2006-02-07 to rr. Table/data previously missing (Jen) # QA re-push rosChr22Dat on 2006-02-08 to fix table formatting/timestamps (Jen) # AFFY ALL EXON HUMAN ARRAYS (INCLUDES TABLES ON HG17 AND HG18) (Done 3/15/2006, Andy) # Chuck put them in tab-delimited file in ~sugnet ssh hgwdev cd /projects/compbio/data/microarray mkdir affyHumanExon cd affyHumanExon/ cp ~sugnet/plier-gcbg-sketch.summary.txt . sed -e "s/huex_wta_//g" -e "s/\.CEL//g" plier-gcbg-sketch.summary.txt > data.txt hgGenericMicroarray hgFixed affyHumanExon data.txt # Chuck put probe data into two tables in hg17. # Grab the bed first. Change the original name because a lot got started # without keeping Chuck's naming convention in mind. oh well. hgsql hg17 -e "rename table affyHuEx1 to affyHumanExonProbes" hgsql hg17 -e "rename table affyHuEx1Annot to affyHumanExonProbeAnnot" hgsql hg17 -e "select * from affyHuEx1" | tail +2 | cut -f2-7 | > hg17.probes.bed # Lift to hg18 liftOver hg17.probes.bed /gbdb/hg17/liftOver/hg17ToHg18.over.chain hg18.probes.bed hg18.unMapped # How many didn't get lifted (out of 1.4 million)? wc -l hg18.unMapped # 276 hg18.unMapped # That's not bad at all. 99.99% of them lifted fine. # Load the hg18 probe bed. Change the name of the hg17 one. hgLoadBed hg18 affyHumanExonProbes hg18.probes.bed # Deal with that extra annotation table of Chuck's. I made a new autosql # which almost matches it except for the name/probeSet fields. # First copy it out of hg17 and into a file with the new column order. 
hgsql hg17 -e "select probesetId,numIndependentProbes,exonClustId,numNonOverlapProbes,probeCount,transcriptClustId,probesetType,numXHybeProbe,psrId,level,evidence,bounded,cds from affyHumanExonProbeAnnot" \ | tail +2 > annot.tab # Load that into hgFixed and change the name. hgLoadSqlTab hgFixed affyAllExonProbe ~/kent/src/hg/lib/affyAllExonProbe.sql annot.tab hgsql hgFixed -e "rename table affyAllExonProbe to affyHumanExonProbeAnnot" # Make ratio table for the microarray hgRatioMicroarray affyHumanExon affyHumanExonRatio # Merge probe beds with array data and load those beds. bedMergeExpData hgFixed.affyHumanExonRatio hg17.affyHumanExonProbes hg17.bed bedMergeExpData hgFixed.affyHumanExonRatio hg18.affyHumanExonProbes hg18.bed hgLoadBed hg17 affyHumanExon hg17.bed hgLoadBed hg18 affyHumanExon hg18.bed # Create human-level trackDb entry and affyHumanExon.html # and check into cvs. ###### AFFY HUMAN EXONS (COMPLETE DATA) (DONE 7-21-2006, Andy) ssh hgwdev cd /projects/compbio/data/microarray/affyHumanExon/ mkdir moreData cd moreData/ ssh bark cd /scratch cp forAndy/* /projects/compbio/data/microarray/affyHumanExon/moreData exit sed -e "s/huex_wta_//g" -e "s/\.CEL//g" exonData.vs.tab > data.txt hgGenericMicroarray hgFixed affyHumanExon data.txt hgsql hgFixed -e "select * from affyHumanExonExps" | sed "/^\+/d" | tail +2 | sed "s/_.,/,/" > newExps.tab hgsql hgFixed -e "delete from affyHumanExonExps" hgsql hgFixed -e "load data local infile 'newExps.tab' into table affyHumanExonExps" cd ~/kent/src/hg/makeDb/hgRatioMicroarray/ # Make file affyHumanExon.ra in the medSpec style. 
hgRatioMicroarray -minAbsVal=0 -clump=affyHumanExon.ra affyHumanExon affyHumanExonRatio bedMergeExpData hgFixed.affyHumanExonRatio hg17.affyHumanExonProbes hg17.bed bedMergeExpData hgFixed.affyHumanExonRatio hg18.affyHumanExonProbes hg18.bed hgLoadBed hg17 affyHumanExon hg17.bed hgLoadBed hg18 affyHumanExon hg18.bed # Copied affyHumanExon to hg16 (DONE 10-12-2006, Andy) cd /cluster/data/hg16/bed/ mkdir affyHumanExon cd affyHumanExon/ echo "select name,expCount,expScores from affyHumanExon" | hgsql hg17 | tail +2 > expdata.tab cp ~/kent/src/hg/lib/expData.sql . hgLoadSqlTab hgFixed expData expData.sql expdata.tab bedMergeExpData hgFixed.expData hg16.affyHuEx1 hg16.bed hgLoadBed hg16 affyHumanExon hg16.bed hgsql -e 'drop table expData' hgFixed # QA push new cutters and rebaseRefs tables (04-06-2006: ASZ). ### load ncbi taxonomy tables (04-11-2006: Robert). mkdir /cluster/store5/taxonomy cd /cluster/store5/taxonomy ln /cluster/store5/taxonomy /cluster/data/taxonomy -s wget ftp://ftp.taxon.nih.gov/pub/taxonomy/taxdump.tar.gz tar xvfz taxdump.tar.gz sed -e 's/\t|\t/~/g' names.dmp |sed -e 's/\t|//g' |awk -F~ 'length($3)<2{OFS="\t";print $2,$1,$4}length($3)>=2{OFS="\t";print $3,$1,$4}' > taxonName.txt sed -e 's/\t|\t/~/g' division.dmp |sed -e 's/\t|//g' |awk -F~ '{OFS="\t";$1=$1;print $0}' > taxonDivision.txt sed -e 's/\t|\t/~/g' gencode.dmp |sed -e 's/\t|//g' |awk -F~ '{OFS="\t";$1=$1;print $0}' > taxonGencode.txt sed -e 's/\t|\t/~/g' nodes.dmp |sed -e 's/\t|//g' |awk -F~ '{OFS="\t";$1=$1;print $0}' > ncbiNode.txt pushd ~/kent/src/hg/lib autoSql taxonNode.as taxonNode -dbLink autoSql taxonXref.as taxonXref -dbLink autoSql taxonName.as taxonName -dbLink autoSql taxonGeneticCode.as taxonGeneticCode -dbLink autoSql taxonDivision.as taxonDivision -dbLink mv taxon*.h ../inc make #edit .sql files to add indexes hgsql hgFixed < taxonName.sql hgsql hgFixed < taxonNode.sql hgsql hgFixed < taxonDivision.sql hgsql hgFixed < taxonGeneticCode.sql popd hgsql hgFixed -e "load data 
local infile 'taxonName.txt' into table taxonName;" hgsql hgFixed -e "load data local infile 'taxonNode.txt' into table taxonNode" hgsql hgFixed -e "load data local infile 'taxonDivision.txt' into table taxonDivision;" hgsql hgFixed -e "load data local infile 'taxonGencode.txt' into table taxonGeneticCode;" echo "select o.name, n.taxon as ncbi_taxon, n.name , toGenus from sp060115.taxon t, hgFixed.taxonName n, organism o where o.name = n.name and n.taxon = t.id order by toGenus;" | hgsql hg17 -N -B > taxonXref.txt hgsql hgFixed -e "load data local infile 'taxonXref.txt' into table taxonXref;" #--************************************************************************** #-- This is the NCBI genetic code table #-- Initial base data set from Andrzej Elzanowski while at PIR International #-- Addition of Eubacterial and Alternative Yeast by J.Ostell at NCBI #-- Base 1-3 of each codon have been added as comments to facilitate #-- readability at the suggestion of Peter Rice, EMBL #-- Later additions by Taxonomy Group staff at NCBI #-- #-- Version 3.9 #-- Code 14 differs from code 9 only by translating UAA to Tyr rather than #-- STOP. A recent study (Telford et al, 2000) has found no evidence that #-- the codon UAA codes for Tyr in the flatworms, but other opinions exist. #-- There are very few GenBank records that are translated with code 14, #-- but a test translation shows that retranslating these records with code #-- 9 can cause premature terminations. Therefore, GenBank will maintain #-- code 14 until further information becomes available. #-- #-- Version 3.8 #-- Added GTG start to Echinoderm mitochondrial code, code 9 #-- #-- Version 3.7 #-- Added code 23 Thraustochytrium mitochondrial code #-- formerly OGMP code 93 #-- submitted by Gertraude Berger, Ph.D. #-- #-- Version 3.6 #-- Added code 22 TAG-Leu, TCA-stop #-- found in mitochondrial DNA of Scenedesmus obliquus #-- submitted by Gertraude Berger, Ph.D. 
#-- Organelle Genome Megasequencing Program, Univ Montreal #-- #-- Version 3.5 #-- Added code 21, Trematode Mitochondrial #-- (as deduced from: Garey & Wolstenholme,1989; Ohama et al, 1990) #-- Added code 16, Chlorophycean Mitochondrial #-- (TAG can be translated to Leucine instead of STOP in chlorophyceans #-- and fungi) #-- #-- Version 3.4 #-- Added CTG,TTG as allowed alternate start codons in Standard code. #-- Prats et al. 1989, Hann et al. 1992 #-- #-- Version 3.3 - 10/13/95 #-- Added alternate initiation codon ATC to code 5 #-- based on complete mitochondrial genome of honeybee #-- Crozier and Crozier (1993) #-- #-- Version 3.2 - 6/24/95 #-- Code Comments #-- 10 Alternative Ciliate Macronuclear renamed to Euplotid Macro... #-- 15 Blepharisma Macro.. code added #-- 5 Invertebrate Mito.. GTG allowed as alternate initiator #-- 11 Eubacterial renamed to Bacterial as most alternate starts #-- have been found in Archaea #-- #-- #-- Version 3.1 - 1995 #-- Updated as per Andrzej Elzanowski at NCBI #-- Complete documentation in NCBI toolkit documentation #-- Note: 2 genetic codes have been deleted #-- #-- Old id Use id - Notes #-- #-- id 7 id 4 - Kinetoplast code now merged in code id 4 #-- id 8 id 1 - all plant chloroplast differences due to RNA edit #-- #--************************************************************************* # #Genetic-code-table ::= { # { # name "Standard" , # name "SGC0" , # id 1 , # ncbieaa "FFLLSSSSYY**CC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG", # sncbieaa "---M---------------M---------------M----------------------------" # -- Base1 TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG # -- Base2 TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG # -- Base3 TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG # }, # { # name "Vertebrate Mitochondrial" , # name "SGC1" , # id 2 , # ncbieaa "FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIMMTTTTNNKKSS**VVVVAAAADDEEGGGG", # sncbieaa 
"--------------------------------MMMM---------------M------------" # -- Base1 TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG # -- Base2 TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG # -- Base3 TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG # }, # { # name "Yeast Mitochondrial" , # name "SGC2" , # id 3 , # ncbieaa "FFLLSSSSYY**CCWWTTTTPPPPHHQQRRRRIIMMTTTTNNKKSSRRVVVVAAAADDEEGGGG", # sncbieaa "----------------------------------MM----------------------------" # -- Base1 TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG # -- Base2 TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG # -- Base3 TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG # }, # { # name "Mold Mitochondrial; Protozoan Mitochondrial; Coelenterate # Mitochondrial; Mycoplasma; Spiroplasma" , # name "SGC3" , # id 4 , # ncbieaa "FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG", # sncbieaa "--MM---------------M------------MMMM---------------M------------" # -- Base1 TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG # -- Base2 TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG # -- Base3 TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG # }, # { # name "Invertebrate Mitochondrial" , # name "SGC4" , ## id 5 , # ncbieaa "FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIMMTTTTNNKKSSSSVVVVAAAADDEEGGGG", # sncbieaa "---M----------------------------MMMM---------------M------------" # -- Base1 TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG # -- Base2 TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG # -- Base3 TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG # }, # { # name "Ciliate Nuclear; Dasycladacean Nuclear; Hexamita Nuclear" , # name "SGC5" , # id 6 , # ncbieaa "FFLLSSSSYYQQCC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG", # sncbieaa "-----------------------------------M----------------------------" # -- Base1 
TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG # -- Base2 TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG # -- Base3 TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG # }, # { # name "Echinoderm Mitochondrial; Flatworm Mitochondrial" , # name "SGC8" , # id 9 , # ncbieaa "FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIIMTTTTNNNKSSSSVVVVAAAADDEEGGGG", # sncbieaa "-----------------------------------M---------------M------------" # -- Base1 TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG # -- Base2 TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG # -- Base3 TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG # }, # { # name "Euplotid Nuclear" , # name "SGC9" , # id 10 , # ncbieaa "FFLLSSSSYY**CCCWLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG", # sncbieaa "-----------------------------------M----------------------------" # -- Base1 TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG # -- Base2 TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG # -- Base3 TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG # }, # { # name "Bacterial and Plant Plastid" , # id 11 , # ncbieaa "FFLLSSSSYY**CC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG", # sncbieaa "---M---------------M------------MMMM---------------M------------" # -- Base1 TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG # -- Base2 TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG # -- Base3 TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG # }, # { # name "Alternative Yeast Nuclear" , # id 12 , # ncbieaa "FFLLSSSSYY**CC*WLLLSPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG", # sncbieaa "-------------------M---------------M----------------------------" # -- Base1 TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG # -- Base2 TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG # -- Base3 
TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG # }, # { # name "Ascidian Mitochondrial" , # id 13 , # ncbieaa "FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIMMTTTTNNKKSSGGVVVVAAAADDEEGGGG", # sncbieaa "---M------------------------------MM---------------M------------" # -- Base1 TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG # -- Base2 TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG # -- Base3 TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG # }, # { # name "Alternative Flatworm Mitochondrial" , # id 14 , # ncbieaa "FFLLSSSSYYY*CCWWLLLLPPPPHHQQRRRRIIIMTTTTNNNKSSSSVVVVAAAADDEEGGGG", # sncbieaa "-----------------------------------M----------------------------" # -- Base1 TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG # -- Base2 TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG # -- Base3 TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG # } , # { # name "Blepharisma Macronuclear" , # id 15 , # ncbieaa "FFLLSSSSYY*QCC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG", # sncbieaa "-----------------------------------M----------------------------" # -- Base1 TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG # -- Base2 TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG # -- Base3 TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG # } , # { # name "Chlorophycean Mitochondrial" , # id 16 , # ncbieaa "FFLLSSSSYY*LCC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG", # sncbieaa "-----------------------------------M----------------------------" # -- Base1 TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG # -- Base2 TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG # -- Base3 TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG # } , # { # name "Trematode Mitochondrial" , # id 21 , # ncbieaa "FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIMMTTTTNNNKSSSSVVVVAAAADDEEGGGG", # sncbieaa 
"-----------------------------------M---------------M------------" # -- Base1 TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG # -- Base2 TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG # -- Base3 TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG # } , # { # name "Scenedesmus obliquus Mitochondrial" , # id 22 , # ncbieaa "FFLLSS*SYY*LCC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG", # sncbieaa "-----------------------------------M----------------------------" # -- Base1 TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG # -- Base2 TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG # -- Base3 TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG # } , # { # name "Thraustochytrium Mitochondrial" , # id 23 , # ncbieaa "FF*LSSSSYY**CC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG", # sncbieaa "--------------------------------M--M---------------M------------" # -- Base1 TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG # -- Base2 TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG # -- Base3 TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG # } #} ########################################################################## # Added Zebrafish microarray data (DONE, 2006-06-10, hartera) # From Leonard Zon's group at the Children's Hospital, Boston # Contact: Tony DiBiase, adibiase@enders.tch.harvard.edu # Data is normalized and log2 transformed, then centered on mean of 0. # Changed table names and reloaded MedianExps table so that the extras has # the strain plus time point for the name otherwise the average is taken # over all time points for a strain for the track display # when Tissue Averages is selected. (DONE, 2006-07-30, hartera) # Changed so that the extras column for the MedianExps table has the # developmental stage so that an average is taken across all strains for # each stage when Tissue Averages is selected. 
# (Jim recommended displaying it this way and then it also fits in with the
# current framework for this type of track).
# Also added the strain name and stage to the extra column for the
# Experiments tables (AllExps and MedianExps) so that when Chip ID is
# selected then all of these are shown. (DONE, 2006-08-11, hartera)
# Added absolute data (before logs were taken). (DONE, 2006-09-19, hartera)
# The absolute value data was centered on a mean of 0. The log data was
# the log2 transformed normalized data, centered on a mean of 0.
# This section now OBSOLETE so removed. See section below on UPDATE of
# zebrafish microarray data.

##########################################################################
# UPDATE the Zebrafish microarray data (DONE, 2006-06-16 - 2006-10-18, hartera)
# From Leonard Zon's group at the Children's Hospital, Boston
# Contact: Tony DiBiase, adibiase@enders.tch.harvard.edu
# Data is Loess normalized absolute values. Then use microarray processing
# programs to create ratio tables.
# The new data set was obtained so that the ratios could be calculated
# directly from the normalized absolute data. The ratios are calculated as
# the value for a probeset in one array to the median value across all arrays
# for that probeset and then a log2 is taken. This allows comparison
# between arrays that may differ due to technical or biological differences.
# RE-CREATE tables. Data was log2 already so antilog the values to get
# absolute values and then pass through the microarray processing programs.
# (DONE, 2007-01-05 - 2007-01-08, hartera)
ssh hgwdev
mkdir /projects/compbio/data/microarray/zebrafishWT
cd /projects/compbio/data/microarray/zebrafishWT
# copy the data here received by e-mail and unzip
unzip wt34.loessNorm.absval.2006-10-12.zip
mv wt34.loessNorm.absval.2006-10-12.txt wtAffyNormLog2.txt
dos2unix wtAffyNormLog2.txt
# format.pl reads the data file on STDIN and writes the reformatted file
# to STDOUT.
cat << 'EOF' > format.pl
#!/usr/bin/perl -w
use strict;

while (<STDIN>)
{
    # reformat file. change Tu to TU and remove experiment name from the
    # column headings and translate the name to something human readable.
    # Lines matching /at/ are printed unchanged; any other line is treated
    # as the column-heading row.
    my ($f, @a, $n, $strain, $somites, $hpf, $fullName);
    $f = $_;
    if ($f !~ /at/)
    {
        @a = split(/\t/, $f);
        foreach $n (@a)
        {
            $fullName = "";
            $somites = 0;
            $hpf = 0;
            # heading fields look like strain.somites.hpf.number.number
            if ($n =~ /^([A-Za-z]+)\.([0-9]+)\.([0-9]+)\.[0-9]+\.[0-9]+/)
            {
                $strain = $1;
                $somites = $2;
                $hpf = $3;
                $strain =~ s/Tu/TU/;
                if ($somites > 0)
                {
                    $fullName = $strain . "-" . $somites . "-somites";
                }
                elsif ($hpf > 0)
                {
                    $fullName = $strain . "-" . $hpf . "-hpf";
                }
                print "\t$fullName";
            }
        }
        print "\n";
    }
    else
    {
        print $f;
    }
}
'EOF'
# << emacs
chmod +x format.pl
perl format.pl < wtAffyNormLog2.txt > zebrafishWTNormLog2.txt
# antilog the values, log is base 2
# cnvToAntilog copies column 1 as-is and replaces every other
# tab-separated column value v with 2^v.
cat << 'EOF' > cnvToAntilog
#!/usr/bin/awk -f
BEGIN {
    FS = "\t"
    RS = "\n"
    ORS = ""
}
{
    print $1 "\t"
    x = 2
    while (x < NF) {
        print 2^$x "\t"
        x++
    }
    print 2^$NF "\n"
}
'EOF'
chmod +x cnvToAntilog
# run script and skip header line in file
tail -n +2 zebrafishWTNormLog2.txt | cnvToAntilog > tmp.txt
# add back header line:
head -n 1 zebrafishWTNormLog2.txt > header
cat header tmp.txt > zebrafishWTNormAbs.txt
# Then load the data into hgFixed using hgGnfMicroarray and use options
# to set the url, ref, and credit to "n/a" and chip to Zebrafish.
# Need to use this program to get 3 extras needed for hgMedianMicroarray
# No need to round the values this time as they are larger and have
# a larger range.
# Create the main expRecord table and the expData table for the
# absolute measurements
hgGnfMicroarray zebrafishZonWTAllExps zebrafishZonWTAll \
    zebrafishWTNormAbs.txt -chip=Zebrafish -url=n/a -ref=n/a -credit=n/a
# Changed the Exps table so that the extras column for the MedianExps table
# has the strain and developmental stage in the second field so that an
# average is taken across all strains for each stage when Tissue Averages
# is selected.
# (Jim recommended displaying it this way and then it also fits in with the
# current framework for this type of track).
hgsql -N -e 'select name, extras from zebrafishZonWTAllExps;' hgFixed \
    > zfishWTExps.extras
# cnvExtras.pl reads lines on STDIN and replaces the first "n/a" on each
# line with the third comma-separated field of that same line.
cat << 'EOF' > cnvExtras.pl
#!/usr/bin/perl -w
use strict;

while (<STDIN>)
{
    my ($line, @extras);
    $line = $_;
    @extras = split(/,/, $line);
    $line =~ s/n\/a/$extras[2]/;
    print $line;
}
'EOF'
chmod +x cnvExtras.pl
cnvExtras.pl < zfishWTExps.extras > zfishWTExps.extras.new
# create set of mySQL statements from this to update the AllExps table
# to include the name in the second field of extras - same as in the third
# field. This is used for display when the "Arrays Grouped By Replicate
# Medians" (or Means) is selected from the track controls on the
# description page.
awk 'BEGIN {FS = "\t"} {print "update zebrafishZonWTAllExps set extras = \" "$2 "\"" " where name = \""$1"\";";}' zfishWTExps.extras.new \
    > zfishWTExpsNewExtras.sql
hgsql hgFixed < zfishWTExpsNewExtras.sql
# Convert these to ratios using the median of the absolute values
# across all experiments to be the denominator for each probeset.
# minAbsVal is 0 here as no value in this dataset is less than 1 and the
# default for this parameter is 20.
hgRatioMicroarray -minAbsVal=0 zebrafishZonWTAll zebrafishZonWTAllRatio
# Create the .ra file for the Median tables
hgsql -N -e 'select extras, id from zebrafishZonWTAllExps;' hgFixed \
    > zfishWTExps
# remove extra information and leave experiment name
perl -pi.bak -e 's/Zebrafish,[A-Za-z]+\-[0-9]+\-[a-z]+,//' zfishWTExps
perl -pi.bak -e 's/,//' zfishWTExps
# alter script so that name for each experiment in column 2 is not just the
# strain but the strain plus time point (same as first column). This goes into
# the extras column for zebrafishWTMedianExps and is used for Tissue Averages
# display for the array data track. Otherwise an average is taken for the
# strain (hartera, 2006-07-30).
# change so that column 2 is the time point so that an average of time points
# is taken for the "Tissue Averages" Display (hartera, 2006-08-11)
# cnvToMedian groups the ids (input column 2) by experiment name (input
# column 1) and prints: name, fields 2 and 3 of the "-"-split name joined
# together, and the space-separated list of ids.
cat << 'EOF' > cnvToMedian
#!/usr/bin/awk -f
BEGIN {
    FS = "\t"
    OFS = "\t"
}
{
    idsByName[$1] = idsByName[$1] " " $2
}
END {
    for (name in idsByName) {
        split(name, part, "\\-")
        print name, part[2] part[3], substr(idsByName[name], 2)
    }
}
'EOF'
# << emacs
chmod +x cnvToMedian
cnvToMedian zfishWTExps > zfishZonWTMedian.ra
# the order of the records in the *.ra file determines the order of display,
# so re-order it: 14 somites first, then 15 somites, then the hpf stages
grep "14somites" zfishZonWTMedian.ra | sort > tmp.ra
grep "15somites" zfishZonWTMedian.ra | sort >> tmp.ra
grep "hpf" zfishZonWTMedian.ra | sort >> tmp.ra
mv tmp.ra zfishZonWTMedian.ra
cp zfishZonWTMedian.ra ~/kent/src/hg/makeDb/hgMedianMicroarray
cd ~/kent/src/hg/makeDb/hgMedianMicroarray
# Median over multiple replicates of the ratio values goes in this table:
hgMedianMicroarray hgFixed zebrafishZonWTAllRatio zebrafishZonWTAllExps \
    zfishZonWTMedian.ra zebrafishZonWTMedianRatio \
    zebrafishZonWTMedianExps -minExps=1
# Median version of the absolute measurements:
hgMedianMicroarray hgFixed zebrafishZonWTAll zebrafishZonWTAllExps \
    zfishZonWTMedian.ra zebrafishZonWTMedian zebrafishZonWTMedianExps -minExps=1
# look at the distribution of the MedianRatio scores:
hgsql -N -e 'select * from zebrafishZonWTMedianRatio;' hgFixed > medRatioData
awk '{print $3}' medRatioData > medRatioData2
perl -pi.bak -e 's/,/\n/g' medRatioData2
textHistogram -real -binSize=0.2 -maxBinCount=1100 -minVal=-200 \
    medRatioData2 > histMedRatio.out
# The histogram shows that most values fall between -2 and +2, so the
# trackDb entry for the Affy Zon Wild Type Array track is set to an expScale
# of 2.0 and an expStep of 0.2 for the log scale used to display the ratios.
########################################################################## #The Mouse GNF Expression Atlas 2 (2004) ########################################################################## # Updated gv* tables for the Locus Variants tracks # (Belinda Giardine Sept 2006) # This track is now available for hg17 and hg18, only the gvPos table needs to # be redone for each build unless new mutations are added. This load changes # the schema (strand, label for gvPos) and adds a new LSDB (BTKbase) and more # sanity checks on all the data causing some mismapped variants to be # discarded. ########################################################################## # mgcMBLabValid - Load of GenBank accessions that are in the Brent lab clone # validation database. This contains both human and mouse clones. Since # the Brent lab is no longer doing MGC validations, this set is fixed # and shared by all mouse and human assemblies. (2006-10-26 markd) mkdir -p /cluster/data/genbank/data/download/mgcMBLab cd /cluster/data/genbank/data/download/mgcMBLab # save list of 41805 accessions received from Brent lab as # mgcMBLabValid.2006-10-25.acc hgLoadSqlTab hgFixed mgcMBLabValid ~/compbio/genbank/kent/src/hg/lib/mgcMBLabValid.sql mgcMBLabValid.2006-10-25.acc gzip mgcMBLabValid.2006-10-25.acc ########################################################################## # ZEBRAFISH DEVELOPMENTAL ARRAYS FROM GENOME INSTITUTE OF SINGAPORE (GIS) # Data from Article: # Transcriptome Analysis of Zebrafish Embryogenesis Using Microarrays Mathavan # S, Lee SGP, Mak A, Miller LD, Murthy KRK, et al. PLoS Genetics Vol. 1, No. 
2, # e29, pages 260-276 doi:10.1371/journal.pgen.0010029 # Contact: Sinnakaruppan Mathavan # Downloaded expression data from # http://giscompute.gis.a-star.edu.sg/~govind/zebrafish/data_download.html # after clicking on link to download largest dataset (12.9 MB): # Gene expression data showing the expression profile during different stages # of zebrafish embryonic development for the genes selected from the array are # presented (Compugen array). Each value represents an average performance of # 2-4 replicates. GenBank id of the selected gene is given as the identifier. # Total RNA from different stages of embryonic development, adult male and # female were pooled in equal concentrations and used as reference RNA. The # genes were annotated using Zebrafish Chip Annotation Database. ssh hgwdev mkdir -p /projects/compbio/data/microarray/zebrafishGISDev # Downloaded data and saved in Excel as a tab-separated text file: # PLOSGISData.txt # This file contains Genbank accessions and the expression values # which are log2 based. ########################################################################## # Belinda Giardine April 2007 # gv* tables: # reload tables, additions and corrections, details in hg18 doc