# for emacs: -*- mode: sh; -*- ######################################################################### # hivgne8 DATABASE BUILD (STARTED, 07/26/07, Fan) ssh hgwhiv1 mkdir -p /cluster/store12/medical/hiv/hivgne8 cd /cluster/store12/medical/hiv/hivgne8 # Get HIV GNE8 base sequence (1,700 bases) from GSID # This sequence is re-created using the data sheets (page 281-284, Vol 1) # from VaxGene document, provided by Phil Berman. # The data sheets are scanned and OCRed into raw txt file first, then # manually edited and checked. The final result is store in file, gne8_gsid.txt. echo '>GNE8' >gne8.fa cat gne8_gsid.txt|sed -e 's/ / /g'|sed -e 's/ / /g'|sed -e 's/ / /g'|\ sed -e 's/ /\t/'|\ sed -e 's/ /\t/'|\ cut -f 3 |\ sed -e 's/ //g' >>gne8.fa faToTab gne8.fa stdout|toUpper stdin gne8.tab # translate to nib ln -s /cluster/store12/medical/hiv/hivgne8 ~/hivgne8 cd ~/hivgne8 mkdir nib ln -s gne8.fa chr1.fa faToNib chr1.fa nib/chr1.nib # CREATING DATABASE (DONE 07/27/07) # Create the hivgne8 database. echo 'create database hivgne8' | hgsql hiv1 # CREATING GRP TABLE FOR TRACK GROUPING (DONE 07/27/07) echo "create table grp (PRIMARY KEY(NAME)) select * from hiv1.grp" \ | hgsql hivgne8 # STORING O+O SEQUENCE AND ASSEMBLY INFORMATION (DONE 09/05/06) # Make symbolic links from /gbdb/hivgne8/nib to the real nibs. mkdir -p /gbdb/hivgne8/nib ln -s /cluster/store12/medical/hiv/hivgne8/nib/chr1.nib /gbdb/hivgne8/nib # Load /gbdb/hivgne8/nib paths into database and save size info. hgsql hivgne8 < ~/src/hg/lib/chromInfo.sql cd ~/hivgne8 hgNibSeq -preMadeNib hivgne8 /gbdb/hivgne8/nib chr1.fa echo "select chrom,size from chromInfo" | hgsql -N hivgne8 > chrom.sizes # MAKE HGCENTRALHIV1 ENTRY AND TRACKDB TABLE FOR HIVGNE8 (DONE 08/02/06) echo 'insert into defaultDb values("HIV GNE8 (GP120)", "hivgne8");' \ | hgsql -h localhost hgcentralhiv1 echo 'insert into dbDb values("hivgne8", "Sep. 1998", \ "/gbdb/hivgne8/nib", "Human immunodeficiency virus 1", "chr1", 1, 99.5, "HIV GNE8 (GP120)","Human immunodeficiency virus 1", "/gbdb/hivgne8/html/description.html", 0, 0, " sequence as of Feb., 2005");' \ | hgsql hgcentralhiv1 -h localhost echo 'insert into genomeClade values("HIV GNE8 (GP120)", "other", 100);'\ | hgsql hgcentralhiv1 -h localhost # Edit that makefile to add hivgne8 in all the right places vi makefile make update make alpha cvs commit makefile # MAKE HGCENTRALHIV1 BLATSERVERS ENTRY FOR HIVGNE8 ssh hiv1 echo 'insert into blatServers values("hivgne8", "hiv1", "17784", "1", "0"); \ insert into blatServers values("hivgne8", "hiv1", "17785", "0", "0");' \ | hgsql hgcentralhiv1 -h localhost # CREATE TRACKDB TABLE hgsql hivgne8 < ~/src/hg/lib/trackDb.sql hgsql hivgne8 -e "insert into trackDb select * from hiv1.trackDb" # copy over tables from hiv1 mysqldump -d hiv1 dnaSeq -u medcat -p$HGPSWD|hgsql hivgne8 hgsql hivgne8 -e "insert into dnaSeq select * from hiv1.dnaSeq" mysqldump -d hiv1 aaSeq -u medcat -p$HGPSWD|hgsql hivgne8 hgsql hivgne8 -e "insert into aaSeq select * from hiv1.aaSeq" mysqldump -d hiv1 gsidSubjInfo -u medcat -p$HGPSWD|hgsql hivgne8 hgsql hivgne8 -e "insert into gsidSubjInfo select * from hiv1.gsidSubjInfo" mysqldump -d hiv1 gsidClinicRec -u medcat -p$HGPSWD|hgsql hivgne8 hgsql hivgne8 -e "insert into gsidClinicRec select * from hiv1.gsidClinicRec" mysqldump -d hiv1 gsIdXref -u medcat -p$HGPSWD|hgsql hivgne8 hgsql hivgne8 -e "insert into gsIdXref select * from hiv1.gsIdXref" mysqldump -d hiv1 vax004Msa -u medcat -p$HGPSWD|hgsql hivgne8 hgsql hivgne8 -e "insert into vax004Msa select * from hiv1.vax004Msa" # CREATE VAX004 TRACK (DONE 8/9/07) # get vax004 sequences hgsql hivgne8 -N -e 'select * from dnaSeq where id like "%U%"' >vax004.tab # create .fa file tabToFa vax004 mkdir -p /gbdb/hivgne8/vax004 cp -p vax004.fa /gbdb/hivgne8/vax004/vax004.fa hgLoadSeq -replace hivgne8 /gbdb/hivgne8/vax004/vax004.fa # BLAT gfClient -minScore=200 -minIdentity=80 -nohead hiv1.cse.ucsc.edu 17785 /gbdb/hivgne8/nib \ -out=psl -t=dna -q=dna vax004.fa vax004.psl # count the result wc *.psl cut -f 10 vax004.psl |wc cut -f 10 vax004.psl |sort -u |wc # load the psl result into vax004 table hgLoadPsl hivgne8 vax004.psl # CREATE HIVGENE TRACK mkdir -p /cluster/store12/medical/hiv/hivgne8/hivGene cd /cluster/store12/medical/hiv/hivgne8/hivGene # clean up files from previous build rm *.psl *.bed *.tab *.fa *.tmp # Get corresponding DNA gemomic sequences from hiv1 base genome. hgsql hiv1 -N -e \ 'select h.name, substring(b.seq, h.chromStart, h.chromEnd-h.chromStart) from baseSeq b, hivGene h where b.id="hiv1"'\ >hSeq.tmp # keep only the genes/regions within the gp120 region fgrep V hSeq.tmp >hSeq.tab fgrep vpu hSeq.tmp >>hSeq.tab fgrep gp120 hSeq.tmp >>hSeq.tab fgrep RRE hSeq.tmp >>hSeq.tab # create .fa file from .tab file tabToFa hSeq # BLAT the sequences agains hivgne8 base genome gfClient -minScore=10 -minIdentity=35 -nohead hiv1.cse.ucsc.edu 17785 \ /gbdb/hivgne8/nib -out=psl -t=dna -q=dna hSeq.fa hSeq.psl # load the BLAT result into a temp psl track hgLoadPsl hivgne8 hSeq.psl # NEED TO MANUALLY LOAD THE psl.tab, BECAUSE hgLoadPsl HAS PERMISSION PROBLEM ON hiv1. hgsql hivgne8 -e 'delete from hSeq' hgsql hivgne8 -e 'load data local infile "psl.tab" into table hSeq' # create .bed file for hivGene hgsql hivgne8 -N -e 'select "chr1", tStart, tEnd, qName from hSeq ' >hivGene.bed # load the bed file into hivGene table, needs to load manually bed.tab manually due # to permission problem. hgLoadBed hivgne8 hivGene hivGene.bed # CREATE INTERPRO TRACK # get all HIV-1 domain sequences getInterProFa interProXrefHiv1 interProHiv1.fa # BLAT against base genome gfClient localhost 17784 /gbdb/hivgne8/nib -out=psl -t=dnax -q=prot interProHiv1.fa interProHiv1.psl # load it into a temp table hgLoadPsl hivgne8 table=testPsl interProHiv1.psl # create bed file from this temp table hgsql hivgne8 -N -e \ 'select "chr1", tStart, tEnd, qName from testPsl where (tEnd-tStart)/3/qSize>0.42' \ > interProHiv1.bed # load the bed file into the track table hgLoadBed hivgne8 interPro interProHiv1.bed # drop the temp table. hgsql hivgne8 -e 'drop table testPsl' # CREATE CONSERVATION TRACKS mkdir -p /cluster/store12/medical/hiv/hivgne8/conservation cd /cluster/store12/medical/hiv/hivgne8/conservation # create the .wig file and .fa file of the consensus sequence. gsidMsa hivgne8 vax004Msa GNE8 231 vax004Cons.wig vax004Consensus.fa # encode and load the wig file wigEncode vax004Cons.wig stdout vax004Cons.wib \ | hgLoadWiggle hivgne8 vax004Cons stdin # copy .wib file to /gbdb mkdir -p /gbdb/hivgne8/wib cp vax004Cons.wib /gbdb/hivgne8/wib # do the same for protein conservation track mkdir aa cd aa # create .wig file gsidAaMsa2 hivgne8 vax004Msa GNE8 231 vax004AaCons.wig vax004AaConsensus.fa # encode and load the .wib file wigEncode vax004AaCons.wig stdout vax004AaCons.wib \ | hgLoadWiggle hivgne8 vax004AaCons stdin cp vax004AaCons.wib /gbdb/hivgne8/wib # CREATE MAF TRACKS mkdir -p /cluster/store12/medical/hiv/hivgne8/msa cd /cluster/store12/medical/hiv/hivgne8/msa # create a script file, doall hgsql hivgne8 -N -e \ 'select id from dnaSeq where id like "%U%"'\ |sed -e 's/ss/do1 ss/g' >doall # create one line script file, do1, with the following line in it: hgsql hivgne8 -N -e "select id, seq from vax004Msa where id='${1}'" chmod +x do* # run the script to get the .tab file with all MSA sequences of VAX004 doall >gne8.tab # convert .tab into .fa file tabToFa gne8 # grab the base alignment sequence echo ">hivgne8" >gne8.aln hgsql hivgne8 -N -e 'select seq from vax004Msa where id="GNE8"' >> gne8.aln # prepare an interium file, jjAll.mfa cat gne8.aln gne8.fa >jjAll.mfa echo = >>jjAll.mfa # Run xmfaToMafGne8 to create a precursor file for the final .maf xmfaToMafGne8 jjAll.mfa j.out org1=hivgne8 cat j.out|sed -e 's/\./_/g'|sed -e 's/_chr/\.chr/g' >chr1.tmp rm jjAll.mfa j.out cat chr1.tmp |sed -e 's/ss_U/U/g' >chr1.maf # copy .maf to /gbdb. mkdir -p /gbdb/hivgne8/vax004Maf cp chr1.maf /gbdb/hivgne8/vax004Maf -p echo before load hgLoadMaf hivgne8 vax004Maf # create another copy for protein MAF. mkdir -p /gbdb/hivgne8/vax004AaMaf cp -p chr1.maf /gbdb/hivgne8/vax004AaMaf hgLoadMaf hivgne8 vax004AaMaf