# for emacs: -*- mode: sh; -*-

# This file describes how we made the browser database on
# NCBI build 33 (April, 2003 freeze)

# [For importing GTF tracks, use /projects/compbio/bin/validate_gtf.pl]

# HOW TO BUILD AN ASSEMBLY FROM NCBI FILES
# ----------------------------------------
# NOTE: It is best to run most of this stuff on eieio since it
# is not averse to handling files > 2Gb

# 0) Make gs.16, gs.16/build33, gs.16/agp, and gs.16/ffa directories.
mkdir /cluster/store5/gs.16
mkdir /cluster/store5/gs.16/build33
mkdir /cluster/store5/gs.16/agp
mkdir /cluster/store5/gs.16/ffa
# Make a symbolic link from /cluster/store1 to this location
cd /cluster/store1
ln -s /cluster/store5/gs.16 ./gs.16
# Make a symbolic link from your home directory to the build dir:
ln -s /cluster/store5/gs.16/build33 ~/oo

# 1) Download seq_contig.md, ncbi_build33.agp, contig_overlaps.agp
# and the contig fa file into the gs.16/build33 directory.
# Download certificates.txt into gs.16/build33 (first time in build33).
# Download all finished agp's and fa's into gs.16/agp.
# Download sequence.inf and ncbi_build33.fa into gs.16/ffa, and unzip
# ncbi_build33.fa.
# *** For build33, the files were split into reference.agp/reference.fa
# (main O&O), DR51.agp/DR51.fa, and DR52.agp/DR52.fa (alternate versions
# of the MHC region).  These were concatenated to get ncbi_build33.agp
# and ncbi_build33.fa.

# 2) Sanity check things with:
/cluster/bin/i386/checkYbr build33/ncbi_build33.agp ffa/ncbi_build33.fa \
    build33/seq_contig.md
# Report any errors back to Richa and Greg at NCBI.

# 3) Convert fa files into UCSC style fa files and place in "contigs"
# directory inside the gs.16/build33 directory
cd build33
mkdir contigs
/cluster/bin/i386/faNcbiToUcsc -split -ntLast ../ffa/ncbi_build33.fa \
    contigs

# 3.1) Make a fake chrM contig
cd ~/oo
mkdir M
# copy in chrM.fa, chrM.agp and chrM.gl from the previous version.
mkdir M/NT_999999
cp M/chrM.fa M/NT_999999/NT_999999.fa
# copied chrM.fa, chrM.agp, chrM.gl, chrM.trf.bed, the lift directory,
# and NT_999999/NT_999999.fa - not sure which ones we need

# 4) Determine the chromosome sizes from the agps
/cluster/bin/scripts/getChromSizes ../agp

# 4.1) Create lift files (this will create the chromosome directory
# structure) and the inserts file
/cluster/bin/scripts/createNcbiLifts -s chrom_sizes seq_contig.md .

# 5) Create contig agp files (will create the contig directory structure)
/cluster/bin/scripts/createNcbiCtgAgp seq_contig.md ncbi_build33.agp .

# 5.1) Create contig gl files
~kent/bin/i386/agpToGl contig_overlaps.agp . -md=seq_contig.md

# 6) Create chromosome agp files
/cluster/bin/scripts/createNcbiChrAgp .
# Since we received final agp's, these were substituted in for the chr
# agp's, with all completely commented lines removed.

# 6.1) Copy over jkStuff from the previous build
mkdir jkStuff
cp /cluster/store1/gs.14/build31/jkStuff/*.sh jkStuff
cp /cluster/store1/gs.14/build31/jkStuff/*.csh jkStuff
cp /cluster/store1/gs.14/build31/jkStuff/*.gsub jkStuff

# 6.3) Create chromosome gl files
jkStuff/liftGl.sh contig.gl

# 7) Distribute contig .fa to the appropriate directory (assumes all files
# are in the "contigs" directory).
cd ~/hg15
/cluster/bin/scripts/distNcbiCtgFa contigs .
rm -r contigs

# 8) Reverse complement the NT contig fa files that are flipped in the
# assembly (uses the faRc program).
# Not done for build33 because all contigs are on the + strand.  It should
# be this way for the rest of the assemblies.
/cluster/bin/scripts/revCompNcbiCtgFa seq_contig.md .
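# As a quick check that nothing needs flipping, scan the orientation
# column of seq_contig.md (a sketch; this assumes NCBI's usual layout,
# with orientation in the 5th and contig name in the 6th tab-separated
# column -- no output means all contigs are on the + strand):
awk -F'\t' '$5 == "-" {print $6}' seq_contig.md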
# GET FRESH MRNA/EST AND REFSEQ SEQUENCE FROM GENBANK (DONE 4/10/03)
# Run this just before the sequence gets here!  It's OK to work on
# this in parallel with Terry's steps above, or in parallel with
# RepeatMasker below, but DO NOT let this hold up RepeatMasker.
# This will create a genbank.134 directory containing compressed
# GenBank flat files and a mrna.134 directory containing unpacked
# sequence info and auxiliary info in a relatively easy to parse (.ra)
# format.
# Point your browser to ftp://ftp.ncbi.nih.gov/genbank and look at
# README.genbank.  Figure out the current release number. (134)
lynx ftp://ftp.ncbi.nih.gov/genbank/README.genbank
# Consider deleting one of the older genbank releases.  It's good to
# keep at least one previous release, though.
# Wherever there is space, make a new genbank directory.  Create a
# symbolic link to it:
ssh eieio
mkdir /cluster/store5/genbank.134
ln -s /cluster/store5/genbank.134 ~/genbank
cd ~/genbank
# ncftp is handy -- it does anonymous login; the "prompt" command is
# not needed.
ncftp ftp.ncbi.nih.gov
cd genbank
mget gbpri* gbrod* gbv* gbsts* gbest* gbmam* gbinv* gbbct* gbhtc* gbpat* gbphg* gbpln*
quit
# This will take at least 2 hours.

# Make the refSeq subdir and download files:
ssh eieio
mkdir -p /cluster/store5/mrna.134/refSeq/041003
cd /cluster/store5/mrna.134/refSeq/041003
ncftp ftp.ncbi.nih.gov
cd refseq/cumulative
mget *.Z
quit

# Get extra info & human proteins from NCBI:
wget ftp://ftp.ncbi.nih.gov/refseq/LocusLink/loc2ref
wget ftp://ftp.ncbi.nih.gov/refseq/LocusLink/mim2loc
wget ftp://ftp.ncbi.nih.gov/refseq/H_sapiens/mRNA_Prot/hs.faa.gz
gunzip hs.faa.gz

# Unpack this into species-specific fa files and get extra info with:
cd /cluster/store5/mrna.134/refSeq/041003
cp /cluster/store2/mrna.133/*.fil ../..
gunzip -c rscu.gbff.Z \
  | gbToFaRa -byOrganism=org ../../anyRna.fil refSeq.{fa,ra,ta} stdin

# Now unpack and organize the larger genbank mrna/est sequences...
ssh eieio
cd /cluster/store5/mrna.134
# Make the RNAs for all organisms
gunzip -c \
  /cluster/store5/genbank.134/gb{pri,rod,v,mam,inv,bct,htc,pat,phg,pln}* \
  | gbToFaRa -byOrganism=org anyRna.fil mrna.{fa,ra,ta} stdin
# Make the ESTs for all organisms
gunzip -c /cluster/store5/genbank.134/gbest*.gz \
  | gbToFaRa anyRna.fil est.{fa,ra,ta} stdin -byOrganism=org
# Make the nonhuman RNAs
gunzip -c \
  /cluster/store5/genbank.134/gb{pri,rod,v,mam,inv,bct,htc,pat,phg,pln}* \
  | gbToFaRa humanXenoRna.fil humanXenoRna.{fa,ra,ta} stdin
# Make the nonmouse RNAs
gunzip -c \
  /cluster/store5/genbank.134/gb{pri,rod,v,mam,inv,bct,htc,pat,phg,pln}* \
  | gbToFaRa mouseXenoRna.fil mouseXenoRna.{fa,ra,ta} stdin
# Make the nonrat RNAs
gunzip -c \
  /cluster/store5/genbank.134/gb{pri,rod,v,mam,inv,bct,htc,pat,phg,pln}* \
  | gbToFaRa ratXenoRna.fil ratXenoRna.{fa,ra,ta} stdin
# Make the nonhuman ESTs
gunzip -c /cluster/store5/genbank.134/gbest*.gz \
  | gbToFaRa humanXenoRna.fil humanXenoEst.{fa,ra,ta} stdin

# Split the really large ones into smaller pieces for more efficient
# cluster runs.
mkdir humanXenoRnaSplit humanXenoEstSplit
faSplit about humanXenoRna.fa 10000000 humanXenoRnaSplit/xenoRna
faSplit about humanXenoEst.fa 70000000 humanXenoEstSplit/xenoEst
cd org/Homo_sapiens
mkdir estSplit
faSplit about est.fa 250000000 estSplit/est
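# Optional sanity check: faSize should report the same total base count
# for the split pieces as for the original (a sketch; run in the same
# directory as the faSplit above):
faSize est.fa
faSize estSplit/*.fa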
# Distribute the files to /iscratch/i/ so they're all ready to be aligned.
ssh kkr1u00
mkdir -p /iscratch/i/mrna.134/Homo_sapiens
cp -p /cluster/store5/mrna.134/refSeq/041003/org/Homo_sapiens/refSeq.fa \
  /iscratch/i/mrna.134/Homo_sapiens/
cp -p /cluster/store5/mrna.134/org/Homo_sapiens/mrna.fa \
  /iscratch/i/mrna.134/Homo_sapiens/
cp -p /cluster/store5/mrna.134/org/Homo_sapiens/estSplit/*.fa \
  /iscratch/i/mrna.134/Homo_sapiens/
cp -p /cluster/store5/mrna.134/humanXenoRnaSplit/*.fa \
  /iscratch/i/mrna.134/Homo_sapiens/
cp -p /cluster/store5/mrna.134/humanXenoEstSplit/*.fa \
  /iscratch/i/mrna.134/Homo_sapiens/
~kent/bin/iSync

# REPEAT MASKING (DONE 041103)
# Split contigs, run RepeatMasker, lift results
# Notes:
# * If there is a new version of RepeatMasker, build it and ask the admins
#   to binrsync it (kkstore:/scratch/hg/RepeatMasker/*).
# * Contigs (*/NT_*/NT_*.fa) are split into 500kb chunks to make
#   RepeatMasker runs manageable on the cluster ==> results need lifting.
# * For the NCBI assembly we repeat mask with the sensitive setting
#   (RepeatMasker -s).

#- Split contigs into 500kb chunks:
ssh eieio
cd ~/hg15
mkdir contigOut
cd contigOut
mkdir split
foreach d (../contigs/NT_*)
  set contig = $d:t
  faSplit size ../contigs/$contig 500000 split/${contig}_ -lift=split/$contig.lft \
    -maxN=500000
end

#- Make the run directory and job list:
cd ~/hg15
mkdir RMRun
rm -f RMRun/RMJobs
touch RMRun/RMJobs
foreach f ( /cluster/store5/gs.16/build33/contigOut/split/NT_*.fa )
  set f = $f:t
  echo /cluster/bin/scripts/RMLocalSens \
    /cluster/store5/gs.16/build33/contigOut/split $f \
    '{'check out line+ /cluster/store5/gs.16/build33/contigOut/split/$f.out'}' \
    >> RMRun/RMJobs
end

#- Do the run
ssh kk
cd ~/hg15/RMRun
para create RMJobs
para try, para check, para check, para push, para check,...

#- Now while that's running, run TRF (simpleRepeat) and RefSeq
#- alignments in parallel.  Also, create the database and the
#- tracks that don't rely on cluster runs or on masked sequence.

# Distribute the split files and splitting lift files back to our normal
# chromosome contig directory tree.
cd ~/hg15
# Edit the script first!  It copies lft/out/align files from the flat dir
# structure we were forced to use at first.  This is a one-time-only step.
/cluster/bin/scripts/distNcbiCtgOut contigOut/split .

#- Lift up the split-contig .out's to contig-level .out's
ssh eieio
cd ~/hg15
foreach d ( ?{,?}/NT_* )
  cd $d
  set contig = $d:t
  liftUp $contig.fa.out $contig.fa.lft warn ${contig}?*.fa.out > /dev/null
  cd ../..
end

#- Lift up RepeatMasker .out files to chromosome coordinates:
tcsh jkStuff/liftOut2.sh

#- By this point, the database should have been created (below):
ssh hgwdev
cd ~/hg15
hgLoadOut hg15 ?/*.fa.out ??/*.fa.out

# VERIFY REPEATMASKER RESULTS (DONE 041103)
# Run featureBits on hg15 and on a comparable genome build, and compare:
ssh hgwdev
featureBits hg15 rmsk
# --> 1386879340 bases of 2866468600 (48.383%) in intersection
featureBits hg13 rmsk
# --> 1383216615 bases of 2860907679 (48.349%) in intersection
# Validate the RepeatMasking by randomly selecting a few NT_*.fa files,
# manually repeat masking them, and matching the .out files with the
# related part of the chromosome-level .out files.  For example:
ssh kkr1u00
# Pick arbitrary values of $chr and $nt and run these commands:
set chr = 1
set nt = NT_004321
mkdir -p /tmp/RMTest/$nt
cd /tmp/RMTest/$nt
cp ~/hg15/$chr/$nt/$nt.fa .
/scratch/hg/RepeatMasker/RepeatMasker -s $nt.fa
# Compare $nt.fa.out against the original ~/hg15/$chr/$nt/$nt.fa.out
# and against the appropriate part of $chr/chr$chr.fa.out (use the coords
# for $nt given in seq_contig.md).
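# For a quick contig-level comparison (a sketch -- the two .out files may
# differ slightly if the RepeatMasker version or libraries have changed
# since the original run):
diff $nt.fa.out ~/hg15/$chr/$nt/$nt.fa.out | head
# To find the matching rows of the chromosome-level file, offset the
# contig coordinates by the chr_start listed for $nt in seq_contig.md
# (assuming start is the 3rd and contig name the 6th column there):
grep $nt ~/hg15/seq_contig.md | awk '{print $3}'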
# CREATING DATABASE (DONE 04/08/03)
ssh hgwdev
# if you haven't already:
ln -s /cluster/store5/gs.16/build33 ~/oo
ln -s /cluster/store5/gs.16/build33 ~/hg15
# Make sure there is at least 5 gig free on hgwdev:/var/lib/mysql
df -h /var/lib/mysql
# Create the database.
echo 'create database hg15' | hgsql hg14
# Make a semi-permanent read-only alias (add this to your .cshrc/.bashrc):
alias hg15 mysql -u hguser -phguserstuff -A hg15
# Initialize the relational-mrna and external sequence info tables:
hgLoadRna new hg15
# Copy over the grp table (for track grouping) from another database:
echo "create table grp (PRIMARY KEY(NAME)) select * from hg14.grp" \
  | hgsql hg15

# MAKE HGCENTRALTEST ENTRY AND TRACKDB TABLE (DONE 041103)
ssh hgwdev
# Enter hg15 into hgcentraltest.dbDb so the test browser knows about it:
# Note: for the next assembly, set the scientificName column to
# "Homo sapiens"
echo 'insert into dbDb values("hg15", "Human April 2003", \
  "/gbdb/hg15/nib", "Human", "DUSP18", 1, 80, "Human");' \
  | hgsql -h genome-testdb hgcentraltest
# Make the trackDb table so the browser knows what tracks to expect:
cd ~/src/hg/makeDb/trackDb
cvs up -d -P .
# Edit the makefile to add hg15 in all the right places and do:
make update
make alpha
cvs commit makefile

# MAKE HGCENTRALTEST BLATSERVERS ENTRY (DONE 04/12/03 -jk)
ssh hgwdev
# Substitute BBB with the correct number for the hostname:
echo 'insert into blatServers values("hg15", "blat2", "17778", "1"); \
  insert into blatServers values("hg15", "blat2", "17779", "0");' \
  | hgsql -h genome-testdb hgcentraltest

# MAKE LIFTALL.LFT, NCBI.LFT (DONE 041103)
cd ~/hg15
cat ?{,?}/lift/{ordered,random}.lft > jkStuff/liftAll.lft
# Create jkStuff/ncbi.lft for lifting stuff built with the NCBI assembly.
# Note: this ncbi.lft will not lift floating contigs to chr_random coords,
# but it will show the strand orientation of the floating contigs
# (grep for '|').
mdToNcbiLift seq_contig.md jkStuff/ncbi.lft
# If a lift file has been edited (e.g. as in 6.2.5 above), edit ncbi.lft
# to match.  If there was no step 6.2.5, no editing is needed.

# SIMPLE REPEAT [TRF] TRACK (DONE 041103)
# Distribute contigs to /iscratch/i
ssh kkr1u00
rm -rf /iscratch/i/gs.16/build33/contigs
mkdir -p /iscratch/i/gs.16/build33/contigs
cd ~/hg15
cp contigs/*.fa /iscratch/i/gs.16/build33/contigs
# Make sure the total size looks like what you'd expect:
du -sh /iscratch/i/gs.16/build33/contigs
~kent/bin/iSync
# Create the cluster parasol job like so:
mkdir -p ~/hg15/bed/simpleRepeat
cd ~/hg15/bed/simpleRepeat
cp ~/hg14/bed/simpleRepeat/gsub .
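# The copied gsub is a 3-line gensub2 template around Tandem Repeats
# Finder.  As a sketch of what it should contain (the exact trfBig path
# and flags below are assumptions -- use whatever the hg14 copy has):
#   #LOOP
#   /cluster/bin/i386/trfBig -trf=/cluster/bin/i386/trf {check in line+ $(path1)} {check out line trf/$(root1).bed} -bedAt=trf/$(root1).bed -tempDir=/tmp
#   #ENDLOOP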
mkdir trf
ls -1S /iscratch/i/gs.16/build33/contigs/*.fa > genome.lst
echo "" > dummy.lst
ssh kk
cd ~/hg15/bed/simpleRepeat
gensub2 genome.lst dummy.lst gsub spec
para create spec
para try
para check
para push
para check
# When the cluster run is done:
liftUp simpleRepeat.bed ~/hg15/jkStuff/liftAll.lft warn trf/*.bed
# Load into the database:
ssh hgwdev
cd ~/hg15/bed/simpleRepeat
~matt/bin/i386/hgLoadBed hg15 simpleRepeat simpleRepeat.bed \
  -sqlTable=$HOME/src/hg/lib/simpleRepeat.sql

# REFSEQ ALIGNMENTS AND REFGENE TRACK PREP (DONE 041103)
# Make sure contigs have been distributed to /iscratch/i/ (should have
# been done for simpleRepeat/TRF above)
# Make sure refSeq.fa is under /iscratch/i too (GENBANK above)
ssh kk
mkdir ~/hg15/bed/refSeq
cd ~/hg15/bed/refSeq
mkdir psl
ls -1S /iscratch/i/gs.16/agp/*.fa > genome.lst
# Remove DR51.fa, DR52.fa and bottomDrawer.fa from the list in genome.lst
# Edit each fa file to say chr1, chr2, etc. in the top comment line.
ls -1 /iscratch/i/mrna.134/Homo_sapiens/refSeq.fa > mrna.lst
cp ~/hg13/bed/refSeq/gsub .
# Edit the file to make it do blat.x -band -q=rna -trimHardA
gensub2 genome.lst mrna.lst gsub spec
# Run this on the small cluster if needed
ssh kkr1u00
para create spec
para try, para check, para push, para check....
para time > time
# When the cluster is done, process refSeq alignments into near best
# in genome.
ssh eieio
cd ~/hg15/bed/refSeq
pslSort dirs raw.psl /tmp psl
pslReps -minCover=0.15 -sizeMatters -minAli=0.98 -nearTop=0.001 raw.psl \
  all_refSeq.psl /dev/null
pslSortAcc nohead chrom /tmp all_refSeq.psl
pslCat -dir chrom > refSeqAli.psl
# After the database has been created, go to "LOAD REFGENE" below...

# PROCESS SIMPLE REPEATS INTO MASK (DONE 041103 ms&jk)
# After the simpleRepeat track has been built, make a filtered version
# of the trf output: keep trf's with period <= 12:
ssh eieio
cd ~/hg15/bed/simpleRepeat
mkdir -p trfMask
foreach f (trf/NT_*.bed)
  awk '{if ($5 <= 12) print;}' $f > trfMask/$f:t
end
# Lift up the filtered trf output to chrom coords as well:
cd ~/hg15
mkdir -p bed/simpleRepeat/trfMaskChrom
foreach c (?{,?})
  if (-e $c/lift/ordered.lst) then
    perl -wpe 's@(\S+)@bed/simpleRepeat/trfMask/$1.bed@' \
      $c/lift/ordered.lst > $c/lift/oTrf.lst
    liftUp bed/simpleRepeat/trfMaskChrom/chr$c.bed \
      jkStuff/liftAll.lft warn `cat $c/lift/oTrf.lst`
  endif
  if (-e $c/lift/random.lst) then
    perl -wpe 's@(\S+)@bed/simpleRepeat/trfMask/$1.bed@' \
      $c/lift/random.lst > $c/lift/rTrf.lst
    liftUp bed/simpleRepeat/trfMaskChrom/chr${c}_random.bed \
      jkStuff/liftAll.lft warn `cat $c/lift/rTrf.lst`
  endif
end

# MASK SEQUENCE WITH BOTH REPEATMASKER AND SIMPLE REPEAT/TRF (DONE 4/11/03 jk)
# This used to be done right after RepeatMasking.  Now we mask with
# TRF as well, so do this after the "PROCESS SIMPLE REPEATS" step above.
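# The jkStuff masking scripts below are not reproduced here; as a hedged
# sketch, the per-contig core of the soft-masking step is a pair of
# maskOutFa calls like these (file names are illustrative):
#   maskOutFa NT_123456.fa NT_123456.fa.out NT_123456.fa -soft
#   maskOutFa NT_123456.fa ../../bed/simpleRepeat/trfMask/NT_123456.bed NT_123456.fa -softAdd
# i.e. lower-case the RepeatMasker hits, then add the filtered TRF
# repeats on top of the existing soft masking.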
ssh eieio
cd ~/hg15
# Make chr*.fa from contig .fa
tcsh jkStuff/chrFa.sh
#- Soft-mask (lower-case) the contig and chr .fa's
tcsh jkStuff/makeFaMasked.sh
#- Make hard-masked .fa.masked files as well:
tcsh jkStuff/makeHardMasked.sh
#- Rebuild the nib, mixedNib, maskedNib files:
tcsh jkStuff/makeNib.sh
# Copy the masked contig fa to /iscratch and /scratch:
ssh kkr1u00
rm -rf /iscratch/i/gs.16/build33/trfFa
mkdir -p /iscratch/i/gs.16/build33/trfFa
cp -p ~/hg15/?{,?}/NT_*/NT_??????.fa /iscratch/i/gs.16/build33/trfFa
~kent/bin/iSync
ssh kkstore
rm -rf /scratch/hg/gs.16/build33/trfFa
mkdir -p /scratch/hg/gs.16/build33/trfFa
cp -p ~/hg15/?{,?}/NT_*/NT_??????.fa /scratch/hg/gs.16/build33/trfFa

# PREPARE CLUSTER FOR BLASTZ RUN (041203)
# This needs to be done after trf-masking and nib generation.
ssh eieio
# Extract lineage-specific repeats using Arian Smit's script:
mkdir -p ~/hg15/bed/linSpecRep
cd ~/hg15/bed/linSpecRep
foreach f (~/hg15/*/*.out)
  ln -sf $f .
end
/cluster/bin/scripts/primateSpecificRepeats.pl *.out
/cluster/bin/scripts/perl-rename 's/(\.fa|\.nib)//' *.out.*spec
/cluster/bin/scripts/perl-rename 's/\.(rod|prim)spec/.spec/' *.out.*spec
rm *.out
# Copy files to kkstore:/scratch
ssh kkstore
# lineage-specific repeats:
cd ~/hg15/bed
mkdir -p /scratch/hg/gs.16/build33
rm -rf /scratch/hg/gs.16/build33/linSpecRep
cp -Rp linSpecRep /scratch/hg/gs.16/build33
# RepeatMasker .out:
cd ~/hg15
rm -rf /scratch/hg/gs.16/build33/rmsk
mkdir -p /scratch/hg/gs.16/build33/rmsk
cp -p ?{,?}/chr?{,?}{,_random}.fa.out /scratch/hg/gs.16/build33/rmsk
# Chrom-level mixed nibs that have been repeat- and trf-masked:
rm -rf /scratch/hg/gs.16/build33/chromTrfMixedNib
mkdir -p /scratch/hg/gs.16/build33/chromTrfMixedNib
cd ~/hg15
cp -p nib/chr*.nib /scratch/hg/gs.16/build33/chromTrfMixedNib
# Ask cluster-admin@cse.ucsc.edu to binrsync /scratch/hg to the clusters.
# Copy to /iscratch as well so we can run blastz before binrsync finishes:
rm -rf /iscratch/i/gs.16/build33/{linSpecRep,rmsk,chromTrfMixedNib}
cp -Rp /scratch/hg/gs.16/build33/{linSpecRep,rmsk,chromTrfMixedNib} \
  /iscratch/i/gs.16/build33/
ssh kkr1u00
~kent/bin/iSync

# Jim's comments Feb 12 '03 about the order in which to run blastz:
# In general we should do
#   1) hg/mm
#   2) mm/rn
#   3) rn/hg
#   4) hg/hg
#   5) mm/mm
#   6) rn/rn
# There is now an 'axtSwap' program that might let us
# get out of having to run the inverse of 1, 2 & 3, though
# 2 in particular is so fast that perhaps it's just as well to
# do the inverse explicitly.

# SEQUENCE INFO: CHROMINFO (DONE)
ssh eieio
cd ~/hg15
# Sanity-check */lift/ordered.lft length vs. agp length:
foreach c ( ?{,?} )
  if (-e $c/lift/ordered.lst) then
    set lftLen = `tail -1 $c/lift/ordered.lft | awk '{print $5;}'`
    set agpLen = `tail -1 $c/chr$c.agp | awk '{print $3;}'`
    if ($lftLen != $agpLen) then
      echo "ERROR: chr$c : lftLen=$lftLen, agpLen=$agpLen"
    else
      echo "chr$c : $lftLen"
    endif
  endif
end
# Make unmasked nibs -- necessary for building chromInfo.
mkdir nib
foreach f (?{,?}/chr?{,?}{,_random}.fa)
  echo making unmasked nib for $f
  faToNib $f nib/$f:t:r.nib
end
# Make symbolic links from /gbdb/hg15/nib to the real nibs.
ssh hgwdev
mkdir -p /gbdb/hg15/nib
foreach f (/cluster/store5/gs.16/build33/nib/chr*.nib)
  ln -s $f /gbdb/hg15/nib
end
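# Quick sanity check (the counts should match -- one nib per chromosome fa):
ls /gbdb/hg15/nib/*.nib | wc -l
ls ~/hg15/?{,?}/chr?{,?}{,_random}.fa | wc -l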
# Load /gbdb/hg15/nib paths into the database and save size info.
hgsql hg15 < ~/src/hg/lib/chromInfo.sql
cd ~/hg15
hgNibSeq -preMadeNib hg15 /gbdb/hg15/nib ?{,?}/chr?{,?}{,_random}.fa
echo "select chrom,size from chromInfo" | hgsql -N hg15 > chrom.sizes

# MAKE DOWNLOADABLE SEQUENCE FILES (DONE 4/12/2003 JK)
ssh eieio
cd ~/hg15
#- Build the .zip files
# Make sure to read and edit jkStuff/zipAll.sh to point to the correct
# paths.
./jkStuff/zipAll.sh |& tee zipAll.log
#- Look at zipAll.log to make sure all file lists look reasonable.
#- Check zip file integrity:
mkdir zip
mv *.zip* zip
cd zip
foreach f (*.zip)
  unzip -t $f > $f.test
  tail -1 $f.test
end
wc -l *.zip.test
#- Copy the .zip files to hgwdev:/usr/local/apache/...
ssh hgwdev
cd ~/hg15/zip
# Edit cpToWeb.sh to contain the correct destination path.
../jkStuff/cpToWeb.sh
cd /usr/local/apache/htdocs/goldenPath/10april2003
#- Take a look at bigZips/* and chromosomes/*, and update their
#  README.txt's.
# Then make the upstream sequence files.
cd bigZips
featureBits hg15 refGene:upstream:1000 -fa=upstream1000.fa
zip upstream1000.zip upstream1000.fa
rm upstream1000.fa
featureBits hg15 refGene:upstream:2000 -fa=upstream2000.fa
zip upstream2000.zip upstream2000.fa
rm upstream2000.fa
featureBits hg15 refGene:upstream:5000 -fa=upstream5000.fa
zip upstream5000.zip upstream5000.fa
rm upstream5000.fa

# O+O: ASSEMBLY [GOLD], GAP, COVERAGE, MAP CONTIGS TRACKS (DONE - 042203)
# Store o+o info in the database.
# Note: for build31, Terry specially requested these files from NCBI:
#   finished.finf
#   draft.finf
#   predraft.finf
#   extras.finf
ssh eieio
cd /cluster/store5/gs.16/build33
if (-f contig_overlaps.agp) then
  jkStuff/liftGl.sh contig.gl
else
  ssh hgwdev
  hgGoldGapGl -noGl hg15 /cluster/store5/gs.16 build33
  echo ""
  echo "*** Note from makeHg15.doc:"
  echo "Come back to this step later when we have contig_overlaps.agp\!"
endif
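# Once gold and gap are loaded, a quick consistency check: the two
# tracks should tile each chromosome, so their featureBits totals
# should add up to roughly the total genome size:
featureBits hg15 gold
featureBits hg15 gap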
ssh hgwdev
cd /cluster/store5/gs.16/build33
if (-f contig_overlaps.agp) then
  hgGoldGapGl hg15 /cluster/store5/gs.16 build33
  cd /cluster/store5/gs.16
  /cluster/bin/i386/hgClonePos hg15 build33 ffa/sequence.inf \
    /cluster/store5/gs.16 -maxErr=3
endif
cd /cluster/store5/gs.16
hgCtgPos hg15 build33

# LOAD REFGENE (DONE 041103)
# Do this after the database has been created and the RefSeq alignments
# are done (above).
# Load refSeq alignments into the database
ssh hgwdev
cd ~/hg15/bed/refSeq
hgLoadPsl hg15 -tNameIx refSeqAli.psl
# Make /gbdb symlinks for refSeq.fa (not .ra)
mkdir -p /gbdb/hg15/mrna.134
cd /gbdb/hg15/mrna.134
ln -s /cluster/store5/mrna.134/refSeq/041003/org/Homo_sapiens/refSeq.fa
# Load the refSeq mRNA
cd /cluster/store2/tmp
hgLoadRna add -type=refSeq hg15 /gbdb/hg15/mrna.134/refSeq.fa \
  /cluster/store5/mrna.134/refSeq/041003/org/Homo_sapiens/refSeq.ra
cd ~/hg15/bed/refSeq
hgRefSeqMrna hg15 /gbdb/hg15/mrna.134/refSeq.fa \
  /cluster/store5/mrna.134/refSeq/041003/org/Homo_sapiens/refSeq.ra \
  all_refSeq.psl \
  /cluster/store5/mrna.134/refSeq/041003/loc2ref \
  /cluster/store5/mrna.134/refSeq/041003/hs.faa \
  /cluster/store5/mrna.134/refSeq/041003/mim2loc
# Don't worry about the "No gene name" errors.
# Add RefSeq status info
hgRefSeqStatus hg15 /cluster/store5/mrna.134/refSeq/041003/loc2ref
# Create a precomputed join of refFlat and refGene:
echo 'CREATE TABLE refFlat \
  (KEY geneName (geneName), KEY name (name), KEY chrom (chrom)) \
  SELECT refLink.name as geneName, refGene.* \
  FROM refLink,refGene \
  WHERE refLink.mrnaAcc = refGene.name' \
  | hgsql hg15

# GC PERCENT (DONE 041203)
ssh hgwdev
mkdir -p ~/hg15/bed/gcPercent
cd ~/hg15/bed/gcPercent
hgsql hg15 < ~/src/hg/lib/gcPercent.sql
hgGcPercent hg15 ../../nib

# PRELOAD MRNA/EST SEQUENCE INFO INTO DATABASE (DONE 04/08/03)
# Make /gbdb symlinks for sequence .fa (not .ra)
mkdir -p /gbdb/hg15/mrna.134
cd /gbdb/hg15/mrna.134
ln -s /cluster/store5/mrna.134/org/Homo_sapiens/mrna.fa
ln -s /cluster/store5/mrna.134/org/Homo_sapiens/est.fa
ln -s /cluster/store5/mrna.134/humanXenoRna.fa
ln -s /cluster/store5/mrna.134/humanXenoEst.fa
# Store the sequence (non-alignment) info in the database.
cd /cluster/store2/tmp
hgLoadRna add -type=mRNA hg15 /gbdb/hg15/mrna.134/mrna.fa \
  /cluster/store5/mrna.134/org/Homo_sapiens/mrna.ra
hgLoadRna add -type=EST hg15 /gbdb/hg15/mrna.134/est.fa \
  /cluster/store5/mrna.134/org/Homo_sapiens/est.ra
hgLoadRna add -type=xenoRna hg15 /gbdb/hg15/mrna.134/humanXenoRna.fa \
  /cluster/store5/mrna.134/humanXenoRna.ra
hgLoadRna add -type=xenoEst hg15 /gbdb/hg15/mrna.134/humanXenoEst.fa \
  /cluster/store5/mrna.134/humanXenoEst.ra

# MAKING AND STORING mRNA AND EST ALIGNMENTS
# (DONE 4/12/2003 MS&JK; ESTs partially redone)
# Make sure that /scratch/hg/gs.16/build33/trfFa is loaded with NT_*.fa
# and has been pushed to the big cluster nodes. (MASK SEQUENCE above)
# Make sure the mrna/est .fa's are under /iscratch/i too (GENBANK above)
# mrna was done against unmasked sequence (041203)
ssh kk
mkdir -p ~/hg15/bed/{mrna,est}/psl
cd ~/hg15/bed/mrna
ls -1S /scratch/hg/gs.16/build33/trfFa/* > genome.lst
ls -1S /iscratch/i/mrna.134/Homo_sapiens/mrna.fa > mrna.lst
cp ~/hg13/bed/mrna/gsub .
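# The copied gsub is a gensub2 template around blat; a sketch of its
# likely contents, modeled on the est template below (treat the exact
# flags as assumptions and check the hg13 copy):
#   #LOOP
#   /cluster/home/kent/bin/i386/blat {check in line+ $(path1)} {check in line+ $(path2)} -ooc={check in exists /scratch/hg/h/11.ooc} {check out line+ psl/$(root1)_$(root2).psl}
#   #ENDLOOP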
gensub2 genome.lst mrna.lst gsub spec
para create spec
para try
cd ~/hg15/bed/est
ls -1S /scratch/hg/gs.16/build33/trfFa/* > genome.lst
ls -1S /iscratch/i/mrna.134/Homo_sapiens/est*.fa > mrna.lst
echo '#LOOP \
/cluster/home/kent/bin/i386/blat {check in line+ $(path1)} {check in line+ $(path2)} -ooc={check in exists /scratch/hg/h/11.ooc} {check out line+ psl/$(root1)_$(root2).psl} \
#ENDLOOP' > gsub
gensub2 genome.lst mrna.lst gsub spec
para create spec
para try
# In each dir (~/hg15/bed/mrna, ~/hg15/bed/est):
para check, para push, para check....
# para time > time

# Process the mRNA and EST alignments into near best in genome.
cd ~/hg15/bed/mrna
pslSort dirs raw.psl /tmp psl
pslReps -minAli=0.98 -sizeMatters -nearTop=0.005 raw.psl contig.psl \
  /dev/null
liftUp -nohead all_mrna.psl ../../jkStuff/liftAll.lft warn contig.psl
pslSortAcc nohead chrom /tmp all_mrna.psl
ssh eieio
cd ~/hg15/bed/est
pslSort dirs raw.psl /tmp/psl psl
pslReps -minAli=0.98 -sizeMatters -nearTop=0.005 raw.psl contig.psl \
  /dev/null
liftUp -nohead all_est.psl ../../jkStuff/liftAll.lft warn contig.psl
pslSortAcc nohead chrom /cluster/store3/tmp all_est.psl

# Load the mRNA alignments into the database.
ssh hgwdev
cd ~/hg15/bed/mrna/chrom
rm -f *_mrna.psl
foreach i (*.psl)
  mv $i $i:r_mrna.psl
end
hgLoadPsl hg15 *.psl
cd ..
hgLoadPsl hg15 all_mrna.psl -nobin

# Load the EST alignments into the database.
ssh hgwdev
cd ~/hg15/bed/est/chrom
rm -f *_est.psl
foreach i (*.psl)
  mv $i $i:r_est.psl
end
hgLoadPsl hg15 *.psl
cd ..
hgLoadPsl hg15 all_est.psl -nobin
# Sequence info should have already been loaded into the database (PRELOAD)

# SPLICED ESTS (INTRONEST) (DONE 4/12/03 JK)
# Create the subset of ESTs with introns and load into the database.
ssh eieio
cd ~/hg15
tcsh jkStuff/makeIntronEst.sh
ssh hgwdev
cd ~/hg15/bed/est/intronEst
hgLoadPsl hg15 *.psl

# ESTORIENTINFO, MRNAORIENTINFO, GENE BOUNDS (RNACLUSTER) (DONE 041303 MS+JK)
# Put orientation info on the ESTs and mRNAs into the database:
ssh eieio
cd ~/hg15/bed/est
pslSortAcc nohead contig /tmp/psl contig.psl
cd ~/hg15/bed/mrna
pslSortAcc nohead contig /tmp/psl contig.psl
# Distribute the est and mrna psl files to /iscratch/i
ssh kkr1u00
rm -rf /iscratch/i/gs.16/build33/bed
mkdir -p /iscratch/i/gs.16/build33/bed
cp -r ~/hg15/bed/est/contig /iscratch/i/gs.16/build33/bed/est
cp -r ~/hg15/bed/mrna/contig /iscratch/i/gs.16/build33/bed/mrna
~kent/bin/iSync
# mrna: use the big cluster.
ssh kk
mkdir -p ~/hg15/bed/mrnaOrientInfo/oi
cd ~/hg15/bed/mrnaOrientInfo
ls -1S /iscratch/i/gs.16/build33/bed/mrna/*.psl > psl.lst
ls -1S /iscratch/i/mrna.134/Homo_sapiens/mrna*.fa > mrna.lst
cp ~/hg14/bed/mrnaOrientInfo/gsub .
# Edit gsub to point to the correct paths.
gensub2 psl.lst mrna.lst gsub spec
para create spec
para try
para check, para push, para check, ....
# When the cluster run is done do:
ssh hgwdev
cd ~/hg15/bed/mrnaOrientInfo
liftUp mrnaOrientInfo.bed ~/hg15/jkStuff/liftAll.lft warn oi/*.tab
hgLoadBed hg15 mrnaOrientInfo mrnaOrientInfo.bed \
  -sqlTable=$HOME/kent/src/hg/lib/mrnaOrientInfo.sql > /dev/null
# est: use the small cluster (I/O intensive).  Use a 2-level output dir
# (the input est.fa has been split into multiple files).
ssh kkr1u00
mkdir -p ~/hg15/bed/estOrientInfo/oi
cd ~/hg15/bed/estOrientInfo
ls -1S /iscratch/i/gs.16/build33/bed/est/*.psl > psl.lst
ls -1S /iscratch/i/mrna.134/Homo_sapiens/est*.fa > mrna.lst
foreach f (`cat mrna.lst`)
  mkdir oi/$f:t:r
end
cp ~/hg14/bed/estOrientInfo/gsub .
# Edit gsub to point to the correct paths.
gensub2 psl.lst mrna.lst gsub spec
para create spec
para try
para check, para push, para check, ....
# When the cluster run is done do:
ssh hgwdev
cd ~/hg15/bed/estOrientInfo
# oi/*/*.tab -> argument list too long... so cat the lowest level together:
foreach d (oi/*)
  cat $d/*.tab > $d.tab
end
liftUp estOrientInfo.bed ~/hg15/jkStuff/liftAll.lft warn oi/*.tab
bedSort estOrientInfo.bed estOrientInfo.bed
hgLoadBed hg15 estOrientInfo estOrientInfo.bed \
  -sqlTable=$HOME/kent/src/hg/lib/estOrientInfo.sql > /dev/null

# Create the rnaCluster table (depends on {est,mrna}OrientInfo above)
cd ~/hg15
# Create a list of accessions that come from RAGE libraries and need to
# be excluded.  (added by Chuck Wed Nov 27 13:09:07 PST 2002)
~/kent/src/hg/geneBounds/clusterRna/generateRageAccList.csh hg15 \
  rage.libs
mkdir -p ~/hg15/bed/rnaCluster/chrom
# Exclude accessions in the RAGE file
foreach f (?{,?}/chr*.fa)
  set c = $f:t:r
  set out = bed/rnaCluster/chrom/$c.bed
  echo clusterRna -mrnaExclude=hg15.rage.libs hg15 /dev/null $out -chrom=$c
  clusterRna -mrnaExclude=hg15.rage.libs hg15 /dev/null $out -chrom=$c
end
cd bed/rnaCluster
hgLoadBed hg15 rnaCluster chrom/*.bed > /dev/null

# GENEBANDS (DONE 4/13/03 JK)
# Create the precomputed geneBands table:
ssh hgwdev
hgGeneBands hg15 geneBands.txt
hgsql hg15 < ~/kent/src/hg/lib/geneBands.sql
echo "load data local infile 'geneBands.txt' into table geneBands;" \
  | hgsql hg15
rm geneBands.txt

# PRODUCING GENSCAN PREDICTIONS (DONE 4/13/03 MS&JK)
ssh eieio
mkdir -p ~/hg15/bed/genscan
cd ~/hg15/bed/genscan
# Make 3 subdirectories for genscan to put its output files in
mkdir -p gtf pep subopt
# Generate a list file, genome.list, of all the contigs
# *that do not consist of pure Ns* (due to heterochromatin, unsequencable
# stuff), which would cause genscan to run forever.
rm -f genome.list
touch genome.list
foreach f ( `ls -1S /cluster/store5/gs.16/build33/?{,?}/NT_*/NT_??????.fa.masked` )
  egrep '[ACGT]' $f > /dev/null
  if ($status == 0) echo $f >> genome.list
end
# Log into kkr1u00 (not kk!).  kkr1u00 is the driver node for the small
# cluster (kkr2u00-kkr8u00).  Genscan has problems running on the
# big cluster due to the limited memory and swap space on each
# processing node.
ssh kkr1u00
cd ~/hg15/bed/genscan
# Create the template file, gsub, for gensub2.  For example (3-line file):
#LOOP
/cluster/home/kent/bin/i386/gsBig {check in line+ $(path1)} {check out line gtf/$(root1).gtf} -trans={check out line pep/$(root1).pep} -subopt={check out line subopt/$(root1).bed} -exe=/cluster/home/fanhsu/projects/compbio/bin/genscan-linux/genscan -par=/cluster/home/fanhsu/projects/compbio/bin/genscan-linux/HumanIso.smat -tmp=/tmp -window=2400000
#ENDLOOP
echo "" > dummy.list
gensub2 genome.list dummy.list gsub jobList
para create jobList
para try
para check
para push
# Issue either one of the following two commands to check the
# status of the cluster and your jobs, until they are done.
parasol status
para check
# If there were out-of-memory problems (run "para problems"), then
# re-run those jobs by hand but change the -window arg from 2400000
# to 1200000.  In build33, this was 22/NT_011519.
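# For example, to re-run the build33 problem contig by hand with the
# smaller window (a sketch, reusing the paths from the gsub template
# above):
/cluster/home/kent/bin/i386/gsBig \
  /cluster/store5/gs.16/build33/22/NT_011519/NT_011519.fa.masked \
  gtf/NT_011519.gtf -trans=pep/NT_011519.pep \
  -subopt=subopt/NT_011519.bed \
  -exe=/cluster/home/fanhsu/projects/compbio/bin/genscan-linux/genscan \
  -par=/cluster/home/fanhsu/projects/compbio/bin/genscan-linux/HumanIso.smat \
  -tmp=/tmp -window=1200000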
# Convert these to chromosome-level files as so:
ssh eieio
cd ~/hg15/bed/genscan
liftUp genscan.gtf ../../jkStuff/liftAll.lft warn gtf/NT*.gtf
liftUp genscanSubopt.bed ../../jkStuff/liftAll.lft warn subopt/NT*.bed > \
  /dev/null
cat pep/*.pep > genscan.pep
# Load into the database as so:
ssh hgwdev
cd ~/hg15/bed/genscan
ldHgGene hg15 genscan genscan.gtf
hgPepPred hg15 generic genscanPep genscan.pep
hgLoadBed hg15 genscanSubopt genscanSubopt.bed > /dev/null

# CPGISLANDS (DONE 041203)
ssh eieio
mkdir -p ~/hg15/bed/cpgIsland
cd ~/hg15/bed/cpgIsland
# Build the software emailed from Asif Chinwalla (achinwal@watson.wustl.edu)
# copy the tar file to the current directory
tar xvf cpg_dist.tar
cd cpg_dist
gcc readseq.c cpg_lh.c -o cpglh.exe
cd ..
# cpglh.exe requires hard-masked (N) .fa's.
# There may be warnings about "bad character" for IUPAC ambiguous
# characters like R, S, etc.  Ignore the warnings.
foreach f (../../?{,?}/chr?{,?}{,_random}.fa.masked)
  set fout=$f:t:r:r.cpg
  echo producing $fout...
  ./cpg_dist/cpglh.exe $f > $fout
end
cp ~/hg13/bed/cpgIsland/filter.awk .
awk -f filter.awk chr*.cpg > cpgIsland.bed
ssh hgwdev
cd ~/hg15/bed/cpgIsland
hgLoadBed hg15 cpgIsland -tab -noBin \
  -sqlTable=$HOME/kent/src/hg/lib/cpgIsland.sql cpgIsland.bed

# CREATE GOLDEN TRIANGLE (todo)
# Make sure that the rnaCluster table is in place.  Then extract Affy
# expression info into a form suitable for Eisen's clustering program with:
cd ~/hg15/bed
mkdir triangle
cd triangle
eisenInput hg15 affyHg10.txt
# Transfer this to Windows and do k-means clustering with k=200 with
# cluster.  Transfer the results file back to
# ~/hg15/bed/triangle/affyCluster_K_G200.kgg.  Then do:
promoSeqFromCluster hg15 1000 affyCluster_K_G200.kgg kg200.unmasked
# Then RepeatMask the .fa files in kg200.unmasked, and copy masked
# versions to kg200.  Then:
cat kg200/*.fa > all1000.fa
# and set up a cluster Improbizer run to do 100 controls for every real
# run on each, putting the output in imp.200.1000.e.  When the Improbizer
# run is done, make a file summarizing the runs as so:
cd imp.200.1000.e
motifSig ../imp.200.1000.e.iri ../kg200 motif control*
# Get rid of insignificant motifs with:
cd ..
awk '{if ($2 > $3) print; }' imp.200.1000.e.iri > sig.200.1000.e.iri
# Turn the rest into just dnaMotifs with:
iriToDnaMotif sig.200.1000.e.iri motif.200.1000.e.txt
# Extract all promoters with:
featureBits hg15 rnaCluster:upstream:1000 -bed=upstream1000.bed -fa=upstream1000.fa
# Locate motifs on all promoters with:
dnaMotifFind motif.200.1000.e.txt upstream1000.fa hits.200.1000.e.txt -rc -markov=2
liftPromoHits upstream1000.bed hits.200.1000.e.txt triangle.bed

# CREATE STS/FISH/BACENDS/CYTOBANDS DIRECTORY STRUCTURE AND SETUP (DONE 4/8/2003)
# Create the directory structure to hold information for these tracks
cd /projects/hg2/booch/psl/
# Change Makefile parameters for OOVERS, GSVERS, PREVGS, PREVOO
make new
# Update all Makefiles with the latest OOVERS and GSVERS, DATABASE, and
# locations of .fa files.  Makefile in:
#   /gs.16/build33/
#   /gs.16/build33/bacends
#   /gs.16/build33/cytobands
#   /gs.16/build33/cytoPlots
#   /gs.16/build33/fish
#   /gs.16/build33/fosends
#   /gs.16/build33/g2g
#   /gs.16/build33/geneticPlots
#   /gs.16/build33/primers
#   /gs.16/build33/recombrate
#   /gs.16/build33/sts
#   /gs.16/build33/stsPlots
# Create the accession_info file
make accession_info.rdb

# UPDATE STS INFORMATION (TODO 4/7/2003)
# Download and unpack updated information from dbSTS:
# In a web browser, go to ftp://ftp.ncbi.nih.gov/repository/dbSTS/.
# Download dbSTS.sts, dbSTS.aliases, and dbSTS.FASTA.dailydump.Z to
# /projects/hg2/booch/psl/update
cd /projects/hg2/booch/psl/update
wget ftp://ftp.ncbi.nih.gov/repository/dbSTS/dbSTS.sts
wget ftp://ftp.ncbi.nih.gov/repository/dbSTS/dbSTS.aliases
wget ftp://ftp.ncbi.nih.gov/repository/dbSTS/dbSTS.FASTA.dailydump.Z
gunzip dbSTS.FASTA.dailydump.Z
# Edit Makefile to the latest sts.X version from PREV, and update the
# STS files
make update
# Make a new directory for this info and move the files there
mkdir /cluster/store1/sts.7
cp all.STS.fa /cluster/store1/sts.7
cp all.primers /cluster/store1/sts.7
cp all.primers.fa /cluster/store1/sts.7
# Copy the new files to the cluster
ssh kkstore
cd /cluster/store1/sts.7
cp /cluster/store1/sts.7/*.* /scratch/hg/STS
# Ask for propagation from sysadmin
# Load the sequences (change sts.# to match the correct location)
ssh hgwdev
mkdir /gbdb/hg15/sts.7
cd /gbdb/hg15/sts.7
ln -s /cluster/store1/sts.7/all.STS.fa ./all.STS.fa
ln -s /cluster/store1/sts.7/all.primers.fa ./all.primers.fa
cd /cluster/store2/tmp
hgLoadRna addSeq hg15 /gbdb/hg15/sts.7/all.STS.fa
hgLoadRna addSeq hg15 /gbdb/hg15/sts.7/all.primers.fa

# CREATE STS MARKER ALIGNMENTS (DONE TSF 4/12/2003)
# Create full sequence alignments
ssh kk
cd /cluster/home/booch/sts
# Update Makefile with the latest OOVERS and GSVERS and run cluster jobs
make new
make jobList
para create jobList
para push
# wait until the alignments are done
make stsMarkers.psl
# Copy files to the final destination and remove the originals
make copy.assembly
make clean

# Create primer alignments
ssh kk
cd /cluster/home/booch/primers
# Update Makefile with the latest OOVERS and GSVERS and run cluster jobs
make new
make jobList.scratch
para create jobList
para push
# Do an initial quick filter of the results (takes a while, still) and
# create the final file - best done on eieio since the disks are local
ssh eieio
make filter
make primers.psl
# Copy files to the final destination and remove
make copy.assembly
make clean

# Create ePCR alignments
ssh kk
cd /cluster/home/booch/epcr
# Update Makefile with the latest OOVERS and GSVERS
make new
make jobList
para create jobList
para push
make all.epcr
# Copy files to the final destination and remove
make copy.assembly
make clean

# CREATE AND LOAD STS MARKERS TRACK (DONE TSF 4/12/2003)
# Copy in the current stsInfo2.bed and stsAlias.bed files
cd /projects/hg2/booch/psl/gs.16/build33
cp ../update/stsInfo2.bed .
cp ../update/stsAlias.bed .
# Create the final version of the sts sequence placements
ssh kks00
cd /projects/hg2/booch/psl/gs.16/build33/sts
make stsMarkers.final
# Create the final version of the primers placements
# Make sure the PRIMERS variable in the Makefile points to the current
# version
cd /projects/hg2/booch/psl/gs.16/build33/primers
make primers.final
# Create the bed file
cd /projects/hg2/booch/psl/gs.16/build33
make stsMap.bed
# Create the database tables
ssh hgwdev
cd /projects/hg2/booch/psl/tables
hgsql hg15 < all_sts_primer.sql
hgsql hg15 < all_sts_seq.sql
hgsql hg15 < stsAlias.sql
hgsql hg15 < stsInfo2.sql
hgsql hg15 < stsMap.sql
# Load the tables
cd /projects/hg2/booch/psl/gs.16/build33/sts/
echo 'load data local infile "stsMarkers.psl.filter.lifted" into table all_sts_seq;' | hgsql hg15
cd /projects/hg2/booch/psl/gs.16/build33/primers/
echo 'load data local infile "primers.psl.filter.lifted" into table all_sts_primer;' | hgsql hg15
cd /projects/hg2/booch/psl/gs.16/build33/
echo 'load data local infile "stsAlias.bed" into table stsAlias;' | hgsql hg15
echo 'load data local infile "stsInfo2.bed" into table stsInfo2;' | hgsql hg15
echo 'load data local infile "stsMap.bed" into table stsMap;' | hgsql hg15

# UPDATE BACEND SEQUENCES (DONE TSF 4/10/2003)
# Download the new files.  Make a new directory based on the previous one.
mkdir /cluster/store1/bacends.4
cd /cluster/store1/bacends.4
wget ftp://ftp.ncbi.nih.gov/genomes/H_sapiens/BACENDS/AllBACends.mfa.gz
wget ftp://ftp.ncbi.nih.gov/genomes/H_sapiens/BACENDS/cl_acc_gi_len.gz
gunzip AllBACends.mfa.gz
gunzip cl_acc_gi_len.gz
# Convert the fa file
cp /cluster/store1/bacends.3/convert.pl .
./convert.pl < AllBACends.mfa > BACends.fa
# Create the new pairs files
convertBacEndPairInfo cl_acc_gi_len
# Split the file into pieces and copy to the cluster for propagation
/cluster/bin/i386/faSplit sequence BACends.fa 100 BACends
ssh kkstore
cd /cluster/store1/bacends.4
rm /scratch/hg/bacEnds/hs/BACends*.fa
mv /cluster/store1/bacends.4/BACends???.fa /scratch/hg/bacEnds/hs/
cp /cluster/store1/bacends.4/BACends.fa /scratch/hg/bacEnds/hs/
# Ask for propagation from sysadmin
# Load the sequences (change bacends.# to match the correct location)
ssh hgwdev
mkdir /gbdb/hg15/bacends.4
cd /gbdb/hg15/bacends.4
ln -s /cluster/store1/bacends.4/BACends.fa ./BACends.fa
cd /cluster/store2/tmp
hgLoadRna addSeq hg15 /gbdb/hg15/bacends.4/BACends.fa

# BACEND SEQUENCE ALIGNMENTS (DONE 041703 - TERRY)
# (alignments done without RepeatMasking)
# Create full sequence alignments
ssh kk
cd /cluster/home/booch/bacends
# Update Makefile with the latest OOVERS and GSVERS and run cluster jobs
make new
make jobList
para create jobList
para push
# Compile the alignments and lift the files (takes a while)
ssh eieio
make bacEnds.psl.lifted
# Copy files to the final destination and remove
make copy.assembly
make clean
# DON'T DO THIS (may want to wait until sure they're OK)

# BACEND PAIRS TRACK (DONE 041703 - TERRY)
# Add /projects/compbiousr/booch/booch/scripts to your path
# Update Makefile with the new location of the pairs/singles
# files, if necessary (DONE)
cd /projects/hg2/booch/psl/gs.16/build33/bacends
# Make the initial file of alignments
make bacEnds.rdb
# Try to fish out more pairs
make bacEndsMiss.psl
# Re-make bacEnds.rdb with the new info
make bacEnds.rdb
# Create the bacEndPairs track file
make bacEndPairs.bed
# Create the bacEndPairsBad and bacEndPairsLong files
make bacEndPairsBad.bed
# Create the psl file to load
make bacEnds.load.psl
# Create the database tables
ssh hgwdev
cd /projects/hg2/booch/psl/tables
hgsql hg15 < all_bacends.sql
hgsql hg15 < bacEndPairs.sql
hgsql hg15 < bacEndPairsBad.sql
hgsql hg15 < bacEndPairsLong.sql
# Load the tables
cd /projects/hg2/booch/psl/gs.16/build33/bacends/
echo 'load data local infile "bacEnds.load.psl" into table all_bacends;' | hgsql hg15
echo 'load data local infile "bacEndPairs.bed" into table bacEndPairs;' | hgsql hg15
echo 'load data local infile "bacEndPairsBad.bed" into table bacEndPairsBad;' | hgsql hg15
echo 'load data local infile "bacEndPairsLong.bed" into table bacEndPairsLong;' | hgsql hg15

# FOSEND SEQUENCE ALIGNMENTS (DONE 051503)
# Create full sequence alignments
ssh kk
cd /cluster/home/booch/fosends
# Update Makefile with the latest OOVERS and GSVERS and run cluster jobs
make new
make jobList
para create jobList
para push
# Compile the alignments and lift the files (takes a while)
ssh eieio
cd /cluster/home/booch/fosends
make fosEnds.psl.lifted
# Copy files to the final destination and remove
make copy.assembly
make clean

# FOSEND PAIRS TRACK (DONE 051603)
# Update Makefile with the location of the pairs files, if necessary
ssh kks00
cd /projects/hg2/booch/psl/gs.16/build33/fosends
# Make the initial file of alignments
make fosEnds.rdb
# Try to fish out more pairs
make fosEndsMiss.psl
# Re-make fosEnds.rdb with the new info
make fosEnds.rdb
# Create the fosEndPairs track file
make fosEndPairs.bed
# Create the fosEndPairsBad and fosEndPairsLong files
make fosEndPairsBad.bed
# Create the psl file to load
make fosEnds.load.psl
# Create the database tables
ssh hgwdev
cd /projects/hg2/booch/psl/tables
hgsql hg15 < all_fosends.sql
hgsql hg15 < fosEndPairs.sql
hgsql hg15 < fosEndPairsBad.sql
hgsql hg15 < fosEndPairsLong.sql
# Load the tables
cd /projects/hg2/booch/psl/gs.16/build33/fosends/
echo 'load data local infile "fosEnds.load.psl" into table all_fosends;' | hgsql hg15
echo 'load data local infile "fosEndPairs.bed" into table fosEndPairs;' | hgsql hg15
echo 'load data local infile "fosEndPairsBad.bed" into table fosEndPairsBad;' | hgsql hg15
echo 'load data local infile "fosEndPairsLong.bed" into table fosEndPairsLong;' | hgsql hg15
# Load the sequences (change fosends.# to match the correct location)
# (done for hg15 early 4/9/2003)
mkdir /gbdb/hg15/fosends.3
cd /gbdb/hg15/fosends.3
ln -s /cluster/store1/fosends.3/fosEnds.fa ./fosEnds.fa
cd /cluster/store2/tmp
hgLoadRna addSeq hg15 /gbdb/hg15/fosends.3/fosEnds.fa

# UPDATE FISH CLONES INFORMATION (DONE 4/9/2003)
# Download the latest info from NCBI:
# point browser at
#   http://www.ncbi.nlm.nih.gov/genome/cyto/cytobac.cgi?CHR=all&VERBOSE=ctg
# change "Show details on sequence-tag" to "yes"
# change "Download or Display" to "Download table for UNIX"
# press Submit - save as
#   /projects/hg2/booch/psl/fish/hbrc/hbrc.YYYYMMDD.table
# Format the file just downloaded.
cd /projects/hg2/booch/psl/fish/
# Edit Makefile to point at the file just downloaded (variables HBRC,
# HBRCFORMAT)
make HBRC
# Copy it to the new freeze location
cp /projects/hg2/booch/psl/fish/all.fish.format /projects/hg2/booch/psl/gs.16/build33/fish/

# CREATE AND LOAD FISH CLONES TRACK (DONE 4/23/2003)
# (must be done after the STS markers track and BAC end pairs track)
# Extract the file with clone positions from the database
ssh hgwdev
echo 'select * into outfile "/tmp/booch/clonePos.txt" from clonePos' | hgsql hg15
mv /tmp/booch/clonePos.txt /projects/hg2/booch/psl/gs.16/build33/fish
# Get the current clone/accession information
ssh kks00
cd /projects/hg2/booch/psl/gs.16/build33/fish
wget http://www.ncbi.nlm.nih.gov/genome/clone/DATA/clac.out
# Create the initial placement file
make cyto.markers.bed
# Get sequences for accessions not in the genome:
# go to http://www.ncbi.nlm.nih.gov/entrez/batchentrez.cgi?db=Nucleotide
# select the file /projects/hg2/booch/psl/gs.16/build33/fish/not.found.acc
# change the output to FASTA format
# download the results to
#   /projects/hg2/booch/psl/gs.16/build33/fish/not.found.fa
# Place the sequences against the genome
make blat
# Try to incorporate the new placements
make cyto.markers.bed2
# Create the bed file
make bed
# Create the database table
ssh hgwdev
cd /projects/hg2/booch/psl/tables
hgsql hg15 < fishClones.sql
# Load the table
cd /projects/hg2/booch/psl/gs.16/build33/fish/
echo 'load data local infile "fishClones.bed" into table fishClones;' | hgsql hg15

# CREATE AND LOAD CHROMOSOME BANDS TRACK (DONE - 4/23/2003)
# (must be done after the FISH Clones track)
# Create the bed file
ssh hgwdev
make setBands.txt
# NOTE: may get errors if the inserts file is out of sync with the
# pctSetBands file
make cytobands.pct.ranges
make predict
# Create the database table
ssh hgwdev
cd /projects/hg2/booch/psl/tables
hgsql hg15 < cytoBand.sql
# Load the table
cd /projects/hg2/booch/psl/gs.16/build33/cytobands/
echo 'load data local infile "cytobands.bed" into table cytoBand;' | hgsql hg15

# CREATE AND LOAD RECOMBINATION RATE TRACK (DONE 05/01/2003)
# (must be done after the STS Markers track)
# Create the bed file
cd /projects/hg2/booch/psl/gs.16/build33/recombrate
make recombRate.bed
# Create the database table
ssh hgwdev
cd /projects/hg2/booch/psl/tables
hgsql hg15 < recombRate.sql
# Load the table
cd /projects/hg2/booch/psl/gs.16/build33/recombrate/
echo 'load data local infile "recombRate.bed" into table recombRate;' | hgsql hg15

# CREATE AND LOAD GENMAPDB TRACK (TODO)
# Get the file from UPenn and name it upenn.txt (Contact: Mike Morley)
# Move the file to /projects/hg2/booch/psl/gs.16/build33/genMapDb
# Create the bed file
cd /projects/hg2/booch/psl/gs.16/build33/genMapDb
make genMapDb.bed
# Create the database table
ssh hgwdev
cd /projects/hg2/booch/psl/tables
hgsql hg15 < genMapDb.sql
# Load the table
cd /projects/hg2/booch/psl/gs.16/build33/genMapDb/
echo 'load data local infile "genMapDb.bed" into table genMapDb;' | hgsql hg15

# CREATE STS MAP COMPARISON PLOTS AND GENETIC PLOTS (DONE 4/28/2003)
# (Must wait until the STS Map track has been finished for the STS and
# genetic plots)
# (Must wait until the CytoBand and FISH tracks have been finished for
# the cytogenetic plots)
# Create the sts plots
cd /projects/hg2/booch/psl/gs.16/build33/stsPlots
make stsplots
# Create the genetic plots
cd /projects/hg2/booch/psl/gs.16/build33/geneticPlots
make all
matlab -nodesktop
>> allplot_ncbi('/cse/grads/booch/tracks/gs.16/build33/geneticPlots/','build33', 'jpg');
>> quit
# Create the cytogenetic plots
cd /projects/hg2/booch/psl/gs.16/build33/cytoPlots
make bandplots
# Set up the directories where these will end up
ssh hgwdev
cd /usr/local/apache/htdocs/goldenPath/mapPlots
# Update Makefile with OOVERS, GSVERS, and the FREEZE date
make new
# Copy over the files
make sts
make genetic
make band
# Update index.html to include links to these new plots, and delete the
# oldest set
# Update arch.html with the oldest set just removed from index.html
# Check the changes to index.html and arch.html into CVS
# Ask for the plots to be pushed to hgwbeta (and RR eventually) - push
# with Chromosome Reports!!!

# CREATE CHROMOSOME REPORTS (DONE 04/29/2003)
# Create the bacends file
cd /projects/hg2/booch/psl/gs.16/build33/bacends
make bacends.pairs.rdb
# Create the overlap file
cd /projects/hg2/booch/psl/gs.16/build33/g2g
make g2g.rdb
# Set up the directory
cd /projects/hg2/booch/psl/chromReports
cp -r template/ 10Apr2003/
# Change the index.html files in all directories to update the date of
# the assembly:
#   index.html
#   assembly/index.html
#   assembly/bacends/index.html
#   assembly/genetic/index.html
#   assembly/overlap/index.html
#   assembly/rh/index.html
#   assembly/summary/index.html
#   assembly/yac/index.html
# Create the files
cd /projects/hg2/booch/psl/chromReports
make all.assem
# Move to the correct location
ssh hgwdev
cd /usr/local/apache/htdocs/goldenPath/chromReports/
mv /projects/hg2/booch/psl/chromReports/10Apr2003 .
# Set up the link to the plots (must have been previously made - see above)
cd /usr/local/apache/htdocs/goldenPath/chromReports/10Apr2003/assembly
ln -s /usr/local/apache/htdocs/goldenPath/mapPlots/Apr10/RHgenetic ./plots
# Update index.html
cd /usr/local/apache/htdocs/goldenPath/chromReports/10Apr2003/
# add a link to the new directory in index.html
cvs commit -m "added new assembly 10Apr2003" index.html
# Ask for the reports to be pushed to hgwbeta (and RR eventually) - push
# with Map Plots!!!
# PRODUCING CROSS_SPECIES mRNA ALIGNMENTS (DONE 4/12/03 JK)
# Make sure masked contigs are in /scratch/hg/gs.16/build33/trfFa
# Make sure the split-up xenoRna sequence is under /iscratch too (GENBANK)
ssh kkstore
mkdir -p ~/hg15/bed/xenoMrna
cd ~/hg15/bed/xenoMrna
mkdir psl
ls -1S /scratch/hg/gs.16/build33/trfFa/*.fa > human.lst
ls -1S /iscratch/i/mrna.134/Homo_sapiens/xenoRna*.fa > mrna.lst
# Using split fa -- so create separate output dirs and a special gsub:
foreach f (`cat mrna.lst`)
  mkdir psl/$f:t:r
end
echo '#LOOP \
/cluster/home/kent/bin/i386/blat {check in line+ $(path1)} {check in line+ $(path2)} -q=rnax -t=dnax -mask=lower {check out line+ psl/$(root2)/$(root1)_$(root2).psl} \
#ENDLOOP' > gsub
gensub2 human.lst mrna.lst gsub spec
para create spec
ssh kk
cd ~/hg15/bed/xenoMrna
para try
para check
para push
# Do para check until the run is done, doing para push if necessary

# Sort the xeno mRNA alignments as so:
ssh eieio
cd ~/hg15/bed/xenoMrna
pslSort dirs raw.psl /cluster/store2/temp psl/xenoRna*
pslReps raw.psl cooked.psl /dev/null -minAli=0.25
liftUp chrom.psl ../../jkStuff/liftAll.lft warn cooked.psl
pslSortAcc nohead chrom /cluster/store2/temp chrom.psl
pslCat -dir chrom > xenoMrna.psl
rm -r chrom raw.psl cooked.psl chrom.psl
# Load into the database as so:
ssh hgwdev
cd ~/hg15/bed/xenoMrna
hgLoadPsl hg15 xenoMrna.psl -tNameIx
# Sequence info should have already been loaded into the database (PRELOAD)

# PRODUCING CROSS_SPECIES EST ALIGNMENTS (DONE 4/28/03)
# Make sure masked contigs are in /scratch/hg/gs.16/build33/trfFa
# Make sure the split-up xenoEst sequence is under /iscratch too (GENBANK)
ssh kkstore
mkdir -p ~/hg15/bed/xenoEst
cd ~/hg15/bed/xenoEst
mkdir psl
ls -1S /scratch/hg/gs.16/build33/trfFa/*.fa > human.lst
ls -1S /iscratch/i/mrna.134/Homo_sapiens/xenoEst*.fa > mrna.lst
# Using split fa -- so create separate output dirs and a special gsub:
foreach f (`cat mrna.lst`)
  mkdir psl/$f:t:r
end
echo '#LOOP \
/cluster/home/kent/bin/i386/blat {check in line+ $(path1)} {check in line+ $(path2)} -q=dnax -t=dnax -mask=lower {check out line+ psl/$(root2)/$(root1)_$(root2).psl} \
#ENDLOOP' > gsub
gensub2 human.lst mrna.lst gsub spec
ssh kk
cd ~/hg15/bed/xenoEst
para create spec
para try, para check, para push, para check, ...

# Sort the xenoEst alignments:
ssh eieio
cd ~/hg15/bed/xenoEst
pslSort dirs raw.psl /cluster/store2/temp psl/xenoEst*
pslReps raw.psl cooked.psl /dev/null -minAli=0.10
liftUp chrom.psl ../../jkStuff/liftAll.lft warn cooked.psl
pslSortAcc nohead chrom /cluster/store2/temp chrom.psl
pslCat -dir chrom > xenoEst.psl
rm -r chrom raw.psl cooked.psl chrom.psl
# Load into the database as so:
ssh hgwdev
cd ~/hg15/bed/xenoEst
hgLoadPsl hg15 xenoEst.psl -tNameIx
# Sequence info should have already been loaded into the database (PRELOAD)

# PRODUCING SQUIRT ALIGNMENTS (DONE 06/04/03)
ssh kkstore
mkdir -p ~/hg15/bed/blatCi1
cd ~/hg15/bed/blatCi1
mkdir psl
foreach f (~/hg15/?{,?}/NT_??????/NT_??????.fa)
  set c=$f:t:r
  mkdir -p psl/$c
end
ls -1S /iscratch/i/squirt/ci1/queryFa/*.fa > squirt.lst
ls -1S /scratch/hg/gs.16/build33/trfFa/*.fa > human.lst
gensub2 human.lst squirt.lst gsub2D spec
ssh kk
cd ~/hg15/bed/blatCi1
para create spec
....
# When the cluster run is done, sort the alignments:
ssh eieio
cd ~/hg15/bed/blatCi1
mkdir /tmp/$LOGNAME
pslSort dirs raw.psl /tmp/$LOGNAME psl/*
pslReps raw.psl cooked.psl /dev/null -minAli=0.05
liftUp -nohead lifted.psl ../../jkStuff/liftAll.lft warn cooked.psl
pslSortAcc nohead chrom /tmp/$LOGNAME lifted.psl
# Rename to correspond with the tables as so and load into the database:
ssh hgwdev
cd ~/hg15/bed/blatCi1/chrom
rm -f chr*_blatCi1.psl
foreach i (chr?{,?}{,_random}.psl)
  set r = $i:r
  mv $i ${r}_blatCi1.psl
end
hgLoadPsl hg15 *.psl
# Make the squirt /gbdb/ symlink and load the squirt sequence data.
mkdir /gbdb/hg15/squirtSeq
cd /gbdb/hg15/squirtSeq
ln -s /cluster/store5/squirt/ci1/ciona.rm.fasta

# PRODUCING FUGU ALIGNMENTS (DONE 041203)
# Distribute the fugu sequence to /iscratch/i/fugu/ (if it isn't already
# there)
ssh kkr1u00
rm -rf /iscratch/i/fugu
mkdir /iscratch/i/fugu
cp -p /cluster/store3/fuguSeq/split2.5Mb/*.fa /iscratch/i/fugu
~kent/bin/iSync
ssh kk
mkdir ~/hg15/bed/blatFugu
cd ~/hg15/bed/blatFugu
mkdir psl
foreach f (~/hg15/?{,?}/NT_??????/NT_??????.fa)
  set c=$f:t:r
  mkdir -p psl/$c
end
ls -1S /iscratch/i/fugu/*.fa > fugu.lst
ls -1S /scratch/hg/gs.16/build33/trfFa/*.fa > human.lst
cp ~/hg14/bed/blatFugu/gsub .
gensub2 human.lst fugu.lst gsub spec
para create spec
para try
para check
para push
para check
# When the cluster run is done, sort the alignments:
ssh eieio
cd ~/hg15/bed/blatFugu
pslCat -dir psl/NT_?????? | \
  liftUp -type=.psl stdout ~/hg15/jkStuff/liftAll.lft warn stdin | \
  pslSortAcc nohead chrom temp stdin
# Rename to correspond with the tables as so and load into the database:
ssh hgwdev
cd ~/hg15/bed/blatFugu/chrom
rm -f chr*_blatFugu.psl
foreach i (chr?{,?}{,_random}.psl)
  set r = $i:r
  mv $i ${r}_blatFugu.psl
end
hgLoadPsl hg15 *.psl
# Make the fugu /gbdb/ symlink and load the Fugu sequence data.
mkdir /gbdb/hg15/fuguSeq
cd /gbdb/hg15/fuguSeq
ln -s /cluster/store3/fuguSeq/fugu_v3_mask.fasta
cd /cluster/store2/tmp
hgLoadRna addSeq hg15 /gbdb/hg15/fuguSeq/fugu_v3_mask.fasta

# BLASTZ C elegans ALIGNMENTS (DONE - 2003-06-11 - Hiram)
# This has been DONE, but there is some kind of problem in the
# browser display because a click on the track produces the error:
#   Expecting two ranges in loadPslFromRangePair
# instead of taking you to the alignment.
# I believe some more needs to be done, something like the axtBest
# business as seen below in the RAT BLASTZ.
ssh kk
mkdir -p ~/hg15/bed/blastzCe1
cd ~/hg15/bed/blastzCe1
cp ~angie/hummus/DEF.ce1-hg15.030611 ./DEF
. ./DEF
mkdir -p $BASE/run
~angie/hummus/make-joblist $DEF > $BASE/run/j
# that created xdir.sh and the joblist run/j
sh $BASE/xdir.sh
# xdir.sh makes a bunch of result directories in $BASE/raw/
# based on chrom name and CHUNK size
cd $BASE/run
# now edit j because it is not correct:
sed -e 's#^#/cluster/home/angie/schwartzbin/#' j > j2
wc -l j*
head j2
# *** make sure the j2 edits are OK, then use it:
mv j2 j
# para create will create the file 'batch' for the cluster run
para create j
para try
para check
para push
... etc ...
# Average job time:  239s  3.98m  0.07h  0.00d
# Longest job:      1578s 26.30m  0.44h  0.02d
# When that cluster run is done, results are in $BASE/raw/chr*/*
# continuing with ~angie/hummus/Notes:
# --- normalize and single_cov
cd ~/hg15/bed/blastzCe1
# source the DEF file again in case you are coming back to this
. ./DEF
# a new run directory
mkdir -p $BASE/run.1
# another obscure script creates a new job list:
~angie/hummus/do.out2lav $DEF > $BASE/run.1/j
cd $BASE/run.1
# the job list is once again incorrect, edit it:
sed -e 's/^/\/cluster\/home\/angie\/schwartzbin\//' j > j2
wc -l j*
head j2
# make sure the edited j2 is OK, then use it:
mv j2 j
para create j
para try
para push; para check; ... etc ...
# Average job time:   6s  0.09m  0.00h  0.00d
# Longest job:       13s  0.22m  0.00h  0.00d

# Translate the .lav files into axt files
ssh eieio
set base="/cluster/store5/gs.16/build33/bed/blastzCe1"
set seq1_dir="/cluster/store5/gs.16/build33/nib"
set seq2_dir="/cluster/store5/worm/ce1/nib"
set tbl="blastzCe1"
cd $base
mkdir -p axtChrom
foreach c (lav/*)
  pushd $c
  set chr=$c:t
  set out=$base/axtChrom/$chr.axt
  echo "Translating $chr lav to $out"
  cat `ls -1 *.lav | sort -g` \
    | /cluster/bin/i386/lavToAxt stdin $seq1_dir $seq2_dir stdout \
    | /cluster/bin/i386/axtSort stdin $out
  popd
end
# Translate the sorted axt files into psl:
cd $base
mkdir -p pslChrom
foreach f (axtChrom/chr*.axt)
  set c=$f:t:r
  /cluster/bin/i386/axtToPsl $f S1.len S2.len pslChrom/${c}_${tbl}.psl
end
# Load these blastz results
# next machine
ssh hgwdev
cd ~/hg15/bed/blastzCe1/pslChrom
/cluster/bin/i386/hgLoadPsl hg15 chr*_*.psl
# create the trackDb/human/hg15/trackDb.ra entry:
#   track blastzCe1
#   shortLabel BLASTZ C elegans
#   longLabel All BLASTZ C elegans alignments
#   group compGeno
#   priority 111
#   visibility hide
#   color 100,50,0
#   altColor 255,240,200
#   spectrum on
#   type psl xeno ce1

# Consolidate AXT files to chrom level, sort, pick best, make psl.
ssh eieio
tcsh
set base="/cluster/store5/gs.16/build33/bed/blastzCe1"
set tbl="blastzBestCe1"
cd $base
mkdir -p axtBest pslBest
foreach chrdir (lav/chr*)
  set chr=$chrdir:t
  echo axtBesting $chr
  axtBest axtChrom/$chr.axt $chr axtBest/$chr.axt -minScore=300
  echo translating axtBest to psl for $chr
  axtToPsl axtBest/$chr.axt S1.len S2.len pslBest/${chr}_${tbl}.psl
end
# Load tables
ssh hgwdev
set base="/cluster/store5/gs.16/build33/bed/blastzCe1"
set tbl="blastzBestCe1"
cd $base/pslBest
hgLoadPsl hg15 chr*_${tbl}.psl
# Make /gbdb links and add them to the axtInfo table:
# Not done for build 32:
mkdir -p /gbdb/hg15/axtBestCe1
cd /gbdb/hg15/axtBestCe1
foreach f ($base/axtBest/chr*.axt)
  ln -s $f .
end
cd $base/axtBest
rm -f axtInfoInserts.sql
touch axtInfoInserts.sql
foreach f (/gbdb/hg15/axtBestCe1/chr*.axt)
  set chr=$f:t:r
  echo "INSERT INTO axtInfo VALUES ('ce1','Blastz Best in Genome','$chr','$f');" \
    >> axtInfoInserts.sql
end
# only perform the following if the axtInfo table does not yet
# exist in hg15:
# hgsql hg15 < ~/kent/src/hg/lib/axtInfo.sql
hgsql hg15 < axtInfoInserts.sql

# CHAINING elegans blastz
# next machine
ssh kk
cd /cluster/store5/worm/hg15/bed/blastzSelf
mkdir -p axtChain/run1
cd /cluster/store5/worm/ce1/bed/blastzSelf/axtChain/run1
ls -1S /cluster/store5/gs.16/build33/bed/blastzCe1/axtBest/*.axt > input.lst
cat << '_EOF_' > gsub
#LOOP
doChain {check in exists $(path1)} {check out line+ chain/$(root1).chain} {check out line+ out/$(root1).out}
#ENDLOOP
'_EOF_'
# worm nibs are on /cluster/bluearc now
cat << '_EOF_' > doChain
#!/bin/csh
axtChain $1 /cluster/store5/gs.16/build33/nib /cluster/bluearc/iscratch/i/worms/Celegans/bothMasksNib $2 > $3
'_EOF_'
chmod a+x doChain
mkdir out chain
gensub2 input.lst single gsub spec
para create spec
para try
para push
... etc ...
# Average job time:     502s  8.37m  0.14h  0.01d
# Longest job:          874s 14.57m  0.24h  0.01d

# now on the cluster server, sort chains
# next machine
ssh eieio
cd ~/ce1/bed/blastzSelf/axtChain
chainMergeSort run1/chain/*.chain > all.chain
chainSplit chain all.chain
# optionally: rm run1/chain/*.chain

# Load chains into database
# next machine
ssh hgwdev
cd ~/ce1/bed/blastzSelf/axtChain/chain
foreach i (*.chain)
  set c = $i:r
  hgLoadChain ce1 ${c}_chainSelf $i
  echo done $c
end
# Loading 845504 chains into ce1.chrI_chainSelf
# Loading 845307 chains into ce1.chrII_chainSelf
# Loading 1152790 chains into ce1.chrIII_chainSelf
# Loading 859566 chains into ce1.chrIV_chainSelf
# Loading 2 chains into ce1.chrM_chainSelf
# Loading 1114939 chains into ce1.chrV_chainSelf
# Loading 604293 chains into ce1.chrX_chainSelf
END OF BLASTZ C elegans
####################################################

TIGR GENE INDEX (TODO)
o   mkdir -p ~/hg15/bed/tigr
    cd ~/hg15/bed/tigr
    wget ftp://ftp.tigr.org/private/HGI_ren/TGI_track_HumanGenome_build33.tgz
    tar xvzf TGI*.tgz
    foreach f (*cattle*)
      set f1 = `echo $f | sed -e 's/cattle/cow/g'`
      mv $f $f1
    end
    foreach o (mouse cow human pig rat)
      setenv O $o
      foreach f (chr*_$o*s)
        tail +2 $f | perl -wpe 's/THC/TC/; s/(TH?C\d+)/$ENV{O}_$1/;' > $f.gff
      end
    end
    ldHgGene -exon=TC hg15 tigrGeneIndex *.gff

LOAD MOUSEREF TRACK (todo)
    First copy in data from eieio to ~/hg15/bed/mouseRef.  Then substitute
    'genome' for the appropriate chromosome in each of the alignment files
    (see the sketch below).  Finally do:
	hgRefAlign webb hg15 mouseRef *.alignments
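# A minimal sketch of the 'genome' substitution step above, assuming one
# alignment file per chromosome named like chr1.alignments (the file
# layout is an assumption):
foreach f (chr*.alignments)
  set c = $f:r
  sed "s/genome/$c/g" $f > $f.tmp
  mv $f.tmp $f
end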
LOAD AVID MOUSE TRACK (todo)
ssh cc98
cd ~/hg15/bed
mkdir avidMouse
cd avidMouse
wget http://pipeline.lbl.gov/tableCS-LBNL.txt
hgAvidShortBed *.txt avidRepeat.bed avidUnique.bed
hgLoadBed hg15 avidRepeat avidRepeat.bed
hgLoadBed hg15 avidUnique avidUnique.bed

LOAD SNPS (Done. Daryl Thomas; August 28, 2003)
# SNP processing has been condensed into a single script,
# which makes snpNih, snpTsc, and snpMap
# ${HOME}/kent/src/hg/snp/locations/processSnpLocations.csh
# snpBuild = 116
# Run from directory $oo/bed/snp/build$snpBuild/snpMap
mkdir -p $oo/bed/snp/build$snpBuild/snpMap
cd $oo/bed/snp/build$snpBuild/snpMap
processSnpLocations.csh hg15 human 33 116 >& log &
# check data:
# wc -l snpTsc.bed; hgsql hg15 -e "select count(*) from snpTsc"
# wc -l snpNih.bed; hgsql hg15 -e "select count(*) from snpNih"
# wc -l snpMap.bed; hgsql hg15 -e "select count(*) from snpMap"
# hgsql hg15 -e "select * from snpNih limit 5; desc snpNih; show indexes from snpNih"
# hgsql hg15 -e "select * from snpTsc limit 5; desc snpTsc; show indexes from snpTsc"
# hgsql hg15 -e "select * from snpMap limit 5; desc snpMap; show indexes from snpMap"
# remove temp files
# rm human* *bed.gz

LOAD SNP DETAILS (Done. Daryl Thomas; February 18, 2004)
# SNP processing has been condensed into a single script,
# which makes dbSnpRsHg
# ${HOME}/kent/src/hg/snp/details/processSnpDetails.csh
# snpBuild = 119
# Run from directory $oo/bed/snp/build$snpBuild/snpMap
mkdir -p $oo/bed/snp/build$snpBuild/details/Done
mkdir -p $oo/bed/snp/build$snpBuild/details/Observed
cd $oo/bed/snp/build$snpBuild/details
processSnpDetails.csh hg15 human 119 >& log &
# (inside the script, the table load is done via
#	load data local infile "$fileBase.out" into table $database.$table
#  and the loaded file is then compressed: gzip $fileBase.out)
# check data:
# hgsql hgFixed -e "select count(*) from dbSnpRsHg"
# hgsql hgFixed -e "select * from dbSnpRsHg limit 5; desc dbSnpRsHg; show indexes from dbSnpRsHg"
# remove temp files
# rm dbSnpRs*

LOAD ENSEMBL GENES (Finished 2003-09-26 braney)
cd ~/hg15/bed
mkdir ensembl
cd ensembl
# Get the ensembl protein data from
# http://www.ensembl.org/Homo_sapiens/martview
# Follow this sequence through the pages:
# Page 1) Make sure that the Homo_sapiens choice is selected. Hit next.
# Page 2) Uncheck the "Limit to" box in the region choice. Then hit next.
# Page 3) Choose the "Structures" box.
# Page 4) Choose GTF as the output, choose gzip compression and then
#	hit export.  Save as ensemblGene.gtf.gz
gunzip ensemblGene.gtf.gz
# Ensembl handles random chromosomes differently than us, so we
# strip this data.  Fortunately it just loses a couple of genes.
grep -v ^6_DR51 ensemblGene.gtf | grep -v _NT_ > unrandom.gtf
# Add "chr" to front of each line in the gene data gtf file to make
# it compatible with ldHgGene
# Note: these scripts copied out of ~matt/bin 7/3/03 KRR
../../jkStuff/ensemblAddChr.pl unrandom.gtf ensGene.gtf
# Not necessary for this build
#../../jkStuff/ensemblFixEns.pl ensGene.gtf ensFixed.gtf
# get rid of the ".1" or ".2" after the name
sed "s/\..\"/\"/g" ensGene.gtf > ensFixed.gtf
mv ensFixed.gtf ensGene.gtf
/cluster/bin/i386/ldHgGene hg15 ensGene ensGene.gtf

# Load Ensembl peptides:
# Get them from ensembl as above in the gene section except for
# Page 3) Choose the "Sequences" box.
# Page 4) Transcripts/Proteins.
# Save file as ensemblPep.fa.gz
# Here is FTP access
wget ftp://ftp.ensembl.org/pub/human-15.33/data/fasta/pep/Homo_sapiens.NCBI33.pep.fa.gz
gunzip Homo_sapiens.NCBI33.pep.fa.gz
cp Homo_sapiens.NCBI33.pep.fa ensemblPep.fa
# (load the peptides; same command form as in the reload section below)
hgPepPred hg15 ensembl ensemblPep.fa

# Go use the data mart link on www.ensembl.org to
# download a tab-separated file with the columns
# geneId/transcriptId/proteinId and put it in
# ensGtp.txt.
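# ensGtp.txt should look something like this (tab-separated; these IDs
# are made-up examples, not real Ensembl records):
#	ENSG00000100000	ENST00000200000	ENSP00000300000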
Then load it into database as so:
    hgsql hg15 < ~/kent/src/hg/lib/ensGtp.sql
    echo "load data local infile 'ensGtp.txt' into table ensGtp" | hgsql -N hg15

LOAD JAX ORTHOLOG
# Drop (just in case), create and load the table like this:
echo 'drop table jaxOrtholog;' | hgsql hg15
hgsql hg15 < ~/kent/src/hg/lib/jaxOrtholog.sql
set table=`pwd`/jaxOrtholog.tab
set cmd="load data local infile '$table' into table jaxOrtholog;"
echo $cmd | hgsql hg15
# fixes by Heather and Bob (2003-07-02)
# filter.awk had order of columns wrong
# create HMD_Human4.rpt.noheadfoot
# changed 2 humanSymbols from " - " to "-"
split.pl < HMD_Human4.rpt.noheadfoot > jaxOrtholog2.tab
set cmd="delete from jaxOrtholog;"
echo $cmd | hgsql hg15
# reload from the corrected file:
set table=`pwd`/jaxOrtholog2.tab
set cmd="load data local infile '$table' into table jaxOrtholog;"
echo $cmd | hgsql hg15

LOAD RNAGENES
ssh hgwdev
mkdir -p ~/hg15/bed/rnaGene
cd ~/hg15/bed/rnaGene
wget ftp://ftp.genetics.wustl.edu/pub/eddy/pickup/ncrna-hg15.gff.gz
gunzip -c ncrna-hg15.gff.gz | grep -v '^#' > contig.gff
liftUp chrom.gff ../../jkStuff/liftAll.lft warn contig.gff
echo 'drop table rnaGene;' | hgsql hg15
hgsql hg15 < ~/kent/src/hg/lib/rnaGene.sql
hgRnaGenes hg15 chrom.gff

LOAD EXOFISH (todo)
 - login to hgwdev
 - cd /cluster/store5/gs.16/build33/bed
 - mkdir exoFish
 - cd exoFish
 - hgsql hg15 < ~kent/src/hg/lib/exoFish.sql
 - Put email attachment from Olivier Jaillon (ojaaillon@genoscope.cns.fr)
   into /cluster/store5/gs.16/build33/bed/exoFish/all_maping_ecore
 - awk -f filter.awk all_maping_ecore > exoFish.bed
 - hgLoadBed hg15 exoFish exoFish.bed

LOAD MOUSE SYNTENY (TODO)
ssh hgwdev
mkdir -p ~/hg15/bed/mouseSyn
cd ~/hg15/bed/mouseSyn
# Saved Michael Kamal's email attachment:
#	allDirectedSegmentsBySize300.txt
# Process the .txt file (minus header) into a bed 6 + file:
grep -v "^#" allDirectedSegmentsBySize300.txt \
| awk '($6 > $5) {printf "%s\t%d\t%d\t%s\t%d\t%s\t%d\t%d\t%s\n", $4, $5-1, $6, $1, 999, $7, $2-1, $3, $8;} \
       ($5 > $6) {printf "%s\t%d\t%d\t%s\t%d\t%s\t%d\t%d\t%s\n", $4, $6-1, $5, $1, 999, $7, $2-1, $3, $8;}' \
> mouseSynWhd.bed
hgLoadBed -noBin -sqlTable=$HOME/kent/src/hg/lib/mouseSynWhd.sql \
  hg15 mouseSynWhd mouseSynWhd.bed

LOAD GENIE (todo)
 - cat */ctg*/ctg*.affymetrix.gtf > predContigs.gtf
 - liftUp predChrom.gtf ../../jkStuff/liftAll.lft warn predContigs.gtf
 - ldHgGene hg15 genieAlt predChrom.gtf
 - cat */ctg*/ctg*.affymetrix.aa > pred.aa
 - hgPepPred hg15 genie pred.aa
 - hgsql hg15
     mysql> delete from genieAlt where name like 'RS.%';
     mysql> delete from genieAlt where name like 'C.%';

# LOAD SOFTBERRY GENES (4/25/03 KRR)
mkdir -p ~/hg15/bed/softberry
cd ~/hg15/bed/softberry
set file = Softb_fgenesh_models_apr03.tar.gz
wget ftp://www.softberry.com/pub/SC_HUM_APR03/$file
gunzip -c $file | tar xvf -
ldHgGene hg15 softberryGene chr*.gff
hgPepPred hg15 softberry *.protein
hgSoftberryHom hg15 *.protein

# LOAD GENEID GENES (4/30/03 KRR  RELOADED -gtf 2004-04-02 kate)
mkdir ~/hg15/bed/geneid
cd ~/hg15/bed/geneid
mkdir download
cd download
# Now download *.gtf and *.prot from
set dir = www1.imim.es/genepredictions/H.sapiens/golden_path_20030410/geneid_v1.1/
foreach c (1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 X Y Un)
  wget http://$dir/chr$c.gtf
  wget http://$dir/chr${c}_random.gtf
  wget http://$dir/chr$c.prot
  wget http://$dir/chr${c}_random.prot
end
# Add missing .1 to protein id's
foreach f (*.prot)
  perl -wpe 's/^(>chr\w+)$/$1.1/' $f > $f:r-fixed.prot
end
cd ..
ldHgGene hg15 geneid download/*.gtf -gtf
# 32381 gene predictions
runGeneCheck .
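# (runGeneCheck writes a summary of problem counts -- badFrame, noStart,
#  noStop, gap -- like the one shown for TWINSCAN below)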
hgPepPred hg15 generic geneidPep download/*-fixed.prot

LOAD ACEMBLY (5/30/03 KRR)
mkdir -p ~/hg15/bed/acembly
cd ~/hg15/bed/acembly
# Get acembly*gene.gff from Jean and Danielle Thierry-Mieg
set build = ncbi_33
wget ftp://ftp.ncbi.nih.gov/repository/acedb/$build.human.genes/acembly.$build.genes.proteins.fasta.tar.gz
wget ftp://ftp.ncbi.nih.gov/repository/acedb/$build.human.genes/acembly.$build.genes.gff.tar.gz
gunzip -c acembly.$build.genes.gff.tar.gz | tar xvf -
gunzip -c acembly.$build.genes.proteins.fasta.tar.gz | tar xvf -
cd acembly.$build.genes.gff
# Save just the floating-contig features to different files for lifting
# and lift up the floating-contig features to chr*_random coords:
# NOTE: file prefix (x1) has been added since build 31
foreach f (x1.acemblygenes.*.gff)
  set c=$f:r:e
  egrep '^[a-zA-Z0-9]+\|NT_[0-9][0-9][0-9][0-9][0-9][0-9]' $f | \
    perl -wpe 's/^(\w+)\|(\w+)/$1\/$2/' > ctg-chr${c}_random.gff
  if (-e ../../../$c/lift/random.lft) then
    liftUp chr${c}_random.gff ../../../$c/lift/random.lft warn \
      ctg-chr${c}_random.gff
  endif
  # Strip out _random or floating contig lines from the normal chrom gff,
  # and add the "chr" prefix:
  grep -v ^$c\| $f | grep -v ^Hs | perl -wpe 's/^/chr/;' > chr$c.gff
end
cd ../acembly.$build.genes.proteins.fasta
#- Remove G_t*_ prefixes from acemblyproteins.*.fasta:
foreach f (x1.acemblyproteins.*.fasta)
  perl -wpe 's/^\>G_t[\da-zA-Z]+_/\>/' $f > chr$f:r:e.fa
end
#- Load into database:
cd ..
ldHgGene hg15 acembly acembly.$build.genes.gff/chr*.gff
hgPepPred hg15 generic acemblyPep \
  acembly.$build.genes.proteins.fasta/chr*.fa
unset build

LOAD GENOMIC DUPES (DONE - 2003-09-02 - Hiram)
o - Load genomic dupes
ssh hgwdev
mkdir -p /cluster/data/hg15/bed/segmentalDups
cd /cluster/data/hg15/bed/segmentalDups
wget http://humanparalogy.gene.cwru.edu/build33/files_for_ucsc/genomicSuperDups.txt.gz
gunzip genomicSuperDups.txt.gz
hgsql hg15 < ~kent/src/hg/lib/genomicSuperDups.sql
hgsql -e 'load data local infile "genomicSuperDups.txt" into table genomicSuperDups;' hg15
# The standard hgLoad commands do not work on this file.  This data
# is organized differently than any of our standard formats.
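# optional sanity check on the raw load (not in the original log; uses the
# same hgsql -e pattern as elsewhere in this doc):
# hgsql hg15 -e "select count(*) from genomicSuperDups"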
LOAD NCI60 (DONE: sugnet - Wed Apr 30 10:25:02 PDT 2003)
o - ssh hgwdev
cd /projects/cc/hg/mapplots/data/NCI60/dross_arrays_nci60/
mkdir hg15
cd hg15
findStanAlignments hg15 ../BC2.txt.ns ../../image/cumulative_plates.011204.list.human hg15.image.psl >& hg15.image.log
cp ../experimentOrder.txt ./
sed -e 's/ / \.\.\//g' < experimentOrder.txt > epo.txt
egrep -v unknown hg15.image.psl > hg15.image.good.psl
stanToBedAndExpRecs hg15.image.good.psl hg15.nci60.exp hg15.nci60.bed `cat epo.txt`
hg15S -A < ../../scripts/nci60.sql
echo "load data local infile 'hg15.nci60.bed' into table nci60" | hg15S -A
mkdir /cluster/store5/gs.16/build33/bed/nci60
mv hg15.nci60.bed /cluster/store5/gs.16/build33/bed/nci60
rm *.psl

CREATE AND LOAD AFFYU95 [ DONE: hartera 2004-03-30 ]
# Set up cluster job to align consensus/exemplar sequences for Affymetrix
# HG-U95Av2 chip to hg15
cp /projects/compbio/data/microarray/affyGnf/sequences/HG-U95/HG-U95Av2_all.fa /cluster/bluearc/affyGnf
ssh kkr1u00
cd /cluster/data/hg15/bed
rm -rf affyRatio.2004-03-19
mkdir affyRatio.2004-03-19
cd affyRatio.2004-03-19
mkdir -p /iscratch/i/affy
cp /cluster/bluearc/affyGnf/HG-U95Av2_all.fa /iscratch/i/affy
iSync
rm /cluster/bluearc/affyGnf/HG-U95Av2_all.fa

ssh kk
cd /cluster/data/hg15/bed/affyRatio.2004-03-19
ls -1 /iscratch/i/affy/HG-U95Av2_all.fa > affy.lst
ls -1S /scratch/hg/gs.16/build33/trfFa/ > allctg.lst
echo '#LOOP\n/cluster/bin/i386/blat -fine -mask=lower -minIdentity=95 -ooc=/scratch/hg/h/11.ooc /scratch/hg/gs.16/build33/trfFa/$(path1) $(path2) {check out line+ psl/$(root1)_$(root2).psl}\n#ENDLOOP' > template.sub
gensub2 allctg.lst affy.lst template.sub para.spec
mkdir psl
para create para.spec
# Actually do the job with usual para try/check/push/time etc.
# para time 2004-03-19
# Completed: 546 of 546 jobs
# CPU time in finished jobs:    9072s  151.20m  2.52h  0.10d  0.000 y
# IO & Wait Time:               5409s   90.15m  1.50h  0.06d  0.000 y
# Average job time:               27s    0.44m  0.01h  0.00d
# Longest job:                   272s    4.53m  0.08h  0.00d
# Submission to last job:        310s    5.17m  0.09h  0.00d

# Do sort, best in genome filter, and convert to chromosome coordinates
# to create affyU95.psl
pslSort dirs raw.psl tmp psl
# use these filter parameters for these sequences.  only use alignments that
# cover 30% of sequence and have at least 95% identity in aligned region.
# minAli = 0.97 too high.  low minCover as a lot of n's in these sequences
pslReps -minCover=0.3 -sizeMatters -minAli=0.95 -nearTop=0.005 raw.psl contig.psl /dev/null
liftUp affyU95.psl ../../jkStuff/liftAll.lft warn contig.psl
# eliminate the long names to keep names in dependent tables in sync
sed -e "s/U95Av2://" affyU95.psl | sed -e "s/;//" > affyU95shortQname.psl
hgLoadPsl hg15 -table=affyU95 affyU95shortQname.psl

# LOAD SEQUENCE DATA FOR AFFY U95Av2 FOR ALIGNMENT DISPLAY
# Copy probe sequence to /gbdb if it isn't already (they are already there
# for hg16)
mkdir -p /gbdb/hgFixed/affyProbes
cd /gbdb/hgFixed/affyProbes
ln -s /projects/compbio/data/microarray/affyGnf/sequences/HG-U95/HG-U95Av2_all.fa .
# load sequences with "U95Av2:" prefix removed so acc matches name used
# in other dependent tables for affy U95Av2
hgLoadSeq -abbr=U95Av2: hg15 /gbdb/hgFixed/affyProbes/HG-U95Av2_all.fa

# UPDATE AFFYRATIO (GNF) [DONE, hartera, 2004-03-30]
# Merge with spot data and load into database.  added -chip flag to
# affyPslAndAtlasToBed to allow correct parsing.  affyRatio needs updating
# because the consensus/exemplar sequences are now being used for the
# alignments in affyU95.
# Affy probe target regions were previously used for alignments in affyRatio.
ssh hgwdev
cd /cluster/data/hg15/bed/affyRatio.2004-03-19
/cluster/home/sugnet/bin/i386/affyPslAndAtlasToBed -chip=U95Av2 affyU95shortQname.psl /projects/compbio/data/microarray/affyGnf/human_atlas_U95_gnf.noquotes.txt affyRatio.bed affyRatio.exr >& affyPslAndAtlasToBed.log
hgLoadBed -sqlTable=$HOME/src/hg/lib/affyRatio.sql hg15 affyRatio affyRatio.bed
# Clean up
rm -r psl tmp err *.tab *.debug batch.bak contig.psl raw.psl batch template.sub *.lst para*

# LOAD SAGE DATA (DONE - 2003-05-14 - Hiram)
ssh hgwdev
cd ~/kent/src/hg/sage
make
# XXX = uniGene build for which SAGE was built -- not necessarily current!
# Figure out the build number by peeking at this file:
wget -O - ftp://ftp.ncbi.nih.gov/pub/sage/map/info.txt 2> /dev/null
# set Version = XXX
set Version=160
mkdir /projects/cc/hg/sugnet/sage/sage.$Version
cd /projects/cc/hg/sugnet/sage/sage.$Version
ncftp ftp://ftp.ncbi.nih.gov/pub/sage
  mget -R map/readme.txt map/info.txt extr info map/Hs
  quit
mkdir map
mv Hs map
cd map/Hs/NlaIII
unzip -j SAGEmap_tag_ug-rel.zip
cd ../../../extr/
../../scripts/summarizeCounts.pl expCounts.tab ./SAGE_*
../../scripts/countGenesPerTag.pl expCounts.tab allTags.count.tab
../../scripts/createArraysForTags.pl allTags.count.tab tagExpArrays.tab \
  ./SAGE_*
../../scripts/countsPerExp.pl expCounts.tab expList.tab
cd ../map/Hs/NlaIII/
perl -e 'while (<>) { \
  chomp($_); \
  @p = split(/\t/, $_); \
  print "$p[2]\t$p[3]\t$p[0]\n"\
  }' \
  < SAGEmap_tag_ug-rel | sort | sed -e 's/ /_/g' \
  > SAGEmap_ug_tag-rel_Hs
cd -
createSageSummary ../map/Hs/NlaIII/SAGEmap_ug_tag-rel_Hs \
  tagExpArrays.tab sageSummary.sage
# Create the uniGene alignments
#	~/hg15/uniGene/hg15.uniGene.lifted.pslReps.psl
#	-- see "MAKE UNIGENE ALIGNMENTS" below
cd /projects/cc/hg/sugnet/sage/sage.$Version/extr
addAveMedScoreToPsls \
  ~/hg15/bed/uniGene.$Version/hg15.uniGene.lifted.pslReps.psl \
  sageSummary.sage uniGene.wscores.bed
hgLoadBed hg15 uniGene_2 uniGene.wscores.bed
hgsql hg15 < ~kent/src/hg/lib/sage.sql
echo "load data local infile 'sageSummary.sage' into table sage" \
  | hgsql hg15
cd ../info
../../scripts/parseRecords.pl ../extr/expList.tab > sageExp.tab
hgsql hg15 < ~/kent/src/hg/lib/sageExp.sql
echo "load data local infile 'sageExp.tab' into table sageExp" | hgsql hg15
# update ~/kent/src/hg/makeDb/trackDb/human/hg15/uniGene_2.html
# with current uniGene date.

# MAKE UNIGENE ALIGNMENTS (DONE - 2003-05-15 - Hiram)
# Download of the latest UniGene version is now automated by a
# cron job -- see /cluster/home/angie/crontab ,
# /cluster/home/angie/unigeneVers/unigene.csh .
# If hgwdev gets rebooted, that needs to be restarted... maybe there's
# a more stable place to set up that cron job.
# substitute XXX -> the uniGene version used by SAGE, if building the
# uniGene/SAGE track; or just the latest uniGene version in
# /projects/cc/hg/sugnet/uniGene/ , if doing uniGene alignments only.
# set Version = XXX
set Version = 160
cd /projects/cc/hg/sugnet/uniGene/uniGene.$Version
gunzip Hs.seq.uniq.gz Hs.data.gz
../countSeqsInCluster.pl Hs.data counts.tab
../parseUnigene.pl Hs.seq.uniq Hs.seq.uniq.simpleHeader.fa leftoverData.tab

# Distribute UniGene sequence to /iscratch/i/ (kkstore can see /projects)
ssh kkstore
set Version = 160	# same as above
mkdir -p /iscratch/i/uniGene.$Version
cp -p \
  /projects/cc/hg/sugnet/uniGene/uniGene.$Version/Hs.seq.uniq.simpleHeader.fa \
  /iscratch/i/uniGene.$Version
ssh kkr1u00
~kent/bin/iSync

ssh kk
set Version = 160	# same as above
mkdir -p ~/hg15/bed/uniGene.$Version
cd ~/hg15/bed/uniGene.$Version
ls -1S /scratch/hg/gs.16/build33/trfFa/*.fa > allctg.lst
# ls -1S /cluster/store5/gs.16/build33/trfFa/* > allctg.lst
ls -1S /iscratch/i/uniGene.$Version/Hs.seq.uniq.simpleHeader.fa \
  > uniGene.lst
echo '#LOOP\n/cluster/bin/i386/blat -mask=lower -minIdentity=95 -ooc=/scratch/hg/h/11.ooc $(path1) $(path2) {check out line+ psl/$(root1)_$(root2).psl}\n#ENDLOOP' > template.sub
gensub2 allctg.lst uniGene.lst template.sub para.spec
para create para.spec
mkdir psl
para try
para check
para push

# ssh eieio
set Version = 160	# same as above
cd ~/hg15/bed/uniGene.$Version
pslSort dirs raw.psl tmp psl >& pslSort.log
liftUp -type=.psl stdout ../../jkStuff/liftAll.lft warn raw.psl \
  | pslReps -minCover=0.2 -sizeMatters -minAli=0.965 -nearTop=0.002 \
    stdin hg15.uniGene.lifted.pslReps.psl /dev/null
# use hg15.uniGene.lifted.pslReps.psl for building SAGE track (above).

# LOADING MOUSE MM3 BLASTZ ALIGNMENTS FROM PENN STATE: (DONE 041303)
# Translate Penn State .lav files into sorted axt:
ssh eieio
set base="/cluster/store5/gs.16/build33/bed/blastz.mm3.2003-04-12-03-MS"
set seq1_dir="/cluster/store5/gs.16/build33/nib/"
set seq2_dir="/cluster/store2/mm.2003.02/mm3/mixedNib/"
set tbl="blastzMm3"
cd $base
mkdir -p axtChrom
foreach c (lav/*)
  pushd $c
  set chr=$c:t
  set out=$base/axtChrom/$chr.axt
  echo "Translating $chr lav to $out"
  cat `ls -1 *.lav | sort -g` \
    | lavToAxt stdin $seq1_dir $seq2_dir stdout \
    | axtSort stdin $out
  popd
end

# Translate the sorted axt files into psl:
cd $base
mkdir -p pslChrom
foreach f (axtChrom/chr*.axt)
  set c=$f:t:r
  echo $c
  axtToPsl $f S1.len S2.len pslChrom/${c}_${tbl}.psl
end

# Load tables
ssh hgwdev
set base="/cluster/store5/gs.16/build33/bed/blastz.mm3.2003-04-12-03-MS"
set tbl="blastzMm3"
cd $base/pslChrom
hgLoadPsl hg15 chr*_${tbl}.psl

# MAKING THE BLASTZBESTMOUSE TRACK FROM PENN STATE MM3 AXT FILES (DONE - 041303)
# Consolidate AXT files to chrom level, sort, pick best, make psl.
ssh eieio
tcsh
set base="/cluster/store5/gs.16/build33/bed/blastz.mm3.2003-04-12-03-MS"
set tbl="blastzBestMm3"
cd $base
mkdir -p axtBest pslBest
foreach chrdir (lav/chr*)
  set chr=$chrdir:t
  echo axtBesting $chr
  axtBest axtChrom/$chr.axt $chr axtBest/$chr.axt -minScore=300
  echo translating axtBest to psl for $chr
  axtToPsl axtBest/$chr.axt S1.len S2.len pslBest/${chr}_${tbl}.psl
end

# Load tables
ssh hgwdev
set base="/cluster/store5/gs.16/build33/bed/blastz.mm3.2003-04-12-03-MS"
set tbl="blastzBestMm3"
cd $base/pslBest
hgLoadPsl hg15 chr*_${tbl}.psl

# Make /gbdb links and add them to the axtInfo table:
# Not done for build 32:
mkdir -p /gbdb/hg15/axtBestMm3
cd /gbdb/hg15/axtBestMm3
foreach f ($base/axtBest/chr*.axt)
  ln -s $f .
end
cd $base/axtBest
rm -f axtInfoInserts.sql
touch axtInfoInserts.sql
foreach f (/gbdb/hg15/axtBestMm3/chr*.axt)
  set chr=$f:t:r
  echo "INSERT INTO axtInfo VALUES ('mm3','Blastz Best in Genome','$chr','$f');" \
    >> axtInfoInserts.sql
end
hgsql hg15 < ~/kent/src/hg/lib/axtInfo.sql
hgsql hg15 < axtInfoInserts.sql

# MAKING THE AXTTIGHT FROM AXTBEST (DONE 041203)
# After creating axtBest alignments above, use subsetAxt to get axtTight:
ssh eieio
cd ~/hg15/bed/blastz.mm3.2003-04-12-03-MS/axtBest
mkdir -p ../axtTight
tcsh
foreach i (*.axt)
  echo $i
  subsetAxt $i ../axtTight/$i \
    ~kent/src/hg/mouseStuff/subsetAxt/coding.mat 3400
end
# translate to psl
cd ../axtTight
mkdir -p ../pslTight
foreach i (*.axt)
  set c = $i:r
  axtToPsl $i ../S1.len ../S2.len ../pslTight/${c}_blastzTightMm3.psl
end
# Load tables into database
ssh hgwdev
cd ~/hg15/bed/blastz.mm3.2003-04-12-03-MS/pslTight
hgLoadPsl hg15 chr*_blastzTightMm3.psl

# RUNNING MM4 BLASTZ (IN PROGRESS 2003-07-10 kate)
ssh eieio
mkdir -p /cluster/data/hg15/bed/blastz.mm4
cd /cluster/data/hg15/bed/blastz.mm4
cat << '_EOF_' > DEF
# human/mouse
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/home/angie/schwartzbin:/cluster/home/kent/bin/i386

ALIGN=blastz-run
BLASTZ=blastz
BLASTZ_H=2000
BLASTZ_ABRIDGE_REPEATS=1

# TARGET
SEQ1_DIR=/iscratch/i/gs.16/build33/chromTrfMixedNib/
SEQ1_RMSK=/iscratch/i/gs.16/build33/rmsk/
SEQ1_SMSK=/iscratch/i/gs.16/build33/linSpecRep/
SEQ1_FLAG=-primate
SEQ1_IN_CONTIGS=0
SEQ1_CHUNK=10000000
SEQ1_LAP=10000

# QUERY
SEQ2_DIR=/cluster/bluearc/mm4/mixedNib/
# not currently used
SEQ2_RMSK=/cluster/bluearc/mm4/rmsk/
SEQ2_SMSK=/cluster/bluearc/mm4/linSpecRep/
SEQ2_FLAG=-rodent
SEQ2_IN_CONTIGS=0
SEQ2_CHUNK=30000000
SEQ2_LAP=0

BASE=/cluster/store5/gs.16/build33/bed/blastz.mm4
DEF=$BASE/DEF
RAW=$BASE/raw
CDBDIR=$BASE
SEQ1_LEN=$BASE/S1.len
SEQ2_LEN=$BASE/S2.len
'_EOF_'
# save the DEF file in the current standard place
chmod +x DEF
cp DEF ~angie/hummus/DEF.hg15-mm4.2003-07-10

ssh kk
cd ~/hg15/bed/blastz.mm4
# source the DEF file
bash
. ./DEF
# follow the next set of directions slavishly
mkdir -p $BASE/run
# give up on avoiding angie's directories
# tcl script
# creates xdir.sh and joblist run/j
~angie/hummus/make-joblist $DEF > $BASE/run/j
# xdir.sh makes a bunch of result directories in $BASE/raw/
# based on chrom name and CHUNK size
sh $BASE/xdir.sh
cd $BASE/run
# now edit j to prefix path to executable name
# NOTE: we should have a controlled version of schwartz bin executables
sed -e 's#^#/cluster/home/angie/schwartzbin/#' j > j2
wc -l j*
head j2
# make sure the j2 edits are OK, then use it:
mv j2 j
# para create will create the file: 'batch' for the cluster run
para create j
# 44460 jobs written to batch
para try
para check
para push
# ... etc ...
# 1.3 hr longest job

# post-process blastz
ssh eieio
cd ~/hg15/bed/blastz.mm4
# source the DEF file again in case you are coming back to this
bash
. ./DEF
# a new run directory
mkdir -p $BASE/run.1
# copy files to cluster fileserver
clusterdir=/cluster/bluearc/hg/gs.16/build33/bed/blastz.mm4
mkdir -p $clusterdir/{raw,lav}
cp -r $BASE/raw $clusterdir
# create a new job list to convert out files to lav
/cluster/bin/scripts/blastz-make-out2lav $DEF $clusterdir \
  > $BASE/run.1/jobList
cd $BASE/run.1
# make sure the job list is OK
wc -l jobList
# 342 jobs
head jobList
# run on cluster
ssh kkr9u01
cd ~/hg15/bed/blastz.mm4/run.1
para create jobList
para try
para check
para push
# etc.
# 10 minutes total

# convert lav files to axt
ssh eieio
set base="/cluster/bluearc/hg/gs.16/build33/bed/blastz.mm4"
set seq1_dir="/cluster/data/hg15/nib"
set seq2_dir="/cluster/data/mm4/softNib"
/cluster/bin/scripts/blastz-lav2axt $base $seq1_dir $seq2_dir

# BEGINNING OF RAT BLASTZ
# SET UP RAT BLASTZ CLUSTER RUNS
# FIRST FOLLOW INSTRUCTIONS IN ~angie/hummus/Notes

# LOADING RAT RN2 BLASTZ ALIGNMENTS FROM PENN STATE:
# DONE - 2003-05-09 - Hiram, to be verified
# Translate Penn State .lav files into sorted axt:
ssh eieio
set base="/cluster/store5/gs.16/build33/bed/blastz.rn2.2003-05-06"
set seq1_dir="/cluster/store5/gs.16/build33/nib/"
set seq2_dir="/cluster/store4/rn2/nib/"
set tbl="blastzRn2"
cd $base
mkdir -p axtChrom
foreach c (lav/*)
  pushd $c
  set chr=$c:t
  set out=$base/axtChrom/$chr.axt
  echo "Translating $chr lav to $out"
  cat `ls -1 *.lav | sort -g` \
    | lavToAxt stdin $seq1_dir $seq2_dir stdout \
    | axtSort stdin $out
  popd
end

# Translate the sorted axt files into psl:
cd $base
mkdir -p pslChrom
foreach f (axtChrom/chr*.axt)
  set c=$f:t:r
  /cluster/bin/i386/axtToPsl $f S1.len S2.len pslChrom/${c}_${tbl}.psl
end

# Load tables
ssh hgwdev
set base="/cluster/store5/gs.16/build33/bed/blastz.rn2.2003-05-06"
set tbl="blastzRn2"
cd $base/pslChrom
/cluster/bin/i386/hgLoadPsl hg15 chr*_${tbl}.psl

# MAKING THE BLASTZBESTRAT TRACK FROM PENN STATE RN2 AXT FILES
# DONE - 2003-05-09 - Hiram, to be verified
# Consolidate AXT files to chrom level, sort, pick best, make psl.
ssh eieio
set base="/cluster/store5/gs.16/build33/bed/blastz.rn2.2003-05-06"
set tbl="blastzBestRn2"
cd $base
mkdir -p axtBest pslBest
foreach chrdir (lav/chr*)
  set chr=$chrdir:t
  echo axtBesting $chr
  /cluster/bin/i386/axtBest axtChrom/$chr.axt $chr axtBest/$chr.axt -minScore=300
  echo translating axtBest to psl for $chr
  /cluster/bin/i386/axtToPsl axtBest/$chr.axt S1.len S2.len pslBest/${chr}_${tbl}.psl
end

# Load tables
ssh hgwdev
set base="/cluster/store5/gs.16/build33/bed/blastz.rn2.2003-05-06"
set tbl="blastzBestRn2"
cd $base/pslBest
/cluster/bin/i386/hgLoadPsl hg15 chr*_${tbl}.psl

# Make /gbdb links and add them to the axtInfo table:
# Not done for build 32:
mkdir -p /gbdb/hg15/axtBestRn2
cd /gbdb/hg15/axtBestRn2
foreach f ($base/axtBest/chr*.axt)
  ln -s $f .
end
cd $base/axtBest
rm -f axtInfoInserts.sql
touch axtInfoInserts.sql
foreach f (/gbdb/hg15/axtBestRn2/chr*.axt)
  set chr=$f:t:r
  echo "INSERT INTO axtInfo VALUES ('rn2','Blastz Best in Genome','$chr','$f');" \
    >> axtInfoInserts.sql
end
/cluster/bin/i386/hgsql hg15 < ~/kent/src/hg/lib/axtInfo.sql
/cluster/bin/i386/hgsql hg15 < axtInfoInserts.sql

# MAKING THE AXTTIGHT FROM AXTBEST
# DONE - 2003-05-09 - Hiram, to be verified
# After creating axtBest alignments above, use subsetAxt to get axtTight:
ssh eieio
cd /cluster/store5/gs.16/build33/bed/blastz.rn2.2003-05-06/axtBest
mkdir -p ../axtTight
foreach i (*.axt)
  subsetAxt $i ../axtTight/$i \
    ~kent/src/hg/mouseStuff/subsetAxt/coding.mat 3400
end
# translate to psl
cd ../axtTight
mkdir -p ../pslTight
foreach i (*.axt)
  set c = $i:r
  axtToPsl $i ../S1.len ../S2.len ../pslTight/${c}_blastzTightRn2.psl
end
# Load tables into database
ssh hgwdev
cd /cluster/store5/gs.16/build33/bed/blastz.rn2.2003-05-06/pslTight
hgLoadPsl hg15 chr*_blastzTightRn2.psl

# BLASTZ RAT RN3 (DONE 2003-07-09 kate)
# Note: these alignments are swapped and displayed on Rn3 browser
ssh eieio
mkdir -p /cluster/data/hg15/bed/blastz.rn3
cd /cluster/data/hg15/bed/blastz.rn3
cat << '_EOF_' > DEF
# human vs. rat
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/home/angie/schwartzbin:/cluster/home/kent/bin/i386

ALIGN=blastz-run
BLASTZ=blastz
BLASTZ_H=2000
BLASTZ_ABRIDGE_REPEATS=1

# TARGET
SEQ1_DIR=/iscratch/i/gs.16/build33/chromTrfMixedNib/
SEQ1_RMSK=/iscratch/i/gs.16/build33/rmsk/
SEQ1_SMSK=/iscratch/i/gs.16/build33/linSpecRep/
SEQ1_FLAG=-primate
SEQ1_IN_CONTIGS=0
SEQ1_CHUNK=10000000
SEQ1_LAP=10000

# QUERY
SEQ2_DIR=/cluster/bluearc/rat/rn3/softNib/
# not currently used
SEQ2_RMSK=/cluster/bluearc/rat/rn3/rmsk/
SEQ2_SMSK=/cluster/bluearc/rat/rn3/linSpecRep/
SEQ2_FLAG=-rodent
SEQ2_IN_CONTIGS=0
SEQ2_CHUNK=30000000
SEQ2_LAP=0

BASE=/cluster/store5/gs.16/build33/bed/blastz.rn3
DEF=$BASE/DEF
RAW=$BASE/raw
CDBDIR=$BASE
SEQ1_LEN=$BASE/S1.len
SEQ2_LEN=$BASE/S2.len
'_EOF_'
# save the DEF file in the current standard place
chmod +x DEF
cp DEF ~angie/hummus/DEF.hg15-rn3.2003-07-09

ssh kk
cd ~/hg15/bed/blastz.rn3
# source the DEF file
bash
. ./DEF
# follow the next set of directions slavishly
mkdir -p $BASE/run
# give up on avoiding angie's directories
# tcl script
# creates xdir.sh and joblist run/j
~angie/hummus/make-joblist $DEF > $BASE/run/j
# xdir.sh makes a bunch of result directories in $BASE/raw/
# based on chrom name and CHUNK size
sh $BASE/xdir.sh
cd $BASE/run
# now edit j to prefix path to executable name
# NOTE: we should have a controlled version of schwartz bin executables
sed -e 's#^#/cluster/home/angie/schwartzbin/#' j > j2
wc -l j*
head j2
# make sure the j2 edits are OK, then use it:
mv j2 j
# para create will create the file: 'batch' for the cluster run
para create j
# 1400 jobs written to batch
para try
para check
para push
# ... etc ...
# 1.3 hr longest job

# post-process blastz
ssh eieio
cd ~/hg15/bed/blastz.rn3
# source the DEF file again in case you are coming back to this
bash
. ./DEF
# a new run directory
mkdir -p $BASE/run.1
# copy files to cluster fileserver
clusterdir=/cluster/bluearc/hg/gs.16/build33/bed/blastz.rn3
mkdir -p $clusterdir/{raw,lav}
cp -r $BASE/raw $clusterdir
# create a new job list to convert out files to lav
/cluster/bin/scripts/blastz-make-out2lav $DEF $clusterdir \
  > $BASE/run.1/jobList
cd $BASE/run.1
# make sure the job list is OK
wc -l jobList
# 342 jobs
head jobList
# run on cluster
ssh kkr9u01
cd ~/hg15/bed/blastz.rn3/run.1
para create jobList
para try
para check
para push
# etc.
# 10 minutes total

# convert lav files to axt and sort
ssh kkstore
# (this gsub template appears to be a leftover from a TRF run; it is not
#  used in the steps below)
cat << '_EOF_' > gsub
#LOOP
./runTrf {check in line+ $(path1)} {check out line trf/$(root1).bed}
#ENDLOOP
'_EOF_'
cd /cluster/data/hg15/bed/blastz.rn3
set base="/cluster/bluearc/hg/gs.16/build33/bed/blastz.rn3"
set seq1_dir="/cluster/data/hg15/nib"
set seq2_dir="/cluster/data/rn3/softNib"
/cluster/bin/scripts/blastz-lav2axt $base $seq1_dir $seq2_dir >&! \
  lav2axt.log &
tail -100f lav2axt.log
# 3 hours total ?  No problems with axtSort memory
# copy axt's from cluster data area to permanent storage area
rm -fr axtChrom
cp -r $base/axtChrom .
# translate sorted axt files into psl
# (just for featureBits alignment quality check)
# 2003-09-04 kate
ssh eieio
cd /cluster/data/hg15/bed/blastz.rn3
mkdir -p pslChrom
set tbl="blastzRn3"
foreach f (axtChrom/chr*.axt)
  set c=$f:t:r
  echo $c
  /cluster/bin/i386/axtToPsl $f S1.len S2.len pslChrom/${c}_${tbl}.psl
end
# Takes about 20 minutes

# BEST RAT RN3 ALIGNMENTS (DONE 2003-07-26 kate)
# Consolidate AXT files to chrom level, sort, pick best
ssh eieio
cd /cluster/data/hg15/bed/blastz.rn3
mkdir -p axtBest pslBest
mkdir run.2
cd run.2
# create script to filter files
cat << '_EOF_' > doBestAxt
#!/bin/csh -f
# usage: doBestAxt chr axt-file best-file psl-file
#/cluster/bin/i386/axtBest $2 $1 $3 -minScore-300
# incorrect option above, should be:
/cluster/bin/i386/axtBest $2 $1 $3 -minScore=300
sleep 1
/cluster/bin/i386/axtToPsl $3 /cluster/data/hg15/bed/blastz.rn3/S1.len /cluster/data/hg15/bed/blastz.rn3/S2.len $4
'_EOF_'
chmod +x doBestAxt
cd ../axtChrom
ls -1S | sed 's/.axt$//' > ../run.2/chrom.list
cd ../run.2
# create template for cluster job
cat << '_EOF_' > gsub
#LOOP
doBestAxt $(root1) /cluster/bluearc/hg/hg15/bed/blastz.rn3/axtChrom/$(root1).axt {check out line+ /cluster/data/hg15/bed/blastz.rn3/axtBest/$(root1).axt} {check out line+ /cluster/data/hg15/bed/blastz.rn3/pslBest/$(root1)_blastzBestRn3.psl}
#ENDLOOP
'_EOF_'
gensub2 chrom.list single gsub jobList
wc -l jobList
# 44 jobs
head jobList

ssh kk
cd /cluster/data/hg15/bed/blastz.rn3
cd run.2
para create jobList
para try
para check
para push
# 1 hour ?

# load into database
ssh hgwdev
cd /cluster/data/hg15/bed/blastz.rn3
cd pslBest
set tbl="blastzBestRn3"
hgLoadPsl hg15 chr*_${tbl}.psl
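# optional check of alignment coverage (not in the original log; same
# featureBits pattern as used for the multiz tracks below):
# featureBits hg15 blastzBestRn3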
# BEGINNING OF HUMAN BLASTZ
# LOADING HUMAN HG15 (SELF) BLASTZ ALIGNMENTS: (TO DO)
# Translate Penn State .lav files into sorted axt, with alignments
# to self/diagonal dropped:
ssh eieio
set base="/cluster/store5/gs.16/build33/bed/blastz.hg15.2003-06-06"
set seq1_dir="/cluster/store5/gs.16/build33/mixedNib/"
set seq2_dir="/cluster/store5/gs.16/build33/mixedNib/"
set tbl="blastzHuman"
cd $base
mkdir -p axtChrom
# sometimes alignments are so huge that they cause axtSort to run out
# of memory.  Run them in two passes like this:
foreach c (lav/*)
  pushd $c
  set chr=$c:t
  set out=$base/axtChrom/$chr.axt
  echo "Translating $chr lav to $out"
  foreach d (*.lav)
    set smallout=$d.axt
    lavToAxt $d $seq1_dir $seq2_dir stdout \
      | axtDropSelf stdin stdout \
      | axtSort stdin $smallout
  end
  cat `ls -1 *.lav.axt | sort -g` \
    > $out
  popd
end
# STOPPED HERE -- big data, low demand.

# Translate the sorted axt files into psl:
cd $base
mkdir -p pslChrom
foreach f (axtChrom/chr*.axt)
  set c=$f:t:r
  echo translating $c.axt to psl
  axtToPsl $f S1.len S2.len pslChrom/${c}_${tbl}.psl
end

# Load tables
ssh hgwdev
set base="/cluster/store5/gs.16/build33/bed/blastz.hg15.2003-03-18-ASH"
set tbl="blastzHuman"
cd $base/pslChrom
hgLoadPsl hg15 chr*_${tbl}.psl

# MAKING THE BLASTZBESTHUMAN TRACK FROM UNFILTERED AXT FILES (TO DO)
# Consolidate AXT files to chrom level, sort, pick best, make psl.
ssh eieio
set base="/cluster/store5/gs.16/build33/bed/blastz.hg15.2003-03-18-ASH"
set tbl="blastzBestHuman"
cd $base
mkdir -p axtBest pslBest
# run axtBest in 2 passes to reduce size of the input to final axtBest:
foreach chrdir (lav/*)
  set chr=$chrdir:t
  echo two-pass axtBesting $chr
  foreach a ($chrdir/*.axt)
    axtBest $a $chr $a:r.axtBest
  end
  cat `ls -1 $chrdir/*.axtBest | sort -g` \
    > $chrdir/$chr.axtBestPieces
  axtBest $chrdir/$chr.axtBestPieces $chr axtBest/$chr.axt
  axtToPsl axtBest/$chr.axt S1.len S2.len pslBest/${chr}_${tbl}.psl
end

# Load tables
ssh hgwdev
set base="/cluster/store5/gs.16/build33/bed/blastz.hg15.2003-03-18-ASH"
set tbl="blastzBestHuman"
cd $base/pslBest
hgLoadPsl hg15 chr*_${tbl}.psl

# Make /gbdb links and add them to the axtInfo table:
# Not done for build 32:
mkdir -p /gbdb/hg15/axtBestHg15
cd /gbdb/hg15/axtBestHg15
foreach f ($base/axtBest/chr*.axt)
  ln -s $f .
end
cd $base/axtBest
rm -f axtInfoInserts.sql
touch axtInfoInserts.sql
foreach f (/gbdb/hg15/axtBestHg15/chr*.axt)
  set chr=$f:t:r
  echo "INSERT INTO axtInfo VALUES ('hg15','Blastz Best Human Self','$chr','$f');" \
    >> axtInfoInserts.sql
end
hgsql hg15 < ~/kent/src/hg/lib/axtInfo.sql
hgsql hg15 < axtInfoInserts.sql

# MAKING THE AXTTIGHT FROM AXTBEST (TO DO)
# After creating axtBest alignments above, use subsetAxt to get axtTight:
ssh eieio
cd ~/hg15/bed/blastz.hg15.2003-03-18-ASH/axtBest
mkdir -p ../axtTight
foreach i (*.axt)
  subsetAxt $i ../axtTight/$i \
    ~kent/src/hg/mouseStuff/subsetAxt/coding.mat 3400
end
# translate to psl
cd ../axtTight
mkdir -p ../pslTight
foreach i (*.axt)
  set c = $i:r
  axtToPsl $i ../S1.len ../S2.len ../pslTight/${c}_blastzTightHuman.psl
end
# Load tables into database
ssh hgwdev
cd ~/hg15/bed/blastz.hg15.2003-03-18-ASH/pslTight
hgLoadPsl hg15 chr*_blastzTightHuman.psl
# XXX END OF HUMAN BLASTZ

# LIFTING REPEATMASKER .ALIGN FILES (TODO)
foreach d (?{,?}/NT_??????)
  set c=$d:t
  cd $d
  echo $c to $c.fa.align
  /cluster/bin/scripts/liftRMAlign.pl $c.lft > $c.fa.align
  cd ../..
end
foreach chr (?{,?})
  cd $chr
  echo making symbolic links for chr$chr NT .fa.align files
  foreach ctg (NT_??????)
    ln -s $ctg/$ctg.fa.align
  end
  cd ..
  if (-e $chr/lift/ordered.lft) then
    echo making $chr/chr$chr.fa.align
    /cluster/bin/scripts/liftRMAlign.pl $chr/lift/ordered.lft \
      > $chr/chr$chr.fa.align
  endif
  if (-e $chr/lift/random.lft) then
    echo making $chr/chr${chr}_random.fa.align
    /cluster/bin/scripts/liftRMAlign.pl $chr/lift/random.lft \
      > $chr/chr${chr}_random.fa.align
  endif
  echo removing symbolic links for chr$chr NT .fa.align files
  rm $chr/NT_??????.fa.align
end

# TWINSCAN GENE PREDICTIONS (5/1/03 KRR  RELOADED -gtf 2004-04-02 kate)
mkdir -p ~/hg15/bed/twinscan
cd ~/hg15/bed/twinscan
foreach c (1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 X Y)
  rm -f chr$c.{gtf,ptx}
  set dir = genome.cs.wustl.edu/~bio/human/NCBI33
  wget http://$dir/gtf/chr$c.gtf
  wget http://$dir/ptx/chr$c.ptx
  # replace first field (sequence identifier) with chrom name
  perl -wpe "s/^\d+/chr$c/" < chr$c.gtf > chr$c-fixed.gtf
  # pare down protein FASTA header to id and add missing .a:
  perl -wpe 's/^(\>\S+)\s.*$/$1.a/' < chr$c.ptx > chr$c-fixed.fa
end
ldHgGene hg15 twinscan chr*-fixed.gtf -gtf
# 25434 gene predictions
runGeneCheck .
# 44 badFrame
# 29 noStart
# 18 gap
#  5 noStop
hgPepPred hg15 generic twinscanPep chr*-fixed.fa

# LOAD CHIMP DATA (TODO)
# Download the chimp sequence and distribute to /iscratch/i
ssh hgwdev
mkdir /cluster/store1/chimpSeq
cd /cluster/store1/chimpSeq
wget http://www.cs.uni-duesseldorf.de/~ebersber/annotation_track_chimp/downloads/mpi-aligned_seqparts_jun02.fa.gz
gunzip *.gz
ssh kkr1u00
mkdir /iscratch/i/chimp
cp -p /cluster/store1/chimpSeq/*.fa /iscratch/i/chimp/
# Make sure it unpacked OK
~kent/bin/iSync

ssh kk
mkdir ~/hg15/bed/blatChimp
cd ~/hg15/bed/blatChimp
cp ~/hg13/bed/blatChimp/gsub .
ls -1S /iscratch/i/chimp/*.fa > chimp.lst
ls -1S /scratch/hg/gs.16/build33/trfFa.1204/*.fa.trf > human.lst
mkdir psl
gensub2 human.lst chimp.lst gsub spec
para create spec
para try
para push
para check

# Sort alignments as so
ssh eieio
cd ~/hg15/bed/blatChimp
pslCat -dir psl \
  | liftUp -type=.psl stdout ~/hg15/jkStuff/liftAll.lft warn stdin \
  | pslSortAcc nohead chrom temp stdin
pslCat -dir chrom > blatChimp.psl
ssh hgwdev
cd ~/hg15/bed/blatChimp
hgLoadPsl hg15 blatChimp.psl

# SGP GENE PREDICTIONS (DONE - 2003-05-14 - Hiram - to be verified)
mkdir -p ~/hg15/bed/sgp/download
cd ~/hg15/bed/sgp/download
foreach f (~/hg15/?{,?}/chr?{,?}{,_random}.fa)
  set chr = $f:t:r
  wget http://genome.imim.es/genepredictions/H.sapiens/golden_path_20030410/SGP/$chr.gtf
  wget http://genome.imim.es/genepredictions/H.sapiens/golden_path_20030410/SGP/$chr.prot
end
wget http://genome.imim.es/genepredictions/H.sapiens/golden_path_20030410/SGP/chrUn.gtf -O chrUn_random.gtf
wget http://genome.imim.es/genepredictions/H.sapiens/golden_path_20030410/SGP/chrUn.prot -O chrUn_random.prot
# Add missing .1 to protein id's
foreach f (*.prot)
  perl -wpe 's/^(>chr\w+)$/$1.1/' $f > $f:r-fixed.prot
end
cd ..
ldHgGene hg15 sgpGene download/*.gtf -exon=CDS
hgPepPred hg15 generic sgpPep download/*-fixed.prot
# SGP GENES (UPDATE 1/18/2006): the sgpPep table was dropped, replaced by
# hgc-generated protein sequence in the browser.

# ALIGNED ANCIENT REPEATS FROM MOUSE BLASTZ (TODO)
ssh eieio
mkdir -p ~/hg15/bed/aarMm2
cd ~/hg15/bed/aarMm2
set mmdir=../blastz.mm2.2002-12-5-ASH
foreach aar ($mmdir/aar/*.aar.gz)
  set c = $aar:t:r:r
  echo translating chr$c aar to axt
  zcat $aar \
    | $HOME/kent/src/hg/makeDb/hgAar/aarToAxt \
    | axtToPsl stdin $mmdir/S1.len $mmdir/S2.len stdout \
    > chr${c}_aarMm2.psl
end
ssh hgwdev
cd ~/hg15/bed/aarMm2
hgLoadPsl hg15 *.psl

# ALIGNMENT COUNTS FOR WIGGLE TRACK
# this needs to be updated to reflect the full process.
- Generate BED table of AARs used to select regions.
    cat ../bed/aarMm2/*.psl \
      | awk 'BEGIN{OFS="\t"} {print $14,$16,$17,"aar"}' > aarMm2.bed
- Generate background counts in windows holding a fixed 6000 counts, with
  a maximum window size of 512kb, sliding the windows by 1kb:
    foreach axt (../../blastz.mm2.2002-08-01/axtBest/chr*.axt)
      set chr=$axt:t:r
      set tab=$chr.6kb-aar.cnts	(??? need better name ???)
      hgCountAlign -selectBed=aarMm2.bed -winSize=512000 -winSlide=1000 -fixedNumCounts=6000 -countCoords $axt $tab
    end
- Generate counts for AARs with 50b windows, slide by 5b
    foreach axt (../../blastz.mm2.2002-08-01/axtBest/chr*.axt)
      set chr=$axt:t:r
      set tab=$chr.50b-aar.cnts	(??? need better name ???)
      hgCountAlign -selectBed=aarMm2.bed -winSize=50 -winSlide=5 $axt $tab
    end
- Generate counts for all with 50b windows, slide by 5b
    foreach axt (../../blastz.mm2.2002-08-01/axtBest/chr*.axt)
      set chr=$axt:t:r
      set tab=$chr.50b.cnts	(??? need better name ???)
      hgCountAlign -winSize=50 -winSlide=5 $axt $tab
    end

REFFULL (TODO)
o ssh to eieio
  mkdir -p /cluster/store5/gs.16/build33/bed/refFull
  cd /cluster/store5/gs.16/build33/bed/refFull
  Download the sequence:
    wget ftp://blue3.ims.u-tokyo.ac.jp/pub/db/hgc/dbtss/ref-full.fa.gz
  gunzip it and split the ref-full.fa file into about 50 pieces:
    gunzip ref-full.fa.gz
    faSplit sequence ref-full.fa 50 splitRefFull
  ssh kkstore
  cd /cluster/store5/gs.16/build33/bed/refFull
  mkdir /scratch/hg/refFull
  cp splitRefFull* /scratch/hg/refFull/
  ls -1S /scratch/hg/gs.16/build33/contig.0729/*.fa > genome.lst
  ls -1S /scratch/hg/refFull/split*.fa > refFull.lst
o - Request the admins to do a binrsync to the cluster of /scratch/hg/refFull
o - Use BLAT to generate refFull alignments as so:
    Make sure that /scratch/hg/gs.16/build33/contig/ is loaded with NT_*.fa
    and pushed to the cluster nodes.
    ssh kk
    cd /cluster/store5/gs.16/build33/bed/refFull
    mkdir -p psl
    # run mkdirs.sh script to create subdirs in the psl directory
    # in order to modularize the blat job (see the sketch below).
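    # mkdirs.sh is not reproduced in this doc; a minimal sketch of what it
    # presumably does (one psl subdirectory per genome contig, following
    # the blatFugu pattern above):
    foreach f (`cat genome.lst`)
      set c=$f:t:r
      mkdir -p psl/$c
    end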
    gensub2 genome.lst refFull.lst gsub spec
    para create spec
    Now run a para try/push and para check in each one.
o - Process refFull alignments into near best in genome.
    cd ~/hg15/bed
    cd refFull
    pslSort dirs raw.psl /tmp psl/*
    pslReps -minCover=0.2 -sizeMatters -minAli=0.965 -nearTop=0.001 raw.psl contig.psl /dev/null
    liftUp -nohead all_refFull.psl ../../jkStuff/liftAll.lft warn contig.psl
    pslSortAcc nohead chrom /tmp all_refFull.psl
o - Load refFull alignments into database
    ssh hgwdev
    cd /cluster/store5/gs.16/build33/bed/refFull
    pslCat -dir chrom > refFullAli.psl
    hgLoadPsl hg15 -tNameIx refFullAli.psl

MAKING PROMOTER FILES
cd /usr/local/apache/htdocs/goldenPath/14nov2002/bigZips
featureBits hg15 -fa=upstream1000.fa refGene:upstream:1000
zip upstream1000.zip upstream1000.fa
featureBits hg15 -fa=upstream2000.fa refGene:upstream:2000
zip upstream2000.zip upstream2000.fa
featureBits hg15 -fa=upstream5000.fa refGene:upstream:5000
zip upstream5000.zip upstream5000.fa
rm upstream*.fa

# MAKING MOUSE AND RAT SYNTENY (MOUSE DONE 041403)(RAT DONE 2003-05-19)
cd ~/hg15/bed
mkdir syntenyMouse
# Copy all the needed scripts from ~/hg14
./syntenicBest.pl -db=hg15 -table=blastzBestMm3
./smooth.pl
./joinsmallgaps.pl
./fillgap.pl -db=hg15 -table=blastzBestMm3
./synteny2bed.pl
hgLoadBed hg15 syntenyMouse ucsc100k.bed
# syntenicBest.pl will overwrite the genomeBest1phase created by the
# previous run of this above.  It takes several hours to run.
syntenicBest.pl -db=hg15 -table=blastzBestRn2
# smooth.pl overwrites genomeBest2phase created by the previous
# run of this above.  Runs quickly.
smooth.pl
# joinsmallgaps.pl overwrites genomeBest3phase created above.  Runs quickly.
joinsmallgaps.pl
# fillgap.pl creates genomeBestFinal
fillgap.pl -db=hg15 -table=blastzBestRn2
# synteny2bed.pl creates ucsc100k.bed
synteny2bed.pl
hgLoadBed hg15 syntenyRat ucsc100k.bed

# create MGC Genes track (markd 13 may 2003)
# this will be automated in the incremental genbank update
/cluster/store5/genbank/bin/i386/mgcDbLoad hg15 /cluster/store5/genbank/data/download/mgc/2003.05.06/mgcFullStatus.tab.gz

# Creating the hg15Mm3L track on hg15
kent/src/hg/sampleTracks/makeHg15Mm3.csh

# Create firstEF track from Zhang lab at CSHL
# contacts:
#	Gengxin Chen
#	Ivo Grosse
#	Michael Zhang
# Got firstef.tar.gz from Gengxin 6/30/03
# Done 2003-07-02 braney
# Got firstef.tar.gz from Gengxin 7/30/03
# Done again 2003-07-31 braney
ssh kk
set db=/cluster/data/hg15
cd $db
mkdir -p bed/firstEF
cd bed/firstEF
# copy firstef.tar.gz to .
zcat firstef.tar.gz | tar xf -
cd firstef
rm -rf *.alpha *.iris4d *.sun4 examples *.i386
ln firstef.i386-linux firstef.i686
# set up script parameters
cat << '_EOF_' > firstef
perl ./make_fasta.pl $1 ./tmp/list_file > /dev/null;
./firstef.$HOSTTYPE 1500 ./tmp/list_file ./parameters/ 0 $3 $4 $5 > /dev/null;
perl ./FirstEF_parser.pl ./tmp/list_file_out $2 > /dev/null;
perl ./file_erase.pl ./tmp/list_file;
rm ./tmp/list_file*;
'_EOF_'
cat << '_EOF_' > sedit.1
/^No/d
/^$/d
s/\.\./ /g
'_EOF_'
cat << '_EOF_' > sedit.2
/0 0 CpGIsland/d
/-1 1 CpGIsland/d
s/ /\010/g
'_EOF_'
cat << '_EOF_' > tawk
BEGIN { strand = "-"}
/^>.*$/ { if (strand=="-") strand="+"; else strand = "-"}
/^[^>].*$/ {
rank=$11
if (rank > 1)
  extra="#"rank
else
  extra=""
if (strand == "+")
  printf "chrom %d %d promoter(%d+)%s %d +\nchrom %d %d exon(%d+)%s %d +\nchrom %d %d CpGWindow(%d+)%s %d +\n", $2-1, $3, $1, extra, $4 * 1000 , $5-1, $6, $1, extra, $7 * 1000 , $9-1, $10, $1, extra, 1000
else
  printf "chrom %d %d promoter(%d-)%s %d -\nchrom %d %d exon(%d-)%s %d -\nchrom %d %d CpGWindow(%d-)%s %d -\n", $3-1, $2, $1, extra, $4 * 1000 , $6-1, $5, $1, extra, $7 * 1000 , $10-1, $9, $1, extra, 1000
}
'_EOF_'
cat << '_EOF_' > runProg
suffix=$1
inputFile=$2
chrom=`basename $2 $1`
output=$3
./firstef $inputFile /tmp/$chrom.tmp $4 $5 $6 > /dev/null 2>&1
cat /tmp/$chrom.tmp | sed -f sedit.1 | awk -f tawk | sed "s/chrom/$chrom/" | sed -f sedit.2 > $output
rm -f /tmp/$chrom.tmp
'_EOF_'
chmod a+x runProg
# since firstef isn't built to have more than one
# copy executing at a time in a single directory
# we build a tar file to extract in each run directory
cd ..
tar cf prog.tar firstef
ls -1S /cluster/store5/gs.16/build33/*/*.fa.masked > human.lst
cat << '_EOF_' > doit
cp $1 /tmp
f=`basename $1 .fa.masked`
cd out/$f/firstef
./runProg .fa.masked /tmp/$f.fa.masked ../$f.bed $3 $4 $5
'_EOF_'
chmod a+x doit
set pExon=0.4
set pPromoter=0.3
set pDonor=0.4
set param="$pPromoter $pDonor $pExon"
echo "#LOOP " > gsub
echo "./doit \044(path1) {check out line+ out/\044(root1)/\044(root1).bed} $param" >> gsub
echo "#ENDLOOP" >> gsub
gensub2 human.lst single gsub temp
cat temp | sed "s/\.fa\//\//" | sed "s/\.fa\.bed/.bed/" > spec
rm temp
# make directories for cluster output files
rm -rf out
mkdir -p out
cd out
foreach i ( `cat ../human.lst`)
  set f=`basename $i .fa.masked`
  mkdir $f; cd $f; tar xf ../../prog.tar; cd ..
end
cd ..
para create spec
para try
para check
para push
para times
# Average job time:     547s  9.11m  0.15h  0.01d
# Longest job:         1739s 28.98m  0.48h  0.02d
# Two jobs fail:
#	out/chrY_random/chrY_random.bed is empty
#	out/chrM/chrM.bed is empty
ssh hgwdev
cat out/*/*.bed > firstEF.bed
echo "drop table firstEF;" | hgsql hg15
hgLoadBed hg15 firstEF firstEF.bed
rm -rf out

# COW SYNTENY (Done, Heather, Mar. 2005)
# Data from Harris A. Lewin
ssh hgwdev
cd /cluster/data/hg15/bed
mkdir syntenyCow
cd syntenyCow
# Obtain bed5 file from
# http://genome.ucsc.edu/cgi-bin/hgTracks?db=hg15&position=chr1&hgt.customText=http://cagst.animal.uiuc.edu/BACcontig/larkin_et_al_track
# Save as in.bed
hgLoadBed -noBin hg15 syntenyCow in.bed
# New data from UIUC (using chrom names)
# Reloaded from in2.bed
hgLoadBed -noBin hg15 syntenyCow in2.bed
# New data again (Dec. 2005)
fixChr.pl < in3.bed > in4.bed
hgLoadBed -noBin hg15 syntenyCow in4.bed
# add to kent/src/hg/makeDb/trackDb/human/hg15/trackDb.ra
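# fixChr.pl is not reproduced here; judging by its name and the chrom-name
# note above, it probably just normalizes the chrom column, roughly:
#   perl -wpe 's/^(\d+|X|Y|Un)\t/chr$1\t/' < in3.bed > in4.bed
# (an assumption, not the actual script)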
# COW BACENDS (Done, Heather, Mar. 8, 2005)
ssh hgwdev
cd /cluster/data/hg15/bed
mkdir bacendsCow
cd bacendsCow
# Obtain bed4 file from
# http://genome.ucsc.edu/cgi-bin/hgTracks?db=hg15&position=chr1&hgt.customText=http://cagst.animal.uiuc.edu/BACcontig/map_and_bac_ends
# Save as in.bed
hgLoadBed -noBin hg15 bacendsCow in.bed
# add to kent/src/hg/makeDb/trackDb/human/hg15/trackDb.ra

# COW BACENDS (Updated, Heather, Mar. 21, 2005)
ssh hgwdev
cd /cluster/data/hg15/bed/bacendsCow
# Obtain GFF file from Denis; unzip into BACendhg15.gff
# Convert into BED 6:
makebed.pl < BACendhg15.gff > BACendhg15.bed
hgLoadBed -noBin hg15 bacendsCow BACendhg15.bed
# 54262 warnings

# RUN 3-WAY ALIGNMENTS HUMAN/RAT/MOUSE USING MULTIZ (DONE kate)
set multizDir = /cluster/data/hg15/bed/multiz.rn3-mm3.2003-07-26-KRR
# Edit hg/ratStuff/stageMultiz/stage.csh with correct paths
# Saved in $multizDir
# Perform steps in stage.csh (still must be done manually)
# to set up staging directories and run multiz on cluster.
# setup up external files for database to reference
ssh hgwdev
mkdir -p /gbdb/hg15/multizRn3Mm3
cd /gbdb/hg15/multizRn3Mm3
foreach f ($multizDir/hmr/*.maf)
  ln -s $f .
end
# load into database
/cluster/bin/i386/hgLoadMaf -warn hg15 multizRn3Mm3
# check with previous run -- note fewer alignments in latest run
featureBits hg15 multizRn3Mm3
# 1127632252 bases of 2866466359 (39.339%) in intersection
featureBits hg15 multizMm3Rn2
# 1166321938 bases of 2866466359 (40.688%) in intersection

# RUN 3-WAY ALIGNMENTS HUMAN/MOUSE/RAT USING MULTIZ (IN PROGRESS kate)
set multizDir = /cluster/data/hg15/bed/multiz.mm3-rn3.2003-08-06-KRR
mkdir $multizDir
# Edit hg/ratStuff/stageMultiz/stage.csh with correct paths
# Saved in $multizDir
# Perform steps in stage.csh (still must be done manually)
# to set up staging directories and run multiz on cluster.
# setup up external files for database to reference
ssh hgwdev
mkdir -p /gbdb/hg15/multizMm3Rn3
cd /gbdb/hg15/multizMm3Rn3
foreach f ($multizDir/hmr/*.maf)
  ln -s $f .
end
# load into database
/cluster/bin/i386/hgLoadMaf -warn hg15 multizMm3Rn3
# check with previous run
featureBits hg15 multizMm3Rn3
featureBits hg15 multizMm3Rn2

# Making Humor 3 way alignment using axtNet 300 (baertsch)
# Approximate directions below; "forEachChrom ... end" in the original
# notes stood for a per-chromosome loop, written out here as csh
# (a reconstruction -- paths and loop form may differ from what was run):
cd /cluster/data/hg15/bed/blastz.mm3/axtChain/mouseNet
mkdir -p ../../axtNet300
foreach f (*.net)
  set i=$f:r
  netToAxt -maxGap=300 $i.net ../chain/$i.chain /cluster/data/hg15/nib /cluster/data/mm3/nib ../../axtNet300/$i.axt
end
cd /cluster/data/hg15/bed/blastz.rn3/axtChain/ratNet
mkdir -p ../../axtNet300
foreach f (*.net)
  set i=$f:r
  netToAxt -maxGap=300 $i.net ../chain/$i.chain /cluster/data/hg15/nib /cluster/data/rn3/nib ../../axtNet300/$i.axt
end
set multizDir = /cluster/data/hg15/bed/humor300
mkdir $multizDir
cd /cluster/data/hg15
foreach i (`awk '{print $1}' chrom.sizes`)
  axtSort blastz.mm3/axtNet300/$i.axt tmp
  mv tmp blastz.mm3/axtNet300/$i.axt
  axtToMaf blastz.mm3/axtNet300/$i.axt /cluster/data/hg15/chrom.sizes /cluster/data/mm3/chrom.sizes axtNet300/$i.mm3.maf -tPrefix=hg15. -qPrefix=mm3.
  axtSort blastz.rn3/axtNet300/$i.axt tmp
  mv tmp blastz.rn3/axtNet300/$i.axt
  axtToMaf blastz.rn3/axtNet300/$i.axt /cluster/data/hg15/chrom.sizes /cluster/data/rn3/chrom.sizes axtNet300/$i.rn3.maf -tPrefix=hg15. -qPrefix=rn3.
  /cluster/bin/penn/humor.v4 axtNet300/$i.mm3.maf axtNet300/$i.rn3.maf \
    > $multizDir/$i.hmr.axtNet300.maf
end
# run on kkr9 cluster:
# CPU time in finished jobs:    18977s  316.28m  5.27h  0.22d  0.001 y
# IO & Wait Time:                1237s   20.62m  0.34h  0.01d  0.000 y
# Average job time:               470s    7.83m  0.13h  0.01d
# Longest job:                   1762s   29.37m  0.49h  0.02d
# Submission to last job:        1817s   30.28m  0.50h  0.02d
mkdir -p /gbdb/hg15/humor300
cd /gbdb/hg15/humor300
foreach f ($multizDir/*.maf)
  ln -s $f .
end
# load into database
/cluster/bin/i386/hgLoadMaf -warn hg15 humor300
rm *.tab
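# optional check, by analogy with the multiz featureBits checks above
# (not recorded in the original log):
# featureBits hg15 humor300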
# Creating knownToSuper (which enables superFamily stuff in hgNear/hgGene)
# First see if need to update superfamily data from
# ftp server at supfam.mrc-lmb.cam.ac.uk following instructions
# in /cluster/store1/superFamily/genomes/README.ucsc.  Then
# make sure that knownToEnsembl and ensGtp tables are created, then:
zcat /cluster/store1/superFamily/genomes/ass_26-Oct-2003.tab.gz | hgKnownToSuper hg15 hs stdin

# RELOAD ENSEMBL GENES to fix confusion in current track (2004-02-01 - Hiram)
# Turns out this wasn't necessary.  The correct data was already
# in Hg15, it just hadn't been pushed out to the RR.
# Leaving these instructions here for future reference.
# This is the way it should be done.
# save current tables, just in case.
hgsql -e "rename table ensGene to ensGene_old;" hg15
hgsql -e "rename table ensGtp to ensGtp_old;" hg15
hgsql -e "rename table ensPep to ensPep_old;" hg15
# You do not want ensembl34a data for this assembly.
# Ensembl does not yet make it convenient to fetch archived
# assemblies.
mkdir /cluster/data/hg15/bed/ensembl34a
cd /cluster/data/hg15/bed/ensembl34a
# Get the ensembl protein data from
# http://www.ensembl.org/Homo_sapiens/martview
# Follow this sequence through the pages:
# Page 1) Make sure that the Homo_sapiens choice is selected. Hit next.
# Page 2) Uncheck the "Limit to" box in the region choice. Then hit next.
# Page 3) Choose the "Structures" tab.
# Page 4) In "Select the output format:" Choose GTF as the output,
#	choose gzip compression.  Enter the name ensemblGene.gtf.gz
#	in the download file name entry, hit export.
# Save as ensemblGene.gtf.gz - about 7.8 Mb
# Ensembl handles random chromosomes differently than us, so we
# strip this data.  Fortunately it just loses a couple of genes.
# Add "chr" to front of each line in the gene data gtf file to make
# it compatible with our software.
# Finally, get rid of the ".1" or ".2" after the name
zcat ensemblGene.gtf.gz \
  | grep -v ^6_DR51 \
  | grep -v ^DR51 \
  | grep -v _NT_ \
  | perl -wpe 's/^([0-9]|X|Y|Un)/chr$1/ || die "Line $. doesnt start with human chrom:\n$_"' \
  | sed -e 's/\..\"/\"/g' \
  > ensGene.gtf
ssh hgwdev
/cluster/bin/i386/ldHgGene hg15 ensGene \
  /cluster/data/hg15/bed/ensembl34a/ensGene.gtf

# ensGtp associates geneId/transcriptId/proteinId for hgPepPred and
# hgKnownToSuper.  Use ensMart to create it as above, except:
# Page 3) Choose the "Features" tab.  In "Ensembl Attributes", check
#	Ensembl Gene ID, Ensembl Transcript ID, Ensembl Peptide ID.
#	Choose Text, tab-separated as the output format.  Result name ensGtp.
# Save file as ensGtp.txt.gz
gunzip ensGtp.txt.gz
hgsql hg15 < ~/kent/src/hg/lib/ensGtp.sql
echo "load data local infile 'ensGtp.txt' into table ensGtp" | hgsql -N hg15
gzip ensGtp.txt

# Load Ensembl peptides:
# Get them from ensembl as above in the gene section except for
# Page 3) Choose the "Sequences" tab.
# Page 4) Transcripts/Proteins.  Peptide.  Format = FASTA.
# Save file as ensemblPep.fa.gz - about 7.7 Mb
zcat ensemblPep.fa.gz | hgPepPred hg15 ensembl stdin

# compare size of old and new tables as a sanity check
foreach t (ensGene ensGtp ensPep)
  echo -n "${t}: "
  hgsql -e "select count(*) from ${t};" hg15 | tail -1
  echo -n "${t}_old: "
  hgsql -e "select count(*) from ${t}_old;" hg15 | tail -1
end
# if the counts look reasonable, can drop the old tables (in hgsql hg15):
#	drop table ensGene_old;
#	drop table ensGtp_old;
#	drop table ensPep_old;

# Create knownToEnsembl column and knownToSuperfamily column
hgMapToGene hg15 ensGene knownGene knownToEnsembl
zcat /cluster/store1/superFamily/genomes/ass_26-Oct-2003.tab.gz | hgKnownToSuper hg15 hs stdin

# KNOWN GENE UPDATE (DONE - 2004-02-24 - Hiram)
# using proteins and swissProt as created for Hg16 recently
ssh hgwdev
mkdir /cluster/data/kgDB/bed/kgHg15
cd /cluster/data/kgDB/bed/kgHg15
~/kent/src/hg/protein/KGprocess.sh kgHg15 hg15 040115
# that runs to the first cluster run, then on kk
ssh kk
cd /cluster/data/kgDB/bed/kgHg15/kgBestMrna
para create jobList
para try
para push
... etc ...
# Completed: 37972 of 37972 jobs
# CPU time in finished jobs:   106405s  1773.41m  29.56h  1.23d  0.003 y
# IO & Wait Time:              104497s  1741.62m  29.03h  1.21d  0.003 y
# Average job time:                 6s     0.09m   0.00h  0.00d
# Longest job:                     12s     0.20m   0.00h  0.00d
# Submission to last job:         238s     3.97m   0.07h  0.00d

# Continuing back on hgwdev
ssh hgwdev
cd /cluster/data/kgDB/bed/kgHg15
~/kent/src/hg/protein/KGprocess.sh kgHg15 hg15 040115
# that runs until it prepares the second cluster run, then
ssh kk
cd /cluster/data/kgDB/bed/kgHg15/kgProtMap
para create jobList
para try
para push
... etc
# Completed: 4953 of 4953 jobs
# CPU time in finished jobs:  1612079s 26867.98m 447.80h 18.66d  0.051 y
# IO & Wait Time:              218411s  3640.18m  60.67h  2.53d  0.007 y
# Average job time:               370s     6.16m   0.10h  0.00d
# Longest job:                   1939s    32.32m   0.54h  0.02d
# Submission to last job:        4682s    78.03m   1.30h  0.05d

# Continuing back on hgwdev
ssh hgwdev
cd /cluster/data/kgDB/bed/kgHg15
~/kent/src/hg/protein/KGprocess.sh kgHg15 hg15 040115
# That runs to completion

# Being more than thorough, record status of all tables currently in
# hg15 before loading tables, then backup all existing tables from
# hg15 to kgHg15_2003_06 via a dump since a rename does not work
# because hg15 is on a different filesystem than kgHg15_2003_06
mkdir /cluster/data/hg15/bed/famBro.2004-02-23
ln -s /cluster/data/hg15/bed/famBro.2004-02-23 /cluster/data/hg15/bed/famBro
cd /cluster/data/hg15/bed/famBro
~hiram/bin/tblData.pl -verbose hg15 > tblData.before
cat << '_EOF_' > loadNew.sh
#!/bin/sh
#
OldDB=hg15
NewDB=kgHg15
mkdir -p newKg
#
for T in cgapAlias cgapBiocDesc cgapBiocPathway dupSpMrna keggMapDesc \
	keggPathway kgAlias kgProtMap kgProtAlias kgXref knownGene \
	knownGeneLink knownGeneMrna knownGenePep mrnaRefseq spMrna
do
  echo "copy table data from ${NewDB}.${T} to ${OldDB}.${T};"
  rm -f newKg/${T}.sql newKg/${T}.txt newKg/${T}.txt.gz
  hgsqldump --all -T newKg ${NewDB} ${T}
  hgsql -e "drop table ${T};" ${OldDB}
  cd newKg
  hgsql ${OldDB} < ${T}.sql
  mysqlimport --user=hgcat --password="Man-0-W4r" ${OldDB} ${T}.txt
  gzip ${T}.txt &
  cd ..
done
'_EOF_'
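# presumably then made executable and run (the invocation itself was not
# recorded):
# chmod +x loadNew.sh
# ./loadNew.sh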
# Eventually ended saving all of the following tables during the
# family browser update process below: (~400 Mb of data)
#	affyUcla	kgProtAlias		knownToCdsSnp
#	ceBlastTab	kgXref			knownToEnsembl
#	cgapAlias	knownBlastTab		knownToPfam
#	cgapBiocDesc	knownCanonical		knownToRefSeq
#	cgapBiocPathway	knownExpDistance	knownToU133
#	dmBlastTab	knownGeneLink		knownToU95
#	drBlastTab	knownGeneMrna		mmBlastTab
#	dupSpMrna	knownGenePep		mrnaRefseq
#	keggMapDesc	knownGene		scBlastTab
#	keggPathway	gnfU95Distance		spMrna
#	kgAlias		knownIsoforms
# continue with the FAMILY BROWSER UPDATE to complete this process

# CREATE FULL TEXT INDEX FOR KNOWN GENES (DONE 1/19/2006 JK)
# This depends on the go and uniProt databases as well as
# the kgAlias and kgProAlias tables.  The hgKgGetText takes
# about 5 minutes when the database is not too busy.  The rest
# is real quick.
ssh hgwdev
cd /cluster/data/hg15/bed/knownGene
mkdir index
cd index
hgKgGetText hg15 knownGene.text
ixIxx knownGene.text knownGene.ix knownGene.ixx
ln -s /cluster/data/hg15/bed/knownGene/index/knownGene.ix /gbdb/hg15/knownGene.ix
ln -s /cluster/data/hg15/bed/knownGene/index/knownGene.ixx /gbdb/hg15/knownGene.ixx

# FAMILY BROWSER UPDATE (DONE - 2004-02-24 - Hiram)
# to be done after knownGene tables are complete from known gene
# process.
#
# Cluster together various alt-splicing isoforms.
# Creates the knownIsoforms and knownCanonical tables
ssh hgwdev
mkdir /cluster/data/hg15/bed/famBro.2004-02-23
ln -s /cluster/data/hg15/bed/famBro.2004-02-23 /cluster/data/hg15/bed/famBro
cd /cluster/data/hg15/bed/famBro
hgClusterGenes hg15 knownGene knownIsoforms knownCanonical
# Got 19654 clusters, from 36011 genes in 44 chromosomes

# Extract peptides from knownGenes into fasta file
# and create a blast database out of them.
mkdir /cluster/data/hg15/bed/famBro/blastp
cd /cluster/data/hg15/bed/famBro/blastp
pepPredToFa hg15 knownGenePep known.faa
# You may need to build this binary in src/hg/near/pepPredToFa
/scratch/blast/formatdb -i known.faa -t known -n known
# This command is in /projects/compbio/bin/$MACH/formatdb
# Copy over database to bluearc
rm -fr /cluster/bluearc/hg15/blastp
mkdir -p /cluster/bluearc/hg15/blastp
cp -p /cluster/data/hg15/bed/famBro/blastp/known.* /cluster/bluearc/hg15/blastp
# Split up fasta file into bite sized chunks for cluster
cd /cluster/data/hg15/bed/famBro/blastp
mkdir split
faSplit sequence known.faa 8000 split/kg
# /scratch/blast should already exist and we want to use it

# Make parasol run directory
ssh kk
mkdir /cluster/data/hg15/bed/famBro/blastp/self
cd /cluster/data/hg15/bed/famBro/blastp/self
mkdir run
cd run
mkdir out
# Make blast script
cat << '_EOF_' > blastSome
#!/bin/sh
BLASTMAT=/scratch/blast/data /scratch/blast/blastall -p blastp \
  -d /cluster/bluearc/hg15/blastp/known -i $1 -o $2 \
  -e 0.01 -m 8 -b 1000
'_EOF_'
chmod a+x blastSome
# Make gensub2 file
cat << '_EOF_' > gsub
#LOOP
blastSome {check in line+ $(path1)} {check out line out/$(root1).tab}
#ENDLOOP
'_EOF_'
# Create parasol batch
# 'ls ../../split/*.fa' is too much, hence the echo
echo ../../split/*.fa | wordLine stdin > split.lst
gensub2 split.lst single gsub jobList
para create jobList
para try
# Wait a couple of minutes, and do a para check; if all is good
# then do a para push.
# This should finish in ~10 minutes if the cluster is free.
# Completed: 7753 of 7753 jobs
# CPU time in finished jobs:     118480s    1974.67m    32.91h    1.37d  0.004 y
# IO & Wait Time:                 72240s    1203.99m    20.07h    0.84d  0.002 y
# Average job time:                  25s       0.41m     0.01h    0.00d
# Longest job:                      271s       4.52m     0.08h    0.00d
# Submission to last job:           470s       7.83m     0.13h    0.01d

# Load into database.  This takes about 20 minutes
ssh hgwdev
cd /cluster/data/hg15/bed/famBro/blastp/self/run/out
time hgLoadBlastTab hg15 knownBlastTab *.tab
# Scanning through 7753 files
# Loading database with 8832851 rows
# previously had 7103098 rows
# real    19m53.778s
# user    3m38.460s
# sys     0m24.020s
cd /cluster/data/hg15/bed/famBro

# Create table that maps between known genes and RefSeq
hgMapToGene hg15 refGene knownGene knownToRefSeq
# may need to build this command in src/hg/near/hgMapToGene
# row count changed from 29278 to 29296

# Create table that maps between known genes and LocusLink
hgsql --skip-column-names -e "select mrnaAcc,locusLinkId from refLink" hg15 \
    > refToLl.txt
hgMapToGene hg15 refGene knownGene knownToLocusLink -lookup=refToLl.txt
# this table did not exist in the previous version, rows now: 29296

# Create table that maps between known genes and Pfam domains
hgMapViaSwissProt hg15 knownGene name proteinID Pfam knownToPfam
# row count went from 30689 to 31337

# Create a table that maps between known genes and
# the Affymetrix U133 expression data.
hgMapToGene "-type=bed 12" hg15 affyUcla knownGene knownToU133
# row count changed from 32009 to 32067

# Create expression distance table.  This will take about an hour.
# Run this in /tmp as it makes a huge (~800Mb) temp file
# -rw-rw-r--  1   848350571 Feb 23 14:08 /tmp/knownExpDistance.tab
cd ~/kent/src/hg/near/hgExpDistance
cp -p affyUcla.weight /tmp
cd /tmp
time hgExpDistance hg15 affyUcla affyUclaExp knownExpDistance \
    -weights=affyUcla.weight -lookup=knownToU133
# Have 43344 elements in affyUcla
# Got 32067 unique elements in affyUcla
# real    65m16.947s
# user    50m47.980s
# sys     0m48.870s
# row count went from 31504000 to 32067000
cd /cluster/data/hg15/bed/famBro

# Create table that maps between known genes and
# the GNF data.
hgMapToGene hg15 affyRatio knownGene knownToU95
# row count changed from 13403 to 10949
cd /tmp
# the hgFixed.gnfHumanU95Exps argument is unused, so that table does
# not need to exist
time hgExpDistance hg15 hgFixed.gnfHumanU95MedianRatio \
    hgFixed.gnfHumanU95Exps gnfU95Distance -lookup=knownToU95
# row count went from 11718000 to 10333000
# Got 10333 unique elements in hgFixed.gnfHumanU95MedianRatio
# real    10m42.446s
# user    6m0.980s
# sys     0m12.430s

# Make sure that the GO database is up to date.  See the README in
# /cluster/store1/geneOntology.
# I update this GO database very carefully, checking that all
# structures in it remain the same from release to release and
# backing up the current go DB in a backup database.  In this case
# the backup is go040107 - when it was loaded for Mm4 - and the new
# go database is based on data from Dec 17th 2003 and Feb 2004,
# according to the time stamps on the fetched data.  This build was
# done in /cluster/store1/geneOntology/20040217
cd /cluster/data/hg15/bed/famBro
# create knownToSuper table
zcat /cluster/store1/superFamily/genomes/ass_26-Oct-2003.tab.gz \
    | hgKnownToSuper hg15 hs stdin
# row count changed from 34558 to 34334

# Create knownToEnsembl column
hgMapToGene hg15 ensGene knownGene knownToEnsembl
# row count changed from 33465 to 33561

# Make knownToCdsSnp column.  This is a little complicated by
# having to merge data from the snpTsc and the snpNih tracks.
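# The merge, spelled out (the commands follow): -createOnly makes an
# empty knownToCdsSnp table; the two -noLoad runs write snp1.tab and
# snp2.tab without touching the database; their sorted concatenation is
# then loaded with a single "load data local infile".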
hgMapToGene hg15 snpTsc knownGene knownToCdsSnp -createOnly -all -cds
hgMapToGene hg15 snpTsc knownGene snp1 -noLoad -all -cds
hgMapToGene hg15 snpNih knownGene snp2 -noLoad -all -cds
sort snp1.tab snp2.tab > knownToCdsSnp.tab
rm snp1.tab snp2.tab
hgsql \
    -e 'load data local infile "knownToCdsSnp.tab" into table knownToCdsSnp;' \
    hg15
# row count changed from 82592 to 83425
#
# The five organisms below can be run through quickly.  Get one kluster
# run started and go on to the next.  It takes about 7 minutes to get
# them all running, and by then one of them is already done.
# Keep track of where you are during this.
#
# Make C. elegans ortholog column using blastp on wormPep.
# First make the C. elegans protein database and copy it to /cluster/bluearc
# if it doesn't exist already.
# This is already done, see makeMm3.doc for the procedure;
# the directory /cluster/bluearc/ce1/blastp should have data
# Create the ceBlastTab
ssh kk
mkdir /cluster/data/hg15/bed/famBro/blastp/ce1
cd /cluster/data/hg15/bed/famBro/blastp/ce1
mkdir run
cd run
mkdir out
# Make blast script
cat << '_EOF_' > blastSome
#!/bin/sh
BLASTMAT=/scratch/blast/data /scratch/blast/blastall -p blastp \
    -d /cluster/bluearc/ce1/blastp/wormPep \
    -i $1 -o $2 -e 0.01 -m 8 -b 1
'_EOF_'
chmod a+x blastSome
# Make gensub2 file
cat << '_EOF_' > gsub
#LOOP
blastSome {check in line+ $(path1)} {check out line out/$(root1).tab}
#ENDLOOP
'_EOF_'
# Create parasol batch
echo ../../split/*.fa | wordLine stdin > split.lst
gensub2 split.lst single gsub jobList
para make jobList
# Completed: 7753 of 7753 jobs
# CPU time in finished jobs:      53399s     889.99m    14.83h    0.62d  0.002 y
# IO & Wait Time:                 20094s     334.90m     5.58h    0.23d  0.001 y
# Average job time:                   9s       0.16m     0.00h    0.00d
# Longest job:                      140s       2.33m     0.04h    0.00d
# Submission to last job:           247s       4.12m     0.07h    0.00d
# Load into database.
ssh hgwdev
cd /cluster/data/hg15/bed/famBro/blastp/ce1/run/out
hgLoadBlastTab hg15 ceBlastTab -maxPer=1 *.tab
# row count went from 23504 to 23714

# Make mouse ortholog column using blastp on mouse known genes.
# First make the mouse protein database and copy it to /cluster/bluearc
# if it doesn't exist already.
# This already exists, see makeMm4.doc for the procedure;
# the directory /cluster/bluearc/mm4/blastp should have data
# Make parasol run directory
ssh kk
mkdir /cluster/data/hg15/bed/famBro/blastp/mm4
cd /cluster/data/hg15/bed/famBro/blastp/mm4
mkdir run
cd run
mkdir out
# Make blast script
cat << '_EOF_' > blastSome
#!/bin/sh
BLASTMAT=/scratch/blast/data /scratch/blast/blastall -p blastp \
    -d /cluster/bluearc/mm4/blastp/known \
    -i $1 -o $2 -e 0.001 -m 8 -b 1
'_EOF_'
chmod a+x blastSome
# Make gensub2 file
cat << '_EOF_' > gsub
#LOOP
blastSome {check in line+ $(path1)} {check out line out/$(root1).tab}
#ENDLOOP
'_EOF_'
# Create parasol batch
# this echo trick is used because otherwise the command line is
# too long and you cannot do a simple ls
echo ../../split/*.fa | wordLine stdin > split.lst
gensub2 split.lst single gsub jobList
para make jobList
# Completed: 7753 of 7753 jobs
# CPU time in finished jobs:     102603s    1710.06m    28.50h    1.19d  0.003 y
# IO & Wait Time:                 45327s     755.44m    12.59h    0.52d  0.001 y
# Average job time:                  19s       0.32m     0.01h    0.00d
# Longest job:                      213s       3.55m     0.06h    0.00d
# Submission to last job:           530s       8.83m     0.15h    0.01d
# Load into database.
ssh hgwdev
cd /cluster/data/hg15/bed/famBro/blastp/mm4/run/out
hgLoadBlastTab hg15 mmBlastTab -maxPer=1 *.tab
# row count went from 30536 to 31071

# Make Danio rerio (zebrafish) ortholog column using blastp on Ensembl.
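# (The zebrafish, yeast, and fly runs below reuse the same blastSome/gsub
# pattern as the C. elegans and mouse runs above; only the -d target
# database and the blastall -e cutoff change from organism to organism.)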
# First make the protein database and copy it to /cluster/bluearc
# if it doesn't exist already.
# This is already done, see makeMm3.doc for the procedure;
# the directory /cluster/bluearc/dr1/blastp should have data
# Make parasol run directory
ssh kk
mkdir /cluster/data/hg15/bed/famBro/blastp/dr1
cd /cluster/data/hg15/bed/famBro/blastp/dr1
mkdir run
cd run
mkdir out
# Make blast script
cat << '_EOF_' > blastSome
#!/bin/sh
BLASTMAT=/scratch/blast/data /scratch/blast/blastall -p blastp \
    -d /cluster/bluearc/dr1/blastp/ensembl \
    -i $1 -o $2 -e 0.005 -m 8 -b 1
'_EOF_'
chmod a+x blastSome
# Make gensub2 file
cat << '_EOF_' > gsub
#LOOP
blastSome {check in line+ $(path1)} {check out line out/$(root1).tab}
#ENDLOOP
'_EOF_'
# Create parasol batch
echo ../../split/*.fa | wordLine stdin > split.lst
gensub2 split.lst single gsub jobList
para make jobList
# Completed: 7753 of 7753 jobs
# CPU time in finished jobs:      76565s    1276.09m    21.27h    0.89d  0.002 y
# IO & Wait Time:                 35019s     583.64m     9.73h    0.41d  0.001 y
# Average job time:                  14s       0.24m     0.00h    0.00d
# Longest job:                      208s       3.47m     0.06h    0.00d
# Submission to last job:           508s       8.47m     0.14h    0.01d
# Load into database.
ssh hgwdev
cd /cluster/data/hg15/bed/famBro/blastp/dr1/run/out
hgLoadBlastTab hg15 drBlastTab -maxPer=1 *.tab
# row count went from 27914 to 28141

# Make Saccharomyces cerevisiae (yeast) ortholog column using blastp
# on RefSeq.
# First make the protein database and copy it to /cluster/bluearc
# if it doesn't exist already.
# This is already done, see makeMm3.doc for the procedure;
# the directory /cluster/bluearc/sc1/blastp should have data
# Make parasol run directory
ssh kk
mkdir /cluster/data/hg15/bed/famBro/blastp/sc1
cd /cluster/data/hg15/bed/famBro/blastp/sc1
mkdir run
cd run
mkdir out
# Make blast script
cat << '_EOF_' > blastSome
#!/bin/sh
BLASTMAT=/scratch/blast/data /scratch/blast/blastall -p blastp \
    -d /cluster/bluearc/sc1/blastp/sgd \
    -i $1 -o $2 -e 0.01 -m 8 -b 1
'_EOF_'
chmod a+x blastSome
# Make gensub2 file
cat << '_EOF_' > gsub
#LOOP
blastSome {check in line+ $(path1)} {check out line out/$(root1).tab}
#ENDLOOP
'_EOF_'
# Create parasol batch
echo ../../split/*.fa | wordLine stdin > split.lst
gensub2 split.lst single gsub jobList
para make jobList
# Completed: 7753 of 7753 jobs
# CPU time in finished jobs:      15864s     264.39m     4.41h    0.18d  0.001 y
# IO & Wait Time:                 22424s     373.74m     6.23h    0.26d  0.001 y
# Average job time:                   5s       0.08m     0.00h    0.00d
# Longest job:                       31s       0.52m     0.01h    0.00d
# Submission to last job:           172s       2.87m     0.05h    0.00d
ssh hgwdev
cd /cluster/data/hg15/bed/famBro/blastp/sc1/run/out
hgLoadBlastTab hg15 scBlastTab -maxPer=1 *.tab
# row count went from 15686 to 15840

# Make Drosophila melanogaster ortholog column using blastp on FlyBase.
# First make the SwissProt protein database and copy it to /cluster/bluearc
# if it doesn't exist already.
# This is already done, see makeMm3.doc for the procedure;
# the directory /cluster/bluearc/dm1/blastp should have data
# Make parasol run directory
ssh kk
mkdir /cluster/data/hg15/bed/famBro/blastp/dm1
cd /cluster/data/hg15/bed/famBro/blastp/dm1
mkdir run
cd run
mkdir out
# Make blast script
cat << '_EOF_' > blastSome
#!/bin/sh
BLASTMAT=/scratch/blast/data /scratch/blast/blastall -p blastp \
    -d /cluster/bluearc/dm1/blastp/flyBase \
    -i $1 -o $2 -e 0.01 -m 8 -b 1
'_EOF_'
chmod a+x blastSome
# Make gensub2 file
cat << '_EOF_' > gsub
#LOOP
blastSome {check in line+ $(path1)} {check out line out/$(root1).tab}
#ENDLOOP
'_EOF_'
# Create parasol batch
echo ../../split/*.fa | wordLine stdin > split.lst
gensub2 split.lst single gsub jobList
para make jobList
# Completed: 7753 of 7753 jobs
# CPU time in finished jobs:      62430s    1040.51m    17.34h    0.72d  0.002 y
# IO & Wait Time:                 23718s     395.29m     6.59h    0.27d  0.001 y
# Average job time:                  11s       0.19m     0.00h    0.00d
# Longest job:                      153s       2.55m     0.04h    0.00d
# Submission to last job:           386s       6.43m     0.11h    0.00d
# Load into database.
ssh hgwdev
cd /cluster/data/hg15/bed/famBro/blastp/dm1/run/out
hgLoadBlastTab hg15 dmBlastTab -maxPer=1 *.tab
# row count went from 24969 to 25152

# Update the proteins link in gdbPdb.hgcentraltest:
hgsql \
    -e 'update gdbPdb set proteomeDb = "proteins040115" where genomeDb = "hg15";' \
    -h genome-testdb hgcentraltest

# LOAD TFBSCONS TRACK (DONE 2004-03-19 braney)
set humordir=/gbdb/hg15/humor25
set transfacdir=/projects/compbio/data/transfac
set outdir=hg15_tfbsCons
ssh hgwdev
mkdir /cluster/data/hg15/bed/tfbsCons
cd /cluster/data/hg15/bed/tfbsCons
# Get tfbsConsUtils.tar.gz (Perl scripts) from Matt Weirauch
# (weirauch@soe.ucsc.edu)
set tarfile=/cluster/data/hg15/bed/tfbsCons/tfbsConsUtils.tar.gz
tar zxf $tarfile
# the following takes days (says Matt)
nice getTfbsConsData.pl `pwd` $humordir $transfacdir ./IDS.txt $outdir -over &
cd $outdir
rm chr*.bed
hgLoadBed -noSort hg15 tfbsCons -sqlTable=$HOME/kent/src/hg/lib/tfbsCons.sql tfbs.bed -tab
# Get a mapping of IDs from Matt so we can link into the TRANSFAC database
set idmap=/cluster/data/hg15/bed/tfbsCons/tfbsConsMap
hgsql hg15 < ~/kent/src/hg/lib/tfbsConsMap.sql
echo "load data local infile '$idmap' into table tfbsConsMap;" | hgsql hg15

# PREP FOR CREATING LIFTOVER CHAINS TO HG15 (2004-04-12 kate)
# split into 3K chunks
ssh eieio
set tempDir = /cluster/bluearc/hg/gs.16/build33/liftOver
#set split = /cluster/bluearc/hg/gs.16/build33/liftOver/split
cd $tempDir
mkdir lift
cat > split.csh << 'EOF'
set scratch = /iscratch/i/gs.16/build33/liftOver/split
mkdir -p $scratch
foreach i (1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 X Y M)
    echo chr$i
    faSplit -lift=lift/chr$i.lft size /cluster/data/hg15/$i/chr$i.fa -oneFile 3000 $scratch/chr$i
end
'EOF'
csh split.csh >&! split.log &
tail -100f split.log
/cluster/bin/iSync

# LIFTOVER CHAINS TO HG16 (2004-04-14 kate)
# doc'ed in hg16/bed/bedOver/liftOver.doc ?
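# For reference, these over.chain files are what the liftOver tool
# consumes; e.g. (a sketch with hypothetical input/output file names):
#   liftOver hg15annots.bed hg15ToHg16.over.chain hg16annots.bed unMapped.bed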
ssh hgwdev
cd /usr/local/apache/htdocs/goldenPath/hg15
mkdir -p liftOver
cp /cluster/data/hg16/bed/bedOver/over.chain liftOver/hg15ToHg16.over.chain
mkdir -p /cluster/data/hg15/liftOver
cp liftOver/hg15ToHg16.over.chain /cluster/data/hg15/liftOver
gzip liftOver/hg15ToHg16.over.chain

# RECIPROCAL BEST LIFTOVER CHAINS TO HG16 (WORKING 2004-11-18 kate, sugnet)
# Start with the hg15ToHg16 liftover chains, which are the best hg16
# alignments to hg15.  Net these chains, then extract chains from the
# hg16-reference net to get reciprocal best.
ssh kolossus
cd /cluster/data/hg15/bed/liftOver
grep chain hg15ToHg16.over.chain | wc -l
# 3938
chainPreNet hg15ToHg16.over.chain /cluster/data/hg15/chrom.sizes \
    /cluster/data/hg16/chrom.sizes stdout | \
    chainNet stdin -minSpace=1 /cluster/data/hg15/chrom.sizes \
        /cluster/data/hg16/chrom.sizes \
        /dev/null hg15ToHg16.rbest.net
chainSwap hg15ToHg16.over.chain stdout | \
    chainSort -target stdin hg15ToHg16.swp.chain
netChainSubset hg15ToHg16.rbest.net hg15ToHg16.swp.chain stdout | \
    chainSort -target stdin hg15ToHg16.best.chain
rm hg15ToHg16.rbest.net hg15ToHg16.swp.chain

#### AFFYTRANSFRAG AND AFFYTRANSCRIPTION TRACKS - (2004-07-21 sugnet)
# tracks covering about 1/3 of the genome with probes
# every 5bp, hybridized to RNA from the SK-N-AS cell line.
# affyTransfrags track: SK_phase2_tfgs_final file emailed from
# Hari at Affymetrix.
cd /cluster/store6/weber/affy/transfrags/transfragsLabeled
hgLoadBed hg15 affyTransfrags SK_phase2_tfgs_final.biggerThan50bp.tab
# affyTranscription track: wiggle track of hybridization levels
# for every 5bp in the genome.  Quite a lot of data.
cd /cluster/store6/weber/affy/graph/hg15/gz
# Run wigAsciiToBinary for each chrom; I used a small shell script,
# "runWiggles.sh".
wigAsciiToBinary -chrom=chr6 -wibFile=chr6.affyTranscription -minVal=0 chr6.sk.signal
wigAsciiToBinary -chrom=chr7 -wibFile=chr7.affyTranscription -minVal=0 chr7.sk.signal
wigAsciiToBinary -chrom=chr13 -wibFile=chr13.affyTranscription -minVal=0 chr13.sk.signal
wigAsciiToBinary -chrom=chr14 -wibFile=chr14.affyTranscription -minVal=0 chr14.sk.signal
wigAsciiToBinary -chrom=chr19 -wibFile=chr19.affyTranscription -minVal=0 chr19.sk.signal
wigAsciiToBinary -chrom=chr20 -wibFile=chr20.affyTranscription -minVal=0 chr20.sk.signal
wigAsciiToBinary -chrom=chr21 -wibFile=chr21.affyTranscription -minVal=0 chr21.sk.signal
wigAsciiToBinary -chrom=chr22 -wibFile=chr22.affyTranscription -minVal=0 chr22.sk.signal
wigAsciiToBinary -chrom=chrX -wibFile=chrX.affyTranscription -minVal=0 chrX.sk.signal
wigAsciiToBinary -chrom=chrY -wibFile=chrY.affyTranscription -minVal=0 chrY.sk.signal
hgLoadWiggle -pathPrefix=/gbdb/hg15/wib/affyTranscription hg15 affyTranscription *.wig
# Connected to database hg15 for track affyTranscription
# Creating table definition with 13 columns in hg15.affyTranscription
# Saving wiggle.tab
# Loading hg15
cp *.wib /cluster/data/hg15/bed/affyTranscription/wib/
cd /gbdb/hg15/wib/affyTranscription/
ln -s /cluster/data/hg15/bed/affyTranscription/wib/*.wib ./
cd /cluster/data/hg15/bed/affyTranscription/wib/
chmod 664 *.wib
cd /cluster/store6/weber/affy/graph/hg15/gz
rm *.tab *.wib *.wig
gzip *.signal

# Creating Zoom to 10 base view of the affyTranscription data above
# (DONE - 2004-08-06 - Hiram)
ssh eieio
cd /cluster/data/hg15/bed/affyTranscription
ln -s /cluster/store6/weber/affy/graph/hg15/gz/*.signal.gz .
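# runZoom.sh (below) clamps negative data values to zero with awk, pipes
# the result through wigZoom to collapse the 5bp data into 10-base spans,
# and writes one zoom10 .wib per chromosome with wigAsciiToBinary: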
cat << '_EOF_' > runZoom.sh
#!/bin/sh
#
mkdir -p zoom10wib
mkdir -p zoom10limits
for F in *.signal.gz
do
    C=${F/.sk.signal.gz/}
    echo "wigZoom -dataSpan=10 $F | wigAsciiToBinary -dataSpan=10 -chrom=${C}"
    zcat $F | awk '{if ($2 < 0) { printf "%d\t0\n", $1 } else { print }}' | \
	wigZoom -dataSpan=10 stdin | \
	wigAsciiToBinary -dataSpan=10 -chrom=${C} \
	    -wibFile=zoom10wib/${C}.zoom10 stdin 2> zoom10limits/${C}
done
'_EOF_'
# << keep emacs happy
chmod +x runZoom.sh
time ./runZoom.sh
# takes about 15 minutes
# The awk statement brings negative values up to zero, as the -minVal=0
# argument to wigAsciiToBinary did for the original data.  (wigZoom would
# need the same argument added to do this here without the awk.)

# Load this zoom 10 view
ssh hgwdev
cd /gbdb/hg15/wib/affyTranscription
ln -s /cluster/data/hg15/bed/affyTranscription/zoom10wib/*.wib .
cd /cluster/data/hg15/bed/affyTranscription/zoom10wib
# Let's make sure we don't ruin the table:
hgsql -N -e "select count(*) from affyTranscription;" hg15
#	+--------+
#	| 575775 |
#	+--------+
hgLoadWiggle -oldTable -pathPrefix=/gbdb/hg15/wib/affyTranscription hg15 \
    affyTranscription *.wig
hgsql -N -e "select count(*) from affyTranscription;" hg15
#	+--------+
#	| 649503 |
#	+--------+
# To test: at a browser image width of 1123, a 10,000 bp display at
# chr7:26,828,631-26,838,630 is exactly 10 bases per pixel.  Go plus or
# minus one base from there to go under and over that threshold; no
# dramatic change should be seen in the display (while in "Mean"
# windowing view mode, which isn't the default for this track).

# LOADING AFFYTXNPHASE2 track (sugnet)
# cd to where the data is downloaded.
cd /cluster/store10/sugnet/affyTranscription/graphs/transcriptome.affymetrix.com/download/publication/polyA_minus/graphs
# Affymetrix uses names with underscores (which we don't), and has the
# annoying tendency to put multiple data points at the same position in
# the genome, which some of our tools (like wigEncode) don't like.  This
# script changes the format, changes the names, averages the co-located
# points, and puts the results in single-chromosome files.
./makeWigFiles.sh
# takes a long time to run...
# Use wigEncode to make the .wib and .wig files.
./makeWibWigHg15.sh
# takes a long time to run.
# Copy .wib files to /cluster/data/hg15
mkdir /cluster/data/hg15/bed/affyTranscriptionPhase2/
cp `find ./ -name "*.wib"` /cluster/data/hg15/bed/affyTranscriptionPhase2/
chmod 775 /cluster/data/hg15/bed/affyTranscriptionPhase2/
chmod 664 /cluster/data/hg15/bed/affyTranscriptionPhase2/*
# link /gbdb/hg15 to the data
mkdir /gbdb/hg15/wib/affyTranscriptionPhase2/
chmod 775 /gbdb/hg15/wib/affyTranscriptionPhase2/
cd /gbdb/hg15/wib/affyTranscriptionPhase2/
ln -s /cluster/data/hg15/bed/affyTranscriptionPhase2/* .
cd -
# Load the database tables (using bash); this takes a while
for file in `find ./ -name "*.wig"`; do
    base=`basename $file .wig`
    echo "Doing ${base}Txn"
    hgLoadWiggle -pathPrefix=/gbdb/hg15/wib/affyTranscriptionPhase2/ hg15 ${base}Txn $file
done
# Do the transfrags
cd ../transfrags
# Make names consistent.
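# changeNames2.sh (presumably shipped with the same download as
# makeWigFiles.sh above) renames the transfrag files to our
# underscore-free convention: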
./changeNames2.sh foreach file (`ls -1 *PlusTnFg *MinusTnFg | grep -v '_'`) set base=`echo $file | sed -e 's/@//g'` echo Doing $base hgLoadBed hg15 $base $base end # MYTOUCH FIX - jen - 2006-01-24 sudo mytouch hg15 geneidPep 0404031400.00 sudo mytouch hg15 twinscanPep 0404031400.00 sudo mytouch hg15 dupSpMrna 0402242100.00 sudo mytouch hg15 keggPathway 0402242100.00 sudo mytouch hg15 kgAlias 0402242100.00 sudo mytouch hg15 kgProtAlias 0402242100.00 sudo mytouch hg15 kgXref 0402242100.00
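# (mytouch stamps a table's underlying MySQL files with the given
# touch(1) -t style YYMMDDhhmm.ss timestamp so the displayed table-update
# dates stay historically accurate.  Roughly, and assuming the default
# MySQL data directory -- the path here is a guess, see the mytouch
# script itself for the real details:
#   sudo touch -t 0404031400.00 /var/lib/mysql/hg15/geneidPep.*
# )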