# for emacs: -*- mode: sh; -*- # This file describes how we made the browser database on # NCBI build 32 (March, 2003 freeze) # [For importing GTF tracks, use /projects/compbio/bin/validate_gtf.pl] # HOW TO BUILD A ASSEMBLY FROM NCBI FILES # --------------------------------------- # NOTE: It is best to run most of this stuff on eieio since it # is not averse to handling files > 2Gb # 0) Make gs.15 directory, gs.15/build32 directory, and gs.15/ffa directory. mkdir /cluster/store5/gs.15 mkdir /cluster/store5/gs.15/build32 mkdir /cluster/store5/gs.15/agp mkdir /cluster/store5/gs.15/ffa # Make a symbolic link from /cluster/store1 to this location cd /cluster/store1 ln -s /cluster/store5/gs.15 ./gs.15 # Make a symbolic link from your home directory to the build dir: ln -s /cluster/store5/gs.15/build32 ~/oo # 1) Download seq_contig.md, ncbi_build32.agp, contig_overlaps.agp # and contig fa file into gs.15/build32 directory. # Download all finished agp's and fa's into gs.15/agp # Download sequence.inf and ncbi_build32.fa files into gs.15/ffa, and unzip # ncbi_build32.fa. # *** For build32, files split into reference.agp/reference.fa (main O&O), DR51.agp/DR51.fa, # and DR52.agp/DR52.fa. (alternate versions of MHC region). These were concatenated # to get the ncbi_build32.agp and ncbi_build32.fa # 2) Sanity check things with /cluster/bin/i386/checkYbr build32/ncbi_build32.agp ffa/ncbi_build32.fa \ build32/seq_contig.md # report any errors back to Richa and Greg at NCBI. # 3) Convert fa files into UCSC style fa files and place in "contigs" directory # inside the gs.15/build32 directory cd build32 mkdir contigs /cluster/bin/i386/faNcbiToUcsc -split -ntLast ../ffa/ncbi_build32.fa \ contigs # 3.1) Make a fake chrM contig cd ~/oo mkdir M # copy in chrM.fa, chrM.agp and chrM.gl from previous version. 
mkdir M/NT_999999 cp chrM.fa M/NT_999999/NT_999999.fa # copied chrM.fa, chrM.agp, chrM.gl, chrM.trf.bed, lift directory, NT_999999/NT_999999.fa - not sure which ones we need # 4) Create lift files (this will create chromosome directory structure) and inserts file /cluster/bin/scripts/createNcbiLifts seq_contig.md . # 5) Create contig agp files (will create contig directory structure) /cluster/bin/scripts/createNcbiCtgAgp seq_contig.md ncbi_build32.agp . # 5.1) Create contig gl files ~kent/bin/i386/agpToGl contig_overlaps.agp . -md=seq_contig.md # 6) Create chromosome agp files /cluster/bin/scripts/createNcbiChrAgp . # 6.1) Copy over jkStuff from previous build mkdir jkStuff cp /cluster/store1/gs.14/build31/jkStuff/*.sh jkStuff cp /cluster/store1/gs.14/build31/jkStuff/*.csh jkStuff cp /cluster/store1/gs.14/build31/jkStuff/*.gsub jkStuff # 6.2) Patch in size of chromosome Y into Y/lift/ordered.lft # by grabbing it from the last line of Y/chrY.agp (not needed for build32) # 6.3) Create chromosome gl files jkStuff/liftGl.sh contig.gl # 7) Distribute contig .fa to appropriate directory (assumes all files # are in "contigs" directory). /cluster/bin/scripts/distNcbiCtgFa contigs . rm -r contigs # 8) Reverse complement NT contig fa files that are flipped in the assembly # (uses faRc program) # Not done for build32 because all contigs on + strand. It should be this # way for the rest of the assemblies /cluster/bin/scripts/revCompNcbiCtgFa seq_contig.md . # (NOTE: STS placements may be done at this point before repeat masking and # using the .fa's on NFS for QC analysis - all other placements should be # done after repeat masking and distributing to cluster nodes) # GET FRESH MRNA/EST AND REFSEQ SEQUENCE FROM GENBANK (DONE 03/15/03) # Run this just before the sequence gets here! It's OK to work on # this in parallel with Terry's steps above, or in parallel with # RepeatMasker below, but DO NOT let this hold up RepeatMasker. 
# This will create a genbank.134 directory containing compressed # GenBank flat files and a mrna.134 containing unpacked sequence # info and auxiliary info in a relatively easy to parse (.ra) # format. # Point your browser to ftp://ftp.ncbi.nih.gov/genbank and look at # the README.genbank. Figure out the current release number. (134) lynx ftp://ftp.ncbi.nih.gov/genbank/README.genbank # Consider deleting one of the older genbank releases. It's # good to at least keep one previous release though. # Where there is space make a new genbank directory. Create a # symbolic link to it: ssh eieio mkdir /cluster/store5/genbank.134 ln -s /cluster/store5/genbank.134 ~/genbank cd ~/genbank # ncftp is handy -- it does anonymous login; "prompt" command not needed. ncftp ftp.ncbi.nih.gov cd genbank mget gbpri* gbrod* gbv* gbsts* gbest* gbmam* gbinv* gbbct* gbhtc* gbpat* gbphg* gbpln* quit # This will take at least 2 hours. # Make the refSeq subdir and download files: ssh eieio mkdir -p /cluster/store5/mrna.134/refSeq cd /cluster/store5/mrna.134/refSeq ncftp ftp.ncbi.nih.gov cd refseq/cumulative mget *.Z quit # Get extra info & human proteins from NCBI: wget ftp://ftp.ncbi.nih.gov/refseq/LocusLink/loc2ref wget ftp://ftp.ncbi.nih.gov/refseq/LocusLink/mim2loc wget ftp://ftp.ncbi.nih.gov/refseq/H_sapiens/mRNA_Prot/hs.faa.gz gunzip hs.faa.gz # Unpack this into species-specific fa files and get extra info with: cd /cluster/store5/mrna.134/refSeq cp /cluster/store2/mrna.133/*.fil .. gunzip -c rscu.gbff.Z \ | gbToFaRa -byOrganism=org ../anyRna.fil refSeq.{fa,ra,ta} stdin # Now unpack and organize the larger genbank mrna/est sequences... 
ssh eieio cd /cluster/store5/mrna.134 # Make the RNAs for all organisms gunzip -c \ /cluster/store5/genbank.134/gb{pri,rod,v,mam,inv,bct,htc,pat,phg,pln}* \ | gbToFaRa -byOrganism=org anyRna.fil mrna.{fa,ra,ta} stdin # Make the ESTs for all organisms gunzip -c /cluster/store5/genbank.134/gbest*.gz \ | gbToFaRa anyRna.fil est.{fa,ra,ta} stdin -byOrganism=org # Make the nonhuman RNAs gunzip -c \ /cluster/store5/genbank.134/gb{pri,rod,v,mam,inv,bct,htc,pat,phg,pln}* \ | gbToFaRa humanXenoRna.fil humanXenoRna.{fa,ra,ta} stdin # Make the nonMouse RNAs gunzip -c \ /cluster/store5/genbank.134/gb{pri,rod,v,mam,inv,bct,htc,pat,phg,pln}* \ | gbToFaRa mouseXenoRna.fil mouseXenoRna.{fa,ra,ta} stdin # Make the nonRat RNAs gunzip -c \ /cluster/store5/genbank.134/gb{pri,rod,v,mam,inv,bct,htc,pat,phg,pln}* \ | gbToFaRa ratXenoRna.fil ratXenoRna.{fa,ra,ta} stdin # Make the nonhuman ESTs gunzip -c /cluster/store5/genbank.134/gbest*.gz \ | gbToFaRa humanXenoRna.fil humanXenoEst.{fa,ra,ta} stdin # Split the really large ones into smaller pieces for more efficient # cluster runs. mkdir humanXenoRnaSplit humanXenoEstSplit faSplit about humanXenoRna.fa 10000000 humanXenoRnaSplit/xenoRna faSplit about humanXenoEst.fa 70000000 humanXenoEstSplit/xenoEst cd org/Homo_sapiens mkdir estSplit faSplit about est.fa 250000000 estSplit/est # Distribute the files to /iscratch/i/ so they're all ready to be aligned. 
ssh kkr1u00 mkdir -p /iscratch/i/mrna.134/Homo_sapiens cp -p /cluster/store5/mrna.134/refSeq/org/Homo_sapiens/refSeq.fa \ /iscratch/i/mrna.134/Homo_sapiens/ cp -p /cluster/store5/mrna.134/org/Homo_sapiens/mrna.fa \ /iscratch/i/mrna.134/Homo_sapiens/ cp -p /cluster/store5/mrna.134/org/Homo_sapiens/estSplit/*.fa \ /iscratch/i/mrna.134/Homo_sapiens/ cp -p /cluster/store5/mrna.134/humanXenoRnaSplit/*.fa \ /iscratch/i/mrna.134/Homo_sapiens/ cp -p /cluster/store5/mrna.134/humanXenoEstSplit/*.fa \ /iscratch/i/mrna.134/Homo_sapiens/ ~kent/bin/iSync # REPEAT MASKING (DONE 03/17/03) # Split contigs, run RepeatMasker, lift results # Notes: # * If there is a new version of RepeatMasker, build it and ask the admins # to binrsync it (kkstore:/scratch/hg/RepeatMasker/*). # * Contigs (*/NT_*/NT_*.fa) are split into 500kb chunks to make # RepeatMasker runs manageable on the cluster ==> results need lifting. # * For the NCBI assembly we repeat mask on the sensitive mode setting # (RepeatMasker -s) #- Split contigs into 500kb chunks: ssh eieio cd ~/hg14 foreach d ( ?{,?}/NT_* ) cd $d set contig = $d:t faSplit size $contig.fa 500000 ${contig}_ -lift=$contig.lft \ -maxN=500000 cd ../.. end #- Make the run directory and job list: cd ~/hg14 mkdir RMRun rm -f RMRun/RMJobs touch RMRun/RMJobs foreach d ( ?{,?}/NT_* ) foreach f ( /cluster/store5/gs.15/build32/$d/NT_*_*.fa ) set f = $f:t echo /cluster/bin/scripts/RMLocalSens \ /cluster/store5/gs.15/build32/$d $f \ '{'check out line+ /cluster/store5/gs.15/build32/$d/$f.out'}' \ >> RMRun/RMJobs end end #- Do the run ssh kk cd ~/hg14/RMRun para create RMJobs para try, para check, para check, para push, para check,... #- Now while that's running, run TRF (simpleRepeat), and RefSeq #- alignments, in parallel. Also, create the database and the #- tracks that don't rely on cluster runs or on masked sequence. 
#- Lift up the split-contig .out's to contig-level .out's ssh eieio cd ~/hg14 foreach d ( ?{,?}/NT_* ) cd $d set contig = $d:t liftUp $contig.fa.out $contig.lft warn ${contig}_*.fa.out > /dev/null cd ../.. end #- Lift up RepeatMask .out files to chromosome coordinates via tcsh jkStuff/liftOut2.sh #- By this point, the database should have been created (below): ssh hgwdev cd ~/hg14 hgLoadOut hg14 ?/*.fa.out ??/*.fa.out # VERIFY REPEATMASKER RESULTS (DONE 03/18/03) # Run featureBits on hg14 and on a comparable genome build, and compare: ssh hgwdev featureBits hg14 rmsk # --> 1384772888 bases of 3060248386 (45.250%) in intersection featureBits hg13 rmsk # --> 1383216615 bases of 3070074689 (45.055%) in intersection # Validate the RepeatMasking by randomly selecting a few NT_*.fa files, # manually repeat masking them and matching the .out files with the # related part in the chromosome-level .out files. For example: ssh kkr1u00 # Pick arbitrary values of $chr and $nt and run these commands: set chr = 1 set nt = NT_004321 mkdir /tmp/RMTest/$nt cd /tmp/RMTest/$nt cp ~/hg14/$chr/$nt/$nt.fa . /scratch/hg/RepeatMasker/RepeatMasker -s $nt.fa # Compare $nt.fa.out against the original ~/hg14/$chr/$nt/$nt.fa.out # and against the appropriate part of $chr/chr$chr.fa.out (use the coords # for $nt given in seq_contig.md). # MAKE LIFTALL.LFT, NCBI.LFT (DONE 03/16/03) cd ~/hg14 cat ?{,?}/lift/{ordered,random}.lft > jkStuff/liftAll.lft # Create jkStuff/ncbi.lft for lifting stuff built with the NCBI assembly. # Note: this ncbi.lift will not lift floating contigs to chr_random coords, # but it will show the strand orientation of the floating contigs # (grep for '|'). mdToNcbiLift seq_contig.md jkStuff/ncbi.lft # If a lift file has been edited (e.g. as in 6.2.5 above), edit ncbi.lft # to match. 
# SIMPLE REPEAT [TRF] TRACK (DONE 03/16/03) # Distribute contigs to /iscratch/i ssh kkr1u00 rm -rf /iscratch/i/gs.15/build32/contigs mkdir -p /iscratch/i/gs.15/build32/contigs cd ~/hg14 foreach c (?{,?}) echo copying contigs of chr$c cp -p $c/NT_*/NT_??????.fa /iscratch/i/gs.15/build32/contigs end # Make sure the total size looks like what you'd expect: du -sh /iscratch/i/gs.15/build32/contigs ~kent/bin/iSync # Create cluster parasol job like so: ssh kk mkdir -p ~/hg14/bed/simpleRepeat cd ~/hg14/bed/simpleRepeat cp ~/hg13/bed/simpleRepeat/gsub . mkdir trf ls -1S /iscratch/i/gs.15/build32/contigs/*.fa > genome.lst echo "" > dummy.lst gensub2 genome.lst dummy.lst gsub spec para create spec para try para check para push para check # When cluster run is done liftUp simpleRepeat.bed ~/hg14/jkStuff/liftAll.lft warn trf/*.bed # Load into the database: ssh hgwdev cd ~/hg14/bed/simpleRepeat hgLoadBed hg14 simpleRepeat simpleRepeat.bed \ -sqlTable=$HOME/src/hg/lib/simpleRepeat.sql # REFSEQ ALIGNMENTS AND REFGENE TRACK PREP (DONE 03/16/03) # Make sure contigs have been distributed to /iscratch/i/ (should have # been done for simpleRepeat/TRF above) # Make sure refSeq.fa is under /iscratch/i too (GENBANK above) ssh kk mkdir ~/hg14/bed/refSeq cd ~/hg14/bed/refSeq mkdir psl ls -1S /iscratch/i/gs.15/build32/contigs/*.fa > genome.lst ls -1 /iscratch/i/mrna.134/Homo_sapiens/refSeq.fa > mrna.lst cp ~/hg13/bed/refSeq/gsub . gensub2 genome.lst mrna.lst gsub spec para create spec para try, para check, para push, para check.... para time > time # When cluster is done, process refSeq alignments into near best in genome. ssh eieio cd ~/hg14/bed/refSeq pslSort dirs raw.psl /tmp psl pslReps -minCover=0.2 -sizeMatters -minAli=0.98 -nearTop=0.002 raw.psl \ contig.psl /dev/null liftUp -nohead all_refSeq.psl ../../jkStuff/liftAll.lft carry contig.psl pslSortAcc nohead chrom /tmp all_refSeq.psl pslCat -dir chrom > refSeqAli.psl # After the database has been created, go to "LOAD REFGENE" below... 
# PROCESS SIMPLE REPEATS INTO MASK (DONE 03/16/03) # After the simpleRepeats track has been built, make a filtered version # of the trf output: keep trf's with period <= 12: ssh eieio cd ~/hg14/bed/simpleRepeat mkdir -p trfMask foreach f (trf/NT_*.bed) awk '{if ($5 <= 12) print;}' $f > trfMask/$f:t end # Lift up filtered trf output to chrom coords as well: cd ~/hg14 mkdir -p bed/simpleRepeat/trfMaskChrom foreach c (?{,?}) if (-e $c/lift/ordered.lst) then perl -wpe 's@(\S+)@bed/simpleRepeat/trfMask/$1.bed@' \ $c/lift/ordered.lst > $c/lift/oTrf.lst liftUp bed/simpleRepeat/trfMaskChrom/chr$c.bed \ jkStuff/liftAll.lft warn `cat $c/lift/oTrf.lst` endif if (-e $c/lift/random.lst) then perl -wpe 's@(\S+)@bed/simpleRepeat/trfMask/$1.bed@' \ $c/lift/random.lst > $c/lift/rTrf.lst liftUp bed/simpleRepeat/trfMaskChrom/chr${c}_random.bed \ jkStuff/liftAll.lft warn `cat $c/lift/rTrf.lst` endif end # MASK SEQUENCE WITH BOTH REPEATMASKER AND SIMPLE REPEAT/TRF (DONE 03/17/03) # This used to be done right after RepeatMasking. Now, we mask with # TRF as well, so do this after the "PROCESS SIMPLE REPEATS" step above. ssh eieio cd ~/hg14 #- Soft-mask (lower-case) the contig and chr .fa's ./jkStuff/makeFaMasked.sh #- Make hard-masked .fa.masked files as well: ./jkStuff/makeHardMasked.sh #- Rebuild the nib, mixedNib, maskedNib files: ./jkStuff/makeNib.sh # Copy the masked contig fa to /iscratch and /scratch: ssh kkr1u00 rm -rf /iscratch/i/gs.15/build32/trfFa mkdir -p /iscratch/i/gs.15/build32/trfFa cp -p ~/hg14/?{,?}/NT_*/NT_??????.fa /iscratch/i/gs.15/build32/trfFa ~kent/bin/iSync ssh kkstore rm -rf /scratch/hg/gs.15/build32/trfFa mkdir -p /scratch/hg/gs.15/build32/trfFa cp -p ~/hg14/?{,?}/NT_*/NT_??????.fa /scratch/hg/gs.15/build32/trfFa # PREPARE CLUSTER FOR BLASTZ RUN (DONE 03/17/03) # This needs to be done after trf-masking and nib generation. 
ssh eieio # Extract lineage-specific repeats using Arian Smit's script: mkdir -p ~/hg14/bed/linSpecRep cd ~/hg14/bed/linSpecRep foreach f (~/hg14/*/*.out) ln -sf $f . end /cluster/bin/scripts/primateSpecificRepeats.pl *.out /cluster/bin/scripts/perl-rename 's/(\.fa|\.nib)//' *.out.*spec /cluster/bin/scripts/perl-rename 's/\.(rod|prim)spec/.spec/' *.out.*spec rm *.out # Copy files to the kkstore:/scratch ssh kkstore # lineage-specific repeats: cd ~/hg14/bed mkdir -p /scratch/hg/gs.15/build32 rm -rf /scratch/hg/gs.15/build32/linSpecRep cp -Rp linSpecRep /scratch/hg/gs.15/build32 # RepeatMasker .out: cd ~/hg14 rm -rf /scratch/hg/gs.15/build32/rmsk mkdir -p /scratch/hg/gs.15/build32/rmsk cp -p ?{,?}/chr?{,?}{,_random}.fa.out /scratch/hg/gs.15/build32/rmsk # Chrom-level mixed nibs that have been repeat- and trf-masked: rm -rf /scratch/hg/gs.15/build32/chromTrfMixedNib mkdir -p /scratch/hg/gs.15/build32/chromTrfMixedNib cp -p mixedNib/chr*.nib /scratch/hg/gs.15/build32/chromTrfMixedNib # Ask cluster-admin@cse.ucsc.edu to binrsync /scratch/hg to clusters # Copy to /iscratch as well so we can run blastz before binrsync finishes: rm -rf /iscratch/i/gs.15/build32/{linSpecRep,rmsk,chromTrfMixedNib} cp -Rp /scratch/hg/gs.15/build32/{linSpecRep,rmsk,chromTrfMixedNib} \ /iscratch/i/gs.15/build32/ ssh kkr1u00 ~kent/bin/iSync # Jim's comments Feb 12 '03 about the order in which to run blastz: # In general we should do # 1) hg/mm # 2) mm/rn # 3) rn/hg # 4) hg/hg # 5) mm/mm # 6) rn/rn # There is now an 'axtSwap' program that might let us # get out of having to run the inverse of 1,2 & 3, though # 2 in particular is so fast perhaps it's just as well to # do the inverse explicitly. # MAKE DOWNLOADABLE SEQUENCE FILES (DONE 03/20/03) ssh eieio cd ~/hg14 #- Build the .zip files ./jkStuff/zipAll.sh |& tee zipAll.log #- Look at zipAll.log to make sure all file lists look reasonable. 
#- Check zip file integrity: mkdir zip mv *.zip* zip cd zip foreach f (*.zip) unzip -t $f > $f.test tail -1 $f.test end wc -l *.zip.test #- Copy the .zip files to hgwdev:/usr/local/apache/... ssh hgwdev cd ~/hg14/zip # Edit cpToWeb.sh to contain the correct destination path. ../jkStuff/cpToWeb.sh cd /usr/local/apache/htdocs/goldenPath/10mar2003 #- Take a look at bigZips/* and chromosomes/*, update their README.txt's # CREATING DATABASE (DONE 03/16/03) ssh hgwdev # if you haven't already: ln -s /cluster/store5/gs.15/build32 ~/oo ln -s /cluster/store5/gs.15/build32 ~/hg14 # Make sure there is at least 5 gig free on hgwdev:/var/lib/mysql df -h /var/lib/mysql # Create the database. echo 'create database hg14' | hgsql hg13 # make a semi-permanent read-only alias (add this to your .cshrc/.bashrc): alias hg14 mysql -u hguser -phguserstuff -A hg14 # Initialize the relational-mrna and external sequence info tables: hgLoadRna new hg14 # Copy over grp table (for track grouping) from another database: echo "create table grp (PRIMARY KEY(NAME)) select * from hg13.grp" \ | hgsql hg14 # SEQUENCE INFO: CHROMINFO (DONE 03/16/03) ssh eieio cd ~/hg14 # Sanity-check */lift/ordered.lft length vs. agp length: foreach c ( ?{,?} ) if (-e $c/lift/ordered.lst) then set lftLen = `tail -1 $c/lift/ordered.lft | awk '{print $5;}'` set agpLen = `tail -1 $c/chr$c.agp | awk '{print $3;}'` if ($lftLen != $agpLen) then echo "ERROR: chr$c : lftLen=$lftLen, agpLen=$agpLen" else echo "chr$c : $lftLen" endif endif end # Make chr*.fa from contig .fa tcsh jkStuff/chrFa.sh # Make unmasked nibs -- necessary for building chromInfo. mkdir nib foreach f (?{,?}/chr?{,?}{,_random}.fa) echo making unmasked nib for $f faToNib $f nib/$f:t:r.nib end # Make symbolic links from /gbdb/hg14/nib to the real nibs. ssh hgwdev mkdir -p /gbdb/hg14/nib foreach f (/cluster/store5/gs.15/build32/nib/chr*.nib) ln -s $f /gbdb/hg14/nib end # Load /gbdb/hg14/nib paths into database and save size info. 
hgsql hg14 < ~/src/hg/lib/chromInfo.sql cd ~/hg14 hgNibSeq -preMadeNib hg14 /gbdb/hg14/nib ?{,?}/chr?{,?}{,_random}.fa echo "select chrom,size from chromInfo" | hgsql -N hg14 > chrom.sizes # O+O: ASSEMBLY [GOLD], GAP, COVERAGE, MAP CONTIGS TRACKS (DONE 03/16/03) # Store o+o info in database. # Note: for build31, Terry specially requested these files from NCBI: # finished.finf # draft.finf # predraft.finf # extras.finf ssh eieio cd /cluster/store5/gs.15/build32 if (-f contig_overlaps.agp) then jkStuff/liftGl.sh contig.gl else hgGoldGapGl -noGl hg14 /cluster/store5/gs.15 build32 echo "" echo "*** Note from makeHg14.doc:" echo "Come back to this step later when we have contig_overlaps.agp\!" endif ssh hgwdev cd /cluster/store5/gs.15/build32 if (-f contig_overlaps.agp) then hgGoldGapGl hg14 /cluster/store5/gs.15 build32 cd /cluster/store5/gs.15 hgClonePos hg14 build32 ffa/sequence.inf /cluster/store5/gs.15 -maxErr=3 endif cd /cluster/store5/gs.15 hgCtgPos hg14 build32 # LOAD REFGENE (DONE 03/16/03) # Do this after the database has been created and the RefSeq alignments # are done (above) # Load refSeq alignments into database ssh hgwdev cd ~/hg14/bed/refSeq hgLoadPsl hg14 -tNameIx refSeqAli.psl # Make /gbdb symlinks for refSeq.fa (not .ra) mkdir -p /gbdb/hg14/mrna.134 cd /gbdb/hg14/mrna.134 ln -s /cluster/store5/mrna.134/refSeq/org/Homo_sapiens/refSeq.fa # Load the refSeq mRNA cd /cluster/store2/tmp hgLoadRna add -type=refSeq hg14 /gbdb/hg14/mrna.134/refSeq.fa \ /cluster/store5/mrna.134/refSeq/org/Homo_sapiens/refSeq.ra cd ~/hg14/bed/refSeq hgRefSeqMrna hg14 /gbdb/hg14/mrna.134/refSeq.fa \ /cluster/store5/mrna.134/refSeq/org/Homo_sapiens/refSeq.ra \ all_refSeq.psl \ /cluster/store5/mrna.134/refSeq/loc2ref \ /cluster/store5/mrna.134/refSeq/hs.faa \ /cluster/store5/mrna.134/refSeq/mim2loc # Don't worry about the "No gene name" errors # Add RefSeq status info hgRefSeqStatus -human hg14 /cluster/store5/mrna.134/refSeq/loc2ref # Create precomputed join of refFlat and refGene: 
echo 'CREATE TABLE refFlat \ (KEY geneName (geneName), KEY name (name), KEY chrom (chrom)) \ SELECT refLink.name as geneName, refGene.* \ FROM refLink,refGene \ WHERE refLink.mrnaAcc = refGene.name' \ | hgsql hg14 # GC PERCENT (DONE 03/16/03) ssh hgwdev mkdir -p ~/hg14/bed/gcPercent cd ~/hg14/bed/gcPercent hgsql hg14 < ~/src/hg/lib/gcPercent.sql hgGcPercent hg14 ../../nib # MAKE HGCENTRALTEST ENTRY AND TRACKDB TABLE (DONE 03/16/03) ssh hgwdev # Enter hg14 into hgcentraltest.dbDb so test browser knows about it: echo 'insert into dbDb values("hg14", "Human Mar. 2003", \ "/gbdb/hg14/nib", "Human", "DUSP18", 1, 80, "Human");' \ | hgsql -h genome-testdb hgcentraltest # Make trackDb table so browser knows what tracks to expect: cd ~/src/hg/makeDb/trackDb cvs up -d -P . # Edit that makefile to add hg14 in all the right places and do make update make alpha cvs commit makefile # PRELOAD MRNA/EST SEQUENCE INFO INTO DATABASE (DONE 03/16/03) # Make /gbdb symlinks for sequence .fa (not .ra) mkdir -p /gbdb/hg14/mrna.134 cd /gbdb/hg14/mrna.134 ln -s /cluster/store5/mrna.134/org/Homo_sapiens/mrna.fa ln -s /cluster/store5/mrna.134/org/Homo_sapiens/est.fa ln -s /cluster/store5/mrna.134/humanXenoRna.fa ln -s /cluster/store5/mrna.134/humanXenoEst.fa # Store the sequence (non-alignment) info in database. 
cd /cluster/store2/tmp hgLoadRna add -type=mRNA hg14 /gbdb/hg14/mrna.134/mrna.fa \ /cluster/store5/mrna.134/org/Homo_sapiens/mrna.ra hgLoadRna add -type=EST hg14 /gbdb/hg14/mrna.134/est.fa \ /cluster/store5/mrna.134/org/Homo_sapiens/est.ra hgLoadRna add -type=xenoRna hg14 /gbdb/hg14/mrna.134/humanXenoRna.fa \ /cluster/store5/mrna.134/humanXenoRna.ra hgLoadRna add -type=xenoEst hg14 /gbdb/hg14/mrna.134/humanXenoEst.fa \ /cluster/store5/mrna.134/humanXenoEst.ra # MAKE HGCENTRALTEST BLATSERVERS ENTRY (DONE 03/20/03) ssh hgwdev # Substitute BBB with the correct number for the hostname: echo 'insert into blatServers values("hg14", "blat11", "17778", "1"); \ insert into blatServers values("hg14", "blat11", "17779", "0");' \ | hgsql -h genome-testdb hgcentraltest # MAKING AND STORING mRNA AND EST ALIGNMENTS (DONE 03/18/03) # Make sure that /scratch/hg/gs.15/build32/trfFa is loaded with NT_*.fa # and has been pushed to the big cluster nodes. (MASK SEQUENCE above) # Make sure mrna/est .fa's are under /iscratch/i too (GENBANK above) ssh kk mkdir -p ~/hg14/bed/{mrna,est}/psl cd ~/hg14/bed/mrna ls -1S /scratch/hg/gs.15/build32/trfFa/* > genome.lst ls -1S /iscratch/i/mrna.134/Homo_sapiens/mrna.fa > mrna.lst cp ~/hg13/bed/mrna/gsub . gensub2 genome.lst mrna.lst gsub spec para create spec para try cd ~/hg14/bed/est ls -1S /scratch/hg/gs.15/build32/trfFa/* > genome.lst ls -1S /iscratch/i/mrna.134/Homo_sapiens/est*.fa > mrna.lst # Using split est fa -- so create separate output dirs and special gsub: foreach f (`cat mrna.lst`) mkdir psl/$f:t:r end echo '#LOOP \ /cluster/home/kent/bin/i386/blat {check in line+ $(path1)} {check in line+ $(path2)} -ooc={check in exists /scratch/hg/h/11.ooc} {check out line+ psl/$(root2)/$(root1)_$(root2).psl} \ #ENDLOOP' > gsub gensub2 genome.lst mrna.lst gsub spec para create spec para try # In each dir (~/hg14/bed/mrna, ~/hg14/bed/est): para check, para push, para check.... 
# para time > time # Process mRNA and EST alignments into near best in genome. cd ~/hg14/bed/mrna pslSort dirs raw.psl /tmp psl pslReps -minAli=0.98 -sizeMatters -nearTop=0.005 raw.psl contig.psl \ /dev/null liftUp -nohead all_mrna.psl ../../jkStuff/liftAll.lft carry contig.psl pslSortAcc nohead chrom /tmp all_mrna.psl cd ~/hg14/bed/est pslSort dirs raw.psl /cluster/store2/tmp psl/est* pslReps -minAli=0.98 -sizeMatters -nearTop=0.005 raw.psl contig.psl \ /dev/null liftUp -nohead all_est.psl ../../jkStuff/liftAll.lft carry contig.psl pslSortAcc nohead chrom /cluster/store3/tmp all_est.psl # Load mRNA alignments into database. ssh hgwdev cd ~/hg14/bed/mrna/chrom rm -f *_mrna.psl foreach i (*.psl) mv $i $i:r_mrna.psl end hgLoadPsl hg14 *.psl cd .. hgLoadPsl hg14 all_mrna.psl -nobin # Load EST alignments into database. ssh hgwdev cd ~/hg14/bed/est/chrom rm -f *_est.psl foreach i (*.psl) mv $i $i:r_est.psl end hgLoadPsl hg14 *.psl cd .. hgLoadPsl hg14 all_est.psl -nobin # Sequence info should have already been loaded into database (PRELOAD) # SPLICED ESTS (INTRONEST) (DONE 03/18/03) # Create subset of ESTs with introns and load into database. ssh eieio cd ~/hg14 tcsh jkStuff/makeIntronEst.sh ssh hgwdev cd ~/hg14/bed/est/intronEst hgLoadPsl hg14 *.psl # ESTORIENTINFO, MRNAORIENTINFO, GENE BOUNDS (RNACLUSTER) (DONE 03/22/03) # Put orientation info on ESTs and mRNAs into database: ssh eieio cd ~/hg14/bed/est pslSortAcc nohead contig /cluster/store3/tmp contig.psl cd ~/hg14/bed/mrna pslSortAcc nohead contig /cluster/store3/tmp contig.psl # Distribute the est and mrna psl files to /iscratch/i ssh kkr1u00 rm -rf /iscratch/i/gs.15/build32/bed mkdir -p /iscratch/i/gs.15/build32/bed cp -r ~/hg14/bed/est/contig /iscratch/i/gs.15/build32/bed/est cp -r ~/hg14/bed/mrna/contig /iscratch/i/gs.15/build32/bed/mrna ~kent/bin/iSync # mrna: use big cluster. 
ssh kk mkdir -p ~/hg14/bed/mrnaOrientInfo/oi cd ~/hg14/bed/mrnaOrientInfo ls -1S /iscratch/i/gs.15/build32/bed/mrna/*.psl > psl.lst ls -1S /iscratch/i/mrna.134/Homo_sapiens/mrna*.fa > mrna.lst cp ~/hg13/bed/mrnaOrientInfo/gsub . # Edit gsub to point to the correct paths. gensub2 psl.lst mrna.lst gsub spec para create spec para try para check, para push, para check, .... # When the cluster run is done do: ssh hgwdev cd ~/hg14/bed/mrnaOrientInfo liftUp mrnaOrientInfo.bed ~/hg14/jkStuff/liftAll.lft warn oi/*.tab hgLoadBed hg14 mrnaOrientInfo mrnaOrientInfo.bed \ -sqlTable=$HOME/kent/src/hg/lib/mrnaOrientInfo.sql > /dev/null # est: use small cluster (I/O intensive). Use 2-level output dir # (input est.fa has been split into multiple files). ssh kkr1u00 mkdir -p ~/hg14/bed/estOrientInfo/oi cd ~/hg14/bed/estOrientInfo foreach f (`cat mrna.lst`) mkdir oi/$f:t:r end ls -1S /iscratch/i/gs.15/build32/bed/est/*.psl > psl.lst ls -1S /iscratch/i/mrna.134/Homo_sapiens/est*.fa > mrna.lst cp ~/hg13/bed/estOrientInfo/gsub . # Edit gsub to point to the correct paths. gensub2 psl.lst mrna.lst gsub spec para create spec para try para check, para push, para check, .... # When the cluster run is done do: ssh hgwdev cd ~/hg14/bed/estOrientInfo # oi/*/*.tab -> argument list too long... so cat the lowest level together: foreach d (oi/*) cat $d/*.tab > $d.tab end liftUp estOrientInfo.bed ~/hg14/jkStuff/liftAll.lft warn oi/*.tab bedSort estOrientInfo.bed estOrientInfo.bed hgLoadBed hg14 estOrientInfo estOrientInfo.bed \ -sqlTable=$HOME/kent/src/hg/lib/estOrientInfo.sql > /dev/null # Create rnaCluster table (depends on {est,mrna}OrientInfo above) cd ~/hg14 # Create a list of accessions that come from RAGE libraries and need to # be excluded. 
(added by Chuck Wed Nov 27 13:09:07 PST 2002) ~/kent/src/hg/geneBounds/clusterRna/generateRageAccList.csh hg14 \ rage.libs mkdir -p ~/hg14/bed/rnaCluster/chrom # Exclude accessions in the RAGE file foreach f (?{,?}/chr*.fa) set c = $f:t:r set out = bed/rnaCluster/chrom/$c.bed echo clusterRna -mrnaExclude=hg14.rage.libs hg14 /dev/null $out -chrom=$c clusterRna -mrnaExclude=hg14.rage.libs hg14 /dev/null $out -chrom=$c end cd bed/rnaCluster hgLoadBed hg14 rnaCluster chrom/*.bed > /dev/null # GENEBANDS (DONE 03/18/03) # Create precomputed geneBands table: ssh hgwdev hgGeneBands hg14 geneBands.txt hgsql hg14 < ~/kent/src/hg/lib/geneBands.sql echo "load data local infile 'geneBands.txt' into table geneBands;" \ | hgsql hg14 rm geneBands.txt # PRODUCING GENSCAN PREDICTIONS (DONE 03/23/03) ssh eieio mkdir -p ~/hg14/bed/genscan cd ~/hg14/bed/genscan # Make 3 subdirectories for genscan to put their output files in mkdir -p gtf pep subopt # Generate a list file, genome.list, of all the contigs # *that do not have pure Ns* (due to heterochromatin, unsequencable # stuff) which would cause genscan to run forever. rm -f genome.list touch genome.list foreach f ( `ls -1S /cluster/store5/gs.15/build32/?{,?}/NT_*/NT_??????.fa.masked` ) egrep '[ACGT]' $f > /dev/null if ($status == 0) echo $f >> genome.list end # Log into kkr1u00 (not kk!). kkr1u00 is the driver node for the small # cluster (kkr2u00 - kkr8u00). Genscan has problem running on the # big cluster, due to limitation of memory and swap space on each # processing node). ssh kkr1u00 # Create template file, gsub, for gensub2. 
For example (3-line file): #LOOP rm -f genome.list /cluster/home/kent/bin/i386/gsBig {check in line+ $(path1)} {check out line gtf/$(root1).gtf} -trans={check out line pep/$(root1).pep} -subopt={check out line subopt/$(root1).bed} -exe=/cluster/home/fanhsu/projects/compbio/bin/genscan-linux/genscan -par=/cluster/home/fanhsu/projects/compbio/bin/genscan-linux/HumanIso.smat -tmp=/tmp -window=2400000 #ENDLOOP echo "" > dummy.list gensub2 genome.list dummy.list gsub jobList para create jobList para try para check para push # Issue either one of the following two commands to check the # status of the cluster and your jobs, until they are done. parasol status para check # If there were out-of-memory problems (run "para problems"), then # re-run those jobs by hand but change the -window arg from 2400000 # to 1200000. In build32, this was 22/NT_011519. # Convert these to chromosome level files as so: ssh eieio cd ~/hg14/bed/genscan liftUp genscan.gtf ../../jkStuff/liftAll.lft warn gtf/NT*.gtf liftUp genscanSubopt.bed ../../jkStuff/liftAll.lft warn subopt/NT*.bed > \ /dev/null cat pep/*.pep > genscan.pep # Load into the database as so: ssh hgwdev cd ~/hg14/bed/genscan ldHgGene hg14 genscan genscan.gtf hgPepPred hg14 generic genscanPep genscan.pep hgLoadBed hg14 genscanSubopt genscanSubopt.bed > /dev/null # CPGISLANDS (DONE 03/17/03) ssh eieio mkdir -p ~/hg14/bed/cpgIsland cd ~/hg14/bed/cpgIsland # Build software emailed from Asif Chinwalla (achinwal@watson.wustl.edu) # copy the tar file to the current directory tar xvf cpg_dist.tar cd cpg_dist gcc readseq.c cpg_lh.c -o cpglh.exe cd .. # cpglh.exe requires hard-masked (N) .fa's. # There may be warnings about "bad character" for IUPAC ambiguous # characters like R, S, etc. Ignore the warnings. foreach f (../../?{,?}/chr?{,?}{,_random}.fa.masked) set fout=$f:t:r:r.cpg echo producing $fout... ./cpg_dist/cpglh.exe $f > $fout end cp ~/hg13/bed/cpgIsland/filter.awk . 
awk -f filter.awk chr*.cpg > cpgIsland.bed ssh hgwdev cd ~/hg14/bed/cpgIsland hgLoadBed hg14 cpgIsland -tab -noBin \ -sqlTable=$HOME/kent/src/hg/lib/cpgIsland.sql cpgIsland.bed CREATE GOLDEN TRIANGLE (todo) # Make sure that rnaCluster table is in place. Then extract Affy # expression info into a form suitable for Eisen's clustering program with: cd ~/hg14/bed mkdir triangle cd triangle eisenInput hg14 affyHg10.txt Transfer this to Windows and do k-means clustering with k=200 with cluster. Transfer results file back to ~/hg14/bed/triangle/affyCluster_K_G200.kgg. Then do promoSeqFromCluster hg14 1000 affyCluster_K_G200.kgg kg200.unmasked Then RepeatMask the .fa file inkg200.unmasked, and copy masked versions to kg200. Then cat kg200/*.fa > all1000.fa and set up cluster Improbizer run to do 100 controls for every real run on each - putting the output in imp.200.1000.e. When improbizer run is done make a file summarizing the runs as so: cd imp.200.1000.e motifSig ../imp.200.1000.e.iri ../kg200 motif control* get rid of insignificant motifs with: cd .. 
awk '{if ($2 > $3) print; }' imp.200.1000.e.iri > sig.200.1000.e.iri turn rest into just dnaMotifs with iriToDnaMotif sig.200.1000.e.iri motif.200.1000.e.txt Extract all promoters with featureBits hg14 rnaCluster:upstream:1000 -bed=upstream1000.bed -fa=upstream1000.fa Locate motifs on all promoters with dnaMotifFind motif.200.1000.e.txt upstream1000.fa hits.200.1000.e.txt -rc -markov=2 liftPromoHits upstream1000.bed hits.200.1000.e.txt triangle.bed CREATE STS/FISH/BACENDS/CYTOBANDS DIRECTORY STRUCTURE AND SETUP (DONE 3/15/2003) o - Create directory structure to hold information for these tracks cd /projects/hg2/booch/psl/ change Makefile parameters for OOVERS, GSVERS, PREVGS, PREVOO make new o - Update all Makefiles with latest OOVERS and GSVERS, DATABASE, and locations of .fa files o - Create accession_info file make accession_info.rdb UPDATE STS INFORMATION (DONE 3/15/2003) o - Download and unpack updated information from dbSTS: In a web browser, go to ftp://ftp.ncbi.nih.gov/repository/dbSTS/. Download dbSTS.sts, dbSTS.aliases, and dbSTS.FASTA.dailydump.Z to /projects/hg2/booch/psl/update -Unpack dbSTS.FASTA.dailydump.Z gunzip dbSTS.FASTA.dailydump.Z o - Create updated files cd /projects/hg2/booch/psl/update edit Makefile to latest sts.X version from PREV (currently sts.4) make update o - Make new directory for this info and move files there ssh kks00 mkdir /cluster/store1/sts.5 cp all.STS.fa /cluster/store1/sts.5 cp all.primers /cluster/store1/sts.5 cp all.primers.fa /cluster/store1/sts.5 o - Copy new files to cluster ssh kkstore cd /cluster/store1/sts.5 cp /cluster/store1/sts.5/*.* /scratch/hg/STS ask for propagation from sysadmin STS ALIGNMENTS (DONE 3/19/2003) (alignments done without RepeatMasking, so start ASAP!) 
o - Create full sequence alignments ssh kk cd /cluster/home/booch/sts - update Makefile with latest OOVERS and GSVERS make new make jobList.scratch (if contig files propagated to nodes) - or _ make jobList.disk (if contig files not propagated) para create jobList para push (or para try/para check if want to make sure it runs) make stsMarkers.psl o - Copy files to final destination and remove originals ssh kks00 make copy.assembly make clean.assembly o - Create primer alignments ssh kk cd /cluster/home/booch/primers - update Makefile with latest OOVERS and GSVERS make new make jobList.scratch (if contig files propagated to nodes) - or _ make jobList.disk (if contig files not propagated) para create jobList para push (or para try/para check if want to make sure it runs) make primers.psl o - Copy files to final destination and remove ssh kks00 make copy.assembly make clean.assembly o - Create ePCR alignments ssh kk cd /cluster/home/booch/epcr - update Makefile with latest OOVERS and GSVERS make new make jobList.scratch (if contig files propagated to nodes) - or _ make jobList.disk (if contig files not propagated) para create jobList para push (or para try/para check if want to make sure it runs) make primers.psl o - Copy files to final destination and remove ssh kks00 make copy.assembly make clean.assembly CREATE AND LOAD STS MARKERS TRACK (DONE 3/19/2003) o - Copy in current stsInfo2.bed and stsAlias.bed files cd /projects/hg2/booch/psl/gs.15/build32 cp ../update/stsInfo2.bed . cp ../update/stsAlias.bed . o - Create final version of sts sequence placements ssh kks00 cd /projects/hg2/booch/psl/gs.15/build32/sts make stsMarkers.final o - Create final version of primers placements cd /projects/hg2/booch/psl/gs.15/build32/primers cp /cluster/store1/sts.5/all.primers . 
make primers.final o - Create bed file cd /projects/hg2/booch/psl/gs.15/build32 make stsMap.bed o - Create database tables ssh hgwdev cd /projects/hg2/booch/psl/tables hgsql hg14 < all_sts_primer.sql hgsql hg14 < all_sts_seq.sql hgsql hg14 < stsAlias.sql hgsql hg14 < stsInfo2.sql hgsql hg14 < stsMap.sql o - Load the tables load /projects/hg2/booch/psl/gs.15/build32/sts/stsMarkers.psl.filter.lifted into all_sts_seq load /projects/hg2/booch/psl/gs.15/build32/primers/primers.psl.filter.lifted into all_sts_primer load /projects/hg2/booch/psl/gs.15/build32/stsAlias.bed into stsAlias load /projects/hg2/booch/psl/gs.15/build32/stsInfo2.bed into stsInfo2 echo 'load data local infile "/projects/hg2/booch/psl/gs.15/build32/stsMap.bed" into table stsMap;' \ | hgsql hg14 # Load the sequences (change sts.# to match correct location) mkdir /gbdb/hg14/sts.6 cd /gbdb/hg14/sts.6 ln -s /cluster/store1/sts.6/all.STS.fa ln -s /cluster/store1/sts.6/all.primers.fa cd /cluster/store2/tmp hgLoadRna addSeq hg14 /gbdb/hg14/sts.6/all.STS.fa hgLoadRna addSeq hg14 /gbdb/hg14/sts.6/all.primers.fa # UPDATE BACEND SEQUENCES (DONE 3/14/2003) # 1) Download new files (not done cause no change for build32): # In a web browser, go to ftp://ftp.ncbi.nih.gov/genomes/H_sapiens/BACENDS/. 
# Download BACends.fa.gz and cl_acc_gi_len_primer to # /cluster/store1/bacends.3 # 2) Unpack AllBACends.fa.gz gunzip AllBACends.fa.gz # 3) Create new pairs file /cse/grads/booch/compbio/booch/scripts/convertBacEndPairInfo cl_acc_gi_len_primer /cluster/store1/bacends.2/bacEndPairs.txt # 4) Split file into pieces /cluster/bin/i386/faSplit sequence BACends.fa 100 BACends # 5) Move files to cluster ssh kkstore cd /cluster/store1/bacends.3 mv /cluster/store1/bacends.3/BACends??.fa /scratch/hg/bacEnds/hs/ # 6) Ask for propagation from sysadmin # BACEND SEQUENCE ALIGNMENTS (DONE 3/17/2003) # (alignments done without RepeatMasking) # 1) Create full sequence alignments ssh kk cd /cluster/home/booch/bacends # update Makefile with latest OOVERS and GSVERS make new make jobList para create jobList para push make bacEnds.psl # 2) Lift the files (takes a while) make bacEnds.psl.lifted # 3) Copy files to final destination and remove ssh kks00 make copy make clean # (may want to wait until sure they're OK) # BACEND PAIRS TRACK (DONE 3/18/2003) # 1) Update Makefile with OOVERS, GSVERS, location of pairs/singles # files, if necessary cd /projects/hg2/booch/psl/gs.15/build32/bacends # edit Makefile # 2) Create initial rdb file make bacEnds.rdb (# Takes a while) # 3) Create file of singles to search for make bacEndPairsBad.bed # 4) Try to fish out more pairs make bacEndsMiss.psl # 5) Re-make bacEnds.rdb with new info make bacEnds.rdb # 6) Create bacEndPairs track file make bacEndPairs.bed # 7) Create bacEndPairsBad and bacEndPairsLong files make bacEndPairsBad.bed # 8) Create psl file to load make bacEnds.load.psl # 9) Create database tables ssh hgwdev cd /projects/hg2/booch/psl/tables hgsql hg14 < all_bacends.sql hgsql hg14 < bacEndPairs.sql hgsql hg14 < bacEndPairsBad.sql hgsql hg14 < bacEndPairsLong.sql # 10) Load the tables load /projects/hg2/booch/psl/gs.15/build32/bacends/bacEnds.psl.filter.lifted into all_bacends load /projects/hg2/booch/psl/gs.15/build32/bacends/bacEndPairs.bed 
into bacEndPairs load /projects/hg2/booch/psl/gs.15/build32/bacends/bacEndPairsBad.bed into bacEndPairsBad load /projects/hg2/booch/psl/gs.15/build32/bacends/bacEndPairsLong.bed into bacEndPairsLong # 11) Load the sequences (change bacends.# to match correct location) mkdir /gbdb/hg15/bacends.3 cd /gbdb/hg15/bacends.3 ln -s /cluster/store1/bacends.3/BACends.fa cd /cluster/store2/tmp hgLoadRna addSeq hg15 /gbdb/hg15/bacends.3/BACends.fa FOSEND SEQUENCE ALIGNMENTS (DONE 3/17/2003) o - Create full sequence alignments ssh kk cd /cluster/home/booch/fosends - update Makefile with latest OOVERS and GSVERS make new make jobList para create jobList para push (or para try/para check if want to make sure it runs) make fosEnds.psl o - Copy files to final destination and remove ssh kks00 make copy.assembly make clean.assembly FOSEND PAIRS TRACK (TODO) o - Update Makefile with location of pairs files, if necessary cd /projects/hg2/booch/psl/gs.15/build32/fosends o - Create bed file ssh kks00 cd /projects/hg2/booch/psl/gs.15/build32/fosends make fosEndPairs.bed o - Create database tables ssh hgwdev cd /projects/hg2/booch/psl/tables hgsql hg14 < all_fosends.sql hgsql hg14 < fosEndPairs.sql o - Load the tables load /projects/hg2/booch/psl/gs.15/build32/fosends/fosEnds.psl.filter.lifted into all_fosends load /projects/hg2/booch/psl/gs.15/build32/fosends/fosEndPairs.bed into fosEndPairs # Load the sequences (change bacends.# to match correct location) mkdir /gbdb/hg14/fosends.1 cd /gbdb/hg14/fosends.1 ln -s /cluster/store1/fosends.1/fosEnds.fa cd /cluster/store2/tmp hgLoadRna addSeq hg14 /cluster/store1/fosends.1/fosEnds.fa UPDATE FISH CLONES INFORMATION (TODO) o - Download the latest info from NCBI point browser at http://www.ncbi.nlm.nih.gov/genome/cyto/cytobac.cgi?CHR=all&VERBOSE=ctg change "Show details on sequence-tag" to "yes" change "Download or Display" to "Download table for UNIX" press Submit - save as /projects/hg2/booch/psl/fish/hbrc/hbrc.YYYYMMDD.table o - Format file 
just downloaded cd /projects/hg2/booch/psl/fish/ make HBRC o - Copy it to the new freeze location cp /projects/hg2/booch/psl/fish/all.fish.format /projects/hg2/booch/psl/gs.15/build32/fish/ CREATE AND LOAD FISH CLONES TRACK (DONE 3/20/2003) (must be done after STS markers track and BAC end pairs track) o - Extract the file with clone positions from database ssh hgwdev hgsql hg14 mysql> select * into outfile "/tmp/booch/clonePos.txt" from clonePos; mysql> quit mv /tmp/booch/clonePos.txt /projects/hg2/booch/psl/gs.15/build32/fish o - Create bed file cd /projects/hg2/booch/psl/gs.15/build32/fish make bed o - Create database table ssh hgwdev cd /projects/hg2/booch/psl/tables hgsql hg14 < fishClones.sql o - Load the table load /projects/hg2/booch/psl/gs.15/build32/fish/fishClones.bed into fishClones CREATE AND LOAD CHROMOSOME BANDS TRACK (DONE 3/20/2003) (must be done after FISH Clones track) o - Create bed file ssh hgwdev make setBands.txt make cytobands.pct.ranges make predict o - Create database table ssh hgwdev cd /projects/hg2/booch/psl/tables hgsql hg14 < cytoBand.sql o - Load the table echo "load data local infile '/projects/hg2/booch/psl/gs.15/build32/cytobands/cytobands.bed' into table cytoBand;" | hgsql hg14 CREATE CHROMOSOME REPORTS (NOT BEING DONE) CREATE STS MAP COMPARISON PLOTS AND GENETIC PLOTS (NOT BEING DONE) o - Must wait until after the STS Map track has been finished o - Create sts plots cd /projects/hg2/booch/psl/gs.15/build32/stsPlots make stsplots o - Create genetic plots cd /projects/hg2/booch/psl/gs.15/build32/geneticPlots make all matlab -nodesktop >> allplot_ncbi('/cse/grads/booch/tracks/gs.15/build32/geneticPlots/','build32', 'jpg'); >> quit o - Set up directories where this will end up ssh hgwdev cd /usr/local/apache/htdocs/goldenPath/mapPlots update Makefile with OOVERS, GSVERS, and FREEZE date make new o - Copy over files make sts make genetic o - Update the index.html to include links to these new plots, and delete oldest set Update the 
arch.html with the oldest set just removed from index.html *** Make sure to check into CVS *** # PRODUCING CROSS_SPECIES mRNA ALIGNMENTS (TODO) # Make sure masked contigs are in /scratch/hg/gs.15/build32/trfFa # Make sure split-up xenoRna sequence is under /iscratch too (GENBANK) ssh kkstore mkdir -p ~/hg14/bed/xenoMrna cd ~/hg14/bed/xenoMrna mkdir psl ls -1S /scratch/hg/gs.15/build32/trfFa/*.fa.trf > human.lst ls -1S /iscratch/i/mrna.134/Homo_sapiens/xenoRna*.fa > mrna.lst # Using split fa -- so create separate output dirs and special gsub: foreach f (`cat mrna.lst`) mkdir psl/$f:t:r end echo '#LOOP \ /cluster/home/kent/bin/i386/blat {check in line+ $(path1)} {check in line+ $(path2)} -q=rnax -t=dnax -mask=lower {check out line+ psl/$(root2)/$(root1)_$(root2).psl} \ #ENDLOOP' > gsub gensub2 human.lst mrna.lst gsub spec para create spec ssh kk cd ~/hg14/bed/xenoMrna para try para check para push # Do para check until the run is done, doing para push if necessary # Sort xeno mRNA alignments as so: ssh eieio cd ~/hg14/bed/xenoMrna pslSort dirs raw.psl /cluster/store2/temp psl/xenoRna* pslReps raw.psl cooked.psl /dev/null -minAli=0.25 liftUp chrom.psl ../../jkStuff/liftAll.lft warn cooked.psl pslSortAcc nohead chrom /cluster/store2/temp chrom.psl pslCat -dir chrom > xenoMrna.psl rm -r chrom raw.psl cooked.psl chrom.psl # Load into database as so: ssh hgwdev cd ~/hg14/bed/xenoMrna hgLoadPsl hg14 xenoMrna.psl -tNameIx # Sequence info should have already been loaded into database (PRELOAD) # PRODUCING CROSS_SPECIES EST ALIGNMENTS (TODO) # Make sure masked contigs are in /scratch/hg/gs.15/build32/trfFa # Make sure split-up xenoRna sequence is under /iscratch too (GENBANK) ssh kkstore mkdir -p ~/hg14/bed/xenoEst cd ~/hg14/bed/xenoEst mkdir psl ls -1S /scratch/hg/gs.15/build32/trfFa/*.fa.trf > human.lst ls -1S /iscratch/i/mrna.134/Homo_sapiens/xenoEst*.fa > mrna.lst # Using split fa -- so create separate output dirs and special gsub: foreach f (`cat mrna.lst`) mkdir 
psl/$f:t:r end echo '#LOOP \ /cluster/home/kent/bin/i386/blat {check in line+ $(path1)} {check in line+ $(path2)} -q=dnax -t=dnax -mask=lower {check out line+ psl/$(root2)/$(root1)_$(root2).psl} \ #ENDLOOP' > gsub gensub2 human.lst mrna.lst gsub spec ssh kk cd ~/hg14/bed/xenoEst para create spec para try, para check, para push, para check, ... # Sort xenoEst alignments: ssh eieio cd ~/hg14/bed/xenoEst pslSort dirs raw.psl /cluster/store2/temp psl/xenoEst* pslReps raw.psl cooked.psl /dev/null -minAli=0.10 liftUp chrom.psl ../../jkStuff/liftAll.lft warn cooked.psl pslSortAcc nohead chrom /cluster/store2/temp chrom.psl pslCat -dir chrom > xenoEst.psl rm -r chrom raw.psl cooked.psl chrom.psl # Load into database as so: ssh hgwdev cd ~/hg14/bed/xenoEst hgLoadPsl hg14 xenoEst.psl -tNameIx # Sequence info should have already been loaded into database (PRELOAD) # PRODUCING FUGU ALIGNMENTS (TODO) # Distribute fugu sequence to /iscratch/i/fugu/ (if it isn't already there) ssh kkr1u00 rm -rf /iscratch/i/fugu mkdir /iscratch/i/fugu cp -p /cluster/store3/fuguSeq/split2.5Mb/*.fa /iscratch/i/fugu ~kent/bin/iSync DONE TO HERE ssh kk mkdir ~/hg14/bed/blatFugu cd ~/hg14/bed/blatFugu mkdir psl foreach f (~/hg14/?{,?}/NT_??????/NT_??????.fa) set c=$f:t:r mkdir -p psl/$c end ls -1S /iscratch/i/fugu/*.fa > fugu.lst ls -1S /scratch/hg/gs.15/build32/trfFa/*.fa.trf > human.lst cp ~/hg13/bed/blatFugu gsub . gensub2 human.lst fugu.lst gsub spec para create spec para try para check para push para check # When cluster run is done, sort alignments: ssh eieio cd ~/hg14/bed/blatFugu pslCat -dir psl/NT_??????.fa | \ liftUp -type=.psl stdout ~/hg14/jkStuff/liftAll.lft warn stdin | \ pslSortAcc nohead chrom temp stdin # Rename to correspond with tables as so and load into database: ssh hgwdev cd ~/hg14/bed/blatFugu/chrom rm -f chr*_blatFugu.psl foreach i (chr?{,?}{,_random}.psl) set r = $i:r mv $i ${r}_blatFugu.psl end hgLoadPsl hg14 *.psl # Make fugu /gbdb/ symlink and load Fugu sequence data. 
mkdir /gbdb/hg14/fuguSeq cd /gbdb/hg14/fuguSeq ln -s /cluster/store3/fuguSeq/fugu_v3_mask.fasta cd /cluster/store2/tmp hgLoadRna addSeq hg14 /gbdb/hg14/fuguSeq/fugu_v3_mask.fasta TIGR GENE INDEX (TODO) o mkdir -p ~/hg14/bed/tigr cd ~/hg14/bed/tigr wget ftp://ftp.tigr.org/private/HGI_ren/TGI_track_HumanGenome_build32.tgz tar xvzf TGI*.tgz foreach f (*cattle*) set f1 = `echo $f | sed -e 's/cattle/cow/g'` mv $f $f1 end foreach o (mouse cow human pig rat) setenv O $o foreach f (chr*_$o*s) tail +2 $f | perl -wpe 's /THC/TC/; s/(TH?C\d+)/$ENV{O}_$1/;' > $f.gff end end ldHgGene -exon=TC hg14 tigrGeneIndex *.gff LOAD MOUSEREF TRACK (todo) First copy in data from eieio to ~/hg14/bed/mouseRef. Then substitute 'genome' for the appropriate chromosome in each of the alignment files. Finally do: hgRefAlign webb hg14 mouseRef *.alignments LOAD AVID MOUSE TRACK (todo) ssh cc98 cd ~/hg14/bed mkdir avidMouse cd avidMouse wget http://pipeline.lbl.gov/tableCS-LBNL.txt hgAvidShortBed *.txt avidRepeat.bed avidUnique.bed hgLoadBed avidRepeat avidRepeat.bed hgLoadBed avidUnique avidUnique.bed LOAD SNPS (TODO) ssh hgwdev cd ~/hg14/bed mkdir snp cd snp mkdir build110 cd build110 ln -s ../../../seq_contig.md . ln -s ~/hg13/bed/cpgIsland/filter.awk . 
-Download SNPs from ftp://ftp.ncbi.nlm.nih.gov/pub/sherry/gp.ncbi.b31.gz -Unpack calcFlipSnpPos seq_contig.md gp.ncbi.b31 gp.ncbi.b31.flipped mv gp.ncbi.b31 gp.ncbi.b31.original gzip gp.ncbi.b31.original grep RANDOM gp.ncbi.b31.flipped > snpTsc.txt grep MIXED gp.ncbi.b31.flipped >> snpTsc.txt grep BAC_OVERLAP gp.ncbi.b31.flipped > snpNih.txt grep OTHER gp.ncbi.b31.flipped >> snpNih.txt awk -f filter.awk snpTsc.txt > snpTsc.contig.bed awk -f filter.awk snpNih.txt > snpNih.contig.bed liftUp snpTsc.bed ../../jkStuff/liftAll.lft warn snpTsc.contig.bed liftUp snpNih.bed ../../jkStuff/liftAll.lft warn snpNih.contig.bed hgLoadBed hg14 snpTsc snpTsc.bed hgLoadBed hg14 snpNih snpNih.bed -gzip all of the big files LOAD ENSEMBL GENES (TODO) cd ~/hg14/bed mkdir ensembl cd ensembl Get the ensembl gene data as below: GET http://www.ebi.ac.uk/~stabenau/human_8_30.gtf.gz > ensGene.gz (The above may only be a temporary location) Get the ensembl protein data from http://www.ensembl.org/Homo_sapiens/martview Follow this sequence through the pages: Page 1) Make sure that the Homo_sapiens choice is selected. Hit next. Page 2) Uncheck the "Limit to" box in the region choice. Then hit next. Page 3) Choose the "Structures" box. Page 4) Choose Transcripts/Proteins and GTF as the output, choose gzip compression and then hit export. gunzip the file and name to ensembl.gtf # Ensembl handles random chromosomes differently than us, so we # strip this data. Fortunately it just loses a couple of genes. grep -v ^6_DR51 ensembl.gtf | grep -v _NT_ > unrandom.gtf # Add "chr" to front of each line in the gene data gtf file to make # it compatible with ldHgGene ~matt/bin/addchr.pl unrandom.gtf ensGene.gtf ./fixEns.pl ensGene.gtf ensFixed.gtf ldHgGene hg14 ensGene ensGene.gtf o - Load Ensembl peptides: Get them from ensembl as above in the gene section except for Page 3) Choose the "Sequences" box. Page 4) Choose GTF as the output, choose gzip compression and then hit export. 
Substitute ENST for ENSP in ensPep with the program called subs edit subs.in to read: ENSP|ENST subs -e ensPep.fa > /dev/null Run fixPep.pl ensPep.fa ensembl.pep hgPepPred hg14 generic ensPep ensembl.pep LOAD SANGER 22 Pseudogenes cd ~/hg14/bed/sanger22 cp ~/hg10/bed/sanger22/Chr22.3.lx.pseudogene.gff . replace ^chr22 with hg10:chr22 in Chr22.3.lx.pseudogene.gff liftUp -type=.gff pseudo.gff hg14.lft Chr22.3.lx.pseudogene.gff ldHgGene hg14 sanger22pseudo pseudo.gff LOAD SANGER22 GENES (TODO) cd ~/hg14/bed mkdir sanger22 cd sanger22 not sure where these files were downloaded from grep -v Pseudogene Chr22*.genes.gff | hgSanger22 hg14 stdin Chr22*.cds.gff *.genes.dna *.cds.pep 0 | ldHgGene hg14 sanger22pseudo stdin # Note: this creates sanger22extras, but doesn't currently create # a correct sanger22 table, which are replaced in the next steps sanger22-gff-doctor Chr22.3.1x.cds.gff Chr22.3.1x.genes.gff \ | ldHgGene hg14 sanger22 stdin sanger22-gff-doctor -pseudogenes Chr22.3.1x.cds.gff Chr22.3.1x.genes.gff \ | ldHgGene hg14 sanger22pseudo stdin hgPepPred hg14 generic sanger22pep *.pep LOAD SANGER 20 GENES (todo) # First download files from James Gilbert's email to ~/hg14/bed/sanger20 and # go to that directory while logged onto hgwdev. Then: grep -v Pseudogene chr_20*.gtf | ldHgGene hg14 sanger20 stdin hgSanger20 hg14 *.gtf *.info # JAX ORTHOLOG (still valid???) (TODO) # Add Jackson labs info cd ~/hg14/bed mkdir jaxOrtholog cd jaxOrtholog wget ftp://ftp.informatics.jax.org/pub/informatics/reports/HMD_Human3.rpt cp /cluster/store1/gs.12/build29/bed/jaxOrtholog/filter.awk . 
awk -f filter.awk *.rpt > jaxOrtholog.tab # Drop (just in case), create and load the table like this: echo 'drop table jaxOrtholog;' | hgsql hg14 hgsql hg14 < ~/src/hg/lib/jaxOrtholog.sql echo "load data local infile '"`pwd`"/jaxOrtholog.tab' into table \ jaxOrtholog;" \ | hgsql hg14 LOAD RNAGENES ssh hgwdev mkdir -p ~/hg14/bed/rnaGene cd ~/hg14/bed/rnaGene wget ftp://ftp.genetics.wustl.edu/pub/eddy/pickup/ncrna-hg14.gff.gz gunzip -c ncrna-hg14.gff.gz | grep -v '^#' > contig.gff liftUp chrom.gff ../../jkStuff/liftAll.lft warn contig.gff echo 'drop table hgRnaGene;' | hgsql hg14 hgsql hg14 < ~/kent/src/hg/lib/rnaGene.sql hgRnaGenes hg14 chrom.gff LOAD EXOFISH (todo) - login to hgwdev - cd /cluster/store5/gs.15/build32/bed - mkdir exoFish - cd exoFish - hgsql hg14 < ~kent/src/hg/lib/exoFish.sql - Put email attachment from Olivier Jaillon (ojaaillon@genoscope.cns.fr) into /cluster/store5/gs.15/build32/bed/exoFish/all_maping_ecore - awk -f filter.awk all_maping_ecore > exoFish.bed - hgLoadBed hg14 exoFish exoFish.bed LOAD MOUSE SYNTENY (TODO) ssh hgwdev mkdir -p ~/hg14/bed/mouseSyn cd ~/hg14/bed/mouseSyn # Saved Michael Kamal's email attachment: allDirectedSegmentsBySize300.txt # Process the .txt file (minus header) into a bed 6 + file: grep -v "^#" allDirectedSegmentsBySize300.txt \ | awk '($6 > $5) {printf "%s\t%d\t%d\t%s\t%d\t%s\t%d\t%d\t%s\n", $4, $5-1, $6, $1, 999, $7, $2-1, $3, $8;} \ ($5 > $6) {printf "%s\t%d\t%d\t%s\t%d\t%s\t%d\t%d\t%s\n", $4, $6-1, $5, $1, 999, $7, $2-1, $3, $8;}' \ > mouseSynWhd.bed hgLoadBed -noBin -sqlTable=$HOME/kent/src/hg/lib/mouseSynWhd.sql \ hg14 mouseSynWhd mouseSynWhd.bed LOAD GENIE (todo) - cat */ctg*/ctg*.affymetrix.gtf > predContigs.gtf - liftUp predChrom.gtf ../../jkStuff/liftAll.lft warn predContigs.gtf - ldHgGene hg14 genieAlt predChrom.gtf - cat */ctg*/ctg*.affymetrix.aa > pred.aa - hgPepPred hg14 genie pred.aa - hgsql hg14 mysql> delete from genieAlt where name like 'RS.%'; mysql> delete from genieAlt where name like 'C.%'; LOAD 
SOFTBERRY GENES (TODO) mkdir -p ~/hg14/bed/softberry cd ~/hg14/bed/softberry wget ftp://www.softberry.com/pub/sc_fgenesh_hum_mar03up/sc_fgenesh_hum_mar03up.tar.gz gunzip -c sc_fgenesh_hum_mar03up.tar.gz | tar xvf - ldHgGene hg14 softberryGene chr*.gff hgPepPred hg14 softberry *.protein hgSoftberryHom hg14 *.protein LOAD GENEID GENES (TODO) mkdir ~/hg14/bed/geneid cd ~/hg14/bed/geneid mkdir download cd download # Now download *.gtf and *.prot from wget -r http://www1.imim.es/genepredictions/H.sapiens/golden_path_20021114/geneid_v1.1/ # oops, due to links in the index.html, it tries to get too much. # ctrl-c it when it starts to download other directories. mv www1.imim.es/genepredictions/H.sapiens/golden_path_20021114/geneid_v1.1/*.{gtf,prot} . rm -r www1.imim.es/ # Add missing .1 to protein id's foreach f (*.prot) perl -wpe 's/^(>chr\w+)$/$1.1/' $f > $f:r-fixed.prot end cd .. ldHgGene hg14 geneid download/*.gtf -exon=CDS hgPepPred hg14 generic geneidPep download/*-fixed.prot LOAD ACEMBLY (TODO) mkdir -p ~/hg14/bed/acembly cd ~/hg14/bed/acembly # Get acembly*gene.gff from Jean and Danielle Thierry-Mieg wget ftp://ftp.ncbi.nih.gov/repository/acedb/ncbi_31.human.genes/acembly.ncbi_31.genes.proteins.fasta.tar.gz wget ftp://ftp.ncbi.nih.gov/repository/acedb/ncbi_31.human.genes/acembly.ncbi_31.genes.gff.tar.gz gunzip -c acembly.ncbi_31.genes.gff.tar.gz | tar xvf - gunzip -c acembly.ncbi_31.genes.proteins.fasta.tar.gz | tar xvf - cd acembly.ncbi_31.genes.gff # Save just the floating-contig features to different files for lifting # and lift up the floating-contig features to chr*_random coords: foreach f (acemblygenes.*.gff) set c=$f:r:e egrep '^[a-zA-Z0-9]+\|NT_[0-9][0-9][0-9][0-9][0-9][0-9]' $f | \ perl -wpe 's/^(\w+)\|(\w+)/$1\/$2/' > ctg-chr${c}_random.gff if (-e ../../../$c/lift/random.lft) then liftUp chr${c}_random.gff ../../../$c/lift/random.lft warn \ ctg-chr${c}_random.gff endif # Strip out _random or floating contig lines from the normal chrom gff, # and add the 
"chr" prefix: grep -v ^$c\| $f | grep -v ^Hs | perl -wpe 's/^/chr/;' > chr$c.gff end cd ../acembly.ncbi_31.genes.proteins.fasta #- Remove G_t*_ prefixes from acemblyproteins.*.fasta: foreach f (acemblyproteins.*.fasta) perl -wpe 's/^\>G_t[\da-zA-Z]+_/\>/' $f > chr$f:r:e.fa end #- Load into database: cd .. ldHgGene hg14 acembly acembly.ncbi_31.genes.gff/chr*.gff hgPepPred hg14 generic acemblyPep \ acembly.ncbi_31.genes.proteins.fasta/chr*.fa LOAD GENOMIC DUPES (todo) o - Load genomic dupes ssh hgwdev cd ~/hg14/bed mkdir genomicDups cd genomicDups wget http://codon/jab/web/takeoff/hg1433_dups_for_kent.zip unzip *.zip awk -f filter.awk oo33_dups_for_kent > genomicDups.bed mysql -u hgcat -pbigSECRET hg14 < ~/src/hg/lib/genomicDups.sql hgLoadBed hg14 -oldTable genomicDups genomicDupes.bed LOAD NCI60 (TODO) o - # ssh hgwdev cd /projects/cc/hg/mapplots/data/NCI60/dross_arrays_nci60/ mkdir hg14 cd hg14 findStanAlignments hg14 ../BC2.txt.ns ../../image/cumulative_plates.011204.list.human hg14.image.psl >& hg14.image.log cp ../experimentOrder.txt ./ sed -e 's/ / \.\.\//g' < experimentOrder.txt > epo.txt stanToBedAndExpRecs hg14.image.good.psl hg14.nci60.exp hg14.nci60.bed `cat epo.txt` hg14S -A < ../../scripts/nci60.sql echo "load data local infile 'hg14.nci60.bed' into table nci60" | hg14S -A mkdir /cluster/store5/gs.15/build32/bed/nci60 mv hg14.nci60.bed /cluster/store5/gs.15/build32/bed/nci60 rm *.psl LOAD AFFYRATIO [GNF] (TODO) o - # ssh hgwdev cd /cluster/store1/sugnet/ mkdir gs.15 mkdir gs.15/build32 mkdi20r gs.15/build32/affyGnf cd gs.15/build32/affyGnf cp /projects/compbiodata/microarray/affyGnf/sequences/HG-U95Av2_target ./ ls -1 /cluster/store5/gs.15/build32/trfFa.1204/ > allctg.lst echo "/cluster/store1/sugnet/gs.15/build32/affyGnf/HG-U95Av2_target" > affy.lst echo '#LOOP\n/cluster/bin/i386/blat -mask=lower -minIdentity=95 -ooc=/cluster/store5/gs.15/build32/jkStuff/post.refCheck.old/11.ooc /cluster/store5/gs.15/build32/trfFa.1204/$(path1) $(path2) {check out line+ 
psl/$(root1)_$(root2).psl}\n#ENDLOOP' > template.sub gensub2 allctg.lst affy.lst template.sub para.spec # ssh kkr1u00 para create para.spec para try para check para push # exit kkr1u00 pslSort dirs hg14.affy.psl tmp psl >& pslSort.log liftUp hg14.affy.lifted.psl /cluster/store5/gs.15/build32/jkStuff/liftAll.lft warn hg14.affy.psl pslAffySelect seqIdent=.95 basePct=.95 in=hg14.affy.lifted.psl out=hg14.affy.pAffySelect.95.95.psl affyPslAndAtlasToBed hg14.affy.pAffySelect.95.95.psl /projects/compbiodata/microarray/affyGnf/human_atlas_U95_gnf.noquotes.txt affyRatio.bed affyRatio.exr >& affyPslAndAtlasToBed.log hg14S -A ) { \ chomp($_); \ @p = split(/\t/, $_); \ print "$p[2]\t$p[3]\t$p[0]\n"\ }' \ < SAGEmap_tag_ug-rel | sort | sed -e 's/ /_/g' \ > SAGEmap_ug_tag-rel_Hs cd - createSageSummary ../map/Hs/NlaIII/SAGEmap_ug_tag-rel_Hs \ tagExpArrays.tab sageSummary.sage # Create the uniGene alignments # ~/hg14/uniGene/hg14.uniGene.lifted.pslReps.psl # -- see "MAKE UNIGENE ALIGNMENTS" below cd /projects/cc/hg/sugnet/sage/sage.XXX/extr addAveMedScoreToPsls \ ~/hg14/bed/uniGene.$version/hg14.uniGene.lifted.pslReps.psl \ sageSummary.sage uniGene.wscores.bed hgLoadBed hg14 uniGene_2 uniGene.wscores.bed hgsql hg14 < ~kent/src/hg/lib/sage.sql echo "load data local infile 'sageSummary.sage' into table sage" \ | hgsql hg14 cd ../info ../../scripts/parseRecords.pl ../extr/expList.tab > sageExp.tab hgsql hg14 < ~/kent/src/hg/lib/sageExp.sql echo "load data local infile 'sageExp.tab' into table sageExp" | hgsql hg14 # update ~/kent/src/hg/makeDb/trackDb/human/hg14/uniGene_2.html # with current uniGene date. # MAKE UNIGENE ALIGNMENTS (TODO) # Download of the latest UniGene version is now automated by a # cron job -- see /cluster/home/angie/crontab , # /cluster/home/angie/unigeneVers/unigene.csh . # If hgwdev gets rebooted, that needs to be restarted... maybe there's # a more stable place to set up that cron job. 
# substitute XXX -> the uniGene version used by SAGE, if building the # uniGene/SAGE track; or just the latest uniGene version in # /projects/cc/hg/sugnet/uniGene/ , if doing uniGene alignments only. set version = XXX cd /projects/cc/hg/sugnet/uniGene/uniGene.$version gunzip Hs.seq.uniq.gz ../countSeqsInCluster.pl Hs.data counts.tab ../parseUnigene.pl Hs.seq.uniq Hs.seq.uniq.simpleHeader.fa leftoverData.tab # Distribute UniGene sequence to /iscratch/i/ (kkstore can see /projects) ssh kkstore set version = XXX # same as above mkdir -p /iscratch/i/uniGene.$version cp -p \ /projects/cc/hg/sugnet/uniGene/uniGene.$version/Hs.seq.uniq.simpleHeader.fa \ /iscratch/i/uniGene.$version ssh kkr1u00 ~kent/bin/iSync ssh kk set version = XXX # same as above mkdir -p ~/hg14/bed/uniGene.$version cd ~/hg14/bed/uniGene.$version ls -1S /cluster/store5/gs.15/build32/trfFa/* > allctg.lst ls -1S /iscratch/i/uniGene.$version/Hs.seq.uniq.simpleHeader.fa \ > uniGene.lst echo '#LOOP\n/cluster/bin/i386/blat -mask=lower -minIdentity=95 -ooc=/scratch/hg/h/11.ooc $(path1) $(path2) {check out line+ psl/$(root1)_$(root2).psl}\n#ENDLOOP' > template.sub gensub2 allctg.lst uniGene.lst template.sub para.spec para create para.spec mkdir psl para try para check para push # ssh eieio set version = XXX # same as above cd ~/hg14/bed/uniGene.$version pslSort dirs raw.psl tmp psl >& pslSort.log liftUp -type=.psl stdout ../../jkStuff/liftAll.lft warn raw.psl \ | pslReps -minCover=0.2 -sizeMatters -minAli=0.98 -nearTop=0.002 \ stdin hg14.uniGene.lifted.pslReps.psl /dev/null # use hg14.uniGene.lifted.pslReps.psl for building SAGE track (above). 
LOADING MOUSE MM3 BLASTZ ALIGNMENTS FROM PENN STATE: (DONE 03/17/03) # Translate Penn State .lav files into sorted axt: ssh eieio set base="/cluster/store5/gs.15/build32/bed/blastz.mm3.2003-03-17-ASH" set seq1_dir="/cluster/store5/gs.15/build32/mixedNib/" set seq2_dir="/cluster/store2/mm.2003.02/mm3/mixedNib/" set tbl="blastzMm3" cd $base mkdir -p axtChrom foreach c (lav/*) pushd $c set chr=$c:t set out=$base/axtChrom/$chr.axt echo "Translating $chr lav to $out" cat `ls -1 *.lav | sort -g` \ | lavToAxt stdin $seq1_dir $seq2_dir stdout \ | axtSort stdin $out popd end # Translate the sorted axt files into psl: cd $base mkdir -p pslChrom foreach f (axtChrom/chr*.axt) set c=$f:t:r echo $c axtToPsl $f S1.len S2.len pslChrom/${c}_${tbl}.psl end # Load tables ssh hgwdev set base="/cluster/store5/gs.15/build32/bed/blastz.mm3.2003-03-17-ASH" set tbl="blastzMm3" cd $base/pslChrom hgLoadPsl hg14 chr*_${tbl}.psl MAKING THE BLASTZBESTMOUSE TRACK FROM PENN STATE MM3 AXT FILES (DONE 03/17/03) # Consolidate AXT files to chrom level, sort, pick best, make psl. ssh eieio set base="/cluster/store5/gs.15/build32/bed/blastz.mm3.2003-03-17-ASH" set tbl="blastzBestMm3" cd $base mkdir -p axtBest pslBest foreach chrdir (lav/chr*) set chr=$chrdir:t echo axtBesting $chr axtBest axtChrom/$chr.axt $chr axtBest/$chr.axt -minScore=300 echo translating axtBest to psl for $chr axtToPsl axtBest/$chr.axt S1.len S2.len pslBest/${chr}_${tbl}.psl end # Load tables ssh hgwdev set base="/cluster/store5/gs.15/build32/bed/blastz.mm3.2003-03-17-ASH" set tbl="blastzBestMm3" cd $base/pslBest hgLoadPsl hg14 chr*_${tbl}.psl # Make /gbdb links and add them to the axtInfo table: # Not done for build 32: mkdir -p /gbdb/hg14/axtBestMm3 cd /gbdb/hg14/axtBestMm3 foreach f ($base/axtBest/chr*.axt) ln -s $f . 
end cd $base/axtBest rm -f axtInfoInserts.sql touch axtInfoInserts.sql foreach f (/gbdb/hg14/axtBestMm3/chr*.axt) set chr=$f:t:r echo "INSERT INTO axtInfo VALUES ('mm3','Blastz Best in Genome','$chr','$f');" \ >> axtInfoInserts.sql end hgsql hg14 < ~/kent/src/hg/lib/axtInfo.sql hgsql hg14 < axtInfoInserts.sql MAKING THE AXTTIGHT FROM AXTBEST (DONE 03/17/03) # After creating axtBest alignments above, use subsetAxt to get axtTight: ssh eieio cd ~/hg14/bed/blastz.mm3.2003-03-17-ASH/axtBest mkdir -p ../axtTight foreach i (*.axt) echo $i subsetAxt $i ../axtTight/$i \ ~kent/src/hg/mouseStuff/subsetAxt/coding.mat 3400 end # translate to psl cd ../axtTight mkdir -p ../pslTight foreach i (*.axt) set c = $i:r axtToPsl $i ../S1.len ../S2.len ../pslTight/${c}_blastzTightMm3.psl end # Load tables into database ssh hgwdev cd ~/hg14/bed/blastz.mm3.2003-03-17-ASH/pslTight hgLoadPsl hg14 chr*_blastzTightMm3.psl BEGINNING OF RAT BLASTZ LOADING RAT RN2 BLASTZ ALIGNMENTS FROM PENN STATE: (DONE 03/19/03) # Translate Penn State .lav files into sorted axt: ssh eieio set base="/cluster/store5/gs.15/build32/bed/blastz.rn2.2003-03-18-ASH" set seq1_dir="/cluster/store5/gs.15/build32/mixedNib/" set seq2_dir="/cluster/store4/rn2/mixedNib/" set tbl="blastzRn2" cd $base mkdir -p axtChrom foreach c (lav/*) pushd $c set chr=$c:t set out=$base/axtChrom/$chr.axt echo "Translating $chr lav to $out" cat `ls -1 *.lav | sort -g` \ | lavToAxt stdin $seq1_dir $seq2_dir stdout \ | axtSort stdin $out popd end # STOPPED HERE -- big data, low demand. 
# Translate the sorted axt files into psl: cd $base mkdir -p pslChrom foreach f (axtChrom/chr*.axt) set c=$f:t:r axtToPsl $f S1.len S2.len pslChrom/${c}_${tbl}.psl end # Load tables ssh hgwdev set base="/cluster/store5/gs.15/build32/bed/blastz.rn2.2003-03-18-ASH" set tbl="blastzRn2" cd $base/pslChrom hgLoadPsl hg14 chr*_${tbl}.psl MAKING THE BLASTZBESTRAT TRACK FROM PENN STATE RN2 AXT FILES (DONE 03/19/03) # Consolidate AXT files to chrom level, sort, pick best, make psl. ssh eieio set base="/cluster/store5/gs.15/build32/bed/blastz.rn2.2003-03-18-ASH" set tbl="blastzBestRn2" cd $base mkdir -p axtBest pslBest foreach chrdir (lav/chr*) set chr=$chrdir:t echo axtBesting $chr axtBest axtChrom/$chr.axt $chr axtBest/$chr.axt -minScore=300 echo translating axtBest to psl for $chr axtToPsl axtBest/$chr.axt S1.len S2.len pslBest/${chr}_${tbl}.psl end # Load tables ssh hgwdev set base="/cluster/store5/gs.15/build32/bed/blastz.rn2.2003-03-18-ASH" set tbl="blastzBestRn2" cd $base/pslBest hgLoadPsl hg14 chr*_${tbl}.psl # Make /gbdb links and add them to the axtInfo table: # Not done for build 32: mkdir -p /gbdb/hg14/axtBestRn2 cd /gbdb/hg14/axtBestRn2 foreach f ($base/axtBest/chr*.axt) ln -s $f . 
end cd $base/axtBest rm -f axtInfoInserts.sql touch axtInfoInserts.sql foreach f (/gbdb/hg14/axtBestRn2/chr*.axt) set chr=$f:t:r echo "INSERT INTO axtInfo VALUES ('rn2','Blastz Best in Genome','$chr','$f');" \ >> axtInfoInserts.sql end hgsql hg14 < ~/kent/src/hg/lib/axtInfo.sql hgsql hg14 < axtInfoInserts.sql MAKING THE AXTTIGHT FROM AXTBEST (DONE 03/19/03) # After creating axtBest alignments above, use subsetAxt to get axtTight: ssh eieio cd ~/hg14/bed/blastz.rn2.2003-03-18-ASH/axtBest mkdir -p ../axtTight foreach i (*.axt) subsetAxt $i ../axtTight/$i \ ~kent/src/hg/mouseStuff/subsetAxt/coding.mat 3400 end # translate to psl cd ../axtTight mkdir -p ../pslTight foreach i (*.axt) set c = $i:r axtToPsl $i ../S1.len ../S2.len ../pslTight/${c}_blastzTightRn2.psl end # Load tables into database ssh hgwdev cd ~/hg14/bed/blastz.rn2.2003-03-18-ASH/pslTight hgLoadPsl hg14 chr*_blastzTightRn2.psl XXX END OF RAT BLASTZ BEGINNING OF HUMAN BLASTZ LOADING HUMAN HG14 (SELF) BLASTZ ALIGNMENTS: (DONE 03/19/03) # Translate Penn State .lav files into sorted axt, with alignments # to self/diagonal dropped: ssh eieio set base="/cluster/store5/gs.15/build32/bed/blastz.hg14.2003-03-18-ASH" set seq1_dir="/cluster/store5/gs.15/build32/mixedNib/" set seq2_dir="/cluster/store5/gs.15/build32/mixedNib/" set tbl="blastzHuman" cd $base mkdir -p axtChrom # sometimes alignments are so huge that they cause axtSort to run out # of memory. Run them in two passes like this: foreach c (lav/*) pushd $c set chr=$c:t set out=$base/axtChrom/$chr.axt echo "Translating $chr lav to $out" foreach d (*.lav) set smallout=$d.axt lavToAxt $d $seq1_dir $seq2_dir stdout \ | axtDropSelf stdin stdout \ | axtSort stdin $smallout end cat `ls -1 *.lav.axt | sort -g` \ > $out popd end # STOPPED HERE -- big data, low demand. 
# Translate the sorted axt files into psl:
cd $base
mkdir -p pslChrom
foreach f (axtChrom/chr*.axt)
    set c=$f:t:r
    echo translating $c.axt to psl
    axtToPsl $f S1.len S2.len pslChrom/${c}_${tbl}.psl
end

# Load tables
ssh hgwdev
set base="/cluster/store5/gs.15/build32/bed/blastz.hg14.2003-03-18-ASH"
set tbl="blastzHuman"
cd $base/pslChrom
hgLoadPsl hg14 chr*_${tbl}.psl

MAKING THE BLASTZBESTHUMAN TRACK FROM UNFILTERED AXT FILES (DONE 03/20/03)
# Consolidate AXT files to chrom level, sort, pick best, make psl.
ssh eieio
set base="/cluster/store5/gs.15/build32/bed/blastz.hg14.2003-03-18-ASH"
set tbl="blastzBestHuman"
cd $base
mkdir -p axtBest pslBest
# run axtBest in 2 passes to reduce size of the input to final axtBest:
# first axtBest each lav piece, then axtBest the concatenation.
foreach chrdir (lav/*)
    set chr=$chrdir:t
    echo two-pass axtBesting $chr
    foreach a ($chrdir/*.axt)
        axtBest $a $chr $a:r.axtBest
    end
    cat `ls -1 $chrdir/*.axtBest | sort -g` \
      > $chrdir/$chr.axtBestPieces
    axtBest $chrdir/$chr.axtBestPieces $chr axtBest/$chr.axt
    axtToPsl axtBest/$chr.axt S1.len S2.len pslBest/${chr}_${tbl}.psl
end

# Load tables
ssh hgwdev
set base="/cluster/store5/gs.15/build32/bed/blastz.hg14.2003-03-18-ASH"
set tbl="blastzBestHuman"
cd $base/pslBest
hgLoadPsl hg14 chr*_${tbl}.psl

# Make /gbdb links and add them to the axtInfo table:
# Not done for build 32:
mkdir -p /gbdb/hg14/axtBestHg14
cd /gbdb/hg14/axtBestHg14
foreach f ($base/axtBest/chr*.axt)
    ln -s $f .
end

# Build axtInfo INSERT statements for the linked files, create the
# axtInfo table, and load the rows.
cd $base/axtBest
rm -f axtInfoInserts.sql
touch axtInfoInserts.sql
foreach f (/gbdb/hg14/axtBestHg14/chr*.axt)
    set chr=$f:t:r
    echo "INSERT INTO axtInfo VALUES ('hg14','Blastz Best Human Self','$chr','$f');" \
      >> axtInfoInserts.sql
end
hgsql hg14 < ~/kent/src/hg/lib/axtInfo.sql
hgsql hg14 < axtInfoInserts.sql

MAKING THE AXTTIGHT FROM AXTBEST (DONE 03/20/03)
# After creating axtBest alignments above, use subsetAxt to get axtTight:
ssh eieio
cd ~/hg14/bed/blastz.hg14.2003-03-18-ASH/axtBest
mkdir -p ../axtTight
foreach i (*.axt)
    subsetAxt $i ../axtTight/$i \
      ~kent/src/hg/mouseStuff/subsetAxt/coding.mat 3400
end

# translate to psl
cd ../axtTight
mkdir -p ../pslTight
foreach i (*.axt)
    set c = $i:r
    axtToPsl $i ../S1.len ../S2.len ../pslTight/${c}_blastzTightHuman.psl
end

# Load tables into database
ssh hgwdev
cd ~/hg14/bed/blastz.hg14.2003-03-18-ASH/pslTight
hgLoadPsl hg14 chr*_blastzTightHuman.psl

XXX END OF HUMAN BLASTZ

LIFTING REPEATMASKER .ALIGN FILES (TODO)
# First pass: lift each contig's RepeatMasker .align to contig coordinates.
# The glob ?{,?} matches one- and two-character chrom directory names.
foreach d (?{,?}/NT_??????)
    set c=$d:t
    cd $d
    echo $c to $c.fa.align
    /cluster/bin/scripts/liftRMAlign.pl $c.lft > $c.fa.align
    cd ../..
end
# Second pass: per chromosome, symlink the contig .align files into the
# chrom directory, lift to chromosome coordinates, then remove the links.
foreach chr (?{,?})
    cd $chr
    echo making symbolic links for chr$chr NT .fa.align files
    foreach ctg (NT_??????)
        ln -s $ctg/$ctg.fa.align
    end
    cd ..
    if (-e $chr/lift/ordered.lft) then
        echo making $chr/chr$chr.fa.align
        /cluster/bin/scripts/liftRMAlign.pl $chr/lift/ordered.lft \
          > $chr/chr$chr.fa.align
    endif
    if (-e $chr/lift/random.lft) then
        echo making $chr/chr${chr}_random.fa.align
        /cluster/bin/scripts/liftRMAlign.pl $chr/lift/random.lft \
          > $chr/chr${chr}_random.fa.align
    endif
    echo removing symbolic links for chr$chr NT .fa.align files
    rm $chr/NT_??????.fa.align
end

TWINSCAN GENE PREDICTIONS (TODO)
# NOTE(review): "twinscanchr_gtf" below looks like a collapsed/garbled path —
# probably intended as ~/hg14/bed/twinscan; confirm before running.
mkdir -p ~/hg14/bed/twinscanchr_gtf
cd ~/hg14/bed/twinscan
foreach c (1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 X Y)
    rm -f chr$c.{gtf,ptx}
    wget http://genome.cs.wustl.edu/~bio/human/NCBI31/12-30-02/chr_gtf/chr$c.gtf
    wget http://genome.cs.wustl.edu/~bio/human/NCBI31/12-30-02/chr_ptx/chr$c.ptx
    # clean up chrom name and put chrom in transcript_id:
    perl -wpe 's/^chr(\w+)\.\d+\.\d+(.*)transcript_id "(\d+\.\d+).a"/chr$1$2transcript_id "$1.$3.a"/' \
      < chr$c.gtf > chr$c-fixed.gtf
    # pare down protein FASTA header to id and add missing .a:
    perl -wpe 's/^\>.*\s+source_id\s*\=\s*(\S+)\s+chr=(\w+).*$/\>$2.$1.a/;' \
      < chr$c.ptx > chr$c-fixed.fa
end
ldHgGene hg14 twinscan chr*-fixed.gtf -exon=CDS
hgPepPred hg14 generic twinscanPep chr*-fixed.fa

# LOAD CHIMP DATA (TODO)
# Download the chimp sequence and distribute to /iscratch/i
ssh hgwdev
mkdir /cluster/store1/chimpSeq
cd /cluster/store1/chimpSeq
wget http://www.cs.uni-duesseldorf.de/~ebersber/annotation_track_chimp/downloads/mpi-aligned_seqparts_jun02.fa.gz
gunzip *.gz
ssh kkr1u00
mkdir /iscratch/i/chimp
cp -p /cluster/store1/chimpSeq/*.fa /iscratch/i/chimp/
# Make sure it unpacked OK
~kent/bin/iSync
# Set up and run the cluster BLAT job (parasol):
ssh kk
mkdir ~/hg14/bed/blatChimp
cd ~/hg14/bed/blatChimp
cp ~/hg13/bed/blatChimp/gsub .
ls -1S /iscratch/i/chimp/*.fa > chimp.lst
ls -1S /scratch/hg/gs.15/build32/trfFa.1204/*.fa.trf > human.lst
mkdir psl
gensub2 human.lst chimp.lst gsub spec
para create spec
para try
para push
para check

# Sort alignments as so
ssh eieio
cd ~/hg14/bed/blatChimp
pslCat -dir psl \
| liftUp -type=.psl stdout ~/hg14/jkStuff/liftAll.lft warn stdin \
| pslSortAcc nohead chrom temp stdin
pslCat -dir chrom > blatChimp.psl
ssh hgwdev
cd ~/hg14/bed/blatChimp
hgLoadPsl hg14 blatChimp.psl

SGP GENE PREDICTIONS (TODO)
mkdir -p ~/hg14/bed/sgp/download
cd ~/hg14/bed/sgp/download
foreach f (~/hg14/?{,?}/chr?{,?}{,_random}.fa)
    set chr = $f:t:r
    wget http://genome.imim.es/genepredictions/H.sapiens/golden_path_20021114/SGP/$chr.gtf
    wget http://genome.imim.es/genepredictions/H.sapiens/golden_path_20021114/SGP/$chr.prot
end
# IMIM names this one chrUn; rename to UCSC's chrUn_random:
wget http://genome.imim.es/genepredictions/H.sapiens/golden_path_20021114/SGP/chrUn.gtf -O chrUn_random.gtf
wget http://genome.imim.es/genepredictions/H.sapiens/golden_path_20021114/SGP/chrUn.prot -O chrUn_random.prot
# Add missing .1 to protein id's
foreach f (*.prot)
    perl -wpe 's/^(>chr\w+)$/$1.1/' $f > $f:r-fixed.prot
end
cd ..
ldHgGene hg14 sgpGene download/*.gtf -exon=CDS
hgPepPred hg14 generic sgpPep download/*-fixed.prot

ALIGNED ANCIENT REPEATS FROM MOUSE BLASTZ (TODO)
ssh eieio
mkdir -p ~/hg14/bed/aarMm2
cd ~/hg14/bed/aarMm2
set mmdir=../blastz.mm2.2002-12-5-ASH
foreach aar ($mmdir/aar/*.aar.gz)
    # :t:r:r strips the directory plus the .aar.gz double extension
    set c = $aar:t:r:r
    echo translating chr$c aar to axt
    zcat $aar \
    | $HOME/kent/src/hg/makeDb/hgAar/aarToAxt \
    | axtToPsl stdin $mmdir/S1.len $mmdir/S2.len stdout \
      > chr${c}_aarMm2.psl
end
ssh hgwdev
cd ~/hg14/bed/aarMm2
hgLoadPsl hg14 *.psl

ALIGNMENT COUNTS FOR WIGGLE TRACK
# this needs to be updated to reflect the full process.
- Generate BED table of AARs used to select regions.
cat ../bed/aarMm2/*.psl | awk 'BEGIN{OFS="\t"} {print $14,$16,$17,"aar"}' >aarMm2.bed
- Generate background counts with windows that have a 6kb counts, with
  a maximum windows size of 512kb and sliding the windows by
# NOTE(review): sentence above is truncated; per the command below the
# windows slide by 1000 bases — confirm.
foreach axt (../../blastz.mm2.2002-08-01/axtBest/chr*.axt)
    set chr=$axt:t:r
    set tab=$chr.6kb-aar.cnts (??? need better name ???)
    hgCountAlign -selectBed=aarMm2.bed -winSize=512000 -winSlide=1000 -fixedNumCounts=6000 -countCoords $axt $tab
end
- Generate counts for AARs with 50b windows, slide by 5b
foreach axt (../../blastz.mm2.2002-08-01/axtBest/chr*.axt)
    set chr=$axt:t:r
    set tab=$chr.50b-aar.cnts (??? need better name ???)
    hgCountAlign -selectBed=aarMm2.bed -winSize=50 -winSlide=5 $axt $tab
end
- Generate counts for all with 50b windows, slide by 5b
foreach axt (../../blastz.mm2.2002-08-01/axtBest/chr*.axt)
    set chr=$axt:t:r
    set tab=$chr.50b.cnts (??? need better name ???)
    hgCountAlign -winSize=50 -winSlide=5 $axt $tab
end

REFFULL (TODO)
o ssh to eieio
mkdir -p /cluster/store5/gs.15/build32/bed/refFull
cd /cluster/store5/gs.15/build32/bed/refFull
Download the sequence:
wget ftp://blue3.ims.u-tokyo.ac.jp/pub/db/hgc/dbtss/ref-full.fa.gz
gunzip it and split the ref-rull.fa file into about 200 pieces
gunzip ref-full.fa.gz
faSplit sequence ref-full.fa 50 splitRefFull
ssh kkstore
cd /cluster/store5/gs.15/build32/bed/refFull
mkdir /scratch/hg/refFull
# NOTE(review): the line below appears to have lost its command word —
# presumably "cp splitRefFull* /scratch/hg/refFull/"; confirm.
splitRefFull* /scratch/hg/refFull/
ls -1S /scratch/hg/gs.15/build32/contig.0729/*.fa > genome.lst
ls -1S /scratch/hg/refFull/split*.fa > refFull.lst
o - Request the admins to do a binrsync to the cluster of /scratch/hg/refFull
o - Use BLAT to generate refFull alignments as so:
Make sure that /scratch/hg/gs.15/build32/contig/ is loaded with NT_*.fa and pushed to the cluster nodes.
ssh kk
cd /cluster/store5/gs.15/build32/bed/refFull
mkdir -p psl
# run mkdirs.sh script to create sudirs in the psl directory
# in order to modularize the blat job.
gensub2 genome.lst refFull.lst gsub spec
para create spec
Now run a para try/push and para check in each one.
o - Process refFull alignments into near best in genome.
cd ~/hg14/bed
cd refFull
pslSort dirs raw.psl /tmp psl/*
pslReps -minCover=0.2 -sizeMatters -minAli=0.98 -nearTop=0.002 raw.psl contig.psl /dev/null
liftUp -nohead all_refFull.psl ../../jkStuff/liftAll.lft carry contig.psl
pslSortAcc nohead chrom /tmp all_refFull.psl
o - Load refFull alignments into database
ssh hgwdev
cd /cluster/store5/gs.15/build32/bed/refFull
pslCat -dir chrom > refFullAli.psl
hgLoadPsl hg14 -tNameIx refFullAli.psl

MAKING PROMOTER FILES
# featureBits -fa=... writes the upstream regions of refGene entries to a
# FASTA file, which is then zipped for the downloads area.
cd /usr/local/apache/htdocs/goldenPath/14nov2002/bigZips
featureBits hg14 -fa=upstream1000.fa refGene:upstream:1000
zip upstream1000.zip upstream1000.fa
featureBits hg14 -fa=upstream2000.fa refGene:upstream:2000
zip upstream2000.zip upstream2000.fa
featureBits hg14 -fa=upstream5000.fa refGene:upstream:5000
zip upstream5000.zip upstream5000.fa
rm upstream*.fa

MAKING MOUSE AND RAT SYNTENY
# Pipeline run once per species against its blastzBest table; each run
# ends by loading the resulting ucsc100k.bed.
# syntenicBest.pl -db=hg14 -table=blastzBestMm3
smooth.pl
joinsmallgaps.pl
fillgap.pl -db=hg14 -table=blastzBestMm3
synteny2bed.pl
hgLoadBed hg14 syntenyMouse ucsc100k.bed
syntenicBest.pl -db=hg14 -table=blastzBestRn2
smooth.pl
joinsmallgaps.pl
fillgap.pl -db=hg14 -table=blastzBestRn2
synteny2bed.pl
hgLoadBed hg14 syntenyRat ucsc100k.bed