# for emacs: -*- mode: sh; -*-

# This file describes how we made the browser database on
# NCBI build 36 (October 2005 freeze)

# NOTE: this doc may have genePred loads that fail to include
# the bin column.  Please correct that for the next build by adding
# a bin column when you make any of these tables:
#
#	mysql> SELECT tableName, type FROM trackDb WHERE type LIKE "%Pred%";
#	+---------------+-------------------------------------+
#	| tableName     | type                                |
#	+---------------+-------------------------------------+
#	| knownGene     | genePred knownGenePep knownGeneMrna |
#	| refGene       | genePred refPep refMrna             |
#	| xenoRefGene   | genePred xenoRefPep xenoRefMrna     |
#	| mgcGenes      | genePred                            |
#	| ensGene       | genePred ensPep                     |
#	| nscanGene     | genePred nscanPep                   |
#	| sgpGene       | genePred sgpPep                     |
#	| geneid        | genePred geneidPep                  |
#	| genscan       | genePred genscanPep                 |
#	| exonWalk      | genePred                            |
#	| ecoresTetNig1 | genePred                            |
#	+---------------+-------------------------------------+

# HOW TO BUILD AN ASSEMBLY FROM NCBI FILES
# ---------------------------------------
# 10/06/2005
# Make gs.19 directory, gs.19/build36 directory, and gs.19/ffa directory.
    ssh kkstore02
    mkdir /cluster/store11/gs.19
    mkdir /cluster/store11/gs.19/build36
    mkdir /cluster/store11/gs.19/agp
    mkdir /cluster/store11/gs.19/ffa

# Make a symbolic link from /cluster/store1 to this location
# (I assume there is some use for this later?)
    cd /cluster/store1
    ln -s /cluster/store11/gs.19 ./gs.19
    ln -s /cluster/store11/gs.19/build36 /cluster/data/hg18

# Make a symbolic link from your home directory to the build dir:
# (Investigate what this is used for, may no longer be necessary)
    cd
    ln -s /cluster/store11/gs.19/build36 ~/oo

# NCBI download site, fetch everything into this one directory:
# with the machine and password in your $HOME/.netrc file, this
# wget command will require no login.  Your $HOME/.netrc file
# should be mode 600 ('chmod 600 .netrc') so that no one else can
# read the login information.  (There were some early files that
# later moved into an OLD subdirectory.  They were broken.)
# 11/16/2005
# Received answer from Greg to go ahead with the new build.
    ssh kkstore02
    mkdir /cluster/store11/gs.19/ncbi
    cd /cluster/store11/gs.19/ncbi
    bash
    wget --timestamp ftp://ftp-private.ncbi.nih.gov/build_36/*

# New to this build is the sequence NC_001807, which is the
# mitochondrial sequence.  This prefix NC_ is new to the process
# and will have to be accounted for below.  The other two special
# prefixes are similar to what was seen before:
# from DR52.agp  NG_002392
#	Homo sapiens major histocompatibility complex, class II,
#	DR52 haplotype (DR52) on chromosome 6
# and from DR53.agp  NG_002433
#	Homo sapiens major histocompatibility complex, class II,
#	DR53 haplotype (DR53) on chromosome 6

# Fixup seq_contig.md
#
# It has a bunch of stuff belonging to the Celera
# genome assembly.  Filter those out.  I don't know what the
# NT_07959[0-7] items are, but there are no definitions for them
# in the agp files and no sequence in any fa.gz file.
# Fixup the names for the NG_ items, and change chrom MT to be M
# get the seq_contig.md file Craig just made for us on 11/28/05.
cd /cluster/store11/gs.19/ncbi wget --timestamp ftp://ftp-private.ncbi.nih.gov/build_36/seq_contig.md # remove Celera and Toronto entries # and replace chrom number for those haplotypes ssh hgwdev cd /cluster/store11/gs.19/build36 egrep -v "Celera|NT_07959[0-7]" ../ncbi/seq_contig.md |grep -v CRA_TCA >seq_contig0.tab hgsql hg18 -e 'drop table seq_contig0' hgsql hg18 <~/src/hg/lib/seq_contig0.sql hgsql hg18 -e 'load data local infile "seq_contig0.tab" into table seq_contig0' # fix seq_contig and # get the randoms sorted in proper order. The createNcbiLifts # does not work correctly if the randoms are not grouped together # by chromosome fixMd0 hg18 |sed -e "s/6_qbl_hap1/6_qbl_hap2/"| sed -e "s/MT/M/" | grep -v "|" >seq_contig1.tab hgsql hg18 -e 'drop table seq_contig1' hgsql hg18 <~/src/hg/lib/seq_contig1.sql hgsql hg18 -e 'load data local infile "seq_contig1.tab" into table seq_contig1' fixMd hg18 seq_contig1 >seq_contig.md # This pulls out all the randoms and groups them within the # same chrom but leaving them in the same order as they orginally # were (warning this is BASH code ...) bash grep "|" seq_contig0.tab | awk -F"|" '{print $1}' | \ awk '{print $2}' | sort -n -u | while read CHR do grep "[^0-9]${CHR}|" seq_contig0.tab done >> seq_contig.md exit hgsql hg18 -e 'drop table seq_contig' hgsql hg18 <~/src/hg/lib/seq_contig.sql hgsql hg18 -e 'load data local infile "seq_contig.md" into table seq_contig' # FYI: agp file format documented at: # http://www.ncbi.nlm.nih.gov/Genbank/WGS.agpformat.html# fixup a couple of names for our own purposes here cd /cluster/store11/gs.19/agp ln -s ../ncbi/chr*.agp ../ncbi/chr*.fa.gz . sed -e "s#MT/NC_001807#NC_001807#" ../ncbi/chrMT.agp > chrM.agp cat ../ncbi/c22_H2.agp > chr22_h2_hap1.agp cat ../ncbi/c5_H2.agp > chr5_h2_hap1.agp cat ../ncbi/c6_COX.agp > chr6_cox_hap1.agp cat ../ncbi/c6_QBL.agp > chr6_qbl_hap2.agp cp -p ../ncbi/c22_H2.fa.gz chr22_h2_hap1.fa.gz cp -p ../ncbi/c5_H2.fa.gz chr5_h2_hap1.fa.gz cp -p ../ncbi/c6_COX.fa.gz chr6_cox_hap1.fa.gz cp -p ../ncbi/c6_QBL.fa.gz chr6_qbl_hap2.fa.gz mkdir sav cp -p *hap*.agp sav # fix hap type agp files that have multiple contigs. fixAgp hg18 sav/chr5_h2_hap1.agp chr5_h2_hap1.agp fixAgp hg18 sav/chr6_qbl_hap2.agp chr6_qbl_hap2.agp # PLEASE NOTE THAT THESE TWO CORRECTED .agp FILES ABOVE ARE USED LATER, # NOT BY THE NEXT STEP IMMEDIATELY. # Put all the agp files together into one. # The chrM sequence now has its own agp, remove it from # ref_placed.agp # sed -e "/^NC_001807/d" ../ncbi/ref_placed.agp > ref_placed.agp # PLEASE NOTE THAT THE ORIGINAL NCBI .agp FILES FOR THOSE # SPECIAL HAP TYPE SEQUENCES ARE USED, NOT THE CORRECTED ONES. cd /cluster/store11/gs.19/build36 cat ../ncbi/ref_placed.agp ../agp/chrM.agp ../ncbi/ref_unplaced.agp \ ../ncbi/c22_H2.agp \ ../ncbi/c5_H2.agp \ ../ncbi/c6_COX.agp \ ../ncbi/c6_QBL.agp \ ../ncbi/PAR.agp > ncbi_build36.agp # cat ../ncbi/ref_placed.agp ../agp/chrM.agp ../ncbi/ref_unplaced.agp \ # ../agp/chr22_h2_hap1.agp ../agp/chr5_h2_hap1.agp \ # ../agp/chr6_cox_hap1.agp ../agp/chr6_qbl_hap2.agp \ # ../ncbi/PAR.agp > ncbi_build36.agp zcat ../ncbi/chrMT.fa.gz | \ sed -e "s/gi|17981852|ref|NC_001807.4/ref|NC_001807/" | \ gzip > chrM.fa.gz # and into ffa cd /cluster/store11/gs.19/ffa # NO LONGER TRUE FOR GS19! 
# There is a single bogus line at the end of ref_placed.fa.gz # declaring the NC_001807 MT sequence, this was later replaced by # chrMT.fa.gz, so remove that one line: zcat ../ncbi/ref_placed.fa.gz | sed -e "/^>ref|NC_001807/d" | \ gzip > ref_placed.fa.gz # (That's a 40 minute job) # sequence.inf is usually here, symlink it #ln -s ../ncbi/sequence.inf ln -s ../ncbi/chromosome_extents.inf # put all the fa.gz files together in one big fa.gz # time zcat ref_placed.fa.gz ../agp/chrM.fa.gz ../ncbi/ref_unplaced.fa.gz \ time zcat ../ncbi/ref_placed.fa.gz ../ncbi/ref_unplaced.fa.gz \ ../agp/*hap?.fa.gz ../ncbi/PAR.fa.gz | gzip \ > ncbi_build36.fa.gz # Make a listing of all the fasta record headers, just FYI: cd /cluster/store11/gs.19 zcat ffa/ncbi_build36.fa.gz | grep "^>" > ncbi.fa.headers # Sanity check, checkYbr was updated to handle the NC_ identifier cd /cluster/store11/gs.19/build36 zcat ../ffa/ncbi_build36.fa.gz | $HOME/bin/i386/checkYbr ncbi_build36.agp stdin seq_contig.md >check.seq_contig # result should be clean: cat check.seq_contig # Read 378 contigs from ncbi_build36.agp # Verifying sequence sizes in stdin # 0 problems detected # Convert fa files into UCSC style fa files and place in "contigs" # directory inside the gs.19/build36 directory # (a check that can be done here is make a list of the contigs # in this ./contigs directory before and compare it with the # list of distributed contigs created after they have been # disbursed.) # faNcbiToUcsc was fixed to handle the NC_ identifier cd /cluster/store11/gs.19/build36 # We've been through this often # mv contigs contigs.0 zcat ../ffa/ncbi_build36.fa.gz | $HOME/bin/i386/faNcbiToUcsc \ -split -ntLast stdin contigs # If you want to compare anything to previous work, check now, then: # rm -fr contigs.0 # Determine the chromosome sizes from agps # Watch carefully how chrY gets constructed. I'm not sure # this chrom_sizes represents the whole length of chrY with # the PAR added. We will see about that. # Script updated to handle new chrom names: # my @chroms = (1 .. 22, 'X', 'Y', 'M', '6_hla_hap1', '6_hla_hap2'); cd /cluster/store11/gs.19/build36 /cluster/bin/scripts/getChromSizes ../agp # Create chrom.lst list for use in foreach() loops awk '{print $1}' chrom_sizes | sed -e "s/chr//" > chrom.lst # Create lift files (this will create chromosome directory structure) and # inserts file /cluster/bin/scripts/createNcbiLifts -s chrom_sizes seq_contig.md . # Create contig agp files (will create contig directory structure) /cluster/bin/scripts/createNcbiCtgAgp seq_contig.md ncbi_build36.agp . # Create chromsome random agp files. /cluster/bin/scripts/createNcbiChrAgp -randomonly . # Copy the original chrN.agp files from the gs.19/agp directory # into each of the chromosome directories since they contain better # gap information. Delete the comments at top from these. cd /cluster/store11/gs.19/build36 foreach c ( `cat chrom.lst` ) sed -e "/^#.*/d" ../agp/chr${c}.agp > ./${c}/chr${c}.agp end # chrM needs a name fixup sed -e "s#NC_001807#chrM#" ../agp/chrM.agp > M/chrM.agp # Distribute contig .fa to appropriate directory (assumes all files # are in "contigs" directory). # Create inserts file from agp and lift files (new - added by Terry, 2004-07-12) /cluster/bin/scripts/createInserts /cluster/data/hg18 > /cluster/data/hg18/inserts # create global data link for everyone. No more home directory # links required. ln -s /cluster/store11/gs.19/build36 /cluster/data/hg18 cd /cluster/data/hg18 /cluster/bin/scripts/distNcbiCtgFa contigs . 
# Verify that everything was moved properly, the contigs directory # should be empty: ls contigs # Nothing there, then remove it rmdir contigs # Make a list of the contigs for use later rm contig.lst touch contig.lst foreach chrom ( `cat chrom.lst` ) foreach c ( $chrom/N{C,G,T}_?????? ) set contig = $c:t echo "${chrom}/${contig}/${contig}.fa" >> contig.lst end end # For later comparisons, this is how many contigs we have: wc -l contig.lst # 378 contig.lst # Note 2004-06-30 - there are some clone numbers left in some of # the NCBI files that are incorrect. Due to version number # changes, more than one version is listed. Namely for accession # numbers: AC004491 AC004921 AC004983 AC005088 AC006014 AC099654 # The AGP files are correct, the sequence.inf file lists these # twice: AC004491.1 AC004491.2 # AC004921.1 AC004921.2 AC004983.2 AC004983.3 # AC005088.2 AC005088.3 AC006014.2 AC006014.3 # AC099654.4 AC099654.5 # for hg18, NCBI did not provide the seq.inf file. # FILES ARE NOW READY FOR REPEAT MASKING - start that process as # other steps here can proceed in parallel. # Previous practice used to copy everything over for jkStuff from a # previous build. Rather than do that, pick up whatever is needed # at the time it is needed and verify that it is going to do what # you expect. cd /cluster/data/hg18 mkdir jkStuff # Create the contig.gl files - XXX - NCBI doesn't deliver # contig_overlaps.agp - 2004-06-18 - this is beginning to come # together and there is now a contig_overlaps.agp file # This is properly done below with a combination of psLayout # alignments to create the contig_overlaps.agp file # /cluster/bin/i386/agpToGl contig_overlaps.agp . -md=seq_contig.md # Create chromosome gl files # jkStuff/liftGl.csh contig.gl # CREATING DATABASE (DONE - 2005-11-30 - Fan) ssh hgwdev # Make sure there is at least 5 gig free on hgwdev:/var/lib/mysql df -h /var/lib/mysql # Filesystem Size Used Avail Use% Mounted on # /dev/sdc1 1.8T 1.3T 356G 79% /var/lib/mysql # Create the database. hgsql -e 'create database hg18' mysql # Copy over grp table (for track grouping) from another database: hgsql -e "create table grp (PRIMARY KEY(NAME)) select * from hg17.grp" hg18 # The DB updates to grp below are not needed since we copied from hg17. # ENCODE groups # Added 2005-08016 kate echo 'UPDATE grp SET priority=7 WHERE name="varRep"'| hgsql hg18 echo 'UPDATE grp SET priority=8 WHERE name="encode"'| hgsql hg18 echo 'INSERT INTO grp (name, label, priority) VALUES ("encodeGenes", "ENCODE Regions and Genes", 8.1)' | hgsql hg18 echo 'INSERT INTO grp (name, label, priority) VALUES ("encodeTxLevels", "ENCODE Transcript Levels", 8.2)' | hgsql hg18 echo 'INSERT INTO grp (name, label, priority) VALUES ("encodeChip", "ENCODE Chromatin Immunoprecipitation", 8.3)' | hgsql hg18 echo 'INSERT INTO grp (name, label, priority) VALUES ("encodeChrom", "ENCODE Chromosome, Chromatin and DNA Structure", 8.4)' | hgsql hg18 echo 'INSERT INTO grp (name, label, priority) VALUES ("encodeCompGeno", "ENCODE Comparative Genomics", 8.5)' | hgsql hg18 echo 'INSERT INTO grp (name, label, priority) VALUES ("encodeVariation", "ENCODE Variation", 8.6)' | hgsql hg18 echo 'INSERT INTO grp (name, label, priority) VALUES ("encodeAnalysis", "ENCODE Analysis", 8.9)' | hgsql hg18 # MAKE CHROMINFO TABLE WITH (TEMPORARILY UNMASKED) NIBS # (DONE - 2005-12-02 - Fan) # Make nib/, unmasked until RepeatMasker and TRF steps are done. # Do this now so that the chromInfo table will exist and thus the # trackDb tables can be built in the next step. 
# These unmasked nibs will be replaced by the masked nibs after # repeat mask and trf are done. ssh kkstore02 cd /cluster/data/hg18 cp /cluster/data/hg17/jkStuff/chrFa.csh jkStuff -p # Make chr*.fa from contig .fa # Copied chrFa.sh from hg17/jkStuff, renamed it to chrFa.csh bash time ./jkStuff/chrFa.csh # real 2m34.406s # user 1m17.405s # sys 0m16.730s exit mkdir nib foreach c (`cat chrom.lst`) foreach f ($c/chr${c}{,_random}.fa) if (-e $f) then echo "nibbing $f" /cluster/bin/i386/faToNib $f nib/$f:t:r.nib endif end end # Make symbolic links from /gbdb/hg18/nib to the real nibs. ssh hgwdev mkdir -p /gbdb/hg18/nib ln -s /cluster/data/hg18/nib/chr*.nib /gbdb/hg18/nib # Load /gbdb/hg18/nib paths into database and save size info. cd /cluster/data/hg18 hgsql hg18 < $HOME/kent/src/hg/lib/chromInfo.sql hgNibSeq -preMadeNib hg18 /gbdb/hg18/nib */chr*.fa hgsql -N -e "select chrom,size from chromInfo order by chrom" hg18 \ > chrom.sizes # You can compare this chrom.sizes with the previously created # chrom_sizes. Should be no difference sort chrom_sizes > s0 sort chrom.sizes | grep -v random > s1 diff s0 s1 rm s0 s1 # MAKE HGCENTRALTEST ENTRY AND TRACKDB TABLE (DONE - 2005-12-06 - Fan) # dbDb orderKey updated 2005-12-06 - Fan ssh hgwdev # reset dbDb orderKey - these have never been ordered properly # before, this will get them on the program. hgsql -e 'update dbDb set orderKey=11 where name = "hg17";' \ -h genome-testdb hgcentraltest hgsql -e 'update dbDb set orderKey=12 where name = "hg16";' \ -h genome-testdb hgcentraltest hgsql -e 'update dbDb set orderKey=13 where name = "hg15";' \ -h genome-testdb hgcentraltest hgsql -e 'update dbDb set orderKey=14 where name = "hg13";' \ -h genome-testdb hgcentraltest # Enter hg18 into hgcentraltest.dbDb so test browser knows about it: hgsql -e 'INSERT INTO dbDb (name, description, nibPath, organism, \ defaultPos, active, orderKey, genome, scientificName, \ htmlPath, hgNearOk, hgPbOk, sourceName) \ VALUES("hg18", "Feb. 2006", "/gbdb/hg18/nib", "Human", \ "chr7:127,664,479-127,689,005", 1, 10, "Human", "Homo sapiens", \ "/gbdb/hg18/html/description.html", 0, 0, "NCBI Build 36.1");' \ -h genome-testdb hgcentraltest # Make trackDb table so browser knows what tracks to expect: cd ~/kent/src/hg/makeDb/trackDb cvs up -d -P . # Edit the makefile to add hg18 in all the right places and do make update make alpha cvs commit makefile # MAKE LIFTALL.LFT, NCBI.LFT (DONE - 2005-12-07 Fan) cd /cluster/data/hg18 mkdir -p jkStuff cat */lift/{ordered,random}.lft > jkStuff/liftAll.lft # Create jkStuff/ncbi.lft for lifting stuff built with the NCBI assembly. # Note: this ncbi.lift will not lift floating contigs to chr_random coords, # but it will show the strand orientation of the floating contigs # (grep for '|'). # mdToNcbiLift seq_contig.md jkStuff/ncbi.lft # XXXX - appears to be unused, not done - Hiram # REPEAT MASKING (DONE - 2005-12-09 - Fan) # Record the RM version here: # as this changes over time and there is no record in the results ls -l /cluster/bluearc/RepeatMasker # lrwxrwxrwx 1 angie protein 18 Nov 3 10:40 # /cluster/bluearc/RepeatMasker -> RepeatMasker051101 # beware that you can not actually include the precise single line output # by this command since it is a CVS ident line and it will get # changed as this file is checked into CVS. Remove the Id and # dollar sign business to allow it to stay as it is here. 
/cluster/bluearc/RepeatMasker/RepeatMasker | head -1 # RepeatMasker version development-: # RepeatMasker,v 1.10 2005/11/03 18:39:27 angie Exp cat /cluster/bluearc/RepeatMasker051101/Libraries/version # RepBase Update 9.11, RM database version 20050112 # Split contigs, run RepeatMasker, lift results # This split takes a few minutes ssh kkstore02 cd /cluster/data/hg18 foreach chrom ( `cat chrom.lst` ) foreach c ( $chrom/N{C,G,T}_?????? ) set contig = $c:t echo "splitting ${chrom}/${contig}/${contig}.fa" faSplit size ${chrom}/${contig}/$contig.fa 500000 \ ${chrom}/${contig}/${contig}_ \ -lift=${chrom}/${contig}/$contig.lft -maxN=500000 end end #- Make the run directory and job list: cd /cluster/data/hg18 mkdir -p jkStuff # According to RepeatMasker help file, no arguments are required to # specify species because its default is set for primate (human) # This run script saves the .tbl file to be sent to Arian. He uses # those for his analysis. Sometimes he needs the .cat and .align files for # checking problems. Krish needs the .align files, they are large. cat << '_EOF_' > jkStuff/RMHuman #!/bin/csh -fe cd $1 pushd . /bin/mkdir -p /tmp/hg18/$2 /bin/cp $2 /tmp/hg18/$2/ cd /tmp/hg18/$2 /cluster/bluearc/RepeatMasker/RepeatMasker -ali -s $2 popd /bin/cp /tmp/hg18/$2/$2.out ./ if (-e /tmp/hg18/$2/$2.align) /bin/cp /tmp/hg18/$2/$2.align ./ if (-e /tmp/hg18/$2/$2.tbl) /bin/cp /tmp/hg18/$2/$2.tbl ./ # if (-e /tmp/hg18/$2/$2.cat) /bin/cp /tmp/hg18/$2/$2.cat ./ /bin/rm -fr /tmp/hg18/$2/* /bin/rmdir --ignore-fail-on-non-empty /tmp/hg18/$2 /bin/rmdir --ignore-fail-on-non-empty /tmp/hg18 '_EOF_' # << this line makes emacs coloring happy chmod +x jkStuff/RMHuman ssh kkstore02 cd /cluster/data/hg18 mkdir RMRun rm -f RMRun/RMJobs touch RMRun/RMJobs foreach d ( `cat chrom.lst` ) foreach c ( ${d}/N{C,G,T}_*/N{C,G,T}_*_*.fa ) set f = $c:t set cc = $c:h set contig = $cc:t echo /cluster/store11/gs.19/build36/jkStuff/RMHuman \ /cluster/store11/gs.19/build36/${d}/${contig} $f \ '{'check out line+ /cluster/store11/gs.19/build36/${d}/${contig}/$f.out'}' \ >> RMRun/RMJobs end end # We have 5990 jobs in RMJobs: wc RMRun/RMJobs # 5990 41930 1127992 RMRun/RMJobs #- Do the run ssh pk cd /cluster/data/hg18/RMRun para create RMJobs para try, para check, para check, para push, para check,... #- While that is running, you can run TRF (simpleRepeat) on the small # cluster. See SIMPLE REPEAT section below # Completed: 5990 of 5990 jobs # CPU time in finished jobs: 30661460s 511024.34m 8517.07h 354.88d 0.972 y # IO & Wait Time: 38038s 633.96m 10.57h 0.44d 0.001 y # Average job time: 5125s 85.42m 1.42h 0.06d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 6693s 111.55m 1.86h 0.08d # Submission to last job: 86532s 1442.20m 24.04h 1.00d # Lift up the split-contig .out's to contig-level .out's # # If a mistake is made in the following it would be possible to # destroy all the RM output. So, just to be paranoid, save all # the RM output in bluearc for the time being: ssh kkstore02 cd /cluster/data/hg18 mkdir /cluster/bluearc/hg18/RMOutput foreach c ( `cat chrom.lst` ) foreach d ( ${c}/N{C,G,T}_* ) set T = /cluster/bluearc/hg18/RMOutput/${d} mkdir -p ${T} cd ${d} set contig = $d:t cp -p ${contig}_?{,?,??}.fa.out ${T} cd ../.. echo "${d} done" end end # Make sure we got them all: # (this doesn't work later since there are more *.fa.out files # after the lifting. More explicitly to find just these: # find . -name "N?_*_*.fa.out" -print | wc -l find . 
-name "*.fa.out" -print | wc -l # 5990 find /cluster/bluearc/hg18/RMOutput -type f | wc -l # 5990 # same count # OK, now you can try this operation, do it in a script like this # and save the output of the script for a record of what happened. cat << '_EOF_' > jkStuff/liftRM.csh #!/bin/csh -fe foreach c ( `cat chrom.lst` ) foreach d ( ${c}/N{C,G,T}_* ) cd $d set contig = $d:t liftUp $contig.fa.out $contig.lft warn ${contig}_?{,?,??}.fa.out cd ../.. end end '_EOF_' chmod +x jkStuff/liftRM.csh mkdir scriptsOutput script lift.log bash time jkStuff/liftRM.csh > scriptsOutput/liftRM.1 2>&1 exit exit # Check that they all were done: grep "fa.out" scriptsOutput/liftRM.1 | wc -l # 5990 # same count as above #- Lift up RepeatMask .out files to chromosome coordinates via # picked up jkStuff/liftOut2.sh from the hg17 build. Renamed to # liftOut2.csh, changed the line that does the chrom listing bash time ./jkStuff/liftOut2.csh > scriptsOutput/liftOut2 2>&1 # real 0m30.488s # user 0m24.670s # sys 0m2.797s # seems much faster than hg17 ??? # hg17 numbers: # real 9m46.780s # user 1m18.900s # sys 7m33.990s #- By this point, the database should have been created (above): ssh hgwdev cd /cluster/data/hg18 bash time hgLoadOut hg18 ?/*.fa.out ??/*.fa.out *hap*/*.fa.out > \ scriptsOutput/hgLoadOut 2>&1 # real 9m9.045s # user 2m19.500s # sys 0m24.440s # errors during this load: (there are always a couple of these) # Strange perc. field -1.2 line 153851 of 2/chr2.fa.out # Strange perc. field -10423.3 line 174747 of 3/chr3.fa.out # Strange perc. field -5635.9 line 174747 of 3/chr3.fa.out # Strange perc. field -259.3 line 174747 of 3/chr3.fa.out # Strange perc. field -1.4 line 205545 of 4/chr4.fa.out # Strange perc. field -0.1 line 167690 of 7/chr7.fa.out # Strange perc. field -1331.2 line 198656 of 7/chr7.fa.out # Strange perc. field -1460.4 line 198656 of 7/chr7.fa.out # Strange perc. field -4.2 line 223183 of 7/chr7.fa.out # Strange perc. field -3192.0 line 60424 of 8/chr8.fa.out # Strange perc. field -423.4 line 60424 of 8/chr8.fa.out # Strange perc. field -784.0 line 60424 of 8/chr8.fa.out # Strange perc. field -0.1 line 52020 of X/chrX.fa.out # Strange perc. field -4526.7 line 190254 of X/chrX.fa.out # Strange perc. field -3757.2 line 190254 of X/chrX.fa.out # Strange perc. field -597.2 line 190254 of X/chrX.fa.out # Strange perc. field -13030.4 line 137624 of 16/chr16.fa.out # Strange perc. field -1359.8 line 137624 of 16/chr16.fa.out # Strange perc. field -2223.5 line 137624 of 16/chr16.fa.out # Strange perc. field -1.3 line 11573 of 22/chr22.fa.out # Strange perc. field -12.7 line 69873 of 22/chr22.fa.out # Verify we have similar results to previous assembly: # featureBits hg18 rmsk # 1406290513 bases of 3107677273 (45.252%) in intersection # featureBits -countGaps hg17 rmsk # 1390952984 bases of 3095016460 (44.942%) in intersection # featureBits hg17 rmsk # 1391378842 bases of 2867328468 (48.525%) in intersection # featureBits hg16 rmsk # 1388770568 bases of 2865248791 (48.469%) in intersection # Now proceed to MASK SEQUENCE BOTH REPEATMASKER AND SIMPLE REPEAT/TRF # following the SIMPLE REPEAT sections below # let Rachel know that RepeatMask is done. 
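# (Optional follow-up, not part of the original build: the "Strange perc.
# field" warnings from hgLoadOut above correspond to RepeatMasker .out lines
# whose percentage fields went negative.  A sketch to pull those lines out for
# review, assuming the standard .out layout with %div/%del/%ins in columns
# 2-4; strangePerc.txt is just an illustrative name.)
    cd /cluster/data/hg18
    awk 'FNR > 3 && ($2 < 0 || $3 < 0 || $4 < 0) {print FILENAME, FNR, $0}' \
        ?/*.fa.out ??/*.fa.out *hap*/*.fa.out > scriptsOutput/strangePerc.txt
    wc -l scriptsOutput/strangePerc.txt
    # review these lines (some of them produced more than one warning above)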
# SIMPLE REPEAT [TRF] TRACK (DONE - 2005-12-07 - Fan) # Copy the contigs, first to the bluearc, then to /iscratch/i ssh kkstore02 mkdir /cluster/bluearc/hg18 mkdir /cluster/bluearc/hg18/contigs cd /cluster/data/hg18 foreach ctg ( `cat contig.lst` ) set c = $ctg:t echo "$ctg > /cluster/bluearc/hg18/contigs/$c" cp -p $ctg /cluster/bluearc/hg18/contigs/$c end # Check how much is there: # du -hsc /cluster/bluearc/hg18/contigs # 2.8G /cluster/bluearc/hg18/contigs exit # Distribute contigs to /iscratch/i ssh pk mkdir -p /san/sanvol1/scratch/hg18/unmaskedContigs cd /san/sanvol1/scratch/hg18/unmaskedContigs cp -p /cluster/bluearc/hg18/contigs/* . ls . # Verify same amount made it there: # du -hsc /san/sanvol1/scratch/hg18/unmaskedContigs # 2.9G /san/sanvol1/scratch/hg18/unmaskedContigs # Then send them to the other 7 Iservers # /cluster/bin/iSync # Go to the small cluster for this business: ssh pk mkdir -p /cluster/data/hg18/bed/simpleRepeat cd /cluster/data/hg18/bed/simpleRepeat mkdir trf cat << '_EOF_' > runTrf #!/bin/csh -fe # set path1 = $1 set inputFN = $1:t set outpath = $2 set outputFN = $2:t mkdir -p /tmp/$outputFN cp $path1 /tmp/$outputFN pushd . cd /tmp/$outputFN /cluster/bin/i386/trfBig -trf=/cluster/bin/i386/trf $inputFN /dev/null -bedAt=$outputFN -tempDir=/tmp popd rm -f $outpath cp -p /tmp/$outputFN/$outputFN $outpath rm -fr /tmp/$outputFN/* rmdir --ignore-fail-on-non-empty /tmp/$outputFN '_EOF_' # << this line makes emacs coloring happy chmod +x runTrf cat << '_EOF_' > gsub #LOOP ./runTrf {check in line+ $(path1)} {check out line trf/$(root1).bed} #ENDLOOP '_EOF_' # << this line makes emacs coloring happy ls -1S /san/sanvol1/scratch/hg18/unmaskedContigs/*.fa > genome.lst gensub2 genome.lst single gsub jobList para create jobList para try para check para push para check # Completed: 378 of 378 jobs # CPU time in finished jobs: 18956s 315.93m 5.27h 0.22d 0.001 y # IO & Wait Time: 2519s 41.98m 0.70h 0.03d 0.000 y # Average job time: 57s 0.95m 0.02h 0.00d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 2345s 39.08m 0.65h 0.03d # Submission to last job: 2427s 40.45m 0.67h 0.03d bash liftUp simpleRepeat.bed /cluster/data/hg18/jkStuff/liftAll.lft \ warn trf/*.bed > lu.out 2>&1 # Load into the database: ssh hgwdev cd /cluster/data/hg18/bed/simpleRepeat /cluster/bin/i386/hgLoadBed hg18 simpleRepeat simpleRepeat.bed \ -sqlTable=$HOME/kent/src/hg/lib/simpleRepeat.sql # Loaded 629076 elements of size 16 # Compare with previous assembly featureBits hg18 simpleRepeat # 56164158 bases of 3107677273 (1.807%) in intersection # featureBits hg17 simpleRepeat # 54952425 bases of 2866216770 (1.917%) in intersection # featureBits hg16 simpleRepeat # 54320136 bases of 2865248791 (1.896%) in intersection # GAPS weren't in hg18 yet at this point, after gaps added: # featureBits hg18 simpleRepeat # 54964044 bases of 2867328468 (1.917%) in intersection # featureBits -countGaps hg18 simpleRepeat # 54964044 bases of 3096628158 (1.775%) in intersection # CREATE MICROSAT TRACK (done 2006-7-5 JK) ssh hgwdev cd /cluster/data/hg18/bed mkdir microsat cd microsat awk '($5==2 || $5==3) && $6 >= 15 && $8 == 100 && $9 == 0 {printf("%s\t%s\t%s\t%dx%s\n", $1, $2, $3, $6, $16);}' ../simpleRepeat/simpleRepeat.bed > microsat.bed /cluster/bin/i386/hgLoadBed hg18 microsat microsat.bed # PROCESS SIMPLE REPEATS INTO MASK (DONE - 2005-12-09 - Fan) # After the simpleRepeats track has been built, make a filtered version # of the trf output: keep trf's with period <= 12: ssh kkstore02 mkdir -p cd 
/cluster/data/hg18/bed/simpleRepeat cd /cluster/data/hg18/bed/simpleRepeat mkdir -p trfMask foreach f (trf/*.bed) awk '{if ($5 <= 12) print;}' $f > trfMask/$f:t end # The 4 lines below were left over from makeHg17.doc. # EXPERIMENT, at a filter of <= 12, we have coverage: # 20904399 bases of 2867328468 (0.729%) in intersection # at a filter of <= 9, we have coverage: # 19271270 bases of 2867328468 (0.672%) in intersection # Lift up filtered trf output to chrom coords as well: cd /cluster/data/hg18 mkdir bed/simpleRepeat/trfMaskChrom foreach c ( `cat chrom.lst` ) if (-e $c/lift/ordered.lst) then perl -wpe 's@(\S+)@bed/simpleRepeat/trfMask/$1.bed@' \ $c/lift/ordered.lst > $c/lift/oTrf.lst liftUp bed/simpleRepeat/trfMaskChrom/chr$c.bed \ jkStuff/liftAll.lft warn `cat $c/lift/oTrf.lst` endif if (-e $c/lift/random.lst) then perl -wpe 's@(\S+)@bed/simpleRepeat/trfMask/$1.bed@' \ $c/lift/random.lst > $c/lift/rTrf.lst liftUp bed/simpleRepeat/trfMaskChrom/chr${c}_random.bed \ jkStuff/liftAll.lft warn `cat $c/lift/rTrf.lst` endif end # MASK SEQUENCE BOTH REPEATMASKER AND SIMPLE REPEAT/TRF (DONE - 2005-12-09, Fan) # This used to be done right after RepeatMasking. Now, we mask with # TRF as well, so do this after the "PROCESS SIMPLE REPEATS" step above, # and after Repeat Masker is complete. ssh kkstore02 cd /cluster/data/hg18 # Make chr*.fa from contig .fa # chrFa.csh was already copied from hg17/jkStuff bash time ./jkStuff/chrFa.csh > scriptsOutput/chrFa.out 2>&1 & # real 2m35.734s # user 1m18.351s # sys 0m16.596s # much faster than hg17 numbers as shown below. ??? # old hg17 numbers: # real 13m18.512s # user 9m1.670s # sys 1m7.290s #- Soft-mask (lower-case) the contig and chr .fa's time ./jkStuff/makeFaMasked.csh > scriptsOutput/maFaMasked.out 2>&1 # real 8m47.289s # user 3m45.698s # sys 1m44.416s # old hg17 numbers: # real 29m31.623s # user 13m49.700s # sys 5m58.750s #- Make hard-masked .fa.masked files as well: time ./jkStuff/makeHardMasked.csh > scriptsOutput/maHardMasked.out 2>&1 # real 5m48.833s # user 1m41.926s # sys 0m52.084s #- Create the bothMasksNib/ directory time ./jkStuff/makeNib.csh > scriptsOutput/maNib.out 2>&1 # real 2m23.280s # user 1m6.462s # sys 0m19.795s # old hg17 numbers: # real 14m41.694s # user 6m28.000s # sys 1m42.500s # Make symbolic links from /gbdb/hg18/nib to the real nibs. ssh hgwdev cd /cluster/store11/gs.19/build36 mv nib nib.raw mv bothMasksNib nib rm /gbdb/hg18/nib/*.nib ln -s `pwd`/nib/* /gbdb/hg18/nib # Load /gbdb/hg18/nib paths into database and save size info. cd /cluster/data/hg18 hgNibSeq -preMadeNib hg18 /gbdb/hg18/nib */chr*.fa # 3107677273 total bases # Should be the same size as before hgsql -N -e "select chrom,size from chromInfo order by chrom" hg18 \ > chrom.sizes.masked diff chrom.sizes chrom.sizes.masked # should be no output at all, thus: rm chrom.sizes.masked # Copy the masked contig fa to /scratch and /iscratch # And everything else we will need for blastz runs, etc ... # Best to do this sequence first to /cluster/bluearc/scratch, # which is going to be the source for the /scratch copy. 
# And then from there to the /iscratch # Make sure you are on the fileserver for the original source: ssh kkstore02 mkdir -p /cluster/bluearc/scratch/hg/gs.19/build36 cd /cluster/bluearc/scratch/hg/gs.19/build36 # these copies take less than 2 minutes each mkdir bothMaskedNibs cp -p /cluster/data/hg18/nib/*.nib ./bothMaskedNibs mkdir maskedContigs foreach chrom ( `cat /cluster/data/hg18/chrom.lst` ) cp -p /cluster/data/hg18/${chrom}/N{C,G,T}_*/N{C,G,T}_??????.fa \ ./maskedContigs echo "done ${chrom}" end # make sure you have them all: ls maskedContigs | wc -l # 378 wc -l /cluster/data/hg18/contig.lst # 378 mkdir rmsk foreach chrom ( `cat /cluster/data/hg18/chrom.lst` ) cp -p /cluster/data/hg18/${chrom}/*.out ./rmsk echo "done ${chrom}" end # Now, go to the destination for /iscratch and copy from the # bluearc ssh kkr1u00 mkdir -p /iscratch/i/gs.19/build36 cd /iscratch/i/gs.19/build36 # This takes about 5 minutes rsync -arlv /cluster/bluearc/scratch/hg/gs.19/build36/ . bash time /cluster/bin/iSync # real 7m27.649s # request rsync of /cluster/bluearc/scratch to the KiloKluster /scratch # Ask sysadmin to bring up BLAT server. # update central dbDb table to add the new blat server entry echo 'INSERT INTO blatServers (db, host, port, isTrans) \ VALUES ("hg18", "blat19", "17778", "1"); \ INSERT INTO blatServers (db, host, port, isTrans) \ VALUES ("hg18", "blat19", "17779", "0");' \ | hgsql -h genome-testdb hgcentraltest # LOAD ctgPos table - Contig position track # After fixing up hgCtgPos to accept the -chromLst argument, simply: cd /cluster/data/hg18 hgCtgPos -chromLst=chrom.lst hg18 . # GOLD AND GAP TRACKS (DONE - 2005-12-10 - Fan) (RE-DONE - 2006-04-06 - Fan) ssh hgwdev cd /cluster/data/hg18 # manually edit the 4 haplotype .agp files to change the first col from # contig IDs into chrom name. hgGoldGapGl -noGl -chromLst=chrom.lst hg18 /cluster/data/hg18 . # Disappointing to see this create so many tables ... # _gap and _gold for each chrom # contig.gl ... section skipped for the time being. (Fan 2005-12-13). ############################################################################# # GC5BASE (DONE - 2005-12-13 - Fan) ssh kkstore02 mkdir -p /cluster/data/hg18/bed/gc5Base cd /cluster/data/hg18/bed/gc5Base hgGcPercent -wigOut -doGaps -file=stdout -win=5 hg18 \ /cluster/data/hg18/nib | wigEncode stdin gc5Base.wig gc5Base.wib # runs for about 17 minutes # load database ssh hgwdev cd /cluster/data/hg18/bed/gc5Base mkdir /gbdb/hg18/wib ln -s `pwd`/gc5Base.wib /gbdb/hg18/wib hgLoadWiggle -pathPrefix=/gbdb/hg18/wib hg18 gc5Base gc5Base.wig # verify index is correct: hgsql hg18 -e "show index from gc5Base;" # should see good numbers in Cardinality column ######################################################################### # GENBANK auto update (DONE 2005-12-13 Fan) # align with revised genbank process. drop xeno ESTs. 
cd ~/kent/src/hg/makeDb/genbank cvs update -d etc # edit etc/genbank.conf to add hg18 # hg18 hg18.serverGenome = /cluster/data/hg18/nib/chr*.nib hg18.clusterGenome = /scratch/hg/gs.18/build36/bothMaskedNibs/chr*.nib hg18.ooc = /scratch/hg/h/11.ooc hg18.lift = /cluster/store11/gs.19/build36/jkStuff/liftAll.lft hg18.refseq.mrna.native.pslCDnaFilter = ${finished.refseq.mrna.native.pslCDnaFilter} hg18.refseq.mrna.xeno.pslCDnaFilter = ${finished.refseq.mrna.xeno.pslCDnaFilter} hg18.genbank.mrna.native.pslCDnaFilter = ${finished.genbank.mrna.native.pslCDnaFilter} hg18.genbank.mrna.xeno.pslCDnaFilter = ${finished.genbank.mrna.xeno.pslCDnaFilter} #hg18.genbank.est.native.pslCDnaFilter = ${finished.genbank.est.native.pslCDnaFilter} #hg18.genbank.est.xeno.pslCDnaFilter = ${finished.genbank.est.xeno.pslCDnaFilter} #hg18.genbank.est.xeno.load = yes hg18.refseq.mrna.xeno.load = yes hg18.refseq.mrna.xeno.loadDesc = yes hg18.mgcTables.default = full hg18.mgcTables.mgc = all hg18.downloadDir = hg18 ### NOTE: in the future, enable orfeome tracks as part of this (markd) # update /cluster/data/genbank/ make etc-update ssh kkstore02 cd /cluster/data/genbank nice bin/gbAlignStep -initial hg18 & # load database when finished ssh hgwdev cd /cluster/data/genbank nice ./bin/gbDbLoadStep -drop -initialLoad hg18& # CPGISLANDS (DONE - 2005-12-14 - Fan) ssh hgwdev mkdir -p /cluster/data/hg18/bed/cpgIsland cd /cluster/data/hg18/bed/cpgIsland # Build software from Asif Chinwalla (achinwal at watson.wustl.edu) cvs co hg3rdParty/cpgIslands cd hg3rdParty/cpgIslands make # gcc readseq.c cpg_lh.c -o cpglh.exe mv cpglh.exe /cluster/data/hg18/bed/cpgIsland/ # cpglh.exe requires hard-masked (N) .fa's. # There may be warnings about "bad character" for IUPAC ambiguous # characters like R, S, etc. Ignore the warnings. 
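# (Optional, a sketch rather than a step from the original build: cpglh only
# warns on IUPAC ambiguity characters, so a quick way to see ahead of time
# which chromosomes will trigger the "Bad char" warnings is to count the
# non-ACGT/N characters in each hard-masked file.  Assumes tcsh, as in the
# rest of this doc.)
    ssh kkstore02
    cd /cluster/data/hg18/bed/cpgIsland
    foreach f (../../*/chr*.fa.masked)
        echo -n "$f:t "
        grep -v '^>' $f | tr -d 'ACGTNacgtn\n' | wc -c
    end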
ssh kkstore02 cd /cluster/data/hg18/bed/cpgIsland foreach f (../../*/chr*.fa.masked) set fout=$f:t:r:r.cpg echo running cpglh on $f to $fout ./cpglh.exe $f > $fout end # the warnings: # Bad char 0x52 = 'R' at line 2046, base 102229, sequence chr16_random # Bad char 0x4d = 'M' at line 1216113, base 60805573, sequence chr3 # Bad char 0x52 = 'R' at line 1216118, base 60805801, sequence chr3 # Bad char 0x52 = 'R' at line 1216118, base 60805801, sequence chr3 # Transform cpglh output to bed + cat << '_EOF_' > filter.awk /* Input columns: */ /* chrom, start, end, len, CpG: cpgNum, perGc, cpg:gpc, observed:expected */ /* chr1\t 41776\t 42129\t 259\t CpG: 34\t 65.8\t 0.92\t 0.94 */ /* Output columns: */ /* chrom, start, end, name, length, cpgNum, gcNum, perCpg, perGc, obsExp */ /* chr1\t41775\t42129\tCpG: 34\t354\t34\t233\t19.2\t65.8\to0.94 */ { $2 = $2 - 1; width = $3 - $2; printf("%s\t%d\t%s\t%s %s\t%s\t%s\t%0.0f\t%0.1f\t%s\t%s\n", $1, $2, $3, $5,$6, width, $6, width*$7*0.01, 100.0*2*$6/width, $7, $9); } '_EOF_' # << this line makes emacs coloring happy awk -f filter.awk chr*.cpg > cpgIsland.bed ssh hgwdev cd /cluster/data/hg18/bed/cpgIsland hgLoadBed hg18 cpgIslandExt -tab -noBin \ -sqlTable=$HOME/kent/src/hg/lib/cpgIslandExt.sql cpgIsland.bed # Reading cpgIsland.bed # Loaded 28226 elements of size 10 # Sorted # Saving bed.tab # Loading hg18 ######################################################################## # PRODUCING GENSCAN PREDICTIONS (DONE - 2005-12-16 - Fan) # RELOADED PEPTIDE TABLE, GENSCANPEP (DONE, 2006-07-11, hartera) ssh hgwdev mkdir /cluster/data/hg18/bed/genscan cd /cluster/data/hg18/bed/genscan cvs co hg3rdParty/genscanlinux ssh kkstore02 cd /cluster/data/hg18/bed/genscan # Make 3 subdirectories for genscan to put their output files in mkdir gtf pep subopt # Generate a list file, genome.list, of all the contigs # *that do not have pure Ns* (due to heterochromatin, unsequencable # stuff) which would cause genscan to run forever. rm -f genome.list bash for f in `cat /cluster/data/hg18/contig.lst` do egrep '[ACGT]' /cluster/data/hg18/$f.masked > /dev/null if [ $? = 0 ]; then echo /cluster/data/hg18/$f.masked >> genome.list fi done # exit your bash shell if you are [t]csh ... # This egrep matched all the contigs in hg18. I guess none of # them are complete Ns* at this point. # Log into kki (not kk !). kki is the driver node for the small # cluster (kkr2u00 -kkr8u00. Genscan has problem running on the # big cluster, due to limitation of memory and swap space on each # processing node). ssh kki cd /cluster/data/hg18/bed/genscan # Create template file, gsub, for gensub2. For example (3-line file): cat << '_EOF_' > gsub #LOOP /cluster/bin/x86_64/gsBig {check in line+ $(path1)} {check out line gtf/$(root1).gtf} -trans={check out line pep/$(root1).pep} -subopt={check out line subopt/$(root1).bed} -exe=hg3rdParty/genscanlinux/genscan -par=hg3rdParty/genscanlinux/HumanIso.smat -tmp=/tmp -window=2400000 #ENDLOOP '_EOF_' # << this line makes emacs coloring happy gensub2 genome.list single gsub jobList para create jobList para try para check para push ... etc ... 
# Completed: 377 of 378 jobs # Crashed: 1 jobs # CPU time in finished jobs: 78976s 1316.27m 21.94h 0.91d 0.003 y # IO & Wait Time: 4961s 82.68m 1.38h 0.06d 0.000 y # Average job time: 223s 3.71m 0.06h 0.00d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 3491s 58.18m 0.97h 0.04d # Submission to last job: 7541s 125.68m 2.09h 0.09d # Running the single failed job on kolossus with a smaller window: ssh kkr7u00.kilokluster.ucsc.edu /cluster/bin/x86_64/gsBig /cluster/data/hg18/5/NT_006576/NT_006576.fa.masked \ gtf/NT_006576.fa.gtf -trans=pep/NT_006576.fa.pep \ -subopt=subopt/NT_006576.fa.bed -exe=hg3rdParty/genscanlinux/genscan \ -par=hg3rdParty/genscanlinux/HumanIso.smat -tmp=/tmp -window=2000000 # If there were out-of-memory problems (run "para problems"), then # re-run those jobs by hand but change the -window arg from 2400000 # something lower. In build33, this was 22/NT_011519 # In build34 there were NO failures ! # Convert these to chromosome level files as so: ssh kkstore02 cd /cluster/data/hg18/bed/genscan $HOME/bin/i386/liftUp genscan.gtf ../../jkStuff/liftAll.lft warn gtf/N*.gtf $HOME/bin/i386/liftUp genscanSubopt.bed ../../jkStuff/liftAll.lft \ warn subopt/N*.bed cat pep/*.pep > genscan.pep # Load into the database as so: ssh hgwdev cd /cluster/data/hg18/bed/genscan ldHgGene hg18 genscan genscan.gtf # Reading genscan.gtf # Read 43122 transcripts in 329799 lines in 1 files # 43122 groups 49 seqs 1 sources 1 feature types # 43122 gene predictions hgPepPred hg18 generic genscanPep genscan.pep # Processing genscan.pep hgLoadBed hg18 genscanSubopt genscanSubopt.bed # Reading genscanSubopt.bed # Loaded 514065 elements of size 6 # Sorted # Creating table definition for # Saving bed.tab # Loading hg18 # featureBits hg18 genscan # 56039161 bases of 2881515245 (1.945%) in intersection # featureBits hg17 genscan # 55323340 bases of 2866216770 (1.930%) in intersection # featureBits hg16 genscan # 55333689 bases of 2865248791 (1.931%) in intersection # featureBits hg18 genscanSubopt # 55685959 bases of 2881515245 (1.933%) in intersection # featureBits hg17 genscanSubopt # 55986178 bases of 2866216770 (1.953%) in intersection # featureBits hg16 genscanSubopt # 56082952 bases of 2865248791 (1.957%) in intersection # Should be zero intersection with rmsk # featureBits -chrom=chr1 hg18 genscan rmsk # Reload genscanPep table - requested by a user. It has been dropped # from hgwdev. # (hartera, 2006-07-11) ssh hgwdev cd /cluster/data/hg18/bed/genscan hgPepPred hg18 generic genscanPep genscan.pep ############################################################################ # CREATE 2 BIT FILE (DONE 12/20/05, Fan) ssh kkstore02 cd /cluster/data/hg18 faToTwoBit */chr*.fa hg18.2bit # BLASTZ, CHAIN, NET, MAFNET, AXTNET AND ALIGNMENT DOWNLOADS FOR # ZEBRAFISH (danRer3) (DONE, 2005-12-23, hartera) ssh pk # Blastz uses lineage-specific repeats. There are none for mouse # and fish so use all repeats for each species as lineage-specific. 
mkdir -p /san/sanvol1/scratch/hg18/linSpecRep.notInOthers foreach f (/cluster/bluearc/hg18/linSpecRep/notInOthers/chr*.out.spec) cp -p $f /san/sanvol1/scratch/hg18/linSpecRep.notInOthers/ end # get only lineage specific repeats for chr1-25 and chrM mkdir -p /san/sanvol1/scratch/danRer3/linSpecRep.notInOthers foreach f (/cluster/data/danRer3/*/chr[0-9M]*.fa.out) cp -p $f \ /san/sanvol1/scratch/danRer3/linSpecRep.notInOthers/$f:t:r:r.out.spec end # make a nib dir without random chroms mkdir -p /san/sanvol1/scratch/hg18/chromNib cp -p /cluster/data/hg18/nib/chr*.nib \ /san/sanvol1/scratch/hg18/chromNib rm -r chr*_random.nib # make a nib dir that is also just chr1-25 and chrM mkdir -p /san/sanvol1/scratch/danRer3/chromNib cp /cluster/data/danRer3/nib/chr[0-9M]*.nib \ /san/sanvol1/scratch/danRer3/chromNib ssh kkstore02 mkdir /cluster/data/hg18/bed/blastz.danRer3.2005-12-17 cd /cluster/data/hg18/bed ln -s blastz.danRer3.2005-12-17 blastz.danRer3 # Three separate runs done to create chains. Runs 1 and 3 could be # combined into one. # RUN 1: hg18 chroms (no randoms) vs danRer3 chr1-25 and chrM using # lineage-specific repeats. ssh hgwdev cd /cluster/data/hg18/bed/blastz.danRer3 # make run dir mkdir -p /san/sanvol1/scratch/hg18/blastzDanRer3/chromsRun ln -s /san/sanvol1/scratch/hg18/blastzDanRer3/chromsRun # make out dir mkdir -p /san/sanvol1/scratch/hg18/blastzDanRer3/chromsOut ln -s /san/sanvol1/scratch/hg18/blastzDanRer3/chromsOut cd chromsRun # use parameters as for hg17 vs danRer2 - see makeHg17.doc cat << '_EOF_' > DEF # human (hg18) vs zebrafish (danRer3) export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/cluster/home/angie/schwartzbin ALIGN=blastz-run BLASTZ=blastz.v7.x86_64 # Reuse parameters from hg16-fr1, danRer-hg17 and mm5-danRer BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=6000 BLASTZ_K=2200 BLASTZ_Q=/cluster/data/blastz/HoxD55.q BLASTZ_ABRIDGE_REPEATS=1 # TARGET: Human (hg18) SEQ1_DIR=/san/sanvol1/scratch/hg18/chromNib SEQ1_RMSK= SEQ1_FLAG= SEQ1_SMSK=/san/sanvol1/scratch/hg18/linSpecRep.notInOthers SEQ1_LIMIT=30 SEQ1_IN_CONTIGS=0 SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY: Zebrafish (danRer3) # just chroms 1-25 and chrM SEQ2_DIR=/san/sanvol1/scratch/danRer3/chromNib SEQ2_RMSK= SEQ2_FLAG= SEQ2_SMSK=/san/sanvol1/scratch/danRer3/linSpecRep.notInOthers SEQ2_LIMIT=30 SEQ2_IN_CONTIGS=0 SEQ2_CHUNK=10000000 SEQ2_LAP=0 BASE=/san/sanvol1/scratch/hg18/blastzDanRer3/chromsRun DEF=$BASE/DEF RAW=$BASE/raw CDBDIR=$BASE SEQ1_LEN=$BASE/S1chroms.len SEQ2_LEN=$BASE/S2chroms.len TMPDIR=/scratch/tmp '_EOF_' # << this line keeps emacs coloring happy chmod +x DEF grep -v random /cluster/data/hg18/chrom.sizes > S1chroms.len grep -v chrUn /cluster/data/danRer3/chrom.sizes \ | grep -v chrNA > S2chroms.len # do blastz and create chains for danRer3 chr1-25 and chrM using # all repeats as lineage-specific repeats. # chickenHumanTuned.gap scoring matrix is now used by axtChain if the # linearGap parameter is set to "loose". nohup nice /cluster/bin/scripts/doBlastzChainNet.pl \ -bigClusterHub=pk \ -smallClusterHub=pk \ -workhorse=pk \ -blastzOutRoot /san/sanvol1/scratch/hg18/blastzDanRer3/chromsOut \ -chainMinScore=5000 \ -chainLinearGap loose \ -stop chainRun `pwd`/DEF >& doChains.log & # Took 2 hours 45 minutes to run. # Then run the human hg18 chroms and randoms vs danRer3 chrUn and chrNA ssh hgwdev # get file of scaffolds for hg18 randoms. 
# Use the Table Browser to
# select sequence from the whole genome for the ctgPos table of contigs,
# restricting to chrom like "%_random" in the Free-form query box of
# the filter.  Save the output as hg18RandomContigs.fa in
# /cluster/data/hg18/bed/blastz.danRer3.
    cd /cluster/data/hg18/bed/blastz.danRer3
    # get the position and contig name from the ctgPos table
    hgsql -N -e 'select chrom, chromStart, chromEnd, contig from ctgPos \
        where chrom like "%_random";' hg18 > contigPosAndNames.txt

    ssh kkstore02
    cd /cluster/data/hg18/bed/blastz.danRer3
    # change header to just the position
    perl -pi.bak -e 's/>.+range=(chr[0-9XY]+_random:[0-9]+\-[0-9]+).+/>$1/' \
        hg18RandomContigs.fa
    awk '{print "perl -pi.bak -e s/"$1":"$2+1"-"$3"/"$4"/ hg18RandomContigs.fa"}' \
        contigPosAndNames.txt > addContigNames.csh
    chmod +x addContigNames.csh
    # run script
    addContigNames.csh

    ssh hgwdev
    # make a 2 bit file of the chroms and random scaffolds
    cd /cluster/data/hg18
    set dir=/san/sanvol1/scratch/hg18
    faToTwoBit [1-9]/chr[1-9].fa [12][0-9]/chr[12][0-9].fa M/chrM.fa \
        X/chrX.fa Y/chrY.fa *hap[12]/chr*.fa \
        /cluster/data/hg18/bed/blastz.danRer3/hg18RandomContigs.fa \
        $dir/chromsAndRandoms.2bit
    twoBitInfo $dir/chromsAndRandoms.2bit $dir/chromsAndRandoms.len
    # make a 2 bit file for just the random scaffolds
    faToTwoBit /cluster/data/hg18/bed/blastz.danRer3/hg18RandomContigs.fa \
        $dir/randoms.2bit
    twoBitInfo $dir/randoms.2bit $dir/randoms.len
    # make sure all the random chrom contigs are included - should be 88.
    # make a 2 bit file for all the chroms and random chroms, make sure to
    # get the haplotype chrom sequences.
    faToTwoBit [1-9MXY]/chr*.fa [12][0-9]/chr*.fa *hap[12]/chr*.fa \
        $dir/hg18.2bit
    twoBitInfo $dir/hg18.2bit $dir/hg18Chroms.len
    twoBitInfo /san/sanvol1/scratch/danRer3/danRer3.2bit \
        /san/sanvol1/scratch/danRer3/danRer3Chroms.len
    # make file of scaffold lengths for NA and Un scaffolds
    twoBitInfo \
        /san/sanvol1/scratch/danRer3/scaffoldsNAandUn/danRer3NAandUnScaf.2bit \
        /san/sanvol1/scratch/danRer3/scaffoldsNAandUn/NAandUnScafs.len
    cd /cluster/data/hg18/bed/blastz.danRer3
    # make a lift file for the hg18 random contigs
    cat /cluster/data/hg18/*/lift/random.lft >> $dir/randomContigs.lft

# RUN 2: hg18 chroms and random chrom contigs vs danRer3 chrNA and
# chrUn scaffolds with no lineage-specific repeats, as there are too
# many scaffolds in chrNA and chrUn.  Use the dynamic masking function
# of blastz instead.
    # make run dir
    mkdir -p /san/sanvol1/scratch/hg18/blastzDanRer3/chromsAndRandomsRun
    ln -s /san/sanvol1/scratch/hg18/blastzDanRer3/chromsAndRandomsRun
    # make out dir
    mkdir -p /san/sanvol1/scratch/hg18/blastzDanRer3/chromsAndRandomsOut
    ln -s /san/sanvol1/scratch/hg18/blastzDanRer3/chromsAndRandomsOut
    cd chromsAndRandomsRun
    # use parameters similar to hg17 vs danRer2 - see makeHg17.doc
    # As lineage-specific repeats cannot be used with the chrUn and chrNA
    # scaffolds, use blastz's dynamic masking instead (M=50).
cat << '_EOF_' > DEF # human (hg18) vs zebrafish (danRer3) # human chroms and random chrom contigs vs zebrafish chrNA and chrUn scaffolds export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/cluster/home/angie/schwartzbin ALIGN=blastz-run BLASTZ=blastz.v7.x86_64 # Reuse some parameters from hg16-fr1, danRer-hg17 and mm5-danRer BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=6000 BLASTZ_K=2200 BLASTZ_M=50 BLASTZ_Q=/cluster/data/blastz/HoxD55.q BLASTZ_ABRIDGE_REPEATS=0 # TARGET: Human (hg18) SEQ1_DIR=/san/sanvol1/scratch/hg18/hg18.2bit SEQ1_CTGDIR=/san/sanvol1/scratch/hg18/chromsAndRandoms.2bit SEQ1_LIFT=/san/sanvol1/scratch/hg18/randomContigs.lft SEQ1_RMSK= SEQ1_FLAG= SEQ1_SMSK= SEQ1_LIMIT=30 SEQ1_IN_CONTIGS=0 # 500 kb target with 5 kb overlap SEQ1_CHUNK=500000 SEQ1_LAP=5000 # QUERY: Zebrafish (danRer3) # just scaffolds for chrUn and chrNA SEQ2_DIR=/san/sanvol1/scratch/danRer3/danRer3.2bit SEQ2_CTGDIR=/san/sanvol1/scratch/danRer3/scaffoldsNAandUn/danRer3NAandUnScaf.2bit SEQ2_LIFT=/san/sanvol1/scratch/danRer3/liftNAandUnScaffoldsToChrom.lft SEQ2_RMSK= SEQ2_FLAG= SEQ2_SMSK= SEQ2_IN_CONTIGS=0 SEQ2_CHUNK=1000000000 SEQ2_LAP=0 BASE=/san/sanvol1/scratch/hg18/blastzDanRer3/chromsAndRandomsRun DEF=$BASE/DEF RAW=$BASE/raw CDBDIR=$BASE SEQ1_LEN=/san/sanvol1/scratch/hg18/hg18Chroms.len SEQ1_CTGLEN=/san/sanvol1/scratch/hg18/chromsAndRandoms.len SEQ2_LEN=/san/sanvol1/scratch/danRer3/danRer3Chroms.len SEQ2_CTGLEN=/san/sanvol1/scratch/danRer3/scaffoldsNAandUn/NAandUnScafs.len TMPDIR=/scratch/tmp '_EOF_' # << this line keeps emacs coloring happy chmod +x DEF # do blastz and create chains for human chroms and random chroms in contigs # vs zebrafish danRer3 chrNA and chrUn in scaffolds without # lineage-specific repeats but using blastz's dynamic masking. # chickenHumanTuned.gap scoring matrix is now used by axtChain if the # linearGap parameter is set to "loose". nohup nice /cluster/bin/scripts/doBlastzChainNet.pl \ -bigClusterHub=pk \ -smallClusterHub=pk \ -workhorse=pk \ -blastzOutRoot /san/sanvol1/scratch/hg18/blastzDanRer3/chromsAndRandomsOut \ -chainMinScore=5000 \ -chainLinearGap loose \ -stop chainRun `pwd`/DEF >& doChains.log & # Took about 15 hours to finish. ssh hgwdev # Try running hg18 random chroms in contigs vs danRer3 chroms 1-25 and chrM # with lineage-specific repeats. # make directory of human contigs repeats to serve as lineage-specific # repeats for the random chroms contigs. mkdir -p /san/sanvol1/scratch/hg18/linSpecRepRandoms.notInOthers cd /cluster/data/hg18/bed/blastz.danRer3 awk '{print $4}' contigPosAndNames.txt > contigNames.txt foreach c (`cat contigNames.txt`) foreach f (/cluster/data/hg18/*/${c}/${c}.fa.out) cp -p $f \ /san/sanvol1/scratch/hg18/linSpecRepRandoms.notInOthers/$f:t:r:r.out.spec end end # RUN 3: hg18 random chroms contigs vs danRer3 chr1-25 and chrM using # lineage-specific repeats. 
# make run dir mkdir -p /san/sanvol1/scratch/hg18/blastzDanRer3/randomsRun ln -s /san/sanvol1/scratch/hg18/blastzDanRer3/randomsRun # make out dir mkdir -p /san/sanvol1/scratch/hg18/blastzDanRer3/randomsOut ln -s /san/sanvol1/scratch/hg18/blastzDanRer3/randomsOut set dir=/san/sanvol1/scratch cp $dir/hg18/blastzDanRer3/chromsRun/S2chroms.len \ $dir/danRer3/chr1to25andM.len # make nib dir for random contigs for hg18 mkdir -p $dir/hg18/randomContigsNib foreach c (`cat contigNames.txt`) foreach f (/cluster/data/hg18/*/${c}/${c}.fa) faToNib -softMask $f $dir/hg18/randomContigsNib/$f:t:r.nib end end cd randomsRun cat << '_EOF_' > DEF # human (hg18) vs zebrafish (danRer3) # human random chrom contigs vs zebrafish chr1-15 and chrM export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/cluster/home/angie/schwartzbin ALIGN=blastz-run BLASTZ=blastz.v7.x86_64 # Reuse parameters from hg16-fr1, danRer-hg17 and mm5-danRer BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=6000 BLASTZ_K=2200 BLASTZ_Q=/cluster/data/blastz/HoxD55.q BLASTZ_ABRIDGE_REPEATS=1 # TARGET: Human (hg18) SEQ1_DIR=/san/sanvol1/scratch/hg18/hg18.2bit SEQ1_CTGDIR=/san/sanvol1/scratch/hg18/randomContigsNib SEQ1_LIFT=/san/sanvol1/scratch/hg18/randomContigs.lft SEQ1_RMSK= SEQ1_FLAG= SEQ1_SMSK=/san/sanvol1/scratch/hg18/linSpecRepRandoms.notInOthers SEQ1_LIMIT=30 SEQ1_IN_CONTIGS=0 SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY: Zebrafish (danRer3) # just chr1-25 and chrM SEQ2_DIR=/san/sanvol1/scratch/danRer3/chromNib SEQ2_RMSK= SEQ2_FLAG= SEQ2_LIMIT=30 SEQ2_SMSK=/san/sanvol1/scratch/danRer3/linSpecRep.notInOthers SEQ2_IN_CONTIGS=0 SEQ2_CHUNK=10000000 SEQ2_LAP=0 BASE=/san/sanvol1/scratch/hg18/blastzDanRer3/randomsRun DEF=$BASE/DEF RAW=$BASE/raw CDBDIR=$BASE SEQ1_LEN=/san/sanvol1/scratch/hg18/hg18Chroms.len SEQ1_CTGLEN=/san/sanvol1/scratch/hg18/randoms.len SEQ2_LEN=/san/sanvol1/scratch/danRer3/chr1to25andM.len TMPDIR=/scratch/tmp '_EOF_' # << this line keeps emacs coloring happy chmod +x DEF # do blastz and create chains for human random chroms in contigs # vs zebrafish danRer3 chroms 1 to 25 and chrM using all repeats # as lineage-specific repeats. # chickenHumanTuned.gap scoring matrix is now used by axtChain if the # linearGap parameter is set to "loose". nohup nice /cluster/bin/scripts/doBlastzChainNet.pl \ -bigClusterHub=pk \ -smallClusterHub=pk \ -workhorse=pk \ -blastzOutRoot /san/sanvol1/scratch/hg18/blastzDanRer3/randomsOut \ -chainMinScore=5000 \ -chainLinearGap loose \ -stop chainRun `pwd`/DEF >& doChains.log & # Took 15 minutes. # chains are sorted by score so move into one directory and use # chainMergeSort ssh kolossus set blastzDir=/cluster/data/hg18/bed/blastz.danRer3 cd $blastzDir/chromsRun/axtChain mkdir -p chainsNotMerged foreach r (chromsRun chromsAndRandomsRun randomsRun) nice cp -p ${blastzDir}/${r}/axtChain/run/chain/*.chain \ ${blastzDir}/chromsRun/axtChain/chainsNotMerged/ end nice chainMergeSort ./chainsNotMerged/*.chain | nice gzip -c \ > hg18.danRer3.all.chain.gz # split into chains by chrom nice zcat hg18.danRer3.all.chain.gz | chainSplit chain stdin # check chains, there are 48 should be 49. Chains for chr11_random # are missing. These sequences have a lot of repeats in the regions that # hits danRer3 with BLAT. # carry on with doBlastzChainNet.pl starting from net step ssh hgwdev cd /cluster/data/hg18/bed/blastz.danRer3/chromsRun mv DEF DEF.chroms # edit DEF to give hg18.2bit as the SEQ1_DIR and danRer3.2bit as SEQ2_DIR # and remove lineage-specfic repeats. 
nohup nice /cluster/bin/scripts/doBlastzChainNet.pl \ -bigClusterHub=pk \ -smallClusterHub=pk \ -workhorse=pk \ -blastzOutRoot /san/sanvol1/scratch/hg18/blastzDanRer3/chromsOut \ -chainMinScore=5000 \ -chainLinearGap loose \ -continue net `pwd`/DEF >& doNetAndDownloads.log & # Took about 25 minutes. # crashed on ssh -X sanhead1 for cleanup so re-run script cleanUp.csh # copy chainDanRer3.html and netDanRer3.html to # kent/src/hg/makeDb/trackDb/human/hg18/ and edit to describe method used. # Add tracks to trackDb.ra there. Edit README.txt in the downloads # directory to describe method used for alignments. # featureBits -chrom=chr1 hg18 refGene:cds chainDanRer3Link -enrichment # refGene:cds 1.378%, chainDanRer3Link 2.601%, both 0.927%, cover 67.26%, # enrich 25.86x # featureBits -chrom=chr1 hg17 refGene:cds chainDanRer2Link -enrichment # refGene:cds 1.386%, chainDanRer3Link 2.742%, both 0.909%, cover 65.58%, # enrich 23.91x # So similar coverage and enrichment to hg17 vs danRer2 chains. ######################################################################### # BLASTZ MOUSE Mm7 second time (DONE - 2005-12-24 - 2005-12-25 Fan) # After fixing a bug in the lineage specific repeat snip business # in blastz-run-ucsc script ssh pk mkdir /cluster/data/hg18/bed/blastzMm7.2005-12-24 cd /cluster/data/hg18/bed rm blastz.mm7 ln -s blastzMm7.2005-12-24 blastz.mm7 cd blastzMm7.2005-12-24 cat << '_EOF_' > DEF # human vs mouse export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/cluster/home/angie/schwartzbin:/parasol/bin BLASTZ=blastz.v7.x86_64 BLASTZ_ABRIDGE_REPEATS=1 # TARGET: Human Hg18 SEQ1_DIR=/scratch/hg/hg18/nib SEQ1_SMSK=/scratch/hg/hg18/linSpecRep/notInMouse SEQ1_CHUNK=10000000 SEQ1_LAP=10000 SEQ1_LEN=/scratch/hg/hg18/chrom.sizes # QUERY: Mouse Mm7 - single chunk big enough to run entire genome SEQ2_DIR=/scratch/hg/mm7/nib SEQ2_SMSK=/scratch/hg/mm7/linSpecRep/notInHumanDogCow SEQ2_LEN=/cluster/bluearc/mm7/chrom.sizes SEQ2_CHUNK=3000000000 SEQ2_LAP=0 BASE=/cluster/data/hg18/bed/blastzMm7.2005-12-24 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs # establish a screen to control this job screen bash time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \ -stop=load \ `pwd`/DEF > to-load.out 2>&1 & # Started 2005-12-24 06:15 mv to-load.out to-load.out.1 time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \ -continue=chainMerge -stop=load \ `pwd`/DEF > to-load.out 2>&1 & time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \ -swap -stop=load \ `pwd`/DEF > swap-load.out 2>&1 & time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \ -continue=download \ `pwd`/DEF > download.out 2>&1 & time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \ -swap -continue=download \ `pwd`/DEF > swap-download.out 2>&1 & # PLEASE NOTE THAT SOME .OUT FILES MIGHT HAVE BEEN OVERWRITTEN # DUE TO RETRIES AND/OR NEXT STEP COMMAND NOT FULLY EDITED CORRECTLY. # Measurements: ssh hgwdev featureBits mm7 chainHg18Link # 990285408 bases of 2583394090 (38.333%) in intersection featureBits hg18 chainMm7Link # 991769039 bases of 2881515245 (34.418%) in intersection # each of above took about half hour. 
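# (Optional check, a sketch prompted by the note above that some .out log
# files may have been overwritten by the retries: look at the tail of each
# surviving doBlastzChainNet.pl log to confirm the run reached its final
# step before moving on.)
    ssh pk
    cd /cluster/data/hg18/bed/blastzMm7.2005-12-24
    foreach f (*.out)
        echo "==== $f"
        tail -2 $f
    end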
######################################################################### # BLASTZ CHICKEN GalGal2 second time (DONE - 2005-12-28 Fan) ssh pk mkdir /cluster/data/hg18/bed/blastzGalGal2.2005-12-28 cd /cluster/data/hg18/bed rm blastz.galGal2 ln -s blastzGalGal2.2005-12-28 blastz.galGal2 cd blastzGalGal2.2005-12-28 cat << '_EOF_' > DEF # human vs chicken export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/cluster/home/angie/schwartzbin:/parasol/bin BLASTZ=blastz.v7.x86_64 # Specific settings for chicken (per Webb email to Brian Raney) BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=10000 BLASTZ_K=2200 BLASTZ_Q=/cluster/data/blastz/HoxD55.q BLASTZ_ABRIDGE_REPEATS=1 # TARGET: Human Hg18 SEQ1_DIR=/scratch/hg/hg18/nib SEQ1_SMSK=/cluster/bluearc/hg18/linSpecRep/notInOthers SEQ1_LEN=/scratch/hg/hg18/chrom.sizes SEQ1_IN_CONTIGS=0 SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY: Chicken GalGal2 - single chunk big enough to run entire genome SEQ2_DIR=/scratch/hg/galGal2/nib SEQ2_LEN=/cluster/bluearc/galGal2/chrom.sizes SEQ2_SMSK=/scratch/hg/galGal2/linSpecRep SEQ2_IN_CONTIGS=0 SEQ2_CHUNK=200000000 SEQ2_LAP=0 BASE=/cluster/data/hg18/bed/blastzGalGal2.2005-12-28 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs # establish a screen to control this job screen bash time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \ -stop=load \ `pwd`/DEF > load.out 2>&1 & # Started 2005-12-28 10:35 # Two jobs stuck in the same node. Did manual para stop and para push. # Both finished within a few minutes. # Done! On Wed Dec 28 15:32:45 PST 2005. time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \ -swap -stop=load \ `pwd`/DEF > swap-load.out 2>&1 & # Had an error at the net step time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \ -swap -continue=net -stop=load \ `pwd`/DEF > swap-load.out 2>&1 & time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \ -continue=download \ `pwd`/DEF > download.out 2>&1 & # the gzip job on kolossus seems not moving at all. # killed it manually. Try again. # Seemed not moving, kill it again. Now use pk instead of kolossus. time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -workhorse=pk \ -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \ -continue=download \ `pwd`/DEF > download.out 2>&1 & time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -workhorse=pk \ -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \ -swap -continue=download \ `pwd`/DEF > swap-download.out 2>&1 & # Done! 
Wed Dec 28 20:39:44 PST 2005 # Measurements: ssh hgwdev nice featureBits galGal2 chainHg18Link # 91564024 bases of 1054197620 (8.686%) in intersection nice featureBits hg18 chainGalGal2Link # 102417858 bases of 2881515245 (3.554%) in intersection nice featureBits galGal2 chainHg17Link # 93277286 bases of 1054197620 (8.848%) in intersection nice featureBits hg17 chainGalGal2Link # 103882699 bases of 2866216770 (3.624%) in intersection ######################################################################### # BLASTZ DOG CanFam2 time (DONE - 2005-12-28 - 2005-12-29 Fan) ssh pk mkdir /cluster/data/hg18/bed/blastzCanFam2.2005-12-28 cd /cluster/data/hg18/bed rm blastz.canFam2 ln -s blastzCanFam2.2005-12-28 blastz.canFam2 cd blastzCanFam2.2005-12-28 cat << '_EOF_' > DEF # human vs dog export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/cluster/home/angie/schwartzbin:/parasol/bin BLASTZ=blastz.v7.x86_64 # Specific settings for dog (per Webb email to Brian Raney) BLASTZ_ABRIDGE_REPEATS=1 # TARGET: Human Hg18 SEQ1_DIR=/scratch/hg/hg18/nib SEQ1_SMSK=/cluster/bluearc/hg18/linSpecRep/notInOthers SEQ1_LEN=/scratch/hg/hg18/chrom.sizes SEQ1_IN_CONTIGS=0 SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY: Dog CanFam2 - single chunk big enough to run entire genome SEQ2_DIR=/scratch/hg/canFam2/nib SEQ2_LEN=/cluster/bluearc/canFam2/chrom.sizes SEQ2_SMSK=/san/sanvol1/scratch/canFam2/linSpecRep.notInHuman SEQ2_IN_CONTIGS=0 SEQ2_CHUNK=200000000 SEQ2_LAP=0 BASE=/cluster/data/hg18/bed/blastzCanFam2.2005-12-28 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs # establish a screen to control this job screen bash time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \ -stop=load \ `pwd`/DEF > load.out 2>&1 & # Started 2005-12-28 21:33 # Two jobs stuck in the same node. Did manual para stop and para push. # Both finished within a few minutes. # Done! On Thu Dec 29 05:27:31 PST 2005. # system seems hang on kolossus (3 processes of [tcsh -c nice chainMergeSort], not moving) # manually killed the jobs. # now use pk as the workhorse. time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \ -workhorse=pk \ -continue=chainMerge \ -stop=load \ `pwd`/DEF > load2.out 2>&1 & # Done! Thu Dec 29 09:10:02 PST 2005. time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \ -workhorse=pk \ -swap -stop=load \ `pwd`/DEF > swap-load.out 2>&1 & # Had an error at the load step, # mySQL error 2013: Lost connection to MySQL server during query, # probably due to sys admin working on network connections, # continue at the load step time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \ -workhorse=pk \ -swap -continue=load -stop=load \ `pwd`/DEF > swap-load2.out 2>&1 & time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \ -workhorse=pk \ -continue=download \ `pwd`/DEF > download.out 2>&1 & time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -workhorse=pk \ -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \ -swap -continue=download \ `pwd`/DEF > swap-download.out 2>&1 & # Done! 
Dec 29 13:21 # Measurements: ssh hgwdev nice featureBits canFam2 chainHg18Link # 1477551526 bases of 2384996543 (61.952%) in intersection nice featureBits hg18 chainCanFam2Link # 1524764349 bases of 2881515245 (52.915%) in intersection nice featureBits canFam2 chainHg17Link # 1487483112 bases of 2384996543 (62.368%) in intersection nice featureBits hg17 chainCanFam2Link # 1530197469 bases of 2866216770 (53.387%) in intersection # ENABLE GENBANK UPDATE (1/3/06 Fan) # add hg18 to the following two files and check them in. src/hg/makeDb/genbank/etc/align.dbs src/hg/makeDb/genbank/etc/hgwdev.dbs # then go to /cluster/data/genbank/etc and do cvs update on these two files. ######################################################################### # BLASTZ RAT Rn3 (STARTED - 2005-12-22, DONE 2006-01-05 Fan) ssh pk mkdir /cluster/data/hg18/bed/blastzRn3.2005-12-22 cd /cluster/data/hg18/bed rm blastz.rn3 ln -s blastzRn3.2005-12-22 blastz.rn3 cd blastzRn3.2005-12-22 cat << '_EOF_' > DEF # human vs rat export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/cluster/home/angie/schwartzbin:/parasol/bin BLASTZ=blastz.v7.x86_64 BLASTZ_ABRIDGE_REPEATS=1 # TARGET: Muman Hg18 SEQ1_DIR=/scratch/hg/hg18/nib SEQ1_SMSK=/scratch/hg/hg18/linSpecRep/notInRat SEQ1_LEN=/scratch/hg/hg18/chrom.sizes SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY: Rat Rn3 - chunk big enough to do all chroms in single whole pieces SEQ2_DIR=/scratch/rat/rn3/softNib SEQ2_SMSK=/cluster/bluearc/rat/rn3/linSpecRep.notInHuman SEQ2_LEN=/cluster/bluearc/rat/rn3/chrom.sizes SEQ2_CHUNK=300000000 SEQ2_LAP=0 BASE=/cluster/data/hg18/bed/blastzRn3.2005-12-22 TMPDIR=/scratch/tmp '_EOF_' # happy emacs # establish a screen to control this job screen bash time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \ -stop=load \ `pwd`/DEF > to-load.out 2>&1 & # start processing again on 12/31/05. time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \ -workhorse=pk \ -swap \ -stop=load \ `pwd`/DEF > swap.out 2>&1 & # Either UCSC RR and hgwdev systems or network went down around 11 AM 12/31/05. # After holidays, start again on 1/3/06 and again on 1/5/06. ssh pk cd /cluster/data/hg18/bed cd blastzRn3.2005-12-22 screen bash time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \ -workhorse=pk \ -swap \ -continue=net \ -stop=load \ `pwd`/DEF > swap6.out 2>&1 & # DONE! Jan 5 13:39 # Measurements: nice featureBits rn3 chainHg18Link # 962630574 bases of 2571104688 (37.440%) in intersection nice featureBits hg18 chainRn3Link # 964251210 bases of 2881515245 (33.463%) in intersection ######################################################################### # BLASTZ FUGU fr1 (STARTED - 2005-12-20, DONE 2006-01-05 Fan) ssh pk mkdir /cluster/data/hg18/bed/blastzFr1.2005-12-20 cd /cluster/data/hg18/bed ln -s blastzFr1.2005-12-20 blastz.fr1 cd blastzFr1.2005-12-20 cat << '_EOF_' > DEF # human vs. 
fugu export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/cluster/home/angie/schwartzbin:/parasol/bin BLASTZ=blastz.v7.x86_64 # Reuse parameters from human-chicken, except L=6000 (more relaxed) BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=6000 BLASTZ_K=2200 BLASTZ_Q=/cluster/data/blastz/HoxD55.q # TARGET: Human Hg18 - testing 100,000,000 sized chunk on pk kluster SEQ1_DIR=/scratch/hg/hg18/nib SEQ1_LEN=/scratch/hg/hg18/chrom.sizes SEQ1_CHUNK=100000000 SEQ1_LAP=10000 # QUERY: Fugu Fr1 - chunk big enough to run the whole chrom at once SEQ2_DIR=/san/sanvol1/scratch/fr1/nib SEQ2_LEN=/san/sanvol1/scratch/fr1/chrom.sizes SEQ2_CHUNK=400000000 SEQ2_LAP=0 BASE=/cluster/data/hg18/bed/blastzFr1.2005-12-20 '_EOF_' # << happy emacs # establish a screen to control this job ssh pk cd /cluster/data/hg18/bed/blastzFr1.2005-12-20 screen bash time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=5000 -stop=load \ `pwd`/DEF > thruLoad.out 2>&1 & ssh pk cd /cluster/data/hg18/bed/blastzFr1.2005-12-20 screen bash time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=5000 -continue=chainMerge -stop=load \ `pwd`/DEF > thruLoad.out 2>&1 & time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=5000 -continue=download \ `pwd`/DEF > download.clean.out 2>&1 & time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=5000 -swap \ `pwd`/DEF > swap.out 2>&1 & # Finish the remaining step, 1/4/05. ssh pk cd /cluster/data/hg18/bed/blastzFr1.2005-12-20 screen bash time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=5000 \ -swap -continue=download \ `pwd`/DEF > DownloadSwap.out 2>&1 & # First try found the DEF was some how altered for rn3. # Re-generated DEF and try again. time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=5000 \ -swap -continue=download \ `pwd`/DEF > DownloadSwap2.out 2>&1 & # Done. Jan 4 09:48. 
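# The rn3/fr1 DEF mix-up above suggests a quick guard before any -continue
# run: confirm the DEF still points at the intended BASE (sketch):
#   grep "^BASE=" `pwd`/DEF
#   # expect BASE=/cluster/data/hg18/bed/blastzFr1.2005-12-20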
# measurements nice featureBits hg18 chainFr1Link # 51795958 bases of 2881515245 (1.798%) in intersection nice featureBits hg17 chainFr1Link #50831650 bases of 2866216770 (1.773%) in intersection nice featureBits hg18 netFr1 # 691148929 bases of 2881515245 (23.986%) in intersection nice featureBits hg17 netFr1 # 714234935 bases of 2866216770 (24.919%) in intersection nice featureBits fr1 chainHg18Link # 43267869 bases of 315518167 (13.713%) in intersection # nice featureBits fr1 chainHg17Link 0 bases of 315518167 (0.000%) in intersection nice featureBits fr1 netHg18 # 140843080 bases of 315518167 (44.639%) in intersection nice featureBits fr1 netHg17 # 0 bases of 315518167 (0.000%) in intersection # BLASTZ TETRAODON TetNig1 second time (DONE - 2006-01-07 Fan) ssh pk mkdir /cluster/data/hg18/bed/blastzTetNig1.2006-01-07 cd /cluster/data/hg18/bed rm blastz.tetNig1 ln -s blastzTetNig1.2006-01-07 blastz.tetNig1 cd blastzTetNig1.2006-01-07 cat << '_EOF_' > DEF # human vs tetraodon export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/cluster/home/angie/schwartzbin:/parasol/bin BLASTZ=blastz.v7.x86_64 BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=6000 BLASTZ_K=2200 BLASTZ_Q=/cluster/data/blastz/HoxD55.q # TARGET: Human Hg18 SEQ1_DIR=/scratch/hg/hg18/nib SEQ1_LEN=/scratch/hg/hg18/chrom.sizes SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY: Tetraodon TetNig1 - single chunk big enough to run entire genome SEQ2_DIR=/san/sanvol1/scratch/tetNig1/tetNig1.2bit SEQ2_LEN=/san/sanvol1/scratch/tetNig1/chrom.sizes SEQ2_CHUNK=410000000 SEQ2_LAP=0 BASE=/cluster/data/hg18/bed/blastzTetNig1.2006-01-07 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs # establish a screen to control this job screen bash time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \ -stop=load \ `pwd`/DEF > load.out 2>&1 & # Started Sat Jan 7 05:40:51 PST 2006 # Encountered an error: startStep: 0, at step 5 net to stopStep 6 netChains: looks like previous stage was not successful (can't find [hg18.tetNig1.]all.chain[.gz]). # Try it with pk as the workhorse. time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \ -workhorse=pk \ -continue=net \ -stop=load \ `pwd`/DEF > load2.out 2>&1 & # Load done. Sat Jan 7 07:34:56 PST 2006 time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \ -workhorse=pk \ -swap -stop=load \ `pwd`/DEF > swap-load.out 2>&1 & time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \ -workhorse=pk \ -continue=download \ `pwd`/DEF > download.out 2>&1 & time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \ -workhorse=pk \ -swap -continue=download \ `pwd`/DEF > swap-download.out 2>&1 & # Done! Sat Jan 7 08:02:14 PST 2006 # The download and swap-download took less than 10 seconds each. ??? 
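# To settle the "???" above, one could simply list the download area and make
# sure files actually landed (sketch; assumes the usual goldenPath layout on
# hgwdev, not verified here):
#   ls -l /usr/local/apache/htdocs/goldenPath/hg18/vsTetNig1/
#   ls -l /usr/local/apache/htdocs/goldenPath/tetNig1/vsHg18/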
# Measurements: ssh hgwdev nice featureBits tetNig1 chainHg18Link # 50026847 bases of 342403326 (14.611%) in intersection nice featureBits hg18 chainTetNig1Link # 57654754 bases of 2881515245 (2.001%) in intersection nice featureBits tetNig1 chainHg17Link # 34379509 bases of 342403326 (10.041%) in intersection nice featureBits hg17 chainTetNig1Link # 35910128 bases of 2866216770 (1.253%) in intersection # BLASTZ FROG XenTro1 second time (STARTED - 2006-01-06 Fan) ssh pk mkdir /cluster/data/hg18/bed/blastzXenTro1.2006-01-06 cd /cluster/data/hg18/bed rm blastz.xenTro1 ln -s blastzXenTro1.2006-01-06 blastz.xenTro1 cd blastzXenTro1.2006-01-06 cat << '_EOF_' > DEF # human vs frog export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/cluster/home/angie/schwartzbin:/parasol/bin BLASTZ=blastz.v7.x86_64 BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=8000 BLASTZ_K=2200 BLASTZ_Q=/cluster/data/blastz/HoxD55.q # TARGET: Human Hg18 SEQ1_DIR=/scratch/hg/hg18/nib SEQ1_LEN=/scratch/hg/hg18/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: Frog XenTro1 - single chunk big enough to run entire genome SEQ2_DIR=/scratch/hg/xenTro1/xenTro1.2bit SEQ2_LEN=/scratch/hg/xenTro1/chrom.sizes SEQ2_LIMIT=400 SEQ2_CHUNK=30000000 SEQ2_LAP=0 BASE=/cluster/data/hg18/bed/blastzXenTro1.2006-01-06 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs # establish a screen to control this job screen bash time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \ -stop=load \ `pwd`/DEF > load.out 2>&1 & # Started Fri Jan 6 20:19:30 PST 2006 # Blastz run done. Jan 7 02:07 load.out time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \ -swap -stop=load \ `pwd`/DEF > swap-load.out 2>&1 & # got the following error: startStep: 4, at step 5 net to stopStep 6 netChains: looks like previous stage was not successful (can't find [xenTro1.hg18.]all.chain[.gz]). # Try it with pk instead of kolossus: time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \ -workhorse=pk \ -swap -stop=load \ `pwd`/DEF > swap-load2.out 2>&1 & # It worked, swap-load done. Jan 7 06:05 time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \ -workhorse=pk \ -continue=download \ `pwd`/DEF > download.out 2>&1 & time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -workhorse=pk \ -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \ -swap -continue=download \ `pwd`/DEF > swap-download.out 2>&1 & # Done! 
Jan 7 06:18 # Measurements: ssh hgwdev nice featureBits xenTro1 chainHg18Link # 61197900 bases of 1381238994 (4.431%) in intersection nice featureBits hg18 chainXenTro1Link # 67810866 bases of 2881515245 (2.353%) in intersection nice featureBits xenTro1 chainHg17Link # 81777842 bases of 1381238994 (5.921%) in intersection nice featureBits hg17 chainXenTro1Link # 85701475 bases of 2866216770 (2.990%) in intersection ############################################################################ # BLASTZ COW BosTau2 second time (STARTED - 2006-01-07, DONE 2006-01-08 Fan) ssh pk mkdir /cluster/data/hg18/bed/blastzBosTau2.2006-01-07 cd /cluster/data/hg18/bed rm blastz.bosTau2 ln -s blastzBosTau2.2006-01-07 blastz.bosTau2 cd blastzBosTau2.2006-01-07 cat << '_EOF_' > DEF # human vs cow export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/cluster/home/angie/schwartzbin:/parasol/bin BLASTZ=blastz.v7.x86_64 BLASTZ_M=50 # TARGET: Human Hg18 SEQ1_DIR=/scratch/hg/hg18/nib SEQ1_LEN=/scratch/hg/hg18/chrom.sizes SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY: Cow BosTau2 - single chunk big enough to run entire genome SEQ2_DIR=/san/sanvol1/scratch/bosTau2/bosTau2.2bit SEQ2_LEN=/san/sanvol1/scratch/bosTau2/chrom.sizes SEQ2_CHUNK=3200000000 SEQ2_LAP=0 BASE=/cluster/data/hg18/bed/blastzBosTau2.2006-01-07 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs # establish a screen to control this job screen bash time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \ -stop=load \ -workhorse=pk \ `pwd`/DEF > load.out 2>&1 & # Started Sat Jan 7 07:57:22 PST 2006 # blastz run (and load) done Jan 8 00:13 time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \ -swap -stop=load \ `pwd`/DEF > swap-load.out 2>&1 & # took a long time to finish. time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \ -continue=download \ `pwd`/DEF > download.out 2>&1 & time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \ -swap -continue=download \ `pwd`/DEF > swap-download.out 2>&1 & # Done! 
Jan 8 21:10
# Measurements:
ssh hgwdev
nice featureBits bosTau2 chainHg18Link
# 1357027317 bases of 2812203870 (48.255%) in intersection
nice featureBits hg18 chainBosTau2Link
# 1357291762 bases of 2881515245 (47.103%) in intersection
nice featureBits bosTau2 chainHg17Link
# 0 bases of 2812203870 (0.000%) in intersection
nice featureBits hg17 chainBosTau2Link
# 1350076765 bases of 2866216770 (47.103%) in intersection
#######################################################################
# MAKE 11.OOC FILE FOR BLAT (DONE - 2006-01-11 - Fan)
ssh kkstore02
cd /cluster/data/hg18
blat hg18.2bit \
    /dev/null /dev/null -tileSize=11 -makeOoc=11.ooc -repMatch=1024
# Wrote 30378 overused 11-mers to 11.ooc
# Copy over to the bluearc
cp -p 11.ooc /cluster/bluearc/hg18
#######################################################################
# PLACE ASSEMBLY CLONES ON CONTIGS AND SEQUENCE
# (DONE - 2006-01-12 - 2006-04-04 - Hiram)
# (RE-DONE 2006-10-31 - Hiram - see section:)
# REWORK PLACE ASSEMBLY CLONES ON CONTIGS AND SEQUENCE
ssh kkstore02
mkdir /cluster/data/hg18/bed/coverage
cd /cluster/data/hg18/bed/coverage
# find all the clones that were used in the assembly
sed -e "/^#.*/d" ../../ncbi_build36.agp | \
    awk '{if (!match($5,"N")) {print $6}}' | \
    sort -u > placed_in_assembly.list
wc -l placed_in_assembly.list
# 27093 placed_in_assembly.list
# And all possible clones considered for assembly.
# The AADB clones are the Celera assembly, don't want them.
sed -e "/^#.*/d" /cluster/store11/gs.19/ncbi/sequence.inf | \
    grep for_assembly | grep -v AADB | awk '{print $1}' | sort -u \
    > allButOneClonesConsidered.list
(grep AADB01066164.1 \
    /cluster/store11/gs.19/ncbi/sequence.inf | awk '{print $1}'; \
    cat allButOneClonesConsidered.list) | sort -u \
    > allClonesConsidered.list
# The grep for AADB eliminates a single clone: AADB01066164.1
# Which actually should be in the list since it is in the
# ncbi_build36.agp file.  Back in Hg17, this was the only AADB
# clone in the sequence.inf file, now there are 400,673 of them in
# this Hg18 sequence.inf file marked "for_assembly"
# Later after a lot of this was done, it was discovered that some
# of the clones on this allConsidered list are actually obsolete
# and have newer versions in use.  They were identified by the
# following perl script:
cat << '_EOF_' > ckMultipleVersions.pl
#!/usr/bin/env perl
use warnings;
use strict;
sub usage() {
    print "usage: ./ckMultipleVersions.pl allClonesConsidered.list\n";
    exit 255;
}
my $argc = scalar(@ARGV);
if ($argc != 1) { usage; }
my $fileName = shift;
open (FH,"<$fileName") or die "Can not open $fileName";
my %cloneAcc;  # key is clone accession major number, value is version
while (my $clone = <FH>) {
    chomp $clone;
    my ($major, $version) = split('\.', $clone);
    if (exists($cloneAcc{$major})) {
        my $previousVersion = $cloneAcc{$major};
        if ($previousVersion >= $version) {
            printf STDERR "$major.$version - obsolete\n";
        } else {
            printf STDERR "$major.$previousVersion - obsolete\n";
            $cloneAcc{$major} = $version;
        }
    } else {
        $cloneAcc{$major} = $version;
    }
}
close (FH);
foreach my $major (sort keys %cloneAcc) {
    printf "$major.$cloneAcc{$major}\n";
}
'_EOF_'
# happy emacs
chmod +x ckMultipleVersions.pl
./ckMultipleVersions.pl allClonesConsidered.list \
    2> obsoleteClone.list > allClones.notObsolete.list
# After this obsolete list was made, those clone results were
# removed from the kluster run hierarchies of results.
# And when we finally got to loading up the coverage track
# 2006-04-04, a few additional ones had crept into the mix.
# These were added to this list at that loading time. comm -12 allClonesConsidered.list \ /cluster/data/hg17/bed/contig_overlaps/sequence.list \ > allClones.InHg17AndHg18.list comm -23 allClonesConsidered.list \ /cluster/data/hg17/bed/contig_overlaps/sequence.list \ > allClones.InHg18NotHg17.list comm -13 allClonesConsidered.list \ /cluster/data/hg17/bed/contig_overlaps/sequence.list \ > allClones.InHg17NotHg18.list # how many are the same as previous build: comm -12 /cluster/data/hg17/bed/contig_overlaps/placed_in_assembly.list \ placed_in_assembly.list > sameAsHg17.list wc sameAsHg17.list # 26775 26775 300641 sameAsHg17.list # There is one clone: AADB01066164.1 # Which is listed in allClones.InHg17NotHg18.list # But it is on the Hg18 placed_in_assembly.list # And it is on the Hg17 placed_in_assembly.list but it isn't # actually found in Hg17 ? Perhaps it didn't align good enough. comm -23 /cluster/data/hg17/bed/contig_overlaps/placed_in_assembly.list \ placed_in_assembly.list > uniqueToHg17.list wc uniqueToHg17.list # 97 97 1080 uniqueToHg17.list # and unique to hg18, not in hg17: comm -13 /cluster/data/hg17/bed/contig_overlaps/placed_in_assembly.list \ placed_in_assembly.list > newToHg18.list wc newToHg18.list # 318 318 3547 newToHg18.list # make a list of these new contigs: # using the previous perl scripts: cp -p /cluster/data/hg17/bed/contig_overlaps/*.pl . # Now, we need to distribute the clone sequence files in a # directory hierarchy by chrom name. Using the contigAcc.pl file # from the previous release: cp /cluster/data/hg17/bed/contig_overlaps/contigAcc.pl . # This newer version is generalized a bit better to take command # line arguments for the two files it is to read instead of having # them explicitly in the code, then: ./contigAcc.pl /cluster/data/hg18/ncbi_build36.agp \ /cluster/data/hg18/seq_contig.md > cloneToChrom.list 2>&1 # And now, since most of the clone sequence already exists in the # Hg17 work directory, we only need to make symlinks to the # existing ones, and move only the new ones. The following script # will find an existing copy and symlink it correctly. cat << '_EOF_' > createPlacedHierarchy.sh #!/bin/sh mkdir -p placedClones sed -e "/^#.*/d" cloneToChrom.list | while read L do CHROM=`echo "${L}" | awk '{print $1}'` CLONE=`echo "${L}" | awk '{print $2}'` if [ ! 
-d "placedClones/${CHROM}" ]; then mkdir placedClones/${CHROM} fi HG17_version="/cluster/data/hg17/bed/contig_overlaps/${CHROM}/${CLONE}" HG18_version_0="/cluster/data/hg18/bed/coverage/newToHg18/${CLONE}" HG18_version_1="/cluster/data/hg18/bed/coverage/allClones.newToHg18/${CLONE}" if [ -f "${HG17_version}" ]; then if [ -f "${HG18_version_0}" -o -f "${HG18_version_1}" ]; then echo "ERROR: Why is there both an Hg17 and Hg18 version for ${CLONE}" exit 255 fi ln -s "/cluster/data/hg17/bed/contig_overlaps/${CHROM}/${CLONE}" \ "./placedClones/${CHROM}/${CLONE}" else if [ -f "${HG18_version_0}" -a -f "${HG18_version_1}" ]; then echo "ERROR: Why are there two Hg18 copies for ${CLONE}" exit 255 fi if [ -f "${HG18_version_0}" ]; then ln -s "${HG18_version_0}" "./placedClones/${CHROM}/${CLONE}" else if [ -f "${HG18_version_1}" ]; then ln -s "${HG18_version_1}" "./placedClones/${CHROM}/${CLONE}" else # must be on a different chrom in hg17 HG17_chrom=`grep -v "^#" \ /cluster/data/hg17/bed/contig_overlaps/disburseEm.list \ | grep "^${L}$" | awk '{print $1}'` HG17_version="/cluster/data/hg17/bed/contig_overlaps/${HG17_chrom}/${CLONE}" if [ -f "${HG17_version}" ]; then echo "ERROR: Why is there no version for ${CLONE}" exit 255 fi ln -s "${HG17_version}" "./placedClones/${CHROM}/${CLONE}" fi fi fi done '_EOF_' # happy emacs chmod +x createPlacedHierarchy.sh ./createPlacedHierarchy.sh # There should be no errors # We need masked contigs for the psLayout alignments ssh hgwdev mkdir /cluster/data/hg18/bed/coverage/maskedContigs cd /cluster/data/hg18/bed/coverage/maskedContigs hgsql -N \ -e "select chrom,chromStart,chromEnd,contig,size from ctgPos;" hg18 \ > ctgPos.txt ssh kkstore02 cd /cluster/data/hg18/bed/coverage/maskedContigs # verify each contig only listed once: awk '{print $4}' ctgPos.txt | sort | uniq -c | sort -n | less # should all have a count of one # verify all chrom sizes match the contig sizes: awk '{print $3-$2}' ctgPos.txt > chrSize.list awk '{print $5}' ctgPos.txt > ctgSize.list diff ctgSize.list chrSize.list # should be no difference # OK, now fetch the contigs from the twoBit file: cat << '_EOF_' > 2bitToFa.pl #!/usr/bin/env perl use warnings; use strict; while (my $line=<>) { chomp $line; my ($chrom, $start, $end, $contig, $size) = split('\s',$line); $chrom =~ s/chr//; printf "echo -n 'working $contig ...'; mkdir -p $chrom; twoBitToFa /cluster/data/hg18/hg18.2bit:chr$chrom:$start-$end stdout | sed -e 's/^>.*/>$contig/' > $chrom/$contig.fa; gzip $chrom/$contig.fa; echo 'done'\n"; } '_EOF_' # happy emacs chmod +x 2bitToFa.pl cat ctgPos.txt | ./2bitToFa.pl > 2bitToFa.sh chmod +x 2bitToFa.sh time ./2bitToFa.sh # and create a lift file for these contigs cat << '_EOF_' > mkCtgLift.pl #!/usr/bin/env perl use warnings; use strict; while (my $line=<>) { chomp $line; my ($start, $chrCtg, $size, $chrom, $chrLen) = split('\s',$line); $chrCtg =~ s#.*/##; printf "%s\t%s\t%s\t%s\t%s\n", $start, $chrCtg, $size, $chrom, $chrLen; } '_EOF_' # happy emacs chmod +x mkCtgLift.pl cat /cluster/data/hg18/jkStuff/liftAll.lft \ | ./mkCtgLift.pl > liftContigs.lft # Create individual ooc files for each contig mkdir ooc for C in `ls */*.fa.gz | sed -e "s/.fa.gz//"` do CONTIG=`basename ${C}` CHR=`dirname ${C}` mkdir -p ooc/${CHR} zcat ${C}.fa.gz | blat -repMatch=256 \ -makeOoc=ooc/${CHR}/${CONTIG}.10.ooc -tileSize=10 \ stdin /dev/null /dev/null echo "done: ${CONTIG}" done # Copy everything to san filesystem for kluster run: ssh pk mkdir /san/sanvol1/scratch/hg18/coverage cd /san/sanvol1/scratch/hg18/coverage rsync 
-a --progress --copy-links \ /cluster/data/hg18/bed/coverage/placedClones/ ./placedClones/ rsync -a --progress --copy-links \ /cluster/data/hg18/bed/coverage/maskedContigs/ ./maskedContigs/ mkdir /san/sanvol1/scratch/hg18/coverage/runPlaced cd /san/sanvol1/scratch/hg18/coverage/runPlaced cat << '_EOF_' > runPsLayout.sh #!/bin/sh # runPsLayout.sh # where is the chrom this contig is on # is one of the .fa.gz files in # /san/sanvol1/scratch/hg18/coverage/placedClones//.fa.gz # is one of the contigs found in: # /san/sanvol1/scratch/hg18/coverage/maskedContigs//.fa.gz # HERE=`pwd` CHROM=$1 CLONE=$2 CONTIG=$3 TARGET=/san/sanvol1/scratch/hg18/coverage/maskedContigs/$CHROM/$CONTIG.fa.gz CLONESRC=/san/sanvol1/scratch/hg18/coverage/placedClones/$CHROM/$CLONE.fa.gz OOC=/san/sanvol1/scratch/hg18/coverage/maskedContigs/ooc/$CHROM/$CONTIG.10.ooc RESULT="${HERE}/psl/${CHROM}/${CONTIG}/${CLONE}.psl" mkdir -p psl/${CHROM}/${CONTIG} if [ ! -s ${CLONESRC} ]; then echo "Can not find: ${CLONESRC}" 1>/dev/stderr exit 255 fi if [ ! -s ${TARGET} ]; then echo "Can not find: ${TARGET}" 1>/dev/stderr exit 255 fi if [ ! -s ${OOC} ]; then echo "Can not find: ${OOC}" 1>/dev/stderr exit 255 fi WRKDIR="/scratch/tmp/hg18_${CHROM}/${CONTIG}/${CLONE}" mkdir -p "${WRKDIR}" cd ${WRKDIR} zcat ${CLONESRC} > ${CLONE}.fa zcat ${TARGET} > ${CONTIG}.fa cp -p ${OOC} ./10.ooc /cluster/bin/x86_64/psLayout ${CONTIG}.fa ${CLONE}.fa genomic 10.ooc ${RESULT} RET=$? cd ${HERE} rm -fr ${WRKDIR} rmdir --ignore-fail-on-non-empty "/scratch/tmp/hg18_${CHROM}/${CONTIG}" rmdir --ignore-fail-on-non-empty "/scratch/tmp/hg18_${CHROM}" exit ${RET} '_EOF_' # happy emacs chmod +x runPsLayout.sh # create jobList from cloneToChrom.list: grep -v "^#" /cluster/data/hg18/bed/coverage/cloneToChrom.list \ | sed -e "s/.fa.gz//" \ | awk '{ printf "./runPsLayout.sh %s %s %s {check out line+ psl/%s/%s/%s.psl}\n", $1, $2, $3, $1, $3, $2 }' > masterJobList # To do a quick test, run just chrM: grep " M " masterJobList > jobList s para create jobList para try ... check ... etc ... # Then, the whole run: rm -fr psl err para create masterJobList para try ... check ... push ... etc ... # running 2006-01-17 16:41 # We need the phase information from the sequence.inf file: ssh hgwdev cd /cluster/data/hg18/bed/coverage cp /cluster/data/hg17/phase.pl . # this script was fixed up for hg18 to take an argument to the # sequence.inf file: ./phase.pl /cluster/data/hg18/ncbi/sequence.inf > phase.txt # what kind of phases do we have: awk '{print $2}' phase.txt | sort | uniq -c # 1134 D # 562513 F # 17270 P # Compared to hg17 we had: awk '{print $2}' /cluster/data/hg17/phase.txt | sort | uniq -c # 1088 D # 146900 F # 17300 P # Back in the kluster runPlaced directory, we put together the # kluster run results with: ssh pk mkdir /san/sanvol1/scratch/hg18/coverage/runPlaced/filteredLifted cd /san/sanvol1/scratch/hg18/coverage/runPlaced/filteredLifted cat << '_EOF_' > filterLift.sh #!/bin/sh for C in 22 do echo -n "chr${C} working ... 
" mkdir -p ${C} OUT="${C}/filterLift.out" pslSort dirs ${C}/raw.psl tmp ../psl/${C}/N* > ${OUT} 2>&1 pslReps -singleHit -nearTop=0.001 ${C}/raw.psl ${C}/repsSingle.psl \ /dev/null >> ${OUT} 2>&1 liftUp ${C}/chr${C}.psl ../../maskedContigs/liftContigs.lft warn \ ${C}/repsSingle.psl >> ${OUT} 2>&1 clusterClone -agp -minCover=80 -maxGap=60000 ${C}/repsSingle.psl \ > ${C}/single.agp 2>> ${OUT} 2>&1 liftUp ${C}/rawLifted.psl ../../maskedContigs/liftContigs.lft warn \ ${C}/raw.psl >> ${OUT} 2>&1 clusterClone -agp -minCover=80 -maxGap=60000 ${C}/chr${C}.psl \ > ${C}/chr${C}.bed 2>> ${OUT} echo "done" done '_EOF_' # happy emacs chmod +x filterLift.sh time ./filterLift.sh cp /cluster/data/hg17/fixPhase.pl . # fixed up the script to take an argument pointing to the phase.txt file ssh kkstore02 cd /cluster/data/hg18 grep "for_assembly" ncbi/sequence.inf \ | sed -e "s/\tW\t/\t3\t/;" > sequence.inf cd /cluster/store11/gs.19/ffa ln -s ../build36/sequence.inf . ssh hgwdev cd /cluster/data/hg18 # currently working only on chr22 echo "22" > clonePos.list # need to reload gold gap *and* gl at this time. gl wasn't loaded # before this. It is required for the clonePos track. hgGoldGapGl -chrom=chr22 hg18 /cluster/store11/gs.19 build36 hgClonePos -maxErr=3 -maxWarn=2000 -chromLst=clonePos.list \ hg18 /cluster/data/hg18 ./sequence.inf /cluster/store11/gs.19 \ 2> clone.pos.errors # OK, now for the hard part. The unplaced clones. # First we will make an attempt to determine which clones they # belong to by using information from the previous build, the # sequence.inf file, the seq_contig.md file, and the # ncbi_build36.agp file. ssh kkstore02 cd /cluster/data/hg18/bed/coverage comm -13 placed_in_assembly.list allClonesConsidered.list \ > unplaced.clone.list comm -12 unplaced.clone.list allClones.InHg17AndHg18.list \ > common.to.hg17.unplaced.list comm -23 unplaced.clone.list allClones.InHg17AndHg18.list \ > unique.to.hg18.unplaced.list awk '{print $1,$6}' /cluster/data/hg17/contig_overlaps.agp \ | sed -e "s/_[0-9]*$//" | sort -u > hg17.contig.clone.list awk '{print $1,$6}' ../../sequence.inf | sed -e "s/(//; s/)//" \ > cloneToChrom.from.seq.inf.txt # using the contig to clone information from Hg17, attempt to # locate the common.to.hg17.unplaced.list in terms of chrom and # contig. Along with the ncbi_build36.agp, seq_contig.md and # cloneToChrom.from.seq.inf.txt infomation, we can attempt to # place clones that have perhaps moved, or don't have entries in # one file or another. 
The relationships obtained from the # various files: # ncbi_build36.agp - gives clone to contig name and clone to chr name # but for placed clones only, not useful here # unless they moved from hg17 (try this with the # placed list) # seq_contig.md - gives contig to chrom relationship ./chrCloneContig.pl /cluster/data/hg18/ncbi_build36.agp \ hg17.contig.clone.list /cluster/data/hg18/seq_contig.md \ common.to.hg17.unplaced.list cloneToChrom.from.seq.inf.txt \ > chrCloneContigCommonToHg17.list \ 2> common.to.hg17.unplaced.stderr # With this chrCloneContigCommonToHg17.list list in hand, can now # create a hierarchy of ./unPlacedClones/ ./createUnplacedHierarchy.sh # Then, copy them to the san for kluster run ssh pk cd /san/sanvol1/scratch/hg18/coverage rsync -a --progress --copy-links \ /cluster/data/hg18/bed/coverage/unPlacedClones/ ./unPlacedClones/ mkdir runUnPlaced cd runUnPlaced # create jobList from the chrCloneContigCommonToHg17.list egrep -v "^#|XX_000" \ /cluster/data/hg18/bed/coverage/chrCloneContigCommonToHg17.list \ | sed -e "s/.fa.gz//" \ | awk '{ printf "./runPsLayout.sh %s %s %s {check out line+ psl/%s/%s/%s.psl}\n", $1, $2, $3, $1, $3, $2 }' > masterJobList # Test a subset: grep " Y " masterJobList > jobListY para create jobListY para try ... check ... etc ... # ... some time later ... 2006-04-04 # All the clones were eventually run through the placement kluster # runs. Ending up with five different directory results: [hiram@hgwdev64 /san/sanvol1/scratch/hg18/coverage] # -rw-rw-r-- 1 3144245541 Mar 15 09:24 runFishClones/raw.psl # -rw-rw-r-- 1 91182723 Mar 15 10:44 runUnPlaced/raw.psl # -rw-rw-r-- 1 102642706 Mar 15 10:49 runPlaced/raw.psl # -rw-rw-r-- 1 15839733941 Mar 15 14:56 runLastRecover/raw.psl # -rw-rw-r-- 1 14338192704 Mar 15 18:25 runLastOnes/raw.psl # Combining those results together required a large memory # machine and a couple of days processing time: ssh hgwdev64 cd /san/sanvol1/scratch/hg18/coverage pslSort dirs raw.psl tmp runPlaced runUnPlaced runFishClones \ runLastRecover runLastOnes > raw.psl.out 2>&1 # resulting in a 33 Gb result file: -rw-rw-r-- 1 33515995907 Apr 2 10:54 raw.psl # trimming that down with pslReps: time pslReps -nohead -nearTop=0.001 -singleHit \ raw.psl repsSingle.psl /dev/null # real 14m58.371s # -rw-rw-r-- 1 42333543 Apr 4 10:22 repsSingle.psl # wc -l repsSingle.psl # 48005 repsSingle.psl # Now, clustering those alignments together: clusterClone -allowDuplicates -agp -minCover=80 -maxGap=60000 \ repsSingle.psl > single.agp 2> single.out wc -l single.agp # 45714 single.agp # Sort them, and set their phase correctly: sort -k1,1 -k2,2n single.agp \ | ./fixPhase.pl /cluster/data/hg18/bed/coverage/phase.txt \ > contig_overlaps.agp # some of them are not in the phase.txt file, these are # set to draft status: # WARN: can not find contig AC024654.2 in phase.txt # WARN: can not find contig AL133291.12 in phase.txt # WARN: can not find contig AC055712.12 in phase.txt # WARN: can not find contig AC024480.2 in phase.txt # WARN: can not find contig AC068738.2 in phase.txt # WARN: can not find contig AL354703.14 in phase.txt # WARN: can not find contig AL354756.17 in phase.txt # WARN: can not find contig AL157825.11 in phase.txt # WARN: can not find contig AC073306.1 in phase.txt # WARN: can not find contig AL138892.13 in phase.txt # WARN: can not find contig AL590104.7 in phase.txt # WARN: can not find contig AC079146.4 in phase.txt # WARN: can not find contig AC024497.3 in phase.txt # WARN: can not find contig AC021295.3 in phase.txt # WARN: can 
not find contig AC040906.3 in phase.txt # WARN: can not find contig AC008372.5 in phase.txt # WARN: can not find contig AC026054.3 in phase.txt # WARN: can not find contig AC053504.4 in phase.txt # create the gl files from that overlaps.agp file: ssh hgwdev cd /cluster/data/hg18 cp -p /san/sanvol1/scratch/hg18/coverage/contig_overlaps.agp . # after going through this sequence and loading everything, # a few clones were discovered to have crept into the list that # were obsolete. So, add them to the list used by the # removeObsoleteClones.sh script: awk '{print $6}' contig_overlaps.agp > clone.coverage.list bed/coverage/ckMultipleVersions.pl clone.coverage.list \ > /dev/null 2> /tmp/clone.transitions awk '{if (! match($1,$3)){ print }}' /tmp/clone.transitions \ >> bed/coverage/obsoleteClone.list time ./removeObsoleteClones.sh wc -l /san/sanvol1/scratch/hg18/coverage/contig_overlaps.agp \ ./contig_overlaps.agp # 45714 /san/sanvol1/scratch/hg18/coverage/contig_overlaps.agp # 45597 ./contig_overlaps.agp # after adding ten new ones the second time around: # 45587 ./contig_overlaps.agp time agpToGl contig_overlaps.agp . -md=seq_contig.md # this liftGl.csh finds all the contig.gl files under each # contig directory and creates chromsome coordinate chr*.gl # files in each chrom directory jkStuff/liftGl.csh contig.gl # Then hgGoldGapGl uses those chrom level chr*.gl files to add # the gl tables (as well as gold and gap hgGoldGapGl -chromLst=chrom.lst hg18 /cluster/store11/gs.19 build36 # strip some business from the sequence.inf file that is not needed # The sed here has to be done in a shell script, those tabs are # actual tabs and not the explicit ^I mkdir -p /scratch/tmp grep -v AADB /cluster/store11/gs.19/ncbi/sequence.inf \ > /scratch/tmp/seq0.inf (cat /scratch/tmp/seq0.inf; \ grep AADB01066164.1 /cluster/store11/gs.19/ncbi/sequence.inf) \ | grep "for_assembly" \ | sed -e "s/^IW^I/^I3^I/" > cleanedSequence.inf # Then hgClonePos uses those tables to create the Coverage track hgClonePos -maxErr=600 -maxWarn=50000 -chromLst=clonePos.list \ hg18 /cluster/data/hg18 ./cleanedSequence.inf /cluster/store11/gs.19 \ > clone.pos.errors 2>&1 ########################################################################### # RECOMBINATION RATES (DONE 2006-02-15 Fan) # The STS MArkers track must be completed prior to creating this track ssh kkstore02 cd /cluster/data/hg18/bed mkdir -p recombRate cd recombRate # Copy other necessary files here (in future, can take from previous version) # NOTE: these are stable, and could be saved in a permanent spot cp -p /projects/hg2/booch/psl/info/decode_all . cp -p /projects/hg2/booch/psl/info/marshfield_all . cp -p /projects/hg2/booch/psl/info/genethon_all . # Compared these 3 files with the 3 files of hg17, they are identical. 
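# The comparison can be done with cmp against the hg17 copies, assuming they
# were kept under /cluster/data/hg17/bed/recombRate (sketch):
#   for F in decode_all marshfield_all genethon_all
#   do
#       cmp $F /cluster/data/hg17/bed/recombRate/$F && echo "$F identical"
#   done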
# Determine maximum concordant set of markers for each of the maps
/cluster/bin/scripts/assignGPsts -full -maxcon \
    /cluster/data/ncbi/sts.10/stsAlias.bed \
    /cluster/data/hg18/bed/sts/stsMarkers_pos.rdb \
    decode_all > decode.marker.rdb
/cluster/bin/scripts/assignGPsts -full -maxcon \
    /cluster/data/ncbi/sts.10/stsAlias.bed \
    /cluster/data/hg18/bed/sts/stsMarkers_pos.rdb \
    marshfield_all > marshfield.marker.rdb
/cluster/bin/scripts/assignGPsts -full -maxcon \
    /cluster/data/ncbi/sts.10/stsAlias.bed \
    /cluster/data/hg18/bed/sts/stsMarkers_pos.rdb \
    genethon_all > genethon.marker.rdb
# Determine the rates for each of the maps
/cluster/bin/scripts/markers_to_recomb_rate.terry.pl decode.marker.rdb \
    /cluster/data/hg18/chrom.sizes 1000000 1000000 \
    > decode_1mb_slide_1mb
/cluster/bin/scripts/markers_to_recomb_rate.terry.pl genethon.marker.rdb \
    /cluster/data/hg18/chrom.sizes 1000000 1000000 \
    > genethon_1mb_slide_1mb
# Got 338 "... DISCARDING" messages.
/cluster/bin/scripts/markers_to_recomb_rate.terry.pl marshfield.marker.rdb \
    /cluster/data/hg18/chrom.sizes 1000000 1000000 \
    > marshfield_1mb_slide_1mb
# Got 424 "... DISCARDING" messages.
# Convert files to proper format
/cluster/bin/scripts/convertRecombRate decode_1mb_slide_1mb \
    /cluster/data/hg18/inserts \
    /cluster/data/hg18 1000 > decode_1mb_slide_1mb_conv
/cluster/bin/scripts/convertRecombRate marshfield_1mb_slide_1mb \
    /cluster/data/hg18/inserts \
    /cluster/data/hg18 1000 > marshfield_1mb_slide_1mb_conv
/cluster/bin/scripts/convertRecombRate genethon_1mb_slide_1mb \
    /cluster/data/hg18/inserts \
    /cluster/data/hg18 1000 > genethon_1mb_slide_1mb_conv
# Create bed file and load
/cluster/bin/scripts/createRRbed decode_1mb_slide_1mb_conv \
    marshfield_1mb_slide_1mb_conv genethon_1mb_slide_1mb_conv \
    > recombRate.bed
ssh hgwdev
cd /cluster/store11/gs.19/build36/bed/recombRate
hgLoadBed -noBin -tab \
    -sqlTable=/cluster/home/kent/src/hg/lib/recombRate.sql \
    hg18 recombRate recombRate.bed
###########################################################################
# FISH CLONES (DONE - 2006-01-13 - 2006-02-07 - Hiram)
# **** RE-LOAD fishClones after bacEnds update - see below 2007-09-04 ****
# The STS Marker, Coverage, and BAC End Pairs tracks must be completed prior to
# creating this track (and why is this ?)
ssh kkstore01
mkdir /cluster/data/ncbi/fishClones/fishClones.2006-01/
cd /cluster/data/ncbi/fishClones/fishClones.2006-01/
# Download information from NCBI
# point browser at:
# http://www.ncbi.nlm.nih.gov/genome/cyto/cytobac.cgi?CHR=all&VERBOSE=ctg
# change "Sequence tag:" to "placed on contig"
# change "Show details on sequence-tag" to "yes"
# change "Download or Display" to "Download table for UNIX"
# press Submit - save as
# /cluster/data/ncbi/fishClones/fishClones.2006-01/hbrc.txt
chmod 664 /cluster/data/ncbi/fishClones/fishClones.2006-01/hbrc.txt
# Unfortunately the format of this hbrc file has changed since
# last time.  The columns have been rearranged, and one important
# column is missing, the contig information.  So, let's see if we
# can recover the original format by putting this together with
# some other things we have here.
$HOME/kent/src/hg/fishClones/fixup.hbrc.pl hbrc.txt \
    /cluster/data/hg18/bed/fishClones/seq_clone.pmd > fixed.hbrc.txt \
    2> dbg
# the seq_clone.pmd file was obtained via email from Wonhee Jang
# jang at ncbi.nlm.nih.gov - I have asked for clarification where
# such a file can be fetched without resorting to email.
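# A quick sanity check on the repaired file is to confirm every row has the
# same number of tab-separated columns before feeding it to fishClones below
# (sketch; the expected count is whatever the old hbrc format used):
#   awk -F'\t' '{print NF}' fixed.hbrc.txt | sort | uniq -c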
# Get current clone/accession information wget --timestamping http://www.ncbi.nlm.nih.gov/genome/clone/DATA/clac.out # Create initial Fish Clones bed file ssh kkstore02 mkdir /cluster/data/hg18/bed/fishClones cd /cluster/data/hg18/bed/fishClones # Copy previous sts info from fhcrc (take from previous build in future) cp -p /cluster/data/ncbi/fishClones/fishClones.2004-07/fhcrc.sts . # This fhcrc.sts listing doesn't change. It is merely a listing # of aliases that remain in effect. # Create cl_acc_gi_len file form cloneend information: grep -v "^#" /cluster/data/hg18/bed/cloneend/all.txt \ | awk '{gsub("\.[0-9]*$", "", $2); printf "%s\t%s\t%s\t%s\t%s\t%s\n", $1,$2,$3,$4,$5,$8}' > cl_acc_gi_len ssh hgwdev # have to be on hgwdev for this since it is going to read from the # database. Had to work on this program to get it past what is # evidently a bad entry in hbrc.fixed where columns of information # are missing for one clone in particular time fishClones -verbose=2 -fhcrc=fhcrc.sts -noBin hg18 \ /cluster/data/ncbi/fishClones/fishClones.2006-01/fixed.hbrc.txt \ /cluster/data/ncbi/fishClones/fishClones.2006-01/clac.out \ ./cl_acc_gi_len \ /cluster/data/hg18/bed/bacends/bacEnds.lifted.psl \ fishClones # real 2m4.708s # Reading Fish Clones file /cluster/data/ncbi/fishClones/fishClones.2006-01/hbrc.fixed # reading fishInfo file /cluster/data/ncbi/fishClones/fishClones.2006-01/fixed.hbrc.txt # Reading Clone/Acc (clac.out) file /cluster/data/ncbi/fishClones/fishClones.2006-01/clac.out # Reading BAC Ends file ./cl_acc_gi_len # Reading BAC Ends psl file /cluster/data/hg18/bed/bacends/bacEnds.lifted.psl # Reading additional STS Marker links fhcrc.sts # Determining good positions # findClonePos: determining positions of fish clones # Writing output file # ERROR: at line # 170, no cytoband info for chrX:104048913-104206974 # RP11-79L11 # ERROR: at line # 171, no cytoband info for chrX:104048913-104206974 # RP11-79L11 # Load the track ssh hgwdev cd /cluster/data/hg18/bed/fishClones hgLoadBed -notItemRgb -noBin -tab \ -sqlTable=$HOME/kent/src/hg/lib/fishClones.sql \ hg18 fishClones fishClones.bed # Loaded 9461 elements of size 16 ########################################################################### # CHROMOSOME BANDS TRACK (DONE - 2006-01-20 - 2006-02-07 - Hiram) # This must wait until the Fish Clones tracks is done # This was loaded in place of the previously loaded ideoband data # created from NCBI information, see below for "ideogram" ssh hgwdev mkdir /cluster/data/hg18/bed/cytoband cd /cluster/data/hg18/bed/cytoband # Copy in some necessary files (usually from previous version) cp -p /cluster/data/hg17/bed/cytoband/pctSetBands.txt . cp -p /cluster/data/hg17/bed/cytoband/ISCN800.txt . 
# Create some preliminary information files /cluster/bin/scripts/createSetBands pctSetBands.txt \ /cluster/data/hg18/inserts /cluster/data/hg18 100 > setBands.txt /cluster/bin/scripts/makeBands ISCN800.txt \ /cluster/data/hg18 > cytobands.pct.bed /cluster/bin/scripts/makeBandRanges cytobands.pct.bed \ > cytobands.pct.ranges # Reformat fishClones file /cluster/bin/scripts/createBanderMarkers \ /cluster/data/hg18/bed/fishClones/fishClones.bed > fishClones.txt /cluster/bin/scripts/runBander fishClones.txt \ ISCN800.txt setBands.txt /cluster/data/hg18 # Should be 862 bands wc -l cytobands.bed # 862 cytobands.bed hgLoadBed -noBin -tab -sqlTable=${HOME}/src/hg/lib/cytoBand.sql \ hg18 cytoBand cytobands.bed hgLoadBed -noBin -tab -sqlTable=${HOME}/src/hg/lib/cytoBandIdeo.sql \ hg18 cytoBandIdeo cytobands.bed ########################################################################### # BLASTZ SELF (DONE - 2006-01-17 - 2006-01-20 - Hiram) ssh pk mkdir /cluster/data/hg18/bed/blastzSelf.2006-01-17 cd /cluster/data/hg18/bed/blastzSelf.2006-01-17 cat << '_EOF_' > DEF # human vs human export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/cluster/home/angie/schwartzbin:/parasol/bin BLASTZ=blastz.v7.x86_64 BLASTZ_M=400 # TARGET: Human Hg18 SEQ1_DIR=/san/sanvol1/scratch/hg18/selfNib SEQ1_LEN=/san/sanvol1/scratch/hg18/self.sizes SEQ1_CHUNK=10000000 SEQ1_LAP=10000 SEQ1_IN_CONTIGS=0 # QUERY: Human Hg18 SEQ2_DIR=/san/sanvol1/scratch/hg18/selfNib SEQ2_LEN=/san/sanvol1/scratch/hg18/self.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 SEQ2_IN_CONTIGS=0 BASE=/cluster/data/hg18/bed/blastzSelf.2006-01-17 TMPDIR=/scratch/tmp '_EOF_' # happy emacs cd /cluster/data/hg18/bed/blastzSelf.2006-01-17 time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -chainMinScore=10000 -chainLinearGap=medium -bigClusterHub=pk \ `pwd`/DEF > blastz.out 2>&1 & # real 640m37.637s ssh kolossus cd /cluster/data/hg18/bed/blastzSelf.2006-01-17 time HGDB_CONF=~/.hg.conf.read-only featureBits \ -noRandom -noHap hg18 chainSelfLink > fb.chainSelfLink 2>&1 & # real 21m52.697s # 324067552 bases of 2858034764 (11.339%) in intersection # compared to Hg17: cd /cluster/data/hg17/bed/blastzSelf.2004-07-01 time HGDB_CONF=~/.hg.conf.read-only featureBits \ -noRandom -noHap hg17 chainSelfLink > fb.chainSelfLink 2>&1 & # real 56m34.802s # 240976607 bases of 2851352871 (8.451%) in intersection # reloaded these chains to add normalized score column ssh hgwdev cd /cluster/data/hg18/bed/blastzSelf.2006-01-17/axtChain chainSplit chain hg18.hg18.all.chain.gz cd /cluster/data/hg18/bed/blastzSelf.2006-01-17/axtChain/chain foreach f (*.chain) set c = $f:r hgLoadChain -normScore hg18 ${c}_chainSelf $f end cd .. rm -fr chain ############################################################################## # CLONE ENDS - BACEND TRACK (DONE - 2006-01-11 - Fan) ssh kkstore02 cd /cluster/data/hg18 # check disk space: 73Gb free df -h . # Filesystem Size Used Avail Use% Mounted on # /export/cluster/store11 1.8T 1.4T 323G 82% /cluster/store11 mkdir -p bed/cloneend/ncbi cd bed/cloneend/ncbi wget --timestamping ftp://ftp.ncbi.nih.gov/genomes/CLONEEND/homo_sapiens/* # Somehow the wget did not work. Did it by hand. 
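# Since the fetch was done by hand, verify each archive is intact before
# concatenating below (sketch):
#   for F in /cluster/data/hg18/bed/cloneend/ncbi/*.gz
#   do
#       gzip -t $F || echo "BAD: $F"
#   done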
cd /cluster/data/hg18/bed/cloneend # seems like the *.mfa files were split just for convenience # concatenate bash for F in ncbi/*.mfa.gz do zcat ${F} done | gzip > all.mfa.gz exit # Convert the title line of the all.mfa file cat << '_EOF_' > convert.pl #!/usr/bin/env perl use strict; use warnings; while (my $line = <>) { if ($line !~ m/^>/) { print $line } else { my @fields = split('\|', $line); my $fieldCount = scalar(@fields); my $printed = 0; for (my $i = 0; $i < $fieldCount; $i++) { if ($fields[$i] eq "gb" || $fields[$i] eq "dbj" || $fields[$i] eq "emb") { (my $name, my $vers) = split(/\./,$fields[$i+1]); print ">$name\n"; $i= $fieldCount; $printed = 1; } } if (!$printed) { die("Failed for $line\n"); } } } '_EOF_' # < happy emacs chmod +x convert.pl zcat all.mfa | ./convert.pl | gzip > cloneEnds.fa.gz # make sure nothing got broken: faSize all.mfa.gz # 400704107 bases (5941742 N's 394762365 real 255711893 upper 139050472 lower) in 832860 sequences in 1 files faSize cloneEnds.fa.gz # 400704107 bases (5941742 N's 394762365 real 255711893 upper 139050472 lower) in 832860 sequences in 1 files # identical numbers # concatenate the text files, too bash for F in ncbi/*.txt.gz do zcat ${F} done | gzip > all.txt.gz # generate cloneEndPairs.txt and cloneEndSingles.txt cp -p /cluster/data/mm6/bed/cloneend/ncbi/convertTxt.pl . zcat all.txt.gz >all.txt ./convertTxt.pl all.txt # Reading in end info # Writing out pair info # Writing out singleton info # 249619 pairs and 318500 singles # faSplit does not function correctly if given a .gz source file # AND, we need the unzipped file for sequence loading below gunzip cloneEnds.fa.gz # split mkdir splitdir cd splitdir faSplit sequence ../cloneEnds.fa 100 cloneEnds # Check to ensure no breakage: cat *.fa | faSize stdin # 400704107 bases (5941742 N's 394762365 real 255711893 upper 139050472 lower) in 832860 sequences in 1 files # same numbers as before # Copy to san for cluster runs ssh pk cd /cluster/data/hg18/bed/cloneend/splitdir mkdir /san/sanvol1/scratch/hg18/cloneEnds cp -p *.fa /san/sanvol1/scratch/hg18/cloneEnds rm * cd .. rmdir splitdir # load sequences ssh hgwdev mkdir /gbdb/hg18/cloneend cd /gbdb/hg18/cloneend ln -s /cluster/data/hg18/bed/cloneend/cloneEnds.fa . cd /tmp hgLoadSeq hg18 /gbdb/hg18/cloneend/cloneEnds.fa # Advisory lock created # Creating .tab file # Adding /gbdb/hg18/cloneend/cloneEnds.fa # 832860 sequences # Updating seq table # Advisory lock has been released # All done ############################################################################ # BACEND SEQUENCE ALIGNMENTS (STARTED - 2006-01-11, DONE 2006-01-18 - Fan) # REDONE 2006-02-02 - Hiram ssh pk # The ooc file was created earlier into /cluster/bluearc/hg18/11.ooc cp -p /cluster/bluearc/hg18/11.ooc /san/sanvol1/scratch/hg18/11.ooc mkdir /san/sanvol1/scratch/hg18/bacends cd /san/sanvol1/scratch/hg18/bacends ls -1S /san/sanvol1/scratch/hg18/maskedContigs/*.fa > contigs.lst ls -1S /san/sanvol1/scratch/hg18/cloneEnds/cloneEnds???.fa > bacends.lst # 378 contigs vs 98 bacends files -> 37,044 jobs mkdir out cat > template << '_EOF_' #LOOP /cluster/bin/x86_64/blat $(path1) $(path2) -ooc=/san/sanvol1/scratch/hg18/11.ooc {check out line+ out/$(root2)/$(root1).$(root2).psl} #ENDLOOP '_EOF_' # << emacs gensub2 contigs.lst bacends.lst template jobList foreach f (`cat bacends.lst`) set d = $f:r:t echo $d mkdir out/$d end para create jobList # 37044 jobs in batch para try, check, push, etc ... 
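# Before the lift below, a simple completeness check is to count result files
# against the 37,044 jobs in the batch (sketch):
#   find /san/sanvol1/scratch/hg18/bacends/out -name "*.psl" | wc -l
#   # expect 37044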
# lift alignments ssh pk cd /san/sanvol1/scratch/hg18/bacends pslSort dirs raw.psl temp out/cloneEnds* # 37044 files in 98 dirs # Got 37044 files 192 files per mid file # real 32m24.804s # -rw-rw-r-- 1 6487445210 Feb 2 21:08 raw.psl time pslReps -nearTop=0.02 -minCover=0.60 -minAli=0.85 -noIntrons \ raw.psl bacEnds.psl /dev/null > pslReps.out 2>&1 & # real 6m33.218s # Processed 51898639 alignments mkdir lifted time liftUp lifted/bacEnds.lifted.psl ./liftContigs.lft warn bacEnds.psl # real 0m30.067s pslSort dirs bacEnds.sorted.psl temp lifted # cleanup rmdir temp rm -fr out /cluster/store7/kate/hg17/bacends wc -l *.sorted.psl # 2490892 bacEnds.sorted.psl time pslPairs -tInsert=10000 -minId=0.91 -noBin -min=25000 -max=350000 \ -slopval=10000 -hardMax=500000 -slop -short -long -orphan \ -mismatch -verbose bacEnds.sorted.psl \ /cluster/data/hg18/bed/cloneend/cloneEndPairs.txt \ all_bacends bacEnds # Reading pair file # Reading psl file # Creating Pairs # Writing to files # real 0m11.221s # this creates the files: # -rw-rw-r-- 1 16224182 Feb 2 21:36 bacEnds.pairs # -rw-rw-r-- 1 4655633 Feb 2 21:36 bacEnds.orphan # -rw-rw-r-- 1 399525 Feb 2 21:36 bacEnds.slop # -rw-rw-r-- 1 106252 Feb 2 21:36 bacEnds.mismatch # -rw-rw-r-- 1 634909 Feb 2 21:36 bacEnds.short # -rw-rw-r-- 1 4023 Feb 2 21:36 bacEnds.long # create header required by "rdb" tools # TODO: replace w/ awk & sort echo -e \ 'chr\tstart\tend\tclone\tscore\tstrand\tall\tfeatures\tstarts\tsizes' > header echo -e '10\t10N\t10N\t10\t10N\t10\t10\t10N\t10\t10' >> header cat header bacEnds.pairs | row score ge 300 | sorttbl chr start \ | headchg -del > bacEndPairs.bed cat header bacEnds.slop bacEnds.short bacEnds.long \ bacEnds.mismatch bacEnds.orphan \ | row score ge 300 | sorttbl chr start | headchg -del \ > bacEndPairsBad.bed extractPslLoad -noBin bacEnds.sorted.psl bacEndPairs.bed \ bacEndPairsBad.bed | \ sorttbl tname tstart | headchg -del > bacEnds.load.psl # Move the previous build out of the way and copy these # results over to the primary hg18 bed location: mv /cluster/data/hg18/bed/bacends /cluster/data/hg18/bed/bacends.2006-01-18 mkdir /cluster/data/hg18/bed/bacends cp -p bacEnd* /cluster/data/hg18/bed/bacends cp -p lifted/bacEnds.lifted.psl /cluster/data/hg18/bed/bacends # load them into the database ssh hgwdev cd /cluster/data/hg18/bed/bacends # CHECK bacEndPairs.bed ID's to make sure they have no blanks in them awk '{print $5}' bacEndPairs.bed | sort | uniq -c # result should be the scores, no extraneous strings: # 156984 1000 # 195 300 # 316 375 # 297 500 # 1476 750 # edit the file and fix it if it has a bad name. hgLoadBed -notItemRgb hg18 bacEndPairs bacEndPairs.bed \ -sqlTable=$HOME/kent/src/hg/lib/bacEndPairs.sql # Loaded 159268 # note - this track isn't pushed to RR, just used for assembly QA hgLoadBed -notItemRgb hg18 bacEndPairsBad bacEndPairsBad.bed \ -sqlTable=$HOME/kent/src/hg/lib/bacEndPairsBad.sql # Loaded 69788 #hgLoadPsl hg18 -nobin -table=all_bacends bacEnds.load.psl # NOTE: truncates file to 0 if -nobin is used # NOTE: truncates file to 0 if -nobin is used hgLoadPsl hg18 -table=all_bacends bacEnds.load.psl # no complaints ! 
Usually there are, this loaded: hgsql -N -e "select count(*) from all_bacends;" hg18 # 1249956 nice featureBits hg18 all_bacends # 191078854 bases of 2881515245 (6.631%) in intersection nice featureBits hg17 all_bacends # 225763317 bases of 2866216770 (7.877%) in intersection nice featureBits hg18 bacEndPairs # 2842800422 bases of 2881515245 (98.656%) in intersection nice featureBits hg17 bacEndPairs # 2846568377 bases of 2866216770 (99.314%) in intersection nice featureBits hg18 bacEndPairsBad # 729313572 bases of 2881515245 (25.310%) in intersection nice featureBits hg17 bacEndPairsBad # 797412909 bases of 2866216770 (27.821%) in intersection ############################################################################ # BACEND PAIRS TRACK (OBSOLETE - DONE ABOVE) (DONE - 2006-01-18 - Fan) ssh kolossus cd /cluster/data/hg18/bacends bash time /cluster/bin/x86_64/pslPairs -tInsert=10000 -minId=0.91 -noBin -min=25000 \ -max=350000 -slopval=10000 -hardMax=500000 -slop -short -long -orphan \ -mismatch -verbose bacEnds.psl \ ../bed/cloneend/cloneEndPairs.txt all_bacends bacEnds # create header required by "rdb" tools echo -e \ "chr\tstart\tend\tclone\tscore\tstrand\tall\tfeatures\tstarts\tsizes" > header echo -e "10\t10N\t10N\t10\t10N\t10\t10\t10N\t10\t10" >> header cat header bacEnds.pairs | \ /cluster/bin/scripts/row score ge 300 | \ /cluster/bin/scripts/sorttbl chr start | \ /cluster/bin/scripts/headchg -del > bacEndPairs.bed cat header bacEnds.slop bacEnds.short bacEnds.long bacEnds.mismatch \ bacEnds.orphan | /cluster/bin/scripts/row score ge 300 | \ /cluster/bin/scripts/sorttbl chr start | \ /cluster/bin/scripts/headchg -del > bacEndPairsBad.bed /cluster/bin/scripts/extractPslLoad -noBin bacEnds.psl bacEndPairs.bed \ bacEndPairsBad.bed >j1.out cat j1.out| /cluster/bin/scripts/sorttbl tname tstart >j2.out cat j2.out | /cluster/bin/scripts/headchg -del > bacEnds.load.psl rm j1.out j2.out # CHECK bacEndPairs.bed ID's to make sure they have no blanks in them awk '{print $5}' bacEndPairs.bed | sort | uniq -c # result should be the scores, no extraneous strings: # 156984 1000 # 195 300 # 316 375 # 297 500 # 1476 750 # edit the file and fix it if it has a bad name. 
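# That check can be automated; any output from this means a malformed clone
# name leaked into the score column (sketch):
#   awk '$5 !~ /^[0-9]+$/' bacEndPairs.bed | head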
# load into database ssh hgwdev cd /cluster/data/hg18/bacends hgLoadBed -strict -notItemRgb hg18 bacEndPairs bacEndPairs.bed \ -sqlTable=$HOME/kent/src/hg/lib/bacEndPairs.sql # Loaded 146284 elements of size 11 # note - this track isn't pushed to RR, just used for assembly QA hgLoadBed -strict -notItemRgb hg18 bacEndPairsBad bacEndPairsBad.bed \ -sqlTable=$HOME/kent/src/hg/lib/bacEndPairsBad.sql # Loaded 75995 elements of size 11 # NOTE: truncates file to 0 if -nobin is used hgLoadPsl hg18 -table=all_bacends bacEnds.load.psl nice featureBits hg18 all_bacends # 162081172 bases of 2881515245 (5.625%) in intersection nice featureBits hg17 all_bacends # 225763317 bases of 2866216770 (7.877%) in intersection nice featureBits hg18 bacEndPairs # 2835522069 bases of 2881515245 (98.404%) in intersection nice featureBits hg17 bacEndPairs # 2846568377 bases of 2866216770 (99.314%) in intersection nice featureBits hg18 bacEndPairsBad # 781697678 bases of 2881515245 (27.128%) in intersection nice featureBits hg17 bacEndPairsBad # 797412909 bases of 2866216770 (27.821%) in intersection ########################################################################## # BLASTZ OPOSSUM monDom2 second time (DONE - 2006-02-13 - Hiram) ssh kk mkdir /cluster/data/hg18/bed/blastzMonDom2.2006-02-13 cd /cluster/data/hg18/bed ln -s blastzMonDom2.2006-02-13 blastz.monDom4 cd blastzMonDom2.2006-02-13 cat << '_EOF_' > DEF # human vs. opossum export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/i386:/parasol/bin BLASTZ=blastz.v7 # settings for more distant organism alignments BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=10000 BLASTZ_K=2200 BLASTZ_M=50 BLASTZ_Q=/cluster/data/blastz/HoxD55.q BLASTZ_ABRIDGE_REPEATS=0 # TARGET: Human (hg18) SEQ1_DIR=/scratch/hg/hg18/nib SEQ1_LEN=/scratch/hg/hg18/chrom.sizes SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY: Opossum monDom4 SEQ2_DIR=/iscratch/i/monDom4/monDom4RMExtra.2bit SEQ2_LEN=/iscratch/i/monDom4/chrom.sizes SEQ2_CHUNK=30000000 SEQ2_LAP=0 BASE=/cluster/data/hg18/bed/blastzMonDom4.2006-02-13 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs cd /cluster/data/hg18/bed/blastzMonDom2.2006-02-13 time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=kk -chainMinScore=5000 -chainLinearGap=loose \ `pwd`/DEF > blastz.out 2>&1 & ssh kolossus cd /cluster/data/hg18/bed/blastzMonDom2.2006-02-13 time nice -n +19 featureBits hg18 chainMonDom4Link \ > fb.hg18.chainMonDom4Link 2>&1 & cat fb.hg18.chainMonDom4Link # 356865888 bases of 2881515245 (12.385%) in intersection # for the swap, see makeMonDom4.doc 2006-04-28 # Creating download directory (DONE - 2006-07-18 - Hiram) ssh hgwdev cd /cluster/data/hg18/bed/blastzMonDom2.2006-02-13 time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=kk -chainMinScore=5000 -chainLinearGap=loose \ -continue=download -stop=download `pwd`/DEF > download.out 2>&1 ########################################################################## # BLASTZ OPOSSUM monDom2 first time (EXPERIMENT - 2006-01-23 - Hiram) ssh pk mkdir /cluster/data/hg18/bed/blastzMonDom2.2006-01-23 cd /cluster/data/hg18/bed/blastzMonDom2.2006-01-23 cat << '_EOF_' > DEF # human vs. 
opossum export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/parasol/bin BLASTZ=blastz.v7.x86_64 # Specific settings for chicken (per Webb email to Brian Raney) BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=10000 BLASTZ_K=2200 BLASTZ_Q=/cluster/data/blastz/HoxD55.q BLASTZ_ABRIDGE_REPEATS=0 # TARGET: Human (hg18) SEQ1_DIR=/scratch/hg/hg18/nib SEQ1_LEN=/scratch/hg/hg18/chrom.sizes SEQ1_IN_CONTIGS=0 SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY: Opossum monDom2 SEQ2_DIR=/san/sanvol1/scratch/monDom2/monDom2.2bit SEQ2_LEN=/san/sanvol1/scratch/monDom2/chrom.sizes SEQ2_IN_CONTIGS=0 SEQ2_CHUNK=30000000 SEQ2_LIMIT=100 SEQ2_LAP=0 BASE=/cluster/data/hg18/bed/blastzMonDom2.2006-01-23 TMPDIR=/scratch/tmp '_EOF_' # happy emacs cd /cluster/data/hg18/bed/blastzMonDom2.2006-01-23 time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \ `pwd`/DEF > blastz.out 2>&1 & # real 912m22.818s # This failed during the load of the chains due to the size of # chr19.chain. So, go to kolossus: ssh kolossus # There isn't any hg18 db here yet, get it established with a # chromInfo and a 2bit sequence: hgsql -e "create database hg18;" mysql cd /cluster/data/hg18 twoBitInfo hg18.2bit stdout | awk '{printf "%s\t%s\t/gbdb/hg18/hg18.2bit\n", $1,$2}' \ > chromInfo.kolossus.tab hgsql hg18 < $HOME/kent/src/hg/lib/chromInfo.sql hgsql hg18 \ -e 'load data local infile "chromInfo.kolossus.tab" into table chromInfo;' mkdir /gbdb/hg18 ln -s /cluster/data/hg18/hg18.2bit /gbdb/hg18/hg18.2bit # now, loading only chr19: cd /cluster/data/hg18/bed/blastzMonDom2.2006-01-23/axtChain hgLoadChain hg18 chr19_chainMonDom2 chain/chr19.chain # while that is running, back on hgwdev, get the other chains loaded ssh hgwdev cd /cluster/data/hg18/bed/blastzMonDom2.2006-01-23/axtChain cp loadUp.csh loadUp.noChr19.csh # change the foreach line to eliminate the chr19.chain: diff loadUp.csh loadUp.noChr19.csh < foreach f (*.chain) --- > foreach f (`ls *.chain | grep -v chr19.chain`) # And then run that script time ./loadUp.noChr19.csh > load.noChr19.out 2>&1 # When the kolossus load finishes, email to push-request and ask # for the two tables to be pushed from kolossus to hgwdev: # chr19_chainMonDom2 # chr19_chainMonDom2Link # then, continuing: time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -continue=download -bigClusterHub=pk -chainMinScore=5000 \ -chainLinearGap=loose `pwd`/DEF > download.out 2>&1 & # real 2m42.505s ssh kolossus cd /cluster/data/hg18/bed/blastz.monDom2 time HGDB_CONF=~/.hg.conf.read-only featureBits \ hg18 chainMonDom2Link > fb.hg18.chainMonDom2Link 2>&1 # real 124m34.435s cat fb.hg18.chainMonDom2Link # 357258631 bases of 2881515245 (12.398%) in intersection # then, to swap ssh pk cd /cluster/data/hg18/bed/blastzMonDom2.2006-01-23 time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -swap -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \ `pwd`/DEF > swap.out 2>&1 & # running 2006-01-25 17:28 # real 51m27.447s # this swap failed at: # startStep: 4, at step 5 net to stopStep 9 # netChains: looks like previous stage was not successful # (can't find [monDom2.hg18.]all.chain[.gz]). # This failure does not make any sense. The end of swapChains # does an nfsNoodge on this file to verify it exists. # I don't understand why it wouldn't be in existence # as netChains starts up. 
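    # Before re-running with -continue=net it is worth checking by hand
    # whether the swapped chain file is really missing or just not yet
    # visible over NFS (illustrative check, not part of the original run;
    # the path is the one hgLoadChain uses below):
    ls -l /cluster/data/monDom2/bed/blastz.hg18.swap/axtChain/monDom2.hg18.all.chain.gz
    # present and non-empty suggests a stale NFS attribute cache, in which
    # case continuing from the net step is safe; absent or zero-length means
    # the swapChains step truly did not finish.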
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -swap -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \ -continue=net `pwd`/DEF > net-swap.out 2>&1 & # running 2006-01-26 09:28 # real 27m57.077s # This swap failed at the load chain: # startStep: 5, at step 6 load to stopStep 9 # # chmod a+x # # /cluster/data/monDom2/bed/blastz.hg18.swap/axtChain/loadUp.csh # # ssh -x hgwdev nice # # /cluster/data/monDom2/bed/blastz.hg18.swap/axtChain/loadUp.csh # cd /cluster/data/monDom2/bed/blastz.hg18.swap/axtChain # hgLoadChain -tIndex monDom2 chainHg18 monDom2.hg18.all.chain.gz # Out of memory needMem - request size 56 bytes # So, over to kolossus to give it a try: # There isn't any monDom2 db here yet, get it established with a # chromInfo and a 2bit sequence: hgsql -e "create database monDom2;" mysql cd /cluster/data/monDom2 hgsql monDom2 < $HOME/kent/src/hg/lib/chromInfo.sql hgsql monDom2 \ -e 'load data local infile "chromInfo.tab" into table chromInfo;' mkdir /gbdb/monDom2 ln -s /cluster/data/monDom2/monDom2.2bit /gbdb/monDom2/monDom2.2bit # now, loading into monDom2 cd /cluster/data/monDom2/bed/blastz.hg18.swap/axtChain time hgLoadChain -tIndex monDom2 chainHg18 monDom2.hg18.all.chain.gz \ > kolossus.load # running - 2006-01-26 ########################################################################## # test BLASTZ Opossum MonDom1 (DONE - 2006-01-30 - Hiram) # to see what happened with the blow up of data in monDom2 # ssh kk mkdir /cluster/data/hg18/bed/blastzMonDom1.2006-01-30 cd /cluster/data/hg18/bed/blastzMonDom1.2006-01-30 cat << '_EOF_' > DEF # human vs. opossum export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/i386:/parasol/bin BLASTZ=blastz.v7 # Specific settings for chicken (per Webb email to Brian Raney) BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=10000 BLASTZ_K=2200 BLASTZ_Q=/cluster/data/blastz/HoxD55.q BLASTZ_ABRIDGE_REPEATS=0 # TARGET: Human (hg18) SEQ1_DIR=/scratch/hg/hg18/nib SEQ1_LEN=/scratch/hg/hg18/chrom.sizes SEQ1_IN_CONTIGS=0 SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY: Opossum monDom1 SEQ2_DIR=/iscratch/i/monDom1/chunks SEQ2_LEN=/iscratch/i/monDom1/chrom.sizes SEQ2_IN_CONTIGS=1 SEQ2_CHUNK=10000000 SEQ2_LIMIT=100 SEQ2_LAP=0 BASE=/cluster/data/hg18/bed/blastzMonDom1.2006-01-30 TMPDIR=/scratch/tmp '_EOF_' # happy emacs time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=kk -chainMinScore=5000 -chainLinearGap=loose \ `pwd`/DEF > blastz.out 2>&1 & # started 2006-01-30 - 15:40 time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=kk -chainMinScore=5000 -chainLinearGap=loose \ -continue=cat -stop=load `pwd`/DEF > cat_load.out 2>&1 & time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \ -stop=net `pwd`/DEF > blastz.out 2>&1 & ############################################################################ ############################################################################ # STS MARKERS (STARTED 2006-01-27 Fan - DONE 2006-02-06 - Hiram) # FOR NEXT TIME - a lot of the perl scripts used in this process # need to be placed into the source tree and cleaned up to modern # perl warnings and strict standards. 
In particular, one script # was placed into the source tree this time: src/utils/findAccession.pl # update from NCBI ssh kkstore02 # use store11 for space mkdir -p /cluster/store11/sts.2006-01 ln -s /cluster/store11/sts.2006-01 /cluster/data/ncbi ln -s /cluster/data/ncbi/sts.2006-01 sts.10 cd /cluster/data/ncbi/sts.2006-01 wget --timestamping ftp://ftp.ncbi.nih.gov/repository/UniSTS/UniSTS.sts wget --timestamping ftp://ftp.ncbi.nih.gov/repository/UniSTS/UniSTS.aliases # old # wget --timestamping ftp://ftp.ncbi.nih.gov/repository/dbSTS/dbSTS.sts # wget --timestamping ftp://ftp.ncbi.nih.gov/repository/dbSTS/dbSTS.aliases wget --timestamping ftp://ftp.ncbi.nih.gov/blast/db/FASTA/sts.gz gunzip sts.gz mv sts dbSTS.fa # these items are copied in from the previous builds cp -p /cluster/data/ncbi/sts.9/all.STS.fa ./all.STS.fa.prev cp -p /cluster/data/ncbi/sts.9/stsInfo2.bed ./stsInfo2.bed.prev # Convert dbSTS.fa file to easier reading format, and get accessions /cluster/bin/scripts/convertGbFaFile dbSTS.fa > UniSTS.convert.fa grep ">" UniSTS.convert.fa | cut -f 2 -d ">" > UniSTS.acc # NOTE: updateStsInfo creates new stsInfo2.bed, all.primers, # all.STS.fa, stsAlias.bed files #### XXX - FOR NEXT TIME: need to fix something here for the #### XXX - broken symbol AFM067XA9 which has over 6,000 aliases. #### XXX - This isn't right #### hand-editted the record for AFM067XA9. KUHN/ARCHANA 10-08-2007 #### preserving the list of otherNames that showed up stsInfo2.otherNames for #### trueName=AFM067XA9 #### cp hg18.AFM067XA9.otherNames /cluster/data/hg18/bed/sts #### preserving the list of stsMarkers that showed up in stsAlias.alias #### in excess of those in the above file (10 k total) #### cp hg18.AFM067XA9.dropped.aliases /cluster/data/hg18/bed/sts updateStsInfo -verbose=1 -gb=UniSTS.acc stsInfo2.bed.prev all.STS.fa.prev \ UniSTS.sts UniSTS.aliases UniSTS.convert.fa new # 5610 MFD330 1000006 (0) not in dbSTS anymore # 5667 D3S4560 1000008 (0) not in dbSTS anymore # 5686 ATA92F01 1000007 (0) not in dbSTS anymore # 5945 MFD206 1000009 (0) not in dbSTS anymore # 6591 MFD311 1000011 (0) not in dbSTS anymore # 6841 MFD306 1000013 (0) not in dbSTS anymore # 6842 MFD310 1000012 (0) not in dbSTS anymore # 6844 MFD349 1000026 (0) not in dbSTS anymore # 7024 D12S2343 1000015 (0) not in dbSTS anymore # 7042 ATA73C05 1000014 (0) not in dbSTS anymore # 7226 MFD341 1000016 (0) not in dbSTS anymore # 7500 D17S2200 1000018 (0) not in dbSTS anymore # 7628 ATA92E03 1000020 (0) not in dbSTS anymore # 7642 GATA178F11 1000019 (0) not in dbSTS anymore # 7910 MFD338 1000022 (0) not in dbSTS anymore # 97723 GATA172D05 1000023 (0) not in dbSTS anymore # 205088 CPLA3610 1000000 (0) not in dbSTS anymore # 205089 COX_1935 1000001 (0) not in dbSTS anymore # 205090 24534CA2 1000002 (0) not in dbSTS anymore # 205091 D5S811 1000003 (0) not in dbSTS anymore # 205092 AC016604-5 1000004 (0) not in dbSTS anymore # 205093 CA-JAP-180 1000005 (0) not in dbSTS anymore # 205094 D10S1120 1000025 (0) not in dbSTS anymore # 205095 D21S2039 1000024 (0) not in dbSTS anymore # 205102 D12S1013 1000028 (0) not in dbSTS anymore mv new.info stsInfo2.bed mv new.primers all.primers mv new.alias stsAlias.bed mv new.fa all.STS.fa # get list of all STS id's in the fasta file sed -n 's/^>\([0-9][0-9]*\) .*/\1/p' all.STS.fa | sort -n > all.STS.id wc -l all.STS.id # 93698 total sequences /cluster/bin/scripts/convertPrimerToFA all.primers > all.primers.fa # Copy stsInfo2.bed and stsAlias.bed to data directory becuase # these will be loaded into the database later 
mkdir -p /cluster/data/hg18/bed/sts cp -p stsInfo2.bed /cluster/data/hg18/bed/sts/ cp -p stsAlias.bed /cluster/data/hg18/bed/sts/ # Create sts sequence alignments mkdir /san/sanvol1/scratch/hg18/sts mkdir /san/sanvol1/scratch/hg18/sts/split faSplit sequence all.STS.fa 200 /san/sanvol1/scratch/hg18/sts/split/sts cp -p all.STS.fa /san/sanvol1/scratch/hg18/sts ssh pk cd /cluster/data/hg18/bed/sts mkdir run cd run ls -1S /san/sanvol1/scratch/hg18/maskedContigs/*.fa > contigs.lst ls -1S /san/sanvol1/scratch/hg18/sts/split/sts*.fa > sts.lst mkdir /san/sanvol1/scratch/hg18/sts/out foreach f (`cat sts.lst`) set d = $f:t:r mkdir /san/sanvol1/scratch/hg18/sts/out/$d end # create alignments cat > template << '_EOF_' #LOOP /cluster/bin/x86_64/blat $(path1) $(path2) -ooc=/san/sanvol1/scratch/hg18/11.ooc -stepSize=5 {check out line+ /san/sanvol1/scratch/hg18/sts/out/$(root2)/$(root1).$(root2).psl} #ENDLOOP '_EOF_' # happy emacs gensub2 contigs.lst sts.lst template jobList para create jobList # 70686 jobs para try ... check ... push ... etc # Completed: 70686 of 70686 jobs # CPU time in finished jobs: 117490s 1958.16m 32.64h 1.36d 0.004 y # IO & Wait Time: 195274s 3254.57m 54.24h 2.26d 0.006 y # Average job time: 4s 0.07m 0.00h 0.00d # Longest finished job: 97s 1.62m 0.03h 0.00d # Submission to last job: 8085s 134.75m 2.25h 0.09d # Compile sts sequence results ssh pk cd /san/sanvol1/scratch/hg18/sts time pslSort dirs raw.psl temp out/sts* # real 8m50.714s # -rw-rw-r-- 1 810548945 Feb 3 14:19 raw.psl # 70686 files in 187 dirs # Got 70686 files 266 files per mid file rm -rf temp time pslReps -nearTop=0.0001 -minCover=0.6 -minAli=0.8 -noIntrons raw.psl \ stsMarkers.psl /dev/null # Processed 7252745 alignments # real 0m28.102s # -rw-rw-r-- 1 10981952 Feb 3 14:26 stsMarkers.psl cp -p stsMarkers.psl /cluster/data/hg18/bed/sts/run # Lift them and get them ready to combine with primer alignments liftUp -nohead stsMarkers.lifted.psl \ /cluster/data/hg18/jkStuff/liftContigs.lft \ warn stsMarkers.psl /cluster/bin/scripts/extractPslInfo stsMarkers.lifted.psl # creates stsMarkers.lifted.psl.initial wc stsMarkers.lifted.psl.initial # 93236 559416 4111801 stsMarkers.lifted.psl.initial $HOME/kent/src/utils/findAccession.pl -agp stsMarkers.lifted.psl.initial \ /cluster/data/hg18 wc stsMarkers.lifted.psl.initial.acc # 93236 652652 4947261 stsMarkers.lifted.psl.initial.acc sort -k4,4n stsMarkers.lifted.psl.initial.acc > stsMarkers.final # determine found markers (4th field in file) cut -f 4 stsMarkers.final | sort -n -u > stsMarkers.found wc -l stsMarkers.found # 90676 stsMarkers.found # out of 93698 total sequences # from wc /cluster/data/ncbi/sts.2006-01/all.STS.id) # extract sequences for markers not yet found, and # blat w/o ooc to try to place more comm -1 -3 stsMarkers.found /cluster/data/ncbi/sts.2006-01/all.STS.id \ > stsMarkers.notFound wc -l stsMarkers.notFound # 3022 stsMarkers.notFound faSomeRecords /san/sanvol1/scratch/hg18/sts/all.STS.fa stsMarkers.notFound \ notFound.STS.fa mkdir /san/sanvol1/scratch/hg18/sts/splitNotFound faSplit sequence notFound.STS.fa 20 \ /san/sanvol1/scratch/hg18/sts/splitNotFound/sts # blat with 11.ooc misses alignments, so reblat w/o the # sequences that aren't found # NOTE: filtering produces yield of only 101 markers placed (out of 3022). 
# not enough to justify this step next time ssh pk mkdir /cluster/data/hg18/bed/sts/run.noOoc cd /cluster/data/hg18/bed/sts/run.noOoc ls -1S /san/sanvol1/scratch/hg18/maskedContigs/*.fa > contigs.lst ls -1S /san/sanvol1/scratch/hg18/sts/splitNotFound/sts*.fa > sts.lst mkdir /san/sanvol1/scratch/hg18/sts/out.noOoc foreach f (`cat sts.lst`) set d = $f:t:r mkdir /san/sanvol1/scratch/hg18/sts/out.noOoc/$d end cat > template << '_EOF_' #LOOP /cluster/bin/x86_64/blat $(path1) $(path2) -stepSize=5 {check out line+ /san/sanvol1/scratch/hg18/sts/out.noOoc/$(root2)/$(root1).$(root2).psl} #ENDLOOP '_EOF_' # happy emacs gensub2 contigs.lst sts.lst template jobList para create jobList # 7182 jobs written to batch para try para check # process this set of alignments cd /san/sanvol1/scratch/hg18/sts pslSort dirs raw.noOoc.psl temp out.noOoc/* # -rw-rw-r-- 1 459858612 Feb 3 15:56 raw.noOoc.psl # Wow, that is almost half the size of the original raw.psl with # everything in it. rm -rf temp pslReps -nearTop=0.0001 -minCover=0.6 -minAli=0.8 -noIntrons \ raw.noOoc.psl stsMarkers.noOoc.psl /dev/null # Processed 4027664 alignments # Lift them and get them ready to combine with primer alignments liftUp -nohead stsMarkers.noOoc.lifted.psl \ /cluster/data/hg18/jkStuff/liftContigs.lft \ warn stsMarkers.noOoc.psl /cluster/bin/scripts/extractPslInfo stsMarkers.noOoc.lifted.psl # creates .initial $HOME/kent/src/utils/findAccession.pl -agp \ stsMarkers.noOoc.lifted.psl.initial /cluster/data/hg18 #rm stsMarkers.lifted.psl.initial mv stsMarkers.final stsMarkers.ooc.final sort -k4,4n stsMarkers.noOoc.lifted.psl.initial.acc > stsMarkers.extra sort -k4,4n stsMarkers.lifted.psl.initial.acc \ stsMarkers.noOoc.lifted.psl.initial.acc > stsMarkers.final # determine found markers (4th field in file) cut -f 4 stsMarkers.final | sort -n -u > stsMarkers.more.found wc -l stsMarkers.more.found # 90777 stsMarkers.found cut -f 4 stsMarkers.extra | sort -n -u > stsMarkers.extra.found wc -l stsMarkers.extra.found # 101 out of 3022 attempted # out of 93698 total sequences cp -p stsMarkers.final stsMarkers.lifted.psl \ stsMarkers.*lifted.psl.initial* stsMarkers.found \ /cluster/data/hg18/bed/sts # Alignments from noOoc set were not added to all_sts_seq but info for the # markers is in stsMap and stsInfo2. Some of the alignments are bad so # filter by removing all alignments from noOoc psl file where # tBaseInsert >=1000. Add the remaining alignments to the set of final # alignments for stsMarkers. The information for the removed markers # from the filtered set was also removed from stsMap and stsInfo2. ssh pk mkdir /cluster/data/hg18/bed/sts/fix cd /cluster/data/hg18/bed/sts/fix cp /san/sanvol1/scratch/hg18/sts/stsMarkers.noOoc.lifted.psl . 
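    # For reference, field 8 of a headerless PSL line is tBaseInsert, the
    # total number of target (genomic) bases inserted between aligned blocks,
    # which is what the awk filters below test as $8.  A one-pass tally of how
    # the 1000 base threshold splits the alignments (illustrative, not part of
    # the original run; the two counts should match the filt1000 and
    # filtToRemove line counts below):
    awk '{if ($8 < 1000) keep++; else drop++}
        END {printf "keep %d drop %d\n", keep+0, drop+0}' \
        stsMarkers.noOoc.lifted.psl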
    awk '{if ($8 < 1000) print}' stsMarkers.noOoc.lifted.psl \
        > stsMarkers.noOoc.lifted.filt1000.psl
    wc -l *.filt*.psl
    # 23 483 4206 stsMarkers.noOoc.lifted.filt1000.psl
    sort -k4,4n \
        /san/sanvol1/scratch/hg18/sts/stsMarkers.noOoc.lifted.psl.initial.acc \
        > stsMarkers.extra
    awk '{print $4}' stsMarkers.extra | sort -n | uniq > extra.ids
    # in psl file, the ids are the 10th field
    awk '{print $10}' stsMarkers.noOoc.lifted.psl | sort -n | uniq \
        > noOoc.ids
    diff extra.ids noOoc.ids
    # there is no difference as expected
    # get list of IDs from filtered file, filter < 1000
    awk '{print $10}' stsMarkers.noOoc.lifted.filt1000.psl \
        | sort -n | uniq > filt1000.ids
    for i in `cat filt1000.ids`
    do
        awk 'BEGIN {OFS="\t"} \
            {if ($4 == "'$i'") print $1, $2, $3, $4, $5, $6, $7}' \
            stsMarkers.extra >> stsMarkers.extra.filt1000
    done
    cp -p ../stsMarkers.final stsMarkers.final
    # need to filter stsMarkers.final not just cat this on the end
    # get list of alignments with tBaseInsert >= 1000 and remove these
    cd /cluster/data/hg18/bed/sts/fix
    awk '{if ($8 >= 1000) print;}' stsMarkers.noOoc.lifted.psl \
        > stsMarkers.noOoc.lifted.filtToRemove.psl
    wc -l *.filt*.psl
    # 23 stsMarkers.noOoc.lifted.filt1000.psl
    # 175 stsMarkers.noOoc.lifted.filtToRemove.psl
    # get list of IDs that need to be removed
    awk '{print $10;}' stsMarkers.noOoc.lifted.filtToRemove.psl | sort -n \
        | uniq > noOoc.IdsToRemove.txt
    # get chrom and co-ordinates for IDs to be removed
    awk 'BEGIN {OFS = "\t"} {print $14,$16,$17,$10}' \
        stsMarkers.noOoc.lifted.filtToRemove.psl | sort | uniq \
        > sts.noOoc.filtToRemove.coords
    # checked that the stsMarkers.final contain the noOoc alignments
    # use this perl script to remove lines with these IDs from stsMarkers.final
    cat << '_EOF_' > removeIds.pl
#!/usr/bin/env perl
use warnings;
use strict;

my $ids = $ARGV[0];
my $file = $ARGV[1];

# list of IDs with chrom and coords to remove
open(IDS, $ids) || die "Can not open $ids: $!\n";
# file for removal of IDs
open(FILE, $file) || die "Can not open $file: $!\n";
open(OUT, ">removed.txt") || die "Can not create removed.txt: $!\n";

my %idsHash;
while (<IDS>) {
    chomp;
    my @a = split(/\t/);
    my $chr = $a[0];
    my $st = $a[1];
    my $end = $a[2];
    my $id = $a[3];
    my $key = $id."_".$chr . "_" . $st . "_" . $end;
    $idsHash{$key}->{chrom} = $chr;
    $idsHash{$key}->{start} = $st;
    $idsHash{$key}->{end} = $end;
}
close IDS;

while (<FILE>) {
    chomp;
    my $l = $_;
    my $found = "FALSE";
    my @f = split(/\t/, $l);
    foreach my $k (keys(%idsHash)) {
        # if the id is contained in the key
        if ($k =~ /^$f[3]/) {
            my $c = $idsHash{$k}->{chrom};
            my $s = $idsHash{$k}->{start};
            my $e = $idsHash{$k}->{end};
            if ($f[0] eq $c && $f[1] == $s && $f[2] == $e) {
                print OUT "$c\t$s\t$e\t$f[3]\n";
                $found = "TRUE";
            }
        }
    }
    if ($found eq "FALSE") {
        print "$l\n";
    }
}
'_EOF_'
    chmod +x removeIds.pl
    ./removeIds.pl sts.noOoc.filtToRemove.coords stsMarkers.final \
        > stsMarkers.final.new
    wc -l stsMarkers.final*
    wc stsMarkers.final*
    # 93434 654038 4957784 stsMarkers.final
    # 93259 652813 4948484 stsMarkers.final.new
    # There are 175 ids and sets of co-ordinates in list of Ids to remove
    # 175 stsMarkers.noOoc.lifted.filtToRemove.psl
    # check that stsMarkers.final.new contains all the alignments that
    # are in filtered set: stsMarkers.noOoc.lifted.filt1000.psl
    awk 'BEGIN {OFS = "\t"} {print $14,$16,$17,$10}' \
        stsMarkers.noOoc.lifted.filt1000.psl | sort | uniq \
        > sts.noOoc.filt1000.coords
    awk 'BEGIN {OFS = "\t"} {print $1,$2,$3,$4}' \
        stsMarkers.final.new | sort | uniq \
        > sts.finalnew.coords
    diff sts.finalnew.coords sts.noOoc.filt1000.coords > finalnewvsfilt1000
    grep '>' finalnewvsfilt1000
    # there is nothing in sts.noOoc.filt1000.coords not found in the
    # sts.finalnew.coords file therefore this contains all the alignments
    # from the filtered noOoc file.
    cp ../primers/primers.final .
    awk '{print $4}' stsMarkers.final.new | sort | uniq > stsfinal.new.ids

    # primers
    ssh eieio
    cd /cluster/data/ncbi/sts.10
    # strip out N's and wobbles (KS) from primers, as isPcr
    # can't currently handle them
    # strip out primers < 10 as isPcr can't handle them
    awk '$0 !~ /[^ACGT0-9\-\t]/ && (length($2) > 10) && (length($3) > 10) {printf "dbSTS_%s\t%s\t%s\n", $1,$2,$3}' \
        all.primers > all.primers.ispcr
    mkdir -p /san/sanvol1/scratch/hg18/sts.10/primers
    cd /san/sanvol1/scratch/hg18/sts.10/primers
    split -l 4000 /cluster/data/ncbi/sts.10/all.primers.ispcr primers_
    ssh pk
    mkdir /cluster/data/hg18/bed/sts/primers
    cd /cluster/data/hg18/bed/sts/primers
    mkdir run
    cd run
    ls -1S /san/sanvol1/scratch/hg18/maskedContigs/*.fa > contigs.lst
    ls -1S /san/sanvol1/scratch/hg18/sts.10/primers/primers_* > primers.lst
    mkdir /san/sanvol1/scratch/hg18/sts.10/primers/out
    cat > template << '_EOF_'
#LOOP
/cluster/bin/x86_64/isPcr -out=psl -minPerfect=2 -maxSize=5000 -tileSize=10 -ooc=/san/sanvol1/scratch/hg18/10ooc/$(root1).10.ooc -stepSize=5 $(path1) $(path2) {check out line /san/sanvol1/scratch/hg18/sts.10/primers/out/$(root1)_$(root2).psl}
#ENDLOOP
'_EOF_'
    # happy emacs
    gensub2 contigs.lst primers.lst template jobList
    para create jobList
    # 29106 jobs
    para try ... check ... push ... etc ...
# Completed: 29106 of 29106 jobs # CPU time in finished jobs: 658245s 10970.76m 182.85h 7.62d 0.021 y # IO & Wait Time: 82764s 1379.39m 22.99h 0.96d 0.003 y # Average job time: 25s 0.42m 0.01h 0.00d # Longest finished job: 534s 8.90m 0.15h 0.01d # Submission to last job: 2282s 38.03m 0.63h 0.03d # Filter output file quickly based on simple parameters ssh pk cd /san/sanvol1/scratch/hg18/sts.10/primers mkdir filter pslQuickFilter -minMatch=26 -maxMismatch=5 \ -maxTinsert=5000 -verbose out/ filter/ # Note: there will be many messages saying files are empty - this is OK time pslSort dirs ../primers.psl.unlifted temp filter # Got 29106 files 171 files per mid file # real 3m31.401s # filter primer alignments and create not found primer file for ePCR run cd /san/sanvol1/scratch/hg18/sts.10 pslFilterPrimers primers.psl.unlifted \ /cluster/data/ncbi/sts.10/all.primers primers.filter.unlifted.psl # creates primers.filter.unlifted.psl.notfound.primers wc -l primers.filter.unlifted.psl.notfound.primers # 22943 primers.filter.unlifted.psl.notfound.primers # use Greg Schuler's ePCR to attempt alignment of primers missed # by isPcr ssh pk mkdir /san/sanvol1/scratch/hg18/sts.10/epcr mkdir /san/sanvol1/scratch/hg18/sts.10/epcr/out cd /san/sanvol1/scratch/hg18/sts.10/epcr split -l 3000 ../primers.filter.unlifted.psl.notfound.primers primers_ mkdir /cluster/data/hg18/bed/sts/primers/run.epcr cd /cluster/data/hg18/bed/sts/primers/run.epcr ls -1S /san/sanvol1/scratch/hg18/sts.10/epcr/primers_* > primers.lst # These jobs are going to go quickly, make sure all I/O comes and # goes from something that can handle it. ls -1S /san/sanvol1/scratch/hg18/maskedContigs/*.fa > contig.lst # This runEpcr64 script was made from the existing runEpcr script # and from the looks of it, I doubt the original script works in # the way this was set up here. It appears to be reading the # second argument $(path2) line by line and sending that as # arguments to e-PCR. That wouldn't be right here. cat > template << '_EOF_' #LOOP /cluster/bin/scripts/runEpcr64 $(path1) $(path2) {check out line /san/sanvol1/scratch/hg18/sts.10/epcr/out/$(root1).$(root2).epcr} #ENDLOOP '_EOF_' # << emacs gensub2 primers.lst contig.lst template jobList para create jobList # 3420 jobs para try ... check ... push ... etc ... # Completed: 3024 of 3024 jobs # CPU time in finished jobs: 31802s 530.04m 8.83h 0.37d 0.001 y # IO & Wait Time: 12804s 213.40m 3.56h 0.15d 0.000 y # Average job time: 15s 0.25m 0.00h 0.00d # Longest finished job: 193s 3.22m 0.05h 0.00d # Submission to last job: 372s 6.20m 0.10h 0.00d # merge output ssh pk cd /cluster/bluearc/hg17/sts/primers/epcr cd /san/sanvol1/scratch/hg18/sts.10/epcr cat out/*.epcr > all.epcr wc -l all.epcr # 3792 # should be on the fileserver (kkstore02) for the following heavy # I/O operations. Didn't do that here, was on pk instead. 
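    # To see how the 3792 lines of all.epcr are spread across the per-job
    # outputs (illustrative only, not part of the original run):
    ls out/*.epcr | wc -l                        # output files produced by the batch
    find out -name '*.epcr' -size +0c | wc -l    # outputs that actually contain hits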
    # use all.epcr file to re-filter alignments and determine which
    # ePCR records to keep
    cp all.epcr /cluster/data/hg18/bed/sts/primers
    cd /cluster/data/hg18/bed/sts/primers
    pslFilterPrimers -epcr=all.epcr -verbose=1 \
        /san/sanvol1/scratch/hg18/sts.10/primers.psl.unlifted \
        /cluster/data/ncbi/sts.10/all.primers primers.unlifted.epcr.psl
    # creates three files:
    # -rw-rw-r-- 1 148528 Feb 6 10:39 epcr.not.found
    # -rw-rw-r-- 1 51632003 Feb 6 10:39 primers.unlifted.epcr.psl
    # -rw-rw-r-- 1 1189756 Feb 6 10:39 primers.unlifted.epcr.psl.notfound.primers
    # convert to PSL and combine with other psl file
    time /cluster/bin/scripts/epcrToHgPsl epcr.not.found \
        /cluster/data/ncbi/sts.10/all.primers /cluster/data/hg18
    # real 81m24.041s (on pk, may have been better on kkstore02
    # where all of the data is)
    cat primers.unlifted.epcr.psl epcr.not.found.psl \
        | sort -k 10n > primers.final.unlifted.psl
    wc -l primers.final.unlifted.psl
    # 454869 primers.final.unlifted.psl
    # should have been on kkstore02 already
    ssh kkstore02
    cd /cluster/data/hg18/bed/sts/primers
    # Fix the query gap lengths so that they match the all.primers.fa
    # file lengths
    time /cluster/bin/scripts/fixPrimersQueryGaps \
        /cluster/data/ncbi/sts.10/all.primers primers.final.unlifted.psl \
        > primers.final.unlifted.fix.psl
    # real 0m19.814s
    wc -l primers.final.unlifted.fix.psl
    # 454869 primers.final.unlifted.fix.psl
    # lift results from contigs to chrom coordinates, and create final file
    time liftUp -nohead primers.psl \
        /cluster/data/hg18/jkStuff/liftContigs.lft warn \
        primers.final.unlifted.fix.psl
    # real 0m2.897s
    wc -l primers.psl
    # 454869 primers.psl
    # Extract relevant info, make alignments unique, and create final file to
    # be merged with full sequence alignments
    time /cluster/bin/scripts/extractPslInfo primers.psl
    # real 0m15.303s
    wc -l primers.psl.initial
    # 451023 primers.psl.initial
    $HOME/kent/src/utils/findAccession.pl -agp primers.psl.initial \
        /cluster/data/hg18
    wc -l primers.psl.initial.acc
    # 451023 primers.psl.initial.acc
    /cluster/bin/scripts/getStsId /cluster/data/hg18/bed/sts/stsInfo2.bed \
        primers.psl.initial.acc \
        | sort -k 4n > primers.final
    #rm primers.psl.initial.acc
    wc -l primers.final
    # 451023 primers.final
    # There doesn't appear to be any use for this primers.ids list
    # except for curiosity. Check the head and tail of this list to
    # verify no garbage is in here. There should just be numbers.
awk '{print $4}' primers.final | sort -n | uniq > primers.ids wc -l primers.ids # 287465 primers.ids # Merge primer and sequence files to create final bed file # Merge (combineSeqPrimerPos) takes about an hour to run ssh kkstore02 cd /cluster/data/hg18/bed/sts time /cluster/bin/scripts/combineSeqPrimerPos stsMarkers.final \ primers/primers.final # real 55m33.254so wc -l stsMarkers_pos.rdb # 307082 stsMarkers_pos.rdb time /cluster/bin/scripts/createSTSbed \ /cluster/data/ncbi/sts.10/stsInfo2.bed stsMarkers_pos.rdb > stsMap.bed # real 0m13.351s wc -l stsMap.bed # 300492 stsMap.bed # Set up sequence files ssh hgwdev mkdir /gbdb/hg18/sts.10/ ln -s /cluster/data/ncbi/sts.10/all.STS.fa /gbdb/hg18/sts.10/all.STS.fa ln -s /cluster/data/ncbi/sts.10/all.primers.fa \ /gbdb/hg18/sts.10/all.primers.fa # Load all files cd /cluster/data/hg18/bed/sts hgLoadSeq hg18 /gbdb/hg18/sts.10/all.STS.fa /gbdb/hg18/sts.10/all.primers.fa # Advisory lock created # Creating .tab file # Adding /gbdb/hg18/sts.10/all.STS.fa # 93698 sequences # Adding /gbdb/hg18/sts.10/all.primers.fa # 306885 sequences # Updating seq table # Advisory lock has been released # All done # real 1m25.459s hgsql hg18 < $HOME/kent/src/hg/lib/stsInfo2.sql hgsql hg18 < $HOME/kent/src/hg/lib/stsAlias.sql # these two files are already here from previous operations above # cp /cluster/data/ncbi/sts.10/{stsInfo2.bed,stsAlias.bed} . hgsql hg18 -e 'load data local infile "stsInfo2.bed" into table stsInfo2' hgsql hg18 -e 'load data local infile "stsAlias.bed" into table stsAlias' # a couple minutes for each load above hgLoadBed -notItemRgb -noBin -tab \ -sqlTable=$HOME/kent/src/hg/lib/stsMap.sql \ hg18 stsMap stsMap.bed hgLoadPsl -nobin -table=all_sts_primer hg18 primers/primers.psl # load of all_sts_primer did not go as planned: 454869 record(s), # 0 row(s) skipped, 10 warning(s) loading primers/primers.psl hgLoadPsl -nobin -table=all_sts_seq hg18 stsMarkers.lifted.psl # PRUNE stsMap RECORDS (DONE 3/3/06) hgsql hg18 -e 'delete from stsMap where chromEnd-chromStart > 5000' ########################################################################### # CREATE HAPLOTYPEPOS TRACK (DONE 1/31/06, Fan) ssh kkstore02 cd /cluster/data/hg18/bed mkdir haplotypePos cd haplotypePos cp /cluster/data/hg18/*hap*/*.fa . -p ls *.fa|sed -e 's/chr/split1 chr/' |sed -e 's/.fa//' >splitAll cat << '_EOF_' > split1 echo processing $1 faSplit2 -lift=$1.lft -overlap=500 size $1.fa 3500 split/$1 '_EOF_' chmod +x split* mkdir split mkdir result splitAll ls ./split/*.fa > split.lst cat << '_EOF_' > gsub #LOOP /cluster/store11/gs.19/build36/bed/haplotypePos/hblat1 $(file1) {check out line+ /cluster/store11/gs.19/build36/bed/haplotypePos/result/$(root1).psl} #ENDLOOP '_EOF_' gensub2 split.lst single gsub jobList ssh pk cd /cluster/data/hg18/bed/haplotypePos mkdir result para create jobList para try, push, check ... # Completed: 3091 of 3092 jobs # Crashed: 1 jobs # CPU time in finished jobs: 33164s 552.73m 9.21h 0.38d 0.001 y # IO & Wait Time: 172783s 2879.72m 48.00h 2.00d 0.005 y # Average job time: 67s 1.11m 0.02h 0.00d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 300s 5.00m 0.08h 0.00d # Submission to last job: 743s 12.38m 0.21h 0.01d # The single job that crashed was due to chr5_h2_hap1368.fa, which # does not have a decent alignment on chr5. 
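    # To confirm which split piece the single crashed job corresponds to
    # before collecting results, list the split FASTAs that have no usable
    # PSL in result/ (illustrative bash check, not part of the original run;
    # assuming the hblat1 wrapper writes nothing for a failed piece, this
    # should report only the chr5_h2_hap1368 piece noted above):
    bash
    for f in split/*.fa
    do
        b=`basename $f .fa`
        test -s result/$b.psl || echo "no alignments for $b"
    done
    exit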
# collect BLAT results cat result/*.psl >all.psl # keep the main alignments pslReps -nohead -minCover=0.80 -minAli=0.80 -nearTop=0.002 all.psl all_filtered.psl all.psr cat chr*.lft > hap.lft liftUp lifted.psl hap.lft warn all_filtered.psl -pslQ mkdir tNibs qNibs cp -p /cluster/data/hg18/nib/*hap*.nib qNibs cp -p /cluster/data/hg18/nib/chr5.nib tNibs cp -p /cluster/data/hg18/nib/chr6.nib tNibs cp -p /cluster/data/hg18/nib/chr22.nib tNibs axtChain -psl -linearGap=medium lifted.psl tNibs qNibs out.chain chainAntiRepeat tNibs qNibs out.chain final.chain cat << '_EOF_' > hap.chrom.lis /cluster/data/hg18/nib/chr5.nib /cluster/data/hg18/nib/chr6.nib /cluster/data/hg18/nib/chr22.nib '_EOF_' ls *.fa >q.lis chainToPsl final.chain /cluster/data/hg18/chrom.sizes \ /cluster/data/hg18/chrom.sizes hap.chrom.lis q.lis haplotypePos.psl # took about 20 minutes hgLoadPsl hg18 haplotypePos.psl # add haplotypePos entry in trackDb.ra ########################################################################### # LOAD AFFYRATIO (DONE - 2006-02-01 - Fan) # Copied from Hg17 doc # NOTE: Jim recommends that, in the future, all AFFY blat alignments should drop # -mask=lower for blat and drop -minIdentity=95 to -minIdentity=90 as the # higher minIdentity is causing alignments to be dropped that should not be. # e.g. # /cluster/bin/i386/blat -fine -minIdentity=90 -ooc=/cluster/bluearc/hg18/11.ooc $(path1) $(path2) {check out line+ psl/$(root1)_$(root2).psl} # pslReps can be used to handle filtering at a later step. Blat's minIdentity # seems to be more severe than that for pslReps as it takes insertions and # deletions into account. # # NOTE FROM QA (brooke, 8/28/07): In the future, run hgLoadBed without the # -sqlTable=$HOME/src/hg/lib/affyRatio.sql option, so that tableDescriptions # will be built properly. affyRatio.sql was needed before Jim added bed15 # capability to hgLoadBed (in Oct. 2003), but now bed15 tables can use the # default bedExp.as and bedExp.sql files. # # Set up cluster job to align consenesus/exemplars to hg18 ssh kkstore02 mkdir /cluster/bluearc/hg18/affyGnf cp -p /projects/compbio/data/microarray/affyGnf/sequences/HG-U95/HG-U95Av2_all.fa \ /cluster/bluearc/hg18/affyGnf ssh kkr1u00 mkdir -p /iscratch/i/affyGnf cp -p /cluster/bluearc/hg18/affyGnf/* /iscratch/i/affyGnf /cluster/bin/iSync ssh kki mkdir /cluster/data/hg18/bed/affyGnf.2004-06-09 cd /cluster/data/hg18/bed/affyGnf.2004-06-09 ls -1 /iscratch/i/affyGnf/* > affy.lst ls -1 /iscratch/i/gs.19/build36/maskedContigs/* > allctg.lst cat << '_EOF_' > template.sub #LOOP /cluster/bin/i386/blat -fine -mask=lower -minIdentity=95 -ooc=/cluster/bluearc/hg18/11.ooc $(path1) $(path2) {check out line+ psl/$(root1)_$(root2).psl} #ENDLOOP '_EOF_' # << this line makes emacs coloring happy gensub2 allctg.lst affy.lst template.sub jobList mkdir psl para create jobList para try, push, check # Completed: 378 of 378 jobs # CPU time in finished jobs: 3055s 50.91m 0.85h 0.04d 0.000 y # IO & Wait Time: 1267s 21.12m 0.35h 0.01d 0.000 y # Average job time: 11s 0.19m 0.00h 0.00d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 78s 1.30m 0.02h 0.00d # Submission to last job: 367s 6.12m 0.10h 0.00d # Do sort, best in genome filter, and convert to chromosome coordinates # to create affyU95.psl ssh kkstore02 cd /cluster/data/hg18/bed/affyGnf.2004-06-09 pslSort dirs raw.psl tmp psl # change filter parameters for these sequences. only use alignments that # cover 30% of sequence and have at least 95% identity in aligned # region. # minAli = 0.97 too high. 
low minCover as a lot of n's in these # sequences pslReps -minCover=0.3 -minAli=0.95 -nearTop=0.005 raw.psl contig.psl /dev/null liftUp affyU95.psl ../../jkStuff/liftAll.lft warn contig.psl # Eliminate the long names sed -e "s/U95Av2://" affyU95.psl | sed -e "s/;//" > affyU95shortQname.psl # Merge with spot data and load into database. added -chip flag to # affyPslAndAtlasToBed to allow correct parsing ssh hgwdev cd /cluster/data/hg18/bed/affyGnf.2004-06-09 bash /cluster/home/sugnet/bin/i386/affyPslAndAtlasToBed -chip=U95Av2 \ affyU95shortQname.psl \ /projects/compbio/data/microarray/affyGnf/human_atlas_U95_gnf.noquotes.txt \ affyRatio.bed affyRatio.exr > affyPslAndAtlasToBed.log 2>&1 hgLoadBed -sqlTable=$HOME/src/hg/lib/affyRatio.sql hg18 \ affyRatio affyRatio.bed # Loaded 13043 elements of size 15 mkdir affyU95 hgLoadPsl hg18 -table=affyU95 affyU95shortQname.psl # sequences loaded 2006-02-1 hgLoadSeq -abbr=U95Av2: hg18 /gbdb/hgFixed/affyProbes/HG-U95Av2_all.fa # Advisory lock created # Creating .tab file # Adding /gbdb/hgFixed/affyProbes/HG-U95Av2_all.fa # 12386 sequences # Updating seq table # Advisory lock has been released # All done # Load AFFYUCLANORM, extended version of affyUcla track. Hopefully # final freeze of data set. (DONE - 2006-02-01 - Fan) ssh hgwdev mkdir /cluster/data/hg18/bed/affyUclaNorm cd /cluster/data/hg18/bed/affyUclaNorm cp -p /projects/compbio/data/microarray/affyUcla/sequences/HG-U133AB_all.fa . ssh pk cd /cluster/data/hg18/bed/affyUclaNorm ls -1 /scratch/hg/gs.19/build36/maskedContigs/* > contig.lst cat << '_EOF_' > gsub #LOOP /cluster/bin/x86_64/blat -fine -mask=lower -minIdentity=95 -ooc=/scratch/hg/h/11.ooc $(path1) $(path2) {check out line+ psl/$(root1)_$(root2).psl} #ENDLOOP '_EOF_' # << keep emacs happy mkdir psl ls HG-U133AB_all.fa > affy.lst gensub2 contig.lst affy.lst gsub jobList para create jobList para try para check para push ... 
etc # Completed: 378 of 378 jobs # CPU time in finished jobs: 6766s 112.77m 1.88h 0.08d 0.000 y # IO & Wait Time: 1541s 25.68m 0.43h 0.02d 0.000 y # Average job time: 22s 0.37m 0.01h 0.00d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 202s 3.37m 0.06h 0.00d # Submission to last job: 302s 5.03m 0.08h 0.00d ssh kkstore02 cd /cluster/data/hg18/bed/affyUclaNorm pslSort dirs hg18.affyU133AB_all.psl tmp psl wc hg18.affyU133AB_all.psl # 62043 1302842 13163424 hg18.affyU133AB_all.psl liftUp hg18.affyU133AB_all.lifted.psl \ /cluster/data/hg18/jkStuff/liftAll.lft warn hg18.affyU133AB_all.psl pslReps -minCover=0.5 -sizeMatters -minAli=0.97 \ -nearTop=0.005 hg18.affyU133AB_all.lifted.psl \ hg18.affyU133AB_all.lifted.pslReps.psl out.psr # Processed 62038 alignments ~/kent/src/hg/affyGnf/affyUclaMergePslData \ -pslFile=hg18.affyU133AB_all.lifted.pslReps.psl \ -affyFile=/projects/compbio/data/microarray/affyUcla/data/feature_biomaterial_chip_logratios_formatForTrack.txt \ -bedOut=hg18.affyUcla.bed \ -expRecordOut=hg18.affyUcla.expRecords \ -expFile=/projects/compbio/data/microarray/affyUcla/data/expNames.sorted.txt ~/kent/src/hg/affyGnf/addUclaAnnotations.pl hg18.affyUcla.expRecords \ /projects/compbio/data/microarray/affyUcla/data/normal_tissue_database_annotations2.txt > hg18.affyUcla.annotations.expRecords # Load the databases ssh hgwdev cd /cluster/data/hg18/bed/affyUclaNorm sed -e 's/affyRatio/affyUclaNorm/' ~/kent/src/hg/lib/affyRatio.sql > affyUclaNorm.sql hgLoadBed hg18 affyUclaNorm hg18.affyUcla.bed -sqlTable=affyUclaNorm.sql ############################################################################ # MAKE AFFY U133 - made after above affyUclaNorm (DONE - 2006-02-01 - Fan) # Someday the names can be fixed. ssh hgwdev mkdir /cluster/data/hg18/bed/affyU133 cd /cluster/data/hg18/bed/affyU133 ln -s ../affyUclaNorm/hg18.affyU133AB_all.lifted.pslReps.psl affyU133.psl hgLoadPsl hg18 affyU133.psl hgsql -e "select count(*) from affyU133;" hg18 # row count in hg17: 44620, in hg18: 45559 hgLoadSeq hg18 /gbdb/hgFixed/affyProbes/HG-U133AB_all.fa # 44792 sequences # GNF ATLAS 2 (DONE - 2006-02-01 - Fan) # Align probes from GNF1H chip. ssh pk cd /cluster/data/hg18/bed mkdir -p geneAtlas2/run/psl cd geneAtlas2/run # This bluearc/geneAtlas2 directory already exists # mkdir -p /cluster/bluearc/geneAtlas2 # cp /projects/compbio/data/microarray/geneAtlas2/human/gnf1h.fa /cluster/bluearc/geneAtlas2 ls -1 /scratch/hg/gs.19/build36/maskedContigs > genome.lst ls -1 /cluster/bluearc/geneAtlas2/gnf1h.fa > mrna.lst cat << '_EOF_' > gsub #LOOP /cluster/bin/x86_64/blat -fine -ooc=/scratch/hg/h/11.ooc /scratch/hg/gs.19/build36/maskedContigs/$(path1) $(path2) {check out line+ psl/$(root1)_$(root2).psl} #ENDLOOP '_EOF_' # << this line makes emacs coloring happy gensub2 genome.lst mrna.lst gsub jobList para create jobList para try para check para push para time # Completed: 378 of 378 jobs # CPU time in finished jobs: 4038s 67.29m 1.12h 0.05d 0.000 y # IO & Wait Time: 2182s 36.37m 0.61h 0.03d 0.000 y # Average job time: 16s 0.27m 0.00h 0.00d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 250s 4.17m 0.07h 0.00d # Submission to last job: 322s 5.37m 0.09h 0.00d # Estimated complete: 0s 0.00m 0.00h 0.00d # Do sort, best in genome filter, and convert to chromosome coordinates # to create gnf1h.psl. 
pslSort dirs raw.psl tmp psl pslReps -minCover=0.3 -minAli=0.95 -nearTop=0.005 raw.psl contig.psl /dev/null # Processed 79733 alignments liftUp ../affyGnf1h.psl ../../../jkStuff/liftAll.lft warn contig.psl rm -r contig.psl raw.psl psl # Load probes and alignments from GNF1H into database. ssh hgwdev cd /cluster/data/hg18/bed/geneAtlas2 # Already symlinked # ln -s /projects/compbio/data/microarray/geneAtlas2/human/gnf1h.fa \ # /gbdb/hgFixed/affyProbes hgLoadPsl hg18 affyGnf1h.psl hgLoadSeq hg18 /gbdb/hgFixed/affyProbes/gnf1h.fa grep -v U133B ../affyUclaNorm/hg18.affyU133AB_all.lifted.pslReps.psl \ | sed -e "s/exemplar://; s/consensus://; s/U133A://" \ | sed -e "s/;//" > affyU133A.psl hgMapMicroarray gnfAtlas2.bed hgFixed.gnfHumanAtlas2MedianRatio \ affyU133A.psl /cluster/data/hg18/bed/geneAtlas2/affyGnf1h.psl # Loaded 44696 rows of expression data from hgFixed.gnfHumanAtlas2MedianRatio # Mapped 32926, multiply-mapped 2000, missed 48, unmapped 11770 hgLoadBed hg18 gnfAtlas2 gnfAtlas2.bed # Loaded 34926 elements of size 15 ######################################################################## # Creating the ideoband data track (DONE - 2006-02-02 - Hiram) # This was reloaded upon completion of the cytoband sequence # mentioned above. # Received the following files in email from Wonhee Jang from NCBI: # -rw-rw-r-- 1 1917 Feb 2 14:01 setBands.txt # -rw-rw-r-- 1 39058 Feb 2 14:01 human_ideogram.dat # -rw-rw-r-- 1 673148 Feb 2 14:01 fish.markers.bed # placed them into /cluster/data/hg18/bed/ideogram ssh hgwdev mkdir /cluster/data/hg18/bed/ideogram cd /cluster/data/hg18/bed/ideogram cat << '_EOF_' > mkBands.sh #!/bin/sh T=/cluster/data/hg18/bed/ideogram HI=${T}/human_ideogram.dat FM=${T}/fish.markers.bed SB=${T}/setBands.txt bander chr1 ${HI} ${FM} ${SB} 1 247199719 100 2.0 2 bander chr2 ${HI} ${FM} ${SB} 2 242751149 100 2.0 2 bander chr3 ${HI} ${FM} ${SB} 3 199446827 100 2.0 2 bander chr4 ${HI} ${FM} ${SB} 4 191263063 100 2.0 2 bander chr5 ${HI} ${FM} ${SB} 5 180837866 100 2.0 2 bander chr6 ${HI} ${FM} ${SB} 6 170896992 100 2.0 2 bander chr7 ${HI} ${FM} ${SB} 7 158821424 100 2.0 2 bander chr8 ${HI} ${FM} ${SB} 8 146274826 100 2.0 2 bander chr9 ${HI} ${FM} ${SB} 9 140273252 100 2.0 2 bander chr10 ${HI} ${FM} ${SB} 10 135374737 100 2.0 2 bander chr11 ${HI} ${FM} ${SB} 11 134452384 100 2.0 2 bander chr12 ${HI} ${FM} ${SB} 12 132289534 100 2.0 2 bander chr13 ${HI} ${FM} ${SB} 13 114127980 100 2.0 2 bander chr14 ${HI} ${FM} ${SB} 14 106360585 100 2.0 2 bander chr15 ${HI} ${FM} ${SB} 15 100338915 100 2.0 2 bander chr16 ${HI} ${FM} ${SB} 16 88822254 100 2.0 2 bander chr17 ${HI} ${FM} ${SB} 17 78654742 100 2.0 2 bander chr18 ${HI} ${FM} ${SB} 18 76117153 100 2.0 2 bander chr19 ${HI} ${FM} ${SB} 19 63806651 100 2.0 2 bander chr20 ${HI} ${FM} ${SB} 20 62435964 100 2.0 2 bander chr21 ${HI} ${FM} ${SB} 21 46944323 100 2.0 2 bander chr22 ${HI} ${FM} ${SB} 22 49591432 100 2.0 2 bander chrX ${HI} ${FM} ${SB} X 154913754 100 2.0 2 bander chrY ${HI} ${FM} ${SB} Y 57443437 100 2.0 2 for I in 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 X Y do cat chr${I}.bed done > cytobands.bed '_EOF_' # happy emacs chmod +x mkBands.sh ./mkBands.sh # should be 862 wc cytobands.bed # 862 4310 29911 cytobands.bed hgLoadBed -noBin -tab -sqlTable=${HOME}/src/hg/lib/cytoBand.sql \ hg18 cytoBand cytobands.bed hgLoadBed -noBin -tab -sqlTable=${HOME}/src/hg/lib/cytoBandIdeo.sql \ hg18 cytoBandIdeo cytobands.bed ############################################################################ # H-INVITATIONAL GENE ANNOTATION 
DATABASE (DONE 2006-0202, Fan) # http://www.jbirc.aist.go.jp/hinv/top.html # Create knownGene table to reference HINV gene ID's # for link on knownGenes details page # Also, create an HINV gene track # download CDNA file release 2.2 (Jan 20, 2006) -- got release # from downloads page). ssh kkstore03 cd /cluster/data/hinv mkdir 2005-02-02 cd 2005-02-02 wget --timestamp http://www.jbirc.aist.go.jp/hinv/download/alldata/flatfile/FCDNA.gz gunzip FCDNA.gz mv FCDNA FCDNA.2.2 # set up assembly work area ssh kkstore02 cd /cluster/data/hg18 mkdir -p bed/hinv cd bed/hinv # extract H-INV ID's and Genbank accessions of mRNAs awk '/CDNA_ACCESSION-NO:/ {print $2}' < /cluster/data/hinv/2005-02-02/FCDNA.2.2 > accessions.txt awk '/CDNA_H-INVITATIONAL-ID:/ {print $2}' < /cluster/data/hinv/2005-02-02/FCDNA.2.2 > ids.txt paste accessions.txt ids.txt > queries.txt wc -l ids.txt # 56419 ids.txt # create PSL file from alignments for these mRNA's, extracted from the # table of all aligned mRNA's ssh hgwdev cd /cluster/data/hg18/bed/hinv hgsql hg18 -s -e "SELECT * FROM all_mrna" | cut -f 2- > all_mrna.tab ssh kkstore02 cd /cluster/data/hg18/bed/hinv pslReps /dev/null stdout /dev/null | cat - all_mrna.tab > all_mrna.psl # using pslReps to generate the PSL file header pslSelect -queryPairs=queries.txt all_mrna.psl hinv_mrna.psl # NEXT TIME, LOAD HInvGeneMrna TABLE AFTER HInv TABLE IS LOADED TO AVOID # joinerCheck TO COMPLAIN. # load track of mrna alignments ssh hgwdev cd /cluster/data/hg18/bed/hinv hgLoadPsl hg18 -table=HInvGeneMrna hinv_mrna.psl hgsql hg18 -s -e \ "select distinct(qName) from HInvGeneMrna order by qName" > hg18.mrna hgsql hg17 -s -e \ "select distinct(qName) from HInvGeneMrna order by qName" > hg17.mrna wc -l hg*.mrna # 41023 hg17.mrna # 54974 hg18.mrna comm -1 -3 *.mrna > hg18.aligned wc -l hg18.aligned # 14758 (transcripts newly aligned in hg18) comm -2 -3 *.mrna > hg17.aligned wc -l hg17.aligned # 807 (transcripts no longer aligned in hg18) comm -2 -3 ids.txt hg18.mrna > hg18.notaligned wc -l hg18.notaligned # 1445 (transcripts not aligned in hg18 -- checking on why...) # also make a table with various useful items for each transcript ssh hgwdev hgsql hg18 < ~/kent/src/hg/lib/HInv.sql cd /cluster/data/hg18/bed/hinv /cluster/data/hinv/hinvToTable.pl < /cluster/data/hinv/2005-02-02/FCDNA.2.2 > HInv.tab echo 'load data local infile "HInv.tab" into table HInv' | hgsql hg18 hgsql hg17 -s -e "select count(*) from HInv" # 41118 hgsql hg18 -s -e "select count(*) from HInv" # 56419 # !!! DO THIS AFTER KG IS BUILD !!! # DONE (4/13/06 Fan). # create table for knownGenes detail page ssh hgwdev cd /cluster/data/hg18/bed/hinv hgMapToGene hg18 HInvGeneMrna knownGene knownToHInv # QA NOTE (3-6-2006): did a mytouch to update the time for the HInvGeneMrna table # (because joinerCheck was complaining during -times check): # sudo mytouch hg18 HInvGeneMrna 200602031600.00 # touch -t 200602031600.00 /var/lib/mysql/hg18/HInvGeneMrna.MYD # PRODUCE FUGU BLAT ALIGNMENT (DONE - 2006-02-02 - Fan) ssh kk mkdir /cluster/data/hg18/bed/blatFr1 cd /cluster/data/hg18/bed/blatFr1 mkdir psl # next time, use N?_?????? 
(to pick up NG_ contigs) foreach f ( `cat /cluster/data/hg18/contig.lst` ) set c=$f:t:r echo $c mkdir psl/$c end # create cluster job mkdir run cd run ls -1S /iscratch/i/fugu/trfFa/*.fa > fugu.lst ls -1S /scratch/hg/gs.19/build36/maskedContigs/*.fa > human.lst cat << 'EOF' > gsub #LOOP /cluster/bin/i386/blat -mask=lower -qMask=lower -q=dnax -t=dnax {check in line+ $(path1)} {check in line+ $(path2)} {check out line+ /cluster/data/hg18/bed/blatFr1/psl/$(root1)/$(root1)_$(root2).psl} #ENDLOOP 'EOF' # << keep emacs happy gensub2 human.lst fugu.lst gsub jobList para create jobList # 218484 jobs written to batch para try para check para push -maxQueue=300000 -maxPush=220000 para check # Completed: 218484 of 218484 jobs # CPU time in finished jobs: 5073329s 84555.48m 1409.26h 58.72d 0.161 y # IO & Wait Time: 692572s 11542.87m 192.38h 8.02d 0.022 y # Average job time: 26s 0.44m 0.01h 0.00d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 910s 15.17m 0.25h 0.01d # Submission to last job: 14753s 245.88m 4.10h 0.17d # cd ../psl # count files with aligments # find . -not -size 427c | wc -l # 44458 # count files with no aligments # find . -size 427c | wc -l # 174405 # When cluster run is done, sort alignments # into chrom directory ssh kkstore02 cd /cluster/data/hg18/bed/blatFr1 pslCat -dir psl/N?_?????? | \ liftUp -type=.psl stdout \ /cluster/data/hg18/jkStuff/liftAll.lft warn stdin | \ pslSortAcc nohead chrom temp stdin # Processed 218887 lines into 1 temp files # Rename to correspond with tables and load into database: ssh hgwdev cd /cluster/data/hg18/bed/blatFr1/chrom foreach i (chr*.psl) set r = $i:r echo mv $i ${r}_blatFr1.psl mv $i ${r}_blatFr1.psl end # lift fugu scaffolds to Fugu browser chrUn, # so you can link to other browser. And don't need to load sequence cd /cluster/data/hg18/bed/blatFr1 liftUp -pslQ all.psl /cluster/data/fr1/fugu_v3.masked.lft warn chrom/*.psl hgLoadPsl -table=blatFr1 hg18 all.psl nice featureBits hg18 blatFr1 refGene:CDS # 14636876 bases of 2881515245 (0.508%) in intersection nice featureBits hg17 blatFr1 refGene:CDS # 14488047 bases of 2866216770 (0.505%) in intersection ####################################################################### # OPOSSUM BLASTZ - (DONE - 2006-02-10 - Hiram) ssh kk # this was done again after this, see 2006-02-13 mkdir /cluster/data/hg18/bed/blastzMonDom4.2006-02-10 cd /cluster/data/hg18/bed/blastzMonDom4.2006-02-10 cat << '_EOF_' > DEF # human vs. 
opossum export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/i386:/parasol/bin BLASTZ=blastz.v7 # settings for more distant organism alignments BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=10000 BLASTZ_K=2200 BLASTZ_M=50 BLASTZ_Q=/cluster/data/blastz/HoxD55.q BLASTZ_ABRIDGE_REPEATS=0 # TARGET: Human (hg18) SEQ1_DIR=/scratch/hg/hg18/nib SEQ1_LEN=/scratch/hg/hg18/chrom.sizes SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY: Opossum monDom4 SEQ2_DIR=/iscratch/i/monDom4/monDom4.2bit SEQ2_LEN=/iscratch/i/monDom4/chrom.sizes SEQ2_CHUNK=30000000 SEQ2_LAP=0 BASE=/cluster/data/hg18/bed/blastzMonDom4.2006-02-10 TMPDIR=/scratch/tmp '_EOF_' # happy emacs time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=kk -chainMinScore=5000 -chainLinearGap=loose \ -stop=net `pwd`/DEF > blastz.out 2>&1 & # running 2006-02-10 # Completed: 43469 of 43470 jobs # Crashed: 1 jobs # CPU time in finished jobs: 25745592s 429093.20m 7151.55h 297.98d 0.816 y # IO & Wait Time: 8466642s 141110.70m 2351.85h 97.99d 0.268 y # Average job time: 787s 13.12m 0.22h 0.01d # Longest finished job: 51561s 859.35m 14.32h 0.60d # Submission to last job: 103470s 1724.50m 28.74h 1.20d # There wasn't actually an outstanding job, it had been completed. # Completed: 345 of 345 jobs # CPU time in finished jobs: 620s 10.33m 0.17h 0.01d 0.000 y # IO & Wait Time: 1631s 27.19m 0.45h 0.02d 0.000 y # Average job time: 7s 0.11m 0.00h 0.00d # Longest finished job: 69s 1.15m 0.02h 0.00d # Submission to last job: 255s 4.25m 0.07h 0.00d # Completed: 49 of 49 jobs # CPU time in finished jobs: 224697s 3744.94m 62.42h 2.60d 0.007 y # IO & Wait Time: 4790s 79.84m 1.33h 0.06d 0.000 y # Average job time: 4683s 78.06m 1.30h 0.05d # Longest finished job: 115041s 1917.35m 31.96h 1.33d # Submission to last job: 115147s 1919.12m 31.99h 1.33d time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=kk -chainMinScore=5000 -chainLinearGap=loose \ -continue=cat -stop=net `pwd`/DEF > cat-net.out 2>&1 & # running 2006-02-11 time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=kk -chainMinScore=5000 -chainLinearGap=loose \ -continue=load -stop=load `pwd`/DEF > load.out 2>&1 & ssh kolossus cd /cluster/data/hg18/bed/blastz.monDom4 time nice -n +19 featureBits hg18 chainMonDom4Link \ > fb.hg18.chainMonDom4Link 2>&1 & cat fb.hg18.chainMonDom4Link # 356865888 bases of 2881515245 (12.385%) in intersection #################################################################################### # BUILD KNOWN GENES TABLES (STARTED 2/1/06, DONE 2/13/06 Fan) # First build protein databases, sp060115 and proteins060115 # See makeProteins060115.doc for details. 
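    # Before setting up the KG work area, confirm that those protein
    # databases are present and populated on hgwdev (illustrative check,
    # not part of the original log):
    hgsql -N -e 'show databases like "sp060115"' mysql
    hgsql -N -e 'show databases like "proteins060115"' mysql
    hgsql -N sp060115 -e 'select count(*) from protein'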
# Create working subdirectories and temporary databases (kgHg18A) ssh hgwdev cd /cluster/store11/kg mkdir kgHg18A ln -s /cluster/store11/kg/kgHg18A /cluster/store6/kgDB/bed/kgHg18A ln -s /cluster/store11/kg/kgHg18A /cluster/data/hg18/bed/kgHg18A hgsql hg18 -e "create database kgHg18A" hgsql hg18 -e "create database kgHg18ATemp" mkdir /cluster/bluearc/kgDB/kgHg18A mkdir /cluster/bluearc/kgDB/kgHg18A/protBlat ln -s /cluster/bluearc/kgDB/kgHg18A/protBlat /cluster/store11/kg/kgHg18A/protBlat cd /cluster/store11/kg/kgHg18A/protBlat # Get all human protein sequences hgsql -N sp060115 -e \ 'select p.acc, p.val from protein p, accToTaxon x where x.taxon=9606 and p.acc=x.acc'\ |awk '{print ">" $1;print $2}' >humanProt.fa hgsql -N sp060115 -e \ 'select v.varAcc, p.val from varAcc v, protein p, accToTaxon x where v.parAcc = p.acc and x.taxon=9606 and v.parAcc=x.acc'\ |awk '{print ">" $1;print $2}' \ >humanVarProt.fa # append var proteins to humanProt.fa cat humanVarProt.fa >>humanProt.fa # Prepare and perform cluster run for protein/genome alignment ssh pk cd /cluster/data/hg18/bed/kgHg18A/protBlat mkdir prot faSplit sequence humanProt.fa 2000 prot/prot ls /cluster/bluearc/kgDB/kgHg18A/protBlat/prot/* > prot.lis ssh hgwdev cd /cluster/data/hg18/bed/kgHg18A/protBlat hgsql hg18 -N -e 'select chrom from chromInfo' > chrom.lis exit cat << '_EOF_' > gsub #LOOP /cluster/bin/x86_64/blat -t=dnax -q=prot /cluster/data/hg18/nib/$(path1).nib $(path2) {check out line+ /cluster/bluearc/kgDB/kgHg18A/protBlat/result/$(root1)_$(root2).psl} #ENDLOOP '_EOF_' mkdir result gensub2 chrom.lis prot.lis gsub jobList para create jobList para try para check para push para check ... # Completed: 97020 of 97020 jobs # CPU time in finished jobs: 16070335s 267838.92m 4463.98h 186.00d 0.510 y # IO & Wait Time: 279789s 4663.15m 77.72h 3.24d 0.009 y # Average job time: 169s 2.81m 0.05h 0.00d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 152051s 2534.18m 42.24h 1.76d # Submission to last job: 152235s 2537.25m 42.29h 1.76d # This cluster run took a little less than 2 days. # collect BLAT results pslSort -nohead dirs raw.psl temp result pslReps -nohead -minCover=0.80 -minAli=0.80 -nearTop=0.002 raw.psl protBlat.psl /dev/null ssh hgwdev cd /cluster/bluearc/kgDB/kgHg18A/protBlat hgLoadPsl hg18 protBlat.psl # create all_mrna.psl and tight_mrna.psl hgsql hg18 -N -e "select * from all_mrna" |cut -f 2-22 >all_mrna.psl pslReps -minCover=0.40 -minAli=0.97 -nearTop=0.002 \ all_mrna.psl tight_mrna.psl /dev/null # Save a copy of the following hg18 tables: all_mrna gbCdnaInfo gbExtFile gbLoaded gbSeq gbStatus genbank.lis refFlat refGene refLink refSeqAli refSeqStatus refSeqSummary xenoMrna xenoRefFlat xenoRefGene xenoRefSeqAli # Use overlapSelect to get protein and mRNA alignment overlaps overlapSelect -statsOutput -dropped=protOut.psl -overlapThreshold=0.90 \ -selectFmt=psl -inFmt=psl tight_mrna.psl protBlat.psl protMrna.stat overlapSelect -mergeOutput -dropped=protOut.psl -overlapThreshold=0.90 -selectFmt=psl \ -inFmt=psl tight_mrna.psl protBlat.psl protMrna.out # Create protein/mRNA pair and protein lists cut -f 10,31 protMrna.out|sort -u >spMrna.tab cut -f 10 protMrna.out|sort -u >protein.lis mv protein.lis .. # Load spMrna.tab into spMrna table in temp DB. hgsql kgHg18ATemp < ~/src/hg/lib/spMrna.sql hgsql kgHg18ATemp -e 'load data local infile "spMrna.tab" into table spMrna' hgsql kgHg18ATemp -e 'create index mrnaID on spMrna(mrnaID)' # Prepare and perform cluster run of protein/mRNA alignment # Get mRNA fa file. 
cd /cluster/data/hg18/bed/kgHg18A /cluster/data/genbank/bin/i386/gbGetSeqs -native -db=hg18 \ -gbRoot=/cluster/data/genbank genbank mrna mrna.fa # Create mrnaSeq table in kgHg18ATemp DB. faToTab mrna.fa mrnaSeq.tab hgsql kgHg18ATemp -e 'drop table mrnaSeq' hgsql kgHg18ATemp <~/src/hg/lib/mrnaSeq.sql hgsql kgHg18ATemp -e 'load data local infile "mrnaSeq.tab" into table mrnaSeq' # Prepare files for cluster run cd /cluster/bluearc/kgDB/kgHg18A ~/src/hg/protein/KG2B.sh kgHg18A hg18 060115 # Perform cluster run of protein/mRNA alignment ~/src/hg/protein/KG3.sh kgHg18A hg18 060115 # Collect cluster run results cd kgBestMrna ls out | sed -e 's/prot/do1 prot/g' >doall # create do1 with the following 2 lines: cat << '_EOF_' > do1 echo processing $1 cat out/$1/*.out >>protMrnaRaw.psl '_EOF_' chmod +x do* doall # Filter out low quality alignments pslReps -nohead -singleHit -minAli=0.9 protMrnaRaw.psl protMrnaBlat.psl /dev/null cut -f 10,14 protMrnaBlat.psl |sort -u >protMrna.lis wc protMrna.lis # Load BLAT results into temp DB. ssh hgwdev cd /cluster/store11/kg/kgHg18A/kgBestMrna hgsql kgHg18ATemp < ~/src/hg/lib/protMrnaBlat.sql hgsql kgHg18ATemp -e 'load data local infile "protMrnaBlat.psl" into table protMrnaBlat' hgsql kgHg18ATemp -e 'create index tName on protMrnaBlat(tName)' # Create CDS files from protein/mRNA alignment results. hgsql kgHg18ATemp -N -e \ 'select qName,"_",tName,tStart+1,":",tEnd+3 from protMrnaBlat order by qName,tName,tEnd-tStart desc'\ |sed 's/\t_\t/_/g'|sed 's/\t:\t/../g' >protMrna.cds # Create protMrna.psl with proteinID_mrnaID as query ID. cut -f 22-30 ../protBlat/protMrna.out > j1.tmp cut -f 32-42 ../protBlat/protMrna.out > j2.tmp cut -f 10,31 ../protBlat/protMrna.out|sed -e 's/\t/_/g' >j3.tmp paste j1.tmp j3.tmp j2.tmp >protMrna.psl rm j1.tmp j2.tmp j3.tmp # Run mrnaToGene to create protMrna.gp bash mrnaToGene -cdsFile=protMrna.cds protMrna.psl protMrna.gp 2>protMrna.err >protMrna.log exit # move kgBestMrna to /san/sanvol1 to save space on store11 mv /cluster/store11/kg/kgHg18A/kgBestMrna/clusterRun /san/sanvol1/scratch/fan/hg18/kgHg18A/kgBestMrna ln -s /san/sanvol1/scratch/fan/hg18/kgHg18A/kgBestMrna/clusterRun \ /cluster/store11/kg/kgHg18A/kgBestMrna/clusterRun # Prepare refGene and all_mrna gp files. cd .. cp -p base/refGene.tab ref.gp # hgsql hg18 -N -e 'select * from refGene' >ref.gp hgsql hg18 -N -e \ 'select gbCdnaInfo.acc,cds.name from gbCdnaInfo,cds,all_mrna where all_mrna.qName=gbCdnaInfo.acc and gbCdnaInfo.cds=cds.id' \ |sort -u > all_mrna.cds cat base/all_mrna.tab |cut -f 2-22 >all_mrna.psl bash mrnaToGene -cdsFile=all_mrna.cds all_mrna.psl all_mrna.gp 2>all_mrna.err > all_mrna.log exit # Align proteins to RefSeq. overlapSelect -inCds -statsOutput -overlapThreshold=0.90 -selectFmt=psl -inFmt=genePred\ protBlat/protBlat.psl ref.gp ref.stat overlapSelect -inCds -dropped=refOut1.gp -overlapThreshold=0.90 -selectFmt=psl -inFmt=genePred\ protBlat/protBlat.psl ref.gp protRef.gp overlapSelect -mergeOutput -selectCds -dropped=protOut1.psl -overlapThreshold=0.80 -inFmt=psl\ -selectFmt=genePred ref.gp protBlat/protBlat.psl protRef.out cut -f 10,22 protRef.out | sort -u >spRef.tab cut -f 10 protRef.out | sort -u >protRef.lis hgsql kgHg18ATemp -e 'drop table spRef' hgsql kgHg18ATemp <~/src/hg/lib/spRef.sql hgsql kgHg18ATemp -e 'load data local infile "spRef.tab" into table spRef' # Prepare and perform cluster runs for protein/RefSeq alignments ~/src/hg/protein/KGRef2.sh kgHg18A hg18 060115 # Took 7 hours. This step should be investigated and improved. 
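# (Illustration only, added -- the accessions below are made up.  The SQL/sed
# pipeline that created protMrna.cds above writes one line per protein_mRNA
# pair in the "start..end" CDS form that mrnaToGene -cdsFile expects, e.g.
#   P04637_BC003596	91..1272
# where 91 is tStart+1 and 1272 is tEnd+3 from the protMrnaBlat alignment.)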
~/src/hg/protein/KGRef3.sh kgHg18A hg18 060115 cd kgBestRef ls out | sed -e 's/prot/do1 prot/g' >doall cat << '_EOF_' > do1 echo processing $1 cat out/$1/*.out >>protRefRaw.psl '_EOF_' chmod +x do* doall # Filter out low quality alignments. pslReps -nohead -singleHit -minAli=0.9 protRefRaw.psl protRefBlat.psl /dev/null cut -f 10,14 protRefBlat.psl |sort -u >protRef.lis wc protRef.lis hgsql kgHg18ATemp -e 'drop table protRefBlat' hgsql kgHg18ATemp < ~/src/hg/lib/protRefBlat.sql hgsql kgHg18ATemp -e 'load data local infile "protRefBlat.psl" into table protRefBlat' hgsql kgHg18ATemp -e 'create index tName on protRefBlat(tName)' # Run gene-check to filter out invalid gp entries cd /cluster/data/hg18/bed/kgHg18A cat ref.gp kgBestMrna/protMrna.gp all_mrna.gp >kgCandidate0.gp gene-check -incl-ok -ok-genepred-out kgCandidate0.passed.gp -nib-dir /cluster/data/hg18/nib kgCandidate0.gp kgCandidate0.check hgsql kgHg18ATemp -e 'drop table kgCandidate0' hgsql kgHg18ATemp < ~/src/hg/lib/kgCandidate0.sql hgsql kgHg18ATemp -e 'load data local infile "kgCandidate0.gp" into table kgCandidate0' hgsql kgHg18ATemp -e 'drop table geneCheck' hgsql kgHg18ATemp < ~/src/hg/lib/geneCheck.sql hgsql kgHg18ATemp -e 'load data local infile "kgCandidate0.check" into table geneCheck ignore 2 lines' # Run kgCheck to get all KG candidates that pass the KG gene check criteria kgCheck kgHg18ATemp hg18 kgCandidate0 geneCheck kgCandidate.tab hgsql kgHg18ATemp -e 'drop table kgCandidate' hgsql kgHg18ATemp < ~/src/hg/lib/kgCandidate.sql hgsql kgHg18ATemp -e 'load data local infile "kgCandidate.tab" into table kgCandidate' hgsql kgHg18ATemp -e 'create index alignID on kgCandidate(alignID)' # ####### NEXT TIME AROUND PUT IN AN EXTRA STEP TO BRING IN ITEMS ON A "PUT BACK" LIST # FOR SPECIAL CASES LIKE SELENOCYSTEINE, NON-AUG INITIATION CODON, RIBOSOMAL SLIPPAGE, ETC. # ####### # Construct the kgCandidateX table that has alignID in the name field. cut -f 2-10 kgCandidate.tab >j2.tmp cut -f 11 kgCandidate.tab >j1.tmp paste j1.tmp j2.tmp >kgCandidateX.tab hgsql kgHg18ATemp -e 'drop table kgCandidateX' hgsql kgHg18ATemp < ~/src/hg/lib/kgCandidateX.sql hgsql kgHg18ATemp -e 'load data local infile "kgCandidateX.tab" into table kgCandidateX' # Score protein/mRna and protein/RefSeq alignments ln -s protBlat/protein.lis protein.lis kgResultBestMrna2 060115 kgHg18ATemp hg18 protMrnaBlat|sort -u >protMrnaBlatScore.tab kgResultBestRef2 060115 kgHg18ATemp hg18 protRefBlat|sort -u >protRefScore.tab # Combine scoring results and load them into temp DB. cat protMrnaBlatScore.tab protRefScore.tab >protMrnaScore.tab hgsql kgHg18ATemp -e 'drop table protMrnaScore' hgsql kgHg18ATemp < ~/src/hg/lib/protMrnaScore.sql hgsql kgHg18ATemp -e 'load data local infile "protMrnaScore.tab" into table protMrnaScore' hgsql kgHg18ATemp -e 'create index mrnaAcc on protMrnaScore(mrnaAcc)' # Run kgGetCds to get CDS structure of each gene kgGetCds kgHg18ATemp 060115 kgCandidateX jY.tmp cat jY.tmp |sort -u >kgCandidateY.tab rm jY.tmp hgsql kgHg18ATemp -e 'drop table kgCandidateY' hgsql kgHg18ATemp < ~/src/hg/lib/kgCandidateY.sql hgsql kgHg18ATemp -e 'load data local infile "kgCandidateY.tab" into table kgCandidateY' # Run kgPickPrep to replace long cds structure string with cdsId. 
kgPickPrep kgHg18ATemp kgCandidateZ.tab
hgsql kgHg18ATemp -e 'drop table kgCandidateZ'
hgsql kgHg18ATemp < ~/src/hg/lib/kgCandidateZ.sql
hgsql kgHg18ATemp -e 'load data local infile "kgCandidateZ.tab" into table kgCandidateZ'
hgsql kgHg18ATemp -e 'create index cdsId on kgCandidateZ(cdsId)'

# Run kgPick to pick a representative mRNA/protein pair for each unique CDS structure.
kgPick kgHg18ATemp hg18 sp060115 kg3.tmp dupSpMrna.tmp
sort -u dupSpMrna.tmp >dupSpMrna.tab

# Create put back list
# gbGetSeqs2, a modified version of gbGetSeqs, outputs the RefSeq IDs at the beginning of each output line.
gbGetSeqs2 -gbRoot=/cluster/data/genbank -db=hg18 -get=ra RefSeq mrna ref.ra
cat ref.ra | sed -e 's/ /\t/' | sort -u >refRa.tab

hgsql hg18 -e 'drop table refRa'
hgsql hg18 < ~/src/hg/lib/refRa.sql
hgsql hg18 -e 'load data local infile "refRa.tab" into table refRa ignore 1 lines'

hgsql hg18 -N -e \
'select r.acc, r.attr, r.val from refRa r, refRa r2, refRa r3 where r.attr="selenocysteine" and r.acc=r2.acc and r2.attr="rss" and r2.val="rev" and r3.acc=r.acc and r3.attr="org" and r3.val="Homo sapiens"' \
>kgPutBack2.tab

hgsql hg18 -N -e \
'select r.acc, r.attr, r.val from refRa r, refRa r2, refRa r3 where r.attr="cno" and r.val like "%ribosomal frameshift%" and r.acc=r2.acc and r2.attr="rss" and r2.val="rev" and r3.acc=r.acc and r3.attr="org" and r3.val="Homo sapiens"' \
>>kgPutBack2.tab

hgsql hg18 -N -e \
'select r.acc, r.attr, r.val from refRa r, refRa r2, refRa r3 where r.attr="cno" and r.val like "%non-AUG%" and r.acc=r2.acc and r2.attr="rss" and r2.val="rev" and r3.acc=r.acc and r3.attr="org" and r3.val="Homo sapiens"' \
>>kgPutBack2.tab

hgsql hg18 -N -e \
'select r.acc, r.attr, r.val from refRa r, refRa r2, refRa r3 where r.attr="translExcept" and r.acc=r2.acc and r2.attr="rss" and r2.val="rev" and r3.acc=r.acc and r3.attr="org" and r3.val="Homo sapiens"' \
>>kgPutBack2.tab

hgsql hg18 -N -e \
'select r.acc, r.attr, r.val from refRa r, refRa r2, refRa r3 where r.attr="exception" and r.acc=r2.acc and r2.attr="rss" and r2.val="rev" and r3.acc=r.acc and r3.attr="org" and r3.val="Homo sapiens"' \
>>kgPutBack2.tab

hgsql kgHg18ATemp -e 'drop table kgPutBack2'
hgsql kgHg18ATemp < ~/src/hg/lib/kgPutBack2.sql
hgsql kgHg18ATemp -e 'load data local infile "kgPutBack2.tab" into table kgPutBack2'

kgPutBack kgHg18ATemp hg18 sp060115 kgPutBack2 kgPutBack2.gp
# No matching protein found for NM_201397.
# No matching protein found for NM_203341.
# No matching protein found for NM_213593.
# No matching protein found for NM_052987.
# No matching protein found for NM_201397.
# No matching protein found for NM_203341.
# No matching protein found for NM_213593.

# Sort KG genes to make the kg4.gp table file.
cat kgPutBack2.gp kg3.tmp > kg4.tmp
~/kent/src/hg/protein/sortKg.pl kg4.tmp >knownGene.tab

hgsql kgHg18ATemp -e 'drop table knownGene'
hgsql kgHg18ATemp < ~/src/hg/lib/knownGene.sql
hgsql kgHg18ATemp -e 'load data local infile "knownGene.tab" into table knownGene'

# Load data into hg18 knownGene table.
hgsql hg18 -e 'drop table knownGene'
hgsql hg18 < ~/src/hg/lib/knownGene.sql
hgsql hg18 -e 'load data local infile "knownGene.tab" into table knownGene'
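# (Clarifying note, added, on the five put-back queries above: r, r2 and r3 are
# three aliases of the same refRa attribute table.  Each query keeps a RefSeq
# accession only if that same accession also carries rss = "rev" -- apparently
# the RefSeq status field, so the list is restricted to reviewed entries --
# and org = "Homo sapiens".)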
# Load dupSpMrna table after knownGene table is loaded so that joinerCheck does not complain.
hgsql hg18 -e 'drop table dupSpMrna'
hgsql hg18 < ~/src/hg/lib/dupSpMrna.sql
hgsql hg18 -e 'load data local infile "dupSpMrna.tab" into table dupSpMrna'

# Perform analysis on KG
# Build knownGeneMrna and knownGenePep tables.
kgPepMrna kgHg18ATemp hg18 060115
hgsql hg18 -e 'drop table knownGeneMrna'
hgsql hg18 < ~/src/hg/lib/knownGeneMrna.sql
hgsql hg18 -e 'load data local infile "knownGeneMrna.tab" into table knownGeneMrna'
hgsql hg18 -e 'drop table knownGenePep'
hgsql hg18 < ~/src/hg/lib/knownGenePep.sql
hgsql hg18 -e 'load data local infile "knownGenePep.tab" into table knownGenePep'

# Build kgXref table
kgXref2 kgHg18ATemp 060115 hg18
hgsql hg18 -e 'drop table kgXref'
hgsql hg18 < ~/src/hg/lib/kgXref.sql
hgsql hg18 -e 'load data local infile "kgXref.tab" into table kgXref'

# Build spMrna table
hgsql hg18 -N -e 'select proteinID, name from knownGene' >kgSpMrna.tab
hgsql hg18 -e 'drop table spMrna'
hgsql hg18 <~/src/hg/lib/spMrna.sql
hgsql hg18 -e 'load data local infile "kgSpMrna.tab" into table spMrna'

# Build kgProtMap table
~/src/hg/protein/kgProtMap2.sh kgHg18A hg18 060115
# Found that the kgProtMap table had fewer than 20,000 rows,
# indicating that a lot of entries were missing.  The problem was
# that tight_mrna.psl was now located in ~/hg18Kg/protBlat.
# Manually ran the following to correct the problem:
cd ~/hg18Kg/kgProtMap/psl.tmp
cat ~/hg18Kg/protBlat/tight_mrna.psl refSeqAli.psl > both.psl
pslMap kgProtMrna.psl both.psl stdout | sort -u| \
sort -k 14,14 -k 16,16n -k 17,17n > kgProtMap.psl
hgsql hg18 -e "drop table kgProtMap;"
hgLoadPsl -tNameIx hg18 kgProtMap.psl

#####################################
# Build alias tables.
kgAliasM hg18 proteins060115

# kgAliasKgXref reads from hg18.knownGene.proteinID,
# hg18.knownGene.name, hg18.kgXref.geneSymbol
# to create kgAliasKgXref.tab
kgAliasKgXref hg18

# kgAliasRefseq reads from hg18.knownGene.name,
# hg18.knownGene.proteinID, hg18.kgXref.refseq
# to create kgAliasRefseq.tab
kgAliasRefseq hg18

hgsql sp060115 -N -e 'select name,gene.val from hg18.knownGene,displayId,gene where displayId.val=proteinID and displayId.acc=gene.acc' \
| sort -u > kgAliasP.tab

hgsql hg18 -N -e 'select name, name from knownGene' >kgAliasDup.tab
hgsql hg18 -N -e 'select mrnaID, dupMrnaID from dupSpMrna' >>kgAliasDup.tab

cat kgAliasM.tab kgAliasRefseq.tab kgAliasKgXref.tab kgAliasP.tab kgAliasDup.tab| \
sort |uniq > kgAlias.tab

hgsql -e "drop table kgAlias;" hg18
hgsql hg18 < ~/kent/src/hg/lib/kgAlias.sql
hgsql hg18 -e 'LOAD DATA local INFILE "kgAlias.tab" into table kgAlias'

# kgProtAlias reads from hg18.knownGene.name,
# hg18.knownGene.proteinID, hg18.knownGene.alignID,
# proteins060115.spXref3.accession, proteins060115.spSecondaryID, proteins060115.pdbSP.pdb
# to create kgProtAlias.tab
kgProtAlias hg18 060115

hgsql hg18 -N -e \
'select kgID, spDisplayID, protAcc from kgXref where protAcc != ""'\
| sort -u >kgProtAliasNCBI.tab

# include variant splice protein IDs
hgsql hg18 -N -e \
'select name, proteinID, parAcc from knownGene,sp060115.varAcc where varAcc=proteinID'\
|sort -u >kgProtAliasDup.tab

# include duplicate protein IDs from dupSpMrna table
hgsql hg18 -N -e \
'select name, knownGene.proteinID, dupProteinID from knownGene, dupSpMrna where name=mrnaID'\
|sort -u >>kgProtAliasDup.tab

# catch parent acc from dupProteinID too
hgsql hg18 -N -e\
'select name, knownGene.proteinID, parAcc from knownGene,dupSpMrna,sp060115.varAcc where name=mrnaID and dupProteinID=varAcc.varAcc'\
|sort -u >>kgProtAliasDup.tab

cat kgProtAliasNCBI.tab kgProtAlias.tab kgProtAliasDup.tab |
sort -u > kgProtAliasAll.tab echo "`date` creating table kgProtAlias" hgsql hg18 -e "drop table kgProtAlias;" hgsql hg18 <~/src/hg/lib/kgProtAlias.sql; hgsql hg18 -e 'LOAD DATA local INFILE "kgProtAliasAll.tab" into table kgProtAlias;' # Build kgSpAlias table hgsql hg18 -e \ 'select kgXref.kgID, spID, alias from kgXref, kgAlias where kgXref.kgID=kgAlias.kgID' >j.tmp hgsql hg18 -e \ 'select kgXref.kgID, spID, alias from kgXref, kgProtAlias where kgXref.kgID=kgProtAlias.kgID'\ >>j.tmp cat j.tmp|sort -u |grep -v 'kgID' >hg18.kgSpAlias.tab rm j.tmp hgsql hg18 -e 'drop table kgSpAlias'; hgsql hg18 < ~/src/hg/lib/kgSpAlias.sql hgsql hg18 -e 'load data local infile "hg18.kgSpAlias.tab" into table kgSpAlias' # QA NOTE (3-6-2006): did a mytouch to update the time for the knownGene table # (because joinerCheck was complaining during -times check): # [hgwdev:~/joiner> sudo mytouch hg18 knownGene 200602061707 # touch -t 200602061707 /var/lib/mysql/hg18/knownGene.MYD # MAKE FOLDUTR TABLES (DONE 2006-02-09, Fan) # First set up directory structure and extract UTR sequence on hgwdev ssh hgwdev cd /cluster/data/hg18/bed mkdir rnaStruct.2006-02-09 rm rnaStruct ln -s rnaStruct.2006-02-09 rnaStruct cd rnaStruct mkdir -p utr3/split utr5/split utr3/fold utr5/fold utrFa hg18 knownGene utr3 utr3/utr.fa utrFa hg18 knownGene utr5 utr5/utr.fa # Split up files and make files that define job. ssh pk cd /cluster/data/hg18/bed/rnaStruct faSplit sequence utr3/utr.fa 50000 utr3/split/s faSplit sequence utr5/utr.fa 50000 utr5/split/s ls -1 utr3/split > utr3/in.lst ls -1 utr5/split > utr5/in.lst cd utr3 cat > gsub < cgapBIOCARTAdescSorted.tab hgsql hg18 -e "drop table cgapAlias" hgsql hg18 -e "drop table cgapBiocDesc" hgsql hg18 -e "drop table cgapBiocPathway" hgsql hg18 <~/src/hg/lib/cgapAlias.sql hgsql hg18 <~/src/hg/lib/cgapBiocDesc.sql hgsql hg18 <~/src/hg/lib/cgapBiocPathway.sql hgsql hg18 -e 'load data local infile "cgapAlias.tab" \ into table cgapAlias' hgsql hg18 -e 'load data local infile "cgapBIOCARTAdescSorted.tab" into table cgapBiocDesc' hgsql hg18 -e 'load data local infile "cgapBIOCARTA.tab" into table cgapBiocPathway' # Build hg18 PROTEOME BROWSER TABLES # These are instructions for building tables # needed for the Proteome Browser. # DON'T START THESE UNTIL TABLES FOR KNOWN GENES AND kgProtMap table # ARE REBUILT. # This build is based on proteins DBs dated 060115. 
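# (Optional pre-flight check, added -- not in the original log.  Since this
# section depends on the rebuilt Known Genes and kgProtMap tables, it may be
# worth confirming they are present and non-empty before starting.)
hgsql hg18 -N -e 'select count(*) from knownGene'
hgsql hg18 -N -e 'select count(*) from kgProtMap'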
# Create the working directory
ssh hgwdev
mkdir /cluster/store11/kg/kgHg18A/pb-2006-02-10
cd /cluster/data/hg18/bed
rm pb
ln -s /cluster/store11/kg/kgHg18A/pb-2006-02-10 pb
cd pb

# Define pep* tables in hg18 DB
cat ~/kent/src/hg/lib/pep*.sql > pepAll.sql
# First edit out pepPred table definition, then
hgsql hg18 < pepAll.sql

# Build the pepMwAa table
hgsql proteins060115 -N -e \
"select info.acc, molWeight, aaSize from sp060115.info, sp060115.accToTaxon where accToTaxon.taxon=9606 and accToTaxon.acc = info.acc" > pepMwAa.tab
hgsql hg18 -e 'load data local infile "pepMwAa.tab" into table pepMwAa'

# Build the pepPi table
hgsql proteins060115 -e \
"select info.acc from sp060115.info, sp060115.accToTaxon where accToTaxon.taxon=9606 and accToTaxon.acc = info.acc" > protAcc.lis
hgsql hg18 -N -e 'select proteinID from knownGene where proteinID like "%-%"' | sort -u >> protAcc.lis
pbCalPi protAcc.lis sp060115 pepPi.tab
hgsql hg18 -e 'delete from pepPi'
hgsql hg18 -e 'load data local infile "pepPi.tab" into table hg18.pepPi'

# Calculate and load pep distributions
pbCalDist sp060115 proteins060115 9606 hg18 >pbCalDist.out
wc pbCalDist.out
hgsql hg18
load data local infile "pepExonCntDist.tab" into table hg18.pepExonCntDist;
load data local infile "pepCCntDist.tab" into table hg18.pepCCntDist;
load data local infile "pepHydroDist.tab" into table hg18.pepHydroDist;
load data local infile "pepMolWtDist.tab" into table hg18.pepMolWtDist;
load data local infile "pepResDist.tab" into table hg18.pepResDist;
load data local infile "pepIPCntDist.tab" into table hg18.pepIPCntDist;
load data local infile "pepPiDist.tab" into table hg18.pepPiDist;
quit

# Calculate frequency distributions
pbCalResStd sp060115 9606 hg18

# Create pbAnomLimit and pbResAvgStd tables
hgsql hg18 -e "drop table pbAnomLimit"
hgsql hg18 -e "drop table pbResAvgStd"
hgsql hg18 < ~/src/hg/lib/pbAnomLimit.sql
hgsql hg18 < ~/src/hg/lib/pbResAvgStd.sql
hgsql hg18 -e 'load data local infile "pbResAvgStd.tab" into table hg18.pbResAvgStd;'
hgsql hg18 -e 'load data local infile "pbAnomLimit.tab" into table hg18.pbAnomLimit;'

# Create pbStamp table for PB
hgsql hg18 -e "drop table pbStamp"
hgsql hg18 < ~/src/hg/lib/pbStamp.sql
hgsql hg17 -N -e 'select * from pbStamp' > pbStamp.tab
hgsql hg18 -e 'load data local infile "pbStamp.tab" into table hg18.pbStamp'

# Turn on Proteome Browser for hg18.
hgsql -e 'delete from dbDb where name="hg18"' \
    -h genome-testdb hgcentraltest
hgsql -e 'INSERT INTO dbDb (name, description, nibPath, organism, \
    defaultPos, active, orderKey, genome, scientificName, \
    htmlPath, hgNearOk, hgPbOk, sourceName) \
    VALUES("hg18", "Feb. 2006", "/gbdb/hg18/nib", "Human", \
    "chr7:127,664,479-127,689,005", 1, 10, "Human", "Homo sapiens", \
    "/gbdb/hg18/html/description.html", 0, 1, "NCBI Build 36.1");' \
    -h genome-testdb hgcentraltest

# Adjust drawing parameters for Proteome Browser stamps.
# Now invoke the Proteome Browser and adjust various drawing parameters (mostly
# the ymax of each stamp) if necessary, by updating the pbStamp.tab file and
# then deleting and reloading the pbStamp table.
hgsql hg18 -e "drop table pbStamp"
hgsql hg18 < ~/src/hg/lib/pbStamp.sql
hgsql hg18 -e 'load data local infile "pbStamp.tab" into table hg18.pbStamp'

# Perform preliminary review of Proteome Browser for hg18, then notify QA for
# formal review.
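# (Optional check, added -- not in the original log: confirm the hgcentraltest
# dbDb entry inserted above carries the expected flags, e.g. hgPbOk=1.)
hgsql -h genome-testdb hgcentraltest -N -e \
  'select name, active, hgNearOk, hgPbOk from dbDb where name="hg18"'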
# First build entrez DB tables.
cd /cluster/store10/entrez
mkdir 060208
ln -s /cluster/store10/entrez/060208 /cluster/data/entrez/060208
cd /cluster/data/entrez/060208
wget --timestamp ftp://ftp.ncbi.nlm.nih.gov/gene/DATA/gene2accession.gz
wget --timestamp ftp://ftp.ncbi.nlm.nih.gov/gene/DATA/gene_info.gz
wget --timestamp ftp://ftp.ncbi.nlm.nih.gov/gene/DATA/gene2refseq.gz
gzip -d *.gz

cut -f 2,4 gene2accession | sort -u | grep -v "-" | grep -v "NM_" | sed -e 's/\./\t/g' > entrezMrna.tab
cut -f 2,4 gene2refseq | grep "NM_"| sort -u | grep -v "-" | sed -e 's/\./\t/g' > entrezRefseq.tab
cut -f 2,4,6 gene2accession | grep "NM_"| grep "NP_"|sort -u | sed -e 's/\./\t/g' > entrezRefProt.tab

hgsql entrez -e 'drop table entrezRefseq'
hgsql entrez -e 'drop table entrezMrna'
hgsql entrez -e 'drop table entrezRefProt'
hgsql entrez < ~/src/hg/lib/entrezRefseq.sql
hgsql entrez < ~/src/hg/lib/entrezMrna.sql
hgsql entrez < ~/src/hg/lib/entrezRefProt.sql
hgsql entrez -e 'load data local infile "entrezRefseq.tab" into table entrezRefseq'
hgsql entrez -e 'load data local infile "entrezMrna.tab" into table entrezMrna'
hgsql entrez -e 'load data local infile "entrezRefProt.tab" into table entrezRefProt'

cd /cluster/store11/kg/kgHg18A
hgsql entrez -N -e \
'select mrna, refseq from entrezRefseq, entrezMrna, hg18.all_mrna where qName=mrna and entrezRefseq.geneID=entrezMrna.geneID' \
>mrnaRefseq1.tab

# Include RefSeq as valid mRNA too.
hgsql hg18 -N -e 'select name, name from refGene' >mrnaRefseq2.tab
cat mrnaRefseq1.tab mrnaRefseq2.tab |sort -u >mrnaRefseq.tab

hgsql hg18 -e 'drop table mrnaRefseq'
hgsql hg18 < ~/src/hg/lib/mrnaRefseq.sql
hgsql hg18 -e 'load data local infile "mrnaRefseq.tab" into table mrnaRefseq'

# CREATE FULL TEXT INDEX FOR KNOWN GENES (DONE 2/16/06 Fan)
# This depends on the go and uniProt databases as well as
# the kgAlias and kgProtAlias tables.  The hgKgGetText takes
# about 5 minutes when the database is not too busy.  The rest
# is real quick.
ssh hgwdev
cd /cluster/store11/kg/kgHg18A
mkdir index
cd index
hgKgGetText hg18 knownGene.text
ixIxx knownGene.text knownGene.ix knownGene.ixx
ln -s /cluster/store11/kg/kgHg18A/index/knownGene.ix /gbdb/hg18/knownGene.ix
ln -s /cluster/store11/kg/kgHg18A/index/knownGene.ixx /gbdb/hg18/knownGene.ixx

# BUILD KNOWN GENE LIST FOR GOOGLE. (REDONE 8/12/08 JK)
# make knownGeneLists.html hg18GeneList.html mm5GeneList.html rm3GeneList.html
cd /cluster/data/hg18/bed
rm -rf knownGeneList/hg18
# Run hgKnownGeneList to generate the tree of HTML pages
# under ./knownGeneList/hg18
hgKnownGeneList hg18
# copy over to /usr/local/apache/htdocs
rm -rf /usr/local/apache/htdocs/knownGeneList/hg18
mkdir -p /usr/local/apache/htdocs/knownGeneList/hg18
cp -Rfp knownGeneList/hg18/* /usr/local/apache/htdocs/knownGeneList/hg18

##################################################################################
# Create description.html for hg18
mkdir -p ~/kent/src/hg/makeDb/trackDb/human/hg18
cd ~/kent/src/hg/makeDb/trackDb/human/hg18
cp ../hg17/description.html .
vi description.html
# Change release date and build number and change hg17 to hg18
# Check it into CVS
mkdir -p /cluster/data/hg18/html
cp -p description.html /cluster/data/hg18/html
ln -s /cluster/data/hg18/html/description.html /gbdb/hg18/html/description.html

# BUILD GENE SORTER TABLES (AKA: FAMILY BROWSER) (STARTED 2006-02-11, DONE 2006-02-14 - Fan)
# This should be done after the KG tables are complete from the known genes
# build process.
#
# Cluster together various alt-splicing isoforms.
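# (Background note, added: hgClusterGenes in the next step groups overlapping
# same-strand isoforms into clusters; knownIsoforms maps each cluster ID to all
# of its member knownGene transcripts, and knownCanonical keeps one
# representative transcript per cluster.)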
# Creates the knownIsoforms and knownCanonical tables ssh hgwdev mkdir /cluster/data/hg18/bed/geneSorter.2006-02-11 # remove old symbolic link rm /cluster/data/hg18/bed/geneSorter ln -s /cluster/data/hg18/bed/geneSorter.2006-02-11 /cluster/data/hg18/bed/geneSorter cd /cluster/data/hg18/bed/geneSorter hgClusterGenes hg18 knownGene knownIsoforms knownCanonical # Extract peptides from knownGenes into fasta file # and create a blast database out of them. mkdir /cluster/data/hg18/bed/geneSorter/blastp cd /cluster/data/hg18/bed/geneSorter/blastp pepPredToFa hg18 knownGenePep known.faa # You may need to build this binary in src/hg/near/pepPredToFa /scratch/blast/formatdb -i known.faa -t known -n known # This command is in /projects/compbio/bin/$MACH/formatdb # Copy over database to bluearc rm -fr /cluster/bluearc/hg18/blastp mkdir -p /cluster/bluearc/hg18/blastp cp -p /cluster/data/hg18/bed/geneSorter/blastp/known.* /cluster/bluearc/hg18/blastp # Split up fasta file into bite sized chunks for cluster cd /cluster/data/hg18/bed/geneSorter/blastp mkdir split faSplit sequence known.faa 8000 split/kg # Make parasol run directory ssh pk mkdir /cluster/data/hg18/bed/geneSorter/blastp/self cd /cluster/data/hg18/bed/geneSorter/blastp/self mkdir run cd run mkdir out # Make blast script cat << '_EOF_' > blastSome #!/bin/sh BLASTMAT=/cluster/bluearc/blast229/data /cluster/bluearc/blast229/blastall \ -p blastp -d /cluster/bluearc/hg18/blastp/known -i $1 -o $2 \ -e 0.01 -m 8 -b 1000 '_EOF_' # << keep emacs happy chmod +x blastSome # Make gensub2 file cat << '_EOF_' > gsub #LOOP blastSome {check in line+ $(path1)} {check out line out/$(root1).tab} #ENDLOOP '_EOF_' # << keep emacs happy # Create parasol batch # 'ls ../../split/*.fa' is too much, hence the echo echo ../../split/*.fa | wordLine stdin > split.lst gensub2 split.lst single gsub jobList para create jobList para try para push para check # Completed: 7733 of 7733 jobs # CPU time in finished jobs: 56608s 943.47m 15.72h 0.66d 0.002 y # IO & Wait Time: 467120s 7785.33m 129.76h 5.41d 0.015 y # Average job time: 68s 1.13m 0.02h 0.00d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 119s 1.98m 0.03h 0.00d # Submission to last job: 1433s 23.88m 0.40h 0.02d # Load into database. This takes about 20 minutes ssh hgwdev cd /cluster/data/hg18/bed/geneSorter/blastp/self/run/out bash time hgLoadBlastTab hg18 knownBlastTab *.tab # Scanning through 7733 files # Loading database with 9647176 rows # real 21m51.039s cd /cluster/data/hg18/bed/geneSorter # Create table that maps between known genes and RefSeq hgMapToGene hg18 refGene knownGene knownToRefSeq # may need to build this command in src/hg/near/hgMapToGene # hgsql -e "select count(*) from knownToRefSeq;" hg18 # row count changed 34267 # Create table that maps between known genes and LocusLink hgsql --skip-column-names -e "select mrnaAcc,locusLinkId from refLink" hg18 > refToLl.txt hgMapToGene hg18 refGene knownGene knownToLocusLink -lookup=refToLl.txt # hgsql -e "select count(*) from knownToLocusLink;" hg18 # row count changed to 34267 # Create table that maps between known genes and Pfam domains hgMapViaSwissProt hg18 knownGene name proteinID Pfam knownToPfam # hgsql -e "select count(*) from knownToPfam;" hg18 # row count changed to 34177 # Create table to map between known genes and GNF Atlas2 # expression data. 
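# (Clarifying note, added, on the blastSome script further above: blastall is
# run with -p blastp (protein query vs. protein database), -e 0.01 (E-value
# cutoff), -m 8 (tab-separated output, which is what hgLoadBlastTab expects)
# and -b 1000 (report up to 1000 database hits per query).)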
hgMapToGene hg18 gnfAtlas2 knownGene knownToGnfAtlas2 '-type=bed 12' # hgsql -e "select count(*) from knownToGnfAtlas2;" hg18 # row count changed to 32015 # Create expression distance table - takes about an hour hgExpDistance hg18 hgFixed.gnfHumanAtlas2MedianRatio \ hgFixed.gnfHumanAtlas2MedianExps gnfAtlas2Distance \ -lookup=knownToGnfAtlas2 & # Have 44696 elements in hgFixed.gnfHumanAtlas2MedianRatio # Got 32015 unique elements in hgFixed.gnfHumanAtlas2MedianRatio # hgsql -e "select count(*) from gnfAtlas2Distance;" hg18 # row count changed to 32015000 # Create a table that maps between known genes and # the nice affy expression data. hgMapToGene "-type=bed 12" hg18 affyUclaNorm knownGene knownToU133 # hgsql -e "select count(*) from knownToU133;" hg18 # row count changed to 32632 # Create expression distance table. This will take about 2.5 hours cd /tmp cp -p ~/kent/src/hg/near/hgExpDistance/affyUcla.weight . time hgExpDistance hg18 affyUclaNorm affyUclaExp knownExpDistance \ -weights=affyUcla.weight -lookup=knownToU133 & # Have 43039 elements in affyUclaNorm # 211 genes, 42 weights, 26.500000 total wieght # Got 32965 unique elements in affyUclaNorm # Create table that maps between known genes and # the GNF data. cd /tmp hgMapToGene hg18 affyU95 knownGene knownToU95 # row count changed to 17401 # hgFixed.gnfHumanU95Exps argument is unused, no need to exist hgExpDistance hg18 hgFixed.gnfHumanU95MedianRatio \ hgFixed.gnfHumanU95Exps gnfU95Distance -lookup=knownToU95 & # Have 11545 elements in hgFixed.gnfHumanU95MedianRatio # Got 16378 unique elements in hgFixed.gnfHumanU95MedianRatio # row count changed to 16378000 # Create known gene mapping table and expression distance tables # for GNF Atlas 2. (The hgExpDistance takes only 10 minutes.) hgMapToGene hg18 affyGnf1h knownGene knownToGnf1h hgExpDistance hg18 hgFixed.gnfHumanAtlas2MedianRatio \ hgFixed.gnfHumanAtlas2MedianExps gnfAtlas2Distance \ -lookup=knownToGnf1h & # Have 44696 elements in hgFixed.gnfHumanAtlas2MedianRatio # Got 8739 unique elements in hgFixed.gnfHumanAtlas2MedianRatio # AFFYMETRIX HG-U133 PLUS TRACK (DONE, 2006-02-11, Fan) # Loaded the HG-U133 Plus 2 sequences for hg18 (DONE, 2006-03-29, hartera) # The below was already done. # vvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvv ssh hgwdev mkdir -p /projects/compbio/data/microarray/affyHuman/HG-U133Plus2 # Go to http://www.affymetrix.com/support/technical/byproduct.affx?product=hg-u133-plus # and download the consensus and exemplar sequences to this directory cd /projects/compbio/data/microarray/affyHuman/HG-U133Plus2 unzip HG-U133_Plus_2_consensus.zip unzip HG-U133_Plus_2_exemplar.zip cat HG-U133_Plus_2_consensus HG-U133_Plus_2_exemplar >> U133Plus2_all.fa perl -pi.bak -e "s/(consensus|exemplar):HG-U133_Plus_2:/U133+2:/" \ U133Plus2_all.fa # remove ";" from probe set names perl -pi.bak -e "s/;//" U133Plus2_all.fa # clean up rm *.zip *.bak # Set up cluster job to align consensus/exemplars to hg16 ssh kkr1u00 mkdir -p /iscratch/i/affy mv /cluster/data/hg18/bed/affyU133Plus2.2006-02-11/U133Plus2_all.fa \ /iscratch/i/affy iSync # ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ # The above is already done by Rachel during hg17 build. 
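# (Optional check, added -- not in the original log: after the perl renaming
# above, the FASTA headers staged on /iscratch should start with "U133+2:".)
grep '^>' /iscratch/i/affy/U133Plus2_all.fa | head -3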
ssh hgwdev
cd /projects/compbio/data/microarray/affyHuman/HG-U133Plus2
mkdir -p /cluster/data/hg18/bed/affyU133Plus2.2006-02-11
cp -p U133Plus2_all.fa /cluster/data/hg18/bed/affyU133Plus2.2006-02-11

ssh kk
cd /cluster/data/hg18/bed/affyU133Plus2.2006-02-11
ls -1 /iscratch/i/affy/U133Plus2_all.fa > affy.lst
ls -1 /iscratch/i/gs.19/build36/maskedContigs/* > allctg.lst
cat << '_EOF_' > template.sub
#LOOP
/cluster/bin/i386/blat -fine -mask=lower -minIdentity=95 -ooc=/cluster/bluearc/hg18/11.ooc $(path1) $(path2) {check out line+ psl/$(root1)_$(root2).psl}
#ENDLOOP
'_EOF_'
# << for emacs

gensub2 allctg.lst affy.lst template.sub para.spec
mkdir psl
para create para.spec
para try, para check, para push ...
# Completed: 378 of 378 jobs
# CPU time in finished jobs: 24764s 412.74m 6.88h 0.29d 0.001 y
# IO & Wait Time: 13823s 230.38m 3.84h 0.16d 0.000 y
# Average job time: 102s 1.70m 0.03h 0.00d
# Longest running job: 0s 0.00m 0.00h 0.00d
# Longest finished job: 782s 13.03m 0.22h 0.01d
# Submission to last job: 827s 13.78m 0.23h 0.01d

# Do sort, best in genome filter, and convert to chromosome coordinates
# to create affyU133Plus2.psl
pslSort dirs raw.psl tmp psl
# use filter parameters for these sequences. only use alignments that
# cover 30% of sequence and have at least 95% identity in aligned region.
# minAli = 0.97 too high. low minCover as a lot of n's in these sequences
pslReps -minCover=0.3 -minAli=0.95 -nearTop=0.005 raw.psl contig.psl /dev/null
liftUp affyU133Plus2.psl ../../jkStuff/liftAll.lft warn contig.psl
perl -pi.bak -e "s/U133\+2://" affyU133Plus2.psl

# load into the database
ssh hgwdev
cd /cluster/data/hg18/bed/affyU133Plus2.2006-02-11
hgLoadPsl hg18 affyU133Plus2.psl

# The below was already done.
# vvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvv
# Add sequence data to database
# Copy probe sequence to /gbdb if it isn't already
mkdir -p /gbdb/hgFixed/affyProbes
cd /gbdb/hgFixed/affyProbes
ln -s /projects/compbio/data/microarray/affyHuman/HG-U133Plus2/U133Plus2_all.fa .
# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
# The above is already done by Rachel during hg17 build.

cd /cluster/data/hg18/bed/affyU133Plus2.2006-02-11
# the sequences need to be loaded for the hg18 database
# (2006-03-29, hartera)
hgLoadSeq -abbr=U133+2: hg18 /gbdb/hgFixed/affyProbes/U133Plus2_all.fa
# clean up
rm -r psl tmp err contig.psl raw.psl *.bak psl.tab seq.tab

# Added knownToU133Plus2 track
cd /cluster/data/hg18/bed/geneSorter
hgMapToGene hg18 affyU133Plus2 knownGene knownToU133Plus2
# row count changed to 34745

# Make knownToCdsSnp table (DONE Sept 12, 2007, jk)
ssh hgwdev
hgMapToGene hg18 snp126 knownGene knownToCdsSnp -all -cds
# approx. 5 minutes running time

# UPDATE GO DATABASE
# Download the terms and make the database.
ssh hgwdev
mkdir /cluster/store1/geneOntology/20060211
cd /cluster/store1/geneOntology/20060211
wget --timestamping http://www.godatabase.org/dev/database/archive/latest/go_200601-assocdb-data.gz
hgsql mysql <j.tmp
hgsql go060211

# Create the config.ra file for the doHgNearBlastp.pl run below.
cat << _EOF_ > config.ra
# Latest human vs. other Gene Sorter orgs:
# mouse, rat, zebrafish, worm, yeast, fly
targetGenesetPrefix human
targetDb hg18
queryDbs mm7 rn3 danRer3 ce2 sacCer1 dm2
hg18Fa /cluster/data/hg18/bed/blastp/known.faa
mm7Fa /cluster/data/mm7/bed/geneSorter/blastp/known.faa
rn3Fa /cluster/data/rn3/bed/blastp/known.faa
danRer3Fa /cluster/data/danRer3/bed/blastp/ensembl.faa
ce2Fa /cluster/data/ce2/bed/blastp/wormPep154.faa
sacCer1Fa /cluster/data/sacCer1/bed/blastp/sgdPep.faa
dm2Fa /cluster/data/dm2/bed/flybase4.1/flybasePep.fa
buildDir /cluster/data/hg18/bed/hgNearBlastp
scratchDir /san/sanvol1/scratch/hg18HgNearBlastp
_EOF_

# doHgNearBlastp.pl config.ra >& do.log &
doHgNearBlastp.pl config.ra >do3.log
# tail -f do.log
# Scanning through 671 files
# Loading database with 14488 rows

# ssh -x pk rm -rf /san/sanvol1/scratch/hg18HgNearBlastp/hg18.split
# ssh -x pk rm -rf /san/sanvol1/scratch/hg18HgNearBlastp/hg18.formatdb
# ssh -x pk rm -rf /san/sanvol1/scratch/hg18HgNearBlastp/mm7.split
# ssh -x pk rm -rf /san/sanvol1/scratch/hg18HgNearBlastp/mm7.formatdb
# ssh -x pk rm -rf /san/sanvol1/scratch/hg18HgNearBlastp/rn3.split
# ssh -x pk rm -rf /san/sanvol1/scratch/hg18HgNearBlastp/rn3.formatdb
# ssh -x pk rm -rf /san/sanvol1/scratch/hg18HgNearBlastp/danRer3.split
# ssh -x pk rm -rf /san/sanvol1/scratch/hg18HgNearBlastp/danRer3.formatdb
# ssh -x pk rm -rf /san/sanvol1/scratch/hg18HgNearBlastp/ce2.split
# ssh -x pk rm -rf /san/sanvol1/scratch/hg18HgNearBlastp/ce2.formatdb
# ssh -x pk rm -rf /san/sanvol1/scratch/hg18HgNearBlastp/sacCer1.split
# ssh -x pk rm -rf /san/sanvol1/scratch/hg18HgNearBlastp/sacCer1.formatdb
# ssh -x pk rm -rf /san/sanvol1/scratch/hg18HgNearBlastp/dm2.split
# ssh -x pk rm -rf /san/sanvol1/scratch/hg18HgNearBlastp/dm2.formatdb
# ssh -x pk rmdir /san/sanvol1/scratch/hg18HgNearBlastp

*** All done!
*** Check these tables in hg18:
*** humanBlastTab mmBlastTab rnBlastTab drBlastTab ceBlastTab scBlastTab dmBlastTab
*** and hgBlastTab in these databases:
*** mm7 rn3 danRer3 ce2 sacCer1 dm2

# MAKE ORGANISM-SPECIFIC HGNEARDATA FILES
cd ~/kent/src/hg/near/hgNear/hgNearData
# any updates necessary?

# ENABLE HGNEAR FOR HG18 IN HGCENTRALTEST
echo "update dbDb set hgNearOk = 1 where name = 'hg18';" \
    | hgsql -h genome-testdb hgcentraltest

# END OF HGNEAR STUFF

#############################################################################
# UPDATE BIOCYCTABLES NEEDED BY hgGene (DONE 2/16/06 Fan)

# First register with BioCyc to download their HumanCyc database
# The site will email you the URL for download
wget --timestamping http://bioinformatics.ai.sri.com/ecocyc/dist/pdff-XXXXXX/humancyc-flatfiles.zip
unzip humancyc-flatfiles.zip
cp genes.col genes.tab
cp pathways.col pathways.tab
# delete the first 20 or so header lines from these two files.
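# (Non-interactive alternative to the manual edit below, added as a suggestion;
# it assumes the header really is exactly 20 lines, so check with head first.)
# tail -n +21 genes.col > genes.tab
# tail -n +21 pathways.col > pathways.tab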
vi genes.tab vi pathways.tab hgsql hg18 -e 'create database bioCyc060216' hgsql bioCyc060216 < ~/src/hg/lib/bioCycGenes.sql hgsql bioCyc060216 -e 'load data local infile "genes.tab" into table genes' hgsql bioCyc060216 < ~/src/hg/lib/bioCycPathways.sql hgsql bioCyc060216 -e 'load data local infile "pathways.tab" into table pathways' # Create bioCycMapDesc.tab hgsql bioCyc060216 -N -e 'select UNIQUE_ID, NAME from pathways' |sort -u > bioCycMapDesc.tab # Create bioCycPathway.tab kgBioCyc0 bioCyc060216 hg18 hg17 hgsql hg18 -e 'delete from bioCycPathway' hgsql hg18 -e 'delete from bioCycMapDesc' hgsql hg18 < ~/src/hg/lib/bioCycPathway.sql hgsql hg18 < ~/src/hg/lib/bioCycMapDesc.sql # Load results into hg18. hgsql hg18 -e 'LOAD DATA local INFILE "bioCycMapDesc.tab" into table bioCycMapDesc' hgsql hg18 -e 'LOAD DATA local INFILE "bioCycPathway.tab" into table bioCycPathway' ############################################################################# # BLASTZ/CHAIN/NET RN4 (DONE 2/17/06 Fan) ssh kkstore02 cd /cluster/store11/gs.19/build36 cp -Rp linSpecRep /san/sanvol1/scratch/hg18 cp -Rp nib /san/sanvol1/scratch/hg18 mkdir /cluster/data/hg18/bed/blastz.rn4.2006-02-17 cd /cluster/data/hg18/bed/blastz.rn4.2006-02-17 cat << '_EOF_' > DEF # human vs. rat BLASTZ_ABRIDGE_REPEATS=1 # TARGET: Human SEQ1_DIR=/san/sanvol1/scratch/hg18/nib SEQ1_SMSK=/san/sanvol1/scratch/hg18/linSpecRep/notInRat SEQ1_LEN=/cluster/data/hg18/chrom.sizes SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY: Rat SEQ2_DIR=/san/sanvol1/scratch/rn4/nib SEQ2_SMSK=/san/sanvol1/scratch/rn4/linSpecRep.notInHuman SEQ2_LEN=/cluster/data/rn4/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 BASE=/cluster/data/hg18/bed/blastz.rn4.2006-02-17 '_EOF_' # << for emacs doBlastzChainNet.pl DEF -chainLinearGap medium \ -bigClusterHub pk -smallClusterHub pk -workhorse pk \ -blastzOutRoot /san/sanvol1/scratch/blastzHg17Rn4Out >& do.log & tail -f do.log rm -f /cluster/data/hg18/bed/blastz.rn4 ln -s blastz.rn4.2006-02-17 /cluster/data/hg18/bed/blastz.rn4 ############################################################################# # BUILD WGRNA TRACK (DONE, 2006-02-22, Fan) # rebuilt below: RE-BUILD WGRNA TRACK (DONE, 2006-05-15, Fan) # rebuilt below: RE-BUILD WGRNA TRACK (DONE, 2007-02-09, Fan) # rebuilt below: RE-BUILD WGRNA TRACK AGAIN (DONE, 2007-02-12, Fan) # rebuilt below: RE-BUILD WGRNA TRACK (DONE, 2007-05-31, Fan) # rebuilt below: RE-BUILD WGRNA TRACK (DONE, 2007-10-05, Fan) ssh hgwdev cd /cluster/data/hg18/bed mkdir wgRna-2006-02-22 cd wgRna-2006-02-22 # Received the data file, wg_hg18_track.txt, from Michel Weber's email # (Michel.Weber at ibcg.biotoul.fr) # and place it under cd /cluster/data/hg18/bed/wgRna-2006-02-22. cp -p wg_hg18_track.txt wgRna.tab hgLoadBed -sqlTable=/cluster/home/fanhsu/src/hg/lib/wgRna.sql hg18 wgRna wgRna.tab ############################################################################# # RE-BUILD WGRNA TRACK (DONE, 2006-05-15, Fan) # rebuilt below: RE-BUILD WGRNA TRACK (DONE, 2007-02-09, Fan) # rebuilt below: RE-BUILD WGRNA TRACK AGAIN (DONE, 2007-02-12, Fan) # rebuilt below: RE-BUILD WGRNA TRACK (DONE, 2007-05-31, Fan) # rebuilt below: RE-BUILD WGRNA TRACK (DONE, 2007-10-05, Fan) # rebuilt below: RE-BUILD WGRNA TRACK (DONE, 2008-05-29, Fan) ssh hgwdev cd /cluster/data/hg18/bed mkdir wgRna-2008-05-28 cd wgRna-2008-05-28 # Received the data file, wgtrack_may2008.doc, from Michel Weber's # email # (Michel.Weber at ibcg.biotoul.fr) # Save it as .txt file and change all blanks into tabs. 
# and place it under cd /cluster/data/hg18/bed/wgRna-2008-05-28. cp -p wgtrack_may2008.txt wgRna.tab hgLoadBed -sqlTable=/cluster/home/fanhsu/src/hg/lib/wgRna.sql hg18 wgRna wgRna.tab ############################################################################# # 17-WAY MULTIZ ALIGNMENTS (DONE - 2006-02-22 Fan) # copy net mafs to cluster-friendly storage for multiz run ssh kkstore02 ln -s /cluster/data/hg18/bed/blastzMonDom4.2006-02-13 /cluster/data/hg18/bed/blastz.monDom4 cd /cluster/data/hg18/bed/blastz.monDom4 cd /cluster/data/hg18/bed mkdir -p multiz17way.2006-02-18 ln -s multiz17way.2006-02-18 multiz17way cd multiz17way # copy MAF's to cluster-friendly server # These MAF's already on bluearc: # canFam2, fr1, galGal2, panTro1, rn4 mkdir -p /san/sanvol1/scratch/hg18/mafNet cd /san/sanvol1/scratch/hg18/mafNet ln -s /cluster/bluearc/hg18/mafNet/{*} . # copy others foreach s (bosTau2 canFam2 danRer3 dasNov1 echTel1 fr1 galGal2 loxAfr1 \ mm8 monDom4 oryCun1 panTro1 rn4 tetNig1 xenTro1 rheMac2) echo $s cp -Rp /cluster/data/hg18/bed/blastz.$s/mafNet $s end # danRer3 directory structure is different. It is under /san/sanvol1/scratch/hg18/blastzDanRer3/chromsRun ln -s /san/sanvol1/scratch/hg18/blastzDanRer3/chromsRun/mafNet /san/sanvol1/scratch/hg18/mafNet/danRer3 # thanks for the tree, Hiram! Taken from mm7 17way... cd /cluster/data/hg18/bed/multiz17way cat << '_EOF_' > 17way.nh ((((((((( (human_hg18:0.006690,chimp_panTro1:0.007571):0.024272, macaque_rheMac2:0.0592):0.023960, ((rat_rn4:0.081728,mouse_mm8:0.077017):0.229273, rabbit_oryCun1:0.206767):0.1065):0.023026, (cow_bosTau2:0.159182,dog_canFam2:0.147731):0.039450):0.028505, armadillo_dasNov1:0.149862):0.015994, (elephant_loxAfr1:0.104891,tenrec_echTel1:0.259797):0.040371):0.218400, monodelphis_monDom4:0.371073):0.189124, chicken_galGal2:0.454691):0.123297, xenopus_xenTro1:0.782453):0.156067, ((tetraodon_tetNig1:0.199381,fugu_fr1:0.239894):0.492961, zebrafish_danRer3:0.782561):0.156067); '_EOF_' /cluster/bin/draw_tree 17way.nh > 17way.ps /cluster/bin/phast/all_dists 17way.nh > 17way.distances.txt grep hg18 17way.distances.txt | sort -k3,3n | \ awk '{printf ("%.4f\t%s\n", $3, $2)}' > distances.txt # edit distances.txt to include featureBits, and chain parameters # from blastz run. 
cat distances.txt # 0.0143 chimp_panTro1 # 0.0902 macaque_rheMac2 # 0.2563 armadillo_dasNov1 # 0.2651 dog_canFam2 # 0.2677 elephant_loxAfr1 # 0.2766 cow_bosTau2 # 0.3682 rabbit_oryCun1 # 0.4226 tenrec_echTel1 # 0.4677 mouse_mm8 # 0.4724 rat_rn4 # use loose chain params and score from here, down (5000) # 0.7119 monodelphis_monDom4 # 0.9847 chicken_galGal2 # 1.4357 xenopus_xenTro1 # 1.6577 tetraodon_tetNig1 # 1.6983 fugu_fr1 # 1.7480 zebrafish_danRer3 # the order in the browser display will be by tree topology, # not by distance, so it will be: # >> # 0.0143 chimp_panTro1 # >> # 0.0902 macaque_rheMac2 # >> # 0.4677 mouse_mm8 # >> # 0.4724 rat_rn4 # >> # 0.3682 rabbit_oryCun1 # >> # 0.2651 dog_canFam2 # >> # 0.2766 cow_bosTau2 # >> # 0.2563 armadillo_dasNov1 # >> # 0.2677 elephant_loxAfr1 # >> # 0.4226 tenrec_echTel1 # >> # 0.7119 monodelphis_monDom4 # >> # 0.9847 chicken_galGal2 # >> # 1.4357 xenopus_xenTro1 # >> # 1.6577 tetraodon_tetNig1 # >> # 1.6983 fugu_fr1 # >> # 1.7480 zebrafish_danRer3 # make output dir and run dir ssh pk cd /cluster/data/hg18/bed/multiz17way.2006-02-18 # create species list and stripped down tree for autoMZ sed 's/[a-z][a-z]*_//g; s/:[0-9\.][0-9\.]*//g; s/;//' 17way.nh > tmp.nh echo `cat tmp.nh` > tree-commas.nh echo `cat tree-commas.nh` | sed 's/ //g; s/,/ /g' > tree.nh sed 's/[()]//g; s/,/ /g' tree.nh > species.lst mkdir -p maf run cd run # stash binaries mkdir penn cp -p /cluster/bin/penn/multiz.v11.x86_64/multiz-tba/multiz penn cp -p /cluster/bin/penn/multiz.v11.x86_64/multiz-tba/maf_project penn cp -p /cluster/bin/penn/multiz.v11.x86_64/multiz-tba/autoMZ penn cat > autoMultiz.csh << 'EOF' #!/bin/csh -ef set db = hg18 set c = $1 set maf = $2 set run = `pwd` set tmp = /scratch/tmp/$db/multiz.$c set pairs = /san/sanvol1/scratch/$db/mafNet rm -fr $tmp mkdir -p $tmp cp ../tree/tree.nh ../species.lst $tmp pushd $tmp foreach s (`cat species.lst`) set in = $pairs/$s/$c.maf set out = $db.$s.sing.maf if ($s == hg18) then continue endif if (-e $in.gz) then zcat $in.gz > $out else if (-e $in) then cp $in $out else echo "##maf version=1 scoring=autoMZ" > $out endif end set path = ($run/penn $path); rehash $run/penn/autoMZ + T=$tmp E=$db "`cat tree.nh`" $db.*.sing.maf $c.maf popd cp $tmp/$c.maf $maf rm -fr $tmp 'EOF' # << happy emacs chmod +x autoMultiz.csh cat << 'EOF' > spec #LOOP ./autoMultiz.csh $(root1) {check out line+ /cluster/data/hg18/bed/multiz17way.2006-02-18/maf/$(root1).maf} #ENDLOOP 'EOF' # << happy emacs awk '{print $1}' /cluster/data/hg18/chrom.sizes > chrom.lst gensub2 chrom.lst single spec jobList para create jobList # 49 files para try para check para push # NOTE: much faster than V10 (40 hrs for hg17 V10, 14.53 hrs for hg17 V11) # Completed: 49 of 49 jobs # CPU time in finished jobs: 341776s 5696.26m 94.94h 3.96d 0.011 y # IO & Wait Time: 122801s 2046.69m 34.11h 1.42d 0.004 y # Average job time: 9481s 158.02m 2.63h 0.11d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 81334s 1355.57m 22.59h 0.94d # Submission to last job: 81334s 1355.57m 22.59h 0.94d # Load into database ssh hgwdev cd /cluster/data/hg18/bed/multiz17way/maf mkdir -p /gbdb/hg18/multiz17way/maf ln -s /cluster/data/hg18/bed/multiz17way/maf/*.maf \ /gbdb/hg18/multiz17way/maf cat > loadMaf.csh << 'EOF' time hgLoadMaf -pathPrefix=/gbdb/hg18/multiz17way/maf hg18 multiz17way cat *.maf | \ nice hgLoadMafSummary hg18 -minSize=30000 -mergeGap=1500 -maxSize=200000 multiz17waySummary stdin 'EOF' # 3213116 #<< happy emacs # expect lengthy load time for this -- a few hours ? 
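# (Clarifying note, added: hgLoadMafSummary builds the multiz17waySummary
# table, a precomputed per-window summary of alignment coverage that the
# browser uses to draw the track when zoomed way out, instead of reading every
# maf block; the -minSize/-mergeGap/-maxSize settings control how blocks are
# merged into those summary windows.)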
# csh loadMaf.csh >&! loadMaf.log & script loadMaf.log csh loadMaf.csh exit ############################################################### # PHASTCONS CONSERVATION (DONE, 2ND TIME, 2006-03-28 Fan) # This process is distilled from Hiram and Adam's experiments # on mouse (mm7) 17way track. Many parameters are now fixed, without # being experimentally derived, either because the experiments # were lengthy and produced similar results, or because they # weren't runnable given the alignment size. # These parameters are: # --rho # --expected-length # --target-coverage # Also, instead of generating cons and noncons tree models, # we use a single, pre-existing tree model -- Elliot Margulies' model # from the (37-way) ENCODE alignments. # NOTE: reusing cluster-friendly chrom fasta files created earlier ssh kkstore02 mkdir /cluster/bluearc/hg18/chrom cd /cluster/data/hg18 foreach f (`cat chrom.lst`) echo $f cp $f/*.fa /cluster/bluearc/hg18/chrom end # Split chromosome MAF's into windows and use to generate # "sufficient statistics" (ss) files for phastCons input # NOTE: as the SAN fs has lotsa space, we're leaving these # big (temp) files unzipped, to save time during phastCons run. # Note also the larger chunk sizes from previous runs -- this # reduces run-time on the split, slows down the actual phastCons # enough so jobs don't crash (jobs are very quick, just a minute # or so), and according to Adam, will produce better results. # The previous small chunks were probably required by # the phyloFit step, which we are no longer using for the # human alignments. ssh pk mkdir /cluster/data/hg18/bed/multiz17way.2006-02-18/cons cd /cluster/data/hg18/bed/multiz17way.2006-02-18/cons cp /cluster/store5/gs.18/build35/bed/multiz17way.2005-12-20/cons/elliotsEncode.mod . # edit, change to hg18, monDom4, mm8, and rn4. 
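# (Clarifying note, added: elliotsEncode.mod is a phastCons/phyloFit tree model
# file whose TREE line names the assemblies at the leaves, so "edit" above
# means renaming the leaves from the previous run's assemblies to this run's
# (hg18, monDom4, mm8, rn4) so they match the species names in the 17-way
# mafs.  "grep TREE elliotsEncode.mod" is a quick way to see what needs
# changing.)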
mkdir run.split cd run.split set WINDOWS = /san/sanvol1/scratch/hg18/multiz17way.2006-02-18/cons/ss rm -fr $WINDOWS mkdir -p $WINDOWS cat << 'EOF' > doSplit.csh #!/bin/csh -ef # unfortunately this exhausts 2G mem limit currently on pk # next time, run on mini-cluster set MAFS = /cluster/data/hg18/bed/multiz17way.2006-02-18/maf set WINDOWS = /san/sanvol1/scratch/hg18/multiz17way.2006-02-18/cons/ss cd $WINDOWS set c = $1 echo $c rm -fr $c mkdir $c /cluster/bin/phast/$MACHTYPE/msa_split $MAFS/$c.maf -i MAF \ -M /cluster/bluearc/hg18/chrom/$c.fa \ -o SS -r $c/$c -w 10000000,0 -I 1000 -B 5000 echo "Done" >> $c.done 'EOF' # << happy emacs chmod +x doSplit.csh rm -f jobList foreach f (../../maf/*.maf) set c = $f:t:r echo "doSplit.csh $c {check out line+ $WINDOWS/$c.done}" >> jobList end para create jobList # 49 jobs para try para check para push # Completed: 49 of 49 jobs # CPU time in finished jobs: 9254s 154.24m 2.57h 0.11d 0.000 y # IO & Wait Time: 15027s 250.44m 4.17h 0.17d 0.000 y # Average job time: 496s 8.26m 0.14h 0.01d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 1916s 31.93m 0.53h 0.02d # Submission to last job: 1921s 32.02m 0.53h 0.02d # check tree model on 5MB chunk, using params recommended by Adam, # (to verify branch lengths on 2X species) # he ok'ed the results -- not necessary for next human run ssh kolossus cd /cluster/data/hg18/bed/multiz17way.2006-02-18/cons /cluster/bin/phast/$MACHTYPE/phyloFit -i SS -E -p MED -s HKY85 \ --tree "`cat ../tree-commas.nh`" \ /san/sanvol1/scratch/hg18/multiz17way.2006-02-18/cons/ss/chr7/chr7.110000001-120000000.ss \ -o phyloFit.tree # Run phastCons # This job is I/O intensive in its output files, thus it is all # working over in /scratch/tmp/ # cd .. mkdir run.cons cd run.cons cat > doPhast.csh << 'EOF' #!/bin/csh -fe set c = $1 set f = $2 set len = $3 set cov = $4 set rho = $5 set tmp = /scratch/tmp/$f mkdir -p $tmp set san = /san/sanvol1/scratch/hg18/multiz17way.2006-02-18/cons cp -p $san/ss/$c/$f.ss ../elliotsEncode.mod $tmp pushd $tmp > /dev/null /cluster/bin/phast/$MACHTYPE/phastCons $f.ss elliotsEncode.mod \ --rho $rho --expected-length $len --target-coverage $cov --quiet \ --not-informative panTro1,rheMac2 \ --seqname $c --idpref $c --viterbi $f.bed --score > $f.pp popd > /dev/null mkdir -p $san/pp/$c $san/bed/$c sleep 1 mv $tmp/$f.pp $san/pp/$c mv $tmp/$f.bed $san/bed/$c rm -fr $tmp 'EOF' # emacs happy chmod a+x doPhast.csh # root1 == chrom name, file1 == ss file name without .ss suffix # Create gsub file cat > template << 'EOF' #LOOP doPhast.csh $(root1) $(file1) 14 .008 .28 #ENDLOOP 'EOF' # happy emacs # Create parasol batch and run it pushd /san/sanvol1/scratch/hg18/multiz17way.2006-02-18/cons # mkdir /cluster/data/hg18/bed/multiz17way/cons/run.cons ls -1 ss/chr*/chr*.ss | sed 's/.ss$//' > \ /cluster/data/hg18/bed/multiz17way/cons/run.cons/in.list ssh pk cd /cluster/store11/gs.19/build36/bed/multiz17way.2006-02-18/cons/run.cons gensub2 in.list single template jobList para create jobList # 337 jobs para try para check para push # NOTE: some jobs crashed due to can not stat some /san/... 
# files, but worked when pushed once again
# Completed: 337 of 337 jobs
# CPU time in finished jobs: 16000s 266.66m 4.44h 0.19d 0.001 y
# IO & Wait Time: 13307s 221.79m 3.70h 0.15d 0.000 y
# Average job time: 87s 1.45m 0.02h 0.00d
# Longest running job: 0s 0.00m 0.00h 0.00d
# Longest finished job: 173s 2.88m 0.05h 0.00d
# Submission to last job: 225s 3.75m 0.06h 0.00d

# create Most Conserved track
ssh kolossus
cd /san/sanvol1/scratch/hg18/multiz17way.2006-02-18/cons
# The sed's and the sort get the file names in chrom,start order
# (Hiram tricks -- split into columns on [.-/] with
# identifying x,y,z, to allow column sorting and
# restoring the filename.  Warning: the sort column
# will depend on how deep you are in the dir)
find ./bed -name "chr*.bed" | \
sed -e "s/\// x /g" -e "s/\./ y /g" -e "s/-/ z /g" | \
sort -k7,7 -k9,9n | \
sed -e "s/ x /\//g" -e "s/ y /\./g" -e "s/ z /-/g" | \
xargs cat | \
awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", $1, $2, $3, $5, $5;}' | \
/cluster/bin/scripts/lodToBedScore /dev/stdin > mostConserved.bed
# ~ 1 minute
cp -p mostConserved.bed /cluster/data/hg18/bed/multiz17way/cons

# load into database
ssh hgwdev
cd /cluster/data/hg18/bed/multiz17way/cons
hgLoadBed -strict hg18 phastConsElements17way mostConserved.bed
# Loaded 2037557 elements

# compare with previous tracks
hgsql hg18 -e "select count(*) from phastConsElements17way"
# 2260575
# hgsql hg18 -e "select count(*) from phastConsElements"
# hg18 does not have phastConsElements table
# 1601903

# Try for 5% overall cov, and 70% CDS cov (used elen=13, tcov=.007, rho=.27)
featureBits hg18 -enrichment refGene:cds phastConsElements17way
# refGene:cds 1.072%, phastConsElements17way 5.510%, both 0.759%, cover 70.83%, enrich 12.86x
featureBits hg17 -enrichment refGene:cds phastConsElements17way
# refGene:cds 1.064%, phastConsElements17way 5.104%, both 0.748%, cover 70.29%, enrich 13.77x
# compare with previous tracks
featureBits hg18 -enrichment refGene:cds phastConsElements10way
# refGene:cds 1.062%, phastConsElements10way 5.003%, both 0.734%, cover 69.18%, enrich 13.83x
featureBits hg18 -enrichment refGene:cds phastConsElements
# refGene:cds 1.062%, phastConsElements 4.810%, both 0.771%, cover 72.65%, enrich 15.11x

# Create merged posterior probability file and wiggle track data files
# pk is currently closer to the san than any other machine
ssh pk
cd /san/sanvol1/scratch/hg18/multiz17way.2006-02-18/cons/
# sort by chromName, chromStart so that items are in numerical order
# for wigEncode
find ./pp -name "chr*.pp" | \
sed -e "s/\// x /g" -e "s/\./ y /g" -e "s/-/ z /g" | \
sort -k7,7 -k9,9n | \
sed -e "s/ x /\//g" -e "s/ y /\./g" -e "s/ z /-/g" | \
xargs cat | \
nice wigEncode stdin phastCons17way.wig phastCons17way.wib
# about 23 minutes for above
cp -p phastCons17way.wi? /cluster/data/hg18/bed/multiz17way/cons

# Load gbdb and database with wiggle.
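# (Clarifying note, added: wigEncode above wrote a pair of files -- the .wib
# holds the binary per-base values and gets symlinked under /gbdb, while the
# .wig index is what hgLoadWiggle loads into MySQL; hence the symlink and
# hgLoadWiggle commands below.)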
ssh hgwdev
cd /cluster/data/hg18/bed/multiz17way/cons
ln -s `pwd`/phastCons17way.wib /gbdb/hg18/multiz17way/phastCons17way.wib
hgLoadWiggle -pathPrefix=/gbdb/hg18/multiz17way hg18 \
    phastCons17way phastCons17way.wig
# ~ 3 minute load

# Downloads (2006-02-22 Fan)
ssh hgwdev
cd /cluster/data/hg18/bed/multiz17way
mkdir mafDownloads
cd mafDownloads
# upstream mafs (mafFrags takes a while)
cat > mafFrags.csh << 'EOF'
date
foreach i (1000 2000 5000)
    echo "making upstream$i.maf"
    nice featureBits hg18 refGene:upstream:$i -fa=/dev/null -bed=up.bad
    awk -F '\t' '{printf("%s\t%s\t%s\t%s\t%s\t%s\n", $1, $2, $3, substr($4, 0, 9), 0, $5)}' up.bad > up.bed
    rm up.bad
    nice mafFrags hg18 multiz17way up.bed upstream$i.maf \
        -orgs=../species.lst
    rm up.bed
end
date
'EOF'
time csh mafFrags.csh > mafFrags.log
nice gzip up*.maf

ssh kkstore02
cd /cluster/data/hg18/bed/multiz17way/mafDownloads
cat > downloads.csh << 'EOF'
date
foreach f (../maf/chr*.maf)
    set c = $f:t:r
    echo $c
    nice gzip -c $f > $c.maf.gz
end
md5sum *.gz > md5sum.txt
date
'EOF'
time csh downloads.csh > downloads.log

ssh hgwdev
set dir = /usr/local/apache/htdocs/goldenPath/hg18/multiz17way
mkdir $dir
ln -s /cluster/data/hg18/bed/multiz17way/mafDownloads/{*.gz,md5sum.txt} $dir

##############################################################################
# SET DEFAULT POSITION TO chrX:151,073,054-151,383,976, TO SHOW GENE GABRA3
hgsql -e 'delete from dbDb where name="hg18"' \
    -h genome-testdb hgcentraltest
hgsql -e 'INSERT INTO dbDb (name, description, nibPath, organism, \
    defaultPos, active, orderKey, genome, scientificName, \
    htmlPath, hgNearOk, hgPbOk, sourceName) \
    VALUES("hg18", "Feb. 2006", "/gbdb/hg18/nib", "Human", \
    "chrX:151,073,054-151,383,976", 1, 9, "Human", "Homo sapiens", \
    "/gbdb/hg18/html/description.html", 1, 1, "NCBI Build 36.1");' \
    -h genome-testdb hgcentraltest

############################################################################
# HG16/HG17 -> HG18 LIFTOVER CHAINS (DONE 2/24/06 Fan)
# These chains hopefully don't suck.
# Sorry I only used the makeLoChain-align script from the set of scripts
# already created for this task.  I wanted more control.  I should mention
# I used a size of 10kb instead of 3kb for the split (blat query) sizes in
# hg18.  This had a huge effect on the number of hits in the blat, which
# then had a huge effect on the number of chains.  I should also mention
# that hg18 chromosomes chr1 and chr2 were split further
# into more than a single query file.  This helped a LOT in avoiding
# cluster hippos classically associated with those chroms.

######## LIFTOVER PREPARATION
# Split up hg18
ssh pk
cd /san/sanVol1/scratch/hg18
mkdir -p liftSplits/{split,lift}
bash
for fa in /cluster/data/hg18/?{,?,*hap*}/*.fa; do
    c=`basename $fa .fa`
    echo $c
    faSplit -lift=liftSplits/lift/${c}.lft size $fa -oneFile 10000 liftSplits/split/$c
done
mkdir -p biggerSplits/split
cd biggerSplits/
ln -s ../liftSplits/lift
cd split/
ln -s ../../liftSplits/split/* .
faSplit sequence chr1.fa 5 chr1_
faSplit sequence chr2.fa 5 chr2_
rm chr{1,2}.fa

# Make some dirs
cd /san/sanVol1/scratch
mkdir -p hg{15,16,17}
# Copy 11.ooc files to each of hg15, hg16, hg17 dirs.
cp -p /cluster/store5/gs.16/build33/11.ooc hg15
cp -p /cluster/store4/gs.17/build34/11.ooc hg16
cp -p /cluster/store5/gs.18/build35/11.ooc hg17

## First, copy over Andy's scripts.
mkdir -p /san/sanVol1/scratch/fan cp -p /san/sanVol1/scratch/fan/*.sh /san/sanVol1/scratch/fan cp /san/sanVol1/scratch/andy/psl.header /san/sanVol1/scratch/fan ######## LIFTOVER BLATING # HG16 ssh pk cd /cluster/data/hg16 makeLoChain-align hg16 /scratch/hg/hg16/bothMaskedNibs hg18 \ /san/sanVol1/scratch/hg18/biggerSplits/split cd bed/ mv blat.hg18.2006-02-24 /san/sanVol1/scratch/hg16 cd /san/sanVol1/scratch/hg16/blat.hg18.2006-02-24/run/ sed 's/^blat/blat.sh/; s/\}.*$/}/' spec | awk '{print "/san/sanVol1/scratch/fan/" $0 " hg16ToHg18"}' > newspec para create newspec para try para push # Completed: 2394 of 2394 jobs # CPU time in finished jobs: 623927s 10398.79m 173.31h 7.22d 0.020 y # IO & Wait Time: 13255s 220.91m 3.68h 0.15d 0.000 y # Average job time: 266s 4.44m 0.07h 0.00d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 3613s 60.22m 1.00h 0.04d # Submission to last job: 4112s 68.53m 1.14h 0.05d # HG17 ssh pk cd /cluster/data/hg17 makeLoChain-align hg17 /scratch/hg/hg17/bothMaskedNibs hg18 /san/sanVol1/scratch/hg18/biggerSplits/split cd bed/ mv blat.hg18.2006-02-24/ /san/sanVol1/scratch/hg17 cd /san/sanVol1/scratch/hg17/blat.hg18.2006-02-24/run/ sed 's/^blat/blat.sh/; s/\}.*$/}/' spec | awk '{print "/san/sanVol1/scratch/fan/" $0 " hg17ToHg18"}' > newspec para create newspec para try para push # Completed: 2622 of 2622 jobs # CPU time in finished jobs: 618557s 10309.28m 171.82h 7.16d 0.020 y # IO & Wait Time: 13735s 228.92m 3.82h 0.16d 0.000 y # Average job time: 241s 4.02m 0.07h 0.00d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 3655s 60.92m 1.02h 0.04d # Submission to last job: 4228s 70.47m 1.17h 0.05d ######## LIFTOVER CHAINING # LIFTING ssh pk cd /san/sanVol1/scratch/fan cp mm7SplitLift.sh hg18SplitLift.sh # change andy to fan, mm7 to hg18, and chrX to chr2, and remove chrUn_random vi hg18SplitLift.sh cat << 'EOF' > hg18ChainMergeSplit.sh #!/bin/bash cp -r chainRaw/ /scratch/fan/hg18Lifts pushd /scratch/fan/hg18Lifts mkdir chain /cluster/bin/x86_64/chainMergeSort chainRaw/*.chain | /cluster/bin/x86_64/chainSplit chain stdin cp -r chain `dirs +1` rm -rf chain chainRaw 'EOF' chmod +x hg18ChainMergeSplit.sh # HG16 cd /san/sanVol1/scratch/hg16/blat.hg18.2006-02-24/raw /san/sanVol1/scratch/fan/hg18SplitLift.sh cd ../ mkdir chainRun chainRaw cd chainRun cat > gsub << 'EOF' #LOOP /cluster/bin/x86_64/axtChain -verbose=0 -linearGap=medium -psl $(path1) /scratch/hg/hg16/bothMaskedNibs /san/sanVol1/scratch/hg18/nib {check out line+ ../chainRaw/$(root1).chain} #ENDLOOP 'EOF' ls -1S ../psl/*.psl > in.lst gensub2 in.lst single gsub spec para create spec para push para time # Completed: 49 of 49 jobs # CPU time in finished jobs: 3599s 59.98m 1.00h 0.04d 0.000 y # IO & Wait Time: 1040s 17.34m 0.29h 0.01d 0.000 y # Average job time: 95s 1.58m 0.03h 0.00d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 303s 5.05m 0.08h 0.00d # Submission to last job: 303s 5.05m 0.08h 0.00d # HG17 cd /san/sanVol1/scratch/hg17/blat.hg18.2006-02-24/raw /san/sanVol1/scratch/fan/hg18SplitLift.sh cd ../ mkdir chainRun chainRaw cd chainRun cat > gsub << 'EOF' #LOOP /cluster/bin/x86_64/axtChain -verbose=0 -linearGap=medium -psl $(path1) /scratch/hg/hg17/bothMaskedNibs /san/sanVol1/scratch/hg18/nib {check out line+ ../chainRaw/$(root1).chain} #ENDLOOP 'EOF' ls -1S ../psl/*.psl > in.lst gensub2 in.lst single gsub spec para create spec para try para push para time # Completed: 49 of 49 jobs # CPU time in finished jobs: 3671s 61.19m 1.02h 0.04d 0.000 y # IO & Wait 
Time: 1186s 19.76m 0.33h 0.01d 0.000 y # Average job time: 99s 1.65m 0.03h 0.00d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 282s 4.70m 0.08h 0.00d # Submission to last job: 282s 4.70m 0.08h 0.00d ######### CHAINMERGE/NET/NETSUBSET ssh kolossus mkdir -p /scratch/fan/hg18Lifts cd /scratch/fan/hg18Lifts cp -rp /san/sanvol1/scratch/hg17/blat.hg18.2006-02-24/chainRaw/ . mkdir chain time /cluster/bin/x86_64/chainMergeSort chainRaw/* | /cluster/bin/x86_64/chainSplit chain stdin cp -rp chain /san/sanvol1/scratch/hg17/blat.hg18.2006-02-24/ mv chain chain.17 # remove it later rm -rf chain.17 cp -r /san/sanvol1/scratch/hg16/blat.hg18.2006-02-24/chainRaw/ . mkdir chain /cluster/bin/x86_64/chainMergeSort chainRaw/* | /cluster/bin/x86_64/chainSplit chain stdin # about 30 minutes. cp -rp chain /san/sanvol1/scratch/hg16/blat.hg18.2006-02-24/ rm -rf chain* ssh pk cd /san/sanvol1/scratch/fan cat << 'EOF' > netOver.sh #!/bin/bash chain=$1 chrom=`basename $chain .chain` sizesHGOld=$2 sizesHG18=/cluster/data/hg18/chrom.sizes chainDir=`dirname $chain` blatDir=`dirname $chainDir` net=${blatDir}/net/${chrom}.net over=${blatDir}/over/${chrom}.over mkdir -p ${blatDir}/{over,net} /cluster/bin/x86_64/chainNet $chain $sizesHGOld $sizesHG18 $net /dev/null /cluster/bin/x86_64/netChainSubset $net $chain $over 'EOF' chmod +x netOver.sh mkdir netRun cd netRun/ find /san/sanVol1/scratch/hg16/blat.hg18.2006-02-24/chain -name "*.chain" \ | awk '{print "/san/sanVol1/scratch/fan/netOver.sh " $1 " /cluster/data/hg16/chrom.sizes"}' >> spec find /san/sanVol1/scratch/hg17/blat.hg18.2006-02-24/chain -name "*.chain" \ | awk '{print "/san/sanVol1/scratch/fan/netOver.sh " $1 " /cluster/data/hg17/chrom.sizes"}' >> spec para create spec para push para time # Completed: 88 of 88 jobs # CPU time in finished jobs: 881s 14.68m 0.24h 0.01d 0.000 y # IO & Wait Time: 284s 4.74m 0.08h 0.00d 0.000 y # Average job time: 13s 0.22m 0.00h 0.00d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 33s 0.55m 0.01h 0.00d # Submission to last job: 73s 1.22m 0.02h 0.00d # seems much faster than mm7. ########## FINISHING ssh hgwdev # HG16 cd /san/sanvol1/scratch/hg16/blat.hg18.2006-02-24/over cat * >> ../hg16ToHg18.over.chain cd ../ rm -rf psl/ net/ chain/ chainRaw/ over/ cd ../ cp -rp blat.hg18.2006-02-24/ /cluster/data/hg16/bed cd /cluster/data/hg16/bed ln -s blat.hg18.2006-02-24 blat.hg18 ln -s `pwd`/blat.hg18/hg16ToHg18.over.chain liftOver/hg16ToHg18.over.chain ln -s `pwd`/liftOver/hg16ToHg18.over.chain /gbdb/hg16/liftOver/hg16ToHg18.over.chain mkdir -p /usr/local/apache/htdocs/goldenPath/hg16/liftOver cd /usr/local/apache/htdocs/goldenPath/hg16/liftOver cp /gbdb/hg16/liftOver/hg16ToHg18.over.chain . gzip hg16ToHg18.over.chain hgAddLiftOverChain hg16 hg18 /gbdb/hg16/liftOver/hg16ToHg18.over.chain # HG17 cd /san/sanvol1/scratch/hg17/blat.hg18.2006-02-24/over cat * >> ../hg17ToHg18.over.chain cd ../ rm -rf psl/ net/ chain/ chainRaw/ over/ cd ../ cp -r blat.hg18.2006-02-24/ /cluster/data/hg17/bed cd /cluster/data/hg17/bed ln -s blat.hg18.2006-02-24 blat.hg18 ln -s `pwd`/blat.hg18/hg17ToHg18.over.chain liftOver/hg17ToHg18.over.chain ln -s `pwd`/liftOver/hg17ToHg18.over.chain /gbdb/hg17/liftOver/hg17ToHg18.over.chain mkdir -p /usr/local/apache/htdocs/goldenPath/hg17/liftOver cd /usr/local/apache/htdocs/goldenPath/hg17/liftOver cp /gbdb/hg17/liftOver/hg17ToHg18.over.chain . 
gzip hg17ToHg18.over.chain hgAddLiftOverChain hg17 hg18 /gbdb/hg17/liftOver/hg17ToHg18.over.chain ############################################################################ ## BLASTZ swap from mm8 alignments (DONE - 2006-02-18 - Hiram) ssh pk cd /cluster/data/mm8/bed/blastzHg18.2006-02-16 time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -swap -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \ `pwd`/DEF > swap.out 2>&1 & time nice -n +19 featureBits hg18 chainMm8Link # 994530182 bases of 2881515245 (34.514%) in intersection # GENOSCOPE TETRAODON (tetNig1) ECORES (DONE, 2006-03-03, Fan) # GENOSCOPE TETRAODON (tetNig1) ECORES (REBUILT, 2006-04-04, Fan) ssh kkstore02 mkdir -p /cluster/data/hg18/bed/ecoresTetNig1 cd /cluster/data/hg18/bed/ecoresTetNig1 wget --timestamp \ http://www.genoscope.cns.fr/externe/tetraodon/Data3/ecoresHumBuild36/EXOFISH_HS_WITH_TN.gff wget --timestamp \ http://www.genoscope.cns.fr/externe/tetraodon/Data3/ecoresHumBuild36/EXOFISH_TN_WITH_HS.gff # this is in gff format # remove "Ecotig" from name field sed -e 's/Ecotig EG/EG/g' EXOFISH_HS_WITH_TN.gff |sed -e 's/CHR//' > ExofishHs36Tnig1.gff # sed -e 's/Ecotig EG/EG/g' ExofishHs36Tnig1 > ExofishHs36Tnig1.gff # need to have tabs between fields not a space to load file into table sed -e 's/ /\t/g' ExofishHs36Tnig1.gff > Hs36Tnig1format.gff # if "ecore" is changed to "CDS" and "ecotig" to "transcript" this loads # correctly into the table. sed -e 's/ecore/CDS/' Hs36Tnig1format.gff | sed -e 's/ecotig/transcript/' \ | cut -f 1-8,11 > Hg18vstetNig1.gff # add "chr" in front of the chromsome name in first field (2005-02-08) perl -pi.bak -e 's/^([0-9XYM]{1,2})/chr$1/' Hg18vstetNig1.gff rm *.bak # need to reload table ssh hgwdev cd /cluster/data/hg18/bed/ecoresTetNig1 echo 'drop table ecoresTetNig1;' | hgsql hg18 nice ldHgGene hg18 ecoresTetNig1 Hg18vstetNig1.gff ######################################################################### # BUILD MAF ANNOTATION FOR MULTIZ17WAY (DONE 2006-03-07, Fan) ssh kkstore01 cd /cluster/data/monDom4 twoBitInfo -nBed monDom4.2bit monDom4.N.bed cd /cluster/data/rn4 twoBitInfo -nBed rn4.2bit rn4.N.bed cd /cluster/data/mm8 twoBitInfo -nBed mm8.2bit mm8.N.bed ssh kolossus cd /cluster/data/hg18/bed/multiz17way mkdir anno cd anno mkdir maf run cd run rm sizes nBeds foreach i (`cat /cluster/data/hg18/bed/multiz17way/species.lst`) ln -s /cluster/data/$i/chrom.sizes $i.len ln -s /cluster/data/$i/$i.N.bed $i.bed echo $i.bed >> nBeds echo $i.len >> sizes end echo date > jobs.csh foreach i (../../maf/*.maf) echo nice mafAddIRows -nBeds=nBeds -sizes=sizes $i /cluster/data/hg18/hg18.2bit ../maf/`basename $i` >> jobs.csh echo "echo $i" >> jobs.csh end echo date >> jobs.csh # do smaller jobs first tac jobs.csh > jobsRev.csh mv jobsRev.csh jobs.csh csh jobs.csh > jobs.log # This took 10 hours. Hg17 took 1.5 hrs. ssh kolossus # loading here because summary table load crashed on hgwdev cd /cluster/data/hg18/bed/multiz17way/anno/maf mkdir -p /gbdb/hg18/multiz17way/anno/maf ln -s /cluster/data/hg18/bed/multiz17way/anno/maf/*.maf \ /gbdb/hg18/multiz17way/anno/maf cat > loadMaf.csh << 'EOF' date hgLoadMaf -pathPrefix=/gbdb/hg18/multiz17way/anno/maf \ hg18 multiz17way date cat *.maf | \ nice hgLoadMafSummary hg18 -minSize=30000 -mergeGap=1500 \ -maxSize=200000 multiz17waySummary stdin date 'EOF' csh loadMaf.csh > loadMaf.log ssh kkstore02 cd /cluster/data/hg18/bed/multiz17way mkdir frames cd frames cp /cluster/data/mm7/bed/multiz17wayFrames/mkMafFrames . 
cp /cluster/data/mm7/bed/multiz17wayFrames/Makefile . #edit Makefile to correct species names cat > copy.csh << 'EOF' set dir = /cluster/bluearc/hg18/multiz17way/frames/maf mkdir -p $dir foreach i (../maf/*.maf) echo $i cp -p $i $dir end 'EOF' csh copy.csh > copy.log ssh hgwdev cd /cluster/data/hg18/bed/multiz17way/frames time make getGenes > getGenes.log # 26.100u 4.360s 1:02.78 48.5% 0+0k 0+0io 29643pf+0w time make getFrames > getFrames.log # Batch failed after 4 tries on ../mkMafFrames bosTau2 hg18 /san/sanvol1/scratch/hg18/multiz17way/frames/genes/bosTau2.gp.gz /cluster/data/hg18/bed/multiz17way/maf/chr1.maf /san/sanvol1/scratch/hg18/multiz17way/frames/mafFrames/bosTau2/chr1.mafFrames #make[1]: *** [mafFrames/bosTau2.cluster.done] Error 255 # copy Makefile to Makefile.try2 and remove bosTau2 time make -f Makefile.try2 getFrames > getFrames.try2.log # copy Makefile to Makefile.try3 and with only bosTau2 remains time make -f Makefile.try3 getGenes > getGenes.try3.log time make -f Makefile.try3 getFrames > getFrames.try3.log time make -f Makefile.try3 getFrames > getFrames.try5.log time make -f Makefile.try3 getFrames > getFrames.try6.log # Finally after Mark fixed the bug and recompiled, it worked. time make -f Makefile.try3 getFrames > getFrames.try7.log time make loadDb > loadDb.log ######################################################################### # Build maf annotation for multiz17way (STARTED 2006-02-28, DONE 2006-03-09, Fan) # rebuild frames to get bug fix, using 1-pass maf methodology (2006-06-09 markd) ssh kkstore01 cd /cluster/data/monDom4 twoBitInfo -nBed monDom4.2bit monDom4.N.bed cd /cluster/data/rn4 twoBitInfo -nBed rn4.2bit rn4.N.bed cd /cluster/data/mm8 twoBitInfo -nBed mm8.2bit mm8.N.bed ssh kolossus cd /cluster/data/hg18/bed/multiz17way mkdir anno cd anno mkdir maf run cd run rm sizes nBeds foreach i (`cat /cluster/data/hg18/bed/multiz17way/species.lst`) ln -s /cluster/data/$i/chrom.sizes $i.len ln -s /cluster/data/$i/$i.N.bed $i.bed echo $i.bed >> nBeds echo $i.len >> sizes end echo date > jobs.csh foreach i (../../maf/*.maf) echo nice mafAddIRows -nBeds=nBeds -sizes=sizes $i /cluster/data/hg18/hg18.2bit ../maf/`basename $i` >> jobs.csh echo "echo $i" >> jobs.csh end echo date >> jobs.csh # do smaller jobs first tac jobs.csh > jobsRev.csh mv jobsRev.csh jobs.csh csh jobs.csh > jobs.log # This took 10 hours. Hg17 took 1.5 hrs. ssh hgwdev # loading here because summary table load crashed on hgwdev cd /cluster/data/hg18/bed/multiz17way/anno/maf mkdir -p /gbdb/hg18/multiz17way/anno/maf ln -s /cluster/data/hg18/bed/multiz17way/anno/maf/*.maf \ /gbdb/hg18/multiz17way/anno/maf cat > loadMaf.csh << 'EOF' date hgLoadMaf -pathPrefix=/gbdb/hg18/multiz17way/anno/maf \ hg18 multiz17way date cat *.maf | \ nice hgLoadMafSummary hg18 -minSize=30000 -mergeGap=1500 \ -maxSize=200000 multiz17waySummary stdin date 'EOF' csh loadMaf.csh > loadMaf.log # Dropped unused indexes (2006-05-09 kate) # NOTE: this is not required in the future, as the loader # has been fixed to not generate these indexes hgsql hg18 -e "alter table multiz17waySummary drop index chrom_2" hgsql hg18 -e "alter table multiz17waySummary drop index chrom_3" ssh kkstore02 cd /cluster/data/hg18/bed/multiz17way mkdir frames cd frames cp /cluster/data/mm7/bed/multiz17wayFrames/mkMafFrames . cp /cluster/data/mm7/bed/multiz17wayFrames/Makefile . # !!! NEXT TIME, COPY ALL maf FILES OVER TO san TO AVOID kkstore02 OVERLOAD. 
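# (Added suggestion, not run for this build.)  Staging the maf copies on the
# san, as the note above suggests, could be as simple as pointing the copy
# loop (or an rsync) at the san path used by the commented-out loop below:
#   rsync -av /cluster/data/hg18/bed/multiz17way/maf/ \
#       /san/sanvol1/scratch/hg18/multiz17wayFrames/maf/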
# edit Makefile to correct species names cat > copy.csh << 'EOF' set dir = /cluster/bluearc/hg18/multiz17way/frames/maf mkdir -p $dir foreach i (../maf/*.maf) echo $i cp -p $i $dir end 'EOF' csh copy.csh > copy.log #for i in ../../maf/*.maf; do echo $i; cp $i /san/sanvol1/scratch/hg18/multiz17wayFrames/maf/$i; done ssh hgwdev cd /cluster/data/hg18/bed/multiz17way/frames time make getGenes > getGenes.log # 26.100u 4.360s 1:02.78 48.5% 0+0k 0+0io 29643pf+0w time make getFrames > getFrames.log # ~2 hours time make loadDb > loadDb.log ### # rebuild frames to get bug fix, using 1-pass maf methodology # (2006-06-09 markd) ssh kkstore02 cd /cluster/data/hg18/bed/multiz17way/frames mv mafFrames/ mafFrames.old2 nice tcsh # easy way to get process niced (cat ../maf/*.maf | time genePredToMafFrames hg18 stdin stdout bosTau2 genes/bosTau2.gp.gz canFam2 genes/canFam2.gp.gz danRer3 genes/danRer3.gp.gz fr1 genes/fr1.gp.gz galGal2 genes/galGal2.gp.gz hg18 genes/hg18.gp.gz mm8 genes/mm8.gp.gz oryCun1 genes/oryCun1.gp.gz panTro1 genes/panTro1.gp.gz rheMac2 genes/rheMac2.gp.gz rn4 genes/rn4.gp.gz xenTro1 genes/xenTro1.gp.gz | gzip >multiz17way.mafFrames.gz)>&frames.log& ssh hgwdev cd /cluster/data/hg18/bed/multiz17way/frames hgLoadMafFrames hg18 multiz17wayFrames multiz17way.mafFrames.gz >&log& ########################################################################## # BUILD ALLEN BRAIN TRACK (DONE 03/11/06 Fan) # Make the working directory ssh hgwdev cd /cluster/data/hg18/bed mkdir allenBrain cd allenBrain # Remap the probe alignments from mm7 to hg18 zcat /gbdb/mm7/liftOver/mm7ToHg18.over.chain.gz \ | pslMap -chainMapFile -swapMap \ /cluster/data/mm7/bed/allenBrain/allenBrainAli.psl stdin stdout \ | sort -k 14,14 -k 16,16n > unscored.psl pslRecalcMatch unscored.psl /cluster/data/hg18/nib \ /cluster/data/mm7/bed/allenBrain/allProbes.fa allenBrainAli.psl # Load the database hgsql hg18 < ~/kent/src/hg/lib/allenBrainUrl.sql hgsql hg18 -e 'load data local infile "/cluster/data/mm7/bed/allenBrain/allenBrainUrl.tab" into table allenBrainUrl;' hgLoadPsl hg18 allenBrainAli.psl mkdir /gbdb/hg18/allenBrain ln -s /cluster/data/mm7/bed/allenBrain/allProbes.fa /gbdb/hg18/allenBrain/allProbes.fa hgLoadSeq hg18 /gbdb/hg18/allenBrain/allProbes.fa # Make mapping between known genes and allenBrain hgMapToGene hg18 allenBrainAli -type=psl knownGene knownToAllenBrain ########################################################################## #### Blat knownGene proteins to determine exons # (DONE - 2006-03-15 - 2006-03-24 - hiramc) ssh hgwdev cd /cluster/data/hg18/bed mkdir blat.hg18KG.2006-03-15 rm blat.hg18KG ln -s blat.hg18KG.2006-03-15 blat.hg18KG cd blat.hg18KG pepPredToFa hg18 knownGenePep known.fa # The kluster run ssh pk cd /cluster/data/hg18/bed/blat.hg18KG cat << '_EOF_' > blatSome #!/bin/csh -fe blat -t=dnax -q=prot -out=pslx /scratch/hg/gs.19/build36/bothMaskedNibs/$1.nib \ kgfa/$2.fa $3 '_EOF_' # << keep emacs happy chmod +x blatSome ls -1S /scratch/hg/gs.19/build36/bothMaskedNibs > human.lst mkdir kgfa cd kgfa # This split should be done on the file server, not over NFS faSplit sequence ../known.fa 3000 kg ls -1S *.fa > ../kg.lst cd .. cat << '_EOF_' > template #LOOP blatSome $(root1) $(root2) {check out line psl/$(root1)/$(root2).psl} #ENDLOOP '_EOF_' # << keep emacs happy gensub2 human.lst kg.lst template jobList mkdir psl cd psl sed -e "s/.nib//" ../human.lst | xargs mkdir cd .. para create jobList para try ... check ... push ... 
etc # Completed: 142100 of 142100 jobs # CPU time in finished jobs: 7520598s 125343.30m 2089.06h 87.04d 0.238 y # IO & Wait Time: 415523s 6925.38m 115.42h 4.81d 0.013 y # Average job time: 56s 0.93m 0.02h 0.00d # Longest finished job: 5737s 95.62m 1.59h 0.07d # Submission to last job: 72538s 1208.97m 20.15h 0.84d ssh kkstore02 cd /cluster/data/hg18/bed/blat.hg18KG.2006-03-15 pslSort dirs raw.psl /tmp psl/* # -rw-rw-r-- 1 568238823 Mar 20 13:30 raw.psl pslReps -nohead -minCover=0.9 -minAli=0.9 raw.psl cooked.psl /dev/null # -rw-rw-r-- 1 43446007 Mar 24 11:13 cooked.psl pslUniq cooked.psl hg18KG.psl # -rw-rw-r-- 1 41321225 Mar 24 11:14 hg18KG.psl cut -f 10 hg18KG.psl > kgName.lst faSomeRecords known.fa kgName.lst hg18KG.fa faSize hg18KG.fa # 16419953 bases (12961273 N's 3458680 real 3458680 upper 0 lower) # in 36727 sequences in 1 files faSize known.fa # 16430067 bases (12969298 N's 3460769 real 3460769 upper 0 lower) # in 36798 sequences in 1 files # You may need to build this pslxToFa - it is not in the standard build pslxToFa hg18KG.psl hg18KG_ex.fa -liftTarget=genome.lft \ -liftQuery=protein.lft # -rw-rw-r-- 1 11294262 Mar 24 11:31 protein.lft # -rw-rw-r-- 1 21428637 Mar 24 11:31 hg18KG_ex.fa # -rw-rw-r-- 1 14324928 Mar 24 11:31 genome.lft wc -l *.psl *.lft *.fa kgName.lst # 39908 cooked.psl # 36727 hg18KG.psl # 1521400 raw.psl # 303516 genome.lft # 303516 protein.lft # 383037 hg18KG.fa # 607032 hg18KG_ex.fa # 383348 known.fa # 36727 kgName.lst # 3615211 total # back on hgwdev ssh hgwdev cd /cluster/data/hg18/bed/blat.hg18KG kgName hg18 hg18KG.psl blastKGRef04 # After about an hour, it exited with this message: # sqlFreeConnection called on cache (hg18) that doesn't contain # the given connection # This may be a lurking error in this program, because the # resulting file seems to have the correct number of lines: hgsql hg18 < ~/kent/src/hg/lib/blastRef.sql echo "rename table blastRef to blastKGRef04" | hgsql hg18 echo "load data local infile 'blastKGRef04' into table blastKGRef04" | hgsql hg18 wc -l kgName.lst blastKGRef04 hg18KG.psl # 36727 kgName.lst # 36727 blastKGRef04 # 36727 hg18KG.psl # 110181 total hgPepPred hg18 generic blastKGPep04 hg18KG.fa # end blat proteins ########################################################################## # BUILD NIBB IMAGE PROBES (DONE 2006-03-14 galt following Jim's hg17 example) # Make directory on san for cluster job and copy in sequence ssh pk mkdir /san/sanvol1/scratch/hg18/nibbPics cd /san/sanvol1/scratch/hg18/nibbPics cp /cluster/data/xenTro1/bed/nibbPics/nibbImageProbes.fa . # Make parasol job dir and sequence list files mkdir run cd run mkdir psl ls -1 /san/sanvol1/scratch/hg18/nib/*.nib > genome.lst echo ../nibbImageProbes.fa > mrna.lst # Create parasol gensub file file cat << '_EOF_' > gsub #LOOP blatz -rna -minScore=6000 -out=psl $(path1) $(path2) psl/$(root1)_$(root2).psl #ENDLOOP '_EOF_' # Create parasol batch gensub2 genome.lst mrna.lst gsub spec para create spec # Do para try/push/time etc. 
#Completed: 49 of 49 jobs
#CPU time in finished jobs: 12585s 209.74m 3.50h 0.15d 0.000 y
#IO & Wait Time: 411s 6.86m 0.11h 0.00d 0.000 y
#Average job time: 265s 4.42m 0.07h 0.00d
#Longest running job: 0s 0.00m 0.00h 0.00d
#Longest finished job: 1145s 19.08m 0.32h 0.01d
#Submission to last job: 1195s 19.92m 0.33h 0.01d

# Make sort and filter
catDir psl | sort -k 10 \
    | pslReps stdin stdout /dev/null -nohead -minAli=0.60 -nearTop=0.001 -minCover=0.10 -minNearTopSize=80 \
    | sort -k 14,14 -k 16,16n \
    | sed 's#/san/sanvol1/scratch/hg18/nib/chr#chr#' \
    | sed 's/.nib//' > ../nibbImageProbes.psl

# Make bed file and copy in stuff
ssh hgwdev
cd /cluster/data/hg18/bed
mkdir nibbPics
cd nibbPics
cp /cluster/data/xenTro1/bed/nibbPics/nibbImageProbes.fa .
cp /san/sanvol1/scratch/hg18/nibbPics/nibbImageProbes.psl .

# Load into database
ln -s /cluster/data/xenTro1/bed/nibbPics/nibbImageProbes.fa /gbdb/hg18/nibbImageProbes.fa
hgLoadSeq hg18 /gbdb/hg18/nibbImageProbes.fa
hgLoadPsl hg18 nibbImageProbes.psl

##########################################################################
# UPDATED hg18.knownToVisiGene (2006-03-15 galt)
# after making sure hg18.vgAllProbes was up to date (see makeVisiGene.doc)
ssh hgwdev
knownToVisiGene hg18 -fromProbePsl=vgAllProbes

##########################################################################
# GENERATE SUMMARY STATISTICS (DONE, Fan 3/18/06)
ssh hgwdev
cd /cluster/data/hg18
mkdir stat
cd stat
stats.pl ~/hg18 >hg18.pl.out
hgCalStat hg18.pl.out hg18 hg18.out
cp hg18.out hg18.out.sorted
# Edit hg18.out.sorted to order by chromosomes and
# replace the "?" in the Y chrom line with 6265435 and align its position.
vi hg18.out.sorted
# Add the hg18 stats to goldenPath/stats.html
cd ~/browser/goldenPath
# insert hg18.out.sorted into stats.html and add the necessary
# surrounding HTML lines for the hg18 section.
vi stats.html
cvs update stats.html
cvs commit stats.html

# Change description of hg18, per suggestion by Kim at NCBI (3/20/06, Fan).
ssh hgwdev
echo "update dbDb set description='Mar. 2006' where name = 'hg18';" \
    | hgsql -h genome-testdb hgcentraltest

############################################################################
# hg18 -> hg17 LIFTOVER CHAINS (DONE 3/20/06 Fan)

# I used a size of 10kb instead of 3kb for the split (blat query) sizes in
# hg17.  This had a huge effect on the number of hits in the blat, which
# then had a huge effect on the number of chains.  I should also mention
# that hg17 chromosomes chr1 and chr2 were split further
# into more than a single query file.  This helped a LOT in avoiding
# cluster hippos classically associated with those chroms.

######## LIFTOVER PREPARATION

# The following paragraph was already done during the hg15 to hg17 liftover build.

# Split up hg17
ssh pk
cd /san/sanVol1/scratch/hg17
mkdir -p liftSplits/{split,lift}
bash
for fa in /cluster/data/hg17/?{,?,*hap*}/*.fa; do
    c=`basename $fa .fa`
    echo $c
    faSplit -lift=liftSplits/lift/${c}.lft size $fa -oneFile 10000 liftSplits/split/$c
done
mkdir -p biggerSplits/split
cd biggerSplits/
ln -s ../liftSplits/lift
cd split/
ln -s ../../liftSplits/split/* .
faSplit sequence chr1.fa 5 chr1_
faSplit sequence chr2.fa 5 chr2_
rm chr{1,2}.fa

# Make some dirs
cd /san/sanVol1/scratch
mkdir -p hg18

# Copy 11.ooc files to hg18 subdirectory.
# cp -p /cluster/store5/gs.16/build33/11.ooc hg18

## First, copy over scripts.
(Already done before) # mkdir -p /san/sanVol1/scratch/fan # cp -p /san/sanVol1/scratch/fan/*.sh /san/sanVol1/scratch/fan # cp /san/sanVol1/scratch/andy/psl.header /san/sanVol1/scratch/fan ######## LIFTOVER BLATING # HG18 ssh pk cd /cluster/data/hg18 makeLoChain-align hg18 /scratch/hg/hg18/nib hg17 /san/sanVol1/scratch/hg17/biggerSplits/split cd bed mv blat.hg17.2006-03-20 /san/sanVol1/scratch/hg18 cd /san/sanVol1/scratch/hg18/blat.hg17.2006-03-20/run/ sed 's/^blat/blat.sh/; s/\}.*$/}/' spec | awk '{print "/san/sanVol1/scratch/fan/" $0 " hg18ToHg17"}' > newspec para create newspec para try para push # Completed: 2646 of 2646 jobs # CPU time in finished jobs: 633021s 10550.35m 175.84h 7.33d 0.020 y # IO & Wait Time: 14063s 234.39m 3.91h 0.16d 0.000 y # Average job time: 245s 4.08m 0.07h 0.00d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 3645s 60.75m 1.01h 0.04d # Submission to last job: 6153s 102.55m 1.71h 0.07d ######## LIFTOVER CHAINING # LIFTING ssh pk cd /san/sanVol1/scratch/fan cp mm7SplitLift.sh hg17SplitLift.sh # change andy to fan, mm7 to hg17, and chrX to chr2, and remove chrUn_random vi hg17SplitLift.sh cat << 'EOF' > hg17ChainMergeSplit.sh #!/bin/bash cp -r chainRaw/ /scratch/fan/hg17Lifts pushd /scratch/fan/hg17Lifts mkdir chain /cluster/bin/x86_64/chainMergeSort chainRaw/*.chain | /cluster/bin/x86_64/chainSplit chain stdin cp -r chain `dirs +1` rm -rf chain chainRaw 'EOF' chmod +x hg17ChainMergeSplit.sh # HG18 cd /san/sanVol1/scratch/hg18/blat.hg17.2006-03-20/raw /san/sanVol1/scratch/fan/hg17SplitLift.sh cd ../ mkdir chainRun chainRaw cd chainRun cat > gsub << 'EOF' #LOOP /cluster/bin/x86_64/axtChain -verbose=0 -linearGap=medium -psl $(path1) /scratch/hg/hg18/nib /san/sanVol1/scratch/hg17/nib {check out line+ ../chainRaw/$(root1).chain} #ENDLOOP 'EOF' ls -1S ../psl/*.psl > in.lst gensub2 in.lst single gsub spec para create spec para try para push para time # Completed: 46 of 46 jobs # CPU time in finished jobs: 3713s 61.88m 1.03h 0.04d 0.000 y # IO & Wait Time: 1284s 21.41m 0.36h 0.01d 0.000 y # Average job time: 109s 1.81m 0.03h 0.00d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 310s 5.17m 0.09h 0.00d # Submission to last job: 310s 5.17m 0.09h 0.00d ######### CHAINMERGE/NET/NETSUBSET ssh kolossus mkdir -p /scratch/fan/hg17Lifts cd /scratch/fan/hg17Lifts cp -r /san/sanvol1/scratch/hg18/blat.hg17.2006-03-20/chainRaw/ . mkdir chain /cluster/bin/x86_64/chainMergeSort chainRaw/* | /cluster/bin/x86_64/chainSplit chain stdin # about 30 minutes. 
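# (Added note, not in the original log.)  chainMergeSort needs all of the
# per-job chain files in one pass so it can renumber the chain ids uniquely
# and emit them in score order; chainSplit then writes one chain file per
# target chromosome, which is what the per-chromosome netOver.sh jobs below
# expect.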
cp -rp chain /san/sanvol1/scratch/hg18/blat.hg17.2006-03-20/
rm -rf chain
rm -rf chainRaw

ssh pk
cd /san/sanvol1/scratch/fan
cat << 'EOF' > netOver.sh
#!/bin/bash
chain=$1
chrom=`basename $chain .chain`
sizesHGOld=$2
sizesHG17=/cluster/data/hg17/chrom.sizes
chainDir=`dirname $chain`
blatDir=`dirname $chainDir`
net=${blatDir}/net/${chrom}.net
over=${blatDir}/over/${chrom}.over
mkdir -p ${blatDir}/{over,net}
/cluster/bin/x86_64/chainNet $chain $sizesHGOld $sizesHG17 $net /dev/null
/cluster/bin/x86_64/netChainSubset $net $chain $over
'EOF'
chmod +x netOver.sh
mkdir netRun
cd netRun/
find /san/sanVol1/scratch/hg18/blat.hg17.2006-03-20/chain -name "*.chain" \
    | awk '{print "/san/sanVol1/scratch/fan/netOver.sh " $1 " /cluster/data/hg18/chrom.sizes"}' > spec
para create spec
para push
para time
# Completed: 49 of 49 jobs
# CPU time in finished jobs: 431s 7.18m 0.12h 0.00d 0.000 y
# IO & Wait Time: 151s 2.52m 0.04h 0.00d 0.000 y
# Average job time: 12s 0.20m 0.00h 0.00d
# Longest running job: 0s 0.00m 0.00h 0.00d
# Longest finished job: 30s 0.50m 0.01h 0.00d
# Submission to last job: 43s 0.72m 0.01h 0.00d

########## FINISHING
ssh hgwdev
# HG18
cd /san/sanvol1/scratch/hg18/blat.hg17.2006-03-20/over
cat * >> ../hg18ToHg17.over.chain
cd ../
rm -rf psl/ net/ chain/ chainRaw/ over/
cd ../
cp -rp blat.hg17.2006-03-20/ /cluster/data/hg18/bed
cd /cluster/data/hg18/bed
ln -s blat.hg17.2006-03-20 blat.hg17
ln -s `pwd`/blat.hg17/hg18ToHg17.over.chain liftOver/hg18ToHg17.over.chain
ln -s `pwd`/liftOver/hg18ToHg17.over.chain /gbdb/hg18/liftOver/hg18ToHg17.over.chain
mkdir -p /usr/local/apache/htdocs/goldenPath/hg18/liftOver
cd /usr/local/apache/htdocs/goldenPath/hg18/liftOver
cp /gbdb/hg18/liftOver/hg18ToHg17.over.chain .
gzip hg18ToHg17.over.chain
hgAddLiftOverChain hg18 hg17 /gbdb/hg18/liftOver/hg18ToHg17.over.chain

##########################################################################
# NSCAN track - (markd)

# hg17 had both NSCAN and NSCAN-EST tracks, in a composite track.
# currently have only NSCAN for hg18
cd /cluster/data/hg18/bed/nscan/

# obtained NSCAN predictions from Michael Brent's group
# at WUSTL
wget -nv http://genes.cse.wustl.edu/jeltje/hg18/hg18.nscan.gtf
wget -r -np -nv http://genes.cse.wustl.edu/jeltje/hg18/chr_ptx/
mv genes.cse.wustl.edu/jeltje/hg18/chr_ptx .
rm -rf genes.cse.wustl.edu chr_ptx/index.html*
gzip -9 hg18.nscan.gtf chr_ptx/*.fa
chmod a-w hg18.nscan.gtf.gz chr_ptx/*.gz

# load tracks.  Note that these have *utr features, rather than
# exon features.  currently ldHgGene creates separate genePred exons
# for these.
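# (Hypothetical illustration, not copied from the delivered file.)  The GTF
# carries UTR feature lines alongside the CDS lines, roughly of the form:
#   chr1  NSCAN  5utr  <start>  <end>  .  +  .  gene_id "chr1.1"; transcript_id "chr1.1.a";
#   chr1  NSCAN  CDS   <start>  <end>  .  +  0  gene_id "chr1.1"; transcript_id "chr1.1.a";
# so ldHgGene ends up making separate genePred exons for the UTR and CDS
# pieces of what is biologically a single exon.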
ldHgGene -bin -gtf -genePredExt hg18 nscanGene hg18.nscan.gtf.gz # add .a suffix to match transcript id hgPepPred -suffix=.a hg18 generic nscanPep chr_ptx/*.fa.gz rm -f *.tab # update trackDb; need a hg18-specific page to describe informants human/hg18/nscanGene.html human/hg18/trackDb.ra # QA NOTE [ASZ 9-11-2006]: mytouch nscanPep 200603271900.00 ########################################################################## # UPDATED hg18.knownToVisiGene (2006-04-05 galt) # after making sure hg18.vgAllProbes was up to date (see makeVisiGene.doc) ssh hgwdev knownToVisiGene hg18 -fromProbePsl=vgAllProbes ############################################################################## # BLASTZ CHIMP PanTro1 second time (STARTED - 2006-01-05, DONE 2006-01-13 Fan) ssh pk mkdir /cluster/data/hg18/bed/blastzPanTro1.2006-01-05 cd /cluster/data/hg18/bed rm blastz.panTro1 ln -s blastzPanTro1.2006-01-05 blastz.panTro1 cd blastzPanTro1.2006-01-05 cat << '_EOF_' > DEF # human vs chimp export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/cluster/home/angie/schwartzbin:/parasol/bin BLASTZ=blastz.v7.x86_64 BLASTZ_H=2000 BLASTZ_M=50 # TARGET: Human Hg18 SEQ1_DIR=/scratch/hg/hg18/hg18.2bit SEQ1_CHUNK=100000000 SEQ1_LAP=10000 SEQ1_LEN=/scratch/hg/hg18/chrom.sizes # QUERY: Chimp PanTro1 - single chunk big enough to run entire genome SEQ2_DIR=/san/sanvol1/scratch/panTro1/panTro1.2bit SEQ2_LEN=/scratch/hg/panTro1/chrom.sizes SEQ2_CHUNK= 30000000 SEQ2_LAP=0 BASE=/cluster/data/hg18/bed/blastzPanTro1.2006-01-05 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs # establish a screen to control this job screen bash time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \ -workhorse=pk \ -stop=load \ `pwd`/DEF > load.out 2>&1 & # Started Thu Jan 5 11:26:45 PST 2006 # Encountered an error at the net step: startStep: 0, at step 5 net to stopStep 6 # chmod a+x /cluster/data/hg18/bed/blastzPanTro1.2006-01-05/axtChain/netChains.csh # ssh -x pk nice /cluster/data/hg18/bed/blastzPanTro1.2006-01-05/axtChain/netChains.csh cd /cluster/data/hg18/bed/blastzPanTro1.2006-01-05/axtChain chainPreNet hg18.panTro1.all.chain.gz /scratch/hg/hg18/chrom.sizes /scratch/hg/panTro1/chrom.sizes stdout chainNet stdin -minSpace=1 /scratch/hg/hg18/chrom.sizes /scratch/hg/panTro1/chrom.sizes stdout /dev/null netSyntenic stdin noClass.net Got 49 chroms in /scratch/hg/hg18/chrom.sizes, 52 in /scratch/hg/panTro1/chrom.sizes Finishing nets writing stdout writing /dev/null memory usage 363347968, utime 1042 s/100, stime 56 netChainSubset -verbose=0 noClass.net hg18.panTro1.all.chain.gz stdout chainSort stdin stdout gzip -c Out of memory needMem - request size 6 bytes gzip: stdout: Broken pipe Command failed: ssh -x pk nice /cluster/data/hg18/bed/blastzPanTro1.2006-01-05/axtChain/netChains.csh # 1/9/06, Retry again ssh pk cd /cluster/data/hg18/bed cd blastzPanTro1.2006-01-05 screen bash time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \ -workhorse=pk \ -continue=net \ -stop=load \ `pwd`/DEF > load2.out 2>&1 & # Same error. # Try with kolossus time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \ -continue=net \ -stop=load \ `pwd`/DEF > load3.out 2>&1 & # Still have problems, which seem to be related to the # wrong $MACHTYPE and $PATH on kolossus. 
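# (Sketch of the kind of check involved, not copied from the session.)
#   ssh kolossus 'echo $MACHTYPE; which netChainSubset chainSort'
# kolossus is a 64-bit machine, so the /cluster/bin/x86_64 binaries need to
# be the ones found on PATH.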
Updated my .cshrc # Did the following manually on kolossus: # vvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvv /cluster/bin/x86_64/netChainSubset -verbose=0 noClass.net hg18.panTro1.all.chain.gz stdout | chainSort stdin stdout | gzip -c > hg18.panTro1.over.chain.gz mkdir -p /cluster/data/hg18/bed/liftOver cp -p hg18.panTro1.over.chain.gz /cluster/data/hg18/bed/liftOver/hg18ToPanTro1.over.chain.gz # Make axtNet for download: one .axt per hg18 seq. netSplit noClass.net net cd .. mkdir axtNet foreach f (axtChain/net/*.net) netToAxt $f axtChain/chain/$f:t:r.chain \ /scratch/hg/hg18/hg18.2bit /san/sanvol1/scratch/panTro1/panTro1.2bit stdout \ | axtSort stdin stdout \ | gzip -c > axtNet/$f:t:r.hg18.panTro1.net.axt.gz end # Make mafNet for multiz: one .maf per hg18 seq. mkdir mafNet foreach f (axtNet/*.hg18.panTro1.net.axt.gz) axtToMaf -tPrefix=hg18. -qPrefix=panTro1. $f \ /scratch/hg/hg18/chrom.sizes /scratch/hg/panTro1/chrom.sizes \ stdout \ | gzip -c > mafNet/$f:t:r:r:r:r:r.maf.gz end # ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ above by hand. ssh pk cd /cluster/data/hg18/bed cd blastzPanTro1.2006-01-05 screen bash time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \ -continue=load \ -stop=load \ `pwd`/DEF > load4.out 2>&1 & time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \ -swap -stop=load \ `pwd`/DEF > swap-load.out 2>&1 & time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \ -continue=download \ `pwd`/DEF > download.out 2>&1 & time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \ -swap -continue=download \ `pwd`/DEF > swap-download.out 2>&1 & # Measurements: # Go to kolossus to run featureBits to avoid out of memory problem. 
ssh kolossus
bash
time HGDB_CONF=~/.hg.conf.read-only featureBits panTro1 chainHg18Link
# 2641472125 bases of 2733948177 (96.617%) in intersection
time HGDB_CONF=~/.hg.conf.read-only featureBits hg18 chainPanTro1Link
# 2681146909 bases of 2881515245 (93.046%) in intersection
time HGDB_CONF=~/.hg.conf.read-only featureBits panTro1 chainHg17Link
# 0 bases of 2733948177 (0.000%) in intersection
time HGDB_CONF=~/.hg.conf.read-only featureBits hg17 chainPanTro1Link
# 2633869032 bases of 2866216770 (91.894%) in intersection

#########################################################################
# BLASTZ RAT Rn3 (STARTED - 2005-12-22, DONE 2006-01-05 Fan)
ssh pk
mkdir /cluster/data/hg18/bed/blastzRn3.2005-12-22
cd /cluster/data/hg18/bed
rm blastz.rn3
ln -s blastzRn3.2005-12-22 blastz.rn3
cd blastzRn3.2005-12-22

cat << '_EOF_' > DEF
# human vs rat
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/cluster/home/angie/schwartzbin:/parasol/bin

BLASTZ=blastz.v7.x86_64
BLASTZ_ABRIDGE_REPEATS=1

# TARGET: Human Hg18
SEQ1_DIR=/scratch/hg/hg18/nib
SEQ1_SMSK=/scratch/hg/hg18/linSpecRep/notInRat
SEQ1_LEN=/scratch/hg/hg18/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000

# QUERY: Rat Rn3 - chunk big enough to do all chroms in single whole pieces
SEQ2_DIR=/scratch/rat/rn3/softNib
SEQ2_SMSK=/cluster/bluearc/rat/rn3/linSpecRep.notInHuman
SEQ2_LEN=/cluster/bluearc/rat/rn3/chrom.sizes
SEQ2_CHUNK=300000000
SEQ2_LAP=0

BASE=/cluster/data/hg18/bed/blastzRn3.2005-12-22
TMPDIR=/scratch/tmp
'_EOF_'
# happy emacs

# establish a screen to control this job
screen
bash
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
    -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
    -stop=load \
    `pwd`/DEF > to-load.out 2>&1 &

# start processing again on 12/31/05.
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
    -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
    -workhorse=pk \
    -swap \
    -stop=load \
    `pwd`/DEF > swap.out 2>&1 &

# Either the UCSC RR and hgwdev systems or the network went down around 11 AM 12/31/05.
# After the holidays, start again on 1/3/06 and again on 1/5/06.
ssh pk
cd /cluster/data/hg18/bed
cd blastzRn3.2005-12-22
screen
bash
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
    -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
    -workhorse=pk \
    -swap \
    -continue=net \
    -stop=load \
    `pwd`/DEF > swap6.out 2>&1 &
# DONE!
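# (Added note, not in the original log.)  BLASTZ_ABRIDGE_REPEATS=1 in the DEF
# above tells the blastz pipeline to cut out the lineage-specific repeats
# listed under SEQ1_SMSK/SEQ2_SMSK before aligning and then to restore the
# original coordinates afterwards, which keeps rodent runs from bogging down
# in repeat-vs-repeat alignments.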
Jan 5 13:39 # Measurements: nice featureBits rn3 chainHg18Link # 962630574 bases of 2571104688 (37.440%) in intersection nice featureBits hg18 chainRn3Link # 964251210 bases of 2881515245 (33.463%) in intersection ######################################################################### # BLASTZ ARMADILLO DasNov1 (STARTED - 2006-01-06 - 2006-01-09 Fan) ssh pk mkdir /cluster/data/hg18/bed/blastzDasNov1.2006-01-06 cd /cluster/data/hg18/bed rm blastz.dasNov1 ln -s blastzDasNov1.2006-01-06 blastz.dasNov1 cd blastzDasNov1.2006-01-06 cat << '_EOF_' > DEF # human vs armadillo export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/cluster/home/angie/schwartzbin:/parasol/bin BLASTZ=blastz.v7.x86_64 # Specific settings for armadillo (per Webb email to Brian Raney) # TARGET: Human Hg18 SEQ1_DIR=/scratch/hg/hg18/nib SEQ1_LEN=/scratch/hg/hg18/chrom.sizes SEQ1_CHUNK=30000000 SEQ1_LAP=10000 # QUERY: Armadillo DasNov1 SEQ2_DIR=/scratch/hg/dasNov1/dasNov1.2bit SEQ2_LEN=/scratch/hg/dasNov1/chrom.sizes SEQ2_LIMIT=400 SEQ2_CHUNK=30000000 SEQ2_LAP=0 BASE=/cluster/data/hg18/bed/blastzDasNov1.2006-01-06 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs # establish a screen to control this job screen bash time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \ -workhorse=pk \ -stop=load \ `pwd`/DEF > load.out 2>&1 & # Started Fri Jan 6 06:20:12 PST 2006 # 1:20 PM, 1/7/06 # The blastz cluster run seemed finished OK, but make jobList some how # does not end, even after creating the run.time file manually. Kill it manually. time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \ -workhorse=pk \ -continue=cat \ -stop=load \ `pwd`/DEF > load2.out 2>&1 & time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \ -workhorse=pk \ -swap -stop=load \ `pwd`/DEF > swap-load.out 2>&1 & # Done, Jan 8 21:40. time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \ -continue=download \ `pwd`/DEF > download.out 2>&1 & time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \ -swap -continue=download \ `pwd`/DEF > swap-download.out 2>&1 & # Done! Jan 9 06:11 # Reciprocal best net mafs for multiz (kate) ssh kkstore02 cd /cluster/data/hg18/bed/blastz.dasNov1 ~/kent/src/hg/utils/automation/doRecipBest.pl hg18 dasNov1 >&! 
rbest.log & # Load nets (2007-03-12 kate) ssh hgwdev cd /cluster/data/hg18/bed/blastz.dasNov1/axtChain netFilter -minGap=10 noClass.net | hgLoadNet -warn hg18 netDasNov1 stdin netFilter -minGap=10 hg18.dasNov1.rbest.net.gz | \ hgLoadNet -warn hg18 netRBestDasNov1 stdin ######################################################################### # BLASTZ DOG CanFam2 second time (DONE - 2005-12-28 - 2005-12-29 Fan) ssh pk mkdir /cluster/data/hg18/bed/blastzCanFam2.2005-12-28 cd /cluster/data/hg18/bed rm blastz.canFam2 ln -s blastzCanFam2.2005-12-28 blastz.canFam2 cd blastzCanFam2.2005-12-28 cat << '_EOF_' > DEF # human vs dog export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/cluster/home/angie/schwartzbin:/parasol/bin BLASTZ=blastz.v7.x86_64 # Specific settings for dog (per Webb email to Brian Raney) BLASTZ_ABRIDGE_REPEATS=1 # TARGET: Human Hg18 SEQ1_DIR=/scratch/hg/hg18/nib SEQ1_SMSK=/cluster/bluearc/hg18/linSpecRep/notInOthers SEQ1_LEN=/scratch/hg/hg18/chrom.sizes SEQ1_IN_CONTIGS=0 SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY: Dog CanFam2 - single chunk big enough to run entire genome SEQ2_DIR=/scratch/hg/canFam2/nib SEQ2_LEN=/cluster/bluearc/canFam2/chrom.sizes SEQ2_SMSK=/san/sanvol1/scratch/canFam2/linSpecRep.notInHuman SEQ2_IN_CONTIGS=0 SEQ2_CHUNK=200000000 SEQ2_LAP=0 BASE=/cluster/data/hg18/bed/blastzCanFam2.2005-12-28 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs # establish a screen to control this job screen bash time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \ -stop=load \ `pwd`/DEF > load.out 2>&1 & # Started 2005-12-28 21:33 # Two jobs stuck in the same node. Did manual para stop and para push. # Both finished within a few minutes. # Done! On Thu Dec 29 05:27:31 PST 2005. # system seems hang on kolossus (3 processes of [tcsh -c nice chainMergeSort], not moving) # manually killed the jobs. # now use pk as the workhorse. time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \ -workhorse=pk \ -continue=chainMerge \ -stop=load \ `pwd`/DEF > load2.out 2>&1 & # Done! Thu Dec 29 09:10:02 PST 2005. time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \ -workhorse=pk \ -swap -stop=load \ `pwd`/DEF > swap-load.out 2>&1 & # Had an error at the load step, # mySQL error 2013: Lost connection to MySQL server during query, # probably due to sys admin working on network connections, # continue at the load step time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \ -workhorse=pk \ -swap -continue=load -stop=load \ `pwd`/DEF > swap-load2.out 2>&1 & time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \ -workhorse=pk \ -continue=download \ `pwd`/DEF > download.out 2>&1 & time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -workhorse=pk \ -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \ -swap -continue=download \ `pwd`/DEF > swap-download.out 2>&1 & # Done! 
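# (Sketch of the manual recovery mentioned above for the two stuck jobs; the
# exact commands were not captured in this log.)  para stop kills the running
# jobs and para push reschedules them, usually on different nodes:
#   cd /cluster/data/hg18/bed/blastzCanFam2.2005-12-28/run.blastz
#   para stop
#   para push
#   para time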
Dec 29 13:21 # Measurements: ssh hgwdev nice featureBits canFam2 chainHg18Link # 1477551526 bases of 2384996543 (61.952%) in intersection nice featureBits hg18 chainCanFam2Link # 1524764349 bases of 2881515245 (52.915%) in intersection nice featureBits canFam2 chainHg17Link # 1487483112 bases of 2384996543 (62.368%) in intersection nice featureBits hg17 chainCanFam2Link # 1530197469 bases of 2866216770 (53.387%) in intersection ######################################################################### # BLASTZ ELEPHANT LoxAfr1 second time (STARTED - 2006-01-03 Fan) ssh pk mkdir /cluster/data/hg18/bed/blastzLoxAfr1.2006-01-03 cd /cluster/data/hg18/bed rm blastz.loxAfr1 ln -s blastzLoxAfr1.2006-01-03 blastz.loxAfr1 cd blastzLoxAfr1.2006-01-03 cat << '_EOF_' > DEF # human vs elephant export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/cluster/home/angie/schwartzbin:/parasol/bin BLASTZ=blastz.v7.x86_64 # TARGET: Human Hg18 SEQ1_DIR=/scratch/hg/hg18/nib SEQ1_LEN=/scratch/hg/hg18/chrom.sizes SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY: Elephant LoxAfr1 - single chunk big enough to run entire genome SEQ2_DIR=/scratch/hg/loxAfr1/loxAfr1.2bit SEQ2_LEN=/scratch/hg/loxAfr1/chrom.sizes SEQ2_LIMIT=300 SEQ2_CHUNK=30000000 SEQ2_LAP=0 BASE=/cluster/data/hg18/bed/blastzLoxAfr1.2006-01-03 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs # establish a screen to control this job screen bash time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \ -workhorse=pk \ -stop=load \ `pwd`/DEF > load.out 2>&1 & # failed at step 2 due to kki cluster not started. time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \ -workhorse=pk \ -smallClusterHub=pk \ -continue=cat \ -stop=load \ `pwd`/DEF > load2.out 2>&1 & time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \ -smallClusterHub=pk \ -continue=net \ -stop=load \ `pwd`/DEF > load3.out 2>&1 & # Same broken pipe error. netChainSubset -verbose=0 noClass.net hg18.loxAfr1.all.chain.gz stdout chainSort stdin stdout gzip -c Out of memory needMem - request size 28 bytes gzip: stdout: Broken pipe Command failed: ssh -x kolossus nice /cluster/data/hg18/bed/blastzLoxAfr1.2006-01-03/axtChain/netChains.csh time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \ -continue=net \ -stop=load \ `pwd`/DEF > load4.out 2>&1 & # Finally, a success! tail load4.out #... # cd /cluster/data/hg18/bed/blastzLoxAfr1.2006-01-03/axtChain #netClass -verbose=0 -noAr noClass.net hg18 loxAfr1 hg18.loxAfr1.net #netFilter -minGap=10 hg18.loxAfr1.net #hgLoadNet -verbose=0 hg18 netLoxAfr1 stdin #startStep: 5, at step 7 download to stopStep 6 # *** All done! # *** Add {chain,net}LoxAfr1 tracks to trackDb.ra if necessary. # The swap-load was not successful, after several tries. # Last one seems was due to out of memory problem. # Per Hiram, we no longer do swap for 2X genomes, unless specifically requested. # Mark made an inquiry, but said he can get by with hg18->loxAfr1 nets. # reciprocal best net mafs for multiz (2007-03-09 kate) ssh kkstore02 cd /cluster/data/hg18/bed/blastz.loxAfr1 ~/kent/src/hg/utils/automation/doRecipBest.pl hg18 loxAfr1 >&! 
rbest.log & # load net and reciprocal best net for comparison # note sure why these tables and cleanup aren't done -- ask Fan ssh hgwdev cd /cluster/data/hg18/bed/blastz.loxAfr1/axtChain netFilter -minGap=10 noClass.net | hgLoadNet -warn hg18 netLoxAfr1 stdin netFilter -minGap=10 hg18.loxAfr1.rbest.net.gz | \ hgLoadNet -warn hg18 netRBestLoxAfr1 stdin ######################################################################### # BLASTZ COW BosTau2 second time (STARTED - 2006-01-07 Fan) ssh pk mkdir /cluster/data/hg18/bed/blastzBosTau2.2006-01-07 cd /cluster/data/hg18/bed rm blastz.bosTau2 ln -s blastzBosTau2.2006-01-07 blastz.bosTau2 cd blastzBosTau2.2006-01-07 cat << '_EOF_' > DEF # human vs cow export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/cluster/home/angie/schwartzbin:/parasol/bin BLASTZ=blastz.v7.x86_64 BLASTZ_M=50 # TARGET: Human Hg18 SEQ1_DIR=/scratch/hg/hg18/nib SEQ1_LEN=/scratch/hg/hg18/chrom.sizes SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY: Cow BosTau2 - single chunk big enough to run entire genome SEQ2_DIR=/san/sanvol1/scratch/bosTau2/bosTau2.2bit SEQ2_LEN=/san/sanvol1/scratch/bosTau2/chrom.sizes SEQ2_CHUNK=3200000000 SEQ2_LAP=0 BASE=/cluster/data/hg18/bed/blastzBosTau2.2006-01-07 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs # establish a screen to control this job screen bash time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \ -stop=load \ -workhorse=pk \ `pwd`/DEF > load.out 2>&1 & # Started Sat Jan 7 07:57:22 PST 2006 # blastz run (and load) done Jan 8 00:13 time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \ -swap -stop=load \ `pwd`/DEF > swap-load.out 2>&1 & # took a long time to finish. time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \ -continue=download \ `pwd`/DEF > download.out 2>&1 & time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \ -swap -continue=download \ `pwd`/DEF > swap-download.out 2>&1 & # Done! 
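# (Added note, not in the original log.)  Setting SEQ2_CHUNK=3200000000,
# larger than the whole bosTau2 assembly, means the query side is never
# split: each cluster job aligns one 10 Mb human chunk against the entire
# cow genome, so only SEQ1_CHUNK/SEQ1_LAP determine the job count.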
Jan 8 21:10 # Measurements: ssh hgwdev nice featureBits bosTau2 chainHg18Link # 1357027317 bases of 2812203870 (48.255%) in intersection nice featureBits hg18 chainBosTau2Link # 1357291762 bases of 2881515245 (47.103%) in intersection nice featureBits bosTau2 chainHg17Link # 0 bases of 2812203870 (0.000%) in intersection # nice featureBits hg17 chainBosTau2Link 1350076765 bases of 2866216770 (47.103%) in intersection ######################################################################### # BLASTZ TENREC EchTel1 second time (STARTED - 2006-01-09 DONE 2006-01-12 Fan) ssh pk mkdir /cluster/data/hg18/bed/blastzEchTel1.2006-01-09 cd /cluster/data/hg18/bed rm blastz.echTel1 ln -s blastzEchTel1.2006-01-09 blastz.echTel1 cd blastzEchTel1.2006-01-09 cat << '_EOF_' > DEF # human vs tenrec export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/cluster/home/angie/schwartzbin:/parasol/bin BLASTZ=blastz.v7.x86_64 # TARGET: Human Hg18 SEQ1_DIR=/scratch/hg/hg18/nib SEQ1_LEN=/scratch/hg/hg18/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: Tenrec EchTel1 SEQ2_DIR=/scratch/hg/echTel1/echTel1.2bit SEQ2_LEN=/scratch/hg/echTel1/chrom.sizes SEQ2_LIMIT=400 SEQ2_CHUNK=30000000 SEQ2_LAP=0 BASE=/cluster/data/hg18/bed/blastzEchTel1.2006-01-09 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs # establish a screen to control this job screen bash time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \ -stop=load \ `pwd`/DEF > load.out 2>&1 & # Started Mon Jan 9 08:09:03 PST 2006 # Found over a thousand jobs failed, all with the following 7 hosts. [pk:run.blastz> fgrep host j1.err | sort -u host: kkr10u06.kilokluster.ucsc.edu host: kkr10u58.kilokluster.ucsc.edu host: kkr10u62.kilokluster.ucsc.edu host: kkr11u34.kilokluster.ucsc.edu host: kkr11u39.kilokluster.ucsc.edu host: kkr12u18.kilokluster.ucsc.edu host: kkr12u29.kilokluster.ucsc.edu # manually created /scratch/tmp on above machines (except one). # 2 jobs still running for more than 5 hours each. para stop para recover jobList newJobList # newJobList contains only 2 jobs. Checked the .psl files under psl confirming only two files missing. para create newJobList para push # This 2 jobs finished within a couple of mintues! para time >run.time bash time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \ -continue=cat \ -stop=load \ `pwd`/DEF > load2.out 2>&1 & time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \ -swap -stop=load \ `pwd`/DEF > swap-load.out 2>&1 & time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \ -continue=net \ -swap \ -stop=load \ `pwd`/DEF > swap-load3.out 2>&1 & time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \ -continue=download \ `pwd`/DEF > download.out 2>&1 & time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \ -swap -continue=download \ `pwd`/DEF > swap-download.out 2>&1 & # Done! On Jan 12 09:18 # reciprocal best net mafs for multiz (2007-03-09 kate) ssh kkstore02 cd /cluster/data/hg18/bed/blastz.echTel1 ~/kent/src/hg/utils/automation/doRecipBest.pl hg18 echTel1 >&! 
rbest.log & # reloading chains which disappeared (2007-04-17 kate) cd /cluster/data/hg18/bed/blastz.echTel1/axtChain # edit loadUp.csh --> create loadUp2.csh and loadUp3.csh # run loadUp2.csh (does chainSplit) on kkstore02 # run loadUp3.csh (does hgLoadChain) on hgwdev ######################################################################### # BLASTZ CHICKEN GalGal2 second time (DONE - 2005-12-28 Fan) ssh pk mkdir /cluster/data/hg18/bed/blastzGalGal2.2005-12-28 cd /cluster/data/hg18/bed rm blastz.galGal2 ln -s blastzGalGal2.2005-12-28 blastz.galGal2 cd blastzGalGal2.2005-12-28 cat << '_EOF_' > DEF # human vs chicken export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/cluster/home/angie/schwartzbin:/parasol/bin BLASTZ=blastz.v7.x86_64 # Specific settings for chicken (per Webb email to Brian Raney) BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=10000 BLASTZ_K=2200 BLASTZ_Q=/cluster/data/blastz/HoxD55.q BLASTZ_ABRIDGE_REPEATS=1 # TARGET: Human Hg18 SEQ1_DIR=/scratch/hg/hg18/nib SEQ1_SMSK=/cluster/bluearc/hg18/linSpecRep/notInOthers SEQ1_LEN=/scratch/hg/hg18/chrom.sizes SEQ1_IN_CONTIGS=0 SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY: Chicken GalGal2 - single chunk big enough to run entire genome SEQ2_DIR=/scratch/hg/galGal2/nib SEQ2_LEN=/cluster/bluearc/galGal2/chrom.sizes SEQ2_SMSK=/scratch/hg/galGal2/linSpecRep SEQ2_IN_CONTIGS=0 SEQ2_CHUNK=200000000 SEQ2_LAP=0 BASE=/cluster/data/hg18/bed/blastzGalGal2.2005-12-28 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs # establish a screen to control this job screen bash time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \ -stop=load \ `pwd`/DEF > load.out 2>&1 & # Started 2005-12-28 10:35 # Two jobs stuck in the same node. Did manual para stop and para push. # Both finished within a few minutes. # Done! On Wed Dec 28 15:32:45 PST 2005. time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \ -swap -stop=load \ `pwd`/DEF > swap-load.out 2>&1 & # Had an error at the net step time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \ -swap -continue=net -stop=load \ `pwd`/DEF > swap-load.out 2>&1 & time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \ -continue=download \ `pwd`/DEF > download.out 2>&1 & # the gzip job on kolossus seems not moving at all. # killed it manually. Try again. # Seemed not moving, kill it again. Now use pk instead of kolossus. time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -workhorse=pk \ -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \ -continue=download \ `pwd`/DEF > download.out 2>&1 & time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -workhorse=pk \ -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \ -swap -continue=download \ `pwd`/DEF > swap-download.out 2>&1 & # Done! 
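# (Added note, not in the original log.)  HoxD55.q is the more permissive
# scoring matrix used in this doc for the distant, non-mammal comparisons
# (chicken, frog, tetraodon, fugu); those runs are also chained with
# -chainMinScore=5000 rather than the 3000 used for most of the mammals.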
Wed Dec 28 20:39:44 PST 2005 # Measurements: ssh hgwdev nice featureBits galGal2 chainHg18Link # 91564024 bases of 1054197620 (8.686%) in intersection nice featureBits hg18 chainGalGal2Link # 102417858 bases of 2881515245 (3.554%) in intersection nice featureBits galGal2 chainHg17Link # 93277286 bases of 1054197620 (8.848%) in intersection nice featureBits hg17 chainGalGal2Link # 103882699 bases of 2866216770 (3.624%) in intersection # BLASTZ FROG XenTro1 second time (DONE - 2006-01-07 Fan) ssh pk mkdir /cluster/data/hg18/bed/blastzXenTro1.2006-01-06 cd /cluster/data/hg18/bed rm blastz.xenTro1 ln -s blastzXenTro1.2006-01-06 blastz.xenTro1 cd blastzXenTro1.2006-01-06 cat << '_EOF_' > DEF # human vs frog export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/cluster/home/angie/schwartzbin:/parasol/bin BLASTZ=blastz.v7.x86_64 BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=8000 BLASTZ_K=2200 BLASTZ_Q=/cluster/data/blastz/HoxD55.q # TARGET: Human Hg18 SEQ1_DIR=/scratch/hg/hg18/nib SEQ1_LEN=/scratch/hg/hg18/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: Frog XenTro1 - single chunk big enough to run entire genome SEQ2_DIR=/scratch/hg/xenTro1/xenTro1.2bit SEQ2_LEN=/scratch/hg/xenTro1/chrom.sizes SEQ2_LIMIT=400 SEQ2_CHUNK=30000000 SEQ2_LAP=0 BASE=/cluster/data/hg18/bed/blastzXenTro1.2006-01-06 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs # establish a screen to control this job screen bash time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \ -stop=load \ `pwd`/DEF > load.out 2>&1 & # Started Fri Jan 6 20:19:30 PST 2006 # Blastz run done. Jan 7 02:07 load.out time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \ -swap -stop=load \ `pwd`/DEF > swap-load.out 2>&1 & # got the following error: startStep: 4, at step 5 net to stopStep 6 netChains: looks like previous stage was not successful (can't find [xenTro1.hg18.]all.chain[.gz]). # Try it with pk instead of kolossus: time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \ -workhorse=pk \ -swap -stop=load \ `pwd`/DEF > swap-load2.out 2>&1 & # It worked, swap-load done. Jan 7 06:05 time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \ -workhorse=pk \ -continue=download \ `pwd`/DEF > download.out 2>&1 & time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -workhorse=pk \ -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \ -swap -continue=download \ `pwd`/DEF > swap-download.out 2>&1 & # Done! 
Jan 7 06:18 # Measurements: ssh hgwdev nice featureBits xenTro1 chainHg18Link # 61197900 bases of 1381238994 (4.431%) in intersection nice featureBits hg18 chainXenTro1Link # 67810866 bases of 2881515245 (2.353%) in intersection nice featureBits xenTro1 chainHg17Link # 81777842 bases of 1381238994 (5.921%) in intersection nice featureBits hg17 chainXenTro1Link # 85701475 bases of 2866216770 (2.990%) in intersection # BLASTZ TETRAODON TetNig1 second time (DONE - 2006-01-07 Fan) ssh pk mkdir /cluster/data/hg18/bed/blastzTetNig1.2006-01-07 cd /cluster/data/hg18/bed rm blastz.tetNig1 ln -s blastzTetNig1.2006-01-07 blastz.tetNig1 cd blastzTetNig1.2006-01-07 cat << '_EOF_' > DEF # human vs tetraodon export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/cluster/home/angie/schwartzbin:/parasol/bin BLASTZ=blastz.v7.x86_64 BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=6000 BLASTZ_K=2200 BLASTZ_Q=/cluster/data/blastz/HoxD55.q # TARGET: Human Hg18 SEQ1_DIR=/scratch/hg/hg18/nib SEQ1_LEN=/scratch/hg/hg18/chrom.sizes SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY: Tetraodon TetNig1 - single chunk big enough to run entire genome SEQ2_DIR=/san/sanvol1/scratch/tetNig1/tetNig1.2bit SEQ2_LEN=/san/sanvol1/scratch/tetNig1/chrom.sizes SEQ2_CHUNK=410000000 SEQ2_LAP=0 BASE=/cluster/data/hg18/bed/blastzTetNig1.2006-01-07 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs # establish a screen to control this job screen bash time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \ -stop=load \ `pwd`/DEF > load.out 2>&1 & # Started Sat Jan 7 05:40:51 PST 2006 # Encountered an error: startStep: 0, at step 5 net to stopStep 6 netChains: looks like previous stage was not successful (can't find [hg18.tetNig1.]all.chain[.gz]). # Try it with pk as the workhorse. time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \ -workhorse=pk \ -continue=net \ -stop=load \ `pwd`/DEF > load2.out 2>&1 & # Load done. Sat Jan 7 07:34:56 PST 2006 time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \ -workhorse=pk \ -swap -stop=load \ `pwd`/DEF > swap-load.out 2>&1 & time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \ -workhorse=pk \ -continue=download \ `pwd`/DEF > download.out 2>&1 & time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \ -workhorse=pk \ -swap -continue=download \ `pwd`/DEF > swap-download.out 2>&1 & # Done! Sat Jan 7 08:02:14 PST 2006 # The download and swap-download took less than 10 seconds each. ??? # Measurements: ssh hgwdev nice featureBits tetNig1 chainHg18Link # 50026847 bases of 342403326 (14.611%) in intersection nice featureBits hg18 chainTetNig1Link # 57654754 bases of 2881515245 (2.001%) in intersection nice featureBits tetNig1 chainHg17Link # 34379509 bases of 342403326 (10.041%) in intersection nice featureBits hg17 chainTetNig1Link # 35910128 bases of 2866216770 (1.253%) in intersection ######################################################################### # BLASTZ FUGU fr1 (STARTED - 2005-12-20, DONE 2006-01-04 Fan) ssh pk mkdir /cluster/data/hg18/bed/blastzFr1.2005-12-20 cd /cluster/data/hg18/bed ln -s blastzFr1.2005-12-20 blastz.fr1 cd blastzFr1.2005-12-20 cat << '_EOF_' > DEF # human vs. 
fugu export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/cluster/home/angie/schwartzbin:/parasol/bin BLASTZ=blastz.v7.x86_64 # Reuse parameters from human-chicken, except L=6000 (more relaxed) BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=6000 BLASTZ_K=2200 BLASTZ_Q=/cluster/data/blastz/HoxD55.q # TARGET: Human Hg18 - testing 100,000,000 sized chunk on pk kluster SEQ1_DIR=/scratch/hg/hg18/nib SEQ1_LEN=/scratch/hg/hg18/chrom.sizes SEQ1_CHUNK=100000000 SEQ1_LAP=10000 # QUERY: Fugu Fr1 - chunk big enough to run the whole chrom at once SEQ2_DIR=/san/sanvol1/scratch/fr1/nib SEQ2_LEN=/san/sanvol1/scratch/fr1/chrom.sizes SEQ2_CHUNK=400000000 SEQ2_LAP=0 BASE=/cluster/data/hg18/bed/blastzFr1.2005-12-20 '_EOF_' # << happy emacs # establish a screen to control this job ssh pk cd /cluster/data/hg18/bed/blastzFr1.2005-12-20 screen bash time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=5000 -stop=load \ `pwd`/DEF > thruLoad.out 2>&1 & ssh pk cd /cluster/data/hg18/bed/blastzFr1.2005-12-20 screen bash time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=5000 -continue=chainMerge -stop=load \ `pwd`/DEF > thruLoad.out 2>&1 & time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=5000 -continue=download \ `pwd`/DEF > download.clean.out 2>&1 & time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=5000 -swap \ `pwd`/DEF > swap.out 2>&1 & # Finish the remaining step, 1/4/05. ssh pk cd /cluster/data/hg18/bed/blastzFr1.2005-12-20 screen bash time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=5000 \ -swap -continue=download \ `pwd`/DEF > DownloadSwap.out 2>&1 & # First try found the DEF was some how altered for rn3. # Re-generated DEF and try again. time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=5000 \ -swap -continue=download \ `pwd`/DEF > DownloadSwap2.out 2>&1 & # Done. Jan 4 09:48. # measurements nice featureBits hg18 chainFr1Link # 51795958 bases of 2881515245 (1.798%) in intersection nice featureBits hg17 chainFr1Link #50831650 bases of 2866216770 (1.773%) in intersection nice featureBits hg18 netFr1 # 691148929 bases of 2881515245 (23.986%) in intersection nice featureBits hg17 netFr1 # 714234935 bases of 2866216770 (24.919%) in intersection nice featureBits fr1 chainHg18Link # 43267869 bases of 315518167 (13.713%) in intersection # nice featureBits fr1 chainHg17Link 0 bases of 315518167 (0.000%) in intersection nice featureBits fr1 netHg18 # 140843080 bases of 315518167 (44.639%) in intersection nice featureBits fr1 netHg17 # 0 bases of 315518167 (0.000%) in intersection ################################################## # For blastz runs between hg18 and other organisms, they are documented in # makeMm8.doc makeRn4.doc, makeRheMac2.doc, makeDanRer3.doc. # PHASTCONS SCORES DOWNLOADABLES FOR 17WAY (2006-04-06 Fan) ssh kkstore02 cd /cluster/data/hg18/bed/multiz17way mkdir phastConsDownloads cd phastConsDownloads cat > downloads.csh << 'EOF' date cd /san/sanvol1/scratch/hg18/multiz17way.2006-02-18/cons/pp foreach chr (`awk '{print $1}' /cluster/data/hg18/chrom.sizes`) echo $chr cat `ls -1 $chr/$chr.*.pp | sort -t\. -k2,2n` \ | nice gzip -c \ > /cluster/data/hg18/bed/multiz17way/phastConsDownloads/$chr.gz end date 'EOF' csh downloads.csh >&! 
downloads.log & # ~20 minutes # << happy emacs ssh hgwdev cd /cluster/data/hg18/bed/multiz17way/phastConsDownloads md5sum *.gz > md5sum.txt set dir = /usr/local/apache/htdocs/goldenPath/hg18/phastCons17way mkdir $dir ln -s /cluster/data/hg18/bed/multiz17way/phastConsDownloads/{*.gz,md5sum.txt} $dir cp /usr/local/apache/htdocs/goldenPath/hg17/phastCons17way/README.txt $dir # edit this file to reflect the latest releases used. vi $dir/README.txt ########################################################################## # RE-BUILT GO DATABASE (DONE 4/12/06, Fan) # GO changed the content of gene_association.goa_uniprot.gz. # Tho original one we use no longer has human, mouse, etc in it. # They are placed in separate files. # Per GO's suggestion, we now get the file from the submission sub-directory. # This seems cover more than concatenating the individual goa... files. # Download the terms and make the database. ssh hgwdev mkdir /cluster/store1/geneOntology/20060330 cd /cluster/store1/geneOntology/20060330 wget --timestamping http://www.godatabase.org/dev/database/archive/latest/go_200603-assocdb-data.gz hgsql mysql <j.tmp hgsql go060330 chr\S+)/$1.1/' $F done >> geneid.fa # one of the files in this delivery, chr1.prot, did *not* have a # terminal character and it caused the next protein in the # next file processed, chr10.prot, to be a continuation of the # last protein in chr1.prot. To check for this: grep ">" geneid.fa | grep -v "^>" # shows a line: # AVSET>chr10_1.1 # This turns out to have been the result of a truncated file. # Fetch that file again: mv chr1.prot chr1.prot.orig wget http://genome.imim.es/genepredictions/H.sapiens/golden_path_200603/geneid_v1.2/chr1.prot # That's better: wc -l chr1.prot chr1.prot.orig # 24494 chr1.prot # 4524 chr1.prot.orig rm chr1.prot.orig # run the above loop again to generate geneid.fa after: rm geneid.fa ldHgGene -gtf -genePredExt hg18 geneid *.gtf # Read 33410 transcripts in 275347 lines in 49 files # 33410 groups 49 seqs 1 sources 3 feature types # 33410 gene predictions hgPepPred hg18 generic geneidPep geneid.fa # verify same names in both tables: awk '{print $1}' geneidPep.tab | sort > pep.names awk '{print $1}' genePred.tab | sort > id.names wc -l pep.names id.names # 33410 pep.names # 33410 id.names comm -12 pep.names id.names | wc -l # 33410 # QA NOTE (ASZ 5-11-2006) I dropped the geneidPep table and the reference # to it from the trackDb.ra file. This functionality is now done on the # fly and this table is no longer needed. # Added back the geneidPep table as requested by a user # (hartera, 2006-07-11) ssh hgwdev cd /cluster/data/hg18/bed/geneid hgPepPred hg18 generic geneidPep geneid.fa # The trackDb.ra file in kent/src/makeDb seems to have a reference # to the geneidPep table already. ########################################################################## # BLASTZ/CHAIN/NET XENTRO2 (DONE 4/20/06 angie) ssh kkstore02 mkdir /cluster/data/hg18/bed/blastz.xenTro2.2006-04-20 cd /cluster/data/hg18/bed/blastz.xenTro2.2006-04-20 cat << '_EOF_' > DEF # human vs. 
frog BLASTZ=/cluster/bin/penn/x86_64/blastz.v7.x86_64 # Use same params as used for mammal-xenTro1 (see makeXenTro1.doc) BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=8000 BLASTZ_K=2200 BLASTZ_Q=/cluster/data/blastz/HoxD55.q # TARGET: Human hg18 SEQ1_DIR=/scratch/hg/hg18/nib SEQ1_LEN=/cluster/data/hg18/chrom.sizes SEQ1_CHUNK=50000000 SEQ1_LAP=10000 # QUERY: Frog xenTro2 - single chunk big enough to run two of the # largest scaffolds in one job SEQ2_DIR=/scratch/hg/xenTro2/xenTro2.2bit SEQ2_LEN=/san/sanvol1/scratch/xenTro2/chrom.sizes SEQ2_CHUNK=20000000 SEQ2_LAP=0 SEQ2_LIMIT=100 BASE=/cluster/data/hg18/bed/blastz.xenTro2.2006-04-20 '_EOF_' # << emacs doBlastzChainNet.pl -blastzOutRoot=/san/sanvol1/hg18XenTro2 \ -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose DEF \ >& do.log & tail -f do.log ln -s blastz.xenTro2.2006-04-20 /cluster/data/hg18/bed/blastz.xenTro2 ########################################################################### # BLASTZ CHAIN SWAP FOR ZEBRAFISH (danRer4) (DONE, 2006-04-25, hartera) # CREATE CHAIN AND NET TRACKS, AXTNET, MAFNET, LIFTOVER AND ALIGNMENT DOWNLOADS # See also makeDanRer4.doc # alignments are in: /cluster/data/hg18/bed/blastz.danRer4.swap # Blastz parameters used were: # BLASTZ_H=2000 # BLASTZ_Y=3400 # BLASTZ_L=6000 # BLASTZ_K=2200 # BLASTZ_Q=/san/sanvol1/scratch/blastz/HoxD55.q # There are no lineage-specific repeats defined for this species pair so # all repeats were used as lineage-specific. ssh pk cd /cluster/data/danRer4/bed/blastz.hg18.2006-04-24 nice /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \ -swap `pwd`/DEF >& doSwap.log & # Took about 15 minutes. # check with featureBits and compare to danRer3 chains: featureBits hg18 chainDanRer4Link # 57415379 bases of 2881515245 (1.993%) in intersection featureBits hg18 chainDanRer3Link # 64801985 bases of 2881515245 (2.249%) in intersection featureBits -chrom=chr1 hg18 refGene:cds chainDanRer4Link -enrichment # refGene:cds 1.389%, chainDanRer4Link 2.337%, both 0.937%, cover 67.47%, # enrich 28.87x featureBits -chrom=chr1 hg18 refGene:cds chainDanRer3Link -enrichment # refGene:cds 1.389%, chainDanRer3Link 2.601%, both 0.931%, cover 67.01%, # enrich 25.76x featureBits -chrom=chr1 hg17 refGene:cds chainDanRer2Link -enrichment # refGene:cds 1.395%, chainDanRer2Link 2.742%, both 0.911%, cover 65.31%, # enrich 23.82x # similar coverage and enrichment for danRer4 and danRer3 chains # which is good. featureBits -chrom=chr1 hg18 refGene:cds netDanRer4 -enrichment # refGene:cds 1.389%, netDanRer4 31.001%, both 1.096%, cover 78.91%, # enrich 2.55x featureBits -chrom=chr1 hg18 refGene:cds netDanRer3 -enrichment # refGene:cds 1.389%, netDanRer3 29.929%, both 1.080%, cover 77.72%, # enrich 2.60x # Similar coverage and enrichment for danRer4 net on hg18 as for danRer3. 
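# (Illustrative sketch, not part of the original checks.)  The
# refGene:cds coverage/enrichment comparisons above can be collected
# in one pass by looping featureBits over the old and new zebrafish
# chain/net tables:
bash
for t in chainDanRer4Link chainDanRer3Link netDanRer4 netDanRer3
do
    featureBits -chrom=chr1 hg18 refGene:cds $t -enrichment
done
exit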
# LOAD FIRSTEF TRACK (DONE 2006-04-25 Fan) ssh hgwdev mkdir -p /cluster/data/hg18/bed/firstEF cd /cluster/data/hg18/bed/firstEF # receive the file firstEFMar05New.bed.gz from email (ramana.davuluri at osumc.edu) into this subdirectory cat << '_EOF_' > sedScript s/chr23/chrX/g s/chr24/chrY/g /^>/d /^$/d /^No/d '_EOF_' # << this line keeps emacs coloring happy bash zcat firstEFMar05New.bed.gz | sed -f sedScript | awk "{OFS=\"\t\"} {\$3 +=1; print \$0}" > firstEF.bed exit hgLoadBed hg18 firstEF firstEF.bed rm firstEF.bed bed.tab #done firstEF ########################################################################### # ALTGRAPHX TRACK (sugnet) Wed Apr 26 13:46:46 PDT 2006 cd /cluster/store1/sugnet/altSplice/ mkdir hg18-2006.04.13 cd hg18-2006.04.13 mkdir rnaCluster cd rnaCluster # Don't use RAGE libraries for clone bounds. ~/latestJk/kent/src/hg/geneBounds/clusterRna/generateRageAccList.csh hg18 rage.libs # Make spec file to run. foreach c (`echo 'select chrom from chromInfo' | hgsql hg18 | grep -v chrom`) set out = chrom/$c.bed echo "clusterRna -mrnaExclude=hg18.rage.libs hg18 /dev/null $out -chrom=$c" >> clusterRna.spec end # Tried running it on the minicluster, but can't connect to the # cluster accounts so run it from here on hgwdev. chmod 755 clusterRna.spec mkdir chrom ./clusterRna.spec >& clusterRna.log cd .. # Make script to setup parasol job file for raw altGraphX files on human cat << '_EOF_' > makeRun.sh #!/bin/sh for chrom in `echo "select chrom from chromInfo" | hgsql hg18 | grep -v chrom`; do echo 'echo "Doing $chrom"' echo "/cluster/home/sugnet/bin/i386/altSplice -db=hg18 -beds=rnaCluster/chrom/$chrom.bed -agxOut=agxs/hg18.$chrom.agx -consensus -weightMrna -localMem -minAli=.95 -minCover=.5 -chromNib=/cluster/data/hg18/nib/$chrom.nib" done '_EOF_' # << this line makes emacs coloring happy mkdir agxs chmod 755 makeRun.sh chmod 755 toRun.sh ./toRun.sh >& toRun.log & cat agxs/*.agx > hg18.agx mkdir hg18 mv agxs/ makeRun.sh toRun.log toRun.sh hg18.agx hg18 cd .. mkdir mm7 cd mm7 # make the rnaClusters mkdir rnaCluster cd rnaCluster/ mkdir chrom # Don't use RAGE libraries for clone bounds. ~/latestJk/kent/src/hg/geneBounds/clusterRna/generateRageAccList.csh mm7 rage.libs foreach c (`echo 'select chrom from chromInfo' | hgsql mm7 | grep -v chrom`) set out = chrom/$c.bed echo "clusterRna -mrnaExclude=mm7.rage.libs mm7 /dev/null $out -chrom=$c" >> clusterRna.spec end # tried to run on kki, but no longer can access db from minicluster. chmod 755 clusterRna.spec ./clusterRna.spec >& clusterRna.log & cd .. cat << '_EOF_' > makeRun.sh #!/bin/sh for chrom in `echo "select chrom from chromInfo" | hgsql mm7 | grep -v chrom`; do echo 'echo "Doing $chrom"' echo "/cluster/home/sugnet/bin/i386/altSplice -db=mm7 -beds=rnaCluster/chrom/$chrom.bed -agxOut=agxs/mm7.$chrom.agx -consensus -weightMrna -localMem -minAli=.95 -minCover=.5 -chromNib=/cluster/data/mm7/nib/$chrom.nib" done '_EOF_' # << this line keeps emacs coloring happy chmod 755 makeRun.sh ./makeRun.sh > toRun.sh chmod 755 toRun.sh mkdir agxs ./toRun.sh >& toRun.log & cat agxs/*.agx > mm7.agxc cd .. mkdir orthoSpliceExoniphy cd orthoSpliceExoniphy/ echo "select chrom, txStart, txEnd, name, id, strand from exoniphy order by chrom, txStart;" | hgsql hg17 | grep -v txStart > hg17.exoniphy.bed liftOver hg17.exoniphy.bed /gbdb/hg17/liftOver/hg17ToHg18.over.chain.gz hg18.exoniphy.bed hg17.exoniphy.unmapped.bed mkdir orthoSplice cd orthoSplice ln -s ../orthoSpliceExoniphy/hg18.exoniphy.bed . 
echo 'select chrom, size from chromInfo' | hgsql hg18 | grep -v chrom > chromSizes.tab cp /cluster/data/hg18/bed/blastz.mm7/axtChain/hg18.mm7.all.chain.gz . chainSplit chains hg18.mm7.all.chain cp /cluster/data/hg18/bed/blastz.mm7/axtChain/hg18.mm7.net.gz . netSplit hg18.mm7.net.gz nets mkdir agx report logs cat << '_EOF_' > makeRun.sh #!/usr/bin/perl -w open(IN, 'chromSizes.tab') or die "Can't open chromSizes.tab\n"; while() { chomp; @w = split; print "/cluster/home/sugnet/bin/i386/orthoSplice -chromSize=$w[1] -exonFile=hg18.exoniphy.bed -trumpNum=3 -chrom=$w[0] -altInFile=../hg18/agxs/hg18.$w[0].agx -orthoAgxFile=../mm7/mm7.agx -db=hg18 -orthoDb=mm7 -netFile=nets/$w[0].net -chainFile=chains/$w[0].chain -commonFile=agx/$w[0].hg18.mm7.cons.t3.agx -reportFile=report/$w[0].hg18.report -edgeFile=report/$w[0].hg18.edge.report >& logs/$w[0].test.log\n"; } '_EOF_' # << this line keeps emacs coloring happy # clean up disk space we're not using rm hg18.mm7.all.chain hg18.mm7.net.gz nets/* chains/* chmod 755 makeRun.sh ./makeRun.sh > orthoSplice.para.spec ssh kki cd /cluster/store1/sugnet/altSplice/hg18-2006.04.13/orthoSplice para create orthoSplice.para.spec para push cat agx/*.agx > hg18.mm7.t3.exoniphy.agx cp ~/latestJk/kent/src/hg/lib/altGraphX.sql . hgLoadBed -notItemRgb -sqlTable=altGraphX.sql hg18 altGraphX hg18.mm7.t3.exoniphy.agx # end AltGraphX track. #################################################################### # EXONWALK TRACK (sugnet) Wed Apr 26 13:51:14 PDT 2006 # first make altGraphX track (see above) cd /cluster/store1/sugnet/altSplice/hg18-2006.04.13/orthoSplice mkdir exonWalk mkdir beds cd exonWalk mkdir beds foreach file (`ls ../agx/*.agx`) set base=`basename $file .agx` echo "/cluster/home/sugnet/bin/i386/exonWalk db=hg18 minPercent=0 trumpSize=100000 $file beds/$base.bed" >> exonWalk.para.spec end para create exonWalk.para.spec para push cat beds/*.bed > hg18.mm7.cons.t3.exoniphy.bed mkdir orfs cd orfs mkdir bedOrf beds fa borf cp ~/store1/altSplice/hg17-2005.01.09/orthoSpliceExonify/exonWalk/orfs.mrna2/*.sh ./ splitFile ../../hg18.mm7.cons.t3.exoniphy.bed 500 exonWalk. cat << '_EOF_' > makeFa.sh #!/bin/sh for file in "$@" do base=`basename $file` echo "Doing $file" echo "sequenceForBed -db=hg18 -bedIn=$file -fastaOut=fa/$base.fa " sequenceForBed -db=hg18 -bedIn=$file -fastaOut=fa/$base.fa done '_EOF_' chmod 755 makeFa.sh makeFa.sh beds/* cat << '_EOF_' > makeGenePred.sh #!/bin/sh for file in "$@" do base=`basename $file` /cluster/home/sugnet/bin/i386/borfMatcher -keepNmd beds/$base borf/$base.borf bedOrf/$base.bed genePred/$base.gp done '_EOF_' chmod 755 makeGenePred.sh makeGenePred.sh beds/* cat beds/* > hg18.mm7.exonWalk.bed cat genePred/*.gp > hg18.mm7.exonWalk.gp ldHgGene -predTab hg18 exonWalk hg18.mm7.exonWalk.gp cat << '_EOF_' > makeNoNmdGenePred.sh #!/bin/sh for file in "$@" do base=`basename $file` /cluster/home/sugnet/bin/i386/borfMatcher beds/$base borf/$base.borf bedOrfNoNmd/$base.bed genePredNoNmd/$base.gp done '_EOF_' mkdir bedOrfNoNmd genePredNoNmd chmod 755 ./makeNoNmdGenePred.sh wc beds/* 275987 3311844 57319256 total wc genePredNoNmd/*.gp 169203 1692030 59907679 total wc genePred/*.gp 225252 2252520 83619240 total cat genePred/*.gp > hg18.mm7.exonWalk.nmd.gp cat genePredNoNmd/*.gp > hg18.mm7.exonWalk.noNmd.gp cat beds/* > hg18.mm7.exonWalk.all.bed # Plain "exonWalk" track is the only one used on regular genome browser. 
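# (Sketch only, not part of the original load.)  Before loading, a quick
# consistency check in the style of the geneid name comparison above:
# every transcript in the noNmd set should also appear in the -keepNmd
# set (this assumes borfMatcher assigns the same names in both runs).
awk '{print $1}' hg18.mm7.exonWalk.noNmd.gp | sort > noNmd.names
awk '{print $1}' hg18.mm7.exonWalk.nmd.gp | sort > nmd.names
comm -23 noNmd.names nmd.names | wc -l
# expect 0 lines unique to the noNmd set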
ldHgGene -predTab hg18 exonWalk hg18.mm7.exonWalk.noNmd.gp hgLoadBed hg18 exonWalkAll hg18.mm7.exonWalk.all.bed ldHgGene -predTab hg18 exonWalkWithNmd hg18.mm7.exonWalk.nmd.gp cat hg18.mm7.exonWalk.noNmd.gp | cut -f 1,2 -d '.' | sort | uniq -c | sort -rnk 1 > counts.txt Q1 1.000000 median 3.000000 Q3 7.000000 average 10.670556 min 1.000000 max 3844.000000 count 15857 total 169203.000000 standard deviation 63.330761 cat hg18.mm7.exonWalk.nmd.gp | cut -f 1,2 -d '.' | sort | uniq -c | sort -rnk 1 > counts.txt ave counts.txt Q1 1.000000 median 3.000000 Q3 8.000000 average 14.037891 min 1.000000 max 7278.000000 count 16046 total 225252.000000 standard deviation 99.406890 trackGenome hg18 all refGene:cds trackGenome.spec Track Specification track overlap track cov track new cum size size geno track cov cov cov ----------------------------------------------------------------------------- exonWalk:cds 31207765 27951670 1.00% 89.57% 90.24% 90.24% 90.24% # end ExonWalk track. ########################################################################### # ALTGRAPHX2 TRACK (kent) in progress Fri Jan 19 11:27:45 PST 2007 # The exoniphy and human/mouse blastz/chain/nets need to be done before # this. ssh hgwdev cd /cluster/store1/sugnet/altSplice/ mkdir hg18-2007.01.19 cd hg18-2007.01.19 mkdir rnaCluster cd rnaCluster # Don't use RAGE libraries for clone bounds. ~/kent/src/hg/geneBounds/clusterRna/generateRageAccList.csh hg18 rage.libs # Make spec file to run. echo "#!/bin/tcsh -ef@ > clusterRna.spec foreach c (`echo 'select chrom from chromInfo' | hgsql hg18 | grep -v chrom`) set out = chrom/$c.bed echo "clusterRna -mrnaExclude=hg18.rage.libs hg18 /dev/null $out -chrom=$c" >> clusterRna.spec end # Run the file. Needs to be done on machine with database access. # Takes an hour or so. chmod 755 clusterRna.spec mkdir chrom ./clusterRna.spec >& clusterRna.log cd .. # Make script to setup job file for raw altGraphX files on human # If we had a cluster with database access this could be run there. # As it is, run it on hgwdev. This took 45 minutes. cat << '_EOF_' > makeRun.sh #!/bin/sh echo "#!/bin/tcsh -ef" for chrom in `echo "select chrom from chromInfo" | hgsql hg18 | grep -v chrom`; do echo "echo 'Doing $chrom'" echo "altSplice -db=hg18 -beds=rnaCluster/chrom/$chrom.bed -agxOut=agxs/hg18.$chrom.agx -consensus -weightMrna -localMem -minAli=.95 -minCover=.5 -chromNib=/cluster/data/hg18/nib/$chrom.nib" done '_EOF_' # << this line makes emacs coloring happy mkdir agxs chmod 755 makeRun.sh ./makeRun.sh > toRun.sh chmod 755 toRun.sh ./toRun.sh >& toRun.log & cat agxs/*.agx > hg18.agx mkdir hg18 mv agxs/ makeRun.sh toRun.log toRun.sh hg18.agx hg18 cd .. mkdir mm8 cd mm8 # make the rnaClusters mkdir rnaCluster cd rnaCluster/ mkdir chrom # Don't use RAGE libraries for clone bounds. ~/kent/src/hg/geneBounds/clusterRna/generateRageAccList.csh mm8 rage.libs echo "#!/bin/tcsh -ef" > clusterRna.spec foreach c (`echo 'select chrom from chromInfo' | hgsql mm8 | grep -v chrom`) set out = chrom/$c.bed echo "clusterRna -mrnaExclude=mm8.rage.libs mm8 /dev/null $out -chrom=$c" >> clusterRna.spec end # Could make this a cluster run if had a cluster with database access. # as is, took about 15 minutes on hgwdev. (Faster than human since less ESTs.) chmod 755 clusterRna.spec ./clusterRna.spec >& clusterRna.log & cd .. # Make batch file file to run altSplice program (by making a batch file). 
cat << '_EOF_' > makeRun.sh #!/bin/sh echo "#!/bin/tcsh -ef" for chrom in `echo "select chrom from chromInfo" | hgsql mm8 | grep -v chrom`; do echo "echo 'Doing $chrom'" echo "/cluster/home/sugnet/bin/i386/altSplice -db=mm8 -beds=rnaCluster/chrom/$chrom.bed -agxOut=agxs/mm8.$chrom.agx -consensus -weightMrna -localMem -minAli=.95 -minCover=.5 -chromNib=/cluster/data/mm8/nib/$chrom.nib" done '_EOF_' # << this line keeps emacs coloring happy chmod 755 makeRun.sh ./makeRun.sh > toRun.sh chmod 755 toRun.sh # Run altSplice. This takes about 12 minutes. mkdir agxs ./toRun.sh >& toRun.log & cat agxs/*.agx > mm8.agx cd .. mkdir orthoSpliceExoniphy cd orthoSpliceExoniphy/ echo "select chrom, txStart, txEnd, name, id, strand from exoniphy order by chrom, txStart;" | hgsql hg18 | grep -v txStart > hg18.exoniphy.bed mkdir orthoSplice cd orthoSplice echo 'select chrom, size from chromInfo' | hgsql hg18 | grep -v chrom > chromSizes.tab zcat /cluster/data/hg18/bed/blastz.mm8/axtChain/hg18.mm8.all.chain.gz | chainSplit chains stdin zcat /cluster/data/hg18/bed/blastz.mm8/axtChain/hg18.mm8.net.gz | netSplit stdin nets mkdir agx report logs cat << '_EOF_' > makeRun.sh #!/usr/bin/perl -w open(IN, 'chromSizes.tab') or die "Can't open chromSizes.tab\n"; while() { chomp; @w = split; print "orthoSplice -chromSize=$w[1] -exonFile=../hg18.exoniphy.bed -trumpNum=3 -chrom=$w[0] -altInFile=../../hg18/agxs/hg18.$w[0].agx -orthoAgxFile=../../mm8/mm8.agx -db=hg18 -orthoDb=mm8 -netFile=nets/$w[0].net -chainFile=chains/$w[0].chain -commonFile=agx/$w[0].hg18.mm8.cons.t3.agx -reportFile=report/$w[0].hg18.report -edgeFile=report/$w[0].hg18.edge.report >& logs/$w[0].test.log\n"; } '_EOF_' # << this line keeps emacs coloring happy chmod 755 makeRun.sh ./makeRun.sh > orthoSplice.para.spec # do a little cluster run ssh kki cd /cluster/store1/sugnet/altSplice/hg18-2007.01.19/orthoSpliceExoniphy/orthoSplice para create orthoSplice.para.spec para push # Do para check, etc until done. Here's the para time results. # # 49 jobs in batch # 147 jobs (including everybody's) in Parasol queue. # Checking finished jobs # Completed: 47 of 49 jobs # Crashed: 2 jobs # CPU time in finished jobs: 7002s 116.70m 1.94h 0.08d 0.000 y # IO & Wait Time: 196s 3.27m 0.05h 0.00d 0.000 y # Average job time: 153s 2.55m 0.04h 0.00d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 1283s 21.38m 0.36h 0.01d # Submission to last job: 1283s 21.38m 0.36h 0.01d # # The two jobs that crashed are ok, it was simply the result of no input on # some of the small random chroms. It'd be good to take the jobs out earlier # somehow. Probably Angie could figure out a way to add a file existence # test in a line of the perl script above. The altInFile is missing in this # case. # Concatenate cluster output and load it into the database. ssh hgwdev cd /cluster/store1/sugnet/altSplice/hg18-2007.01.19/orthoSpliceExoniphy/orthoSplice cat agx/*.agx > hg18.mm8.t3.exoniphy.agx cp ~/kent/src/hg/lib/altGraphX.sql . hgLoadBed -notItemRgb -sqlTable=altGraphX.sql hg18 altGraphX2 hg18.mm8.t3.exoniphy.agx # clean up disk space we're not using rm hg18.mm7.all.chain hg18.mm7.net.gz nets/* chains/* # end AltGraphX2 track. 
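# (Addendum, illustrative only - not run for this build.)  The two
# crashed orthoSplice jobs above came from missing per-chrom .agx input
# on small random chroms; the job-generating perl script could simply
# skip those chroms with a file-existence test.  The script name
# makeRunChecked.sh below is hypothetical; everything else mirrors the
# makeRun.sh used above.
cat << '_EOF_' > makeRunChecked.sh
#!/usr/bin/perl -w
# same as makeRun.sh above, but only emit a job line when the per-chrom
# altGraphX input file actually exists
open(IN, 'chromSizes.tab') or die "Can't open chromSizes.tab\n";
while(<IN>) {
    chomp;
    @w = split;
    $agx = "../../hg18/agxs/hg18.$w[0].agx";
    next unless -e $agx;    # skip chroms with no altGraphX input
    print "orthoSplice -chromSize=$w[1] -exonFile=../hg18.exoniphy.bed -trumpNum=3 -chrom=$w[0] -altInFile=$agx -orthoAgxFile=../../mm8/mm8.agx -db=hg18 -orthoDb=mm8 -netFile=nets/$w[0].net -chainFile=chains/$w[0].chain -commonFile=agx/$w[0].hg18.mm8.cons.t3.agx -reportFile=report/$w[0].hg18.report -edgeFile=report/$w[0].hg18.edge.report >& logs/$w[0].test.log\n";
}
'_EOF_'
chmod 755 makeRunChecked.sh
./makeRunChecked.sh > orthoSplice.para.spec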
#################################################################### # EXONWALK2 TRACK (kent) Tue Jan 24 2007 # first make altGraphX2 track (see above) ssh hgwdev cd /cluster/store1/sugnet/altSplice/hg18-2007.01.19/orthoSpliceExoniphy mkdir exonWalk mkdir beds cd exonWalk mkdir beds foreach file (`ls ../orthoSplice/agx/*.agx`) set base=`basename $file .agx` echo "exonWalk db=hg18 minPercent=0 trumpSize=100000 $file beds/$base.bed" >> exonWalk.para.spec end # Execute para spec as batch file since wants database access. # takes about 2.5 hours #para create exonWalk.para.spec #para push #cat beds/*.bed > hg18.mm7.cons.t3.exoniphy.bed time tcsh -efx exonWalk.para.spec #8256.940u 21.747s 2:18:07.32 99.8% 0+0k 0+0io 0pf+0w mkdir orfs cd orfs mkdir bedOrf beds fa borf genePred cd beds # cp /cluster/store1/sugnet/store1/altSplice/hg17-2005.01.09/orthoSpliceExonify/exonWalk/orfs.mrna2/*.sh ./ cat ../../beds/*.bed | splitFile stdin 500 exonWalk. cd .. cat << '_EOF_' > makeFa.sh #!/bin/sh for file in "$@" do base=`basename $file` echo "Doing $file" echo "sequenceForBed -db=hg18 -bedIn=$file -fastaOut=fa/$base.fa " sequenceForBed -db=hg18 -bedIn=$file -fastaOut=fa/$base.fa done '_EOF_' chmod 755 makeFa.sh makeFa.sh beds/* cat << '_EOF_' > makeBorf.sh #!/bin/sh for file in "$@" do base=`basename $file` echo "Doing $file" echo "borfBig $file borf/$base.borf " borfBig $file borf/$base.borf done '_EOF_' chmod 755 makeBorf.sh makeBorf.sh fa/*.fa # Alternatively do this on the cluster. It takes a little doing to # get a version of bestorf set up to be cluster accessible. I # just copied it in from /projects/compbio/bin/borf, including # copying in some binary fiels that script referenced. # As a parasol job on kk, here's what para time said: CPU time in finished jobs: 51577s 859.61m 14.33h 0.60d 0.002 y IO & Wait Time: 25442s 424.04m 7.07h 0.29d 0.001 y Average job time: 132s 2.19m 0.04h 0.00d Longest running job: 0s 0.00m 0.00h 0.00d Longest finished job: 179s 2.98m 0.05h 0.00d Submission to last job: 307s 5.12m 0.09h 0.00d cat << '_EOF_' > makeGenePred.sh #!/bin/sh for file in "$@" do base=`basename $file` borfMatcher -keepNmd beds/$base borf/$base.borf bedOrf/$base.bed genePred/$base.gp done '_EOF_' chmod 755 makeGenePred.sh makeGenePred.sh beds/* cat beds/* > hg18.mm7.exonWalk.bed cat genePred/*.gp | ldHgGene -predTab hg18 exonWalk2 stdin cat << '_EOF_' > makeNoNmdGenePred.sh #!/bin/sh for file in "$@" do base=`basename $file` /cluster/home/sugnet/bin/i386/borfMatcher beds/$base borf/$base.borf bedOrfNoNmd/$base.bed genePredNoNmd/$base.gp done '_EOF_' mkdir bedOrfNoNmd genePredNoNmd chmod 755 ./makeNoNmdGenePred.sh wc beds/* 275987 3311844 57319256 total wc genePredNoNmd/*.gp 169203 1692030 59907679 total wc genePred/*.gp 225252 2252520 83619240 total cat genePred/*.gp > hg18.mm7.exonWalk.nmd.gp cat genePredNoNmd/*.gp > hg18.mm7.exonWalk.noNmd.gp cat beds/* > hg18.mm7.exonWalk.all.bed # Plain "exonWalk" track is the only one used on regular genome browser. ldHgGene -predTab hg18 exonWalk hg18.mm7.exonWalk.noNmd.gp hgLoadBed hg18 exonWalkAll hg18.mm7.exonWalk.all.bed ldHgGene -predTab hg18 exonWalkWithNmd hg18.mm7.exonWalk.nmd.gp cat hg18.mm7.exonWalk.noNmd.gp | cut -f 1,2 -d '.' | sort | uniq -c | sort -rnk 1 > counts.txt Q1 1.000000 median 3.000000 Q3 7.000000 average 10.670556 min 1.000000 max 3844.000000 count 15857 total 169203.000000 standard deviation 63.330761 cat hg18.mm7.exonWalk.nmd.gp | cut -f 1,2 -d '.' 
| sort | uniq -c | sort -rnk 1 > counts.txt ave counts.txt Q1 1.000000 median 3.000000 Q3 8.000000 average 14.037891 min 1.000000 max 7278.000000 count 16046 total 225252.000000 standard deviation 99.406890 trackGenome hg18 all refGene:cds trackGenome.spec Track Specification track overlap track cov track new cum size size geno track cov cov cov ----------------------------------------------------------------------------- exonWalk:cds 31207765 27951670 1.00% 89.57% 90.24% 90.24% 90.24% # end ExonWalk track. #################################################################### # LOAD ENSEMBL GENES (DONE, 2006-05-02, Fan) # ADDED STABLE URL TO TRACKDB (DONE, 2006-05-29, hartera) # ADDED RELEASE ALPHA AND RELEASE BETA VERSIONS OF TRACK ENTRY IN # trackDb.ra SO THAT CORRECT ENSEMBL BUILD VERSION DISPLAYED AND LINKED TO # AS DIFFERENT ENSEMBL BUILDS ON RR AND HGWDEV (DONE, 2007-09-25, hartera) mkdir /cluster/data/hg18/bed/ensembl cd /cluster/data/hg18/bed/ensembl # Get the ensembl protein data from # http://www.ensembl.org/Homo_sapiens/martview # Follow this sequence through the pages: # Page 1) Make sure that the Homo_sapiens choice is selected. Hit next. # Page 2) Uncheck the "Limit to" box in the region choice. Then hit next. # Page 3) Choose the "Structures" box. # Page 4) Choose GTF as the ouput. choose gzip compression. hit export. # Save as ensemblGene.gtf.gz # Ensembl handles random chromosomes differently than us, so we # strip this data. Fortunately it just loses a couple of genes. # Add "chr" to front of each line in the gene data gtf file to make # it compatible with our software. # Finally, get rid of the ".1" or ".2" after the name gunzip -c ensemblGene.gtf.gz \ |sed -e 's/c22_H2/22_h2_hap1/'\ |sed -e 's/c5_H2/5_h2_hap1/'\ |sed -e 's/c6_COX/6_cox_hap1/'\ |sed -e 's/c6_QBL/6_qbl_hap2/'\ | perl -wpe 's/^([0-9]|X|Y|Un|MT|5_h2_hap1|22_h2_hap1|6_cox_hap1|6_qbl_hap2)/chr$1/ || die "Line $. doesnt start with human chrom:\n$_"' \ | sed -e 's/\..\"/\"/g' \ | sed -e 's/chrMT/chrM/' \ > ensGene.gtf ssh hgwdev cd /cluster/data/hg18/bed/ensembl # Remove hap chroms entries because Ensembl is using different genomic coordinates. fgrep -v hap ensGene.gtf > ensGeneNew.gtf /cluster/bin/i386/ldHgGene hg18 ensGene ensGeneNew.gtf # Read 58424 transcripts in 1014240 lines in 1 files # 58424 groups 25 seqs 1 sources 4 feature types # 58424 gene predictions # ensGtp associates geneId/transcriptId/proteinId for hgPepPred and # hgKnownToSuper. Use ensMart to create it as above, except: # Page 3) Choose the "Features" box. In "Ensembl Attributes", check # Ensembl Gene ID, Ensembl Transcript ID, Ensembl Peptide ID. # Choose Text, tab-separated as the output format. Result name ensGtp. # Save file as ensGtp.txt.gz gunzip ensGtp.txt.gz hgsql hg18 < ~/kent/src/hg/lib/ensGtp.sql # remove header line from ensGtp.txt echo "load data local infile 'ensGtp.txt' into table ensGtp ignore 1 lines" | hgsql -N hg18 # Load Ensembl peptides: # Get them from ensembl as above in the gene section except for # Page 2) Choose protein_coding for gene type # Page 3) Choose the "Sequences" box. # Page 4) check Ensembl Gene ID, Transcript ID, and Peptid ID, uncheck chrom, Transcripts/Proteins. Peptide. Format = FASTA. 
# Save file as ensemblPep.fa.gz gunzip ensemblPep.fa.gz hgPepPred hg18 ensembl ensemblPep.fa # Added stable archive URL for Ensembl v38 to human/hg18/trackDb.ra # (2006-05-29, hartera) # Changed url line for ensGene entry to: # url http://apr2006.archive.ensembl.org/perl/transview?transcript=$$ # (2007-09-25, hartera) # Created a release beta version of this track in human/hg18/trackDb.ra # with the ensArchive setting set to apr2006 to create the correct URL # as above and add the correct version (version 38) in the label: track ensGene release beta shortLabel Ensembl Genes longLabel Ensembl (Build 38) Gene Predictions group genes priority 40 visibility hide color 150,0,0 type genePred ensPep ensArchive apr2006 # A separate trackDb entry (release alpha) was made for the updated # track on hgwdev which is Build 46 (aug2007). This means that the # correct version will be displayed and the correct links made on both # the RR and hgwdev. # Create knownToEnsembl column (updated 2007-11-15 - Jim Kent) hgMapToGene hg18 ensGene knownGene knownToEnsembl # QA NOTE [ASZ: 9-11-2006]: mytouch on ensGtp and ensPep. This is because # ensGene was updated later than they were. Ensembl treats hap chroms # differently than we do. So the ensGene table was reloaded. # sudo mytouch hg18 ensGtp 200605241000.00 # sudo mytouch hg18 ensPep 200605241000.00 # SGP GENES (DONE 5/3/06 Fan) # See below for: SGP GENES Update (DONE - 2007-10-02 - Hiram) ssh hgwdev mkdir /cluster/data/hg18/bed/sgp cd /cluster/data/hg18/bed/sgp foreach chr (`awk '{print $1;}' ../../chrom.sizes`) wget http://genome.imim.es/genepredictions/H.sapiens/golden_path_200603/SGP/$chr.gtf wget http://genome.imim.es/genepredictions/H.sapiens/golden_path_200603/SGP/$chr.prot end ldHgGene -gtf -genePredExt hg18 sgpGene chr*.gtf # VEGA LIFT FROM HG17 (DONE 5/22/06 acs) # This can be replaced when the new version comes out (Tim Hubbard says soon) ssh hgwdev cd /cluster/store8/ensembl/vega33_35f # there's a bad record at the top of both of these files awk 'NF == 15 ' vegaGene.gp > tmp.gp awk 'NF == 15 ' vegaPseudo.gp > tmp2.gp zcat /gbdb/hg17/liftOver/hg17ToHg18.over.chain.gz | liftOver tmp.gp stdin vegaGeneHg18.gp unMapped.gp -genePred # only 6 dropped zcat /gbdb/hg17/liftOver/hg17ToHg18.over.chain.gz | liftOver tmp2.gp stdin vegaPseudoGeneHg18.gp unMappedPseudo.gp -genePred # only 11 dropped ldHgGene hg18 vegaGene -predTab vegaGeneHg18.gp -genePredExt ldHgGene hg18 vegaPseudoGene -predTab vegaPseudoGeneHg18.gp -genePredExt hgsql hg18 -N -B < /cluster/home/acs/kent/src/hg/lib/vegaInfo.sql echo 'load data local infile "vegaInfo.tab" into table vegaInfo' | hgsql hg18 -N -B # SYNTENIC NETS FOR PANTRO2, RHEMAC2, MM8, RN4, AND CANFAM2 AS COMPOSITE TRACK (DONE 5/22/06 acs) # (for use in defining orthologs for macaque paper) ssh hgwdev # load syntenic nets created previously by Robert hgLoadNet hg18 netSyntenyPanTro2 /cluster/data/hg18/bed/blastz.panTro2/axtChain/hg18.panTro2.syn.net zcat /cluster/data/hg18/bed/blastz.rheMac2/axtChain/hg18.rheMac2.syn.net.gz | hgLoadNet hg18 netSyntenyRheMac2 stdin zcat /cluster/data/hg18/bed/blastz.mm8/axtChain/hg18.mm8.syn.net.gz | hgLoadNet hg18 netSyntenyMm8 stdin zcat /cluster/data/hg18/bed/blastz.rn4/axtChain/hg18.rn4.syn.net.gz | hgLoadNet hg18 netSyntenyRn4 stdin zcat /cluster/data/hg18/bed/blastz.canFam2/axtChain/hg18.canFam2.syn.net.gz | hgLoadNet hg18 netSyntenyCanFam2 stdin # add more distant vertebrates to track so we can evaluate # syntenic netting for multiple alignment (2007-03-10 kate) cd /cluster/data/hg18/bed 
netFilter -syn blastz.danRer4/axtChain/hg18.danRer4.net.gz | \ hgLoadNet hg18 netSyntenyDanRer4 stdin netFilter -syn blastz.galGal3/axtChain/hg18.galGal3.net.gz | \ hgLoadNet hg18 netSyntenyGalGal3 stdin netFilter -syn blastz.monDom4/axtChain/hg18.monDom4.net.gz | \ hgLoadNet -warn hg18 netSyntenyMonDom4 stdin netFilter -syn blastz.ornAna1/axtChain/hg18.ornAna1.net.gz | \ hgLoadNet hg18 netSyntenyOrnAna1 stdin netFilter -syn blastz.anoCar1/axtChain/hg18.anoCar1.net.gz | \ hgLoadNet hg18 netSyntenyAnoCar1 stdin netFilter -syn blastz.xenTro2/axtChain/hg18.xenTro2.net.gz | \ hgLoadNet hg18 netSyntenyXenTro2 stdin netFilter -syn blastz.fr2/axtChain/hg18.fr2.net.gz | \ hgLoadNet hg18 netSyntenyFr2 stdin netFilter -syn blastz.equCab1/axtChain/hg18.equCab1.net.gz | \ hgLoadNet hg18 netSyntenyEquCab1 stdin netFilter -syn blastz.bosTau3/axtChain/hg18.bosTau3.net.gz | \ hgLoadNet -warn hg18 netSyntenyBosTau3 stdin netFilter -syn blastz.oryLat1/axtChain/hg18.oryLat1.net.gz | \ hgLoadNet hg18 netSyntenyOryLat1 stdin cat > netCov.csh << 'EOF' #!/bin/csh -ef foreach db (PanTro2 RheMac2 Mm8 Rn4 CanFam2 EquCab1 BosTau3 MonDom4 OrnAna1 GalGal3 AnoCar1 XenTro2 DanRer4 Fr2 OryLat1) echo -n " " featureBits -countGaps -chrom=chr1 hg18 refGene:cds net$db -enrichment featureBits -countGaps -chrom=chr1 hg18 refGene:cds netSynteny$db -enrichment echo "" end 'EOF' csh netCov.csh >&! netCov.log & cat netCov.log #refGene:cds 1.282%, netPanTro2 99.979%, both 1.282%, cover 100.00%, enrich 1.00x #refGene:cds 1.282%, netSyntenyPanTro2 99.978%, both 1.282%, cover 100.00%, enrich 1.00x #refGene:cds 1.282%, netRheMac2 99.970%, both 1.282%, cover 100.00%, enrich 1.00x #refGene:cds 1.282%, netSyntenyRheMac2 99.961%, both 1.282%, cover 99.97%, enrich 1.00x #refGene:cds 1.282%, netMm8 98.650%, both 1.278%, cover 99.69%, enrich 1.01x #refGene:cds 1.282%, netSyntenyMm8 98.352%, both 1.255%, cover 97.89%, enrich 1.00x #refGene:cds 1.282%, netRn4 98.404%, both 1.281%, cover 99.89%, enrich 1.02x #refGene:cds 1.282%, netSyntenyRn4 98.074%, both 1.258%, cover 98.10%, enrich 1.00x #refGene:cds 1.282%, netCanFam2 99.527%, both 1.281%, cover 99.91%, enrich 1.00x #refGene:cds 1.282%, netSyntenyCanFam2 99.274%, both 1.272%, cover 99.16%, enrich 1.00x #refGene:cds 1.282%, netEquCab1 99.457%, both 1.281%, cover 99.87%, enrich 1.00x #refGene:cds 1.282%, netSyntenyEquCab1 99.020%, both 1.270%, cover 99.06%, enrich 1.00x #refGene:cds 1.282%, netBosTau3 99.641%, both 1.282%, cover 100.00%, enrich 1.00x #refGene:cds 1.282%, netSyntenyBosTau3 99.493%, both 1.280%, cover 99.81%, enrich 1.00x #refGene:cds 1.282%, netMonDom4 98.718%, both 1.279%, cover 99.72%, enrich 1.01x #refGene:cds 1.282%, netSyntenyMonDom4 98.029%, both 1.260%, cover 98.26%, enrich 1.00x #refGene:cds 1.282%, netOrnAna1 68.119%, both 1.168%, cover 91.06%, enrich 1.34x #refGene:cds 1.282%, netSyntenyOrnAna1 56.729%, both 0.714%, cover 55.67%, enrich 0.98x #refGene:cds 1.282%, netGalGal3 82.246%, both 1.189%, cover 92.68%, enrich 1.13x #refGene:cds 1.282%, netSyntenyGalGal3 80.379%, both 1.101%, cover 85.86%, enrich 1.07x #refGene:cds 1.282%, netAnoCar1 63.263%, both 1.128%, cover 87.97%, enrich 1.39x #refGene:cds 1.282%, netSyntenyAnoCar1 54.068%, both 0.816%, cover 63.65%, enrich 1.18x #refGene:cds 1.282%, netXenTro2 45.072%, both 1.057%, cover 82.44%, enrich 1.83x #refGene:cds 1.282%, netSyntenyXenTro2 31.985%, both 0.596%, cover 46.44%, enrich 1.45x #refGene:cds 1.282%, netDanRer4 28.211%, both 1.012%, cover 78.87%, enrich 2.80x #refGene:cds 1.282%, netSyntenyDanRer4 7.631%, 
both 0.177%, cover 13.83%, enrich 1.81x #refGene:cds 1.282%, netFr2 26.938%, both 0.975%, cover 76.03%, enrich 2.82x #refGene:cds 1.282%, netSyntenyFr2 7.991%, both 0.200%, cover 15.62%, enrich 1.95x # Conclusion: CDS coverage loss is small in all placentals and opossum, so # use syntenic net mafs for these in multiz. # Ask about chicken -- it's marginal # Robert prepped synMafNet's for some species, but the files lack # soft-masked sequence, so redo if time. # (set up trackDb.ra entry for composite track) # SYNTENIC NET MAFS FOR MULTIZ (2007-03-09 kate) # Compare with Robert's ssh kkstore02 cd /cluster/data/hg18/bed/blastz.rheMac2 mv mafSynNet mafSynNet.robert ~/kent/src/hg/utils/automation/doBlastzChainNet.pl DEF \ -syntenicNet -continue syntenicNet >&! synnet.log & ssh kkstore02 cd /cluster/data/hg18/bed/blastz.panTro2 # need DEF file for syntenic net, but this was # a swapped run, so we will simulate cp /cluster/data/panTro2/bed/blastz.hg18/DEF . # edit to reverse target and query, and change BASE dir ~/kent/src/hg/utils/automation/doBlastzChainNet.pl DEF \ -syntenicNet -continue syntenicNet >&! synnet.log & rm DEF # edit DEF file to reference kolossus-accessible sequence and chrom.sizes cd /cluster/data/hg18/bed/blastz.monDom4 ~/kent/src/hg/utils/automation/doBlastzChainNet.pl DEF \ -syntenicNet -continue syntenicNet >&! synnet.log & cd /cluster/data/hg18/bed/blastz.equCab1 ~/kent/src/hg/utils/automation/doBlastzChainNet.pl DEF \ -syntenicNet -continue syntenicNet >&! synnet.log & cd /cluster/data/hg18/bed/blastz.bosTau3 ~/kent/src/hg/utils/automation/doBlastzChainNet.pl DEF \ -syntenicNet -continue syntenicNet >&! synnet.log & cd /cluster/data/hg18/bed/blastz.mm8 cp /cluster/data/mm8/bed/blastz.hg18/DEF . # edit to reverse target & query, change BASE ~/kent/src/hg/utils/automation/doBlastzChainNet.pl DEF \ -syntenicNet -continue syntenicNet >&! synnet.log rm -f DEF cd /cluster/data/hg18/bed/blastz.rn4 ~/kent/src/hg/utils/automation/doBlastzChainNet.pl DEF \ -syntenicNet -continue syntenicNet >&! synnet.log cd /cluster/data/hg18/bed/blastz.canFam2 ~/kent/src/hg/utils/automation/doBlastzChainNet.pl DEF \ -syntenicNet -continue syntenicNet >&! synnet.log & # use syntenic net on opossum too cd /cluster/data/hg18/bed/blastz.monDom4 ~/kent/src/hg/utils/automation/doBlastzChainNet.pl DEF \ -syntenicNet -continue syntenicNet >&! synnet.log & # NET AND RECIPROCAL BEST TABLES FOR 2X MAMMALS # load net and reciprocal best net for comparison # rabbit ssh hgwdev cd /cluster/data/hg18/bed/blastz.oryCun1/axtChain netFilter -minGap=10 hg18.oryCun1.net | hgLoadNet -warn hg18 netOryCun1 stdin netFilter -minGap=10 hg18.oryCun1.rbest.net.gz | \ hgLoadNet -warn hg18 netRBestOryCun1 stdin # tenrec ssh hgwdev cd /cluster/data/hg18/bed/blastz.echTel1/axtChain netFilter -minGap=10 hg18.echTel1.net.gz | hgLoadNet -warn hg18 netEchTel1 stdin netFilter -minGap=10 hg18.echTel1.rbest.net.gz | \ hgLoadNet -warn hg18 netRBestEchTel1 stdin # net coverage ssh hgwdev cd /cluster/data/hg18/bed cat > netRBestCov.csh << 'EOF' #!/bin/csh -ef foreach db (OtoGar1 OryCun1 CavPor2 LoxAfr1 EchTel1 DasNov1) echo -n " " featureBits -countGaps -chrom=chr1 hg18 refGene:cds net$db -enrichment featureBits -countGaps -chrom=chr1 hg18 refGene:cds netRBest$db -enrichment echo "" end 'EOF' # << emacs csh netRBestCov.csh >&! 
netRBestCov.log & ########################################################################## # EVOFOLD (Done, 05/12/06) Jakob Skou Pedersen # RNA secondary structure predictions lifted from hg17 and filtered ssh -C hgwdev mkdir -p /cluster/data/hg18/bed/evofold cd /cluster/data/hg18/bed/evofold echo "select chrom, chromStart, chromEnd, name, score, strand, size, secStr, conf from evofold;" | hgsql hg17 | sed -e 1d > foldsHg17.bed liftOver -minMatch=1.0 foldsHg17.bed /cluster/data/hg17/bed/liftOver/hg17ToHg18.over.chain.gz tmp.bed unmapped.bed # remove elements which are wrong size after lifting awk '$3-$2 == $7' tmp.bed | sort -k4,4 > rawFoldsHg18.bed # structure filters # first, remove pairs that can't form in human cut -f 1-6 rawFoldsHg18.bed > tmp.bed # sequenceForBed can be found and compiled from here: $HOME/kent/src/hg/altSplice/altSplice/ nice /cluster/home/sugnet/bin/i386/sequenceForBed -db=hg18 -bedIn=tmp.bed -fastaOut=tmp.fa cat tmp.fa | sed -e 's/\.[+-]\.chr.*$//' \ | sed -e '/^>/s/$/\t/' | tr -d '\n' | sed -e 's/>/\n/g' | sed -e '1d' -e '$s/$/\n/' | sort -k1,1 > foldsHg18Seq.tab join -1 4 -2 1 -o "1.4 1.8 2.2" rawFoldsHg18.bed foldsHg18Seq.tab | sed -e 's/ */\t/g' | sort -k1,1 \ | /cluster/home/jsp/scripts/tabFoldFilter.py > cleanFolds.tab join -1 4 -2 1 -o "1.1 1.2 1.3 1.4 1.5 1.6 1.7 2.2 1.9" rawFoldsHg18.bed cleanFolds.tab | sed -e 's/ */\t/g' > tmp1.bed # second, remove poor predictions # scripts can be found in cvs tree at: cvsroot/jsp/scripts/. They use a few modules which can be found at: cvsroot/jsp/py_modules cat tmp1.bed | /cluster/home/jsp/scripts/bedRnassFilter.py --dangling --minAvrStemSize=3 | /cluster/home/jsp/scripts/bedRnassFilter.sh 1 3 \ | /cluster/home/jsp/scripts/roundListFloats.py -c9 > foldsHg18.bed # clean up rm tmp.bed tmp1.bed foldsHg17.bed foldsHg18Seq.tab rawFoldsHg18.bed tmp.fa cleanFolds.tab # upload hgLoadBed -notItemRgb -sqlTable=$HOME/kent/src/hg/lib/evofold.sql hg18 evofold foldsHg18.bed ######################################################################### # BLASTZ CHICKEN galGal3 (DONE 5/23/06 angie) ssh pk mkdir /cluster/data/hg18/bed/blastz.galGal3.2006-05-22 cd /cluster/data/hg18/bed/blastz.galGal3.2006-05-22 cat << '_EOF_' > DEF # human vs chicken BLASTZ=blastz.v7.x86_64 # Specific settings for chicken (per Webb email to Brian Raney) BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=10000 BLASTZ_K=2200 BLASTZ_Q=/cluster/data/blastz/HoxD55.q BLASTZ_ABRIDGE_REPEATS=1 # TARGET: Human hg18 SEQ1_DIR=/scratch/hg/hg18/nib SEQ1_SMSK=/cluster/bluearc/hg18/linSpecRep/notInOthers SEQ1_LEN=/scratch/hg/hg18/chrom.sizes SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY: Chicken galGal3 - single chunk big enough to run entire chrom SEQ2_DIR=/san/sanvol1/galGal3/nib SEQ2_LEN=/cluster/data/galGal3/chrom.sizes SEQ2_SMSK=/san/sanvol1/galGal3/linSpecRep SEQ2_CHUNK=200000000 SEQ2_LAP=0 BASE=/cluster/data/hg18/bed/blastz.galGal3.2006-05-22 '_EOF_' # << emacs ~/kent/src/utils/doBlastzChainNet.pl DEF \ -bigClusterHub=pk -smallClusterHub=pk \ -chainMinScore=5000 -chainLinearGap=loose \ >& do.log & tail -f do.log ln -s blastz.galGal3.2006-05-22 /cluster/data/hg18/bed/blastz.galGal3 # running syntenicNet 2008-10-30 # had to update the DEF file to correspond to new hive layout cd /cluster/data/hg18/bed/blastz.galGal3.2006-05-22 mv DEF DEF.0 cat << '_EOF_' > DEF # human vs chicken BLASTZ=blastz.v7.x86_64 # Specific settings for chicken (per Webb email to Brian Raney) BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=10000 BLASTZ_K=2200 BLASTZ_Q=/scratch/data/blastz/HoxD55.q 
BLASTZ_ABRIDGE_REPEATS=1 # TARGET: Human hg18 SEQ1_DIR=/scratch/data/hg18/nib SEQ1_SMSK=/scratch/data/hg18/linSpecRep/notInMouseRat SEQ1_LEN=/scratch/data/hg18/chrom.sizes SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY: Chicken galGal3 - single chunk big enough to run entire chrom SEQ2_DIR=/scratch/data/galGal3/nib SEQ2_LEN=/scratch/data/galGal3/chrom.sizes SEQ2_SMSK=/scratch/data/galGal3/linSpecRep SEQ2_CHUNK=200000000 SEQ2_LAP=0 BASE=/hive/data/genomes/hg18/bed/blastz.galGal3.2006-05-22 '_EOF_' # << happy emacs time nice -n +19 doBlastzChainNet.pl `pwd`/DEF \ -bigClusterHub=swarm -smallClusterHub=memk \ -continue=syntenicNet -syntenicNet \ -chainMinScore=5000 -chainLinearGap=loose > synNet.log 2>&1 # worked OK in about 3 minutes ######################################################################### # REGULATORY POTENTIAL (DONE - 2006-06-09 - Hiram) # download data from "James Taylor" ssh kkstore02 mkdir /cluster/data/hg18/bed/regPotential7X cd /cluster/data/hg18/bed/regPotential7X # This is a lot of data for C in 1 2 3 4 5 6 7 8 9 X Y 10 11 12 13 14 15 16 17 18 19 20 21 22 do wget --timestamping \ "http://www.bx.psu.edu/~james/esperr_rp_7way_scores/genome_scores_hg18/chr${C}.scores.truncated.bz2" done wget --timestamping \ "http://www.bx.psu.edu/~james/esperr_rp_7way_scores/genome_scores_hg18/trackDb.html" -O description.html time for C in 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 X Y do bzcat chr${C}.scores.truncated.bz2 done | wigEncode -noOverlap stdin regPotential7X.wig regPotential7X.wib # Converted stdin, upper limit 1.00, lower limit 0.00 # real 23m27.454s # user 22m41.058s # sys 0m41.850s # Loading the table on hgwdev ssh hgwdev cd /cluster/data/hg18/bed/regPotential7X ln -s /cluster/data/hg18/bed/regPotential7X/regPotential7X.wib \ /gbdb/hg18/wib/regPotential7X.wib # using the tmpDir is faster since it is on local disk and it will # clean up any temporary .tab file it creates there time hgLoadWiggle -tmpDir=/scratch/tmp \ hg18 regPotential7X regPotential7X.wig # How about a histogram of the data. # find min and max for everything to verify it is 0 to 1 ssh kkstore02 cd /cluster/data/hg18/bed/regPotential7X time for C in 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 X Y do echo " ============ ${C} =======================" bzcat chr${C}.scores.truncated.bz2 | ave -col=2 stdin done > stats.all 2>&1 grep "^min" stats.all | sort -u # min 0.000000 grep "^max" stats.all | sort -u # max 1.000000 ssh kolossus cd /cluster/data/hg18/bed/regPotential7X time hgWiggle -verbose=2 -doHistogram -hBinSize=0.01 -hBinCount=100 \ -hMinVal=0.0 -db=hg18 regPotential7X > histogram.data 2>&1 # real 2m42.311s # 73 % of the data values are zero # create download gzip files from the bz2 files: ssh kkstore02 cd /cluster/data/hg18/bed/regPotential7X for F in chr*.scores.truncated.bz2 do C=`echo $F | awk -F'.' '{print $1}'` echo -n "${C}.regPotential7X.hg18.gz working ... " bzcat ${F} | gzip > ${C}.regPotential7X.hg18.gz echo done ######################################################################### # create md5sum.txt under bigZips (DONE, 6/7/06, Fan) cd /cluster/store11/gs.19/build36/downloads/bigZips md5sum *.zip *.2bit README.txt > md5sum.txt ######################################################################### # UPDATE BACENDS track (DONE - 2006-06-16 - Hiram) # An attempt to recover some of the missing clones from the # bacEnds track. 
It turns out the perl processing script wasn't # properly catagorizing all the clone ends, thus a lot of them # were being left out of the final track ssh hgwdev mkdir /cluster/data/hg18/bed/updateCloneEnds cd /cluster/data/hg18/bed/updateCloneEnds ln -s ../cloneend/all.txt.gz . # Checked this script into the source tree and fixed it up to # recognize more of the catagories of clone ends zcat all.txt.gz | $HOME/kent/src/hg/utils/cloneEndParse.pl /dev/stdin # Reading in end info # Writing out pair info # Writing out singleton info # 301377 pairs and 204698 singles # Note that there are none marked at "unclassified" - this script # will print out that message to stderr if it doesn't recognize # any marker classifications. This produces the files: # -rw-rw-r-- 1 9645568 Jun 16 14:09 cloneEndPairs.txt # -rw-rw-r-- 1 4906468 Jun 16 14:09 cloneEndSingles.txt wc -l clone*.txt # 301377 cloneEndPairs.txt # 204698 cloneEndSingles.txt # This is a lot better than previous: wc -l ../cloneend/cloneEnd*.txt # 249619 ../cloneend/cloneEndPairs.txt # 318500 ../cloneend/cloneEndSingles.txt mkdir /san/sanvol1/scratch/hg18/updateBacEnds cd /san/sanvol1/scratch/hg18/updateBacEnds ln -s ../bacends/bacEnds.sorted.psl . ln -s ../bacends/lifted . pslPairs -tInsert=10000 -minId=0.91 -noBin -min=25000 -max=350000 \ -slopval=10000 -hardMax=500000 -slop -short -long -orphan \ -mismatch -verbose bacEnds.sorted.psl \ /cluster/data/hg18/bed/updateCloneEnds/cloneEndPairs.txt \ all_bacends bacEnds echo -e \ 'chr\tstart\tend\tclone\tscore\tstrand\tall\tfeatures\tstarts\tsizes' > header echo -e '10\t10N\t10N\t10\t10N\t10\t10\t10N\t10\t10' >> header cat header bacEnds.pairs | row score ge 300 | sorttbl chr start \ | headchg -del > bacEndPairs.bed cat header bacEnds.slop bacEnds.short bacEnds.long \ bacEnds.mismatch bacEnds.orphan \ | row score ge 300 | sorttbl chr start | headchg -del \ > bacEndPairsBad.bed extractPslLoad -noBin bacEnds.sorted.psl bacEndPairs.bed \ bacEndPairsBad.bed | \ sorttbl tname tstart | headchg -del > bacEnds.load.psl # looks like we are getting a lot more now in every catagory: wc -l bacEnds.* bacEndPairs* | sort -n 49 bacEnds.long 1399 bacEnds.mismatch 4516 bacEnds.slop 7202 bacEnds.short 66861 bacEnds.orphan 78900 bacEndPairsBad.bed 205443 bacEndPairs.bed 207997 bacEnds.pairs 1727387 bacEnds.load.psl # Previously: wc -l ../bacends/bacEnds.* ../bacends/bacEndPairs* | sort -n 40 ../bacends/bacEnds.long 1061 ../bacends/bacEnds.mismatch 3954 ../bacends/bacEnds.slop 6279 ../bacends/bacEnds.short 59245 ../bacends/bacEnds.orphan 69788 ../bacends/bacEndPairsBad.bed 159268 ../bacends/bacEndPairs.bed 161251 ../bacends/bacEnds.pairs 1249956 ../bacends/bacEnds.load.psl # Move the previous build out of the way and copy these # results over to the primary hg18 bed location: mv /cluster/data/hg18/bed/bacends /cluster/data/hg18/bed/bacends.2006-02-02 mkdir /cluster/data/hg18/bed/bacends cp -p bacEnd* /cluster/data/hg18/bed/bacends cp -p lifted/bacEnds.lifted.psl /cluster/data/hg18/bed/bacends # load them into the database ssh hgwdev cd /cluster/data/hg18/bed/bacends # CHECK bacEndPairs.bed ID's to make sure they have no blanks in them awk '{print $5}' bacEndPairs.bed | sort | uniq -c # result should be the scores, no extraneous strings: # 202488 1000 # 255 300 # 416 375 # 384 500 # 1900 750 # edit the file and fix it if it has a bad name. 
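# (Sketch, not part of the original QA.)  The same check can be made to
# fail loudly instead of relying on reading the uniq -c output: print
# any row whose score column is not a plain integer.
awk '$5 !~ /^[0-9]+$/ {print "bad score field, line " NR ": " $0}' bacEndPairs.bed
# no output means the file is clean and ready for the loads below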
sed -e "s/bacEndPairs /bacEndPairsUpdate /" \ $HOME/kent/src/hg/lib/bacEndPairs.sql > bacEndPairsUpdate.sql hgLoadBed -notItemRgb hg18 bacEndPairsUpdate bacEndPairs.bed \ -sqlTable=bacEndPairsUpdate.sql # Loaded 205443 elements of size 11 # Previously was: # Loaded 159268 # note - this track isn't pushed to RR, just used for assembly QA sed -e "s/bacEndPairsBad /bacEndPairsBadUpdate /" \ $HOME/kent/src/hg/lib/bacEndPairsBad.sql > bacEndPairsBadUpdate.sql hgLoadBed -notItemRgb hg18 bacEndPairsBadUpdate bacEndPairsBad.bed \ -sqlTable=bacEndPairsBadUpdate.sql # Loaded 78900 elements of size 11 # Previously was: # Loaded 69788 #hgLoadPsl hg18 -nobin -table=all_bacends bacEnds.load.psl # NOTE: truncates file to 0 if -nobin is used # NOTE: truncates file to 0 if -nobin is used hgLoadPsl hg18 -table=all_bacendsUpdate bacEnds.load.psl # no complaints ! Usually there are, this loaded: hgsql -N -e "select count(*) from all_bacendsUpdate;" hg18 # 1727387 # Previously this was: # 1249956 nice featureBits hg18 all_bacendsUpdate # 227770876 bases of 2881515245 (7.905%) in intersection nice featureBits hg18 all_bacends # 191078854 bases of 2881515245 (6.631%) in intersection nice featureBits hg17 all_bacends # 225763317 bases of 2866216770 (7.877%) in intersection nice featureBits hg18 bacEndPairsUpdate # 162690030 bases of 2881515245 (5.646%) in intersection nice featureBits hg18 bacEndPairs # 130270940 bases of 2881515245 (4.521%) in intersection nice featureBits hg17 bacEndPairs # 162099487 bases of 2866216770 (5.656%) in intersection nice featureBits hg18 bacEndPairsBadUpdate # 37326990 bases of 2881515245 (1.295%) in intersection nice featureBits hg18 bacEndPairsBad # 33650226 bases of 2881515245 (1.168%) in intersection nice featureBits hg17 bacEndPairsBad # 37437558 bases of 2866216770 (1.306%) in intersection # Renamed the new BAC End Pairs tables (7-27-2006 Brooke) mysql> alter table all_bacends rename all_bacendsOld; Query OK, 0 rows affected (0.01 sec) mysql> alter table bacEndPairs rename bacEndPairsOld; Query OK, 0 rows affected (0.00 sec) mysql> alter table all_bacendsUpdate rename all_bacends; Query OK, 0 rows affected (0.00 sec) mysql> alter table bacEndPairsUpdate rename bacEndPairs; Query OK, 0 rows affected (0.00 sec) ######################################################################### # dbSNP BUILD 126 (Heather, June 2006) # Set up directory structure ssh kkstore02 cd /cluster/data/dbSNP mkdir 126 cd 126 mkdir human cd human mkdir data mkdir schema mkdir rs_fasta # Get data from NCBI (anonymous FTP) cd /cluster/data/dbSNP/126/human/data ftp ftp.ncbi.nih.gov cd snp/organisms/human_9606/database/organism_data # ContigLoc table has coords, orientation, loc_type, and refNCBI allele get b126_SNPContigLoc_36_1.bcp.gz # ContigLocusId has function get b126_SNPContigLocusId_36_1.bcp.gz get b126_ContigInfo_36_1.bcp.gz # MapInfo has alignment weights get b126_SNPMapInfo_36_1.bcp.gz # SNP has univar_id, validation status and heterozygosity get SNP.bcp.gz # Get schema from NCBI cd /cluster/data/dbSNP/126/human/schema ftp ftp.ncbi.nih.gov cd snp/organisms/human_9606/database/organism_schema get human_9606_table.sql.gz # Get fasta files from NCBI # using headers of fasta files for molType cd /cluster/data/dbSNP/126/human/rs_fasta ftp ftp.ncbi.nih.gov cd snp/organisms/human_9606/rs_fasta mget *.gz # Simplify names of data files cd /cluster/data/dbSNP/126/human/data mv b126_SNPContigLoc_36_1.bcp.gz ContigLoc.gz mv b126_SNPContigLocusId_36_1.bcp.gz ContigLocusId.gz mv b126_ContigInfo_36_1.bcp.gz 
ContigInfo.gz mv b126_SNPMapInfo_36_1.bcp.gz MapInfo.gz mv SNP.bcp.gz SNP.gz ls -1 *.gz > filelist # edit table descriptions cd /cluster/data/dbSNP/126/human/schema # get CREATE statements from human_9606_table.sql for our 5 tables # store in table.tmp # convert and rename tables sed -f 'mssqlToMysql.sed' table.tmp > table2.tmp rm table.tmp sed -f 'tableRename.sed' table2.tmp > table.sql rm table2.tmp # Get updated UniVariation table cd /cluster/data/dbSNP/126/shared ftp ftp.ncbi.nih.gov cd snp/organisms/human_9606/database/shared_data get UniVariation.bcp.gz cd ../shared_schema get dbSNP_main_table.sql.gz # get UniVariation CREATE statement from dbSNP_main_table.sql # use mssqlToMysql.sed to convert # get header lines from rs_fasta cd /cluster/data/dbSNP/126/human/rs_fasta /bin/csh gnl.csh # add rs_fasta to seq/extFile # 2 edits first: strip header to just rsId, and remove duplicates # work on /cluster/store12 (kkstore05) which has more disk space # also for human, don't include chrUn cp rs_ch*.fas.gz /cluster/store12/snp/126/human/rs_fasta ssh kkstore05 cd /cluster/store12/snp/126/human/rs_fasta mkdir unarchive mv rs_chUn.fas.gz unarchive # concat into rsAll.fas cat << '_EOF_' > concat.csh #!/bin/csh -ef rm -f rsAll.fas foreach file (rs_ch*.fas.gz) echo $file zcat $file >> rsAll.fas end '_EOF_' # << emacs # snpCleanSeq strips the header and skips duplicates /cluster/home/heather/kent/src/hg/snp/snpLoad/snpCleanSeq rsAll.fas snp.fa rm rsAll.fas # load on hgwdev ssh hgwdev mkdir /gbdb/hg18/snp ln -s /cluster/store12/snp/126/human/rs_fasta/snp.fa /gbdb/hg18/snp/snp.fa cd /cluster/store12/snp/126/human/rs_fasta hgLoadSeq hg18 /gbdb/hg18/snp/snp.fa # look up id in extFile # move into separate table hgsql hg18 < snpSeq.sql hgsql -e 'insert into snpSeq select acc, file_offset from seq where extFile = 15200238' hg18 hgsql -e 'delete from seq where extFile = 15200238' hg18 hgsql -e 'alter table snpSeq add index acc (acc)' hg18 # clean up after hgLoadSeq rm seq.tab # load on kkr5u00 ssh kkr5u00 hgsql -e mysql 'create database hg18snp126' cd /cluster/data/dbSNP/126/human/schema hgsql hg18snp126 < table.sql cd ../data /bin/csh load.csh # note rowcount # ContigLoc 27007176 # SNP 11961761 # MapInfo 11712346 # ContigLocusId 11854143 cd /cluster/data/dbSNP/126/shared hgsql hg18snp126 < UniVariation.sql zcat UniVariation.bcp.gz | hgsql -e 'load data local infile "/dev/stdin" into table UniVariation' hg18snp126 # create working /scratch dir cd /scratch/snp mkdir 126 cd 126 mkdir human cd human # get hg18 ctgPos, load into dbSnpHumanBuild126, compare contig list between ctgPos and ContigInfo # Note: missing chrY PAR regions # get gnl files cp /cluster/data/dbSNP/126/human/rs_fasta/*.gnl . 
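# (Sketch only, not run for this build.)  One way to do the "compare
# contig list between ctgPos and ContigInfo" step noted above.  The
# column names, the version-suffix stripping, and running both queries
# from a single host are assumptions to verify against schema/table.sql
# and the actual table contents.
hgsql -N -e 'select contig from ctgPos' hg18 | sort -u > ctgPos.contigs
hgsql -N -e 'select contig_acc from ContigInfo' hg18snp126 \
    | sed -e 's/\.[0-9]*$//' | sort -u > ContigInfo.contigs
diff ctgPos.contigs ContigInfo.contigs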
# examine ContigInfo for group_term and edit pipeline.csh # use "ref_assembly" cd /scratch/snp/126/human # filter ContigLoc into ContigLocFilter # this lifts from contig coords to chrom coords # phys_pos_from is used to check coords for non-random chroms # errors reported to stdout # this gets rid of alternate assemblies (using ContigInfo) # this also gets rid of poor quality alignments (weight == 10 || weight == 0 in MapInfo) # assumes all contigs are positively oriented; will abort if not true mysql> desc ContigLocFilter; # +---------------+-------------+------+-----+---------+-------+ # | Field | Type | Null | Key | Default | Extra | # +---------------+-------------+------+-----+---------+-------+ # | snp_id | int(11) | NO | | | | # | ctg_id | int(11) | NO | | | | # | chromName | varchar(32) | NO | | | | # | loc_type | tinyint(4) | NO | | | | # | start | int(11) | NO | | | | # | end | int(11) | YES | | NULL | | # | orientation | tinyint(4) | NO | | | | # | allele | blob | YES | | NULL | | # +---------------+-------------+------+-----+---------+-------+ /cluster/home/heather/kent/src/hg/snp/snpLoad/snpContigLocFilter hg18snp126 ref_assembly reference # note rowcount # ContigLocFilter 12368145 # how many are positive strand? hopefully 90% mysql> select count(*) from ContigLocFilter where orientation = 0; # 10622168 # note count by loc_type mysql> select count(*), loc_type from ContigLocFilter group by loc_type; # +----------+----------+ # | count(*) | loc_type | # +----------+----------+ # | 205359 | 1 | # | 10678378 | 2 | # | 1464642 | 3 | # | 9025 | 4 | # | 1117 | 5 | # | 9624 | 6 | # +----------+----------+ # filter ContigLocusId into ContigLocusIdFilter /cluster/home/heather/kent/src/hg/snp/snpLoad/snpContigLocusIdFilter hg18snp126 ref_assembly # note rowcount # ContigLocusIdFilter 5812538 # condense ContigLocusIdFilter into ContigLocusIdCondense (one SNP can have multiple functions) # assumes SNPs are in numerical order; will errAbort if not true /cluster/home/heather/kent/src/hg/snp/snpLoad/snpContigLocusIdCondense hg18snp126 # note rowcount; expect about 50% for human # ContigLocusIdCondense 3975405 (note this is smaller than hg17/snp125) # could delete ContigLocusIdFilter table here # create chrN_snpFasta tables from *.gnl files # we are just using molType, but also storing class and observed # 266,366 duplicates detected in snpMoltype.errors /cluster/home/heather/kent/src/hg/snp/snpLoad/snpLoadFasta hg18snp126 # (could start using pipeline.csh here) # (pipeline.csh takes about 35 minutes to run) # split ContigLocFilter by chrom # create the first chrN_snpTmp # we will reuse this table name, adding/changing columns as we go # at this point chrN_snpTmp will have the same description as ContigLocFilter # this opens a file handle for every chrom, so will not scale to scaffold-based assemblies /cluster/home/heather/kent/src/hg/snp/snpLoad/snpSplitByChrom hg18snp126 ref_assembly # adjust coords using loc_type # possible errors logged to snpLocType.error: # Unknown locType # Between with end != start + 1 # Between with allele != '-' # Exact with end != start # Range with end < start # possible exceptions logged to snpLocType.exceptions: # RefAlleleWrongSize # This run no errors, no exceptions # I do note that out of 25K rows where loc_type == 6, 12259 have asn_from == asn_to # All of loc_type == 1, 4, 5 have zero rows where asn_from == asn_to # This was also true in build125 # morph chrN_snpTmp mysql> desc chr1_snpTmp; # +---------------+-------------+------+-----+---------+-------+ # | 
Field | Type | Null | Key | Default | Extra | # +---------------+-------------+------+-----+---------+-------+ # | snp_id | int(11) | NO | | | | # | ctg_id | int(11) | NO | | | | # | chromStart | int(11) | NO | | | | # | chromEnd | int(11) | NO | | | | # | loc_type | tinyint(4) | NO | | | | # | orientation | tinyint(4) | NO | | | | # | allele | blob | YES | | NULL | | # +---------------+-------------+------+-----+---------+-------+ /cluster/home/heather/kent/src/hg/snp/snpLoad/snpLoctype hg18snp126 ref_assembly # expand allele as necessary # report syntax errors to snpExpandAllele.errors # possible exceptions logged to snpExpandAllele.exceptions: # RefAlleleWrongSize # This run no errors, no exceptions # 8092 alleles expanded /cluster/home/heather/kent/src/hg/snp/snpLoad/snpExpandAllele hg18snp126 ref_assembly # the next few steps prepare for working in UCSC space # sort by position /cluster/home/heather/kent/src/hg/snp/snpLoad/snpSort hg18snp126 ref_assembly # rename MT --> M (pipeline.csh takes care of this) hgsql -e "rename table chrMT_snpTmp to chrM_snpTmp" hg18snp126 # get hg18 nib files # get hg18 chromInfo, load into hg18snp126 with editted path # lookup reference allele in nibs # keep reverse complement to use in error checking (snpCheckAlleles) # check here for SNPs larger than 1024 # errAbort if detected # check for coords that are too large, log to snpRefUCSC.error and skip # This run we got 30678 lines in snpRefUCSC.error # 12178 from chr14 (reported to dbSNP) # also 18423 from chr1_random and 77 from chr6_random /cluster/home/heather/kent/src/hg/snp/snpLoad/snpRefUCSC hg18snp126 # morph chrN_snpTmp mysql> desc chr1_snpTmp; # +--------------------+-------------+------+-----+---------+-------+ # | Field | Type | Null | Key | Default | Extra | # +--------------------+-------------+------+-----+---------+-------+ # | snp_id | int(11) | NO | | | | # | ctg_id | int(11) | NO | | | | # | chromStart | int(11) | NO | | | | # | chromEnd | int(11) | NO | | | | # | loc_type | tinyint(4) | NO | | | | # | orientation | tinyint(4) | NO | | | | # | allele | blob | YES | | NULL | | # | refUCSC | blob | YES | | NULL | | # | refUCSCReverseComp | blob | YES | | NULL | | # +--------------------+-------------+------+-----+---------+-------+ # compare allele from dbSNP to refUCSC # locType between is excluded from this check # log exceptions to snpCheckAllele.exceptions # if SNP is positive strand, expect allele == refUCSC # log RefAlleleMismatch if not # if SNP is negative strand, if not allele == refUCSC, then check for allele == refUCSCReverseComp # If allele == refUCSCRevComp, log RefAlleleNotRevComp # If allele doesn't match either of refUCSC or refUCSCReverseComp, log RefAlleleMismatch # This run we got: # 0 RefAlleleMismatch # 119366 RefAlleleNotRevComp # Note this is double from build125 /cluster/home/heather/kent/src/hg/snp/snpLoad/snpCheckAlleles hg18snp126 # add class and observed using univar_id from SNP table # to get class (subsnp_class) and observed (var_str) from UniVariation # log errors to snpClassAndObserved.errors # errors detected: # class = 0 in UniVariation # class > 8 in UniVariation # univar_id = 0 in SNP # no row in SNP for snp_id in chrN_snpTmp # This run we got: # 3 class = 0 in UniVariation # 0 class > 8 in UniVariation # 39059 univar_id = 0 in SNP # 879 no row in SNP for snp_id in chrN_snpTmp (all chr6) # dbSNP has class = 'in-del' # we promote this to 'deletion' for locType 1&2 and to 'insertion' for locType 3 
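# (Aside, not part of the original pipeline: the refUCSCReverseComp column
#  kept above is simply the reverse complement of refUCSC.  A minimal sh
#  sketch of that operation, using a made-up example allele:)
revComp() { echo "$1" | rev | tr 'ACGTacgt' 'TGCAtgca'; }
revComp ACCGT    # prints ACGGT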
/cluster/home/heather/kent/src/hg/snp/snpLoad/snpClassAndObserved hg18snp126 # morph chrN_snpTmp # +--------------------+---------------+------+-----+---------+-------+ # | Field | Type | Null | Key | Default | Extra | # +--------------------+---------------+------+-----+---------+-------+ # | snp_id | int(11) | NO | | | | # | chromStart | int(11) | NO | | | | # | chromEnd | int(11) | NO | | | | # | loc_type | tinyint(4) | NO | | | | # | class | varchar(255) | NO | | | | # | orientation | tinyint(4) | NO | | | | # | allele | blob | YES | | NULL | | # | refUCSC | blob | YES | | NULL | | # | refUCSCReverseComp | blob | YES | | NULL | | # | observed | blob | YES | | NULL | | # +--------------------+---------------+------+-----+---------+-------+ # generate exceptions for class and observed # SingleClassBetweenLocType # SingleClassRangeLocType # NamedClassWrongLocType # ObservedWrongFormat # ObservedWrongSize (twice as many as hg17/snp125) # ObservedMismatch (nearly 3x as many as hg17/snp125) # RangeSubstitutionLocTypeExactMatch # SingleClassTriAllelic # SingleClassQuadAllelic # This will also detect IUPAC symbols in allele /cluster/home/heather/kent/src/hg/snp/snpLoad/snpCheckClassAndObserved hg18snp126 # add function /cluster/home/heather/kent/src/hg/snp/snpLoad/snpFunction hg18snp126 # add validation status and heterozygosity # log error if validation status > 31 or missing # this run we got 8 missing /cluster/home/heather/kent/src/hg/snp/snpLoad/snpSNP hg18snp126 # add molType # errors detected: missing or duplicate molType # no errors this run /cluster/home/heather/kent/src/hg/snp/snpLoad/snpMoltype hg18snp126 # generate chrN_snp126 and snp126Exceptions tables cp snpCheckAlleles.exceptions snpCheckAlleles.tab cp snpCheckClassAndObserved.exceptions snpCheckClassAndObserved.tab cp snpExpandAllele.exceptions snpExpandAllele.tab cp snpLocType.exceptions snpLocType.tab /cluster/home/heather/kent/src/hg/snp/snpLoad/snpFinalTable hg18snp126 126 # handle chrY PAR SNPs (still missing from dbSNP) /cluster/home/heather/kent/src/hg/snp/snpLoad/snpPAR hg18snp126 hgsql -e 'load data local infile "snpPARexceptions.tab" into table snp126Exceptions' hg18snp126 # concat into snp126.tab # cat chr*_snp126.tab >> snp126.tab # note chr18_random_snp126.tab is empty (just 2 rows in hg17/snp125) /bin/sh concat.sh # check for multiple alignments /cluster/home/heather/kent/src/hg/snp/snpLoad/snpMultiple hg18snp126 mysql> load data local infile 'snpMultiple.tab' into table snp126Exceptions; # run and review snpCompareLoctype # load snp125subset /cluster/home/heather/kent/src/hg/snp/snpLoad/snpCompareLoctype hg18snp126 snp125subset snp126 # cat snpCompareLoctypeCounts.out # note: rangeToExact is 2x 124/125 conversion rate # exactToExact = 8747888 # exactToBetween = 1071 # exactToRange = 6673 # betweenToBetween = 321371 # betweenToExact 1323 # betweenToRange 514 # rangeToRange = 95562 # rangeToBetween = 1794 # rangeToExact = 15148 # oldToNew = 10649 # run and review snpCompareWeight # load into database snp125snp126 /cluster/home/heather/kent/src/hg/snp/snpLoad/snpCompareWeight snp125snp126 weight125 weight126 # cat snpCompareWeightCounts.out # oneToOne = 9161896 # oneToTwo = 0 <-- good # oneToThree = 531 <--- interesting but minor # twoToTwo = 38 <-- okay # twoToOne = 1896 <--- improvement # twoToThree = 0 <-- good # threeToThree = 494 <-- okay # threeToOne = 37571 <-- improvement # threeToTwo = 12 <-- improvement # load on hgwdev cp snp126.tab /cluster/home/heather/transfer/snp hgsql hg18snp126 -e 'select * from 
snp126Exceptions' > /cluster/home/heather/transfer/snp/snp126Exceptions.tab ssh hgwdev mysql> load data local infile 'snp126.tab' into table snp126; mysql> load data local infile 'snp126Exceptions.tab' into table snp126Exceptions; # create indexes mysql> alter table snp126 add index name (name); mysql> alter table snp126 add index chrom (chrom, bin); mysql> alter table snp126Exceptions add index name(name); # create snp126ExceptionDesc table cd /cluster/data/dbSNP hgsql hg18 < snp126ExceptionDesc.sql # add counts to exception.human.126, can start with exception.template hgsql -e 'select count(*), exception from snp126Exceptions group by exception' hg18 mysql> load data local infile 'exception.human.126' into table snp126ExceptionDesc; ################################################################ # SNP126 edit: condense UTR/intron func into just intron at Jim's request ssh kkr5u00 cd /scratch/snp/126/human /cluster/home/heather/kent/src/hg/snp/snpLoad/snpContigLocusIdCondense hg18snp126 /bin/csh pipeline.csh ssh hgwdev cd /cluster/home/heather/transfer/snp hgsql hg18 -e 'drop table snp126' hgsql hg18 < /cluster/home/heather/kent/src/hg/lib/snp126.sql hgsql hg18 -e 'load data local infile "snp126.tab" into table snp126' hgsql hg18 -e 'alter table snp126 add index name (name)' hgsql hg18 -e 'alter table snp126 add index chrom (chrom, bin)' ################################################################ # SNP126 edit: detect clustering errors (Heather, Sept. 2006) # for locType = 'between' (class = 'insertion') # 1,393,040 candidates # exceptions: # DuplicatedObserved (3020 of these) # MixedObserved (1312 of these) # create and populate a simple table snp126insertions mysql> insert into snp126insertions select chrom, chromStart, chromEnd, name, score, strand, observed from snp126 where locType = 'between' and class = 'insertion'; # generate and load data cd /cluster/home/heather/kent/src/hg/snp/snpLoad ./snpCheckCluster hg18 snp126insertions mysql> load data local infile 'snpCheckCluster.tab' into table snp126Exceptions; # update snp126ExceptionDesc ################################################################ # generate snpMasked sequence for snp126 (Heather, Sept. 2006) # snpMaskChrom was run too, not documented here. # OBSOLETED by snp128Mask, see below. 
# 3 steps: simple filtering, advanced filtering, generate sequence # simple filtering: create and populate tables # insertions: 1,393,040 # deletions: 783,454 ssh hgwdev mysql> insert into snp126insertions select * from snp126 where locType = 'between' and class = 'insertion'; mysql> insert into snp126deletions select * from snp126 where class = 'deletion'; # advanced filtering -- insertions cd /cluster/home/heather/kent/src/hg/snp/snpLoad # this removes SNPs with weight != 1 # this removes SNPs that align to more than one position # this removes SNPs that cluster together with conflicting observations # (these should be class = 'mixed') # this removes SNPs with invalid observed string # this asserts end == start # final count 1,352,380 # written to insertions.tab ./snpGetInsertions hg18 snp126insertions snp126Exceptions # advanced filtering -- deletions cd /cluster/home/heather/kent/src/hg/snp/snpLoad # this removes SNPs with weight != 1 # this removes SNPs that align to more than one position # this removes SNPs with invalid observed string # this removes SNPs with exception ObservedWrongSize # this asserts end > start # final count 621,024 # written to deletions.tab ./snpGetDeletions hg18 snp126deletions snp126Exceptions # Note: the advanced filtering pretty much removes all SNPs from chrN_random # generate sequence -- insertions # use kent/src/hg/snp/snpMask/seqWithInsertions.c # this asserts that position doesn't exceed chromSize # this will reverse complement observed if strand is negative # if no SNPs found, output sequence == input sequence # write to chrN.fat ssh kkr5u00 mysql> load data local infile "/cluster/home/heather/kent/src/hg/snp/snpLoad/insertions.tab" into table snp126insertionsClean; cd /scratch/snp126/human/fat /bin/sh fat.sh cp *.fat /cluster/data/hg18/snpMask/insertions ssh kkstore02 cd /cluster/data/hg18/snpMask/insertions nice gzip *.fat # generate sequence -- deletions # use kent/src/hg/snp/snpMask/seqWithoutDeletions.c # this asserts that position doesn't exceed chromSize # if no SNPs found, output sequence == input sequence # write to chrN.skinny ssh kkr5u00 mysql> load data local infile "/cluster/home/heather/kent/src/hg/snp/snpLoad/deletions.tab" into table snp126deletionsClean; cd /scratch/snp126/human/skinny /bin/sh skinny.sh cp *.skinny /cluster/data/hg18/snpMask/deletions ssh kkstore02 cd /cluster/data/hg18/snpMask/deletions nice gzip *.skinny # create links on hgwdev ssh hgwdev cd /usr/local/apache/htdocs/goldenPath/hg18/snpMask/insertions /bin/sh link.sh cd /usr/local/apache/htdocs/goldenPath/hg18/snpMask/deletions /bin/sh link.sh ############################################################################ # Lift simple bi-allelic SNPs to rheMac2 and panTro2 (Heather, August 2006) # OBSOLETED by snp128Ortho, see below. ssh hgwdev cd /cluster/data/dbSNP/ortho/hg18/snpDump # dump raw data -- this creates snpGetSimple.chr* # exceptions table is used to skip SNPs that align in multiple places # We also skip SNPs on chrN_random # We also skip triallelic and quadallelic # We don't filter on weight # This yields 9,092,533 SNPs # This data is also stored into hg18.snp126simple for later use /cluster/home/heather/kent/src/hg/snp/snpLoad/snpGetSimple hg18 snp126 snp126Exceptions # split up into just under 200 files to make for an efficient pk run # using file size of 60K lines # this creates /cluster/data/dbSNP/ortho/hg18/split/chr1-01, chr1-02, chr1-03, etc. 
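# (split.csh itself is not reproduced in this doc; the following is only a
#  hypothetical sketch of that splitting step using GNU split.  Paths follow
#  the conventions above; the real script's suffix numbering may differ.)
cd /cluster/data/dbSNP/ortho/hg18
mkdir -p split
for f in snpDump/snpGetSimple.chr*
do
    c=`echo $f | sed -e 's/.*snpGetSimple\.//'`
    split -d -l 60000 $f split/$c-
done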
# 165 files created # 140 files have 60k lines /bin/csh split.csh # prepare cluster runs # I didn't use -bedPlus=6, didn't seem to need it cp /cluster/data/dbSNP/ortho/hg18/split/* /san/sanvol1/snp/liftOver/hg18/rheMac2/input cp /cluster/data/dbSNP/ortho/hg18/split/* /san/sanvol1/snp/liftOver/hg18/panTro2/input cd /san/sanvol1/snp/liftOver/hg18/rheMac2 /bin/csh makeJobList.csh rm -f jobList foreach fileName (`ls input/chr*`) set baseName = $fileName:t echo liftOver $fileName /cluster/data/hg18/bed/liftOver/hg18ToRheMac2.over.chain.gz output/$baseName.out unmapped/$baseName.unmapped >> jobList end cd /san/sanvol1/snp/liftOver/hg18/panTro2 /bin/csh makeJobList.csh rm -f jobList foreach fileName (`ls input/chr*`) set baseName = $fileName:t echo liftOver $fileName /cluster/data/hg18/bed/liftOver/hg18ToPanTro2.over.chain.gz output/$baseName.out unmapped/$baseName.unmapped >> jobList end # do cluster runs # this only took a few minutes # got 7321537 lifts for rheMac2 # got 8517465 lifts for panTro2 ssh pk cd /san/sanvol1/snp/liftOver/hg18/rheMac2 para create jobList para try; para check; para push; para check; etc. cd /san/sanvol1/snp/liftOver/hg18/panTro2 para create jobList para try; para check; para push; para check; etc. # concatenate output files into all.out cd /san/sanvol1/snp/liftOver/hg18/rheMac2/output /bin/csh concat.csh cd /san/sanvol1/snp/liftOver/hg18/panTro2/output /bin/csh concat.csh # load into panTro2 and rheMac2 # Doing the load and split so I can easily load sequence for a full chrom ssh hgwdev cp /san/sanvol1/snp/liftOver/hg18/rheMac2/output/all.out /cluster/data/dbSNP/ortho/hg18/rheMac2Lift cd /cluster/data/dbSNP/ortho/hg18/rheMac2Lift hgsql rheMac2 < snp126hg18ortho.sql hgsql -e 'load data local infile "all.out" into table snp126hg18ortho' rheMac2 cp /san/sanvol1/snp/liftOver/hg18/panTro2/output/all.out /cluster/data/dbSNP/ortho/hg18/panTro2Lift cd /cluster/data/dbSNP/ortho/hg18/panTro2Lift hgsql panTro2 < snp126hg18ortho.sql hgsql -e 'load data local infile "all.out" into table snp126hg18ortho' panTro2 # split by chrom # this creates tables chrN_snp126hg18ortho and can be run from anywhere # it will create chrN_snp126hg18ortho.tab files which can be deleted cd /cluster/data/dbSNP/ortho/hg18/rheMac2Seq /cluster/home/heather/kent/src/hg/snp/snpLoad/snpSplitByChrom2 rheMac2 snp126hg18ortho rm chr*.tab # rm snp126ortho.tab cd /cluster/data/dbSNP/ortho/hg18/panTro2Seq /cluster/home/heather/kent/src/hg/snp/snpLoad/snpSplitByChrom2 panTro2 snp126hg18ortho rm chr*.tab # rm snp126ortho.tab # get sequence # this creates chrN_snp126hg18orthoPrelim.tab files # random chroms are okay here # note we are including Ns # This will log to fetchSeq.errors any examples where chromEnd != chromStart + 1 # It will also check for coordinates past the end of the chrom. # No errors for rheMac2 or panTro2. 
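# (Optional pre-check, not in the original run: rows where
#  chromEnd != chromStart + 1 end up in fetchSeq.errors, so they can be
#  counted ahead of time.  Table names follow the loads above.)
hgsql rheMac2 -N -e 'select count(*) from snp126hg18ortho where chromEnd != chromStart + 1'
hgsql panTro2 -N -e 'select count(*) from snp126hg18ortho where chromEnd != chromStart + 1'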
cd /cluster/data/dbSNP/ortho/hg18/rheMac2Seq /cluster/home/heather/kent/src/hg/snp/snpLoad/fetchSeq rheMac2 /cluster/data/rheMac2/rheMac2.2bit # ssh kkstore02 # cd /cluster/data/dbSNP/ortho/hg18/rheMac2Seq /bin/csh concat.csh # cleanup; remove split tables from rheMac2, keep snp126hg18orthoPrelim hgsql rheMac2 < drop.sql rm chr*.tab cd /cluster/data/dbSNP/ortho/hg18/panTro2Seq /cluster/home/heather/kent/src/hg/snp/snpLoad/fetchSeq panTro2 /cluster/data/panTro2/panTro2.2bit # ssh kkstore02 # cd /cluster/data/dbSNP/ortho/hg18/panTro2Seq /bin/sh concat.sh # cleanup; remove split tables from panTro2, keep snp126hg18orthoPrelim hgsql panTro2 < drop.sql rm chr*.tab # do a preliminary load -- combine chimp and macaque cd /cluster/data/dbSNP/ortho/hg18/rheMac2Seq hgsql hg18 < snp126orthoPrelim.sql hgsql -e 'load data local infile "snp126orthoPrelim.tab" into table snp126orthoPrelim' hg18 cd /cluster/data/dbSNP/ortho/hg18/panTro2Seq hgsql -e 'load data local infile "snp126orthoPrelim.tab" into table snp126orthoPrelim' hg18 # add human chrom, chromStart, chromEnd, allele, variant # liftOver loses the chrom, chromStart and chromEnd # liftOver does retain the allele and variant cd /cluster/data/dbSNP/ortho/hg18/integrate /cluster/home/heather/kent/src/hg/snp/snpLoad/snpOrthoLookup hg18 snp126simple snp126orthoPrelim # load final table with separate rows for chimp and macaque # drop snp126orthoPrelim because it has non-human coords # rm tab file because it is huge hgsql hg18 < snp126ortho.sql load data local infile "snpOrthoLookup.tab" into table snp126ortho drop table snp126orthoPrelim rm snpOrthoLookup.tab # create indices mysql> alter table snp126ortho add index name (name); mysql> alter table snp126ortho add index chrom (chrom, bin); # manually validate a few examples on various chroms, various strands # I used rheMac2: # rs533274, hg18 chr1 +, rheMac2 chr18 - # rs1690550, hg18 chr1 -, rheMac2 chr19 + # rs3121568, hg18 chr1 -, rheMac2 chr19 - # rs28709562, hg18 chr1 +, rheMac2 chr19 + # rs34675838, also hg18 chr1 +, rheMac2 chr19 + # create alternate format with both alleles in same row /cluster/home/heather/kent/src/hg/snp/snpLoad/snpOrthoJoin hg18 snp126simple snp126ortho # 8517465 rows in hash for panTro2 # 7321537 rows in hash for rheMac2 # humanCount = 9092533 # chimpOnlyCount = 1418324 # macaqueOnlyCount = 222396 # missingCount = 352672 # bothCount = 7098141 # confirm that chimpOnly + macaqueOnly + missing + both = human hgsql hg18 < snp126orthoPanTro2RheMac2.sql hgsql -e "load data local infile 'snpOrthoJoin.tab' into table snp126orthoPanTro2RheMac2" hg18 mysql> alter table snp126orthoPanTro2RheMac2 add index name (name); mysql> alter table snp126orthoPanTro2RheMac2 add index chrom (chrom, bin); ################################################################ ### CREATE chimpHiQualDiff -- panTro2 (Daryl; May 1, 2006) # Make file/table of high quality single base pair differences # between hg18 and panTro2 set bedDir = /cluster/data/hg18/bed/chimpHiQualDiffs mkdir -p $bedDir cd $bedDir sed 's/simpleNucDiff/chimpHiQualDiffs/' ~/kent/src/hg/lib/simpleNucDiff.sql >! 
chimpHiQualDiffs.sql
set axtDir = /cluster/data/hg18/bed/blastz.panTro2/axtRBestNet
mkdir -p chroms; cd chroms
ls -1 $axtDir | grep chr | grep axt | sed 's/.hg18.panTro2.net.axt.gz//' | grep -v random | grep -v "_" | xargs mkdir
set workDir = /scratch/chqd
mkdir -p $workDir
touch $workDir/chqd.log
# time nice /cluster/home/daryl/bin/i386/chimpHiQualDiffs $workDir/$f /cluster/data/panTro2/bed/quality/qac/panTro2.qac $f.chimpHiQualDiffs.bed >>& $workDir/chqd.log
foreach f (chr*)
echo -n $f " "
mkdir -p $workDir/$f/
cp $axtDir/$f.*.axt.gz $workDir/$f/
gunzip $workDir/$f/$f.*.axt.gz
time nice /cluster/home/daryl/bin/i386/chimpHiQualDiffs $workDir/$f /cluster/data/panTro2/bed/quality/qac/panTro2.qac $f.chimpHiQualDiffs.bed
rm -f $workDir/$f/$f.*axt
rmdir $workDir/$f/
end
mv $workDir/chqd.log .
cat chr*bed >! ../chimpHiQualDiffs.bed
## The load (sort) ran out of memory on hgwdev, so sort the
## file first on kolossus and then load it on hgwdev
ssh kolossus
time hgLoadBed -strict -sqlTable=chimpHiQualDiffs.sql -noLoad hg18 chimpHiQualDiffs chimpHiQualDiffs.bed
# 110.214u 10.836s 2:24.42 83.8% 0+0k 0+0io 1pf+0w
exit
## hgwdev
time hgLoadBed -hasBin -noSort -sqlTable=chimpHiQualDiffs.sql hg18 chimpHiQualDiffs bed.tab
# 328.890u 113.230s 42:26.00 17.3% 0+0k 0+0io 197676pf+0w
## TODO: need to filter out polymorphic sites (SNPs)
#################################################################
###### BUILD SUPERFAMILY RELATED TABLES (DONE - 2006-06-20 - Fan)
# Build Superfamily track and create sf tables needed for PB
ssh hgwdev
hgsql hg18 < ~/src/hg/lib/sfAssign.sql
cd /cluster/data/superfamily/060619
hgsql hg18 -e 'load data local infile "ass_18-Jun-2006.tab" into table hg18.sfAssign;'
# If hg18.sfDes already exists, drop it.
mkdir /cluster/data/hg18/bed/sf
cd /cluster/data/hg18/bed/sf
hgsql superfam060619 -N -e "select * from des" >sfDes.tab
hgsql hg18 < ~/src/hg/lib/sfDes.sql
hgsql hg18 -e 'load data local infile "sfDes.tab" into table sfDes'
# Build ensemblXref3
# Get the ensembl gene/protein cross-reference data from Ensembl BioMart
# http://www.ensembl.org/Multi/martview
# Follow this sequence through the pages:
# Page 1) Select Ensembl39 and Homo sapiens. Hit next.
# Page 2) Do not select anything. Hit next.
# Page 3) Choose the "Feature" box, select Ensembl gene ID, transcript ID, peptide ID, UniProt/TrEMBL ID, UniProt/SWISSPROT ID, and UniProt/SWISSPROT Accession
# Page 4) Choose "Text, tab separated". Choose gzip compression. Hit export.
# Save as ensemblXref3.gz
ssh hgwdev
cd /cluster/data/hg18/bed/ensembl
gzip -d ensemblXref3.gz
hgsql hg18 < ~/src/hg/lib/ensemblXref3Temp.sql
hgsql hg18 -e \
'load data local infile "ensemblXref3" into table ensemblXref3Temp ignore 1 lines'
hgsql hg18 -N -e \
'select gene, "0", transcript, "0", protein, "0", tremblAcc, swissDisplayId, swissAcc from ensemblXref3Temp' \
> ensemblXref3.tab
hgsql hg18 -e 'drop table ensemblXref3'
hgsql hg18 <~/src/hg/lib/ensemblXref3.sql
hgsql hg18 -e 'load data local infile "ensemblXref3.tab" into table ensemblXref3'
# If hg18.superfamily already exists, drop it.
cd /cluster/data/hg18/bed/sf
hgSuperfam hg18 superfam060619 > sf.log
# It is normal that many proteins do not have corresponding Superfamily entries.
# If hg18.sfDescription exists, drop it.
hgsql hg18 < ~/src/hg/lib/sfDescription.sql
hgsql hg18 -e 'LOAD DATA local INFILE "sfDescription.tab" into table hg18.sfDescription;'
# Finally, load the superfamily table.
hgLoadBed hg18 superfamily superfamily.tab -tab
# Create knownToSuperfamily table
# Note hs is changed into ht for this Superfamily release.
cat /cluster/data/superfamily/060619/ass_18-Jun-2006.tab \
| hgKnownToSuper hg18 hs stdin
# created 27,511 rows in knownToSuper
############################################################################
# SEGMENTAL DUPLICATIONS (DONE 7/14/06 angie)
# File emailed from Xinwei She
mkdir /cluster/data/hg18/bed/genomicSuperDups
cd /cluster/data/hg18/bed/genomicSuperDups
# The sed command is necessary to fix "_" used as strand.
# The awk command was necessary for some recent other species
# genomicSuperDups that had some too-short regions.  It does not seem
# to be necessary here, but doesn't hurt and may be useful in
# future builds.
sed -e 's/\t_\t/\t-\t/' hg18genomicSuperDup.tab \
| awk '($3 - $2) >= 1000 && ($9 - $8) >= 1000 {print;}' \
| hgLoadBed hg18 genomicSuperDups stdin \
-sqlTable=$HOME/kent/src/hg/lib/genomicSuperDups.sql
# fix off-by-one error:
sed -e 's/\t_\t/\t-\t/' hg18genomicSuperDup.tab \
| awk '($3 - $2) >= 1000 && ($9 - $8) >= 1000 {print;}' > hg18.gsd.bed
# run this perl script:
#!/usr/bin/env perl
use strict;
use warnings;
open (FH, "<hg18.gsd.bed");
while (my $line = <FH>) {
    chomp $line;
    my ($chr, $start, $rest) = split('\s+', $line, 3);
    printf "%s\t%d\t%s\n", $chr, $start-1, $rest;
}
close (FH);
# ./addOne.pl > oneLarger.bed
# check the chromStart column:
ave -col=2 hg18.gsd.bed
ave -col=2 oneLarger.bed
# reload table
hgLoadBed hg18 genomicSuperDups oneLarger.bed \
-sqlTable=$HOME/kent/src/hg/lib/genomicSuperDups.sql
############################################################################
# GENE BOUNDS (RNACLUSTER) (DONE 08-09-2006 Fan)
# Create rnaCluster table (depends on {est,mrna}OrientInfo)
cd /cluster/data/hg18/bed
mkdir rnaCluster
cd rnaCluster/
mkdir chrom
# Create a list of accessions that come from RAGE libraries and need to be excluded.
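# (Optional pre-check, not in the original steps: since rnaCluster depends on
#  the est/mrna OrientInfo tables noted above, confirm they are present
#  before running the steps that follow.)
hgsql hg18 -N -e 'show tables like "%OrientInfo"'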
~/kent/src/hg/geneBounds/clusterRna/generateRageAccList.csh hg18 rage.libs foreach f (/cluster/data/hg18/nib/chr*.nib) set c = $f:t:r set out = chrom/$c.bed # Exclude accesions in the RAGE file echo clusterRna -mrnaExclude=hg18.rage.libs hg18 /dev/null $out -chrom=$c clusterRna -mrnaExclude=hg18.rage.libs hg18 /dev/null $out -chrom=$c end hgLoadBed hg18 rnaCluster chrom/*.bed ############################################################################ ############################################################################ # POLYA_DB TRACK (DONE 08-28-2006 Andy) mkdir /cluster/data/hg18/bed/polyaDB cd /cluster/data/hg18/bed/polyaDB wget http://polya.umdnj.edu/download/polyAsite.gz gunzip polyAsite.gz find /cluster/data/hg16/ -name 'ordered.lft' | xargs cat > hg16.lft sed 's/\(\s\).*\//\1/; s/chr/hg16.chr/' hg16.lft > tmp mv tmp hg16.lft cut -f2 hg16.lft > hg16.lft.names grep -F -f hg16.lft.names polyAsite > hg16.polyAsite awk '{printf("%s\t%d\t%d\t%s\n", $3, ($5-1), $5, $1);}' hg16.polyAsite > hg16.polyAsite.bed liftUp lifted.bed hg16.lft warn hg16.polyAsite.bed sed 's/hg16\.//' lifted.bed > final.bed liftOver final.bed /gbdb/hg16/liftOver/hg16ToHg18.over.chain.gz hg18.bed unmapped hgLoadBed hg18 polyaDB hg18.bed # trackDb entry/html in human/hg18 # redmine issue 19: wrong bed file format used on hgwdev # reload it with table from beta (2010-10-28 - Chin) cd /cluster/data/hg18/bed/polyaDB hgLoadBed hg18 polyaDb hg18.beta.polyaDb.bed ############################################################################ # Translate SNP Array data from hg17 (Heather August 2006) # Affy500 cd /cluster/data/hg18/bed/snp/affy # get rsId/affy name pairs from hg17 where rsId != 'unknown' # 257954 candidates from Nsp (4311 with unknown rsId) # 234765 candidates from Sty (3540 with unknown rsId) hgsql hg17 < getHg17-Nsp.sql > nsp.hg17 hgsql hg17 < getHg17-Sty.sql > sty.hg17 # get name, chrom, chromStart, chromEnd, strand, observed from snp126simple # snp126simple contains only class = "simple", locType = "exact", # chromEnd = chromStart + 1, biallelic, singly-aligning hgsql hg18 < getHg18.sql > snp126simple.hg18 # sort and join # 257213in nsp.join # 233941 in sty.join # 741 in nsp.missing # 824 in sty.missing sort nsp.hg17 > nsp.hg17.sort sort sty.hg17 > sty.hg17.sort sort snp126simple.hg18 > snp126simple.hg18.sort join nsp.hg17.sort snp126simple.hg18.sort > nsp.join join sty.hg17.sort snp126simple.hg18.sort > sty.join join -v 1 nsp.hg17.sort snp126simple.hg18.sort > nsp.missing join -v 1 sty.hg17.sort snp126simple.hg18.sort > sty.missing # fix column order awk '{print $3, $4, $5, $2, 0, $6, $7, $1}' nsp.join > nsp.bed awk '{print $3, $4, $5, $2, 0, $6, $7, $1}' sty.join > sty.bed # load hgLoadBed hg18 snpArrayAffy250Nsp nsp.bed -sqlTable=snpArrayAffy250Nsp.sql hgLoadBed hg18 snpArrayAffy250Sty sty.bed -sqlTable=snpArrayAffy250Sty.sql # cleanup rm nsp.hg17 nsp.hg17.sort nsp.join rm sty.hg17 sty.hg17.sort sty.join rm snp126simple.hg18 bed.tab mv snp126simple.hg18.sort ../illumina gzip nsp.bed sty.bed # Illumina300 cd /cluster/data/hg18/bed/snp/illumina # 317,100 candidates from hg17 hgsql -e 'select name from snpArrayIllumina300' hg17 > hg17.data # sort and join # 314,093 in join.out # 3,007 in join.missing sort hg17.data > hg17.data.sort join hg17.data.sort hg18.data.sort > join.out join -v 1 hg17.data.sort hg18.data.sort > join.missing # fix column order awk '{print $2, $3, $4, $1}' join.out > illumina.bed # load hgsql hg18 < snpArrayIllumina300.sql hgLoadBed hg18 snpArrayIllumina300 illumina.bed 
-sqlTable=snpArrayIllumina300.sql
# cleanup
rm hg17.data hg17.data.sort hg18.data.sort bed.tab join.out
gzip illumina.bed
##########################################################################
# New SNP Array data (Heather April 2007)
# Affymetrix introduced a new genotyping array in February
# I got the data from Venu in April
# It is based on dbSNP build 126
# Venu reviewed the load
ssh hgwdev
cd /cluster/data/hg18/bed/snp/affy
# There were 60 lines with no chrom, chromEnd or strand
grep -v NULL GenomeWideSNP_5_ucsc.tsv > genomewide.in
# little Perl script to add chromEnd & score for bed format
genomewide.pl < genomewide.in > genomewide.bed
# preliminary load
hgLoadBed hg18 snpArrayAffyGenomeWidePrelim genomewide.bed -tab -sqlTable=snpArrayAffyGenomeWidePrelim.sql
# based on position, lookup rsId
# 2 runs
# first run: don't include dbSNP if class != single or locType != exact or
# chromEnd != chromStart + 1
/cluster/home/heather/kent/src/hg/snp/snpLoad/affyLookup hg18 snpArrayAffyGenomeWidePrelim snp126
# missing count = 5279
# multiple count = 44
# second run: use all of snp126
/cluster/home/heather/kent/src/hg/snp/snpLoad/affyLookup hg18 snpArrayAffyGenomeWidePrelim snp126
# missing count = 5210
# multiple count = 724
# Use the first run (better to avoid nearly 700 multiples at the cost of
# 69 more unknown)
hgLoadBed hg18 snpArrayAffy5 affyLookup.out -tab -sqlTable=snpArrayAffy5.sql
##########################################################################
# More new SNP Array data from Affymetrix (Heather May 2007)
# Source: Venu_Valmeekam at affymetrix.com
# This is the 6.0 array, announced mid-May
# It contains 2 components: single-base substitutions and copy-number probes
# Single-base substitutions are based on snp127
ssh hgwdev
cd /cluster/data/hg18/bed/snp/affy/6.0/single
unzip GenomeWideSNP_6_ucsc_1.tsv.zip
unzip GenomeWideSNP_6_ucsc_2.tsv.zip
format.pl < GenomeWideSNP_6_ucsc_1.tsv > 1.bed
format.pl < GenomeWideSNP_6_ucsc_2.tsv > 2.bed
cp 1.bed all.bed
cat 2.bed >> all.bed
hgLoadBed hg18 snpArrayAffy6Prelim all.bed -tab -sqlTable=snpArrayAffy6Prelim.sql
mysql> update snpArrayAffy6Prelim set chrom = "chrM" where chrom = "chrMT";
/cluster/home/heather/kent/src/hg/snp/snpLoad/affyLookup hg18 snpArrayAffy6Prelim snp127
# missing count = 1149
# multiple count = 2396
# used the strict version of affyLookup (class="single", locType="exact", size=1)
hgLoadBed hg18 snpArrayAffy6 affyLookup.out -tab -sqlTable=snpArrayAffy6.sql
mysql> alter table snpArrayAffy6 add index name(name);
mysql> alter table snpArrayAffy6 add index chrom(chrom, bin);
cd /cluster/data/hg18/bed/snp/affy/6.0/sv
unzip GenomeWideSNP_6_CN_ucsc_1.tsv.zip
unzip GenomeWideSNP_6_CN_ucsc_2.tsv.zip
format.pl < GenomeWideSNP_6_CN_ucsc_1.tsv > 1.bed
format.pl < GenomeWideSNP_6_CN_ucsc_2.tsv > 2.bed
cp 1.bed all.bed
cat 2.bed >> all.bed
hgLoadBed hg18 snpArrayAffy6SV all.bed -tab
mysql> delete from snpArrayAffy6SV where chrom = "chr0";
mysql> update snpArrayAffy6SV set chromStart = chromStart - 1;
##########################################################################
# Venu from Affy requested to remove about 25,000 items from
# snpArrayAffy6 track.
#
# Imported the list into the table, snpArrayAffy6Remove, in hg18.
#
# Issued a simple MySQL command to delete the records in snpArrayAffy6
# whose ids appear in snpArrayAffy6Remove (the exact command was not
# recorded; a plausible form is sketched below).
#
# This was done 10/8/07. Fan.
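# (As noted above, the actual delete command was not written down; this is
#  only a plausible reconstruction, assuming snpArrayAffy6Remove stores the
#  ids to drop in a column named "name" matching snpArrayAffy6.name.)
hgsql hg18 -e 'delete snpArrayAffy6 from snpArrayAffy6, snpArrayAffy6Remove where snpArrayAffy6.name = snpArrayAffy6Remove.name'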
########################################################################## # New Illumina Array data (Heather April 2007) # HumanHap300v3, HumanHap550v3, HumanHap650v3 # Data from Luana Galver (lgalver at illumina.com) # Based on dbSNP build 126 ssh hgwdev cd /cluster/data/hg18/bed/snp/illumina # split off chrM from zips bed.pl < 300.in > 300.bed bed.pl < 550.in > 550.bed bed.pl < 650.in > 650.bed chrM.pl < 550.in.M > 550.bed.M chrM.pl < 650.in.M > 650.bed.M hgLoadBed hg18 snpArrayIllumina300 300.bed -sqlTable=snpArrayIllumina300.sql -tab hgLoadBed hg18 snpArrayIllumina550 550.bed -sqlTable=snpArrayIllumina550.sql -tab hgLoadBed hg18 snpArrayIllumina650 650.bed -sqlTable=snpArrayIllumina650.sql -tab hgLoadBed hg18 snpArrayIllumina550 550.bed.M -tab -oldTable hgLoadBed hg18 snpArrayIllumina650 650.bed.M -tab -oldTable # add indices mysql> alter table snpArrayIllumina300 add index name (name); mysql> alter table snpArrayIllumina300 add index chrom (chrom, bin); mysql> alter table snpArrayIllumina550 add index name (name); mysql> alter table snpArrayIllumina550 add index chrom (chrom, bin); mysql> alter table snpArrayIllumina650 add index name (name); mysql> alter table snpArrayIllumina650 add index chrom (chrom, bin); # fix strand convention mysql> update snpArrayIllumina300 set strand = "+" where strand = "F"; mysql> update snpArrayIllumina300 set strand = "-" where strand = "R"; mysql> update snpArrayIllumina550 set strand = "+" where strand = "F"; mysql> update snpArrayIllumina550 set strand = "-" where strand = "R"; mysql> update snpArrayIllumina650 set strand = "+" where strand = "F"; mysql> update snpArrayIllumina650 set strand = "-" where strand = "R"; # Note no A/T or C/G!! mysql> select distinct(observed) from snpArrayIllumina300; # +----------+ # | observed | # +----------+ # | [A/G] | # | [T/C] | # | [A/C] | # | [T/G] | # +----------+ # fix observed mysql> update snpArrayIllumina300 set observed = "A/C" where observed = "[A/C]"; mysql> update snpArrayIllumina550 set observed = "A/C" where observed = "[A/C]"; mysql> update snpArrayIllumina650 set observed = "A/C" where observed = "[A/C]"; mysql> update snpArrayIllumina300 set observed = "A/G" where observed = "[A/G]"; mysql> update snpArrayIllumina550 set observed = "A/G" where observed = "[A/G]"; mysql> update snpArrayIllumina650 set observed = "A/G" where observed = "[A/G]"; mysql> update snpArrayIllumina300 set observed = "C/T" where observed = "[T/C]"; mysql> update snpArrayIllumina550 set observed = "C/T" where observed = "[T/C]"; mysql> update snpArrayIllumina650 set observed = "C/T" where observed = "[T/C]"; mysql> update snpArrayIllumina300 set observed = "G/T" where observed = "[T/G]"; mysql> update snpArrayIllumina550 set observed = "G/T" where observed = "[T/G]"; mysql> update snpArrayIllumina650 set observed = "G/T" where observed = "[T/G]"; # Note 2 rows in 300 and 15 rows in 550 and 650 where chrom = "chrXY" # validation /cluster/home/heather/kent/src/hg/snp/snpLoad/illuminaLookup hg18 snpArrayIllumina300 snp126 snp126Exceptions illuminaLookup.hg18.300 /cluster/home/heather/kent/src/hg/snp/snpLoad/illuminaLookup hg18 snpArrayIllumina550 snp126 snp126Exceptions illuminaLookup.hg18.550 /cluster/home/heather/kent/src/hg/snp/snpLoad/illuminaLookup hg18 snpArrayIllumina650 snp126 snp126Exceptions illuminaLookup.hg18.650 # Not found: 2 in 300, 15 in 550 and 650 # These are in snp127 # Mixed: 55 in 300, 74 in 550, 81 in 650 # Found 2 strange things here: # First of all, for snps that are illumina forward strand, dbSNP 
reverse strand: # in all cases, the observed polymorphism is identical. # Counts: # 36k on the HumanHap300v3 # 52k on the HumanHap550v3 # 59k on the HumanHap650v3 # This surprises me, because the dbSNP observation is intended to be reverse-complemented. # Examples from HumanHap300v3 include rs1000007, rs1000031, rs1000041, rs1000071, rs1000078. # Secondly, for snps that are illumina reverse strand: # in all cases is that your observed polymorphism is the reverse complement of the dbSNP polymorphism. # this could only make sense for the dbSNP forward strand OR the dbSNP reverse strand, although I don't think it matters which one. # examples: # rs3934834: illumina A/G (-), dbSNP C/T (+) # rs6687776: illumina A/G (-), dbSNP C/T (+) # rs2298217: illumina A/G (-), dbSNP C/T (+) # rs9442380: illumina A/G (-), dbSNP C/T (+) # rs3737728: illumina A/G (-), dbSNP C/T (-) # rs3813199: illumina A/G (-), dbSNP C/T (-) # rs880051: illumina A/G (-), dbSNP C/T (-) # rs12562034: illumina C/T (-), dbSNP A/G (+) # rs9442372: illumina C/T (-), dbSNP A/G (+) # rs11260588: illumina C/T (-), dbSNP A/G (+) # rs12726255: illumina C/T (-), dbSNP A/G (+) # rs2887286: illumina C/T (-), dbSNP A/G (-) # rs2649588: illumina C/T (-), dbSNP A/G (-) # rs2296716: illumina C/T (-), dbSNP A/G (-) # rs2474460: illumina C/T (-), dbSNP A/G (-) # redo this, just using name/chrom/pos from illumina bed2.pl < 300.in > 300.bed.2 hgLoadBed hg18 snpArrayIllumina300Prelim 300.bed.2 -tab /cluster/home/heather/kent/src/hg/snp/snpLoad/illuminaLookup2 hg18 snpArrayIllumina300Prelim snp126 snp126Exceptions mv illuminaLookup.out lookup.300 mv illuminaLookup.err lookup.300.err hgLoadBed hg18 snpArrayIllumina300 lookup.300 -tab -sqlTable=snpArrayIllumina300.sql hgsql -N -e 'drop table snpArrayIllumina300Prelim' hg18 bed2.pl < 550.in > 550.bed.2 hgLoadBed hg18 snpArrayIllumina550Prelim 550.bed.2 -tab /cluster/home/heather/kent/src/hg/snp/snpLoad/illuminaLookup2 hg18 snpArrayIllumina550Prelim snp126 snp126Exceptions mv illuminaLookup.err lookup.550.err mv illuminaLookup.out lookup.550 hgLoadBed hg18 snpArrayIllumina550 lookup.550 -tab -sqlTable=snpArrayIllumina550.sql hgsql -N -e 'drop table snpArrayIllumina550Prelim' hg18 bed2.pl < 650.in > 650.bed.2 hgLoadBed hg18 snpArrayIllumina650Prelim 650.bed.2 -tab /cluster/home/heather/kent/src/hg/snp/snpLoad/illuminaLookup2 hg18 snpArrayIllumina650Prelim snp126 snp126Exceptions mv illuminaLookup.out lookup.650 mv illuminaLookup.err lookup.650.err hgLoadBed hg18 snpArrayIllumina650 lookup.650 -tab -sqlTable=snpArrayIllumina650.sql hgsql -N -e 'drop table snpArrayIllumina650Prelim' hg18 # add indices mysql> alter table snpArrayIllumina300 add index name (name); mysql> alter table snpArrayIllumina300 add index chrom (chrom, bin); mysql> alter table snpArrayIllumina550 add index name (name); mysql> alter table snpArrayIllumina550 add index chrom (chrom, bin); mysql> alter table snpArrayIllumina650 add index name (name); mysql> alter table snpArrayIllumina650 add index chrom (chrom, bin); ########################################################################## # Added gvPos table for Locus Variants (Belinda Giardine Sept 2006) # This uses the gv* tables in hgFixed for the related data. The track has # been on hg17, just added to hg18. Most variants were mapped directly to # hg18 only the LSDB BGMUT was lifted using liftOver. 
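# (Hypothetical sketch of the BGMUT lift mentioned above; the bed file names
#  are assumptions, and hg17 is assumed as the source assembly.  The chain
#  file is the standard hg17-to-hg18 liftOver chain.)
liftOver bgmutPos.hg17.bed /gbdb/hg17/liftOver/hg17ToHg18.over.chain.gz bgmutPos.hg18.bed bgmutPos.unmapped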
# Update, reloaded table Dec 2006 Belinda Giardine # new entries for previous sources and more IDbases # Update, reloaded table January 2007 Belinda Giardine # new source (first set of LOVD) and some fixes to IDbases and HbVar # Update most LSDBs, add more genes for LMDp(LOVD) Jan 11, 2008 # loaded and tested first at PSU #update old dbs and add dbPEX March 22-23, 2007 #need to truncate and reload all tables (new entries in old) #prepare positions for loading cd gvNov2006 cat gvPosARdb.hg17.txt gvPosSrd5a2.hg17.txt gvPosPah.hg17.txt > ../gvMar2007/gvPosNov2006.hg17.txt cd ../gvMar2007 cat ../gvJan2007/gvPosLOVD.hg17.txt *.hg17.txt > gvPos.Hg17.txt grep "^chr" gvPos.Hg17.txt | sort -k1,1 -k2,2n > gvPosSorted.Hg17.bed cd gvNov2006 cat gvPosARdb.hg18.txt gvPosSrd5a2.hg18.txt gvPosPah.hg18.txt > ../gvMar2007/gvPosNov2006.hg18.txt cd ../gvMar2007 cat ../gvJan2007/gvPosLOVD.hg18.txt *.hg18.txt > gvPos.Hg18.txt grep "^chr" gvPos.Hg18.txt | sort -k1,1 -k2,2n > gvPosSorted.Hg18.bed #run checks ~giardine/gv/checkLinksRaFile.pl /cluster/store6/giardine/gvMar2007/ ~giardine/gv/checkSeq.pl hg18 < gvPos.Hg18.txt > errors.txt ~giardine/gv/checkSeq.pl hg17 < gvPos.Hg17.txt > errors17.txt #start reload hgsql hgFixed < emptyTables.sql #copy and paste from reloadHgFixed.txt #load new dbs hgLoadSqlTab -oldTable hgFixed gv ~/humPhen/kent/src/hg/lib/gv.sql gvRettBASE.txt hgLoadSqlTab -oldTable hgFixed gvAttr ~/humPhen/kent/src/hg/lib/gv.sql gvAttrRettBASE.txt hgLoadSqlTab -oldTable hgFixed gvLink ~/humPhen/kent/src/hg/lib/gv.sql gvLinkRettBASE.txt hgLoadSqlTab -oldTable hgFixed gv ~/humPhen/kent/src/hg/lib/gv.sql gvdbPEX.txt hgLoadSqlTab -oldTable hgFixed gvAttr ~/humPhen/kent/src/hg/lib/gv.sql gvAttrdbPEX.txt hgLoadSqlTab -oldTable hgFixed gvLink ~/humPhen/kent/src/hg/lib/gv.sql gvLinkdbPEX.txt #load position tables hgLoadBed hg18 gvPos gvPosSorted.Hg18.bed -noSort -oldTable -tab hgLoadBed hg17 gvPos gvPosSorted.Hg17.bed -noSort -oldTable -tab #run remaining checks select distinct attrType from gvAttr; select distinct attrType from gvLink; #and compare against gvAttrTypeKey in hg/lib/gvUi.c ~/gv/joinerChecks.pl table1 IDfield1 table2 IDfield2 #for gv, gvPos, gvSrc, gvAttr, and gvLink #script to check for non unique rows in database ~/gv/uniqueCheck.pl gvAttr > gvAttrNonunique.txt ~/gv/uniqueCheck.pl gvLink > gvLinkNonunique.txt #add IPNMDB and reload LOVD with more genes April 12, 2007 cat *.hg17.txt > gvPos.Hg17.txt grep "^chr" gvPos.Hg17.txt | sort -k1,1 -k2,2n > gvPosSorted.Hg17.bed cat *.hg18.txt > gvPos.Hg18.txt grep "^chr" gvPos.Hg18.txt | sort -k1,1 -k2,2n > gvPosSorted.Hg18.bed #run checks ~giardine/gv/checkLinksRaFile.pl /cluster/store6/giardine/gvMar2007/ ~giardine/gv/checkSeq.pl hg18 < gvPos.Hg18.txt > errors.txt ~giardine/gv/checkSeq.pl hg17 < gvPos.Hg17.txt > errors17.txt #remove old LOVD entries hgsql hgFixed delete from gvLink where id like 'FKRP%'; delete from gvAttr where id like 'FKRP%'; delete from gv where id like 'FKRP%'; insert into gvSrc values ('IPNMDB', 'LSDB', 'Mutation Database of Inherited Peripheral Neuropathies'); #load new dbs hgLoadSqlTab -oldTable hgFixed gv ~/humPhen/kent/src/hg/lib/gv.sql gvLOVD.txt hgLoadSqlTab -oldTable hgFixed gvAttr ~/humPhen/kent/src/hg/lib/gv.sql gvAttrLOVD.txt hgLoadSqlTab -oldTable hgFixed gvLink ~/humPhen/kent/src/hg/lib/gv.sql gvLinkLOVD.txt hgLoadSqlTab -oldTable hgFixed gv ~/humPhen/kent/src/hg/lib/gv.sql gvIPNMDB.txt hgLoadSqlTab -oldTable hgFixed gvAttr ~/humPhen/kent/src/hg/lib/gv.sql gvAttrIPNMDB.txt adSqlTab -oldTable hgFixed gvLink 
~/humPhen/kent/src/hg/lib/gv.sql gvLinkIPNMDB.txt hgsql hg18 truncate table gvPos; hgsql hg17 truncate table gvPos; #load position tables hgLoadBed hg18 gvPos gvPosSorted.Hg18.bed -noSort -oldTable -tab hgLoadBed hg17 gvPos gvPosSorted.Hg17.bed -noSort -oldTable -tab #run remaining checks select distinct attrType from gvAttr; select distinct attrType from gvLink; #and compare against gvAttrTypeKey in hg/lib/gvUi.c ~/gv/joinerChecks.pl table1 IDfield1 table2 IDfield2 #for gv, gvPos, gvSrc, gvAttr, and gvLink #script to check for non unique rows in database ~/gv/uniqueCheck.pl gvAttr ~/gv/uniqueCheck.pl gvLink #found missing common names hgLoadSqlTab -oldTable hgFixed gvAttr ~/humPhen/kent/src/hg/lib/gv.sql gvAttrIPNMDBcommonName.txt ########################################################################## # hars 1 to 202 Sol 09/10/2006 set bedDir = /gbdb/hg18/haseq/bed mkdir -p $bedDir/hars pushd /projects/hg/wet/Sol/hars1to49 cp -p hars_1to202.hg18.bed $bedDir/hars/hars_1to202.bed hgLoadBed hg18 hars $bedDir/hars/hars_1to202.bed rm -f $bedDir/hars/hars_1to202.bed popd # BUILD HPRD DATA FOR KNOWN GENE DETAILS PAGE LINKS (DONE 9/11/06) # Download HPRD_XML_060106.tar.gz from www.hprd.org gzip -d HPRD_XML_060106.tar.gz tar -xvf HPRD_XML_060106.tar.gz # This will create 18838 xxxx.xml files under HPRD_XML_060106 # Create hprdToCdna table echo 'grep -H entry_cdna HPRD_XML_060106/$1.xml' >do1Cdna ls HPRD_XML_060106 >j cat j |sed -e 's/.xml/\tdo1Cdna/g' >jj cut -f 1 jj >j.2 cut -f 2 jj >j.1 paste j.1 j.2 >doAllCdna chmod +x do* ./doAllCdna >j.cdna cat j.cdna| sed -e 's/\//\t/' | sed -e 's/.xml/\t/' |\ sed -e 's//\t/' | sed -e 's/<\//\t/'| sed -e 's/\./\t/'| cut -f 2,4|\ grep -v None >hprdToCdna.tab hgsql hg18 -e 'drop table hprdToCdna' hgsql hg18 <~/src/hg/lib/hprdToCdna.sql hgsql hg18 -e 'load data local infile "hprdToCdna.tab" into table hprdToCdna' # Create hprdToUniProt table echo 'fgrep -H Swiss HPRD_XML_060106/$1.xml' >do1 ls HPRD_XML_060106 >j cat j |sed -e 's/.xml/\tdo1/g' >jj cut -f 1 jj >j.2 cut -f 2 jj >j.1 paste j.1 j.2 >doall chmod +x do* ./doall >j.out cat j.out|grep SwissProt | sed -e 's/\//\t/' | sed -e 's/.xml/\t/' | \ sed -e 's/Prot>/\t/' | sed -e 's/<\//\t/'| cut -f 2,4|grep -v None >hgrdToUniProt.tab hgsql hg18 -e 'drop table hprdToUniProt' hgsql hg18 <~/src/hg/lib/hprdToUniProt.sql hgsql hg18 -e 'load data local infile "hprdToUniProt.tab" into table hprdToUniProt' # build knownToHprd table hgsql hg18 -N -e 'select kgId,hprdId from hprdToCdna, kgXref where cdnaId=kgId' >j.kg1 hgsql hg18 -N -e 'select kgId,hprdId from hprdToUniProt, kgXref where uniProtId=spId' >j.kg2 cat j.kg1 j.kg2 |sort -u >knownToHprd.tab wc knownToHprd.tab hgsql hg18 -e 'drop table knownToHprd' hgsql hg18 <~/src/hg/lib/knownToHprd.sql hgsql hg18 -e 'load data local infile "knownToHprd.tab" into table knownToHprd' hgsql hg18 -e 'select count(*) from knownToHprd' # 19,646 records created. # remove temporary files. rm j* # Do the same for hg17. See hg17.txt for details. ########################################################################## # ORegAnno: oreganno, oregannoAttr, oregannoLink # Belinda Giardine August 3, 2007 # updated Oct 26, 2007 # updated July 7, 2008 # This has regulatory annotations from ORegAnno. # Get updated file from ORegAnno wiki page # http://www.bcgsc.ca/wiki/display/oreganno/DataFiles # Parse flat file into 3 tables, truncate tables, load. # Has other species but only Human, Fly, sacSer1 has enough entries for now. 
cd /cluster/store6/giardine/oreganno/20071026/ ~giardine/oreganno/parseOra hg18 < oreganno_UCSC_25Oct07.txt hgsql hg18 truncate table oreganno; truncate table oregannoAttr; truncate table oregannoLink; quit; grep "^chr" oreganno.hg18.txt | sort -k1,1 -k2,2n > oreganno.bed hgLoadBed hg18 oreganno oreganno.bed -noSort -oldTable -tab hgLoadSqlTab -oldTable hg18 oregannoAttr ~/humPhen/kent/src/hg/lib/oreganno.sql oregannoAttr.hg18.txt hgLoadSqlTab -oldTable hg18 oregannoLink ~/humPhen/kent/src/hg/lib/oreganno.sql oregannoLink.hg18.txt ########################################################################## # LIFT ACEMBLY FROM HG17 TO HG18 (DONE, Fan, 9/28/06) # OBSOLETED BY LOAD OF NEW DATA, SEE BELOW 8/28/07 angie # get acembly data from hg17 hgsql hg17 -N -e 'select * from acembly' >hg17Acembly.gp # lift to hg18 zcat /gbdb/hg17/liftOver/hg17ToHg18.over.chain.gz | \ liftOver hg17Acembly.gp stdin acembly.gp unMapped.gp -genePred # load the genePred table ldHgGene hg18 acembly -predTab acembl.gp # get acemblyPep and acemblyClass table from hg17 and load them into hg18. hgsql hg17 -N -e 'select * from acemblyPep' >acemblyPep.tab hgsql hg18 -e 'drop table acemblyPep' hgsql hg18 < ~/src/hg/lib/acemblyPep.sql hgsql hg18 -e 'load data local infile "acemblyPep.tab" into table acemblyPep' hgsql hg17 -N -e 'select * from acemblyClass' >acemblyClass.tab hgsql hg18 -e 'drop table acemblyClass' hgsql hg18 < ~/src/hg/lib/acemblyClass.sql hgsql hg18 -e 'load data local infile "acemblyClass.tab" into table acemblyClass' ########################################################################## # LIFT RNAGENE FROM HG17 TO HG18 (DONE, Robert, 10/3/06) mkdir /cluster/data/hg18/bed/rnaGene cd /cluster/data/hg18/bed/rnaGene hgsql hg18 < rnaGene.sql liftOver ~/hg17/rnaGene/rnaGenes.tab /gbdb/hg17/liftOver/hg17ToHg18.over.chain.gz rnaGenes.bed unmapped -bedPlus=10 -tab hgLoadBed hg18 rnaGene rnaGenes.bed -oldTable -tab -noBin ########################################################################## # SWAP/CHAIN/NET GASACU1 (DONE 10/17/06 angie) ssh kkstore02 mkdir /cluster/data/hg18/bed/blastz.gasAcu1.swap cd /cluster/data/hg18/bed/blastz.gasAcu1.swap doBlastzChainNet.pl -swap /cluster/data/gasAcu1/bed/blastz.hg18/DEF \ -chainMinScore=2000 -chainLinearGap=loose >& do.log & tail -f do.log ln -s blastz.gasAcu1.swap /cluster/data/hg18/bed/blastz.gasAcu1 nice featureBits hg18 chainGasAcu1Link #55424609 bases of 2881515245 (1.923%) in intersection ########################################################################## # YALE TRANSCRIPTIONALLY ACTIVE REGIONS (TARs/TransFrags) TRACK IDENTIFIED # # USING A WHOLE GENOME TILING ARRAY (DONE, 2006-10-12 - 2006-10-13, hartera) # Data is from the paper: Bertone et al. Science 24 December 2004: # Vol. 306. no. 5705, pp. 2242 - 2246. From Mark Gerstein's lab at Yale. # Contact at Yale: Joel S. Rozowsky, joel.rozowsky at yale.edu # The data consist of Transcriptionally Active Regions (TARs or TransFrags) # found using Affymetrix genome tiling arrays. The data is from the lab # of Mark Gerstein at Yale. ssh kkstore02 mkdir /cluster/data/hg18/bed/yaleBertoneTars/ cd /cluster/data/hg18/bed/yaleBertoneTars/ # download Bertone et al. data from this URL: #http://dart.gersteinlab.org/cgi-bin/ar/download.cgi?ID=TAR_data_NCBI31.txt # and put it in this directory. # The sequences used to design the microarrays were from # UCSC hg13/NCBI Build 31 so the sequences # should be aligned again using Blat since this is probably better # than using liftOver across so many assemblies. 
# Get sequences from TARs file and put in FASTA format: # Remove characters from Windows: dos2unix TAR_data_NCBI31.txt # The TARs are in order of IDs in the file so the first TAR has ID 1, the # second is 2 up to the last which is 17517. These IDs are used to link # to the DART database of TARs at Yale so use these IDs in the FASTA # header lines. Need to add "TAR" as prefix to ID so that it is unique # in the seq table. awk 'BEGIN {FS="\t";n=0;}{if ($1 ~ /^chr/) print ">TAR"n"\n"$14"\n";n++;}' \ TAR_data_NCBI31.txt > yaleBertoneTARSeqs.fa ssh pk mkdir -p /san/sanvol1/scratch/hg18/TARs/ cp /cluster/data/hg18/bed/yaleBertoneTars/yaleBertoneTARSeqs.fa \ /san/sanvol1/scratch/hg18/TARs/ # Set up to Blat the TAR sequences against hg18 cd /cluster/data/hg18/bed/yaleBertoneTars ls -1 /san/sanvol1/scratch/hg18/TARs/yaleBertoneTARSeqs.fa > tars.lst ls -1 /san/sanvol1/scratch/hg18/nib/*.nib > genome.lst # output dir mkdir psl cat << '_EOF_' > template.sub #LOOP /cluster/bin/x86_64/blat -repeats=lower -minIdentity=90 -ooc=/san/sanvol1/scratch/hg18/11.ooc $(path1) $(path2) {check out line+ psl/$(root1)_$(root2).psl} #ENDLOOP '_EOF_' # << for emacs gensub2 genome.lst tars.lst template.sub para.spec para create para.spec para try, para check, para push ... para time # Completed: 49 of 49 jobs #CPU time in finished jobs: 396s 6.61m 0.11h 0.00d 0.000y #IO & Wait Time: 198s 3.29m 0.05h 0.00d 0.000 y #Average job time: 12s 0.20m 0.00h 0.00d #Longest running job: 0s 0.00m 0.00h 0.00d #Longest finished job: 39s 0.65m 0.01h 0.00d #Submission to last job: 253s 4.22m 0.07h 0.00d # sort and then filter pslSort dirs raw.psl tmp psl # use these parameters as for Genbank alignments of native mRNAs # for finished assemblies. pslCDnaFilter -minId=0.96 -minCover=0.25 -localNearBest=0.001 \ -minQSize=20 -minNonRepSize=16 -ignoreNs -bestOverlap \ raw.psl yaleBertoneTars.psl # seqs aligns # total: 17512 38243 # drop minNonRepSize: 159 403 # drop minIdent: 3822 14798 # drop minCover: 563 895 # weird over: 242 832 # kept weird: 204 210 # drop localBest: 2410 4018 # kept: 17469 18129 # 99.75% were kept. # check how many aligned grep '>' yaleBertoneTARSeqs.fa | wc -l # 17517 # 99.7% of the original set of sequences are in this filtered PSL file. pslCheck yaleBertoneTars.psl # psl is ok # load into database ssh hgwdev cd /cluster/data/hg18/bed/yaleBertoneTars hgLoadPsl hg18 yaleBertoneTars.psl # Add sequences to /gbdb/hg18 and to seq and extFile tables. mkdir -p /gbdb/hg18/yaleTARs/ ln -s /cluster/data/hg18/bed/yaleBertoneTars/yaleBertoneTARSeqs.fa \ /gbdb/hg18/yaleTARs/ hgLoadSeq hg18 /gbdb/hg18/yaleTARs/yaleBertoneTARSeqs.fa # Add trackDb.ra entry to trackDb/human/trackDb.ra and create # a description page. ############################################################################## # Update upstream maf files, fixing a problem of RefSeq ID being trucated. 
(2006-10-20 Fan) ssh hgwdev cd /cluster/data/hg18/bed/multiz17way cd mafDownloads # upstream mafs (mafFrags takes a while) cat > mafFrags.csh << 'EOF' date foreach i (1000 2000 5000) echo "making upstream$i.maf" nice featureBits hg18 refGene:upstream:$i -fa=/dev/null -bed=up.bad cat up.bad|sed -e "s/_up_${i}_/\t/" >up.bad2 awk -F '\t' '{printf("%s\t%s\t%s\t%s\t%s\t%s\n", $1, $2, $3, $4, 0, $6)}' up.bad2 > up.bed rm up.bad up.bad2 nice mafFrags hg18 multiz17way up.bed upstream$i.maf \ -orgs=/cluster/store11/gs.19/build36/bed/multiz17way.2006-02-18/species.lst rm up.bed end date 'EOF' # << happy emacs time csh mafFrags.csh > mafFrags.log nice gzip up*.maf md5sum up*.gz >> md5sum.txt ######################################################################### # BLASTZ/CHAIN/NET FELCAT3 (Done Nov 09 2006 heather) # working in /cluster/data/felCat3 because /cluster/data/hg18 is 96% full # make this a link in /cluster/data/hg18 mkdir /cluster/data/felCat3/bed/blastz.hg18.2006-11-09 ln -s /cluster/data/felCat3/bed/blastz.hg18.2006-11-09 /cluster/data/hg18/bed/blastz.felCat3 cd /cluster/data/felCat3/bed/blastz.hg18.2006-11-09 cat << '_EOF_' > DEF BLASTZ_M=50 # TARGET: Human Hg18 # Can we use 2bit here? SEQ1_DIR=/scratch/hg/hg18/nib SEQ1_LEN=/scratch/hg/hg18/chrom.sizes SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY: Cat felCat3 SEQ2_DIR=/san/sanvol1/scratch/felCat3/felCat3.2bit SEQ2_LEN=/san/sanvol1/scratch/felCat3/chrom.sizes # Maximum number of scaffolds that can be lumped together SEQ2_LIMIT=500 SEQ2_CHUNK=30000000 SEQ2_LAP=0 BASE=/cluster/data/felCat3/bed/blastz.hg18.2006-11-09 TMPDIR=/scratch/tmp '_EOF_' # << this line keeps emacs coloring happy doBlastzChainNet.pl DEF \ -bigClusterHub pk \ -chainMinScore=3000 -chainLinearGap=medium \ -blastzOutRoot /cluster/bluearc/felCat3/blastz.hg18 >& do.log & tail -f do.log nice featureBits -chrom=chr1 hg18 chainFelCat3Link # 86932463 bases of 224999719 (38.637%) in intersection # reciprocal best net mafs for multiz ~/kent/src/hg/utils/automation/doRecipBest.pl hg18 felCat3 >&! rbest.log & ######################################################################### # BLASTZ/CHAIN/NET BOSTAU3 (Done Feb 2007 heather) mkdir /cluster/data/hg18/bed/blastz.bosTau3.2007-02-23 ln -s /cluster/data/hg18/bed/blastz.bosTau3.2007-02-23 /cluster/data/hg18/bed/blastz.bosTau3 cd /cluster/data/hg18/bed/blastz.bosTau3 cat << '_EOF_' > DEF BLASTZ_M=50 # TARGET: Human Hg18 SEQ1_DIR=/san/sanvol1/scratch/hg18/nib SEQ1_LEN=/cluster/data/hg18/chrom.sizes SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY: Cow bosTau3 SEQ2_DIR=/san/sanvol1/scratch/bosTau3/bosTau3.2bit SEQ2_LEN=/san/sanvol1/scratch/bosTau3/chrom.sizes # Maximum number of scaffolds that can be lumped together SEQ2_LIMIT=500 SEQ2_CHUNK=50000000 SEQ2_LAP=0 BASE=/cluster/data/hg18/bed/blastz.bosTau3.2007-02-23 TMPDIR=/scratch/tmp '_EOF_' # << this line keeps emacs coloring happy doBlastzChainNet.pl DEF \ -bigClusterHub pk \ -chainMinScore=3000 -chainLinearGap=medium \ -blastzOutRoot /cluster/bluearc/bosTau3/blastz.hg18 >& do.log & tail -f do.log nice featureBits -chrom=chr1 hg18 chainBosTau3Link # 114562908 bases of 224999719 (50.917%) in intersection ############################################################################## # MAKE tfbsConsSites and tfbsConsFactors for TFBS conserved track (DONE weirauch 11/19/06) # Questions? 
weirauch at soe.ucsc.edu or braney at soe.ucsc.edu ssh hgwdev mkdir /cluster/data/hg18/bed/tfbsCons cd /cluster/data/hg18/bed/tfbsCons # Define all parameters in 'PARAMS.txt' # Define all chromosomes in 'CHROMS.txt' # Get tfbsConsUtils.tar.gz from Matt Weirauch with Perl scripts weirauch at soe.ucsc.edu set tarfile=/cluster/data/hg18/bed/tfbsCons/tfbsConsUtils.tar.gz tar zxf $tarfile nice ./getRefseqStats.pl & nice ./getBatchQueries.pl & ssh kk mkdir /cluster/bluearc/braney/tfloc # Copy ./tmp/ctfbs_batch_list.txt to this dir # Copy ./scripts/doit to this dir para create ctfbs_batch_list.txt para try para push # When the run is done (within a day or so), the results will be in individual dirs, one for each chromosome. ssh kksilo (or hgwdev, or whatever) nice ./getBedFile.pl & hgLoadBed -noSort hg18 tfbsConsSites -sqlTable=$HOME/kent/src/hg/lib/tfbsConsSites.sql tfbsConsSites.bed -tab hgLoadBed -noSort hg18 tfbsConsFactors -sqlTable=$HOME/kent/src/hg/lib/tfbsConsFactors.sql tfbsConsFactors.bed -tab # Feel free to delete or gzip anything in ./tmp (particularly the huge .maf and .bed files) after the final two bed files are sucessfully loaded # fixed up the tfbsConsSites.bed file to remove extra indexes, then: hgsql -e "drop index chrom_2 on tfbsConsSites;" hg18 hgsql -e "drop index chrom_3 on tfbsConsSites;" hg18 # the tfbsConsFactors table had extra names, they were removed: for N in `cat extra.tfbsConsFactors.name` do echo "delete from tfbsConsFactors where name=\"${N}\";" hg18 hgsql -e "delete from tfbsConsFactors where name=\"${N}\";" hg18 done # the extra names were: # B$CRP_C F$DDE1_B F$STRE_01 P$GBP_Q6 V$ACAAT_B V$APOLYA_B V$ATATA_B # V$BARBIE_01 V$BEL1_B V$CAAT_01 V$CAAT_C V$CAP_01 V$DTYPEPA_B V$E2F_Q2 # V$ETF_Q6 V$ETS_Q6 V$GC_01 V$GEN_INI2_B V$GEN_INI3_B V$GEN_INI_B V$HFH8_01 # V$HOGNESS_B V$LBP1_Q6 V$LDSPOLYA_B V$LEF1_Q2 V$LPOLYA_B V$MEF3_B V$MINI19_B # V$MINI20_B V$MTATA_B V$MUSCLE_INI_B V$PADS_C V$PEA3_Q6 V$POLY_C V$SRY_01 # V$STAT4_01 V$STAT5A_03 V$STAT5A_04 V$STAT6_02 V$TAACC_B V$TANTIGEN_B # V$TEF1_Q6 V$USF2_Q6 # And re-load once again since the above data was based on transfac data that # is too new (2006-11-03 - Hiram) cd /cluster/data/hg18/bed/tfbsCons hgLoadBed -tab -strict hg18 tfbsConsSites \ -sqlTable=$HOME/kent/src/hg/lib/tfbsConsSites.sql tfbsConsSites.bed # And this leads once again to a bunch of extra names in Factors hgsql -N -e "select name from tfbsConsSites;" hg18 | sort -u > names.new hgsql -N -e "select name from tfbsConsFactors;" hg18 \ | sort -u > names.factors comm -13 names.new names.factors > names.extra.factors for N in `cat names.extra.factors` do echo "delete from tfbsConsFactors where name=\"${N}\";" hg18 hgsql -e "delete from tfbsConsFactors where name=\"${N}\";" hg18 done # Reload tfbsCons to correct errors (2007-07-17 - Hiram) cd /cluster/data/hg18/bed/tfbsCons hgLoadBed -tab hg18 tfbsConsSites \ -sqlTable=$HOME/kent/src/hg/lib/tfbsConsSites.sql tfbsConsSites.bed hgsql -N -e "select name from tfbsConsSites;" hg18 | sort -u \ > names.new.2007-07-17 # showing zero difference still, nothing more to be done comm -13 names.new.2007-07-17 names.factors ############################################################################## # REWORK PLACE ASSEMBLY CLONES ON CONTIGS AND SEQUENCE # (WORKING - 2006-10-23 - Hiram) # five different cluster runs are described here for different classes # of clones # runPlacedNotSplit - all placed clones split or not split with blat # runFish - 392 fish clones against all 378 contigs, with blat # runUnPlaced - 14,569 
clones on known contigs - with psLayout # runUnPlacedChr - 297 clones on known chroms - with psLayout # runLastOnes - 1,877 clones against 378 contigs - with blat # The original run of this forgot to split of the BAC clones that were just # a fasta file full of unordered pieces. They need to be split up # to work properly. ssh pk mkdir /san/sanvol1/scratch/hg18/coverage.2006-10-23 cd /san/sanvol1/scratch/hg18/coverage.2006-10-23 # Going to copy over the BAC clones from the previous runs and split # them up if they have too many N's (>100) (indicating pieces) # This may actually split up a couple of BACs that are not actually # pieces, but in the cases I could find, and they were rare, the big # BACs appear to break into only two pieces. # The first set to do are the clones that were used in the assembly # Since they were placed, we know where they all belong. Only 50 of # them end up being split, and then usually only in 2 pieces. # We could tediously go through each of these 50 and determine if they # are actually unordered pieces. Although this raises the question, # how could unordered pieces be used in the assembly ? Doesn't make any # sense. cat << '_EOF_' > placedClones.sh #!/bin/sh D0=placedNotSplit D1=placedSplit export D0 D1 find ../coverage/placedClones -type f | grep -v faCount.all.txt | while read F do BN=`basename "${F}"` DN=`dirname "${F}"` CHROM=`basename "${DN}"` Ncount=`faSize "${F}" | sed -e "s/N's.*//; s/.* bases (//;"` if [ "${Ncount}" -gt 99 ]; then out="${D1}/${CHROM}/${BN}" mkdir -p ${D1}/${CHROM} echo "gapSplit -minGap=100 ${F} ${out}" gapSplit -minGap=100 ${F} stdout | gzip > ${out} faSize "${F}" faSize "${out}" else out="${D0}/${CHROM}/${BN}" mkdir -p ${D0}/${CHROM} echo "cp -p ${F} ${out}" cp -p ${F} ${out} fi done '_EOF_' # << happy emacs # Going to use blat this time instead of psLayout # It is faster and appears to do just about the same exact job mkdir runPlacedNotSplit cd runPlacedNotSplit # Re-use the previous jobList sed -e "s/runPsLayout.sh/runBlat.csh/" \ ../../coverage/runPlaced/masterJobList > jobList cat << '_EOF_' > runBlat.csh #!/bin/csh -fe set chrom = $1 set clone = $2 set contig = $3 set result = $4 set target = /san/sanvol1/scratch/hg18/coverage/maskedContigs/$chrom/$contig.fa.gz set query = /san/sanvol1/scratch/hg18/coverage.2006-10-23/placedNotSplit/$chrom/$clone.fa.gz if ( ! -f $query ) then set query = /san/sanvol1/scratch/hg18/coverage.2006-10-23/placedSplit/$chrom/$clone.fa.gz endif set scrTmp = "/scratch/tmp/$contig/$clone" set ooc = /san/sanvol1/scratch/hg18/coverage/contigOoc/$contig.10.ooc mkdir -p "$scrTmp" zcat $target > $scrTmp/$contig.fa zcat $query > $scrTmp/$clone.fa cp -p $ooc $scrTmp/10.ooc pushd $scrTmp pwd ls -l blat -minIdentity=98 -fastMap -tileSize=10 -t=dna -q=dna -ooc=10.ooc $contig.fa $clone.fa $clone.psl popd mkdir -p psl/$chrom/$contig cp -p $scrTmp/$clone.psl $result rm $scrTmp/* rmdir $scrTmp rmdir --ignore-fail-on-non-empty /scratch/tmp/$contig '_EOF_' # << happy emacs para create jobList para try; para check; etc ... 
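# (added sketch, not part of the original run) placedClones.sh above only
# splits a clone when faSize reports more than 99 N's; a quick tally of the
# two output directories checks the "only 50 of them end up being split"
# note (run from this runPlacedNotSplit directory):
find ../placedSplit -type f | wc -l
find ../placedNotSplit -type f | wc -l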
para time # Completed: 27093 of 27093 jobs # CPU time in finished jobs: 435042s 7250.69m 120.84h 5.04d 0.014 y # IO & Wait Time: 74031s 1233.86m 20.56h 0.86d 0.002 y # Average job time: 19s 0.31m 0.01h 0.00d # Longest finished job: 463s 7.72m 0.13h 0.01d # Submission to last job: 3079s 51.32m 0.86h 0.04d # combine the results into one large raw.psl file time pslSort dirs raw.psl tmp psl/*/* ls -og raw.psl # -rw-rw-r-- 1 52067774 Oct 31 12:06 raw.psl # This raw.psl file will be included in the overall results, but as a # check, it is possible to turn just these results into a .bed file for # uploading as a custom track to take a look at them. time pslReps -nohead -nearTop=0.001 -singleHit \ raw.psl repsSingle.psl /dev/null clusterClone -allowDuplicates -agp -minCover=80 \ -maxGap=60000 repsSingle.psl > single.agp 2> single.out sort -k1,1 -k2,2n single.agp | ../../coverage/fixPhase.pl \ /cluster/data/hg18/bed/coverage/phase.txt > contig_overlaps.agp \ 2> singleToOverlaps.out awk -F'\t' '{printf "%s\t%s\t%s\t%s\t0\t%s\n", $1,$2,$3,$6,$9}' \ contig_overlaps.agp > cOverlaps.bed liftUp chrOverlaps.bed /san/sanvol1/scratch/hg18/bacends/liftContigs.lft \ warn cOverlaps.bed # Load up that chrOverlaps.bed as a custom track to see these results ################################################################## # The next big group are the FISH clones cd /san/sanvol1/scratch/hg18/coverage.2006-10-23 # Split or not split depending on gap count >= 100 cat << '_EOF_' > splitFishClones.sh #!/bin/sh D0=fishSplit export D0 find ../coverage/fishClones/sequence -type f | while read F do BN=`basename "${F}"` Ncount=`faSize "${F}" | sed -e "s/N's.*//; s/.* bases (//;"` if [ "${Ncount}" -gt 99 ]; then out="${D0}/fishPieces/${BN}" echo "gapSplit -minGap=100 ${F} ${out}" gapSplit -minGap=100 ${F} stdout | gzip > ${out} faSize "${F}" faSize "${out}" else out="${D0}/noPieces/${BN}" echo "cp -p ${F} ${out}" cp -p "${F}" "${out}" fi done '_EOF_' # << happy emacs mkdir fishSplit chmod +x splitFishClones.sh time ./splitFishClones.sh # combine them all into large fasta files to lower the file count cd fishSplit for F in fishPieces/* noPieces/* do zcat "${F}" done | gzip > all.fa.gz faSplit about all.fa.gz 500000 split/f_ mkdir /san/sanvol1/scratch/hg18/coverage.2006-10-23/runFish cd /san/sanvol1/scratch/hg18/coverage.2006-10-23/runFish ls ../fishSplit/split | sed -e "s/.fa.gz//" > fish.list ls /san/sanvol1/scratch/hg18/coverage/maskedContigs/?/* | \ sed -e \ "s#/san/sanvol1/scratch/hg18/coverage/maskedContigs/##; s#.fa.gz##" \ > contig.list ls /san/sanvol1/scratch/hg18/coverage/maskedContigs/??/* | \ sed -e \ "s#/san/sanvol1/scratch/hg18/coverage/maskedContigs/##; s#.fa.gz##" \ >> contig.list ls /san/sanvol1/scratch/hg18/coverage/maskedContigs/?_*/* | \ sed -e \ "s#/san/sanvol1/scratch/hg18/coverage/maskedContigs/##; s#.fa.gz##" \ >> contig.list ls /san/sanvol1/scratch/hg18/coverage/maskedContigs/??_*/* | \ sed -e \ "s#/san/sanvol1/scratch/hg18/coverage/maskedContigs/##; s#.fa.gz##" \ >> contig.list cat << '_EOF_' > template #LOOP ./runBlat.csh $(path1) $(path2) {check out line+ psl/$(root1)/$(root2).psl} #ENDLOOP '_EOF_' # << happy emacs cat << '_EOF_' > runBlat.csh #!/bin/csh -fe set target = /san/sanvol1/scratch/hg18/coverage/maskedContigs/$1.fa.gz set query = /san/sanvol1/scratch/hg18/coverage.2006-10-23/fishSplit/split/$2.fa.gz set contig = $target:t:r:r set fishPiece = $query:t:r:r set result = psl/$contig/$fishPiece.psl set scrTmp = "/scratch/tmp/$contig/$fishPiece" set ooc = 
/san/sanvol1/scratch/hg18/coverage/contigOoc/$contig.10.ooc mkdir -p "$scrTmp" zcat $target > $scrTmp/$contig.fa zcat $query > $scrTmp/$fishPiece.fa cp -p $ooc $scrTmp/10.ooc pushd $scrTmp pwd ls -l blat -fastMap -tileSize=10 -t=dna -q=dna -ooc=10.ooc $contig.fa $fishPiece.fa $fishPiece.psl popd mkdir -p psl/$contig cp -p $scrTmp/$fishPiece.psl $result rm $scrTmp/* rmdir $scrTmp rmdir --ignore-fail-on-non-empty /scratch/tmp/$contig '_EOF_' # << happy emacs chmod +x runBlat.csh para create contig.list fish.list template jobList para try; para create; etc ... para time # Completed: 148176 of 148176 jobs # CPU time in finished jobs: 2884533s 48075.56m 801.26h 33.39d 0.091 y # IO & Wait Time: 385142s 6419.03m 106.98h 4.46d 0.012 y # Average job time: 22s 0.37m 0.01h 0.00d # Longest finished job: 270s 4.50m 0.07h 0.00d # Submission to last job: 9510s 158.50m 2.64h 0.11d # put all the results together into a single file pslSort dirs raw.psl tmp psl/* # this is a big result ls -og raw.psl # -rw-rw-r-- 1 6972351482 Oct 25 16:25 raw.psl # can do the same thing as above to look at these results individually # not listed here ################################################################## # The next big group are the unplaced clones. In the original run, the # contig location of these items were inferred from Hg17 results, and # thus many of them can be aligned against their respective contig. For # some cases, the contig isn't known, but the chrom is, thus they can be # aligned to all the contigs for a chrom. And finally, those completely # unknown have to be aligned to all contigs. # There are two sections here, those for which contig details are # unknown, and those for which contigs are known. First, those for # which details are unknown: cd /san/sanvol1/scratch/hg18/coverage.2006-10-23 cat << '_EOF_' > splitUnplacedClones.sh #!/bin/sh find ../coverage/unPlacedClones -type f | while read F do BN=`basename "${F}"` DN=`dirname "${F}"` CONTIG=`basename "${DN}"` DN=`dirname "${DN}"` CHROM=`basename "${DN}"` out="unPlacedSplit/${CHROM}/${CONTIG}/${BN}" # echo "${CHROM}/${CONTIG}/${BN}" mkdir -p unPlacedSplit/${CHROM}/${CONTIG} Ncount=`faSize "${F}" | sed -e "s/N's.*//; s/.* bases (//;"` if [ "${Ncount}" -gt 99 ]; then echo "gapSplit -minGap=100 ${F} ${out}" gapSplit -minGap=100 ${F} stdout | gzip > ${out} faSize "${F}" faSize "${out}" fi done '_EOF_' # << happy emacs chmod +x splitUnplacedClones.sh mkdir unPlacedSplit time ./splitUnplacedClones.sh > unPlaced.out 2>&1 mkdir /san/sanvol1/scratch/hg18/coverage.2006-10-23/runUnPlacedChr cd /san/sanvol1/scratch/hg18/coverage.2006-10-23/runUnPlacedChr ls ../unPlacedSplit/*/XX*/*.fa.gz > bac.list cat << '_EOF_' > mkJobList.sh #!/bin/sh cat bac.list | while read F do CHR=`echo "${F}" | sed -e "s#.*unPlacedSplit/##; s#/.*##"` CLONE=`basename ${F} | sed -e "s/.fa.gz//"` case $CHR in U|Un) for C in /san/sanvol1/scratch/hg18/coverage/maskedContigs/? \ /san/sanvol1/scratch/hg18/coverage/maskedContigs/?? 
\ /san/sanvol1/scratch/hg18/coverage/maskedContigs/?_* \ /san/sanvol1/scratch/hg18/coverage/maskedContigs/??_* do CH=`basename ${C}` for CT in /san/sanvol1/scratch/hg18/coverage/maskedContigs/${CH}/* do CONTIG=`basename ${CT} | sed -e "s/.fa.gz//"` echo "./runPsLayout.sh $CH $CLONE $CONTIG {check out line+ psl/$ CH/$CONTIG/$CLONE.psl}" done done ;; *) for CT in /san/sanvol1/scratch/hg18/coverage/maskedContigs/${CHR}/* do CONTIG=`basename ${CT} | sed -e "s/.fa.gz//"` echo "./runPsLayout.sh $CHR $CLONE $CONTIG {check out line+ psl/$CHR /$CONTIG/$CLONE.psl}" done ;; esac '_EOF_' # << happy emacs chmod +x mkJobList.sh ./mkJobList.sh > jobList cat << '_EOF_' > runPsLayout.sh #!/bin/sh # runPsLayout.sh # where is the chrom this contig is on # is one of the .fa.gz files in # /san/sanvol1/scratch/hg18/coverage.2006-10-23/unPlacedSplit//.fa.gz # is one of the contigs found in: # /san/sanvol1/scratch/hg18/coverage/maskedContigs//.fa.gz # HERE=`pwd` CHROM=$1 CLONE=$2 CONTIG=$3 TARGET=/san/sanvol1/scratch/hg18/coverage/maskedContigs/$CHROM/$CONTIG.fa.gz CLONESRC=/san/sanvol1/scratch/hg18/coverage.2006-10-23/unPlacedSplit/$CHROM/XX_000000/$CLONE.fa.gz OOC=/san/sanvol1/scratch/hg18/coverage/maskedContigs/ooc/$CHROM/$CONTIG.10.ooc RESULT="${HERE}/psl/${CHROM}/${CONTIG}/${CLONE}.psl" export CHROM CLONE CONTIG TARGET CLONESRC RESULT mkdir -p psl/${CHROM}/${CONTIG} if [ ! -s ${CLONESRC} ]; then CLONESRC=/san/sanvol1/scratch/hg18/coverage.2006-10-23/unPlacedSplit/U/XX_000000/$CLONE.fa.gz if [ ! -s ${CLONESRC} ]; then CLONESRC=/san/sanvol1/scratch/hg18/coverage.2006-10-23/unPlacedSplit/Un/XX_000000/$CLONE.fa.gz if [ ! -s ${CLONESRC} ]; then echo "Can not find: ${CLONESRC}" 1>/dev/stderr exit 255 fi fi fi if [ ! -s ${TARGET} ]; then echo "Can not find: ${TARGET}" 1>/dev/stderr exit 255 fi if [ ! -s ${OOC} ]; then echo "Can not find: ${OOC}" 1>/dev/stderr exit 255 fi WRKDIR="/scratch/tmp/hg18_${CHROM}/${CONTIG}/${CLONE}" mkdir -p "${WRKDIR}" cd ${WRKDIR} zcat ${CLONESRC} > ${CLONE}.fa zcat ${TARGET} > ${CONTIG}.fa cp -p ${OOC} ./10.ooc /cluster/bin/x86_64/psLayout ${CONTIG}.fa ${CLONE}.fa genomic 10.ooc ${RESULT} RET=$? cd ${HERE} rm -fr ${WRKDIR} rmdir --ignore-fail-on-non-empty "/scratch/tmp/hg18_${CHROM}/${CONTIG}" rmdir --ignore-fail-on-non-empty "/scratch/tmp/hg18_${CHROM}" exit ${RET} '_EOF_' # << happy emacs chmod +x ./runPsLayout.sh mkdir psl para create jobList para try; para check; ... etc ... 
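# (added sketch, not part of the original run) each jobList line is
# "./runPsLayout.sh <chrom> <clone> <contig> ...", so a per-chromosome tally
# of field 2 is a cheap sanity check on the ~40k-job batch:
awk '{print $2}' jobList | sort | uniq -c | sort -rn | head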
para time # Completed: 40509 of 40509 jobs # CPU time in finished jobs: 5354801s 89246.69m 1487.44h 61.98d 0.170 y # IO & Wait Time: 115279s 1921.31m 32.02h 1.33d 0.004 y # Average job time: 135s 2.25m 0.04h 0.00d # Longest finished job: 164276s 2737.93m 45.63h 1.90d # Submission to last job: 187712s 3128.53m 52.14h 2.17d # combine into one result file pslSort dirs raw.psl tmp psl/*/* ################################################################## # Now, for those unplaced clones for which contig details are known ssh pk mkdir /san/sanvol1/scratch/hg18/coverage.2006-10-23/runUnPlaced cd /san/sanvol1/scratch/hg18/coverage.2006-10-23/runUnPlaced cat << '_EOF_' > mkJobList.sh #!/bin/sh find ../unPlacedSplit -type f | grep -v XX_ | while read F do BN=`basename ${F} | sed -e "s/.fa.gz//"` DN=`dirname ${F}` CONTIG=`basename ${DN}` DN=`dirname ${DN}` CHROM=`basename ${DN}` echo "./runPsLayout.sh ${CHROM} ${BN} ${CONTIG} {check out line+ psl/${CHROM }/${CONTIG}/${BN}.psl}" done '_EOF_' # << happy emacs chmod +x mkJobList.sh ./mkJobList.sh > jobList cat << '_EOF_' > runPsLayout.sh #!/bin/sh # runPsLayout.sh # where is the chrom this contig is on # is one of the .fa.gz files in # /san/sanvol1/scratch/hg18/coverage.2006-10-23/unPlacedSplit//.fa.gz # is one of the contigs found in: # /san/sanvol1/scratch/hg18/coverage/maskedContigs//.fa.gz # HERE=`pwd` CHROM=$1 CLONE=$2 CONTIG=$3 TARGET=/san/sanvol1/scratch/hg18/coverage/maskedContigs/$CHROM/$CONTIG.fa.gz CLONESRC=/san/sanvol1/scratch/hg18/coverage.2006-10-23/unPlacedSplit/$CHROM/$CONTIG/$CLONE.fa.gz OOC=/san/sanvol1/scratch/hg18/coverage/maskedContigs/ooc/$CHROM/$CONTIG.10.ooc RESULT="${HERE}/psl/${CHROM}/${CONTIG}/${CLONE}.psl" mkdir -p psl/${CHROM}/${CONTIG} if [ ! -s ${CLONESRC} ]; then echo "Can not find: ${CLONESRC}" 1>/dev/stderr exit 255 fi if [ ! -s ${TARGET} ]; then echo "Can not find: ${TARGET}" 1>/dev/stderr exit 255 fi if [ ! -s ${OOC} ]; then echo "Can not find: ${OOC}" 1>/dev/stderr exit 255 fi WRKDIR="/scratch/tmp/hg18_${CHROM}/${CONTIG}/${CLONE}" mkdir -p "${WRKDIR}" cd ${WRKDIR} zcat ${CLONESRC} > ${CLONE}.fa zcat ${TARGET} > ${CONTIG}.fa cp -p ${OOC} ./10.ooc /cluster/bin/x86_64/psLayout ${CONTIG}.fa ${CLONE}.fa genomic 10.ooc ${RESULT} RET=$? cd ${HERE} rm -fr ${WRKDIR} rmdir --ignore-fail-on-non-empty "/scratch/tmp/hg18_${CHROM}/${CONTIG}" rmdir --ignore-fail-on-non-empty "/scratch/tmp/hg18_${CHROM}" exit ${RET} '_EOF_' # << happy emacs chmod +x runPsLayout.sh para create jobList para try; para check; ... etc ... para time # Completed: 14569 of 14569 jobs # CPU time in finished jobs: 4863551s 81059.19m 1350.99h 56.29d 0.154 y # IO & Wait Time: 64196s 1069.93m 17.83h 0.74d 0.002 y # Average job time: 338s 5.64m 0.09h 0.00d # Longest finished job: 36681s 611.35m 10.19h 0.42d # Submission to last job: 68213s 1136.88m 18.95h 0.79d # combine into a single result pslSort dirs raw.psl tmp psl/*/* # combine into a single result time pslSort dirs raw.psl tmp psl/* # real 550m57.744s # user 324m56.251s # sys 10m15.358s ls -og raw.psl # -rw-rw-r-- 1 39273644954 Nov 2 20:23 raw.psl # Wow ... 
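# (added sketch, not part of the original run) before filtering a ~39 GB
# raw.psl it is worth a quick census -- total alignments and distinct clone
# names (qName is column 10 of a headerless psl):
wc -l raw.psl
cut -f10 raw.psl | sort -T /tmp -u | wc -l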
time pslReps -nohead -nearTop=0.001 -singleHit \ raw.psl repsSingle.psl /dev/null # real 15m14.462s # user 13m6.580s # sys 1m50.304s ls -og repsSingle.psl # -rw-rw-r-- 1 73403317 Nov 3 09:44 repsSingle.psl ########################################################### # And now, combining all results together mkdir /san/sanvol1/scratch/hg18/coverage.2006-10-23/finalPsl cd /san/sanvol1/scratch/hg18/coverage.2006-10-23/finalPsl ln -s ../runLastOnes/repsSingle.psl lastOnes.psl ln -s ../runFish/raw.psl fish.psl ln -s ../runUnPlaced/raw.psl unPlaced.psl ln -s ../runUnPlacedChr/raw.psl unPlacedChr.psl ln -s ../runPlacedNotSplit/raw.psl placed.psl cd /san/sanvol1/scratch/hg18/coverage.2006-10-23 time pslSort dirs raw.psl tmp finalPsl # real 18m53.770s # user 12m19.002s # sys 1m17.504s ls -og raw.psl # -rw-rw-r-- 1 7742802124 Nov 3 10:10 raw.psl time pslReps -nohead -nearTop=0.001 -singleHit \ raw.psl repsSingle.psl /dev/null clusterClone -allowDuplicates -agp -minCover=80 \ -maxGap=60000 repsSingle.psl > single.agp 2> single.out sort -k1,1 -k2,2n single.agp | ../coverage/fixPhase.pl \ /cluster/data/hg18/bed/coverage/phase.txt > contig_overlaps.agp \ 2> singleToOverlaps.out awk -F'\t' '{printf "%s\t%s\t%s\t%s\t0\t%s\n", $1,$2,$3,$6,$9}' \ contig_overlaps.agp > cOverlaps.bed liftUp chrOverlaps.bed /san/sanvol1/scratch/hg18/bacends/liftContigs.lft \ warn cOverlaps.bed # Load up that chrOverlaps.bed as a custom track to see these results # And back to the original business of eliminating obsolete clones awk '{print $6}' contig_overlaps.agp | sort -u > clone.coverage.list time $HOME/kent/src/hg/makeDb/hgClonePos/ckMultipleVersions.pl \ clone.coverage.list > /dev/null 2> obsolete.clones time $HOME/kent/src/hg/makeDb/hgClonePos/removeObsoleteClones.sh \ contig_overlaps.agp obsolete.clones > clean_overlaps.agp # looks like it removes 295 lines wc -l contig_overlaps.agp clean_overlaps.agp # 613577 contig_overlaps.agp # 613507 clean_overlaps.agp mv contig_overlaps.agp contig_overlapsWithObsoletes.agp mv clean_overlaps.agp contig_overlaps.agp cd /cluster/data/hg18 # save all existing .gl files before we overwrite them all tar cvzf ./save.glFiles.tgz ./?/*.gl ./??/*.gl ./?_*/*.gl \ ./??_*/*.gl ./?/*/*.gl ./??/*/*.gl ./?_*/*/*.gl ./??_*/*/*.gl time agpToGl contig_overlaps.agp . 
-md=seq_contig.md
# real 1m4.253s
time ./jkStuff/liftGl.csh contig.gl
# saw some errors such as: NT_113974/contig.gl doesn't exist, skipping
# I'm guessing they were contigs with no alignment results
# capture these new .gl files for future reference
tar cvzf ./new.glFiles.tgz ./?/*.gl ./??/*.gl ./?_*/*.gl \
    ./??_*/*.gl ./?/*/*.gl ./??/*/*.gl ./?_*/*/*.gl ./??_*/*/*.gl
# now reload all the _gold, _gap and _gl tables
# Tested this load on a dummy database and found that the contents of
# the gold and gap tables do not change
hgGoldGapGl -chromLst=chrom.lst hg18 /cluster/store11/gs.19 build36
# Then hgClonePos uses those tables to create the Coverage track
# table: clonePos
hgClonePos -maxErr=600 -maxWarn=50000 -chromLst=chrom.lst \
    hg18 /cluster/data/hg18 ./cleanedSequence.inf /cluster/store11/gs.19 \
    > updated.clone.pos.errors 2>&1
# Now let's check for clones that are excessively wrong
cd /tmp
hgsql -N -e \
    "select chrom,chromStart,chromEnd,name,chromEnd-chromStart,seqSize from clonePos;" \
    hg18 > clonePos.hg18.lengths
awk '{if ($6 > 0) { printf "%.2f\t%s\n", 100.0*$5/$6,$0}}' \
    clonePos.hg18.lengths | sort -n > clonePos.hg18.deviations
# Looking at that list of deviations, there are still a number of them
# that are extreme deviants, but there are a lot less than there were
# before. Previously:
ave clonePos.hg18.deviations
# Q1 100.000000
# median 100.000000
# Q3 109.172500
# average 350.043843
# min 80.000000
# max 23574.310000
# count 44978
# total 15744271.980000
# standard deviation 851.762186
# Over 3,500 of them larger than 10 times too large:
awk '{if ($1 > 1000) {print}}' clonePos.hg18.deviations | wc
# 3881 27167 223039
# This new lot:
ave clonePos.hg18.deviations
# Q1 100.000000
# median 100.000000
# Q3 100.360000
# average 140.353820
# min 0.250000
# max 40838.840000
# count 43734
# total 6138233.960000
# standard deviation 381.871589
# Only 277 are larger than 10 times too big:
awk '{if ($1 > 1000) {print}}' clonePos.hg18.deviations | wc
# 277 1939 15747
# QA NOTE: ran mytouch on the *gold and *gap tables because the values were
# unchanged, but they got a new date/time in the above process (ASZ
# 11-14-2006):
# sudo mytouch hg18 'chr*_gold' 200604060800.00
# sudo mytouch hg18 'chr*_gap' 200604060800.00
##############################################################################
# LongSAGE (2006-10-20 markd)
# Load LongSAGE composite tag with genome mappings of tag clusters
# obtained from "Martin Hirst"
ftp ftp2.bcgsc.ca user: ucsc download SHE*_u.map
chmod a-w *.map
~/compbio/kent/src/hg/makeDb/outside/bcgscSage/bcgscSageLoad hg18 *_u.map
####################################################################
# MAKE UNIGENE/SAGE TRACK (DONE - 2006-11-20 Fan)
# Create the uniGene alignments
# /cluster/data/hg18/uniGene/hg18.uniGene.lifted.pslReps.psl
# Download of the latest UniGene version is now automated by a
# cron job -- see /cluster/home/angie/crontab ,
# /cluster/home/angie/unigeneVers/unigene.csh .
# If hgwdev gets rebooted, that needs to be restarted... maybe there's
# a more stable place to set up that cron job.
ssh hgwdev
cd /cluster/store11/gs.19/build36/bed
mkdir uniGene
cd uniGene
set Version = 196
zcat /cluster/store7/uniGene/uniGene.$Version/Hs.seq.uniq.gz|\
    sed -e "s#>.*/ug=#>#; s# /len.*##;" > Hs.seq.uniq.simpleHeader.fa
ssh pk
set Version = 196
mkdir -p /san/sanvol1/scratch/hg18/uniGene/
cd /san/sanvol1/scratch/hg18/uniGene/
cp -p /cluster/store11/gs.19/build36/bed/uniGene/Hs.seq.uniq.simpleHeader.fa .
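# (added sketch, not part of the original run) spot-check that the header
# rewrite above left only the UniGene cluster IDs before setting up the
# cluster run:
grep '^>' Hs.seq.uniq.simpleHeader.fa | head -3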
ls -1 /san/sanvol1/scratch/hg18/nib/*.nib > genome.lst ls -1S \ /cluster/store11/gs.19/build36/bed/uniGene/Hs.seq.uniq.simpleHeader.fa \ > uniGene.lst cat << '_EOF_' > template.sub #LOOP /cluster/bin/x86_64/blat -repeats=lower -minIdentity=95 ooc=/san/sanvol1/scratch/hg18/11.ooc $(path1) $(path2) {check out line+ psl/$(root1)_$(root2).psl} #ENDLOOP '_EOF_' gensub2 genome.lst uniGene.lst template.sub para.spec para create para.spec mkdir psl para try para check para push # Completed: 49 of 49 jobs # CPU time in finished jobs: 46855s 780.92m 13.02h 0.54d 0.001 y # IO & Wait Time: 240s 3.99m 0.07h 0.00d 0.000 y # Average job time: 961s 16.02m 0.27h 0.01d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 3629s 60.48m 1.01h 0.04d # Submission to last job: 4337s 72.28m 1.20h 0.05d pslSort dirs raw.psl tmp psl >& pslSort.log cat raw.psl|\ pslReps -minCover=0.2 -sizeMatters -minAli=0.965 -nearTop=0.002 \ stdin hg18.uniGene.pslReps.psl /dev/null # Processed 553470 alignments gzip raw.psl gzip Hs.seq.uniq.simpleHeader.fa ssh hgwdev cd /cluster/store11/gs.19/build36/bed/uniGene cp -p /san/sanvol1/scratch/hg18/uniGene/hg18.uniGene.pslReps.psl . hgLoadPsl -table=uniGene_3 hg18 hg18.uniGene.pslReps.psl #################################################################### # EXONIPHY (2006-12-05 acs) # predictions provided by Brona Brejova in Siepel Lab (bb248 at cornell.edu). # stored in /cluster/data/hg18/bed/exoniphy/exoniphy.gff ldHgGene -genePredExt -gtf hg18 exoniphy exoniphy.gff #################################################################### # HapMap CNVRs (copy number variable regions) from Matt Hurles (Heather Dec. 2006) # Change bed3 to bed6 to match hg17 cd /cluster/data/hg18/bed/sv redon.pl < cnpRedon.hg18 > redon.bed hgLoadBed hg18 cnpRedon cnpRedon.bed ######################################################### # Structural Variation from Lars Feuk (Heather Jan - April 2007) # These tables are all tiny so I'm not using indices # I kept the bin column in all but Sebat but I could have done without that, # too ssh hgwdev cd /cluster/data/hg18/bed/sv # 8 *txt files from Lars # Sharp (format different from hg17) cp Sharp*txt sharp.in # use editor to remove header from sharp.in # grab the data we need sharp.pl < sharp.in > sharp.prelim # adjust sharp2.pl < sharp.prelim > sharp.bed hgLoadBed hg18 cnpSharp2 sharp.bed -tab -sqlTable=/cluster/home/heather/kent/src/hg/lib/cnpSharp2.sql # Iafrate (format different from hg17) cp Iafrate*txt iafrate.in # use editor to change TABTAB to TAB0TAB and get rid of header iafrate.pl < iafrate.in > iafrate.bed hgLoadBed hg18 cnpIafrate2 iafrate.bed -tab -sqlTable=/cluster/home/heather/kent/src/hg/lib/cnpIafrate2.sql # Sebat (format different from hg17) cp Sebat*txt sebat.in # use editor to get rid of header sebat.pl < sebat.in > sebat.bed hgLoadBed hg18 cnpSebat2 sebat.bed -noBin -tab -sqlTable=/cluster/home/heather/kent/src/hg/lib/cnpSebat2.sql # Tuzun (I called this cnpFosmid in hg17) # simple bed 4 . cp Tuzun*txt tuzun.in # use editor to get rid of header tuzun.pl < tuzun.in > tuzun.bed hgLoadBed hg18 cnpTuzun tuzun.bed -tab # McCarroll (same format as hg17, simple bed 4 .) 
# need to sort and assign ids cp McCarroll*txt mccarroll.in # use editor to get rid of header mccarroll.pl < mccarroll.in > mccarroll.prelim sort -g mccarroll.prelim > mccarroll.sort # sort isn't perfect, use editor to finish mccarroll2.pl < mccarroll.sort > mccarroll.bed hgLoadBed hg18 delMccarroll mccarroll.bed -tab -sqlTable=/cluster/home/heather/kent/src/hg/lib/delMccarroll.sql # Conrad (different format from hg17) cp Conrad*txt conrad.prelim # use editor to shorten "Study" column conrad.pl < conrad.prelim > conrad.prelim2 cp conrad.prelim2 conrad.prelim3 # use editor to sort conrad.prelim3 (lame) # assign Ids conradId.pl < conrad.prelim3 > conrad.bed hgLoadBed hg18 delConrad2 conrad.bed -tab -sqlTable=/cluster/home/heather/kent/src/hg/lib/delConrad2.sql # Hinds (different format from hg17) cp Hinds*txt hinds.in # use editor to remove header hinds.pl < hinds.in > hinds.prelim sort -g hinds.prelim > hinds.sort # sort isn't perfect, use editor to finish hinds2.pl < hinds.sort > hinds.bed hgLoadBed hg18 delHinds2 hinds.bed -tab -sqlTable=/cluster/home/heather/kent/src/hg/lib/delHinds2.sql # Locke (new data) cp Locke*txt locke.in locke.pl < locke.in > locke.prelim sort -g locke.prelim > locke.?? locke2.pl ######################################################### # BUILD GAD TRACK (Done, 12/12/06, Fan) mkdir /cluster/store12/gad061211 rm /cluster/data/gad ln -s /cluster/store12/gad061211 /cluster/data/gad # Receive "GAD-Hg18DATA.txt" from GAD/NIA # contact person: Shenoy, Narmada, shenoyn at grc.nia.nih.gov hgsql hg18 -e 'drop table gadAll' hgsql hg18 <~/src/hg/lib/gadAll.sql hgsql hg18 -e 'load data local infile "GAD-Hg18DATA.txt" into table gadAll ignore 1 lines' hgsql hg18 -e 'create index geneSymbol on gadAll(geneSymbol(10))' # create gad table hgsql hg18 -N -e \ 'select "chr",chromosome, chromStart, chromEnd, geneSymbol from gadAll where chromStart <>0 and chromEnd <>0 and chromosome<>""'|\ sed -e 's/chr\t/chr/' |grep -v "chr\." |grep -v " "|sort -u >gadHg18.bed hgLoadBed hg18 gad gadHg18.bed ######################################################################### # BLASTZ/CHAIN/NET oryLat1 (DONE - 2006-12-14 - Hiram) # third time with randoms and chrUn in scaffolds on both sequences ssh kkstore02 mkdir /cluster/data/hg18/bed/blastz.oryLat1.2006-12-14 cd /cluster/data/hg18/bed/blastz.oryLat1.2006-12-14 cat << '_EOF_' > DEF # Human vs. 
Medaka # Try "human-fugu" (more distant, less repeat-killed than mammal) params # +M=50: BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=6000 BLASTZ_K=2200 BLASTZ_M=50 BLASTZ_Q=/cluster/data/blastz/HoxD55.q # TARGET: Human hg18, randoms in contigs, lifted to their chr*_random SEQ1_DIR=/san/sanvol1/scratch/hg18/hg18.sdTrf.2bit SEQ1_LEN=/cluster/data/hg18/chrom.sizes SEQ1_CTGDIR=/san/sanvol1/scratch/hg18/hg18.randomContigs.sdTrf.2bit SEQ1_CTGLEN=/san/sanvol1/scratch/hg18/hg18.randomContigs.sdTrf.sizes SEQ1_LIFT=/san/sanvol1/scratch/hg18/hg18.randomContigs.lift SEQ1_CHUNK=10000000 SEQ1_LIMIT=1 SEQ1_LAP=10000 # QUERY: Medaka oryLat1 (40M chunks covers the largest chroms in one gulp) # chrUn in Scaffolds for this alignment run SEQ2_DIR=/san/sanvol1/scratch/oryLat1/oryLat1.sdTrf.2bit SEQ2_LEN=/san/sanvol1/scratch/oryLat1/chrom.sizes SEQ2_CTGDIR=/san/sanvol1/scratch/oryLat1/oryLat1UnScaffolds.2bit SEQ2_CTGLEN=/san/sanvol1/scratch/oryLat1/oryLat1UnScaffolds.sizes SEQ2_LIFT=/san/sanvol1/scratch/oryLat1/chrUn.lift SEQ2_CHUNK=40000000 SEQ2_LIMIT=50 SEQ2_LAP=0 BASE=/cluster/data/hg18/bed/blastz.oryLat1.2006-12-14 TMPDIR=/scratch/tmp '_EOF_' # << this line keeps emacs coloring happy time doBlastzChainNet.pl DEF -chainMinScore=2000 -chainLinearGap=loose \ -tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \ -bigClusterHub=pk -verbose=2 \ -blastzOutRoot /cluster/bluearc/hg18OryLat1 > do.log 2>&1 & ### this did not work, abandoned ######################################################################### # BLASTZ/CHAIN/NET oryLat1 (DONE - 2006-12-14 - Hiram) # fourth time with randoms and chrUn in scaffolds for only Medaka # All chroms and randoms as they are complete on Human ssh kkstore04 mkdir /cluster/data/hg18/bed/blastz.oryLat1.2007-02-24 cd /cluster/data/hg18/bed/blastz.oryLat1.2007-02-24 cat << '_EOF_' > DEF # Human vs. Medaka # Try "human-fugu" (more distant, less repeat-killed than mammal) params # +M=50: BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=6000 BLASTZ_K=2200 BLASTZ_M=50 BLASTZ_Q=/cluster/data/blastz/HoxD55.q # TARGET: Human hg18, randoms complete, as they are, no contig confusion SEQ1_DIR=/san/sanvol1/scratch/hg18/hg18.sdTrf.2bit SEQ1_LEN=/cluster/data/hg18/chrom.sizes SEQ1_CHUNK=10000000 SEQ1_LAP=10000 SEQ1_LIMIT=1 # QUERY: Medaka oryLat1 (40M chunks covers the largest chroms in one gulp) # chrUn in Scaffolds for this alignment run SEQ2_DIR=/san/sanvol1/scratch/oryLat1/oryLat1.sdTrf.2bit SEQ2_LEN=/san/sanvol1/scratch/oryLat1/chrom.sizes SEQ2_CTGDIR=/san/sanvol1/scratch/oryLat1/oryLat1UnScaffolds.2bit SEQ2_CTGLEN=/san/sanvol1/scratch/oryLat1/oryLat1UnScaffolds.sizes SEQ2_LIFT=/san/sanvol1/scratch/oryLat1/chrUn.lift SEQ2_CHUNK=40000000 SEQ2_LIMIT=50 SEQ2_LAP=0 BASE=/cluster/data/hg18/bed/blastz.oryLat1.2007-02-24 TMPDIR=/scratch/tmp '_EOF_' # << this line keeps emacs coloring happy time doBlastzChainNet.pl DEF -chainMinScore=2000 -chainLinearGap=loose \ -tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \ -bigClusterHub=pk -verbose=2 \ -blastzOutRoot /cluster/bluearc/hg18OryLat1 > do.log 2>&1 & # real 318m45.339s # typical failure: # HgStepManager: executing step 'net'. # netChains: looks like previous stage was not successful # (can't find [hg18.oryLat1.]all.chain[.gz]). 
# continuing net: time doBlastzChainNet.pl DEF -chainMinScore=2000 -chainLinearGap=loose \ -tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \ -continue=net -bigClusterHub=pk -verbose=2 \ -blastzOutRoot /cluster/bluearc/hg18OryLat1 > net.log 2>&1 & # real 39m25.853s ssh hgwdev cd /cluster/data/hg18/bed/blastz.oryLat1.2007-02-24 nice -n +19 featureBits hg18 chainOryLat1Link \ > fb.hg18.chainOryLat1Link.txt 2>&1 & # 57393910 bases of 2881515245 (1.992%) in intersection ssh kkstore04 mkdir /cluster/data/oryLat1/bed/blastz.hg18.swap cd /cluster/data/oryLat1/bed/blastz.hg18.swap time doBlastzChainNet.pl -chainMinScore=2000 -chainLinearGap=loose \ /cluster/data/hg18/bed/blastz.oryLat1.2007-02-24/DEF \ -tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \ -bigClusterHub=pk -verbose=2 -swap > swap.log 2>&1 & ssh hgwdev cd /cluster/data/oryLat1/bed/blastz.hg18.swap nice -n +19 featureBits oryLat1 chainHg18Link \ > fb.oryLat1.chainHg18Link.txt 2>&1 & # 48002423 bases of 700386597 (6.854%) in intersection ########################################################################## # AFFY HUEX1 OFF-BY-ONE FIX (Andy 2006-12-14) ssh hgwdev cd /cluster/data/hg18/bed/affyHumanExon liftOver /cluster/data/hg17/bed/affyHumanExon/affyHuEx1.bed \ /gbdb/hg17/liftOver/hg17ToHg18.over.chain.gz affyHuEx1.bed affyHuEx1.unmapped awk 'BEGIN{OFS="\t"}{print $4,$3-$2}' affyHuEx1.bed | sort -k2,2nr | head #2440970 81664 #3016074 9552 #3641787 8061 #2321649 8054 # It seems the liftOver problem still happens for that segmental dupe. # So the start is correct and the end is correct. Just make two entries, both # with size == 305. grep -v "\b2440970\b" affyHuEx1.bed > tmp.bed grep "\b2440970\b" affyHuEx1.bed > bad.bed awk 'BEGIN{OFS="\t"}{print $1,$2,$2+305,$4,$5,$6}' bad.bed > good.bed awk 'BEGIN{OFS="\t"}{print $1,$3-305,$3,$4,$5,$6}' bad.bed >> good.bed cat tmp.bed good.bed > affyHuEx1.bed bedSort affyHuEx1.bed tmp.bed mv tmp.bed affyHuEx1.bed rm good.bed bad.bed hgLoadBed hg18 affyHuEx1 affyHuEx1.bed ########################################################################## # CGAP SAGE (In progress Andy 2007-01-09) # This is the BED part. ssh hgwdev cd /cluster/data/hg18/bed mkdir /san/sanVol1/scratch/andy/cgapSage ln -s /san/sanVol1/scratch/andy/cgapSage cgapSage wget ftp://ftp1.nci.nih.gov/pub/SAGE/SAGE_hs_long_forward_v36.1.tar.gz wget ftp://ftp1.nci.nih.gov/pub/SAGE/SAGE_hs_long_reverse_v36.1.tar.gz tar xfz SAGE_hs_long_forward_v36.1.tar.gz tar xfz SAGE_hs_long_reverse_v36.1.tar.gz cd hs_forward/ cat * | awk 'BEGIN{OFS="\t"}{print $1, $3, $4, $2, 1000, "+"}' > ../unlifted.bed cd ../hs_reverse/ cat * | awk 'BEGIN{OFS="\t"}{print $1, $4, $3, $2, 1000, "-"}' >> ../unlifted.bed cd ../ rm -rf hs* liftUp lifted.bed /cluster/data/hg18/jkStuff/liftAll.lft warn unlifted.bed #Got 378 lifts in /cluster/data/hg18/jkStuff/liftAll.lft #Lifting unlifted.bed #Expecting number field 3 line 13868252 of unlifted.bed, got CCATCGGATGCCCACCT # Looks like there was a funny line in unlifted.bed: grep CCATCGGATGCCCACCT unlifted.bed #NT_011362 24364534NT_004321 CCATCGGATGCCCACCT AATAAGCCAGAGTCTAT 1000 - #NT_004321 7900 7884 CCATCGGATGCCCACCT 1000 - # Ok so there's one record for CCATCGGATGCCCACCT in addition... and for # AATAAGCCAGAGTCTAT? grep AATAAGCCAGAGTCTAT unlifted.bed #NT_011362 24364534NT_004321 CCATCGGATGCCCACCT AATAAGCCAGAGTCTAT 1000 - #NT_011362 24364534 24364518 AATAAGCCAGAGTCTAT 1000 - # Looks like that one's got a record too. 
So just get rid of the stupid # line: grep -v 24364534NT_004321 unlifted.bed > tmp mv tmp unlifted.bed liftUp lifted.bed /cluster/data/hg18/jkStuff/liftAll.lft warn unlifted.bed rm unlifted.bed head lifted.bed #chr1 649 665 TGTCTGCGCCTGCGCCG 1000 - #chr1 670 686 CTAGCGCGTCGGGGTGG 1000 + nibFrag /cluster/data/hg18/nib/chr1.nib 669 686 "+" /dev/stdout #>/cluster/data/hg18/nib/chr1.nib:669-686 #ctagcgcgtcggggtgg nibFrag /cluster/data/hg18/nib/chr1.nib 649 665 m /dev/stdout #>/cluster/data/hg18/nib/chr1.nib:649-665 #tgtctgcgcctgcgcc # It looks like there's off-by-one errors, so fix em: awk 'BEGIN{OFS="\t"}{start=$2; end=$3;if ($6 == "-") { end = end+1; } else { start = start-1 } print $1, start, end, $4, $5, $6}' \ < lifted.bed > mapping.bed6 rm lifted.bed # Add thickStart/thickEnd fields awk 'BEGIN{OFS="\t"}{thickStart=$2; thickEnd=$3; if ($6=="-") {thickStart = thickStart+13; } else { thickEnd = thickEnd-13; } print $0, thickStart, thickEnd}' \ < mapping.bed6 > mapping.bed ########################################################################## # xxBlastTab - Help filter out unwanted paralogs (Galt 2007-01-10) # # Background: The xxBlastTab tables are made with a simple blastall # (blastp with -b 1) which chooses the best match. Unfortunately this # means that if there is no proper match it will still pick something # even though it's probably not orthologous. This is especially a problem # in organisms like rat knownGene which has only 30% gene coverage. # The strategy here is to filter our xxBlastTab using synteny mappings from # the chains. This is done by simply taking hg18.kg and using /gbdb/$db chains # and pslMap to lift the genes to the target xx assembly. Then hgMapToGene # will find which of those mapped ids have good overlap with xx.knownGene. # The final mapping is then created by doing an inner join between # the traditional xxBlastTab and the mapping table produced above. # Then simply drop the old table and rename the new table. # # # We are starting with xxBlastTab tables already built in the usual way with # blastall/blastp, probably with doHgNearBlastp.pl script. # # I created a new utility script called synBlastp.csh since I have to do this # several times. # # we want to update hg18 for rat and mouse, # so check ./hgGeneData/Human/hg18/otherOrgs.ra for current settings ssh hgwdev synBlastp.csh hg18 rn4 #hg18.rnBlastTab results: #new number of unique query values: # 13120 #new number of unique target values # 6431 #old number of unique query values: # 26982 #old number of unique target values # 6732 synBlastp.csh hg18 mm8 #hg18.mmBlastTab results: #new number of unique query values: # 28733 #new number of unique target values # 15366 #old number of unique query values: # 33016 #old number of unique target values # 15918 ########################################################################## # GenBank gbMiscDiff table (markd 2007-01-10) # Supports `NCBI Clone Validation' section of mgcGenes details page # genbank release 157.0 now contains misc_diff fields for MGC clones # reloading mRNAs results in gbMiscDiff table being created. ./bin/gbDbLoadStep -reload -srcDb=genbank -type=mrna hg18 ################################################# # BUILD ncRna TRACK (DONE, 1/12/07, Fan) # Download the terms and make the database. 
ssh hgwdev
cd /cluster/store11/gs.19/build36
cd bed
mkdir ncRna
# copy Perl file at:
# http://cvs.sanger.ac.uk/cgi-bin/viewcvs.cgi/biomart-perl/scripts/webExample.pl?view=markup
# into getBiomart.pl
# create the following query xml file, ncRna.xml:
cat << '_EOF_' >ncRna.xml
'_EOF_'
# get Ensembl gene data from BioMart and filter out protein-coding genes
perl getBiomart.pl ncRna.xml | grep -v protein_coding >ncRna0.tab
# cut and paste different cols to form ncRna.tab
cat ncRna0.tab | sed -e 's/ENSG/chr\tENSG/'>j1
cut -f 2 j1 >j.chr0
cut -f 1 j1 >j.chr
cat j.chr0|sed -e 's/chr/0/' >j.0
cut -f 6 j1 >j.strand
cut -f 4,5 j1 >j.startEnd
cut -f 3 j1 >j.name
cut -f 7 j1 >j.type
cut -f 8 j1 >j.extGeneId
paste j.chr0 j.chr j.startEnd j.name j.0 j.strand j.0 j.0 j.type j.extGeneId >j.all
cat j.all|grep -v c6_COX|grep -v c6_QBL|grep -v c5_H2\
 |sed -e 's/chr\t/chr/'\
 |grep -v NT_\
 |sed -e 's/\t-1\t/\t-\t/' |sed -e 's/\t1\t/\t+\t/' \
 |sed -e 's/chrMT/chrM/'\
 |sort -k1,1 -k2,2n -k3,3n >ncRna.tab
hgLoadBed -strict -tab -sqlTable=/cluster/home/fanhsu/src/hg/lib/ncRna.sql hg18 ncRna ncRna.tab
rm j.*
rm j1
###########################################################
# MAKE Drosophila Proteins track (DONE 2007-02-06 braney)
ssh kkstore02
sandir=/san/sanvol1/scratch/hg18
mkdir $sandir
cd /cluster/data/hg18
cat noUn/chr*fa > temp.fa
faSplit gap temp.fa 1000000 $sandir/blastDb/x -lift=$sandir/blastDb.lft
cat randomContigs/*.fa > temp.fa
faSplit sequence temp.fa 150 $sandir/blastDb/y
rm temp.fa
cd $sandir/blastDb
for i in *.fa
do
    /cluster/bluearc/blast229/formatdb -i $i -p F
done
rm *.fa
mkdir -p /cluster/data/hg18/bed/tblastn.dm2FB
cd /cluster/data/hg18/bed/tblastn.dm2FB
echo /san/sanvol1/scratch/hg18/blastDb/*.nsq | xargs ls -S | sed "s/\.nsq//" > query.lst
wc -l query.lst
# 3066 query.lst
# we want around 150000 jobs
calc `wc /cluster/data/dm2/bed/blat.dm2FB/dm2FB.psl | awk "{print \\\$1}"`/\(150000/`wc query.lst | awk "{print \\\$1}"`\)
# 18929/(150000/3066) = 386.908760
mkdir -p /cluster/bluearc/hg18/bed/tblastn.dm2FB/fbfa
split -l 387 /cluster/data/dm2/bed/blat.dm2FB/dm2FB.psl /cluster/bluearc/hg18/bed/tblastn.dm2FB/fbfa/kg
ln -s /cluster/bluearc/hg18/bed/tblastn.dm2FB/fbfa
cd fbfa
for i in *; do nice pslxToFa $i $i.fa; rm $i; done
cd ..
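# (added note, sketch only -- not part of the original run) the chunking
# arithmetic above works out as: splitting the 18929-line dm2FB.psl into
# 387-line pieces gives ceil(18929/387) = 49 protein files, and 49 files x
# 3066 blast database pieces = 150234 cluster jobs, matching the
# "Completed: 150234 of 150234 jobs" para time report below.
echo $(( (18929 + 386) / 387 * 3066 ))
# 150234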
ls -1S fbfa/*.fa > fb.lst mkdir -p /cluster/bluearc/hg18/bed/tblastn.dm2FB/blastOut ln -s /cluster/bluearc/hg18/bed/tblastn.dm2FB/blastOut for i in `cat fb.lst`; do mkdir blastOut/`basename $i .fa`; done tcsh cd /cluster/data/hg18/bed/tblastn.dm2FB cat << '_EOF_' > blastGsub #LOOP blastSome $(path1) {check in line $(path2)} {check out exists blastOut/$(root2)/q.$(root1).psl } #ENDLOOP '_EOF_' cat << '_EOF_' > blastSome #!/bin/sh BLASTMAT=/cluster/bluearc/blast229/data export BLASTMAT g=`basename $2` f=/tmp/`basename $3`.$g for eVal in 0.01 0.001 0.0001 0.00001 0.000001 1E-09 1E-11 do if /cluster/bluearc/blast229/blastall -M BLOSUM80 -m 0 -F no -e $eVal -p tblastn -d $1 -i $2 -o $f.8 then mv $f.8 $f.1 break; fi done if test -f $f.1 then if /cluster/bin/i386/blastToPsl $f.1 $f.2 then liftUp -nosort -type=".psl" -nohead $f.3 /san/sanvol1/scratch/hg18/blastDb.lft carry $f.2 liftUp -nosort -type=".psl" -pslQ -nohead $3.tmp /cluster/data/dm2/bed/blat.dm2FB/protein.lft warn $f.3 if pslCheck -prot $3.tmp then mv $3.tmp $3 rm -f $f.1 $f.2 $f.3 $f.4 fi exit 0 fi fi rm -f $f.1 $f.2 $3.tmp $f.8 $f.3 $f.4 exit 1 '_EOF_' # << happy emacs exit chmod +x blastSome gensub2 query.lst fb.lst blastGsub blastSpec ssh pk cd /cluster/data/hg18/bed/tblastn.dm2FB para create blastSpec # para try, check, push, check etc. para time # Completed: 150234 of 150234 jobs # CPU time in finished jobs: 8313632s 138560.53m 2309.34h 96.22d 0.264 y # IO & Wait Time: 882301s 14705.02m 245.08h 10.21d 0.028 y # Average job time: 61s 1.02m 0.02h 0.00d # Longest finished job: 545s 9.08m 0.15h 0.01d # Submission to last job: 40693s 678.22m 11.30h 0.47d ssh kkstore02 cd /cluster/data/hg18/bed/tblastn.dm2FB mkdir chainRun cd chainRun tcsh cat << '_EOF_' > chainGsub #LOOP chainOne $(path1) #ENDLOOP '_EOF_' cat << '_EOF_' > chainOne (cd $1; cat q.*.psl | simpleChain -prot -outPsl -maxGap=150000 stdin /cluster/bluearc/hg18/bed/tblastn.dm2FB/blastOut/c.`basename $1`.psl) '_EOF_' exit chmod +x chainOne ls -1dS /cluster/bluearc/hg18/bed/tblastn.dm2FB/blastOut/kg?? > chain.lst gensub2 chain.lst single chainGsub chainSpec # do the cluster run for chaining ssh kk cd /cluster/data/hg18/bed/tblastn.dm2FB/chainRun para create chainSpec para maxNode 30 para try, check, push, check etc. # Completed: 48 of 49 jobs # Crashed: 1 jobs # CPU time in finished jobs: 209872s 3497.86m 58.30h 2.43d 0.007 y # IO & Wait Time: 48501s 808.35m 13.47h 0.56d 0.002 y # Average job time: 5383s 89.71m 1.50h 0.06d # Longest finished job: 19336s 322.27m 5.37h 0.22d # Submission to last job: 19336s 322.27m 5.37h 0.22d ssh kkstore02 cd /cluster/data/hg18/bed/tblastn.dm2FB/blastOut for i in kg?? do cat c.$i.psl | awk "(\$13 - \$12)/\$11 > 0.6 {print}" > c60.$i.psl sort -rn c60.$i.psl | pslUniq stdin u.$i.psl awk "((\$1 / \$11) ) > 0.60 { print }" c60.$i.psl > m60.$i.psl echo $i done sort -T /tmp -k 14,14 -k 16,16n -k 17,17n u.*.psl m60* | uniq > /cluster/data/hg18/bed/tblastn.dm2FB/unliftBlastDm2FB.psl cd .. 
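# (added sketch, not part of the original run) the awk filters above use psl
# columns $1=matches, $11=qSize, $12=qStart, $13=qEnd: "c60" keeps alignments
# spanning >60% of the protein and "m60" additionally requires
# matches/qSize > 0.60.  Count the distinct FlyBase proteins that survived
# (qName is column 10):
cut -f10 unliftBlastDm2FB.psl | sort -u | wc -l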
pslCheck unliftBlastDm2FB.psl sed "s/[0-9XY]*\///" unliftBlastDm2FB.psl | liftUp -type=.psl -nohead stdout ../../randomContigs/hg18.randomContigs.lift carry stdin | sort -T /tmp -k 14,14 -k 16,16n -k 17,17n > blastDm2FB.psl # load table ssh hgwdev cd /cluster/data/hg18/bed/tblastn.dm2FB hgLoadPsl hg18 blastDm2FB.psl # check coverage featureBits hg18 blastDm2FB # 5976178 bases of 2881515245 (0.207%) in intersection featureBits hg18 knownGene:cds blastDm2FB -enrichment # knownGene:cds 1.111%, blastDm2FB 0.207%, both 0.130%, cover 11.71%, enrich 56.45x ssh kkstore04 rm -rf /cluster/data/hg18/bed/tblastn.dm2FB/blastOut rm -rf /cluster/bluearc/hg18/bed/tblastn.dm2FB/blastOut #end tblastn ########################################################################## ######################################################################### # BLASTZ/CHAIN/NET FR2 (DONE - 2007-01-26 - Hiram) ## Align to fr2 scaffolds, ## results lifted to fr2 chrUn coordinates ssh kkstore02 mkdir /cluster/data/hg18/bed/blastz.fr2.2007-01-24 cd /cluster/data/hg18/bed/blastz.fr2.2007-01-24 cat << '_EOF_' > DEF # Human vs. Fugu # Try "human-fugu" (more distant, less repeat-killed than mammal) params # +M=50: BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=6000 BLASTZ_K=2200 BLASTZ_M=50 BLASTZ_Q=/cluster/data/blastz/HoxD55.q # TARGET: Human hg18 SEQ1_DIR=/san/sanvol1/scratch/hg18/hg18.sdTrf.2bit SEQ1_LEN=/cluster/data/hg18/chrom.sizes SEQ1_CHUNK=10000000 SEQ1_LIMIT=1 SEQ1_LAP=10000 # QUERY: Fugu fr2 # Align to the scaffolds, results lifed up to chrUn.sdTrf coordinates SEQ2_DIR=/san/sanvol1/scratch/fr2/fr2.2bit SEQ2_LEN=/san/sanvol1/scratch/fr2/chrom.sizes SEQ2_CTGDIR=/san/sanvol1/scratch/fr2/fr2.scaffolds.2bit SEQ2_CTGLEN=/san/sanvol1/scratch/fr2/fr2.scaffolds.sizes SEQ2_LIFT=/san/sanvol1/scratch/fr2/liftAll.lft SEQ2_CHUNK=20000000 SEQ2_LIMIT=30 SEQ2_LAP=0 BASE=/cluster/data/hg18/bed/blastz.fr2.2007-01-24 TMPDIR=/scratch/tmp '_EOF_' # << this line keeps emacs coloring happy time doBlastzChainNet.pl DEF -chainMinScore=2000 -chainLinearGap=loose \ -tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \ -verbose=2 -bigClusterHub=pk \ -blastzOutRoot /cluster/bluearc/hg18Fr2 > do.log 2>&1 & # real 414m47.505s ## Swap back to fr2 (duplicated in fr2.txt also) mkdir /cluster/data/fr2/bed/blastz.hg18.swap cd /cluster/data/fr2/bed/blastz.hg18.swap time doBlastzChainNet.pl -verbose=2 \ /cluster/data/hg18/bed/blastz.fr2.2007-01-24/DEF \ -chainMinScore=2000 -chainLinearGap=loose \ -tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \ -bigClusterHub=pk -swap > swap.log 2>&1 & # real 47m14.554s ssh hgwdev cd /cluster/data/fr2/bed/blastz.hg18.swap time nice -n +19 featureBits fr2 chainHg18Link \ > fb.fr2.chainHg18Link.txt 2>&1 & # 42875664 bases of 393312790 (10.901%) in intersection ############################################################################ ## BLASTZ mm8 test with WindowMasker sequence (DONE - 2007-01-30 - Hiram) ssh kkstore04 mkdir /cluster/data/hg18/bed/blastz.mm8.2007-01-30 cd /cluster/data/hg18/bed/blastz.mm8.2007-01-30 cat << '_EOF_' > DEF # human vs mouse BLASTZ_M=50 # TARGET: Human Hg18 SEQ1_DIR=/san/sanvol1/scratch/hg18/hg18.noUn.sdTrf.2bit SEQ1_LEN=/san/sanvol1/scratch/hg18/hg18.noUn.sdTrf.sizes SEQ1_CHUNK=50000000 SEQ1_LAP=10000 # QUERY: Mouse Mm8 - single chunk big enough to run each chrom by itself SEQ2_DIR=/san/sanvol1/scratch/mm8/sdTrf/mm8.noUn.sdTrf.2bit SEQ2_LEN=/san/sanvol1/scratch/mm8/sdTrf/noUn.sdTrf.sizes SEQ2_CHUNK=200000000 SEQ2_LAP=0 BASE=/cluster/data/hg18/bed/blastz.mm8.2007-01-30 TMPDIR=/scratch/tmp 
'_EOF_' # << happy emacs time doBlastzChainNet.pl -verbose=2 DEF -bigClusterHub=pk \ -blastzOutRoot /cluster/bluearc/hg18Mm8 \ -chainMinScore=3000 -chainLinearGap=medium > do.out 2>&1 & time doBlastzChainNet.pl -verbose=2 DEF -bigClusterHub=pk \ -blastzOutRoot /cluster/bluearc/hg18Mm8 \ -continue=cat -stop=net \ -chainMinScore=3000 -chainLinearGap=medium > cat.out 2>&1 & # real 635m55.126s nice -n +19 featureBits -noRandom hg18 chainMm8Link \ > fb.noRandom.hg18.chainMm8Link.txt 2>&1 # 991429484 bases of 2868834265 (34.559%) in intersection nice -n +19 featureBits -noRandom hg18 chainMm8WMLink \ > fb.noRandom.hg18.chainMm8WMLink.txt 2>&1 # 1071083201 bases of 2868834265 (37.335%) in intersection ## swap to mm8 mkdir /cluster/data/mm8/bed/blastz.hg18.swap.2007-02-01 cd /cluster/data/mm8/bed/blastz.hg18.swap.2007-02-01 time doBlastzChainNet.pl -verbose=2 -bigClusterHub=pk \ /cluster/data/hg18/bed/blastz.mm8.2007-01-30/DEF \ -swap -stop=net -chainMinScore=3000 \ -chainLinearGap=medium > swap.out 2>&1 & # this created the directory /cluster/data/mm8/bed/blastz.hg18.swap # after it was done, move to here blastz.hg18.swap.2007-02-01 since # it is on a filesystem with some free space nice -n +19 featureBits -noRandom mm8 chainHg18Link \ > fb.noRandom.mm8.chainHg18Link.txt 2>&1 # 983004750 bases of 2550172871 (38.547%) in intersection nice -n +19 featureBits -noRandom mm8 chainHg18WMLink \ > fb.noRandom.mm8.chainHg18WMLink.txt 2>&1 # 976774811 bases of 2550172871 (38.302%) in intersection ########################################################### # MAKE C. elegans proteins track ssh kkstore02 sandir=/san/sanvol1/scratch/hg18 mkdir -p /cluster/data/hg18/bed/tblastn.ce3WB cd /cluster/data/hg18/bed/tblastn.ce3WB echo /san/sanvol1/scratch/hg18/blastDb/*.nsq | xargs ls -S | sed "s/\.nsq//" > query.lst wc -l query.lst # 3066 query.lst # we want around 200000 jobs calc `wc /cluster/data/ce3/bed/blat.ce3WB/ce3WB.psl | awk "{print \\\$1}"`/\(200000/`wc query.lst | awk "{print \\\$1}"`\) # 22395/(200000/3066) = 343.315350 mkdir -p /cluster/bluearc/hg18/bed/tblastn.ce3WB/wbfa split -l 343 /cluster/data/ce3/bed/blat.ce3WB/ce3WB.psl /cluster/bluearc/hg18/bed/tblastn.ce3WB/wbfa/wb ln -s /cluster/bluearc/hg18/bed/tblastn.ce3WB/wbfa cd wbfa for i in *; do nice pslxToFa $i $i.fa; rm $i; done cd .. 
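# (added note, sketch only -- not part of the original run) the split above
# yields ceil(22395/343) = 66 worm protein files, which is why the chaining
# run farther below reports "Completed: 66 of 66 jobs":
ls wbfa/*.fa | wc -l
# 66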
ls -1S wbfa/*.fa > wb.lst mkdir -p /cluster/bluearc/hg18/bed/tblastn.ce3WB/blastOut ln -s /cluster/bluearc/hg18/bed/tblastn.ce3WB/blastOut for i in `cat wb.lst`; do mkdir blastOut/`basename $i .fa`; done tcsh cd /cluster/data/hg18/bed/tblastn.ce3WB cat << '_EOF_' > blastGsub #LOOP blastSome $(path1) {check in line $(path2)} {check out exists blastOut/$(root2)/q.$(root1).psl } #ENDLOOP '_EOF_' cat << '_EOF_' > blastSome #!/bin/sh BLASTMAT=/cluster/bluearc/blast229/data export BLASTMAT g=`basename $2` f=/tmp/`basename $3`.$g for eVal in 0.01 0.001 0.0001 0.00001 0.000001 1E-09 1E-11 do if /cluster/bluearc/blast229/blastall -M BLOSUM80 -m 0 -F no -e $eVal -p tblastn -d $1 -i $2 -o $f.8 then mv $f.8 $f.1 break; fi done if test -f $f.1 then if /cluster/bin/i386/blastToPsl $f.1 $f.2 then liftUp -nosort -type=".psl" -nohead $f.3 /san/sanvol1/scratch/hg18/blastDb.lft carry $f.2 liftUp -nosort -type=".psl" -pslQ -nohead $3.tmp /cluster/data/ce3/bed/blat.ce3WB/protein.lft warn $f.3 if pslCheck -prot $3.tmp then mv $3.tmp $3 rm -f $f.1 $f.2 $f.3 $f.4 fi exit 0 fi fi rm -f $f.1 $f.2 $3.tmp $f.8 $f.3 $f.4 exit 1 '_EOF_' # << happy emacs exit chmod +x blastSome gensub2 query.lst wb.lst blastGsub blastSpec ssh pk cd /cluster/data/hg18/bed/tblastn.ce3WB para create blastSpec # para try, check, push, check etc. para time # Completed: 195603 of 195603 jobs # CPU time in finished jobs: 12047221s 200787.01m 3346.45h 139.44d 0.382 y # IO & Wait Time: 9089287s 151488.12m 2524.80h 105.20d 0.288 y # Average job time: 108s 1.80m 0.03h 0.00d # Longest finished job: 1002s 16.70m 0.28h 0.01d # Submission to last job: 192221s 3203.68m 53.39h 2.22d ssh kkstore02 cd /cluster/data/hg18/bed/tblastn.ce3WB mkdir chainRun cd chainRun tcsh cat << '_EOF_' > chainGsub #LOOP chainOne $(path1) #ENDLOOP '_EOF_' cat << '_EOF_' > chainOne (cd $1; cat q.*.psl | simpleChain -prot -outPsl -maxGap=150000 stdin /cluster/bluearc/hg18/bed/tblastn.ce3WB/blastOut/c.`basename $1`.psl) '_EOF_' exit chmod +x chainOne ls -1dS /cluster/bluearc/hg18/bed/tblastn.ce3WB/blastOut/wb?? > chain.lst gensub2 chain.lst single chainGsub chainSpec # do the cluster run for chaining ssh kk cd /cluster/data/hg18/bed/tblastn.ce3WB/chainRun para create chainSpec para maxNode 30 para try, check, push, check etc. # Completed: 66 of 66 jobs # CPU time in finished jobs: 161714s 2695.23m 44.92h 1.87d 0.005 y # IO & Wait Time: 40315s 671.92m 11.20h 0.47d 0.001 y # Average job time: 3061s 51.02m 0.85h 0.04d # Longest finished job: 9372s 156.20m 2.60h 0.11d # Submission to last job: 11599s 193.32m 3.22h 0.13d ssh kkstore02 cd /cluster/data/hg18/bed/tblastn.ce3WB/blastOut for i in wb?? do cat c.$i.psl | awk "(\$13 - \$12)/\$11 > 0.6 {print}" > c60.$i.psl sort -rn c60.$i.psl | pslUniq stdin u.$i.psl awk "((\$1 / \$11) ) > 0.60 { print }" c60.$i.psl > m60.$i.psl echo $i done sort -T /tmp -k 14,14 -k 16,16n -k 17,17n u.*.psl m60* | uniq > /cluster/data/hg18/bed/tblastn.ce3WB/unliftBlastCe3WB.psl cd .. 
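# (added sketch, not part of the original run) same c60/m60 filtering as the
# fly track above; count the distinct worm proteins remaining before the lift
# and load (qName is column 10):
cut -f10 unliftBlastCe3WB.psl | sort -u | wc -l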
pslCheck unliftBlastCe3WB.psl sed "s/[0-9XY]*\///" unliftBlastCe3WB.psl | liftUp -type=.psl -nohead stdout ../../randomContigs/hg18.randomContigs.lift carry stdin | sort -T /tmp -k 14,14 -k 16,16n -k 17,17n > blastCe3WB.psl # load table ssh hgwdev cd /cluster/data/hg18/bed/tblastn.ce3WB hgLoadPsl hg18 blastCe3WB.psl # check coverage featureBits hg18 blastCe3WB # 4326489 bases of 2881515245 (0.150%) in intersection featureBits hg18 knownGene:cds blastCe3WB -enrichment # knownGene:cds 1.111%, blastCe3WB 0.150%, both 0.086%, cover 7.76%, enrich 51.67x ssh kkstore04 rm -rf /cluster/data/hg18/bed/tblastn.ce3WB/blastOut rm -rf /cluster/bluearc/hg18/bed/tblastn.ce3WB/blastOut #end tblastn ########################################################################## ############################################################################# # RE-BUILD WGRNA TRACK (DONE, 2007-02-09, Fan) # rebuilt below: RE-BUILD WGRNA TRACK AGAIN (DONE, 2007-02-12, Fan) # rebuilt below: RE-BUILD WGRNA TRACK (DONE, 2007-05-31, Fan) # rebuilt below: RE-BUILD WGRNA TRACK (DONE, 2007-10-05, Fan) ssh hgwdev cd /cluster/data/hg18/bed mkdir wgRna-2007-02-07 cd wgRna-2007-02-07 # Received the data file, wg_feb2007.txt (saved from wg_feb2007.doc) # from Michel Weber's email # (Michel.Weber at ibcg.biotoul.fr) # and place it under cd /cluster/data/hg18/bed/wgRna-2007-02-07. # The record of hsa-mir-770 was found missing the strand info. # manually add "+" to wg_feb2007.txt for the record of hsa-mir-770. cat wg_feb2007.txt|sed -e 's/ /\t/g' > wgRna.tab hgLoadBed -sqlTable=/cluster/home/fanhsu/src/hg/lib/wgRna.sql hg18 wgRna wgRna.tab ############################################################################# # RE-BUILD WGRNA TRACK AGAIN (DONE, 2007-02-12, Fan) # rebuilt below: RE-BUILD WGRNA TRACK (DONE, 2007-05-31, Fan) # rebuilt below: RE-BUILD WGRNA TRACK (DONE, 2007-10-05, Fan) ssh hgwdev cd /cluster/data/hg18/bed mkdir wgRna-2007-02-12 cd wgRna-2007-02-12 # Received the data file, wg_feb2007_corrected.txt (saved from wg_feb2007_corrected.doc) # from Michel Weber's email # (Michel.Weber at ibcg.biotoul.fr) # and place it under cd /cluster/data/hg18/bed/wgRna-2007-02-12. # The record of hsa-mir-770 was found missing the strand info. # manually add "+" to wg_feb2007_corrected.txt for the record of hsa-mir-770. 
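# (added sketch, not part of the original run) a field-count check catches
# records like hsa-mir-770 that arrive without a strand, before the load:
awk '{print NF}' wg_feb2007_corrected.txt | sort -n | uniq -c
# every row should report the same number of fields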
cat wg_feb2007_corrected.txt|sed -e 's/ /\t/g' > wgRna.tab hgLoadBed -sqlTable=/cluster/home/fanhsu/src/hg/lib/wgRna.sql hg18 wgRna wgRna.tab ######################################################################### ## BLASTZ ANOCAR1 - Lizard - (DONE - 2007-02-17 - 2007-02-18 - Hiram) ssh kkstore02 mkdir /cluster/data/hg18/bed/blastz.anoCar1.2007-02-17 cd /cluster/data/hg18/bed/blastz.anoCar1.2007-02-17 cat << '_EOF_' > DEF # human vs lizard BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=6000 BLASTZ_K=2200 BLASTZ_Q=/cluster/data/blastz/HoxD55.q # TARGET: Human Hg18 SEQ1_DIR=/san/sanvol1/scratch/hg18/hg18.sdTrf.2bit SEQ1_LEN=/cluster/data/hg18/chrom.sizes SEQ1_CHUNK=10000000 SEQ1_LAP=10000 SEQ1_LIMIT=1 # QUERY: Lizard AnoCar1 - largest chunk big enough for largest scaffold SEQ2_DIR=/san/sanvol1/scratch/anoCar1/anoCar1.2bit SEQ2_LEN=/san/sanvol1/scratch/anoCar1/chrom.sizes SEQ2_CHUNK=20000000 SEQ2_LIMIT=30 SEQ2_LAP=0 BASE=/cluster/data/hg18/bed/blastz.anoCar1.2007-02-17 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs time doBlastzChainNet.pl DEF -chainMinScore=5000 -chainLinearGap=loose \ -tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \ -verbose=2 -bigClusterHub=pk \ -blastzOutRoot /cluster/bluearc/hg18AnoCar1 > do.log 2>&1 & # real 684m40.568s # there was a pause in there as the pk kluster was corrected during the # first kluster run to get it to finish. # appears to have successfully finished ssh hgwdev cd /cluster/data/hg18/bed/blastz.anoCar1.2007-02-17 time nice -n +19 featureBits hg18 chainAnoCar1Link \ > fb.hg18.chainAnoCar1Link.txt 2>&1 # real 2m28.318s # 137554843 bases of 2881515245 (4.774%) in intersection # running the swap to anoCar1 - instructions in anoCar1.txt cd /cluster/data/anoCar1/bed/blastz.hg18.swap time nice -n +19 featureBits anoCar1 chainHg18Link \ > fb.anoCar1.chainHg18Link.txt 2>&1 # real 3m16.810s # 112434396 bases of 1741478929 (6.456%) in intersection # reciprocal best net mafs for multiz 2008-10-30 - Hiram time nice -n +19 doRecipBest.pl hg18 anoCar1 > rbest.log 2>&1 & # this failed immediately: # cd /cluster/data/hg18/bed/blastz.anoCar1/axtChain # chainStitchId hg18.anoCar1.over.chain.gz stdout # chainSwap stdin stdout # chainSort stdin anoCar1.hg18.tBest.chain # t end mismatch -526389042 vs 10481870 line 1920305 of stdin # Command failed: # ssh -x kkr14u04 nice /cluster/data/hg18/bed/blastz.anoCar1/axtChain/doRecipBest.csh # but, then, when run locally on hgwdev, it proceeded just fine: time nice -n +19 ./doRecipBest.csh > doRecipBest.log 2>&1 & # real 175m54.202s doRecipBest.pl -continue=download hg18 anoCar1 ########################################################################## # UPDATED hg18.knownToVisiGene (DONE galt 2007-02-15) # after making sure hg18.vgAllProbes was up to date (see makeVisiGene.doc) ssh hgwdev knownToVisiGene hg18 -fromProbePsl=vgAllProbes ######################################################################### ## BLASTZ OTOGAR1 - Bushbaby - (2007-02-26 kate) # # NOTE: using masked sequence (unlike Brian Raney's alignments) cd /cluster/data/otoGar1 ln -s otoGar1.rmsk.2bit otoGar1.2bit mkdir -p /san/sanvol1/scratch/otoGar1 cp -p otoGar1.2bit chrom.sizes /san/sanvol1/scratch/otoGar1 ssh pk mkdir /cluster/data/hg18/bed/blastz.otoGar1.2007-02-26 cd /cluster/data/hg18/bed/blastz.otoGar1.2007-02-26 cat << '_EOF_' > DEF # human vs bushbaby # params from Hiram & Brian BLASTZ=blastz.v7.x86_64 # TARGET: Human Hg18 SEQ1_DIR=/san/sanvol1/scratch/hg18/hg18.sdTrf.2bit SEQ1_LEN=/cluster/data/hg18/chrom.sizes SEQ1_CHUNK=10000000 SEQ1_LAP=10000 
SEQ1_LIMIT=1 # QUERY: Bushbaby otoGar1 - single chunk big enough to run largest scaffold SEQ2_DIR=/san/sanvol1/scratch/otoGar1/otoGar1.2bit SEQ2_LEN=/cluster/data/otoGar1/chrom.sizes SEQ2_LIMIT=400 SEQ2_CHUNK=30000000 SEQ2_LAP=0 BASE=/cluster/data/hg18/bed/blastz.otoGar1.2007-02-26 TMPDIR=/scratch/tmp '_EOF_' # << emacs /cluster/bin/scripts/doBlastzChainNet.pl DEF \ -bigClusterHub=pk -smallClusterHub=pk \ -chainMinScore=3000 -chainLinearGap=medium \ >& do.log & tail -f do.log # problems on cluster -- stale NFS mounts and a routing problem # so batch failed with 4 retries. I restarted cluster run # with retries=8, and all finished except 38. These failed due # to output files existing; as the results look OK, I'm proceeding. para time > run.time /cluster/bin/scripts/doBlastzChainNet.pl DEF \ -continue=cat -bigClusterHub=pk \ -chainMinScore=3000 -chainLinearGap=medium \ >&! do2.log & tail -f do2.log # failed due to pre-existing liftOver chain from Brian's run /cluster/bin/scripts/doBlastzChainNet.pl DEF \ -continue=net -bigClusterHub=pk \ >&! do3.log & tail -f do3.log # reciprocal best net mafs for multiz ~/kent/src/hg/utils/automation/doRecipBest.pl hg18 otoGar1 >&! rbest.log & # Load net (2007-03-12 kate) ssh hgwdev cd /cluster/data/hg18/bed/blastz.otoGar1/axtChain netFilter -minGap=10 hg18.otoGar1.rbest.net.gz | \ hgLoadNet -warn hg18 netRBestOtoGar1 stdin ######################################################################### # BLASTZ/CHAIN/NET CAVPOR2 (IN PROGRESS 2007-03-06 kate) ssh kkstore02 mkdir /cluster/data/hg18/bed/blastz.cavPor2.2007-03-06 cd /cluster/data/hg18/bed/blastz.cavPor2.2007-03-06 cat << '_EOF_' > DEF # human vs. guinea pig # dynamic masking param BLASTZ_M=50 # TARGET: Human Hg18 SEQ1_DIR=/scratch/hg/hg18/nib SEQ1_LEN=/cluster/data/hg18/chrom.sizes SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY: Guinea pig cavPor2 # using cat-like params, as this has similar #scaffolds SEQ2_DIR=/san/sanvol1/scratch/cavPor2/cavPor2.2bit SEQ2_LEN=/san/sanvol1/scratch/cavPor2/chrom.sizes # Maximum number of scaffolds that can be lumped together # this makes ~200K jobs SEQ2_LIMIT=500 SEQ2_CHUNK=30000000 SEQ2_LAP=0 BASE=/cluster/data/hg18/bed/blastz.cavPor2.2007-03-06 '_EOF_' # << this line keeps emacs coloring happy doBlastzChainNet.pl DEF \ -bigClusterHub pk \ -chainMinScore=3000 -chainLinearGap=medium >& do.log & tail -f do.log # cluster brought down by site work # restart on 3/7 ssh pk cd /cluster/data/hg18/bed/blastz.cavPor2.2007-03-06/run.blastz para recover jobList jobList2 para make jobList2 >&! do2.log & para time > run.time # entire run took probably 36 hours cluster time ssh kkstore02 cd /cluster/data/hg18/bed/blastz.cavPor2.2007-03-06 /cluster/bin/scripts/doBlastzChainNet.pl DEF \ -bigClusterHub pk -continue=cat -stop cleanup \ -chainMinScore=3000 -chainLinearGap=medium >& do3.log & # reciprocal best net mafs for multiz ~/kent/src/hg/utils/automation/doRecipBest.pl hg18 cavPor2 >&! 
rbest.log & # load nets manually -- automated loading fails as classification info # not available (no database) ssh hgwdev cd /cluster/data/hg18/bed/blastz.cavPor2/axtChain netFilter -minGap=10 noClass.net | hgLoadNet -warn hg18 netCavPor2 stdin netFilter -minGap=10 hg18.cavPor2.rbest.net.gz | \ hgLoadNet -warn hg18 netRBestCavPor2 stdin ######################################################################### # BLASTZ/CHAIN/NET ERIEUR1 (IN PROGRESS 2007-03-08 kate) ssh kkstore02 mkdir /cluster/data/hg18/bed/blastz.eriEur1.2007-03-08 cd /cluster/data/hg18/bed/blastz.eriEur1.2007-03-08 cat << '_EOF_' > DEF # human vs. hedgehog # dynamic masking param BLASTZ_M=50 # TARGET: Human Hg18 SEQ1_DIR=/scratch/hg/hg18/nib SEQ1_LEN=/cluster/data/hg18/chrom.sizes SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY: hedgehog eriEur1 # using cat-like params, as this has similar #scaffolds SEQ2_DIR=/san/sanvol1/scratch/eriEur1/eriEur1.2bit SEQ2_LEN=/san/sanvol1/scratch/eriEur1/chrom.sizes # Maximum number of scaffolds that can be lumped together # this makes ~200K jobs SEQ2_LIMIT=500 SEQ2_CHUNK=30000000 SEQ2_LAP=0 BASE=/cluster/data/hg18/bed/blastz.eriEur1.2007-03-08 '_EOF_' # << this line keeps emacs coloring happy doBlastzChainNet.pl DEF \ -bigClusterHub pk \ -chainMinScore=3000 -chainLinearGap=medium >& do.log & tail -f do.log # Reciprocal best net mafs for multiz (kate) ssh kkstore02 cd /cluster/data/hg18/bed/blastz.eriEur1 ~/kent/src/hg/utils/automation/doRecipBest.pl hg18 eriEur1 >&! rbest.log & #GOT HERE # Load nets (2007-03-12 kate) ssh hgwdev cd /cluster/data/hg18/bed/blastz.dasNov1/axtChain netFilter -minGap=10 noClass.net | hgLoadNet -warn hg18 netDasNov1 stdin netFilter -minGap=10 hg18.dasNov1.rbest.net.gz | \ hgLoadNet -warn hg18 netRBestDasNov1 stdin ######################################################################### # BLASTZ/CHAIN/NET SORARA1 (IN PROGRESS 2007-03-08 kate) ssh kkstore02 mkdir /cluster/data/hg18/bed/blastz.sorAra1.2007-03-08 cd /cluster/data/hg18/bed/blastz.sorAra1.2007-03-08 cat << '_EOF_' > DEF # human vs. hedgehog # dynamic masking param BLASTZ_M=50 # TARGET: Human Hg18 SEQ1_DIR=/scratch/hg/hg18/nib SEQ1_LEN=/cluster/data/hg18/chrom.sizes SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY: shrew sorAra1 # using cat-like params, as this has similar #scaffolds SEQ2_DIR=/san/sanvol1/scratch/sorAra1/sorAra1.2bit SEQ2_LEN=/san/sanvol1/scratch/sorAra1/chrom.sizes # Maximum number of scaffolds that can be lumped together # this makes ~200K jobs SEQ2_LIMIT=500 SEQ2_CHUNK=30000000 SEQ2_LAP=0 BASE=/cluster/data/hg18/bed/blastz.sorAra1.2007-03-08 '_EOF_' # << this line keeps emacs coloring happy doBlastzChainNet.pl DEF \ -bigClusterHub pk \ -chainMinScore=3000 -chainLinearGap=medium >& do.log & tail -f do.log # stopped during load step due to missing database for classifying net # Reciprocal best net mafs for multiz (2007-03-12 kate) ssh kkstore02 cd /cluster/data/hg18/bed/blastz.sorAra1 ~/kent/src/hg/utils/automation/doRecipBest.pl hg18 sorAra1 >&! rbest.log & # GOT HERE # Load nets (2007-03-12 kate) ssh hgwdev cd /cluster/data/hg18/bed/blastz.sorAra1/axtChain netFilter -minGap=10 noClass.net | hgLoadNet -warn hg18 netSorAra1 stdin netFilter -minGap=10 hg18.sorAra1.rbest.net.gz | \ hgLoadNet -warn hg18 netRBestSorAra1 stdin ######################################################################### # BLASTZ ORNANA1 (PLATYPUS) - (DONE 2007-02-23, REDONE 2007-04-04 angie) # The first time around, the copy of ornAna1.2bit still had the pre-release -- # doh! 
# Differences are minuscule (a couple contigs' orientation changed),
# but redo just to get it 100% right.
# In the re-run, I changed SEQ2_LIMIT, which made the cluster run more
# efficient but had side-effects on the results because blastz's dynamic
# masking was applied differently (different groupings of sequences) --
# in retrospect, it would have been better to use the suboptimal SEQ2_LIMIT
# and have fewer differences to slog through.
ssh kkstore02
mkdir /cluster/data/hg18/bed/blastz.ornAna1.2007-04-02
cd /cluster/data/hg18/bed/blastz.ornAna1.2007-04-02
cat << '_EOF_' > DEF
# human vs. platypus
# Use same params as used for hg18-danRer4
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_M=50
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
# TARGET: hg18
SEQ1_DIR=/scratch/hg/hg18/hg18.2bit
SEQ1_LEN=/scratch/hg/hg18/chrom.sizes
SEQ1_CHUNK=20000000
SEQ1_LAP=10000
# QUERY: ornAna1
SEQ2_DIR=/iscratch/i/ornAna1/ornAna1.2bit
SEQ2_LEN=/iscratch/i/ornAna1/chrom.sizes
SEQ2_CHUNK=20000000
SEQ2_LIMIT=400
SEQ2_LAP=0
BASE=/cluster/data/hg18/bed/blastz.ornAna1.2007-04-02
TMPDIR=/scratch/tmp
'_EOF_'
# << emacs
doBlastzChainNet.pl DEF \
    -blastzOutRoot /cluster/bluearc/hg18.ornAna1 \
    >& do.log &
tail -f do.log
cd /cluster/data/hg18/bed/blastz.ornAna1.2007-04-02
time nice -n +19 doRecipBest.pl hg18 ornAna1 > rbest.log 2>&1 &
# real    238m22.247s
# worked OK

########################################################################
# 28-WAY VERTEBRATE MULTIZ (2007-03-20 kate)
ssh kkstore02
cd /cluster/data/hg18/bed
mkdir multiz28way.2007-03-20
ln -s multiz28way.2007-03-20 multiz28way
cd multiz28way
# start with 17way tree; update assemblies and add new species
mkdir tree
cd tree
cp /cluster/data/hg18/bed/multiz17way/tree.nh tree.asm.nh
# edit and create tree.28.nh, with Webb's assistance
echo `sed 's/[a-zA-Z0-9]*_//g' tree.asm.nh` > tree.28.nh
#
# create version for download that includes common names and assemblies
cp tree.asm.nh ../28way.nh
# edit
# create version for phyloGif program (replace spaces with commas)
cp 28way.gif /usr/local/apache/htdocs/images/phylo/hg18_28way.gif
# create species list
cd ..
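# (The pipeline below flattens the tree file into a sorted list of assembly
# names, one per line; the resulting species.28.lst includes hg18 itself.)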
sed -e 's/[()]//g' -e 's/ /\n/g' tree/tree.28.nh | \
    sed -e '/^$/d' | sort > species.28.lst
wc -l species.28.lst
ln -s species.28.lst species.lst

# Organisms: (N)ew, (U)pdated, (S)ame species since 17way:
#   U chimp (panTro2)
#   S rhesus (rheMac2)
#   -N bushbaby (otoGar1) "Otolemur garnetti" (galago) 2X
#   N tree_shrew (tupBel1) "Tupaia belangeri"
#   S rat (rn4)
#   S mouse (mm8)
#   -N guinea_pig (cavPor2) "Cavia porcellus" 2X
#   S rabbit (oryCun1) 2X
#   -N shrew (sorAra1) "Sorex araneus" 2X
#   -N hedgehog (eriEur1) "Erinaceus europaeus" 2X
#   S dog (canFam2)
#   N cat (felCat3) "Felis catus" 2X
#   -N horse (equCab1) "Equus caballus"
#   U cow (bosTau3)
#   S armadillo (dasNov1) "Dasypus novemcinctus" 2X
#   S elephant (loxAfr1) 2X
#   S tenrec (echTel1) 2X
#   S opossum (monDom4)
#   N platypus (ornAna1) "Ornithorhynchus anatinus"
#   U chicken (galGal3)
#   N lizard (anoCar1) "Anolis carolinensis" (Green Anole), Iguana family
#   U frog (xenTro2)
#   U fugu (fr2)
#   S tetraodon (tetNig1)
#   N stickleback (gasAcu1) "Gasterosteus aculeatus"
#   N medaka (oryLat1) "Oryzias latipes"
#   U zebrafish (danRer4)

ssh kkstore02
cd /cluster/data/hg18/bed/multiz28way
# verify all blastz's exist
cat > listMafs.csh << 'EOF'
foreach db (`cat species.lst`)
    set bdir = /cluster/data/hg18/bed/blastz.$db
    if (-e $bdir/mafRBestNet/chr1.maf.gz) then
        echo "$db mafRBestNet"
    else if (-e $bdir/mafSynNet/chr1.maf.gz) then
        echo "$db mafSynNet"
    else if (-e $bdir/mafNet/chr1.maf.gz) then
        echo "$db mafNet"
    else
        echo "$db mafs not found"
    endif
end
'EOF'

# gather chain stats
ssh hgwdev
cd /cluster/data/hg18/bed/multiz28way
cat > getChainStats.csh << 'EOF'
set species = $1
foreach db (`cat $species`)
    echo -n "${db} "
    set Db = `echo $db | perl -wpe 's/(.*)/\u$1/'`
    set fb = /cluster/data/hg18/bed/blastz.$db/fb.hg18.chain${Db}Link.txt
    if (! -e $fb || -z $fb ) then
        nice featureBits hg18 chain${Db}Link >& $fb
    endif
    sed 's/.*(\(.*\)).*/\1/' $fb
end
'EOF'
# << happy emacs
csh getChainStats.csh species.lst >&! species.chainStats

# Maf types:
#   2X mammals -> reciprocal best net
#   high cov placental mammals and opossum -> syntenic net
#   other -> standard net
csh listMafs.csh > listMafs.log &
cat listMafs.log

# add links of the form blastz.<db> -> blastz.<db>.<date> dirs:
cd /cluster/data/hg18/bed
ln -s blastz.fr2.2007-01-24 blastz.fr2
ln -s blastz.ornAna1.2007-02-21 blastz.ornAna1
ln -s blastz.oryLat1.swap blastz.oryLat1

# copy net mafs to cluster-friendly storage, splitting chroms
# into 50MB chunks to improve run-time
# NOTE: splitting will be different for scaffold-based reference assemblies
ssh hgwdev
cd /cluster/data/hg18/bed/multiz28way
mkdir run.split
cd run.split
mafSplitPos hg18 50 mafSplit.bed
ssh kki
cd /cluster/data/hg18/bed/multiz28way
cd run.split
cat << 'EOF' > doSplit.csh
#!/bin/csh -ef
set db = $1
set sdir = /san/sanvol1/scratch/hg18/splitStrictMafNet
mkdir -p $sdir
if (-e $sdir/$db) then
    echo "directory $sdir/$db already exists -- remove and retry"
    exit 1
endif
set bdir = /cluster/data/hg18/bed/blastz.$db
if (!
-e $bdir) then echo "directory $bdir not found" exit 1 endif mkdir -p $sdir/$db if (-e $bdir/mafRBestNet) then set mdir = $bdir/mafRBestNet else if (-e $bdir/mafSynNet) then set mdir = $bdir/mafSynNet else if (-e $bdir/mafNet) then set mdir = $bdir/mafNet else echo "$bdir maf dir not found" exit 1 endif echo $mdir foreach f ($mdir/*) set c = $f:t:r:r echo " $c" nice mafSplit mafSplit.bed $sdir/$db/ $f end echo "gzipping $sdir/$db mafs" nice gzip $sdir/$db/* endif echo $mdir > $db.done 'EOF' # << happy emacs chmod +x doSplit.csh grep -v hg18 ../species.28.lst > split.lst cat > spec << 'EOF' #LOOP doSplit.csh $(path1) {check out line+ $(path1).done} #ENDLOOP 'EOF' gensub2 split.lst single spec jobList para create jobList # 24 jobs para try para check para push # till complete para time >&! run.time # 30 minutes # run multiz ssh pk cd /cluster/data/hg18/bed/multiz28way mkdir -p maf run cd run mkdir penn # use latest penn utilities set PENN_BIN = /cluster/bin/penn/multiz.v11.2007-03-19 cp -p $PENN_BIN/{autoMZ,multiz,maf_project} penn # list chrom chunks, any db dir will do; better would be for the # splitter to generate this file # We temporarily use __ instead of . to delimit chunk in filename # so we can use $(root) to get basename set mdir = /san/sanvol1/scratch/hg18/splitStrictMafNet ls $mdir/fr2 | sed -e 's/.maf.gz//' -e 's/\./__/' > chromChunks.lst wc -l chromChunks.lst # 93 cat > autoMultiz.csh << 'EOF' #!/bin/csh -ef set db = hg18 set c = $1 set maf = $2 set run = `pwd` set tmp = /scratch/tmp/$db/multiz.$c set pairs = /san/sanvol1/scratch/$db/splitStrictMafNet rm -fr $tmp mkdir -p $tmp cp ../tree/tree.28.nh ../species.28.lst $tmp pushd $tmp foreach s (`cat species.28.lst`) set c2 = `echo $c | sed 's/__/./'` set in = $pairs/$s/$c2.maf set out = $db.$s.sing.maf if ($s == hg18) then continue endif if (-e $in.gz) then zcat $in.gz > $out else if (-e $in) then cp $in $out else echo "##maf version=1 scoring=autoMZ" > $out endif end set path = ($run/penn $path); rehash $run/penn/autoMZ + T=$tmp E=$db "`cat tree.28.nh`" $db.*.sing.maf $c.maf popd cp $tmp/$c.maf $maf rm -fr $tmp 'EOF' # << happy emacs chmod +x autoMultiz.csh cat << 'EOF' > spec #LOOP ./autoMultiz.csh $(root1) {check out line+ /cluster/data/hg18/bed/multiz28way/maf/$(root1).maf} #ENDLOOP 'EOF' # << emacs gensub2 chromChunks.lst single spec jobList para create jobList # 93 jobs para try para check para push para time > run.time # 4 hours! (~9 min/species) # load tables for a look ssh hgwdev mkdir -p /gbdb/hg18/multiz28way/maf ln -s /cluster/data/hg18/bed/multiz28way/maf/*.maf \ /gbdb/hg18/multiz28way/maf cd /cluster/data/hg18/bed/multiz28way cat > loadMaf.csh << 'EOF' date hgLoadMaf -pathPrefix=/gbdb/hg18/multiz28way/maf hg18 multiz28way # load summary table cat maf/*.maf | nice hgLoadMafSummary hg18 -minSize=30000 -mergeGap=1500 \ -maxSize=200000 multiz28waySummary stdin 'EOF' csh loadMaf.csh >&! 
loadMaf.log & # look at coverage ssh kkstore02 cd /cluster/data/hg18/bed/multiz25wayStrict mkdir mafCov cd mafCov cat ../maf/chr7__*.maf | nice mafRanges stdin hg18 \ -otherDb=canFam2 chr7.canFam2.bed | bedSort > chr7.canFam2.bed echo canFam2 > species.lst cat ../maf/chr7__*.maf | mafSpeciesSubset stdin species.lst stdout | \ mafToAxt stdin hg18 canFam2 stdout | \ axtToPsl stdin /cluster/data/hg18/chrom.sizes \ /cluster/data/canFam2/chrom.sizes chr7.canFam2.psl cat ../maf/chr7__*.maf | nice mafRanges stdin hg18 -otherDb=oryCun1 chr7.oryCun1.bed cat ../maf/chr7__*.maf | nice mafRanges stdin hg18 -otherDb=tetNig1 chr7.tetNig1.bed ssh hgwdev cd /cluster/data/hg18/bed/multiz25wayStrict/mafCov # canFam2 syntenic net vs standard net nice featureBits hg18 -chrom=chr7 chr7.canFam2.bed # 82967535 bases of 154952424 (53.544%) in intersection nice featureBits hg18 -chrom=chr7 ../../multiz25way/mafCov/chr7.canFam2.bed # 86391682 bases of 154952424 (55.754%) in intersection nice featureBits hg18 -chrom=chr7 ../../multiz17way/mafCov/chr7.canFam2.bed # 86248995 bases of 154952424 (55.662%) in intersection # compare using another method cat ../maf/chr7__*.maf | mafSpeciesSubset stdin species.lst chr7.canFam2.maf mafToAxt chr7.canFam2.maf hg18 canFam2 chr7.canFam2.axt axtToPsl chr7.canFam2.axt /cluster/data/hg18/chrom.sizes \ /cluster/data/canFam2/chrom.sizes chr7.canFam2.psl nice featureBits hg18 -chrom=chr7 chr7.canFam2.psl # 75497734 bases of 154952424 (48.723%) in intersection # oryCun1 reciprocal best net vs standard net nice featureBits hg18 -chrom=chr7 chr7.oryCun1.bed # 53157578 bases of 154952424 (34.306%) in intersection nice featureBits hg18 -chrom=chr7 ../../multiz25way/mafCov/chr7.oryCun1.bed # 56858022 bases of 154952424 (36.694%) in intersection # tetNig1 both used standard net nice featureBits hg18 -chrom=chr7 chr7.tetNig1.bed # 2905058 bases of 154952424 (1.875%) in intersection nice featureBits hg18 -chrom=chr7 ../../multiz25way/mafCov/chr7.tetNig1.bed # 2901708 bases of 154952424 (1.873%) in intersection # NOTE: Next time concatenate split mafs before proceeding further # Gap Annotation # prepare bed files with gap info ssh kkstore02 cd /cluster/data/hg18/bed/multiz28way mkdir anno cd anno mkdir maf run cd run cat > doNBed.csh << 'EOF' foreach db (`cat species.lst`) echo -n "$db " set cdir = /cluster/data/$db if (! -e $cdir/$db.N.bed) then echo "creating N.bed" twoBitInfo -nBed $cdir/$db.2bit $cdir/$db.N.bed else echo "" endif end 'EOF' csh doNBed.csh >&! 
doNBed.log & rm -f nBeds sizes foreach db (`grep -v hg18 ../../species.lst`) echo "$db " ln -s /cluster/data/$db/$db.N.bed $db.bed echo $db.bed >> nBeds ln -s /cluster/data/$db/chrom.sizes $db.len echo $db.len >> sizes end ssh kki cd /cluster/data/hg18/bed/multiz28way/anno/run cat > doAnno.csh << 'EOF' #!/bin/csh -ef set dir = /cluster/data/hg18/bed/multiz28way set c = $1 cat $dir/maf/${c}__*.maf | \ nice mafAddIRows -nBeds=nBeds -sizes=sizes stdin \ /cluster/data/hg18/hg18.2bit $2 'EOF' #<< happy emacs chmod +x doAnno.csh cat > spec << 'EOF' #LOOP ./doAnno.csh $(root1) {check out line+ /cluster/data/hg18/bed/multiz28way/anno/maf/$(root1).maf} #ENDLOOP 'EOF' #<< happy emacs awk '{print $1}' /cluster/data/hg18/chrom.sizes > chroms.lst gensub2 chroms.lst single spec jobList para create jobList para try ssh hgwdev cd /cluster/data/hg18/bed/multiz28way/anno mkdir -p /gbdb/hg18/multiz28way/anno/maf ln -s /cluster/data/hg18/bed/multiz28way/anno/maf/*.maf \ /gbdb/hg18/multiz28way/anno/maf cat > loadMaf.csh << 'EOF' date nice hgLoadMaf -pathPrefix=/gbdb/hg18/multiz28way/anno/maf \ hg18 multiz28wayAnno date cat maf/*.maf | \ nice hgLoadMafSummary hg18 -minSize=30000 -mergeGap=1500 \ -maxSize=200000 multiz28wayAnnoSummary stdin date 'EOF' csh loadMaf.csh >& loadMaf.log & # NOTE: rebuilt hgLoadMafSummary to exclude chroms<1MB (2007-06-21 kate) ######################################################################## # ANNOTATE 28-WAY ALIGNMENT WITH QUALITY DATA (2007-06-11 rico at bx.psu.edu) # # The basic idea here is to create a qac file which has quality data for each # (chromosome/scaffold/etc) and then index the qac file. Once this is done, # mafAddQRows can be used to add the quality data to a given maf. The agp # files are used so that gaps can be represented in the qac files as a special # value. ## create .qac and .qdx files for each species in the 28-way alignment o human (hg18) Unable to find quality data. o chimp (panTro2) /cluster/data/panTro2/bed/quality/qac/*.qac /cluster/data/panTro2/wustl/*.agp qacAddGapIdx in.agp in.qac panTro2.qac panTro2.qdx o rhesus (rheMac2) /cluster/data/rheMac2/qual/foo.qv /cluster/data/rheMac2/downloads/foo.agp qacAddGapIdx in.agp in.qac rheMac2.qac rheMac2.qdx o bushbaby (otoGar1) http://www.broad.mit.edu/ftp/pub/assemblies/mammals/bushbaby/otoGar1 assembly.agp Draft_v1.agp.chromosome.qual.gz qaAgpToQacIdx assembly.agp Draft_v1.agp.chromosome.qual.gz otoGar1.qac otoGar1.qdx o treeshrew (tupBel1) http://www.broad.mit.edu/ftp/pub/assemblies/mammals/treeShrew/tupBel1 assembly.agp Draft_v1.agp.chromosome.qual.gz qaAgpToQacIdx assembly.agp Draft_v1.agp.chromosome.qual.gz tupBel1.qac tupBel1.qdx o rat (rn4) /cluster/data/rn4/downloads/foo.qual /cluster/data/rn4/CHROM/foo.agp qacAddGapIdx in.agp in.qac rn4.qac rn4.qdx o mouse (mm8) Unable to find quality data. 
o guinea pig (cavPor2) http://www.broad.mit.edu/ftp/pub/assemblies/mammals/guineaPig/cavPor2 assembly.agp Draft_v2.agp.chromosome.qual.gz qaAgpToQacIdx assembly.agp Draft_v2.agp.chromosome.qual.gz cavPor2.qac cavPor2.qdx o rabbit (oryCun1) http://www.broad.mit.edu/ftp/pub/assemblies/mammals/rabbit/oryCun1 assembly.agp Draft_v1.agp.chromosome.qual.gz qaAgpToQacIdx assembly.agp Draft_v1.agp.chromosome.qual.gz oryCun1.qac oryCun1.qdx o shrew (sorAra1) http://www.broad.mit.edu/ftp/pub/assemblies/mammals/commonShrew/sorAra1 assembly.agp Draft_v1.agp.chromosome.qual.gz qaAgpToQacIdx assembly.agp Draft_v1.agp.chromosome.qual.gz sorAra1.qac sorAra1.qdx o hedgehog (eriEur1) http://www.broad.mit.edu/ftp/pub/assemblies/mammals/hedgehog/eriEur1 assembly.agp Draft_v1.agp.chromosome.qual.gz qaAgpToQacIdx assembly.agp Draft_v1.agp.chromosome.qual.gz eriEur1.qac eriEur1.qdx o dog (canFam2) /cluster/data/canFam2/bed/quality/chrom.qac /cluster/data/canFam2/broad/foo.agp qacAddGapIdx in.agp in.qac canFam2.qac canFam2.qdx o cat (felCat3) /cluster/data/felCat3/downloads/assembly.agp /cluster/data/felCat3/downloads/Draft_v3.agp.chromosome.qual.gz qaAgpToQacIdx assembly.agp Draft_v3.agp.chromosome.qual.gz felCat3.qac felCat3.qdx o horse (equCab1) /cluster/data/equCab1/downloads/assembly.agp /cluster/data/equCab1/downloads/Draft_v1.agp.chromosome.qual.gz qaAgpToQacIdx assembly.agp Draft_v1.agp.chromosome.qual.gz equCab1.qac equCab1.qdx o cow (bosTau3) /cluster/data/bosTau3/baylor/chroms/foo.qual /cluster/data/bosTau3/baylor/foo.agp qacAddGapIdx in.agp in.qac bosTau3.qac bosTau3.qdx o armadillo (dasNov1) http://www.broad.mit.edu/ftp/pub/assemblies/mammals/armadillo/dasNov1 assembly.agp assembly.quals.gz combineQuals assembly.agp assembly.quals.gz combined.quals qaAgpToQacIdx assembly.agp combined.quals.gz dasNov1.qac dasNov1.qdx o elephant (loxAfr1) http://www.broad.mit.edu/ftp/pub/assemblies/mammals/elephant/loxAfr1 assembly.agp assembly.quals.gz combineQuals assembly.agp assembly.quals.gz combined.quals qaAgpToQacIdx assembly.agp combined.quals.gz loxAfr1.qac loxAfr1.qdx o tenrec (echTel1) http://www.broad.mit.edu/ftp/pub/assemblies/mammals/tenrec/echTel1 assembly.agp Draft_v1.agp.chromosome.qual.gz qaAgpToQacIdx assembly.agp Draft_v1.agp.chromosome.qual.gz echTel1.qac echTel1.qdx o opossum (monDom4) /cluster/data/monDom4/broad.mit.edu/foo.qac /cluster/data/monDom4/broad.mit.edu/foo.agp qacAddGapIdx in.agp in.qac monDom4.qac monDom4.qdx o platypus (ornAna1) /cluster/data/ornAna1 agp files are present, but there are no quality files o chicken (galGal3) Unable to find quality data. o lizard (anoCar1) /cluster/data/anoCar1/downloads/assembly.agp /cluster/data/anoCar1/downloads/scaffold.lifted.qac qacAddGapIdx in.agp in.qac anoCar1.qac anoCar1.qdx o frog (xenTro2) Unable to find quality data. o tetraodon (tetNig1) Unable to find quality data. o fugu (fr2) Unable to find quality data. o stickleback (gasAcu1) /cluster/data/gasAcu1/downloads/foo.agp /cluster/data/gasAcu1/downloads/foo.qual qacAddGapIdx in.agp in.qac gasAcu1.qac gasAcu1.qdx o medaka (oryLat1) /cluster/data/oryLat1/bed/qual/foo.qual /cluster/data/oryLat1/downloads/foo.agp qacAddGapIdx in.agp in.qac oryLat1.qac oryLat1.qdx o zebrafish (danRer4) Unable to find quality data. 
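## To summarize the two command patterns used above (a sketch only -- the
## "assembly.agp", "in.qac" and "quality.qual.gz" arguments are placeholders,
## not actual file names):
##   existing UCSC-style .qac quality archive:
##     qacAddGapIdx  assembly.agp in.qac            db.qac db.qdx
##   Broad-style per-chromosome .qual file:
##     qaAgpToQacIdx assembly.agp quality.qual.gz   db.qac db.qdx
## Both produce the db.qac/db.qdx pair that mafAddQRows reads; the AGP file
## lets assembly gaps be encoded in the qac as a special quality value.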
## NOTE quality data for chrM needed: dog, guineapig, horse, hedgehog, stickleback, medaka, rat quality data for chrUn needed: medaka ## copy all .qac and .qdx files to the san cp *.qac *.qdx /san/sanvol1/rico/quality ## create species list (species.lst) containing the following anoCar1 /san/sanvol1/rico/quality bosTau3 /san/sanvol1/rico/quality canFam2 /san/sanvol1/rico/quality cavPor2 /san/sanvol1/rico/quality dasNov1 /san/sanvol1/rico/quality echTel1 /san/sanvol1/rico/quality equCab1 /san/sanvol1/rico/quality eriEur1 /san/sanvol1/rico/quality felCat3 /san/sanvol1/rico/quality gasAcu1 /san/sanvol1/rico/quality loxAfr1 /san/sanvol1/rico/quality monDom4 /san/sanvol1/rico/quality oryCun1 /san/sanvol1/rico/quality oryLat1 /san/sanvol1/rico/quality otoGar1 /san/sanvol1/rico/quality panTro2 /san/sanvol1/rico/quality rheMac2 /san/sanvol1/rico/quality rn4 /san/sanvol1/rico/quality sorAra1 /san/sanvol1/rico/quality tupBel1 /san/sanvol1/rico/quality ## the following script will add quality data to each of the mafs cat > addQData << 'EOF' #!/bin/sh INPUT_DIR=/cluster/data/hg18/bed/multiz28way/anno/maf OUTPUT_DIR=/cluster/store12/rico/hg18/bed/multiz28way/qual/maf for maf in `ls -1Sr ${INPUT_DIR}/*.maf` do file=`basename $maf` mafAddQRows species.lst $maf ${OUTPUT_DIR}/$file done 'EOF' # Gene frames ssh hgwdev cd /cluster/data/hg18/bed/multiz28way mkdir frames cd frames cat > showGenes.csh << 'EOF' foreach db (`grep -v hg18 ../species.lst`) echo " $db" echo -n "Tables: " set tables = `hgsql $db -N -e "show tables like '%Gene%'"` foreach table ($tables) if ($table == "ensGene" || $table == "refGene" || $table == "mgcGenes" || \ $table == "knownGene") then echo -n "${table}: " hgsql $db -N -e "select count(*) from $table" endif end echo -n "Mrnas: " set orgName = `hgsql hgcentraltest -N -e \ "select scientificName from dbDb where name='$db'"` set orgId = `hgsql hg18 -N -e \ "select id from organism where name='$orgName'"` if ($orgId == "") then echo "0" else hgsql hg18 -N -e "select count(*) from gbCdnaInfo where organism=$orgId" endif end 'EOF' # based on output, pick gene tables, according to the following criteria: # KG if present, else refGene if >10000 entries, else ensGene (unless dog), # else mgcGenes, else mrnas if > 10000 else none. In all cases # except none, add in refGene. hg18: knownGene bosTau3: mrna canFam2: mrna cavPor2: mrna danRer4: refGene (13K) or ensGene (36K ?) equCab1: mrna fr2: ensGene galGal3: mrna gasAcu1: ensGene mm8: knownGene monDom4: ensGene oryCun1: mrna panTro2: refGene rheMac2: ensGene rn4: knownGene ? (8K) or refGene (10K) or ensGene(34K) ? tetNig1: mrna xenTro2: mrna # get the genes for all genomes # mRNAs with CDS. 
single select to get cds+psl, then split that up and # create genePred # using mrna table as genes: bostau3, canFam2, cavPor2, equCab1, galGal3, oryCun1, tetNig1, xenTro2 cat > getGenes.csh << 'EOF' rm -fr genes mkdir -p genes #set mrnaDbs = "bosTau3 canFam2 cavPor2 equCab1 galGal3 oryCun1 tetNig1 xenTro2" # use only those with databases for now set mrnaDbs = "bosTau3 canFam2 equCab1 galGal3 oryCun1 tetNig1 xenTro2" foreach queryDb ($mrnaDbs) set tmpExt = `mktemp temp.XXXXXX` set tmpMrnaCds = ${queryDb}.mrna-cds.${tmpExt} set tmpMrna = ${queryDb}.mrna.${tmpExt} set tmpCds = ${queryDb}.cds.${tmpExt} echo $queryDb hgsql -N -e 'select all_mrna.qName,cds.name,all_mrna.* \ from all_mrna,gbCdnaInfo,cds \ where (all_mrna.qName = gbCdnaInfo.acc) and \ (gbCdnaInfo.cds != 0) and (gbCdnaInfo.cds = cds.id)' \ $queryDb > ${tmpMrnaCds} cut -f 1-2 ${tmpMrnaCds} > ${tmpCds} cut -f 4-100 ${tmpMrnaCds} > ${tmpMrna} mrnaToGene -cdsFile=${tmpCds} -smallInsertSize=8 -quiet ${tmpMrna} stdout | \ genePredSingleCover stdin stdout | gzip -2c > /scratch/tmp/$queryDb.tmp.gz rm ${tmpMrnaCds} ${tmpMrna} ${tmpCds} mv /scratch/tmp/$queryDb.tmp.gz genes/$queryDb.gp.gz rm -f $tmpExt end # using knownGene for rn4 mm8 hg18 # using refGene for panTro2 # using ensGene for danRer4, fr2, gasSAcu1, monDom4, rheMac2 # genePreds; (must keep only the first 10 columns for knownGene) set geneDbs = "hg18 mm8 rn4 danRer4 panTro2 fr2 gasAcu1 monDom4 rheMac2" foreach queryDb ($geneDbs) if ($queryDb == "danRer4" || $queryDb == "fr2" || $queryDb == "gasAcu1" || \ $queryDb == "monDom4" || $queryDb == "rheMac2") then set geneTbl = ensGene else if ($queryDb == "panTro2") then set geneTbl = refGene else if ($queryDb == "hg18" || $queryDb == "mm8" || $queryDb == "rn4") then set geneTbl = knownGene endif hgsql -N -e "select name,chrom,strand,txStart,txEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds from $geneTbl" ${queryDb} \ | genePredSingleCover stdin stdout | gzip -2c \ > /scratch/tmp/$queryDb.tmp.gz mv /scratch/tmp/$queryDb.tmp.gz genes/$queryDb.gp.gz end 'EOF' csh getGenes.csh >&! getGenes.log & ssh kkstore02 cd /cluster/data/hg18/bed/multiz28way/frames # leaving out cavPor2 (no db) and tetNig1 (too few gene preds) (cat ../maf/*.maf | nice genePredToMafFrames hg18 stdin stdout bosTau3 genes/bosTau3.gp.gz canFam2 genes/canFam2.gp.gz danRer4 genes/danRer4.gp.gz fr2 genes/fr2.gp.gz galGal3 genes/galGal3.gp.gz hg18 genes/hg18.gp.gz mm8 genes/mm8.gp.gz oryCun1 genes/oryCun1.gp.gz panTro2 genes/panTro2.gp.gz rheMac2 genes/rheMac2.gp.gz rn4 genes/rn4.gp.gz xenTro2 genes/xenTro2.gp.gz gasAcu1 genes/gasAcu1.gp.gz monDom4 genes/monDom4.gp.gz equCab1 genes/equCab1.gp.gz | gzip > multiz28way.mafFrames.gz) >& frames.log & ssh hgwdev cd /cluster/data/hg18/bed/multiz28way/frames nice hgLoadMafFrames hg18 multiz28wayFrames multiz28way.mafFrames.gz >& loadFrames. log & # from 17way: hg18 = knownGene rn4 = knownGene mm8 = knownGene panTro1 = ensGene rheMac2 = mrna oryCun1 = mrna #dasNov1 = canFam2 = mrna #loxAfr1 = bosTau2 = mrna #echTel1 = #monDom4 = galGal2 = refGene xenTro1 = mgcGenes #tetNig1 = fr1 = ensGene danRer3 = mrna ############################################################################ # PHASTCONS FOR 28WAY (2007-04-04 kate) # generate tree model with branch lengths using phyloFit from Adam # Siepel's # phastCons package. Input is 28way alignments of # 4-fold degenerate sites (4d sites) determined from a # nonredundant (non-overlapping) gene set. 
Elliott Margulies # has a perl script (extract_coding_alignments.pl) that he used # with the ENCODE alignments. # Adam uses his msa_view tool with the --4d option. # For first try, use Gencode Oct '05 reference set filtered # to longest transcript, then lifted to hg18 # Compare results with hgClusterGenes and /cluster/bin/phast/refeature, # and genePredSingleCover hgsql hg17 -N -e 'select * from encodeGencodeGeneKnownOct05' > gencodeKnown.gp wc -l gencodeKnown.gp # 2608 gencodeKnown.gp hgsql hg17 -N -e "select count(*) from encodeGencodeGeneKnownOct05 where cdsStart <> 0 and cdsEnd <> 0" # 1097 hgsql hg17 -N -e "select count(*) from encodeGencodeGeneKnownOct05 where cdsStartStat='cmpl' and cdsEndStat='cmpl'" # 752 # Jim's gene uniquifier hgClusterGenes -noProt hg17 encodeGencodeGeneKnownOct05 \ encodeGencodeGeneKnownOct05Clusters encodeGencodeGeneKnownOct05Canonical # Got 457 clusters, from 2608 genes in 46 chromosomes hgsql hg17 -N -e "select transcript from encodeGencodeGeneKnownOct05Canonical order by transcript" > genes.jim # Adam's feature uniquifier # requires cdsStart and cdsEnd in gene pred hgsql hg17 -N -e 'select * from encodeGencodeGeneKnownOct05 where cdsStart<>0 and cdsEnd <> 0' > gencodeKnownCds.gp wc -l gencodeKnownCds.gp # 1097 gencodeKnownCds.gp /cluster/bin/phast/refeature --unique gencodeKnownCds.gp > \ gencodeKnownCdsNR.gff awk '{print $10}' gencodeKnownCdsNR.gff | sort | uniq | wc -l # 333 /cluster/bin/phast/refeature -o genepred --unique \ gencodeKnownCds.gp | sort > gencodeKnownCdsNR.gp wc -l gencodeKnownCdsNR.gp # 333 awk '{print $1}' gencodeKnownCdsNR.gp | sort > genes.adam # get intersection comm -1 -2 genes.jim genes.adam > genes.both wc -l genes.both # 235 # genePredSingleCover filters but leaves extended gene pred genePredSingleCover gencodeKnownCds.gp stdout | sort > gencodeKnownCdsNR2.gp wc -l gencodeKnownCdsNR2.gp # 423 awk '{print $1}' gencodeKnownCdsNR2.gp | sort > genes.scov comm -1 -2 genes.scov genes.both > genes.all wc -l genes.all # 224 -- all 3 methods picked these liftOver -genePred gencodeKnownCdsNR2.gp \ /cluster/data/hg17/bed/liftOver/hg17ToHg18.over.chain.gz \ gencodeKnown.hg18.gp unmapped.gp genePredCheck gencodeKnown.hg18.gp # checked: 423 failed: 0 # all genes mapped # consider using only intersection of above 3 methods grep chr22 gencodeKnown.hg18.gp > gencodeKnown.hg18.chr22.gp /cluster/bin/phast/msa_view --4d --features gencodeKnown.hg18.chr22.gp \ -i MAF ../maf/chr22__0.maf > chr22.mfa # extract ENCODE regions from MAF's ssh hgwdev cd /cluster/data/hg18/bed/multiz28way/4d hgsql hg18 -N -e \ "select chrom, chromStart, chromEnd, name from encodeRegions" \ > encodeRegions.bed ssh kkstore02 cd /cluster/data/hg18/bed/multiz28way/4d cat > encodeMafs.csh << 'EOF' mkdir -p encodeMafs set chroms = `awk '{print $1}' encodeRegions.bed | sort | uniq` foreach c ($chroms) echo $c # needed till mafsInRegion is fixed to handle split maf files cat ../maf/${c}__?.maf > $c.maf awk -v CHR=$c '$1 == CHR {print}' encodeRegions.bed > regions.bed mafsInRegion regions.bed -outDir encodeMafs/ $c.maf end 'EOF' csh encodeMafs.csh >&! 
encodeMafs.log & # try it out on a few regions set r = "ENm001" set r = "ENr231" perl -wpe 's/^s ([^.]+)\.\S+/s $1/' encodeMafs/$r.maf > $r.clean.maf # generate ss file /cluster/bin/phast/msa_view --4d --features gencodeKnown.hg18.gp \ -i MAF $r.clean.maf -o SS > $r.4d.3.ss /cluster/bin/phast/msa_view -i SS -o FASTA $r.4d.3.ss > $r.4d.3.mfa /cluster/bin/phast/msa_view -i SS --tuple-size 1 $r.4d.3.ss -o SS > $r.4d.1.ss /cluster/bin/phast/msa_view -i SS -o FASTA $r.4d.1.ss > $r.4d.1.mfa # now on all regions cat > encode4d.csh << 'EOF' mkdir mfa4d foreach f (encodeMafs/*.maf) set r = $f:t:r echo $r perl -wpe 's/^s ([^.]+)\.\S+/s $1/' $f > clean.maf /cluster/bin/phast/msa_view --4d --features gencodeKnown.hg18.gp \ -i MAF clean.maf -o SS | \ /cluster/bin/phast/msa_view -i SS --tuple-size 1 - > mfa4d/$r.4d.mfa # remove empties to satisfy msa_view --aggregate if (-z mfa4d/$r.4d.mfa) then rm mfa4d/$r.4d.mfa endif end 'EOF' csh encode4d.csh >&! encode4d.log & set species1 = `sed 's/$/,/g' ../species.lst` set species = `echo $species1 | sed -e 's/ //g' -e 's/,$//'` # From Elliott's script: #/cluster/bin/phast/msa_view --aggregate $species EN*.mfa | \ #sed s/"> "/">"/ > some-4d_align.mfa /cluster/bin/phast/msa_view --aggregate $species mfa4d/EN*.4d.mfa | \ sed s/"> "/">"/ > all-4d_align.mfa # tweak input tree -- remove common names, include commas sed 's/[a-z][a-z]*_//g' ../tree/tree.web.commas.nh > tree.commas.nh # From Elliott's script with Adam's mods (use --EM, MED) /cluster/bin/phast/phyloFit --EM --precision MED \ --msa-format FASTA --subst-mod REV \ --tree tree.commas.nh all-4d_align.mfa grep TREE phyloFit.mod | sed 's/TREE\:\ //' > tree_4d.28way.nh /cluster/bin/phast/tree_doctor --dissect tree_4d.28way.nh | \ awk '$1 == "dparent" {x += $3} END {print x}' # 9.0516 # extract species distances /cluster/bin/phast/all_dists tree_4d.28way.nh > 28way.distances.txt grep hg18 28way.distances.txt | sort -k3,3n | \ awk '{printf ("%.4f\t%s\n", $3, $2)}' > distances.txt # get chain stats ordered by distance awk '{print $2}' distances.txt > species.byDistance csh ../getChainStats.csh species.byDistance >&! 
species.chainStats # spruce up names for tree drawing /cluster/bin/phast/tree_doctor \ --rename="hg18 -> human ; panTro2 -> chimp ; rheMac2 -> macaque ; otoGar1 -> bushbaby ; tupBel1 -> tree_shrew ; rn4 -> rat ; mm8 -> mouse ; cavPor2 -> guinea_pig ; oryCun1 -> rabbit ; sorAra1 -> shrew ; eriEur1 -> hedgehog ; canFam2 -> dog ; felCat3 -> cat ; equCab1 -> horse ; bosTau3 -> cow ; dasNov1 -> armadillo ; loxAfr1 -> elephant ; echTel1 -> tenrec ; monDom4 -> opossum ; ornAna1 -> platypus ; galGal3 -> chicken ; anoCar1 -> lizard ; xenTro2 -> frog ; tetNig1 -> tetraodon ; fr2 -> fugu ; gasAcu1 -> stickleback ; oryLat1 -> medaka ; danRer4 -> zebrafish" \ tree_4d.28way.nh > tree_4d.28way.common.nh # compare to Elliott's latest ENCODE tree, pruned to match /cluster/bin/phast/tree_doctor \ --prune-all-but=human,chimp,macaque,galago,rat,mouse,guinea_pig,rabbit,cow,cat,dog,hedgehog,shrew,armadillo,elephant,tenrec,monodelphis,platypus,chicken,xenopus \ --rename="xenopus -> frog ; galago -> bushbaby; monodelphis -> opossum"\ encode2007.nh > encode2007.pruned.nh # my 4d tree with only species in the pruned ENCODE tree /cluster/bin/phast/tree_doctor \ --prune-all-but=human,chimp,macaque,bushbaby,rat,mouse,guinea_pig,rabbit,cow,cat,dog,hedgehog,shrew,armadillo,elephant,tenrec,opossum,platypus,chicken,frog \ tree_4d.28way.common.nh > tree_4d.20way.common.nh # Create chrom mafs from split mafs (do this earlier next time) ssh kki cd /cluster/data/hg18/bed/multiz28way mkdir chromMaf mkdir run.merge cd run.merge cat > doMerge.csh << 'EOF' #!/bin/csh -ef set c = $1 set cmaf = ../chromMaf/${c}.maf # NOTE: need to change mafFilter to retain (and uniquify) comments # begin with ##maf header head -1 ../maf/${c}__0.maf > $cmaf grep -h '# ' ../maf/${c}__?.maf | sed 's/\/scratch\/tmp.* //' | sort | uniq \ >> $cmaf # don't filter out blocks with alignment this time -- might be needed # for symmetry with irows version, or for analysis. Check on this. 
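# (mafFilter with -minRow=1 passes blocks through even when only the
#  reference row aligned, so per the note above, single-species blocks are
#  retained while the chunk mafs are concatenated into the chrom maf.)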
mafFilter -minRow=1 ../maf/${c}__?.maf >> $cmaf 'EOF' # << happy emacs chmod a+x doMerge.csh cat > spec << 'EOF' #LOOP ./doMerge.csh $(root1) {check out line+ ../chromMaf/$(root1).maf} #ENDLOOP 'EOF' # << happy emacs awk '{print $1}' /cluster/data/hg18/chrom.sizes > chrom.lst gensub2 chrom.lst single spec jobList para create jobList # 49 jobs para try para check para push # Split chromosome MAF's into windows and use to generate # "sufficient statistics" (ss) files for phastCons input # large mem jobs so use mini-cluster ssh kki cd /cluster/data/hg18/bed/multiz28way mkdir cons cd cons # Create tree model for phastCons # Adjust model file base composition background and rate matrix to be # representative of whole-genome (.41 -- as was done for ENCODE) # using utility, 'modFreqs' from Adam (5/07) # NOTE: updated all phast source and rebuilt to phast.2007-05-04 set gc = `grep BACKGROUND /cluster/data/hg18/bed/multiz17way/cons/elliotsEncode.mod | \ awk '{printf "%0.3f\n", $3 + $4;}'` echo $gc # .41 /cluster/bin/phast.2007-05-04/modFreqs ../4d/phyloFit.mod $gc > 28way.mod # split 28way mafs into 10M chunks and generate sufficient statistics # files for # phastCons mkdir run.split cd run.split set WINDOWS = /san/sanvol1/scratch/hg18/multiz28way/cons/ss rm -fr $WINDOWS mkdir -p $WINDOWS cat << 'EOF' > doSplit.csh #!/bin/csh -ef set MAFS = /cluster/data/hg18/bed/multiz28way/chromMaf set WINDOWS = /san/sanvol1/scratch/hg18/multiz28way/cons/ss cd $WINDOWS set c = $1 echo $c rm -fr $c mkdir $c # need to truncate odd-ball scaffold/chrom names that include dots # as phastCons utils can't handle them set TMP = /scratch/tmp/$c.clean.maf.$$ #perl -wpe 's/^s ([^.]+)\.\S+/s $1/' $MAFS/$c.maf > $TMP perl -wpe 's/^s ([^.]+\.[^. ]+)\.\S+/s $1/' $MAFS/$c.maf > $TMP /cluster/bin/phast/$MACHTYPE/msa_split $TMP -i MAF \ -M /cluster/bluearc/hg18/chrom/$c.fa \ -o SS -r $c/$c -w 10000000,0 -I 1000 -B 5000 rm -f $TMP echo "Done" >> $c.done 'EOF' # << happy emacs chmod +x doSplit.csh rm -f jobList foreach f (../../chromMaf/*.maf) set c = $f:t:r echo "doSplit.csh $c {check out line+ $WINDOWS/$c.done}" >> jobList end para create jobList # 49 jobs para try para check para push # completed shorter jobs in a few hours, but others failed on memory. # redo on kolossus -- 14 hours! # NOTE: next time try harder working with split mafs! # Run phastCons # This job is I/O intensive in its output files, thus it is all # working over in /scratch/tmp/ ssh pk cd /cluster/data/hg18/bed/multiz28way/cons mkdir run.cons cd run.cons cat > doPhast.csh << 'EOF' #!/bin/csh -fe set PHASTBIN = /cluster/bin/phast.2007-05-04 set c = $1 set f = $2 set len = $3 set cov = $4 set rho = $5 set grp = $cwd:t set tmp = /scratch/tmp/$f mkdir -p $tmp set san = /san/sanvol1/scratch/hg18/multiz28way/cons cp -p $grp/$grp.mod $grp/$grp.non-inf . cp -p $san/ss/$c/$f.ss ../../$grp/$grp.mod ../../$grp/$grp.non-inf $tmp pushd $tmp > /dev/null $PHASTBIN/phastCons $f.ss $grp.mod \ --rho $rho --expected-length $len --target-coverage $cov --quiet \ --not-informative `cat $grp.non-inf` \ --seqname $c --idpref $c --most-conserved $f.bed --score > $f.pp popd > /dev/null mkdir -p $san/$grp/pp/$c $san/$grp/bed/$c sleep 1 mv $tmp/$f.pp $san/$grp/pp/$c mv $tmp/$f.bed $san/$grp/bed/$c rm -fr $tmp 'EOF' # << happy emacs chmod a+x doPhast.csh # Create parasol batch and run it pushd /san/sanvol1/scratch/hg18/multiz28way/cons ls -1 ss/chr*/chr*.ss | sed 's/.ss$//' > \ /cluster/data/hg18/bed/multiz28way/cons/run.cons/in.list popd # run for all species cd .. 
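# Note on the doPhast.csh arguments: the cluster template below passes three
# numbers after the chrom and file names; in the script above these become
# $3 = --expected-length, $4 = --target-coverage and $5 = --rho, so a
# template of "45 .3 .31" runs phastCons with --expected-length 45
# --target-coverage .3 --rho .31.  The parameter experiments recorded
# further down vary these same three values.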
mkdir -p all run.cons/all cd all cp ../28way.mod all.mod # non-informative option for closest relatives (exclude regions with only these aligning), # and till Adam fixes the problem, also exclude all species removed from tree (below) echo "panTro2,rheMac2" > all.non-inf cd ../run.cons # Create template file # root1 == chrom name, file1 == ss file name without .ss suffix cat > template << 'EOF' #LOOP ../doPhast.csh $(root1) $(file1) 45 .3 .31 #ENDLOOP 'EOF' # << happy emacs cd all gensub2 ../in.list single ../template jobList para create jobList # 337 jobs para try para check para push # NOTE: These jobs regularly crash (too quick ?), and have to be repushed. # Also, a few hang, and need to be stopped and restarted. # The whole batch runs so fast, this isn't a problem # CPU time in finished jobs: 34253s 570.89m 9.51h 0.40d 0.001 y IO & Wait Time: 61148s 1019.13m 16.99h 0.71d 0.002 y Average job time: 283s 4.72m 0.08h 0.00d Longest running job: 0s 0.00m 0.00h 0.00d Longest finished job: 496s 8.27m 0.14h 0.01d Submission to last job: 995s 16.58m 0.28h 0.01d # create Most Conserved track ssh kolossus cd /san/sanvol1/scratch/hg18/multiz28way/cons/all cat bed/*/chr*.bed | ~/bin/${MACHTYPE}/bedSort stdin stdout | \ awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", $1, $2, $3, $5, $5;}' | \ /cluster/bin/scripts/lodToBedScore /dev/stdin > mostConserved.bed # ~ 1 minute cp -p mostConserved.bed /cluster/data/hg18/bed/multiz28way/cons/all # load into database ssh hgwdev cd /cluster/data/hg18/bed/multiz28way/cons/all hgLoadBed hg18 phastConsElements28way mostConserved.bed # Loaded 2183600 elements # compare with previous tracks hgsql hg18 -s -N -e "select count(*) from phastConsElements17way" # 2229902 hgsql hg18 -s -N -e "select count(*) from phastConsElements17way where chrom='chr7'" # 114703 # Try for 5% overall cov, and 70% CDS cov featureBits hg18 -enrichment refGene:cds phastConsElements28way >& fb.out & # Compare to chr7 for 17way -- chr7 is .7% lower than whole genome, # so aim for 4.3% on chr7 featureBits hg18 -chrom=chr7 -enrichment refGene:cds phastConsElements28way # USED FOR 17WAY # too little coverage # 14 .008 .28 # refGene:cds 0.911%, phastConsElements28way 3.551%, both 0.653%, cover 71.74%, enrich 20.20x # 14 .1 .28 # refGene:cds 0.911%, phastConsElements28way 3.954%, both 0.648%, cover 71.12%, enrich 17.98x # 12 .1 .28 # refGene:cds 0.911%, phastConsElements28way 3.914%, both 0.644%, cover 70.74%, enrich 18.08x # 14 .2 .3 # 234653 elements # refGene:cds 0.911%, phastConsElements28way 4.423%, both 0.659%, cover 72.34%, enrich 16.36x # 13 .2 .28 # refGene:cds 0.911%, phastConsElements28way 4.266%, both 0.644%, cover 70.73%, enrich 16.58x # USE THIS ONE # minimum change to params to achieve coverage # 14 .2 .28 # 249585 elements # refGene:cds 0.911%, phastConsElements28way 4.269%, both 0.646%, cover 70.92%, enrich 16.61x # 15 .2 .28 # refGene:cds 0.911%, phastConsElements28way 4.271%, both 0.647%, cover 71.08%, enrich 16.64x # 14 .3 .28 # refGene:cds 0.911%, phastConsElements28way 4.644%, both 0.645%, cover 70.89%, enrich 15.27x # 14 .35 .28 # refGene:cds 0.911%, phastConsElements28way 4.879%, both 0.646%, cover 70.90%, enrich 14.53x # 14 .15 .3 # 207188 elements # refGene:cds 0.912%, phastConsElements28way 4.260%, both 0.660%, cover 72.34%, enrich 16.98x # 16 .15 .3 # 193531 elements # refGene:cds 0.912%, phastConsElements28way 4.289%, both 0.663%, cover 72.66%, enrich 16.94x # 20 .15 .3 # 173668 elements # refGene:cds 0.912%, phastConsElements28way 4.321%, both 0.667%, cover 73.11%, enrich 
16.92x # 24 .15 .3 # 159646 elements # refGene:cds 0.912%, phastConsElements28way 4.338%, both 0.670%, cover 73.40%, enrich 16.92x # 30 .15 .3 # 144399 elements # refGene:cds 0.912%, phastConsElements28way 4.349%, both 0.673%, cover 73.72%, enrich 16.95x # 40 .15 .3 # 128087 elements # refGene:cds 0.912%, phastConsElements28way 4.353%, both 0.676%, cover 74.09%, enrich 17.02x # 50 .15 .3 # 117338 elements # refGene:cds 0.912%, phastConsElements28way 4.352%, both 0.678%, cover 74.32%, enrich 17.08x # 50 .1 .3 # 116930 elements # refGene:cds 0.912%, phastConsElements28way 4.347%, both 0.678%, cover 74.32%, enrich 17.10x # 50 .05 .3 # 93391 elements # refGene:cds 0.912%, phastConsElements28way 4.193%, both 0.680%, cover 74.57%, enrich 17.78x # 50 .07 .3 # 99358 # refGene:cds 0.912%, phastConsElements28way 4.231%, both 0.680%, cover 74.51%, enrich 17.61x # 45 .07 .3 # 102864 elements # refGene:cds 0.912%, phastConsElements28way 4.227%, both 0.679%, cover 74.41%, enrich 17.60x # USE THIS ONE # matches element count for 17way # 45 .1 .3 # 110836 elements # refGene:cds 0.912%, phastConsElements28way 4.277%, both 0.678%, cover 74.33%, enrich 17.38x # 75 .1 .3 # Try for really long elements # 93524 elements # refGene:cds 0.912%, phastConsElements28way 4.279%, both 0.682%, cover 74.73%, enrich 17.47x # 100 .1 .3 # 85757 elements # refGene:cds 0.912%, phastConsElements28way 4.270%, both 0.683%, cover 74.90%, enrich 17.54 # 71218 elements # 200 .1 .3 # refGene:cds 0.912%, phastConsElements28way 4.225%, both 0.686%, cover 75.16%, enrich 17.79x # 200 .12 .3 # refGene:cds 0.912%, phastConsElements28way 4.241%, both 0.686%, cover 75.13%, enrich 17.72x # USE THIS ONE # for really long elements # 200 .15 .3 # 75659 # refGene:cds 0.912%, phastConsElements28way 4.261%, both 0.685%, cover 75.11%, enrich 17.63x featureBits hg18 -chrom=chr7 -enrichment refGene:cds phastConsElements17way # refGene:cds 0.911%, phastConsElements17way 4.838%, both 0.639%, cover 70.22%, enrich 14.51x featureBits hg18 -enrichment refGene:cds phastConsElements17way # refGene:cds 1.072%, phastConsElements17way 5.510%, both 0.759%, cover 70.83%, enrich 12.86x # compare element sizes to other runs: # e.g. select min(chromEnd-chromStart) from encodeTbaPhastConsEl # hg17 ENCODE TBA phastCons: min=1, max=1961 # hg17 ENCODE TBA gerp: min=3, max=1426 # hg18 17way: min=1, max=12590 #el on chr7: 114703 # 45 .3 .31 # featureBits hg18 -enrichment refGene:cds phastConsElements28way refGene:cds 1.095%, phastConsElements28way 4.920%, both 0.827%, cover 75.48%, enrich 15.34x # 2906254 elements # Create merged posterier probability file and wiggle track data files # pk is currently closer to the san than any other machine ssh pk cd /san/sanvol1/scratch/hg18/multiz28way/cons/all # sort by chromName, chromStart so that items are in numerical order # for wigEncode cat > listPp.csh << 'EOF' foreach d (pp/chr*/) ls $d/*.pp | sort -n -t\. -k2 end 'EOF' csh listPp.csh | xargs cat | \ nice wigEncode stdin phastCons28way.wig phastCons28way.wib # about 23 minutes for above cp -p phastCons28way.wi? /cluster/data/hg18/bed/multiz28way/cons/all # Load gbdb and database with wiggle. 
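# (wigEncode above produced a pair of files: the .wib holds the binary
#  per-base values and is referenced from /gbdb via the symlink below, while
#  the .wig is the index that hgLoadWiggle loads into the database table.)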
ssh hgwdev cd /cluster/data/hg18/bed/multiz28way/cons/all ln -s /cluster/data/hg18/bed/multiz28way/cons/all/phastCons28way.wib \ /gbdb/hg18/multiz28way hgLoadWiggle -pathPrefix=/gbdb/hg18/multiz28way hg18 \ phastCons28way phastCons28way.wig # ~ 3 minute load ## Run phastCons on subgroup (placentals) ssh pk cd /cluster/data/hg18/bed/multiz28way/cons # create pruned tree set species = `cat ../species.lst` echo $species | sed 's/ /,/g' #anoCar1,bosTau3,canFam2,cavPor2,danRer4,dasNov1,echTel1,equCab1,eriEur1,felCat3,fr2,galGal3,gasAcu1,hg18,loxAfr1,mm8,monDom4,ornAna1,oryCun1,oryLat1,otoGar1,panTro2,rheMac2,rn4,sorAra1,tetNig1,tupBel1,xenTro2 # setup placental-only run mkdir placental run.cons/placental cd placental # placental-only: exclude from phastCons: 10 non-placentals # (platypus, opossum, 5 fish, chicken, lizard, frog) /cluster/bin/phast.new/tree_doctor ../28way.mod \ --prune-all-but=bosTau3,canFam2,cavPor2,dasNov1,echTel1,equCab1,eriEur1,felCat3,hg18,loxAfr1,mm8,oryCun1,otoGar1,panTro2,rheMac2,rn4,sorAra1,tupBel1 \ > placental.mod echo "panTro2,rheMac2,anoCar1,danRer4,fr2,galGal3,gasAcu1,monDom4,ornAna1,oryLat1,tetNig1,xenTro2" \ > placental.non-inf cd ../run.cons/placental gensub2 ../in.list single ../template jobList para create jobList para try para check para push # ~30 minutes on pk # NOTE: sometimes jobs crash or hang due to access problems on SAN # para stop then push to recover cd ../../ mkdir hqAll run.cons/hqAll cd hqAll # high-qual only: exclude 10 low-qual mammals /cluster/bin/phast.new/tree_doctor 28way.mod \ --prune-all-but=anoCar1,bosTau3,canFam2,danRer4,equCab1,fr2,galGal3,gasAcu1,hg18,mm8,monDom4,ornAna1,oryLat1,panTro2,rheMac2,rn4,tetNig1,xenTro2 \ > hqAll.mod echo "panTro2,rheMac2,cavPor2,dasNov1,echTel1,loxAfr1,eriEur1,felCat3,oryCun1,otoGar1,sorAra1,tupBel1" \ > hqAll.non-inf cd ../run.cons/hqAll gensub2 ../in.list single ../template jobList para create jobList para try para check para push cd ../../ mkdir hqPlacental run.cons/hqPlacental cd hqPlacental # high-qual placental only: exclude 10 non-placentals and 10 low-qual mammals, /cluster/bin/phast.new/tree_doctor ../28way.mod \ --prune-all-but=bosTau3,canFam2,equCab1,hg18,mm8,panTro2,rheMac2,rn4 \ > hqPlacental.mod echo "panTro2,rheMac2,cavPor2,dasNov1,echTel1,loxAfr1,eriEur1,felCat3,oryCun1,otoGar1,sorAra1,tupBel1,anoCar1,danRer4,fr2,galGal3,gasAcu1,monDom4,ornAna1,oryLat1,tetNig1,xenTro2" \ > hqPlacental.non-inf cd ../run.cons/hqPlacental gensub2 ../in.list single ../template jobList para create jobList para try para check para push # add placental elements to Most Conserved track ssh kolossus cd /san/sanvol1/scratch/hg18/multiz28way/cons/placental cat bed/*/chr*.bed | ~/bin/${MACHTYPE}/bedSort stdin stdout | \ awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", $1, $2, $3, $5, $5;}' | \ /cluster/bin/scripts/lodToBedScore /dev/stdin > mostConserved.bed # ~ 1 minute cp -p mostConserved.bed /cluster/data/hg18/bed/multiz28way/cons/placental # load into database ssh hgwdev cd /cluster/data/hg18/bed/multiz28way/cons/placental hgLoadBed hg18 phastConsElements28wayPlacMammal mostConserved.bed featureBits hg18 -enrichment refGene:cds phastConsElements28wayPlacMammal >&! 
../run.cons/placental/fb.out # experiments # USING THIS ONE: min change from 17way to achieve coverage # 14.2.28 # 169516 elements # 169518 # refGene:cds 0.912%, phastConsElements28wayPlacMammalChr7Short 3.437%, both 0.615%, cover 67.40%, enrich 19.61x # refGene:cds 0.912%, phastConsElements28wayPlacMammalChr7Short 3.437%, both 0.615%, cover 67.40%, enrich 19.61x # USING THIS ONE: vertebrate elements have similar count to 17way ("medium") # 45.1.3 # 76715 elements # 76718 elements # refGene:cds 0.912%, phastConsElements28wayPlacMammalChr7Med 3.312%, both 0.642%, cover 70.33%, enrich 21.24x #refGene:cds 0.912%, phastConsElements28wayPlacMammalChr7Med 3.312%, both 0.642%, cover 70.33%, enrich 21.24x # Create merged posterier probability file and wiggle track data files # pk is currently closer to the san than any other machine ssh pk cd /san/sanvol1/scratch/hg18/multiz28way/cons/placental # sort by chromName, chromStart so that items are in numerical order # for wigEncode cat > listPp.csh << 'EOF' foreach d (pp/chr*/) ls $d/*.pp | sort -n -t\. -k2 end 'EOF' csh ../listPp.csh | xargs cat | \ nice wigEncode stdin \ phastCons28wayPlacMammal.wig phastCons28wayPlacMammal.wib # about 23 minutes for above cp -p phastCons28wayPlacMammal.wi? /cluster/data/hg18/bed/multiz28way/cons/placental # Load gbdb and database with wiggle. ssh hgwdev cd /cluster/data/hg18/bed/multiz28way/cons/placental ln -s \ /cluster/data/hg18/bed/multiz28way/cons/placental/phastCons28wayPlacMammal.wib \ /gbdb/hg18/multiz28way hgLoadWiggle -pathPrefix=/gbdb/hg18/multiz28way hg18 \ phastCons28wayPlacMammal phastCons28wayPlacMammal.wig # WARNING: Exceeded chr4_random size 842649 > 842648. dropping 2 data point(s) # NOTE: weird msa_split on this chrom -- sent inquiry to Adam about this # ~ 3 minute load ######################################################################## # phyloP conservation # split SS files into 100K chunks (5 min./job) ssh kki cd /cluster/data/hg18/bed/multiz28way/cons/ mkdir run.phyloP.split cd run.phyloP.split cat << 'EOF' > doSplit.csh #!/bin/csh -ef set c = $1 set san = /san/sanvol1/scratch/hg18/multiz28way set in = $san/cons/ss set out = $san/phyloP/ss set PHASTBIN = /cluster/bin/phast.2007-05-04 @ i=0 foreach f (`ls $in/$c/*.ss | sort -n -t\. 
-k2`) @ i++ mkdir -p $out/$c/$i $PHASTBIN/msa_split $f -i SS -o SS \ -r $out/$c/$i/$c.$i -w 100000,0 -I 1000 -B 5000 end echo "Done" >> $out/$c.done 'EOF' # << happy emacs chmod +x doSplit.csh set san = /san/sanvol1/scratch/hg18/multiz28way set JOBS = /cluster/data/hg18/bed/multiz28way/cons/run.phyloP.split/jobList rm -f $JOBS foreach c (`awk '{print $1}' /cluster/data/hg18/chrom.sizes`) echo "doSplit.csh $c {check out line+ $san/phyloP/ss/$c.done}" >> $JOBS end para create jobList # 49 jobs para try para check para push para time # Completed: 49 of 49 jobs # CPU time in finished jobs: 8827s 147.12m 2.45h 0.10d 0.000 y # IO & Wait Time: 6837s 113.95m 1.90h 0.08d 0.000 y # Average job time: 320s 5.33m 0.09h 0.00d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 1343s 22.38m 0.37h 0.02d # Submission to last job: 1528s 25.47m 0.42h 0.02d ######################################################################## # phyloP scoring method experiments on chr7 (2008-11-11 kate) ssh pk cd /cluster/data/hg18/bed/multiz28way/cons mkdir -p run.phyloPMethod cd run.phyloPMethod cat > doPhyloP.csh << 'EOF' set method = $1 set f = $2 set out = $3 set c = $f:r:r set n = $f:r:e set tmp = /scratch/tmp/$f mkdir -p $tmp cp -p /san/sanvol1/scratch/hg18/multiz28way/phyloP/ss/$c/$n/$f.ss ../tree.mod $tmp pushd $tmp > /dev/null # Built phast from CornellCVS on 11/11/08 in /cluster/bin/phast.build. # Symlinked the bin to /cluster/bin/phast.2008 set PHASTBIN = /cluster/bin/phast.2008-11-13 # PHAST version is 0.9.9.8b $PHASTBIN/phyloP --method $method --mode CONACC --wig-scores --chrom $c \ -i SS tree.mod $f.ss > $f.wig popd > /dev/null mkdir -p $out:h mv $tmp/$f.wig $out rm -fr $tmp 'EOF' # Create list of chunks (just chr7 for now) pushd /san/sanvol1/scratch/hg18/multiz28way/phyloP/ss ls chr7/*/chr7.*.ss | sed -e 's/.ss$//' -e 's/^\.\///' > \ /cluster/data/hg18/bed/multiz28way/cons/run.phyloPMethod/in.list # setup run mkdir -p all cd all cp ../../28way.mod tree.mod mkdir -p SCORE cd SCORE # Create template file # file1 == $chr/$chunk/file name without .ss suffix cat > template << 'EOF' #LOOP csh ../../doPhyloP.csh SCORE $(file1) {check out line+ /san/sanvol1/scratch/hg18/multiz28way/phyloPMethod/all/SCORE/$(path1).wig} #ENDLOOP 'EOF' # << happy emacs gensub2 ../../in.list single template jobList para create jobList para try para check para push # Completed: 1552 of 1552 jobs # CPU time in finished jobs: 15411s 256.84m 4.28h 0.18d 0.000 y # IO & Wait Time: 7678s 127.97m 2.13h 0.09d 0.000 y # Average job time: 15s 0.25m 0.00h 0.00d # Longest finished job: 29s 0.48m 0.01h 0.00d # Submission to last job: 236s 3.93m 0.07h 0.00d # Estimated complete: 0s 0.00m 0.00h 0.00d cd .. mkdir -p LRT cd LRT # Create template file # file1 == $chr/$chunk/file name without .ss suffix cat > template << 'EOF' #LOOP csh ../../doPhyloP.csh LRT $(file1) {check out line+ /san/sanvol1/scratch/hg18/multiz28way/phyloPMethod/all/LRT/$(path1).wig} #ENDLOOP 'EOF' # << happy emacs gensub2 ../../in.list single template jobList para create jobList para try para check para push # sort by chromName, chromStart so that items are in numerical order # for wigEncode cd /san/sanvol1/scratch/hg18/multiz28way/phyloP/all cat > listPp.csh << 'EOF' foreach c (`ls -d chr*`) foreach d (`ls -d $c/[1-9]* | sort -t/ -k2 -n`) ls -1 $d/*.wig | sort -n -t\. -k3 end end 'EOF' csh listPp.csh | xargs cat | \ nice wigEncode stdin phyloP28way.wig phyloP28way.wib mkdir /cluster/data/hg18/bed/multiz28way/cons/phyloP/all cp -p phyloP28way.wi? 
/cluster/data/hg18/bed/multiz28way/cons/phyloP/all # setup placental run mkdir -p placental cd all cp ../../placental.mod tree.mod # Create template file # file1 == $chr/$chunk/file name without .ss suffix cat > template << 'EOF' #LOOP csh ../doPhyloP.csh $(file1) {check out line+ /san/sanvol1/scratch/hg18/multiz28way/phyloP/placental/$(path1).wig} #ENDLOOP 'EOF' # << happy emacs gensub2 ../in.list single template jobList para create jobList para try para check para push #CPU time in finished jobs: 1934553s 32242.55m 537.38h 22.39d 0.061 y #IO & Wait Time: 82007s 1366.78m 22.78h 0.95d 0.003 y #Average job time: 70s 1.16m 0.02h 0.00d #Longest running job: 0s 0.00m 0.00h 0.00d #Longest finished job: 147s 2.45m 0.04h 0.00d #Submission to last job: 37642s 627.37m 10.46h 0.44d # sort by chromName, chromStart so that items are in numerical order # for wigEncode ssh pk cd /san/sanvol1/scratch/hg18/multiz28way/phyloP/placental # check for clean dir here -- chr* will match garbage if it's there cat > listPp.csh << 'EOF' foreach c (`ls -d chr*`) foreach d (`ls -d $c/[1-9]* | sort -t/ -k2 -n`) ls -1 $d/*.wig | sort -n -t\. -k3 end end 'EOF' csh listPp.csh | xargs cat | \ nice wigEncode stdin phyloP28wayPlacMammal.wig phyloP28wayPlacMammal.wib mkdir /cluster/data/hg18/bed/multiz28way/cons/phyloP/placental cp -p phyloP28wayPlacMammal.wi? /cluster/data/hg18/bed/multiz28way/cons/phyloP/placental # Load gbdb and database with wiggle. ssh hgwdev cd /cluster/data/hg18/bed/multiz28way/cons/phyloP/all ln -s \ /cluster/data/hg18/bed/multiz28way/cons/phyloP/all/phyloP28way.wib \ /gbdb/hg18/multiz28way/phyloP28way.wib hgLoadWiggle -pathPrefix=/gbdb/hg18/multiz28way hg18 \ phyloP28way phyloP28way.wig # WARNING: Exceeded chr4_random size 842649 > 842648. dropping 2 data point(s) cd ../placental ln -s \ /cluster/data/hg18/bed/multiz28way/cons/phyloP/all/phyloP28wayPlacMammal.wib \ /gbdb/hg18/multiz28way/phyloP28wayPlacMammal.wib hgLoadWiggle -pathPrefix=/gbdb/hg18/multiz28way hg18 \ phyloP28wayPlacMammal phyloP28wayPlacMammal.wig hgWiggle phyloP28wayChr7 | textHistogram -col=2 -real -skip=7 -binSize=.2 0.000000 **************** 26649187 0.200000 ************************************************************ 101774235 0.400000 ********** 16325655 0.600000 *** 4331032 0.800000 * 1029490 1.000000 0 1.200000 456666 1.400000 0 1.600000 240876 1.800000 0 2.000000 246969 2.200000 0 2.400000 0 2.600000 0 2.800000 134764 cd ../placental hgWiggle phyloP28wayPlacMammalChr7 | textHistogram -col=2 -real -skip=7 -binSize=.2 stdin cd ../../all hgWiggle phastCons28wayChr7Short | textHistogram -col=2 -real -skip=7 -binSize=.1 stdin 0.000000 ************************************************************ 128445730 0.100000 **** 7648620 0.200000 ** 3473415 0.300000 * 1986801 0.400000 * 1399849 0.500000 * 1096292 0.600000 912539 0.700000 893991 0.800000 1008630 0.900000 * 2940535 1.000000 * 1383115 ############################################################################ # PhyloP experiments with new scoring methods: LRT and SCORE, implemented in 2008 # Using new PHAST package (rebuilt from cornellCVS) # chr7-only # 2008-11-11 kate ############################################################################ # DOWNLOADS FOR 28WAY (2007-05-30 kate) ssh kkstore02 cd /cluster/data/hg18/bed/multiz28way cat > downloads.csh << 'EOF' date set dir = /cluster/data/hg18/bed/multiz28way mkdir -p mafDownloads cd $dir/mafDownloads foreach f (../maf/chr*.maf) set c = $f:t:r echo $c nice gzip -c $f > $c.maf.gz end md5sum *.gz > md5sum.txt cd $dir mkdir 
-p phastConsDownloads/vertebrate phastConsDownloads/placental
cd /san/sanvol1/scratch/hg18/multiz28way/cons
foreach chr (`awk '{print $1}' /cluster/data/hg18/chrom.sizes`)
    echo $chr
    cat `ls -1 all/pp/$chr/$chr.*.pp | sort -t\. -k2,2n` \
        | nice gzip -c \
        > $dir/phastConsDownloads/vertebrate/$chr.pp.gz
    cat `ls -1 placental/pp/$chr/$chr.*.pp | sort -t\. -k2,2n` \
        | nice gzip -c \
        > $dir/phastConsDownloads/placental/$chr.pp.gz
end
cd /cluster/data/hg18/bed/multiz28way/phastConsDownloads/vertebrate
md5sum *.gz > md5sum.txt
cd ../placental
md5sum *.gz > md5sum.txt
date
'EOF'
csh downloads.csh >&! downloads.log &
# << happy emacs

ssh hgwdev
set dir = /usr/local/apache/htdocs/goldenPath/hg18/phastCons28way
mkdir -p $dir/vertebrate $dir/placental
ln -s /cluster/data/hg18/bed/multiz28way/phastConsDownloads/vertebrate/{*.gz,md5sum.txt} $dir/vertebrate
ln -s /cluster/data/hg18/bed/multiz28way/phastConsDownloads/placental/{*.gz,md5sum.txt} $dir/placental
cp /usr/local/apache/htdocs/goldenPath/hg18/phastCons17way/README.txt $dir
# edit this file to reflect the latest releases used.
vi $dir/README.txt

set dir = /usr/local/apache/htdocs/goldenPath/hg18/multiz28way/maf
mkdir $dir
ln -s /cluster/data/hg18/bed/multiz28way/mafDownloads/{*.gz,md5sum.txt} $dir

# upstream mafs (mafFrags takes a while)
ssh hgwdev
cd /cluster/data/hg18/bed/multiz28way/mafDownloads
cat > mafFrags.csh << 'EOF'
date
foreach i (1000 2000 5000)
    echo "making upstream$i.maf"
    nice featureBits hg18 refGene:upstream:$i -fa=/dev/null -bed=up.bad
    cat up.bad|sed -e "s/_up_${i}_/\t/" >up.bad2
    awk -F '\t' '{printf("%s\t%s\t%s\t%s\t%s\t%s\n", $1, $2, $3, $4, 0, $6)}' up.bad2 > up.bed
    rm up.bad up.bad2
    nice mafFrags hg18 multiz28way up.bed upstream$i.maf \
        -orgs=/cluster/data/hg18/bed/multiz28way/species.lst
    rm up.bed
end
date
'EOF'
# << happy emacs
ssh kkstore02
cd /cluster/data/hg18/bed/multiz28way/mafDownloads
csh mafFrags.csh > mafFrags.log &
nice gzip up*.maf
md5sum up*.gz >> md5sum.txt

ssh hgwdev
cd /cluster/data/hg18/bed/multiz28way
# link filtered nets and chains to downloads area (doRecipBest.pl could
# be changed for this)
# Species where syntenic net was used
foreach db (panTro2 rheMac2 equCab1 canFam2 bosTau3 mm8 rn4 monDom4)
    echo $db
    set cd = /cluster/data/hg18/bed/blastz.$db/axtChain
    cd $cd
    set Db = `echo $db | perl -wpe 's/(.*)/\u$1/'`
    set f = hg18.$db.syn.net.gz
    if (! -e $f) then
        netFilter -syn hg18.$db.net.gz > $f
    endif
    set d = /usr/local/apache/htdocs/goldenPath/hg18/vs$Db
    ln -s $cd/$f $d
    nice md5sum $f >> $d/md5sum.txt
end

# Create downloads dir for new species without genome databases
#foreach db (tupBel1 cavPor2 eriEur1 sorAra1)
# NOTE: Keeping these only on genome-test for now.
foreach db (tupBel1 cavPor2 eriEur1 sorAra1)
    echo $db
    set Db = `echo $db | perl -wpe 's/(.*)/\u$1/'`
    set d = /usr/local/apache/htdocs/goldenPath/hg18
    mkdir -p $d/vs$Db
    cp $d/vsOryCun1/README.txt $d/vs$Db
    set bd = /cluster/data/hg18/bed/blastz.$db
    cd $bd/axtChain
    set f = hg18.$db.net.gz
    if (! -e $f) then
        cat net/*.net | gzip -c > $f
    endif
    nice md5sum hg18.$db.{all.chain,net}.gz > md5sum.txt
    cd ..
    nice md5sum axtNet/*.gz >> axtChain/md5sum.txt
    ln -s $bd/axtChain/hg18.$db.{all.chain,net}.gz $d/vs$Db
    ln -s $bd/axtChain/md5sum.txt $d/vs$Db
    ln -s $bd/axtNet $d/vs$Db
end
# EDIT README's for the species

# Post reciprocal best nets
# NOTE: Keeping these only on genome-test for now.
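# (downloads4.csh below symlinks the reciprocal-best chains and nets produced
#  by the doRecipBest.pl runs above into each species' vs<Db> downloads
#  directory and appends their md5 checksums.)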
cat > downloads4.csh << 'EOF' foreach db (felCat3 otoGar1 loxAfr1 oryCun1 echTel1 dasNov1 \ tupBel1 cavPor2 eriEur1 sorAra1) echo $db set Db = `echo $db | perl -wpe 's/(.*)/\u$1/'` set d = /usr/local/apache/htdocs/goldenPath/hg18/vs$Db set cd = /cluster/data/hg18/bed/blastz.$db/axtChain ln -s $cd/hg18.$db.rbest.{chain,net}.gz $d cd $d md5sum hg18.$db.rbest.{chain,net}.gz >> md5sum.txt end 'EOF' # EDIT README's to include reciprocal best chains & nets ############################################################################ # 28-way PhyloP downloads # 2008-10-21 kate ssh kolossus cd /san/sanvol1/scratch/hg18/multiz28way/phyloP cat > merge.csh << 'EOF' set out = $1 rm -f *.lst foreach c (`ls -d chr*`) echo $c touch $c.lst foreach d (`ls -d $c/[1-9]* | sort -t/ -k2 -n`) ls -1 $d/*.wig | sort -n -t\. -k3 >> $c.lst xargs < $c.lst cat > $out/$c.wigFix end end 'EOF' # all species cd all csh ../merge.csh /cluster/data/hg18/bed/multiz28way/cons/phyloP/all > merge.log cd ../placental csh ../merge.csh /cluster/data/hg18/bed/multiz28way/cons/phyloP/placental > merge.log cd /cluster/data/hg18/bed/multiz28way/cons/phyloP # post to downloads cd /usr/local/apache/htdocs/goldenPath/hg18 mkdir phyloP28way cd phyloP28way ln -s /cluster/data/hg18/bed/multiz28way/cons/phyloP/{all,placental} . cd all nice gzip $out/$c.wigFix cd ../placental nice gzip $out/$c.wigFix ############################################################################ # Reload CCDS (ccdsInfo, ccdsGene, ccdsKgMap) (2007-03-02 markd) # see hg17.txt for build temporary ccds database for CCDS.20070228 # create and load ccdsGene and ccdsInfo tables from imported database /cluster/data/genbank/bin/x86_64/ccdsMkTables -loadDb ccds hg18 ccdsInfo ccdsGene /cluster/data/genbank/bin/x86_64/mkCcdsGeneMap -db=hg18 -loadDb ccdsGene knownGene ccdsKgMap checkTableCoords hg18 -verbose=2 ccdsGene # update all.jointer to include hg18 in ccdsDb joinerCheck -database=hg18 -identifier=ccdsGeneId ~/compbio/genbank/kent/src/hg/makeDb/schema/all.joiner # build initial version of ccdsMgcMap table, updated by nightly genbank update /cluster/data/genbank/bin/x86_64/mkCcdsGeneMap -loadDb -db=hg18 ccdsGene mgcGenes ccdsMgcMap # load trackDb cd kent/src/hg/makeDb/trackDb make alpha # check in browser # request push of ccdsGene ccdsInfo ccdsKgMap ccdsMgcMap # << emacs ######################################################################### # RECIPROCAL BEST CHIMP PANTRO2 (2007-03-02 kate) # Requested by Daryl cd /cluster/data/hg18/bed/blastz.panTro2 doRecipBest.pl hg18 panTro2 >&! rbest.log & ######################################################################### # EPONINE-TSS (TRANSCRIPTON START SITE) PREDICTION # (DONE, 2007-03-08, hartera) # The Eponine software is version 2 and has not changed in several years # (contact: Thomas Down at Sanger, td2 at sanger.ac.uk). The version downloaded # for hg16 should be the same as the current version but download again just # to check. The application includes the TSS model file: eponine-tss2.xml ssh kkstore02 # Eponine runs fine on 2.5Mb contig, but barfs on much larger contig; # chop up sequence at gaps into ~2.5Mb chunks for cluster run. 
mkdir /san/sanvol1/scratch/hg18/chunks cd /cluster/data/hg18 foreach f (?{,?}/NT_*/NT_??????.fa) set ctg = $f:t:r /cluster/bin/x86_64/faSplit -minGapSize=10 \ -lift=/san/sanvol1/scratch/hg18/chunks/${ctg}.lft \ gap $f 2500000 /san/sanvol1/scratch/hg18/chunks/${ctg}.chunk end # seems to ignore the chunk part of the file name mkdir /cluster/data/hg18/bed/eponine cd /cluster/data/hg18/bed/eponine wget --timestamping \ http://www.sanger.ac.uk/Software/analysis/eponine/eponine-scan.jar # file has the same date and same size as the one downloaded for hg16 # the script requires all of the path setting found in my .tcshrc file. # Using only set path = (/usr/java/jre1.5.0_06/bin $path) # as in the doEpo file for hg16 does not work. cat << '_EOF_' > doEpo #!/bin/csh -ef set path = (/usr/java/jre1.5.0_06/bin /bin /usr/bin /usr/X11R6/bin \ /usr/local/bin . /cluster/home/hartera/bin/x86_64 \ /cluster/bin/x86_64 /projects/compbio/bin/x86_64 \ /projects/compbio/bin /projects/compbio/bin/x86_64-linux \ /cluster/bin/scripts) java -jar ./eponine-scan.jar -threshold 0.999 -seq $1 > $2 '_EOF_' chmod a+x doEpo cp /dev/null jobList foreach f (/san/sanvol1/scratch/hg18/chunks/NT*.fa) echo "./doEpo {check in line+ $f} {check out exists out/$f:t:r.gff}" \ >> jobList end mkdir out ssh pk cd /cluster/data/hg18/bed/eponine /parasol/bin/para create jobList /parasol/bin/para try, check, push, check etc..... /parasol/bin/para time # Completed: 1408 of 1408 jobs # CPU time in finished jobs: 105248s 1754.13m 29.24h 1.22d 0.003 y # IO & Wait Time: 4369s 72.82m 1.21h 0.05d 0.000 y # Average job time: 78s 1.30m 0.02h 0.00d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 104s 1.73m 0.03h 0.00d # Submission to last job: 1295s 21.58m 0.36h 0.01d # lift chunks -> contigs mkdir contigs/ foreach l (/san/sanvol1/scratch/hg18/chunks/*.lft) set ctg = $l:t:r liftUp contigs/$ctg.gff $l warn out/${ctg}*.gff end # lift contigs -> chrom liftUp eponine.gff /cluster/data/hg18/jkStuff/liftAll.lft \ warn contigs/NT_*.gff # Translate to bed 4 + float-score -- it would be a shame to lose # those scores in genePred or bed 5 (int score) awk 'BEGIN {i=0;} \ {printf "%s\t%d\t%d\t%s.%d\t%s\t%s\n", $1, $4-1, $5, $1, i, $6, $7; \ i = i + 1;}' \ eponine.gff > eponine.bed # load up ssh hgwdev cd /cluster/data/hg18/bed/eponine sed -e 's/bed6FloatScore/eponine/g' \ $HOME/kent/src/hg/lib/bed6FloatScore.sql > eponine.sql hgLoadBed hg18 eponine eponine.bed -tab -sqlTable=eponine.sql # Loaded 61359 elements of size 6 # trackDb.ra entry and eponine.html already exist in trackDb directory. ########################################################################### # ACEScan Track (DONE 2007-03-15 Andy ssh hgwdev cd /cluster/data/hg18/bed mkdir acescan cd acescan/ cp /cluster/data/hg17/bed/acescan/acescan.hg17.gp . 
liftOver -genePred acescan.hg17.gp /gbdb/hg17/liftOver/hg17ToHg18.over.chain.gz \ acescan.hg18.gp unmapped ldHgGene -predTab hg18 acescan acescan.hg18.gp ############################################################################## # Update central DB gdbPdb table in preparation for KG III (DONE 3/22/07, Fan) mysql -u hgcat -p$HGPSWD -h genome-testdb -A hgcentraltest update gdbPdb set proteomeDb = "proteins070202" where genomeDb = "hg18"; quit ############################################################################## # UPDATE CGAP TABLES (DONE, 3/26/07, Fan) cd /cluster/data/hg18/bed/ucsc.10 mkdir cgap cd cgap wget --timestamping -O Hs_GeneData.dat "ftp://ftp1.nci.nih.gov/pub/CGAP/Hs_GeneData.dat" hgCGAP Hs_GeneData.dat cat cgapSEQUENCE.tab cgapSYMBOL.tab cgapALIAS.tab|sort -u > cgapAlias.tab hgLoadSqlTab hg18 cgapAlias ~/kent/src/hg/lib/cgapAlias.sql ./cgapAlias.tab hgLoadSqlTab hg18 cgapBiocPathway ~/kent/src/hg/lib/cgapBiocPathway.sql ./cgapBIOCARTA.tab cat cgapBIOCARTAdesc.tab|sort -u > cgapBIOCARTAdescSorted.tab hgLoadSqlTab hg18 cgapBiocDesc ~/kent/src/hg/lib/cgapBiocDesc.sql cgapBIOCARTAdescSorted.tab ############################################################################## # UPDATE CGAP TABLES (DONE, 8/05/08, JK) cd /cluster/data/hg18/bed/ucsc.11 mkdir cgap cd cgap wget --timestamping -O Hs_GeneData.dat "ftp://ftp1.nci.nih.gov/pub/CGAP/Hs_GeneData.dat" hgCGAP Hs_GeneData.dat cat cgapSEQUENCE.tab cgapSYMBOL.tab cgapALIAS.tab|sort -u > cgapAlias.tab hgLoadSqlTab hg18 cgapAlias ~/kent/src/hg/lib/cgapAlias.sql ./cgapAlias.tab hgLoadSqlTab hg18 cgapBiocPathway ~/kent/src/hg/lib/cgapBiocPathway.sql ./cgapBIOCARTA.tab cat cgapBIOCARTAdesc.tab|sort -u > cgapBIOCARTAdescSorted.tab hgLoadSqlTab hg18 cgapBiocDesc ~/kent/src/hg/lib/cgapBiocDesc.sql cgapBIOCARTAdescSorted.tab ############################################################################## ## BLASTZ HUMAN HG18 (DONE - 2007-03-26 - Hiram) ssh kkstore02 mkdir /cluster/data/hg18/bed/blastz.braFlo1.2007-03-26 cd /cluster/data/hg18/bed/blastz.braFlo1.2007-03-26 cat << '_EOF_' > DEF # human vs lancelet BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=6000 BLASTZ_K=2200 BLASTZ_Q=/cluster/data/blastz/HoxD55.q # TARGET: Human Hg18 SEQ1_DIR=/san/sanvol1/scratch/hg18/hg18.sdTrf.2bit SEQ1_LEN=/cluster/data/hg18/chrom.sizes SEQ1_CHUNK=10000000 SEQ1_LAP=10000 SEQ1_LIMIT=1 # QUERY: Lancelet braFlo1 - largest chunk big enough for largest scaffold # Largest scaffold 7,200,735 - 3032 scaffolds + chrM SEQ2_DIR=/san/sanvol1/scratch/braFlo1/braFlo1.2bit SEQ2_LEN=/san/sanvol1/scratch/braFlo1/chrom.sizes SEQ2_CTGDIR=/san/sanvol1/scratch/braFlo1/braFlo1UnScaffolds.2bit SEQ2_CTGLEN=/san/sanvol1/scratch/braFlo1/braFlo1UnScaffolds.sizes SEQ2_LIFT=/san/sanvol1/scratch/braFlo1/braFlo1.lift SEQ2_CHUNK=20000000 SEQ2_LIMIT=30 SEQ2_LAP=0 BASE=/cluster/data/hg18/bed/blastz.braFlo1.2007-03-26 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs time doBlastzChainNet.pl DEF -chainMinScore=2000 -chainLinearGap=loose \ -tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \ -bigClusterHub=pk -verbose=2 \ -blastzOutRoot /cluster/bluearc/hg18BraFlo1 > do.log 2>&1 & # real 458m43.961s cat fb.hg18.chainBraFlo1Link.txt # 26455595 bases of 2881515245 (0.918%) in intersection # test reciprocal best chains/nets for 5-way maf alignments # on braFlo1, this did not work right there ssh hgwdev cd /cluster/data/hg18/bed/blastz.braFlo1.2007-03-26 time nice -n +19 /cluster/bin/scripts/doRecipBest.pl hg18 braFlo1 \ > rbest.log 2>&1 & # real 105m14.176s # and now the swap, also documented in 
braFlo1.txt
mkdir /cluster/data/braFlo1/bed/blastz.hg18.swap
cd /cluster/data/braFlo1/bed/blastz.hg18.swap
time doBlastzChainNet.pl -chainMinScore=2000 -chainLinearGap=loose \
    /cluster/data/hg18/bed/blastz.braFlo1.2007-03-26/DEF \
    -tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
    -bigClusterHub=pk -verbose=2 \
    -swap > swap.log 2>&1 &
# real 83m46.258s
cat fb.braFlo1.chainHg18Link.txt
# 30912893 bases of 923355587 (3.348%) in intersection
##############################################################################
# RE-BUILD knownGeneList (DONE, 3/29/07, Fan)
cd /cluster/data/hg18/bed
rm -rf knownGeneList/hg18
# Run hgKnownGeneList to generate the tree of HTML pages
# under ./knownGeneList/hg18
hgKnownGeneList hg18
# copy over to /usr/local/apache/htdocs
rm -rf /usr/local/apache/htdocs/knownGeneList/hg18
mkdir -p /usr/local/apache/htdocs/knownGeneList/hg18
cp -Rfp knownGeneList/hg18/* /usr/local/apache/htdocs/knownGeneList/hg18
##############################################################################
# Update entrez DB tables.
cd /cluster/store10/entrez
mkdir 070329
ln -s /cluster/store10/entrez/070329 /cluster/data/entrez/070329
cd /cluster/data/entrez/070329
wget --timestamping ftp://ftp.ncbi.nlm.nih.gov/gene/DATA/gene2accession.gz
wget --timestamping ftp://ftp.ncbi.nlm.nih.gov/gene/DATA/gene_info.gz
wget --timestamping ftp://ftp.ncbi.nlm.nih.gov/gene/DATA/gene2refseq.gz
gzip -d *.gz
cut -f 2,4 gene2accession | sort -u | grep -v "-" | grep -v "NM_" | sed -e 's/\./\t/g'|cut -f 1-2 > entrezMrna.tab
cut -f 2,4 gene2refseq | grep "NM_"| sort -u | grep -v "-" | sed -e 's/\./\t/g' > entrezRefseq.tab
cut -f 2,4 gene2refseq | grep "NM_"| sort -u | grep -v "-" | sed -e 's/\./\t/g'|cut -f 1-2 > entrezRefseq.tab
cut -f 2,4,6 gene2accession | grep "NM_"| grep "NP_"|sort -u | sed -e 's/\./\t/g'|cut -f 1,2,4 > entrezRefProt.tab
hgLoadSqlTab entrez entrezRefseq ~/src/hg/lib/entrezRefseq.sql ./entrezRefseq.tab
hgLoadSqlTab entrez entrezMrna ~/src/hg/lib/entrezMrna.sql ./entrezMrna.tab
hgLoadSqlTab entrez entrezRefProt ~/src/hg/lib/entrezRefProt.sql ./entrezRefProt.tab
cd /cluster/data/hg18/bed/ucsc.10
hgsql entrez -N -e \
    'select mrna, refseq from entrezRefseq, entrezMrna, hg18.all_mrna where qName=mrna and entrezRefseq.geneID=entrezMrna.geneID' \
    >mrnaRefseq1.tab
# Include RefSeq as valid mRNA too.
hgsql hg18 -N -e 'select name, name from refGene' >mrnaRefseq2.tab
cat mrnaRefseq1.tab mrnaRefseq2.tab |sort -u >mrnaRefseq.tab
hgLoadSqlTab hg18 mrnaRefseq ~/src/hg/lib/mrnaRefseq.sql ./mrnaRefseq.tab
##############################################################################
# RE-BUILD KEGG RELATED TABLES FOR KG III. (DONE, 3/29/07, Fan)
wget --timestamping -O hsa.html \
    "http://www.genome.ad.jp/dbget-bin/www_bfind_sub?dbkey=pathway&keywords=hsa&mode=bfind&max_hit=1000&.cgifields=max_hit"
grep href hsa.html | perl -wpe "s/<[^>]+>//g" > hsa.lis
# edit hsa.lis to remove the first (blank) line and the last line, which is unrelated.
~/kent/src/hg/protein/getKeggList2.pl hsa > keggList.tab
hgLoadSqlTab hg18 keggList ~/src/hg/lib/keggList.sql ./keggList.tab
# Before running hgKegg3, make sure entrez DB is updated.
hgKegg3 hg18 hg18
# Load resulting data
hgLoadSqlTab hg18 keggPathway ~/src/hg/lib/keggPathway.sql ./keggPathway.tab
hgLoadSqlTab hg18 keggMapDesc ~/src/hg/lib/keggMapDesc.sql ./keggMapDesc.tab
##############################################################################
# REATTACH KEGG TO KNOWN GENES.
# (DONE, 8/12/08, JK)
mkdir -p /cluster/data/hg18/bed/ucsc.11/kegg
cd /cluster/data/hg18/bed/ucsc.11/kegg
kgAttachKegg hg18 ../../ucsc.10/kegg/keggList.tab keggPathway.tab
hgLoadSqlTab hg18 keggPathway ~/src/hg/lib/keggPathway.sql ./keggPathway.tab
##############################################################################
# REATTACH SPMRNA TABLE TO KNOWN GENES. (DONE, 8/12/08, JK)
hgsql hg18 -N -e "select spDisplayID,kgID from kgXref where spDisplayID != ''" > spMrna.tab
hgLoadSqlTab hg18 spMrna ~/kent/src/hg/lib/spMrna.sql spMrna.tab
##############################################################################
# UPDATE BIOCYCTABLES NEEDED BY hgGene (DONE 3/27/07 Fan)
# First register with BioCyc to download their HumanCyc database.
# The site will email you the URL for the download.
wget --timestamping \
    http://bioinformatics.ai.sri.com/ecocyc/dist/flatfiles-52983746/humancyc-flatfiles.zip
unzip humancyc-flatfiles.zip
cp genes.col genes.tab
cp pathways.col pathways.tab
# delete the first 20 or so header lines from these two files.
vi genes.tab
vi pathways.tab
hgsql hg18 -e 'create database bioCyc070327'
hgLoadSqlTab bioCyc070327 genes ~/src/hg/lib/bioCycGenes.sql ./genes.tab
hgLoadSqlTab bioCyc070327 pathways ~/src/hg/lib/bioCycPathways.sql ./pathways.tab
# Create bioCycMapDesc.tab
hgsql bioCyc070327 -N -e 'select UNIQUE_ID, NAME from pathways' |sort -u > bioCycMapDesc.tab
# Create bioCycPathway.tab
kgBioCyc0 bioCyc070327 hg18 hg18
hgLoadSqlTab hg18 bioCycPathway ~/kent/src/hg/lib/bioCycPathway.sql ./bioCycPathway.tab
hgLoadSqlTab hg18 bioCycMapDesc ~/kent/src/hg/lib/bioCycMapDesc.sql ./bioCycMapDesc.tab
##########################################################################
# PARTIAL UPDATE OF BIOCYCTABLES NEEDED BY hgGene (DONE 8/05/08 JK)
# Note: ideally we would get new data from BioCyc, but they never sent the
# URL for the files even though I filled out their web form a week ago, so
# this reuses the 3/27/07 pathways.col and genes.col files. I did write a new
# kgBioCyc1 to do the actual load, and it is used on the new UCSC genes. It
# looks to be a slight improvement: about 10% more genes in pathways.
mkdir /cluster/data/hg18/bed/ucsc.11/bioCyc
cd /cluster/data/hg18/bed/ucsc.11/bioCyc
grep -v '^#' /cluster/data/hg18/bed/ucsc.10/bioCyc/pathways.col > pathways.tab
grep -v '^#' /cluster/data/hg18/bed/ucsc.10/bioCyc/genes.col > genes.tab
kgBioCyc1 genes.tab pathways.tab hg18 bioCycPathway.tab bioCycMapDesc.tab
hgLoadSqlTab hg18 bioCycPathway ~/kent/src/hg/lib/bioCycPathway.sql ./bioCycPathway.tab
hgLoadSqlTab hg18 bioCycMapDesc ~/kent/src/hg/lib/bioCycMapDesc.sql ./bioCycMapDesc.tab
###########################################################################
# SwitchDB TSS Track (DONE 2007-04-12 Andy)
ssh hgwdev
mkdir /cluster/data/hg18/bed/switchDbTss
cd /cluster/data/hg18/bed/switchDbTss
ln -s /cluster/data/hg17/bed/switchDbTss/switchDbTss.bed hg17.bed
liftOver -bedPlus=5 hg17.bed /gbdb/hg17/liftOver/hg17ToHg18.over.chain.gz hg18.bed unMapped
wc -l unMapped
#12 unMapped (12 are "deleted in new")
ln -s ~/kent/src/hg/lib/switchDbTss.sql
hgLoadBed -sqlTable=switchDbTss.sql hg18 switchDbTss hg18.bed
###########################################################################
# ADD KG TO TREEFAM LINKS (DONE, 2007-04-13 Fan)
# Generate ucscToEnsembl.txt and send it to TreeFam
# zhongzhongchen [chenzhzh at genomics.org.cn]
hgsql hg18 -N -e 'select * from knownToEnsembl' >ucscToEnsembl.txt
ssh hgwdev
cd /cluster/store12
mkdir treeFam070413
ln -s /cluster/store12/treeFam070413 /cluster/data/treeFam
cd /cluster/data/treeFam
# Receive the following files from TreeFam
ucscToEnsemblToTreefamToRefToUniprot.txt
ucscToEnsemblToTreefamToRef.txt
ucscToEnsemblTotreefam.txt
# Use ucscToEnsemblTotreefam.txt to construct knownToTreefam table.
cut -f 1,3 ucscToEnsemblTotreefam.txt >knownToTreefam.tab
hgLoadSqlTab hg18 knownToTreefam \
    ~/src/hg/lib/knownToTreefam.sql ./knownToTreefam.tab
# Add the following section into kent/src/hg/hgGene/hgGeneData/links.ra
name treeFam
shortLabel Treefam
tables knownToTreefam
idSql select value from knownToTreefam where name = '%s';
url http://www.treefam.org/cgi-bin/TFinfo.pl?ac=%s
priority 10
###########################################################################
# BLASTZ/CHAIN/NET HORSE (equCab1) (STARTED 2/16/07, DONE 2/21/07, Fan)
ssh kkstore05
mkdir /cluster/data/equCab1/bed/blastz.hg18.2007-02-15
cd /cluster/data/equCab1/bed/blastz.hg18.2007-02-15
# NOTE: THE TARGET WAS ORIGINALLY INTENDED TO BE HORSE, BUT I DID NOT
# DISCOVER THIS UNTIL THE TASK WAS DONE.
cat << '_EOF_' > DEF
# Horse vs.
Human BLASTZ_M=50 # TARGET: Human Hg18 SEQ1_DIR=/scratch/hg/hg18/nib SEQ1_LEN=/cluster/data/hg18/chrom.sizes SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY: Horse equCab1 SEQ2_DIR=/san/sanvol1/scratch/equCab1/equCab1.2bit SEQ2_LEN=/san/sanvol1/scratch/equCab1/chrom.sizes # Maximum number of scaffolds that can be lumped together SEQ2_LIMIT=500 SEQ2_CHUNK=30000000 SEQ2_LAP=0 BASE=/cluster/data/equCab1/bed/blastz.hg18.2007-02-15 TMPDIR=/scratch/tmp '_EOF_' # << this line keeps emacs coloring happy doBlastzChainNet.pl DEF \ -bigClusterHub pk \ -chainMinScore=3000 -chainLinearGap=medium \ -blastzOutRoot /cluster/bluearc/equCab1/blastz.hg18 >& do.log & tail -f do.log ln -s blastz.hg18.2007-02-15 /cluster/data/hg18/bed/blastz.equCab1 nice featureBits hg18 -chrom=chr1 chainEquCab1Link # 132947074 bases of 224999719 (59.088%) in intersection ssh hgwdev cd /cluster/data/equCab1/bed/blastz.hg18.2007-02-15 bash time nice -n 19 featureBits hg18 chainEquCab1Link \ > fb.hg18.chainEquCab1Link.txt 2>&1 & # 1643928877 bases of 2881515245 (57.051%) in intersection ######################################################################### # enable ORFeome track build. (markd 2007-05-02) cd ~/kent/src/hg/makeDb/genbank cvs update -d etc # edit etc/genbank.conf to add hg18.orfeomeTables.hgwdev = yes hg18.orfeomeTables.hgwbeta = yes # will need to enable for rr later. In the future, this can just be enabled # as part the normal genbank build. Change above to: hg18.orfeomeTables.default = yes ######################################################################### # exaptedRepeats track (4/30/07, Craig) # for full methods an analysis see: Lowe, Bejerano, Haussler. # Thousands of human mobile element fragments undergo # strong purifying selection near developmental genes. # PNAS. (in press). Epub 2007 Apr 26. # # Code to re-make this track is in: # build36/bed/exapted/create.csh # # To re-make the track all you have to do is run that c-shell # while you are in its directory. # It is easiest if you are on hgwdev since it uses featureBits a few times # and gets some info from the sql database. I would say it takes # about two hours to run. # ####################################################################### # UCSC GENES (DONE 2007-03-xx kent) see file: ucscGenes10.txt ####################################################################### # ENCODE Regulation track (DONE June 2010. DNAse and TFBS redone April 30 # 2011 kent) #make root dir mkdir -p /cluster/data/hg18/bed/wgEncodeReg cd /cluster/data/hg18/bed/wgEncodeReg # Create the DNAse peak clusters subtrack. # Get all of the narrowPeak format files for the wgEncodeUwDnaseSeq # linked into directory /hive/users/kent/regulate/dnase/peaks mkdir dnase cd dnase mkdir peaks ln -s /hive/groups/encode/dcc/analysis/ftp/pipeline/hg18/wgEncodeUwDnase/*.narrowPeak.gz peaks # Process these into clusters in a bed file and load clusters into # table. /bin/ls -1 peaks/*.narrowPeak.gz > peak.lst regClusterMakeTableOfTables uw01 peak.lst peak.table regCluster peak.table /dev/null peak.bed awk '$4 > 1 || $5 >= 100' peak.bed > wgEncodeRegDnaseClustered.bed hgLoadBed hg18 wgEncodeRegDnaseClustered wgEncodeRegDnaseClustered.bed # Make wgEncodeRegDnaseClusteredInput table. Start with mdbQuery, and # then do some massaging since not completely in sync with file list. 
mdbQuery out=tab "select obj,cell,treatment,replicate,lab,dateUnrestricted from hg18 where obj like 'wgEncodeUwDnase%' and view='Peaks'" | sed 's/n\/a/None/' > inputMdb.tab cut -f 1 peak.table | sed 's/\.narrowPeak\.gz//' | sed 's/peaks\///' > inputs.lst weedLines inputs.lst inputMdb.tab wgEncodeRegDnaseClusteredInputs.tab -invert hgLoadSqlTab hg18 wgEncodeRegDnaseClusteredInputs ~/kent/src/hg/lib/clusterInputDnase.sql \ wgEncodeRegDnaseClusteredInputs.tab # Create the Transcription Factor Binding Site subtrack. This is a bit # complex because it is merging data from the Snyder lab (yale) and from # HudsonAlpha (hud), and the hud data has replicates while the yale # data does not. # Create hud/replicates directory full of gzipped narrow peak files, # converting broad peak files as needed. mkdir -p /cluster/data/hg18/bed/wgEncodeReg/tfbs/hud/replicates cd /hive/groups/encode/dcc/analysis/ftp/pipeline/hg18/wgEncodeHudsonalphaChipSeq foreach i (*.narrowPeak*.gz) cp $i /cluster/data/hg18/bed/wgEncodeReg/tfbs/hud/replicates end foreach i (*.broadPeak*.gz) zcat $i | awk '{printf("%s\t%d\n", $0, ($3-$2)/2);}' > \ /cluster/data/hg18/bed/wgEncodeReg/tfbs/hud/replicates/$i:r:r.narrowPeak end cd /cluster/data/hg18/bed/wgEncodeReg/tfbs/hud/replicates gzip *.narrowPeak # Get ra file that includes the file name and other info we need for hud cd /cluster/data/hg18/bed/wgEncodeReg/tfbs/hud mkdir hud cd hud mdbQuery "select tableName,fileName,antibody,cell,replicate,treatment,lab from hg18 \ where view='Peaks' and lab='HudsonAlpha' and dataType='ChipSeq'" \ -out=ra | sed s/broadPeak/narrowPeak/ > hud.ra # Generate merged dir full of merged replicates. mkdir merged encodeMergeReplicatesBatch hud.ra replicates merge.sh merged.ra merged chmod a+x merge.sh merge.sh # Generate symbolic links to directories containing pooled peak files # for yale and for hud cd /cluster/data/hg18/bed/wgEncodeReg/tfbs ln -s /cluster/data/hg18/bed/wgEncodeReg/tfbs/hud/merged hudPeaks ln -s /hive/groups/encode/dcc/analysis/ftp/pipeline/hg18/wgEncodeYaleChIPseq yalePeaks # Get ra file that includes the file name and other info we need for yale mdbQuery "select tableName,fileName,antibody,cell,treatment,lab from hg18 where lab like '%Snyder%' and dataType like 'ChipSeq' and view='Peaks'" out=ra | sed 's/wgEncode/yalePeaks\/wgEncode/' > yale.ra # Get ra file for hud cat hud/merged.ra | sed 's/wgEncode/hudPeaks\/wgEncode/' > hud.ra # Combine both ra files and convert to three column tab-separated cat yale.ra hud.ra | raToTab stdin bothLabs.tab -cols=fileName,cell,antibody # Set up config file for clustering job. This includes calculating some # normalization factors for the score. The cellLetter.tab assigns # letters to cell lines, and is created by hand in the source tree. cp ~/kent/src/hg/regulate/regClusterBedExpCfg/cellLetter.tab . 
regClusterBedExpCfg -tabList bothLabs.tab bothLabs.cfg -cellLetter=cellLetter.tab # Do the actual clustering and load results into database hgBedsToBedExps -dupeLetterOk bothLabs.cfg peak.bed peak.exps awk '$2 != $3' peak.bed > filtered.bed hgLoadBed hg18 wgEncodeRegTfbsClustered filtered.bed hgLoadSqlTab hg18 wgEncodeRegTfbsCells ~/kent/src/hg/lib/expRecord.sql peak.exps # Create inputTrackTable - three columns: #mdbQuery "select tableName,cell,antibody from hg18 where (lab like '%Snyder%' or lab='HudsonAlpha') and dataType like 'ChipSeq' and view='Peaks' and antibody not like 'Pol2%'" -out=tab > wgEncodeRegTfbsClusteredInputs.tab cat yale.ra hud.ra | raToTab stdin stdout -cols=tableName,cell,antibody,cell,treatment,lab \ | awk 'BEGIN {OFS="\t";} {if ($5 != "None") $2=$2 "+" $5;print}' \ > wgEncodeRegTfbsClusteredInputs.tab hgLoadSqlTab hg18 wgEncodeRegTfbsClusteredInputs ~/kent/src/hg/lib/clusterInputTrackTable4.sql wgEncodeRegTfbsClusteredInputs.tab ####################################################################### # CGAP SAGE (DONE 2007-04-17 Andy) ssh hgwdev bash mkdir /san/sanVol1/scratch/andy/cgapSage cd /san/sanVol1/scratch/andy/cgapSage echo "select * from cgapSageLib" | hgsql hg18 | tail +2 > libs.txt echo "select * from snp127 where class='single' and locType='exact'" | hgsql hg18 | tail +2 | cut -f2- > allSnpss.txt echo "select name from snp127Exceptions where exception='ObservedWrongSize' or exception='SingleClassBetweenLocType' or exception='SingleClassRangeLocType' or exception='MultipleAlignment'" | hgsql hg18 | tail +2 > exceptions tabGrep -v exceptions 4 allSnps.txt > snps.txt rm allSnps.txt exceptions echo select chrom,chromStart,chromEnd,name from simpleRepeat | hgsql hg18 | tail +2 > trf.bed cut -f1-4 snps.txt > snps.bed overlapSelect -selectFmt=bed -inFmt=bed -nonOverlapping trf.bed snps.bed snps.noTrf.bed cut -f4 snps.noTrf.bed > snps.noTrf tabGrep snps.noTrf 4 snps.txt > snps.noTrf.txt mv snps.noTrf.txt snps.txt grep -v random /cluster/data/hg18/chrom.sizes | grep -v hap > chrom.sizes mkdir chromSnps for c in `cat chrom.sizes | cut -f1`; do awk "{if (\$1==\"$c\") print;}" snps.txt > chromSnps/$c.snps.txt; echo $c; done rm snps.txt wget ftp://ftp1.nci.nih.gov/pub/SAGE/HUMAN/Hs.libraries.gz gunzip Hs.libraries.gz cat << "EOF" > cleanLibs.awk BEGIN{FS="\t"} { for (i = 1; i <= 12; i++) { printf("%s\t", $i); } sex = ""; if ($13=="male") { sex = "male,"; } else if ($13=="female") { sex = "female,"; } else if ($13=="male and female") { sex = "male,female,"; } else if ($13=="unknown") { sex = ""; } printf("%s\t", sex); for (i = 14; i <= 20; i++) { printf("%s\t", $i); } printf("%s\n", $21); } EOF tail +2 Hs.libraries | awk -f cleanLibs.awk > libs.txt ln -s ~/kent/src/hg/lib/cgapSage/cgapSageLib.sql hgLoadSqlTab hg18 cgapSageLib cgapSageLib.sql libs.txt partitionSequence.pl -lstDir small 5000000 30 hg18.2bit chrom.sizes 0 > sequence.lst grep -v small sequence.lst > seq.lst cat small/* >> seq.lst mv seq.lst sequence.lst rm -rf small/ wget ftp://ftp1.nci.nih.gov/pub/SAGE/HUMAN/Hs_long.frequencies.gz gunzip Hs_long.frequencies.gz cat << "EOF" > doJobList.sh #!/bin/bash # basic vars part=$1; range=${part#*2bit:}; chrom=${range%:*}; nums=${range#*:} firstnum=${nums:0:1} outDir=output/${chrom}/${firstnum} mkdir -p $outDir echo ./doFind.sh $1 {check out exists `pwd`/${outDir}/${range}.bed} EOF chmod +x doJobList.sh for part in `cat sequence.lst`; do ./doJobList.sh $part >> jobList; done cat << "EOF" > doFind.sh #!/bin/bash # basic vars part=$1; range=${part#*2bit:}; 
chrom=${range%:*}; nums=${range#*:} firstnum=${nums:0:1} # dirs/files startDir=`pwd` scratch=/scratch/tmp/$part output=$2 # begin mkdir -p $scratch pushd $scratch twoBitToFa -noMask $startDir/"$part" part.fa cgapSageFind part.fa $startDir/Hs_long.frequencies $startDir/libs.txt \ $startDir/chromSnps/${chrom}.snps.txt output.bed cp output.bed $output popd rm -rf $scratch EOF chmod +x doFind.sh ssh pk cd /san/sanVol1/scratch/andy/cgapSage para create jobList para try para push # takes like 5-10 min exit # back to hgwdev find output/ -name '*.bed' -exec cat '{}' >> output.bed \; cgapSageDupeRemove output.bed tmp.bed cgapSageDupeRemove -unique tmp.bed final.bed ln -s ~/kent/src/hg/lib/cgapSage/cgapSage.sql hgLoadBed -sqlTable=cgapSage.sql -tab hg18 cgapSage final.bed ######################################################################### # HapMap SNPs (DONE 2007-05-23 Andy) # rel22 # OBSOLETED by Phase II+III SNPs 3/09 angie (see HAPMAP REL27 GENOTYPES) # Tables renamed to [originalName]PhaseII 3/9/09 ssh hgwdev bash cd /cluster/data/hg18/bed mkdir -p hapmap/zips cd hapmap/zips # archived to http://www.hapmap.org/genotypes/2007-03 wget -nd -r -N -A html http://www.hapmap.org/genotypes/latest_ncbi_build36/rs_strand/non-redundant/ grep gz index.html | sed 's/^.*href=\"\(geno.*\.txt\.gz\)\".*$/\1/' > files.txt wget -N -i files.txt --base=http://www.hapmap.org/genotypes/latest_ncbi_build36/rs_strand/non-redundant/ rm index.html robots.txt files.txt cd ../ mkdir samples cd samples/ wget http://www.hapmap.org/downloads/samples_individuals/pedinfo2sample_CEU.txt.gz wget http://www.hapmap.org/downloads/samples_individuals/pedinfo2sample_CHB.txt.gz wget http://www.hapmap.org/downloads/samples_individuals/pedinfo2sample_JPT.txt.gz wget http://www.hapmap.org/downloads/samples_individuals/pedinfo2sample_YRI.txt.gz cp /cluster/store12/snp/hapmap/rel21a/genotypes/2007-01/rs_strand/non-redundant/*.pl . ln -s ../zips ./filterPedigree.pl < pedinfo2sample_CEU.txt > filtered.CEU ./filterPedigree.pl < pedinfo2sample_YRI.txt > filtered.YRI zcat zips/*chr22_CEU* | head -1 | tr ' ' '\n' > header.CEU zcat zips/*chr22_YRI* | head -1 | tr ' ' '\n' > header.YRI grep -n -f filtered.CEU header.CEU | cut -f1 -d':' > offsets.CEU grep -n -f filtered.YRI header.YRI | cut -f1 -d':' > offsets.YRI for pop in CEU YRI CHB JPT; do for f in zips/genotypes_chr*_${pop}_r22_nr.b36.txt.gz; do zcat $f | ./filter${pop}.pl >> ../${pop}.merge echo Done with $f done done cd ../ for pop in CEU YRI CHB JPT; do ~/kent/src/hg/snp/snpLoad/hapmap1 ${pop}.merge ${pop}.condense mv hapmap1.log ${pop}.hapmap1.log done wc -l *.log #0 CEU.hapmap1.log #0 CHB.hapmap1.log #0 JPT.hapmap1.log #0 YRI.hapmap1.log #0 total rm *.log cp ~/kent/src/hg/lib/hapmapSnps.sql . for pop in CEU CHB JPT YRI; do sed "s/hapmapSnps/hapmapSnps$pop/" hapmapSnps.sql > hapmapSnps${pop}.sql hgLoadBed -sqlTable=hapmapSnps${pop}.sql hg18 hapmapSnps$pop ${pop}.condense done # Don't worry if you see: #load of hapmapSnpsCEU did not go as planned... etc. # unless it says rows skipped. ~/kent/src/hg/snp/snpLoad/hapmap2 hg18 #building CEU hash... 
#Can't start query: #select * from hapmapAllelesCEU # #mySQL error 1146: Table 'hg18.hapmapAllelesCEU' doesn't exist # But this works: ~heather/kent/src/hg/snp/snpLoad/hapmap2 hg18 # (gotta bug Heather about that one) ln -s ~/kent/src/hg/lib/hapmapSnpsCombined.sql hgLoadBed -sqlTable=hapmapSnpsCombined.sql hg18 hapmapSnpsCombined hapmapSnpsCombined.tab # Checks: ~heather/kent/src/hg/snp/snpLoad/snpCheckCluster2 hg18 hapmapSnpsCombined #match count = 0 ### clean up rm *.sql hapmapSnpsCombined.tab bed.tab tar cfvz merge.tar.gz *.merge tar cfvz condense.tar.gz *.condense rm *.condense *.merge mkdir logs mv *.errors *.log *.out logs mkdir orthos cd orthos/ # hgWiggle output has the chromosome in a comment, followed by the values # This script prints that chromosome on every line cat << "EOF" > joinify.awk { if ($1 == "variableStep") { sub("chrom=", "", $2); chrom = $2; } else if ($1 != "#") { printf("%s,%s\t%s\n", chrom, $1, $2); } } EOF cat << "EOF" > join.sh #!/bin/bash sed 's/\(^chr\w\+\)\t/\1,/' $1 > bed sort -k1,1 bed > tmp; mv tmp bed awk -f joinify.awk $2 > scores sort -k1,1 scores > tmp; mv tmp scores join -1 1 -2 1 bed scores | tr ',' ' ' | awk '{printf("%s\t%s\t%s\t%s\t%d\t%s\t%s\n", $1, $2, $3, $4, $8, $6, $7);}' > qual.tab rm scores bed EOF chmod +x join.sh # chimp alleles cd /cluster/data/dbSNP/ortho/hg18/panTro2Seq awk '{printf("%s\t%s\t%s\t%s\t0\t%s\t%s\n", $2, $3, $4, $5, $7, $8);}' snp126orthoPrelim.tab > snp126orthoPrelim.bed cp snp126orthoPrelim.bed /cluster/data/hg18/bed/hapmap/orthos/panTro2.bed.new cd /cluster/data/hg18/bed/hapmap/orthos hgWiggle -db=panTro2 -bedFile=panTro2.bed quality > panTro2.scores # create qual.tab; combine panTro2 sequence with panTro2 quality score ./join.sh panTro2.bed.new panTro2.scores grep chr21 panTro2.bed.new >> qual.tab grep chrY panTro2.bed.new >> qual.tab # create snpOrtho.tab; a table in human coords that has associated ortho alleles ~heather/kent/src/hg/snp/snpLoad/snpOrtho hg18 snp126 qual.tab sed 's/snpOrtho/snp126OrthoPanTro2/' ~/kent/src/hg/lib/snpOrtho.sql > snpOrthoPanTro2.sql hgLoadBed -tab -sqlTable=snpOrthoPanTro2.sql hg18 snp126OrthoPanTro2 snpOrtho.tab mysql> update snp126OrthoPanTro2 set orthoScore = 98 where orthoChrom = "chr21"; mysql> update snp126OrthoPanTro2 set orthoScore = 98 where orthoChrom = "chrY"; mysql> update snp126OrthoPanTro2 set orthoScore = 98 where orthoChrom = "chrY_random"; # get the HapMap subset sed 's/hapmapAllelesOrtho/hapmapAllelesChimp/' ~/kent/src/hg/lib/hapmapAllelesOrtho.sql > hapmapAllelesChimp.sql ~heather/kent/src/hg/snp/snpLoad/hapmapOrtho hg18 hapmapSnpsCombined snp126OrthoPanTro2 hgLoadBed -tab -sqlTable=hapmapAllelesChimp.sql hg18 hapmapAllelesChimp hapmapOrtho.tab # sanity check mysql> select count(*) from hapmapAllelesChimp where chrom = orthoChrom; # 3,492,708 mysql> select count(*) from hapmapAllelesChimp where chrom != orthoChrom; # 374,010 # macaque alleles cd /cluster/data/dbSNP/ortho/hg18/rheMac2Seq awk '{printf("%s\t%s\t%s\t%s\t0\t%s\t%s\n", $2, $3, $4, $5, $7, $8);}' snp126orthoPrelim.tab > snp126orthoPrelim.bed cp snp126orthoPrelim.bed /cluster/data/hg18/bed/hapmap/orthos/rheMac2.bed.new cd /cluster/data/hg18/bed/hapmap/orthos hgWiggle -db=rheMac2 -bedFile=rheMac2.bed quality > rheMac2.scores # create qual.tab: combine rheMac2 sequence with rheMac2 quality score ./join.sh rheMac2.bed.new rheMac2.scores # create snpOrtho.tab; a table in human coords that has associated ortho alleles ~heather/kent/src/hg/snp/snpLoad/snpOrtho hg18 snp126 qual.tab sed 
's/snpOrtho/snp126OrthoRheMac2/' ~/kent/src/hg/lib/snpOrtho.sql > snpOrthoRheMac2.sql hgLoadBed -tab -sqlTable=snpOrthoRheMac2.sql hg18 snp126OrthoRheMac2 snpOrtho.tab # get the HapMap subset sed 's/hapmapAllelesOrtho/hapmapAllelesMacaque/' ~/kent/src/hg/lib/hapmapAllelesOrtho.sql > hapmapAllelesMacaque.sql ~heather/kent/src/hg/snp/snpLoad/hapmapOrtho hg18 hapmapSnpsCombined snp126OrthoRheMac2 hgLoadBed -tab -sqlTable=hapmapAllelesMacaque.sql hg18 hapmapAllelesMacaque hapmapOrtho.tab # create summary table ~heather/kent/src/hg/snp/snpLoad/hapmapSummary hg18 hapmapSnpsCombined hapmapAllelesChimp hapmapAllelesMacaque ln -s ~/kent/src/hg/lib/hapmapAllelesSummary.sql hgLoadBed -tab -sqlTable=hapmapAllelesSummary.sql hg18 hapmapAllelesSummary hapmapSummary.tab ############################################################################# # RE-BUILD WGRNA TRACK (DONE, 2007-05-31, Fan) # rebuilt below: RE-BUILD WGRNA TRACK (DONE, 2007-10-05, Fan) ssh hgwdev cd /cluster/data/hg18/bed mkdir wgRna-2007-05-31 cd wgRna-2007-05-31 # Received the data file, wg_may2007.txt (saved from wg_may2007.doc) # from Michel Weber's email # (Michel.Weber at ibcg.biotoul.fr) # and place it under cd /cluster/data/hg18/bed/wgRna-2007-05-31. cat wg_may2007.txt|sed -e 's/ /\t/g' > wgRna.tab hgLoadBed -sqlTable=/cluster/home/fanhsu/src/hg/lib/wgRna.sql hg18 wgRna wgRna.tab ############################################################################# # N-SCAN GENES track (2007-06-21 markd) # create a composite track with exists ab-inito and new PASA N-SCAN predictions # download pasa predictions cd /cluster/data/hg18/bed/nscan/pasa wget http://mblab.wustl.edu/predictions/human/hg18_PASA/hg18.pasa.gtf wget http://mblab.wustl.edu/predictions/human/hg18_PASA/hg18.prot.fa bzip2 hg18.* chmod a-w hg18.* ldHgGene -gtf -genePredExt hg18 nscanPasaGene hg18.pasa.gtf.bz2 hgPepPred hg18 generic nscanPasaPep hg18.prot.fa.bz2 rm *.tab # update trackDb; need a hg18-specific page to describe informants and PASA human/hg18/nscan.html human/hg18/trackDb.ra # remove old human/hg18/nscanGene.html ########################################################################### # AUGUSTUS track (DONE 2007-7-3 Mario) # # augustusHints subtrack mkdir -p /cluster/data/hg18/bed/augustus/usingHints/predictions/Trefseq.hmRNA.hsEST.R.X.final cd /cluster/data/hg18/bed/augustus/usingHints/predictions/Trefseq.hmRNA.hsEST.R.X.final wget http://augustus.gobics.de/predictions/hg18/usingEvidence/augustus.hg18.Trefseq.hmRNA.hsEST.R.X.pep.gff wget http://augustus.gobics.de/predictions/hg18/usingEvidence/augustus.hg18.Trefseq.hmRNA.hsEST.R.X.pep.aa ldHgGene -bin hg18 augustusHints augustus.hg18.Trefseq.hmRNA.hsEST.R.X.gff hgPepPred hg18 generic augustusHintsPep augustus.hg18.Trefseq.hmRNA.hsEST.R.X.pep.aa # augustus de novo subtrack mkdir -p /cluster/data/hg18/bed/augustus/usingHints/predictions/Xp.RA.it cd /cluster/data/hg18/bed/augustus/usingHints/predictions/Xp.RA.it wget http://augustus.gobics.de/predictions/hg18/deNovo/augustus.hg18.Xp.RA.it.pep.gff wget http://augustus.gobics.de/predictions/hg18/deNovo/augustus.hg18.Xp.RA.it.pep.aa ldHgGene -bin hg18 augustusXRA augustus.hg18.Xp.RA.it.gff hgPepPred hg18 generic augustusXRAPep augustus.hg18.Xp.RA.it.pep.aa # augustus ab initio subtrack mkdir -p /cluster/data/hg18/bed/augustus/abinitio cd /cluster/data/hg18/bed/augustus/abinitio wget http://augustus.gobics.de/predictions/hg18/abinitio/augustus.pep.gff wget http://augustus.gobics.de/predictions/hg18/abinitio/augustus.pep.aa ldHgGene -bin hg18 augustusAbinitio 
augustus.gff hgPepPred hg18 generic augustusAbinitioPep augustus.pep.aa ############################################################################# # Stanford NRSF ChIP-seq (DONE, Heather, July 2007) # Add color-by-strand and overlap table (2008-05-27 kate) # BED file of sites provided May 2008 by Tim Reddy (treddy@gmail.com) ssh kkstore03 cd /cluster/data/encode/stanford/2007-03-14 # lift to hg18 liftOver fix.bed /gbdb/hg17/liftOver/hg17ToHg18.over.chain.gz hg18.bed core.unmapped liftOver control_fix.bed /gbdb/hg17/liftOver/hg17ToHg18.over.chain.gz hg18.control.bed control.unmapped # add color by strand (red for +, blue for minus) awk 'OFS="\t" {$6=="+" ? c="255,0,0" : c="0,0,255"; print $1, $2, $3, "", $5, $6, $2, $3, c}' hg18.bed > hg18.fixc.bed awk 'OFS="\t" {$6=="+" ? c="255,0,0" : c="0,0,255"; print $1, $2, $3, "", $5, $6, $2, $3, c}' hg18.control.bed > hg18.control_fixc.bed # load into database hgwdev cd /cluster/data/encode/stanford/2007-03-14 hgLoadBed hg18 stanfordNRSFEnriched hg18.fixc.bed -tab hgLoadBed hg18 stanfordNRSFControl hg18.control_fixc.bed -tab # overlap tables set prefix = /gbdb/hg18/wib set table = stanfordNRSFEnrichedOverlaps sort -k1,1 -k2,2n hg18.bed | bedItemOverlapCount hg18 stdin | \ wigEncode stdin ${table}.wig ${table}.wib ln -s /cluster/data/encode/stanford/2007-03-14/${table}.wib $prefix hgLoadWiggle -pathPrefix=$prefix hg18 $table ${table}.wig set table = stanfordNRSFControlOverlaps sort -k1,1 -k2,2n hg18.control.bed | bedItemOverlapCount hg18 stdin | \ wigEncode stdin ${table}.wig ${table}.wib ln -s /cluster/data/encode/stanford/2007-03-14/${table}.wib $prefix hgLoadWiggle -pathPrefix=$prefix hg18 $table ${table}.wig # peaks (provided May 2008) sort -k1,1 -k2,2n lab/NRSF_Peak_Calls.bed | \ awk '{print $1, $2, $3}' > peaks.bed wc -l peaks.bed # 2116 hgLoadBed -noBin hg18 stanfordNRSFSites peaks.bed ######################################################################### # REGULATORY POTENTIAL UPDATE (DONE - 2007-08-01 - Hiram) # download data from "James Taylor" ssh kkstore02 mkdir /cluster/data/hg18/bed/regPotential7X.update cd /cluster/data/hg18/bed/regPotential7X.update # This is a lot of data for C in 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 X Y do wget --timestamping \ "http://www.bx.psu.edu/~james/esperr_rp_7way_scores/genome_scores_hg18/chr${C}.scores.truncated.bz2" echo "DONE - chr${C}.scores.truncated.bz2" done # create download gzip files from the bz2 files: time for F in chr*.scores.truncated.bz2 do C=`echo $F | awk -F'.' '{print $1}'` echo -n "${C}.regPotential7X.hg18.gz working ... " bzcat ${F} | gzip > ${C}.regPotential7X.hg18.gz touch -r "${F}" "${C}.regPotential7X.hg18.gz" echo "done" done time for C in 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 X Y do zcat chr${C}.regPotential7X.hg18.gz done | wigEncode -noOverlap stdin regPotential7X.wig regPotential7X.wib # Converted stdin, upper limit 1.00, lower limit 0.00 # real 16m40.347s # Loading the table on hgwdev ssh hgwdev cd /cluster/data/hg18/bed/regPotential7X.update mkdir /gbdb/hg18/wib/070118 ln -s /cluster/data/hg18/bed/regPotential7X.update/regPotential7X.wib \ /gbdb/hg18/wib/070118/regPotential7X.wib # using the tmpDir is faster since it is on local disk and it will # clean up any temporary .tab file it creates there time nice -n +19 hgLoadWiggle -tmpDir=/scratch/tmp \ -pathPrefix=/gbdb/hg18/wib/070118 hg18 regPotential7X regPotential7X.wig # real 0m38.247s # How about a histogram of the data. 
ssh kolossus cd /cluster/data/hg18/bed/regPotential7X.update time nice -n +19 hgWiggle -verbose=2 -doHistogram -hBinSize=0.01 \ -hBinCount=100 -hMinVal=0.0 -db=hg18 regPotential7X > histogram.data 2>&1 # real 3m15.934s # 73 % of the data values are zero # create download gzip files from the bz2 files: ssh kkstore02 cd /cluster/data/hg18/bed/regPotential7X for F in chr*.scores.truncated.bz2 do C=`echo $F | awk -F'.' '{print $1}'` echo -n "${C}.regPotential7X.hg18.gz working ... " bzcat ${F} | gzip > ${C}.regPotential7X.hg18.gz echo done # renaming file directory -- kuhn 08-17-2007 cd /gbdb/hg18/wib mv 070118 regPot070118 hgsql -e " update regPotential7X SET file = " \ "/gbdb/hg18/wib/regPot070118/regPotential7X.wib" hg18 Query OK, 2341572 rows affected (31.59 sec) Rows matched: 2341572 Changed: 2341572 Warnings: 0 ############################################################################# # SIB Transcriptome (DONE Aug 29, 2007 - JK) # Create working directory and download data from where Christian Iseli # (Christian.Iseli at licr.org) put it, and unpack. The download takes about # ten minutes (161M file). cd /cluster/data/hg18/bed mkdir sibTranscriptome cd sibTranscriptome wget ftp://ftp.licr.org/pub/databases/trome/human/txg.tar.gz wget ftp://ftp.licr.org/pub/databases/trome/human/HTR.gtf.gz tar -zxvf txg.tar.gz # Load up sibGene table zcat HTR.gtf.gz | ldHgGene hg18 sibGene stdin # Do a little data cleanup and transformation and load splice graphs into database. sed 's/altGraphX/sibTxGraph/' ~/src/hg/lib/altGraphX.sql > sibTxGraph.sql sed 's/chrMt/chrM/' txg/chromMt.txg > txg/chromM.txg rm txg/chromMt.txt cat txg/*.txg | txgToAgx stdin stdout | hgLoadBed -notItemRgb -sqlTable=sibTxGraph.sql hg18 sibTxGraph stdin # Create sibAltEvents track for analysed alt-splices. cat txg/*.txg | txgAnalyze stdin /cluster/data/hg18/hg18.2bit sibAltEvents.bed awk '$2 >= 0' sibAltEvents.bed | sort | uniq > foo.bed hgLoadBed hg18 sibAltEvents foo.bed ######################################################################### # BLASTZ MOUSE Mm9 (DONE - 2007-08-20 - Hiram) ssh kkstore02 mkdir /cluster/data/hg18/bed/blastzMm9.2007-08-09 cd /cluster/data/hg18/bed/blastzMm9.2007-08-09 # Started this before the rsync to /scratch/data/mm9/ had completed, # hence the /cluster/bluearc/scratch/data/mm9/ location is used # here. 
(hg18 was also in transition to a new location) cat << '_EOF_' > DEF # human vs mouse BLASTZ_ABRIDGE_REPEATS=1 # TARGET: Human Hg18 SEQ1_DIR=/cluster/bluearc/scratch/data/hg18/nib SEQ1_SMSK=/cluster/bluearc/scratch/data/hg18/linSpecRep/notInMouseRat SEQ1_LEN=/cluster/data/hg18/chrom.sizes SEQ1_CHUNK=10000000 SEQ1_LAP=0 # QUERY: Mouse Mm9 SEQ2_DIR=/cluster/bluearc/scratch/data/mm9/nib SEQ2_SMSK=/cluster/bluearc/scratch/data/mm9/notInOthers SEQ2_LEN=/cluster/data/mm9/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=10000 BASE=/cluster/data/hg18/bed/blastzMm9.2007-08-09 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs # establish a screen to control this job screen time nice -n +19 doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \ `pwd`/DEF > blastz.out 2>&1 & # real 1480m54.483s # failed due to pk node difficulties, finish the run.blastz # manually # Completed: 102120 of 102120 jobs # CPU time in finished jobs: 6908585s 115143.08m 1919.05h 79.96d 0.219 y # IO & Wait Time: 50958894s 849314.90m 14155.25h 589.80d 1.616 y # Average job time: 567s 9.44m 0.16h 0.01d # Longest finished job: 3000s 50.00m 0.83h 0.03d # Submission to last job: 446177s 7436.28m 123.94h 5.16d # continuing time nice -n +19 ~/kent/src/hg/utils/automation/doBlastzChainNet.pl \ -verbose=2 -bigClusterHub=pk -chainMinScore=3000 \ -chainLinearGap=medium -continue=cat `pwd`/DEF > cat.out 2>&1 & # real 111m59.041s cat /cluster/data/hg18/bed/blastzMm9.2007-08-09/fb.hg18.chainMm9Link.txt # 1014323175 bases of 2881515245 (35.201%) in intersection cat /cluster/data/hg18/bed/blastz.mm8/fb.hg18.chainMm8Link.txt # 994530182 bases of 2881515245 (34.514%) in intersection cd /cluster/data/hg18/bed ln -s blastzMm9.2007-08-09 blastz.mm9 # Then to swap over to Mm9 (also in mm9.txt) mkdir /cluster/data/mm9/bed/blastz.hg18.swap cd /cluster/data/mm9/bed/blastz.hg18.swap time nice -n +19 ~/kent/src/hg/utils/automation/doBlastzChainNet.pl \ -verbose=2 -swap -bigClusterHub=pk -chainMinScore=3000 \ -chainLinearGap=medium \ /cluster/data/hg18/bed/blastz.mm9/DEF > swap.out 2>&1 & # real 67m21.146s cat /cluster/data/mm9/bed/blastz.hg18.swap/fb.mm9.chainHg18Link.txt # 1008812599 bases of 2620346127 (38.499%) in intersection cat /cluster/data/mm8/bed/blastz.hg18/fb.mm8.chainHg18Link # 984380268 bases of 2567283971 (38.343%) in intersection cd /cluster/data/mm9/bed ln -s blastz.hg18.swap blastz.hg18 ## make syntenic net (DONE - 2007-08-20 - Hiram) cd /cluster/data/hg18/bed/blastzMm9.2007-08-09 time nice -n +19 ~/kent/src/hg/utils/automation/doBlastzChainNet.pl \ -verbose=2 -bigClusterHub=pk -chainMinScore=3000 \ -syntenicNet -chainLinearGap=medium -continue=syntenicNet \ `pwd`/DEF > syntenic.out 2>&1 & ## real 25m47.767s ######################################################################### # LOAD ACEMBLY (DONE 8/28/07 angie) ssh kkstore02 cd /cluster/data/hg18/bed/acembly # Move aside liftOver run results mkdir liftOver mv a* g* h* j* u* liftOver wget --timestamping ftp://ftp.ncbi.nih.gov/repository/acedb/ncbi_36.human.genes/AceView.ncbi_36.genes_gff.tar.gz wget --timestamping ftp://ftp.ncbi.nih.gov/repository/acedb/ncbi_36.human.genes/AceView.ncbi_36.good_proteins_fasta.tar.gz tar xvzf AceView.ncbi_36.genes_gff.tar.gz tar xvzf AceView.ncbi_36.good_proteins_fasta.tar.gz cd AceView.ncbi_36.genes_gff # If the result of this command is > 0, then some lines have end < start # and need to be fixed: awk '$5 < $4 {print;}' *.gff | wc -l #0 # Filter out empty lines, lines where the product_id has a stray # newline before it, 
and $chr|Hs# IDs that don't appear liftable. egrep -h -v '^(| ?product_id.*|..?\|Hs.*)$' *.gff \ | sed -e 's/^/chr/;' \ > acembly.gff # Extract annotation classes from original gff: egrep -h -v '^(| ?product_id.*|..?\|Hs.*)$' *.gff \ | perl -wpe 's/^.*Gene_type (\w+); transcript_id (\S+);.*/$2\t$1/; \ s/Main$/main/ || s/Putative$/putative/ || \ die "Unrecognized class/Gene_type:\n$_\n";' \ | sort -u \ > acemblyClass.tab # Some gff transcript_id's end in -unspliced (no intron), but the # corresponding protein fasta IDs to not have that suffix. We need # them to match, so add where necessary. # Use perl to make a perl script to add -unspliced to protein IDs # where necessary: grep unspliced acemblyClass.tab | wc -l #70156 egrep -h -v '^(| ?product_id.*|..?\|Hs.*)$' *.gff \ | perl -wpe 's@^.*transcript_id (\S+)-unspliced;.*$@\$unsp{"$1"} = 1;@ \ || s/^.*\n$//;' \ | sort -u \ > ../addUnspliced.pl wc -l ../addUnspliced.pl #70156 ../addUnspliced.pl cat >> ../addUnspliced.pl <<'_EOF_' while (<>) { if (/^>(\S+)$/) { if ($unsp{$1}) { s/^>(\S+)/>$1-unspliced/; } } print; } '_EOF_' # << emacs # Add -unspliced suffix to protein IDs where necessary, and pare down # proteins to just the ones that we have transcripts for: cd /cluster/data/hg18/bed/acembly/AceView.ncbi_36.good_proteins_fasta awk '{print $1;}' ../AceView.ncbi_36.genes_gff/acemblyClass.tab \ > transcriptNames.txt perl ../addUnspliced.pl *.fasta \ | faSomeRecords stdin transcriptNames.txt acemblyPep.fa grep unspliced acemblyPep.fa | wc -l #55931 # Danielle Thierry-Mieg explained that noncoding genes are included so # the number of proteins can be smaller than the number of transcripts. # Load tables ssh hgwdev cd /cluster/data/hg18/bed/acembly/AceView.ncbi_36.genes_gff ldHgGene -gtf hg18 acembly acembly.gff #Read 258618 transcripts in 3451107 lines in 1 files # 258618 groups 24 seqs 1 sources 5 feature types #258618 gene predictions hgLoadSqlTab hg18 acemblyClass ~/kent/src/hg/lib/acemblyClass.sql \ acemblyClass.tab cd /cluster/data/hg18/bed/acembly/AceView.ncbi_36.good_proteins_fasta hgPepPred hg18 generic acemblyPep acemblyPep.fa rm acemblyPep.tab runJoiner.csh hg18 acembly # hg18.acemblyPep.name - hits 210003 of 210003 ok # hg18.acemblyClass.name - hits 258618 of 258618 ok ########################################################################### ## Create gc5Base download raw data file (DONE - 2007-08-29 - Hiram) ssh kkstore02 cd /cluster/data/hg18/bed/gc5Base hgGcPercent -wigOut -doGaps -file=stdout -win=5 \ hg18 /cluster/data/hg18/hg18.2bit 2> /dev/null \ | gzip > hg18.gc5Base.txt.gz ssh hgwdev mkdir /usr/local/apache/htdocs/goldenPath/hg18/gc5Base cd /usr/local/apache/htdocs/goldenPath/hg18/gc5Base ln -s /cluster/data/hg18/bed/gc5Base/hg18.gc5Base.txt.gz . ########################################################################### # GENE BOUNDS (RNACLUSTER) (REBUILT 08-30-2007 Fan) # Create rnaCluster table (depends on {est,mrna}OrientInfo) cd /cluster/data/hg18/bed mv rnaCluster rnaCluster.old mkdir rnaCluster cd rnaCluster/ mkdir chrom # Create a list of accessions that come from RAGE libraries and need to be excluded. 
~/kent/src/hg/geneBounds/clusterRna/generateRageAccList.csh hg18 rage.libs foreach f (/cluster/data/hg18/nib/chr*.nib) set c = $f:t:r set out = chrom/$c.bed # Exclude accesions in the RAGE file echo clusterRna -mrnaExclude=hg18.rage.libs hg18 /dev/null $out -chrom=$c clusterRna -mrnaExclude=hg18.rage.libs hg18 /dev/null $out -chrom=$c end hgLoadBed hg18 rnaCluster chrom/*.bed ########################################################################### # RE-LOAD FISH CLONES after bacEnds update (DONE - 2007-09-04 - Hiram) # The bacEnds processing results are used here ssh hgwdev mkdir /cluster/data/hg18/bed/fishClones.2007-08-29 cd /cluster/data/hg18/bed/fishClones.2007-08-29 ln -s ../fishClones/cl_acc_gi_len . ln -s ../fishClones/fhcrc.sts . # have to be on hgwdev for this since it is going to read from the db time nice -n +19 fishClones -verbose=2 -fhcrc=fhcrc.sts -noBin hg18 \ /cluster/data/ncbi/fishClones/fishClones.2006-01/fixed.hbrc.txt \ /cluster/data/ncbi/fishClones/fishClones.2006-01/clac.out \ ./cl_acc_gi_len \ /cluster/data/hg18/bed/bacends/bacEnds.lifted.psl \ fishClones > fishClones.out 2>&1 # real 0m53.783s # Reading Fish Clones file /cluster/data/ncbi/fishClones/fishClones.2006-01/fixed.hbrc.txt # reading fishInfo file /cluster/data/ncbi/fishClones/fishClones.2006-01/fixed.hbrc.txt # Reading Clone/Acc (clac.out) file /cluster/data/ncbi/fishClones/fishClones.2006-01/clac.out # Reading BAC Ends file ./cl_acc_gi_len # Reading BAC Ends psl file /cluster/data/hg18/bed/bacends/bacEnds.lifted.psl # Reading additional STS Marker links fhcrc.sts # Determining good positions # findClonePos: determining positions of fish clones # Writing output file # ERROR: at line # 177, no cytoband info for chrX:104048913-104206974 # RP11-79L11 # ERROR: at line # 178, no cytoband info for chrX:104048913-104206974 # RP11-79L11 # Load the track hgLoadBed -notItemRgb -noBin -tab \ -sqlTable=$HOME/kent/src/hg/lib/fishClones.sql \ hg18 fishClones fishClones.bed # Loaded 9788 elements of size 16 ############################################################################ # INDEL-BASED CONSERVATION TRACK (DONE, 2007-09-03 - 2007-09-17, hartera) # Data from the Gerton Lunter (gerton.lunter at anat.ox.ac.uk), MRC # Functional Genetics Unit, University of Oxford, United Kingdom. # Data is from the paper: # Lunter G, Ponting CP and Hein J Genome-wide identification of human # functional DNA using a neutral indel model. PLoS Comput Biol. 2006 # Jan;2(1):e5. ssh kkstore02 mkdir -p /cluster/data/hg18/bed/consIndels/data cd /cluster/data/hg18/bed/consIndels # Add a README.indels with the e-mail from Gerton Lunter # get the data wget --timestamping \ http://wwwfgu.anat.ox.ac.uk/~gerton/igs-hg18mm8cf2.zip # 38 Mb zip file in GFF format. This contains data for hg18 # comparing it to mm8 and cf2 (canFam2). unzip igs-hg18mm8cf2.zip mv *.gff ./data/ foreach f (./data/*.gff) set r = $f:r echo $r grep -v "track" $f > ${r}NoHeader.gff end # strip off the end of the name e.g. IGS0001.1:p=.74; FDR 0.27 # so that the name displayed is short - IGS0001.1. 
The score field # is used to determine colouring and this is calculated from FDR ssh kkstore02 cd /cluster/data/hg18/bed/consIndels perl -pi.bak -e \ 's/(IGS[0-9a-z]+\.[0-9XY]+):p=?> consIndelsHg18Mm8CanFam2.bed end # load data ssh hgwdev cd /cluster/data/hg18/bed/consIndels hgsql -e 'drop table consIndelsHg18Mm8CanFam2;' hg18 hgLoadBed hg18 consIndelsHg18Mm8CanFam2 consIndelsHg18Mm8CanFam2.bed # Loaded 2603017 elements of size 5 # Get the IDs, posterior probabilities (p) for the segment being neutral, # and the FDR from the original GFFs for a separate table. Some items # have p<.001. Can not do Table Browser queries restricting # p to <, =, or > a specified value unless all values are floats. # Contacted the data contributor, Gerton Lunter, and he said it would be # ok to change all p<.001 to p=0.0005 ssh kkstore02 cd /cluster/data/hg18/bed/consIndels/ foreach c (`cat /cluster/data/hg18/chrom.lst`) echo $c foreach f (./data/igs.chr${c}.gff) echo $f awk 'BEGIN {FS="\t"} {if ($9 ~ /IGS/) print $9;}' $f \ | sed -e 's/:/\t/' \ | sed -e 's/p=\./0\./' | sed -e 's/p<\.001/0\.0005/' \ | sed -e 's/;\sFDR/\t/' >> consIndelsHg18Mm8CanFam2Conf.txt end end # there are no GFF files for the haplotype chroms # Create a table definition for the table of identifier, posterior # probability and false discovery rate (FDR). cat << 'EOF' > $HOME/kent/src/hg/lib/itemConf.as table itemConf "Probability and false discovery rate (FDR) for an element in a track." ( string id; "Identifier of element" float probability; "Probability associated with element" float fdr; "False Discovery Rate (FDR) associated with element" ) 'EOF' # << emacs cd $HOME/kent/src/hg/lib autoSql itemConf.as itemConf mv itemConf.h ../inc/ # commit ../inc/itemConf.h, itemConf.c, itemConf.as and # itemConf.sql to CVS. Add itemConf.o to src/hg/lib/makefile ssh hgwdev cd /cluster/data/hg18/bed/consIndels hgLoadSqlTab hg18 consIndelsHg18Mm8CanFam2Conf \ $HOME/kent/src/hg/lib/itemConf.sql \ consIndelsHg18Mm8CanFam2Conf.txt # check that all itesm are in this table. hgsql -N -e 'select distinct(name) from consIndelsHg18Mm8CanFam2;' hg18 \ | sort > consIndels.names.sort hgsql -N -e 'select distinct(id) from consIndelsHg18Mm8CanFam2Conf;' hg18 \ | sort > consIndels.idsfromConf.sort wc -l *.sort # 2603017 consIndels.idsfromConf.sort # 2603017 consIndels.names.sort comm -12 consIndels.names.sort consIndels.idsfromConf.sort | wc -l # 2603017 # so all element IDs are in both tables. # cleanup rm ./data/*.bak *.sort # add trackDb/human/hg18/trackDb.ra entry and add description that # was written by the data contributor. Add code to hgc.c to display # the posterior probability and the FDR on the details page for # track elements. Gerton Lunter provided a description for the data # on 2007-09-12. 
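# The per-item confidence values live in a separate table keyed by element
# name. As an illustrative check (not part of the original build), the query
# below shows the kind of join the details page can use to pull probability
# and FDR for one element; the ID "IGS0001.1" is only an example value.
hgsql hg18 -N -e \
    'select c.chrom, c.chromStart, c.chromEnd, i.probability, i.fdr from consIndelsHg18Mm8CanFam2 c, consIndelsHg18Mm8CanFam2Conf i where c.name = i.id and c.name = "IGS0001.1"'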
############################################################################ # Promote UCSD genome-wide ENCODE Chip tracks: # UCSD TAF1 IMR90 Chip/chip to Regulation group # (2007-09-14 kate) hgsql hg18 -e "alter table encodeUcsdNgChipSignal rename to wgEncodeUcsdNgTaf1Signal" hgsql hg18 -e "update wgEncodeUcsdNgTaf1Signal set file='/gbdb/hg18/encode/wib/wgEncodeUcsdNgTaf1Signal.wib'" hgsql hg18 -e "alter table encodeUcsdNgChipKnownSites rename to wgEncodeUcsdNgTaf1KnownSites" hgsql hg18 -e "alter table encodeUcsdNgChipNovelSites rename to wgEncodeUcsdNgTaf1NovelSites" hgsql hg18 -e "alter table encodeUcsdNgValChipH3K4me rename to wgEncodeUcsdNgTaf1ValidH3K4me" hgsql hg18 -e "alter table encodeUcsdNgValChipH3ac rename to wgEncodeUcsdNgTaf1ValidH3ac" hgsql hg18 -e "alter table encodeUcsdNgValChipRnap rename to wgEncodeUcsdNgTaf1ValidRnap" hgsql hg18 -e "alter table encodeUcsdNgValChipTaf rename to wgEncodeUcsdNgTaf1ValidTaf" ############################################################################ # NESTED REPEATS (DONE 9/20/07 angie) # This track is now generated by doRepeatMasker.pl; added to this older # assembly for interest. ssh kkstore02 # First, re-liftUp the .out -- liftUp has been enhanced to uniquify the # RepeatMasker IDs. cd /cluster/data/hg18 foreach c ( `cat chrom.lst` ) echo lifting chr$c chunks to contigs foreach d ( ${c}/N{C,G,T}_* ) cd $d set contig = $d:t liftUp $contig.IDs.fa.out $contig.lft warn ${contig}_?{,?,??}.fa.out \ > /dev/null cd ../.. end echo lifting contigs to chr$c cd $c if (-e lift/ordered.lft && ! -z lift/ordered.lft) then liftUp chr$c.IDs.fa.out lift/ordered.lft warn \ `sed -e 's/.fa.out$/.IDs.fa.out/' lift/oOut.lst` \ > /dev/null endif if (-e lift/random.lft && ! -z lift/random.lft) then liftUp chr${c}_random.IDs.fa.out lift/random.lft warn \ `sed -e 's/.fa.out$/.IDs.fa.out/' lift/rOut.lst` \ > /dev/null endif cd .. 
end # Now join fragments using shared IDs: ssh kolossus mkdir /cluster/data/hg18/bed/nestedRepeats cd /cluster/data/hg18/bed/nestedRepeats extractNestedRepeats.pl ../../?{,?}/chr*.IDs.fa.out \ > hg18.nestedRepeats.bed # Load table: ssh hgwdev cd /cluster/data/hg18/bed/nestedRepeats hgLoadBed hg18 nestedRepeats hg18.nestedRepeats.bed \ -sqlTable=$HOME/kent/src/hg/lib/nestedRepeats.sql ############################################################################ # Promote GIS genome-wide ENCODE tracks: # GIS PET RNA and GIS ChIP-PET to Regulation group # (2007-09-20 kate) hgsql hg18 -e "alter table encodeGisChipPet rename to wgEncodeGisChipPet" hgsql hg18 -e "alter table encodeGisChipPetHes3H3K27me3 rename to wgEncodeGisChipPetHes3H3K27me3" hgsql hg18 -e "alter table encodeGisChipPetHes3H3K4me3 rename to wgEncodeGisChipPetHes3H3K4me3" hgsql hg18 -e "alter table encodeGisChipPetMycP493 rename to wgEncodeGisChipPetMycP493" hgsql hg18 -e "alter table encodeGisChipPetStat1Gif rename to wgEncodeGisChipPetStat1Gif" hgsql hg18 -e "alter table encodeGisChipPetStat1NoGif rename to wgEncodeGisChipPetStat1NoGif" hgsql hg18 -e "alter table encodeGisRnaPetHCT116 rename to wgEncodeGisRnaPetHCT116" hgsql hg18 -e "alter table encodeGisRnaPetHes3 rename to wgEncodeGisRnaPetHes3" hgsql hg18 -e "alter table encodeGisRnaPetMCF7 rename to wgEncodeGisRnaPetMCF7" hgsql hg18 -e "alter table encodeGisRnaPetMCF7Estr rename to wgEncodeGisRnaPetMCF7Estr" ########################################################## # Case Control Consortium (DONE 2007-09-20 (Andy) ssh hgwdev bash mkdir /cluster/data/hg17/bed/caseControl cd /cluster/data/hg17/bed/caseControl wget ftp://ftp.sanger.ac.uk/pub/WTCCC/summary_stats/summary_stats_auto_all.zip unzip summary_stats_auto_all.zip cd basic/ for disease in BD CAD CD HT RA T1D T2D; do echo $disease jkDisease=${disease:0:1}`echo ${disease:1} | tr [[:upper:]] [[:lower:]]` for f in *${disease}*.txt; do tail +2 $f | awk '{if ($21 == "1") print;}' | \ cut -f1,15 >> ../chromGraphs/cccTrendPval${jkDisease}.cg done done cd ../chromGraphs/ mkdir hg17 hg18 for f in *.cg; do table=${f%.cg}; echo $table hgLoadChromGraph -idTable=affy500k -minusLog10 -pathPrefix=/gbdb/hg17/chromGraph hg17 $table $f 2> ${table}.hg17.errors mv ${table}.cgb hg17/ hgLoadChromGraph -idTable=affy500k -minusLog10 -pathPrefix=/gbdb/hg18/chromGraph hg18 $table $f 2> ${table}.hg18.errors mv ${table}.cgb hg18/ done pushd /gbdb/hg18/chromGraph ln -s /cluster/data/hg17/bed/caseControl/chromGraphs/hg18/*.cgb . popd pushd /gbdb/hg17/chromGraph ln -s /cluster/data/hg17/bed/caseControl/chromGraphs/hg17/*.cgb . popd # Add the hack row into metaChromGraph for the composite tracks. hgsql hg17 -e 'insert into metaChromGraph (name, minVal, maxVal, binaryFile) values ("caseControl", 0, 0, "composite")' hgsql hg18 -e 'insert into metaChromGraph (name, minVal, maxVal, binaryFile) values ("caseControl", 0, 0, "composite")' ############################################################################# # RGD HUMAN QTL (DONE 9/24/07 angie) ssh hgwdev mkdir /cluster/data/hg18/bed/rgdQtl cd /cluster/data/hg18/bed/rgdQtl wget ftp://rgd.mcw.edu/pub/data_release/QTLS # Pick out the human QTLs and liftOver hg17 --> hg18. 
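# (Clarifying note on the QTLS dump, inferred from the perl one-liner just
# below rather than from RGD documentation, so treat the column positions
# as assumptions: the file is tab-separated with field 2 = species,
# field 3 = QTL symbol, field 6 = chromosome, and fields 16/17 = 1-based
# start/stop in hg17 coords, which is why the start is decremented and
# clipped at 0 when making BED.)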
# Make bed4 and rgdQtlLink: perl -we 'open(BED, ">rgdQtl.bed") || die; \ open(LINK, ">rgdQtlLink.txt") || die; \ while (<>) { \ chomp; my @w = split("\t"); \ next unless ($w[1] eq "human" && $w[15]); \ $w[5] =~ s/^/chr/; \ $w[15] =~ s/^([-\d]+).*$/$1/ || die "parse start pos"; \ $w[16] =~ s/^(\d+).*$/$1/ || die "parse end pos"; \ if ($w[15] > $w[16]) { \ $tmp = $w[15]; $w[15] = $w[16]; $w[16] = $tmp; \ } \ $w[15]--; \ $w[15] = 0 if ($w[15] < 0); \ print BED "$w[5]\t$w[15]\t$w[16]\t$w[2]\n"; \ print LINK "$w[0]\t$w[2]\t$w[3]\n"; \ } \ close(BED); close(LINK);' \ QTLS mv rgdQtl.bed hg17.rgdQtl.bed # Using a fairly loose minMatch -- the regions covered are huge. liftOver -minMatch=0.5 hg17.rgdQtl.bed \ /cluster/data/hg17/bed/liftOver/hg17ToHg18.over.chain.gz \ hg18.rgdQtl.{bed,unmapped} wc -l hg18* # 254 hg18.rgdQtl.bed # 2 hg18.rgdQtl.unmapped ssh hgwdev cd /cluster/data/hg18/bed/rgdQtl hgLoadBed hg18 rgdQtl hg18.rgdQtl.bed hgLoadSqlTab hg18 rgdQtlLink ~/kent/src/hg/lib/rgdQtlLink.sql rgdQtlLink.txt # Make sure there aren't any illegal coords: checkTableCoords -verbose=2 hg18 rgdQtl ############################################################################# # RGD RAT QTL MAPPED TO HUMAN (DONE 9/26/07 angie) #====== Begin work that was discarded because its output was too voluminous # to be very useful IMHO. Keeping it in the doc as a lesson learned. # See below for what I ended up loading. ssh hgwdev cd /cluster/data/hg18/bed/rgdQtl genePredToPsl -bedFormat rn4 /cluster/data/rn4/bed/rgdQtl/rgdQtl.bed \ rn4.rgdQtl.psl time ssh -x kolossus pslMap `pwd`/rn4.rgdQtl.psl \ -chainMapFile /cluster/data/hg18/bed/liftOver/hg18ToRn4.over.chain.gz \ `pwd`/hg18.rgdRatQtl.psl #0.011u 0.006s 10:58.56 0.0% 0+0k 0+0io 0pf+0w # That created an 11G monstrosity of a file that dwarfs the original # input. Linecount increased 3 orders of magnitude, filesize increased # 5 orders of magnitude. wc -l rn4.rgdQtl.psl #1067 rn4.rgdQtl.psl ssh -x kkstore02 wc -l `pwd`/hg18.rgdRatQtl.psl #1228306 /cluster/store11/gs.19/build36/bed/rgdQtl/hg18.rgdRatQtl.psl # Let's see what liftOver does... time ssh -x kolossus \ liftOver -minMatch=0.5 -multiple \ /cluster/data/rn4/bed/rgdQtl/rgdQtl.bed \ /cluster/data/rn4/bed/liftOver/rn4ToHg18.over.chain.gz \ `pwd`/hg18.rgdRatQtl.lo.{bed,unmapped} #0.014u 0.004s 0:59.27 0.0% 0+0k 0+0io 0pf+0w wc -l hg18.rgdRatQtl.lo.{bed,unmapped} # 1214366 hg18.rgdRatQtl.lo.bed # 14 hg18.rgdRatQtl.lo.unmapped # Still got 1M lines... ugh. Mapped all over the place, of course. #====== end discarded work. # Use a stringently filtered version of over.chain to do the mapping, # so we only pick up large chunks (targeting >10,000bases) of these # enormous regions (up to 235M in rn4). ssh kolossus cd /cluster/data/hg18/bed/rgdQtl # rn4ToHg18 was built before doBlastz included chainStitchId in the # pipe to create over.chain. Run it here, to repair any chain breaks: chainStitchId /cluster/data/rn4/bed/liftOver/rn4ToHg18.over.chain.gz \ rn4ToHg18Stitch.over.chain # I looked at the summed scores from chainStitchId vs. the length # spanned by the stitched chains, and arbitrarily picked what I # think is a sweet spot for mapping very large ranges: at scores # near 500000, chains seem to span 40-60k bases. Pretty much all # of the rat and human chromosomes (except human randoms) have at # least some chains with scores >= 500000. So I'll filter the # stitched chains to keep those with score >= 500000. # NOTE FOR NEXT TIME: consider filtering by length (see jaxQtl below). 
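# (Sketch of that score-vs-span eyeballing, not part of the original run;
# it assumes the standard chain header format
# "chain score tName tSize tStrand tStart tEnd qName qSize qStrand qStart qEnd id":)
awk '/^chain/ {print $2, $7-$6;}' rn4ToHg18Stitch.over.chain \
    | sort -k1,1nr > scoreVsSpan.txt
head scoreVsSpan.txt
# column 1 is the chain score, column 2 the target bases spanned (tEnd-tStart)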
chainFilter rn4ToHg18Stitch.over.chain -minScore=500000 \ > rn4ToHg18Coarse.over.chain # I tried liftOver with -minMatch=0.5, 0.33, 0.25 and 0.2. These are the # wc -l stats for each run -- not surprisingly, many more matches with # lower minMatch: #0.5: # 1256 hg18.rgdRatQtl.coarse.lo.bed # 998 hg18.rgdRatQtl.coarse.lo.unmapped #0.33: # 6748 hg18.rgdRatQtl.coarse.lo.bed # 92 hg18.rgdRatQtl.coarse.lo.unmapped #0.25: # 9609 hg18.rgdRatQtl.coarse.lo.bed # 36 hg18.rgdRatQtl.coarse.lo.unmapped #0.2: # 10529 hg18.rgdRatQtl.coarse.lo.bed # 30 hg18.rgdRatQtl.coarse.lo.unmapped # I spot-checked by viewing a rat QTL and hg18 chains in rn4, and # eyeballing whether the net track looked like there were solid # matches for large regions. With minMatch=0.25, most mappings # and unmapped looked pretty reasonable, but I still saw a few # (like Alc4) where a nice long chain was not being used, so I # kicked it down to 0.2 and checked again -- looks good. time liftOver -minMatch=0.2 -multiple \ /cluster/data/rn4/bed/rgdQtl/rgdQtl.bed rn4ToHg18Coarse.over.chain \ hg18.rgdRatQtl.coarse.lo.{bed,unmapped} #100.476u 10.925s 1:52.31 99.1% 0+0k 0+0io 0pf+0w wc -l hg18.rgdRatQtl.coarse.lo.{bed,unmapped} # see above. # Many of the records are completely contained within other records # for the same QTL (inversions I suppose) -- they don't really tell # us anything new about the murky QTL region, so merge them in. # NOTE FOR NEXT TIME: instead of the perl+sort, use something like this: # liftOverMerge -mergeGap=10000 hg18.rgdRatQtl.coarse.lo.bed stdout \ # | mergeOverlapBed4.pl - > hg18.rgdRatQtl.coarse.lo.pruned.bed # liftOverMerge joins items separated by small (a relative term) gaps. perl -we \ 'while (<>) { \ chomp; ($chrom, $start, $end, $name) = split; \ push @{$item2coords{"$chrom.$name"}}, [$start, $end]; \ } \ foreach $item (keys %item2coords) { \ @sortedCoords = sort { $a->[0] <=> $b->[0] } @{$item2coords{$item}}; \ ($chrom, $name) = split(/\./, $item); \ ($mergeStart, $mergeEnd) = @{shift @sortedCoords}; \ foreach $rangeRef (@sortedCoords) { \ ($rangeStart, $rangeEnd) = @{$rangeRef}; \ next if ($rangeEnd <= $mergeEnd); \ if ($rangeStart > $mergeEnd) { \ print "$chrom\t$mergeStart\t$mergeEnd\t$name\n"; \ ($mergeStart, $mergeEnd) = ($rangeStart, $rangeEnd); \ } else { \ $mergeEnd = $rangeEnd; \ } \ } \ print "$chrom\t$mergeStart\t$mergeEnd\t$name\n" if ($mergeEnd); \ } \ ' hg18.rgdRatQtl.coarse.lo.bed \ | sort -k1,1 -k2n,2n -k4,4r \ > hg18.rgdRatQtl.coarse.lo.pruned.bed ssh hgwdev cd /cluster/data/hg18/bed/rgdQtl hgLoadBed hg18 rgdRatQtl hg18.rgdRatQtl.coarse.lo.pruned.bed # Just use rn4's non-positional associated info: sed -e 's/rgdQtlLink/rgdRatQtlLink/' ~/kent/src/hg/lib/rgdQtlLink.sql \ > rgdRatQtlLink.sql hgLoadSqlTab hg18 rgdRatQtlLink rgdRatQtlLink.sql \ /cluster/data/rn4/bed/rgdQtl/rgdQtlLink.txt # Make sure there aren't any illegal coords: checkTableCoords -verbose=2 hg18 rgdRatQtl runJoiner.csh hg18 rgdRatQtl #====== more discarded work 10/2/07: ssh kolossus cd /cluster/data/hg18/bed/rgdQtl # Try pslMap with the same filtered chains: time pslMap -swapMap rn4.rgdQtl.psl \ -chainMapFile rn4ToHg18Coarse.over.chain \ hg18.rgdRatQtl.coarse.pm.psl #444.915u 29.914s 11:20.08 69.8% 0+0k 0+0io 0pf+0w wc -l hg18.rgdRatQtl.coarse.pm.psl #10755 hg18.rgdRatQtl.coarse.pm.psl # Again, linecount is comparable to liftOver, but the block-by-block # detail from pslMap creates an enormous file (10GB) even with the # filtered chains. 
# Recover 21G of disk space: rm hg18.rgdRatQtl.psl hg18.rgdRatQtl.coarse.pm.psl #====== end discarded work. ############################################################################# # N-SCAN GENES partial reload (2007-09-26 markd) # reload nscanPasaGene to get fixed names and to fix search criteria # download pasa predictions cd /cluster/data/hg18/bed/nscan/pasa2 wget http://mblab.wustl.edu/predictions/human/hg18_PASA/hg18.pasa.gtf wget http://mblab.wustl.edu/predictions/human/hg18_PASA/hg18.prot.fa bzip2 hg18.* chmod a-w hg18.* ldHgGene -gtf -genePredExt hg18 nscanPasaGene hg18.pasa.gtf.bz2 hgPepPred hg18 generic nscanPasaPep hg18.prot.fa.bz2 rm *.tab # update trackDb to add correct termRegex entries human/hg18/trackDb.ra # push nscanPasaGene nscanPasaPep and trackDb ############################################################################# # Blastz hg18 to J. Craig Venter chrom attempt (DONE - 2007-09-27 - Hiram) ssh kkstore06 screen # use a screen to control this job mkdir /cluster/data/hg18/bed/blastzVenter1.2007-09-27 cd /cluster/data/hg18/bed/blastzVenter1.2007-09-27 cat << '_EOF_' > DEF # human reference vs J. Craig Venter # using -chainMinScore=10000 and -chainLinearGap=medium # during doBlastzChainNet.pl run # parameters on advice from Webb for K and Q # M as in hg18 self, O and E from Q # Y and T as in hg18-panTro2 and mm9-rn4 BLASTZ_K=10000 BLASTZ_M=400 BLASTZ_O=600 BLASTZ_E=150 BLASTZ_Y=15000 BLASTZ_T=2 BLASTZ_Q=/cluster/data/blastz/human_chimp.v2.q # TARGET: Human Hg18 SEQ1_DIR=/san/sanvol1/scratch/hg18/selfNib SEQ1_LEN=/san/sanvol1/scratch/hg18/self.sizes SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY: Venter1 SEQ2_DIR=/iscratch/i/venter1/venter1.unmasked.2bit SEQ2_LEN=/cluster/data/venter1/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 BASE=/cluster/data/hg18/bed/blastzVenter1.2007-09-27 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs time nice -n +19 ~/kent/src/hg/utils/automation/doBlastzChainNet.pl \ -verbose=2 -chainMinScore=10000 -chainLinearGap=medium \ -bigClusterHub=kk -noDbNameCheck DEF > do.log 2>&1 & # real 163m10.634s # this doesn't work, it failed due to mistakenly thinking it was a self # alignment. Plus, we need to do the raw scaffolds, not these fake # chroms. 
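# (Diagnostic sketch, not from the original run: the venter1 pseudo-chroms
# apparently reuse chr* names, and duplicated sequence names between target
# and query are one plausible way a run can look like a self alignment.
# A quick check for name collisions between the two assemblies:)
cut -f1 /cluster/data/hg18/chrom.sizes | sort > /tmp/hg18.seqNames
cut -f1 /cluster/data/venter1/chrom.sizes | sort > /tmp/venter1.seqNames
comm -12 /tmp/hg18.seqNames /tmp/venter1.seqNames | wc -l
# a non-zero count means the two assemblies share sequence names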
#############################################################################
# CONTRAST GENES (2007-10-02 markd)
# received predictions from Sam Gross
cd /cluster/data/hg18/bed/contrastGene/
wget http://www.stanford.edu/~ssgross/contrast.hg18.bed
# this is a custom track, not a pure BED
tail +2 contrast.hg18.bed | hgLoadBed -tab hg18 contrastGene stdin
# verify
# load track db (ra and contrastGene.html are global)
# request push of contrastGene
###########################################################################
# SGP GENES Update (DONE - 2007-10-02 - Hiram)
ssh kkstore02
mkdir /cluster/data/hg18/bed/sgp.2007-10-02
cd /cluster/data/hg18/bed/sgp.2007-10-02
SITE="genome.imim.es/genepredictions/H.sapiens/golden_path_200603_x_mm9"
for C in `cut -f1 ../../chrom.sizes`
do
    wget --timestamping "http://${SITE}/SGP/${C}.gtf" -O ${C}.gtf
    wget --timestamping "http://${SITE}/SGP/${C}.prot" -O ${C}.prot
done
# before reloading the table, measure the previous set:
nice -n +19 featureBits -enrichment hg18 refGene:CDS sgpGene
# refGene:CDS 1.123%, sgpGene 1.272%, both 0.964%, cover 85.83%, enrich 67.47x
nice -n +19 featureBits -enrichment hg18 knownGene:CDS sgpGene
# knownGene:CDS 1.185%, sgpGene 1.272%, both 0.989%, cover 83.43%, enrich 65.58x
# now reload the table
ldHgGene -gtf -genePredExt hg18 sgpGene chr*.gtf
# Read 34023 transcripts in 288520 lines in 49 files
# 34023 groups 46 seqs 1 sources 3 feature types
# 34023 gene predictions
# and now measure this new set
nice -n +19 featureBits -enrichment hg18 refGene:CDS sgpGene
# refGene:CDS 1.123%, sgpGene 1.270%, both 0.964%, cover 85.84%, enrich 67.59x
nice -n +19 featureBits -enrichment hg18 knownGene:CDS sgpGene
# knownGene:CDS 1.185%, sgpGene 1.270%, both 0.988%, cover 83.41%, enrich 65.68x
###########################################################################
# Blastz Orangutan ponAbe2 (DONE - 2007-10-02 - 2007-10-05 - Hiram)
ssh kkstore02
screen # use screen to control this job
mkdir /cluster/data/hg18/bed/blastzPonAbe2.2007-10-02
cd /cluster/data/hg18/bed/blastzPonAbe2.2007-10-02
cat << '_EOF_' > DEF
# Human vs orangutan
BLASTZ_M=50
# TARGET: Human Hg18
SEQ1_DIR=/cluster/bluearc/scratch/data/hg18/nib
SEQ1_LEN=/cluster/data/hg18/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Orangutan ponAbe2
SEQ2_DIR=/cluster/bluearc/scratch/data/ponAbe2/ponAbe2.2bit
SEQ2_LEN=/cluster/data/ponAbe2/chrom.sizes
SEQ2_CHUNK=10000000
SEQ2_LAP=0
BASE=/cluster/data/hg18/bed/blastzPonAbe2.2007-10-02
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
time nice -n +19 doBlastzChainNet.pl -verbose=2 `pwd`/DEF \
    -chainMinScore=3000 -chainLinearGap=medium \
    -bigClusterHub=pk > blastz.log 2>&1 &
# real 388m20.443s
# Completed: 126960 of 126960 jobs
# CPU time in finished jobs: 7068824s 117813.73m 1963.56h 81.82d 0.224 y
# IO & Wait Time: 517624s 8627.07m 143.78h 5.99d 0.016 y
# Average job time: 60s 1.00m 0.02h 0.00d
# Longest finished job: 4940s 82.33m 1.37h 0.06d
# Submission to last job: 62056s 1034.27m 17.24h 0.72d
# some jobs failed (because they were done but parasol didn't realize that)
# after recovery, continuing:
time nice -n +19 doBlastzChainNet.pl -verbose=2 `pwd`/DEF \
    -chainMinScore=3000 -chainLinearGap=medium \
    -continue=cat -bigClusterHub=pk > cat.log 2>&1 &
# real 390m56.934s
cat fb.hg18.chainPonAbe2Link.txt
# 2676696124 bases of 2881515245 (92.892%) in intersection
# And the swap
mkdir /cluster/data/ponAbe2/bed/blastz.hg18.swap
cd /cluster/data/ponAbe2/bed/blastz.hg18.swap
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
/cluster/data/hg18/bed/blastzPonAbe2.2007-10-02/DEF \ -chainMinScore=3000 -chainLinearGap=medium \ -swap -bigClusterHub=pk > swap.log 2>&1 & # real 123m9.197s cat fb.ponAbe2.chainHg18Link.txt # 2824501297 bases of 3093572278 (91.302%) in intersection ############################################################## # NIMH Bipolar Genome Graphs built-in (DONE 2007-10-04 Galt) ssh hgwdev mkdir /cluster/data/hg17/bed/nimhBipolar # I registered and downloaded : wget http://mapgenetics.nimh.nih.gov/BP_POOLING/german_data_share.csv.zip \ --user=galt --password=mypassword wget http://mapgenetics.nimh.nih.gov/BP_POOLING/nimh_data_share.csv.zip \ --user=galt --password=mypassword unzip german_data_share.csv.zip unzip nimh_data_share.csv.zip mkdir chromGraphs tail +2 nimh_data_share.csv | tr -d '"' | gawk -F ',' '{print $1 "\t" $9}' \ > chromGraphs/nimhBipolarUs.cgt tail +2 german_data_share.csv | tr -d '"' | gawk -F ',' '{print $1 "\t" $9}' \ > chromGraphs/nimhBipolarDe.cgt cd chromGraphs/ mkdir hg17 hg18 hgLoadChromGraph -idTable=snpArrayIllumina550 -minusLog10 -pathPrefix=/gbdb/hg17/chromGraph hg17 nimhBipolarUs nimhBipolarUs.cgt \ >& nimhBipolarUs.hg17.errors mv nimhBipolarUs.cgb hg17/ hgLoadChromGraph -idTable=snpArrayIllumina550 -minusLog10 -pathPrefix=/gbdb/hg17/chromGraph hg17 nimhBipolarDe nimhBipolarDe.cgt \ >& nimhBipolarDe.hg17.errors mv nimhBipolarDe.cgb hg17/ hgLoadChromGraph -idTable=snpArrayIllumina550 -minusLog10 -pathPrefix=/gbdb/hg18/chromGraph hg18 nimhBipolarUs nimhBipolarUs.cgt \ >& nimhBipolarUs.hg18.errors mv nimhBipolarUs.cgb hg18/ hgLoadChromGraph -idTable=snpArrayIllumina550 -minusLog10 -pathPrefix=/gbdb/hg18/chromGraph hg18 nimhBipolarDe nimhBipolarDe.cgt \ >& nimhBipolarDe.hg18.errors mv nimhBipolarDe.cgb hg18/ pushd /gbdb/hg17/chromGraph ln -s /cluster/data/hg17/bed/nimhBipolar/chromGraphs/hg17/*.cgb . popd pushd /gbdb/hg18/chromGraph ln -s /cluster/data/hg17/bed/nimhBipolar/chromGraphs/hg18/*.cgb . popd # Add the hack row into metaChromGraph for the composite tracks. hgsql hg17 -e 'insert into metaChromGraph (name, minVal, maxVal, binaryFile) values ("bipolar", 0, 0, "composite")' hgsql hg18 -e 'insert into metaChromGraph (name, minVal, maxVal, binaryFile) values ("bipolar", 0, 0, "composite")' #Add composite track info to src/hg/makeDb/trackDb/human/trackDb.ra: ############################################################################ # MGI MOUSE QTL MAPPED TO HUMAN (DONE 10/10/07 angie) # Use a stringently filtered version of over.chain to do the mapping, # so we only pick up large chunks (targeting >10,000bases) of the # large fuzzy QTL regions. # Of the MGI QTLs, some are large as expected, but most are tiny -- # they have only the peak STS marker coords, no indication of the # range. Jim suggested padding those out to 100k. So I will process # these in two batches, and make subtracks -- one for original, one # for our modified set. ### NOTE FOR NEXT TIME ### ### Use jaxQtl instead of jaxQTL throughout. ssh kolossus mkdir /cluster/data/hg18/bed/jaxQTL cd /cluster/data/hg18/bed/jaxQTL # mm8ToHg18 was built before doBlastz included chainStitchId in the # pipe to create over.chain. Run it here, to repair any chain breaks: chainStitchId /cluster/data/mm8/bed/liftOver/mm8ToHg18.over.chain.gz \ /scratch/tmp/mm8ToHg18Stitch.over.chain # For rn4->hg18 (rgdRatQtl above), I eyeballed scores vs. spans of # stitched chains, to try to find a score threshold over which almost # all spans were at least 10 or 20k, most >50k. 
For mm8->hg18, the # correspondence is not quite so smooth, and in order to keep all spans # >= 100k, the score threshold would have to be 170k (compared to # 500k for rn4-hg18) and would pick up a lot of short chains. # So this time I'll try filtering directly by span instead of score # (but add a reasonable minScore to kick out some outliers). chainFilter /scratch/tmp/mm8ToHg18Stitch.over.chain \ -tMinSize=20000 -qMinSize=20000 -minScore=10000\ > mm8ToHg18Coarse.over.chain # Separate the mm8 jaxQtl's by size and reduce to bed4: awk 'BEGIN{OFS="\t";} \ ($3-$2) < 1000 {s = $2 > 50000 ? $2-50000 : 0; \ print $1, s, $3+50000, $4;}' \ /cluster/data/mm8/bed/jax/2007_09/jaxQtl.bed \ > mm8.jaxQtl.padded.bed cp /dev/null tmp.bed foreach chr (`awk '{print $1;}' /cluster/data/mm8/chrom.sizes`) set size = `awk '$1 == "'$chr'" {print $2;}' /cluster/data/mm8/chrom.sizes` awk 'BEGIN{OFS="\t";} \ $1 == "'$chr'" && $3 > '$size' {$3 = '$size';} \ $1 == "'$chr'" && $3 > $2 {print;}' \ mm8.jaxQtl.padded.bed >> tmp.bed end mv tmp.bed mm8.jaxQtl.padded.bed awk 'BEGIN{OFS="\t";} ($3-$2) > 100000 {print $1, $2, $3, $4;}' \ /cluster/data/mm8/bed/jax/2007_09/jaxQtl.bed \ > mm8.jaxQtl.asIs.bed # Make sure we didn't miss any between those two size ranges (except for # the 4 markers whose coords are completely off the end of mm8 chroms): wc -l mm8.*.bed # 73 mm8.jaxQtl.asIs.bed # 1468 mm8.jaxQtl.padded.bed # 1541 total wc -l /cluster/data/mm8/bed/jax/2007_09/jaxQtl.bed #1545 /cluster/data/mm8/bed/jax/2007_09/jaxQtl.bed # Try liftOver with various -minMatch settings. Compare the number # mapped and unmapped; eyeball some of the unmapped in mm8, see if # the hg18 Nets are truly weak there. foreach minMatch (0.1 0.2 0.25 0.33) time liftOver -minMatch=$minMatch -multiple \ mm8.jaxQtl.asIs.bed mm8ToHg18Coarse.over.chain \ hg18.jaxQTL.asIs.$minMatch.{bed,unmapped} time liftOver -minMatch=$minMatch -multiple \ mm8.jaxQtl.padded.bed mm8ToHg18Coarse.over.chain \ hg18.jaxQTL.padded.$minMatch.{bed,unmapped} wc -l hg18.jaxQTL.*.$minMatch.{bed,unmapped} echo "" end #typical time: 23s for asIs, 45s for padded # 757 hg18.jaxQTL.asIs.0.1.bed # 1471 hg18.jaxQTL.padded.0.1.bed # 0 hg18.jaxQTL.asIs.0.1.unmapped # 54 hg18.jaxQTL.padded.0.1.unmapped # 634 hg18.jaxQTL.asIs.0.2.bed # 1429 hg18.jaxQTL.padded.0.2.bed # 0 hg18.jaxQTL.asIs.0.2.unmapped # 128 hg18.jaxQTL.padded.0.2.unmapped # 532 hg18.jaxQTL.asIs.0.25.bed # 1345 hg18.jaxQTL.padded.0.25.bed # 2 hg18.jaxQTL.asIs.0.25.unmapped # 282 hg18.jaxQTL.padded.0.25.unmapped # 362 hg18.jaxQTL.asIs.0.33.bed # 1146 hg18.jaxQTL.padded.0.33.bed # 8 hg18.jaxQTL.asIs.0.33.unmapped # 670 hg18.jaxQTL.padded.0.33.unmapped # I eyeballed the 0.1 .bed and .unmapped files, and they look # pretty good, esp. for mapped... we could probably get away with # 0.2 for the asIs but 0.1 looks OK. # Many of the records are completely contained within other records # for the same QTL (inversions I suppose) -- they don't really tell # us anything new about the murky QTL region, so merge them in. # NOTE FOR NEXT TIME: try this: # liftOverMerge -mergeGap=10000 hg18.jaxQTL.asIs.0.1.bed stdout \ # | mergeOverlapBed4.pl - > hg18.jaxQTL.asIs.0.1.pruned.bed # liftOverMerge joins items separated by small (a relative term) gaps. 
mergeOverlapBed4.pl hg18.jaxQTL.asIs.0.1.bed \
    > hg18.jaxQTL.asIs.0.1.pruned.bed
mergeOverlapBed4.pl hg18.jaxQTL.padded.0.1.bed \
    > hg18.jaxQTL.padded.0.1.pruned.bed
wc -l hg18.jaxQTL.*.pruned.bed
# 398 hg18.jaxQTL.asIs.0.1.pruned.bed
# 1463 hg18.jaxQTL.padded.0.1.pruned.bed
ssh hgwdev
cd /cluster/data/hg18/bed/jaxQTL
### NOTE FOR NEXT TIME ###
### Call the tables jaxQtl* instead of jaxQTL* -- QA doesn't like jaxQTL.
hgLoadBed hg18 jaxQTLAsIs hg18.jaxQTL.asIs.0.1.pruned.bed
hgLoadBed hg18 jaxQTLPadded hg18.jaxQTL.padded.0.1.pruned.bed
# Make sure there aren't any illegal coords:
checkTableCoords -verbose=2 hg18 jaxQTLAsIs
checkTableCoords -verbose=2 hg18 jaxQTLPadded
runJoiner.csh hg18 jaxQTLAsIs
runJoiner.csh hg18 jaxQTLPadded
# Tables renamed kuhn 10-12-2007
# jaxQTLAsIs to jaxQtlAsIs
# jaxQTLPadded to jaxQtlPadded
###########################################################################
# Build targetScanS track - (DONE - 2007-10-05 - 2007-10-31 - Hiram)
# requested by: George Bell gbell at wi.mit.edu
ssh hgwdev
mkdir -p /cluster/data/hg18/bed/targetScanS
cd /cluster/data/hg18/bed/targetScanS
wget --timestamping \
    http://jura.wi.mit.edu/targetscan/vert_40/ucsc/hg18/hg18ConsChrALL.bed
hgLoadBed hg18 targetScanS -tmpDir=/scratch/tmp hg18ConsChrALL.bed
# Loaded 50764 elements of size 6
featureBits hg18 targetScanS
# 313293 bases of 2881515245 (0.011%) in intersection
################################
# previous attempts listed below
# they don't supply them all, but we don't know which ones they
# don't. So, ask for them all, and remove the files that are empty.
for C in `cut -f1 ../../chrom.sizes | sed -e "s/chr//"`
do
    wget --timestamping \
        "http://jura.wi.mit.edu/targetscan/vert_40/ucsc/NR/hg18ConsChr${C}.bed" \
        -O hg18ConsChr${C}.bed
    if [ ! -s "hg18ConsChr${C}.bed" ]; then
        rm -f "hg18ConsChr${C}.bed"
    fi
done
# Remove the browser/track lines from these custom track files
# and load into the hg18.targetScanS table
egrep -h -v "^browser|^track" hg*.bed | \
    hgLoadBed hg18 targetScanS -tmpDir=/scratch/tmp stdin
# Loaded 50802 elements of size 6
featureBits hg18 targetScanS
# 312951 bases of 2881515245 (0.011%) in intersection
# Create/edit/check in targetScans.html and trackDb.ra under
# kent/src/hg/makeDb/trackDb/human/hg18
###########################################################################
# RE-BUILD WGRNA TRACK (DONE, 2007-10-05, Fan)
ssh hgwdev
cd /cluster/data/hg18/bed
mkdir wgRna-2007-10-05
cd wgRna-2007-10-05
# Received the data file, wgtrack_oct2007.txt (saved from wgtrack_oct2007.doc)
# from Michel Weber's email
# (Michel.Weber at ibcg.biotoul.fr)
# and placed it under /cluster/data/hg18/bed/wgRna-2007-10-05.
cat wg_track_oct2007.txt|sed -e 's/ /\t/g' > wgRna.tab
hgLoadBed -sqlTable=/cluster/home/fanhsu/src/hg/lib/wgRna.sql hg18 wgRna wgRna.tab
#############################################################################
# BLASTZ calJac1 - Marmoset (2007-10-09 kate)
ssh kkstore02
mkdir /cluster/data/hg18/bed/blastz.calJac1.2007-10-07
cd /cluster/data/hg18/bed/blastz.calJac1.2007-10-07
cat << '_EOF_' > DEF
# human vs. marmoset
# dynamic masking param
BLASTZ_M=50
# TARGET: Human Hg18
SEQ1_DIR=/scratch/hg/hg18/nib
SEQ1_LEN=/cluster/data/hg18/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Marmoset (calJac1)
SEQ2_DIR=/san/sanvol1/scratch/calJac1/calJac1.2bit
SEQ2_LEN=/san/sanvol1/scratch/calJac1/chrom.sizes
SEQ2_LIMIT=500
SEQ2_CHUNK=30000000
SEQ2_LAP=0
BASE=/cluster/data/hg18/bed/blastz.calJac1.2007-10-07
'_EOF_'
# << happy emacs
doBlastzChainNet.pl DEF \
    -bigClusterHub pk \
    -chainMinScore=3000 -chainLinearGap=medium >& do.log &
tail -f do.log
# failed at download step due to pre-existing file of Brian's
doBlastzChainNet.pl DEF \
    -bigClusterHub pk -continue=download \
    -chainMinScore=3000 -chainLinearGap=medium >& do2.log &
tail -f do2.log
#########################################################
# RE-BUILD GAD TRACK (Done, 10/17/07, Fan)
mkdir /cluster/store12/gad071011
rm /cluster/data/gad
ln -s /cluster/store12/gad071011 /cluster/data/gad
cd /cluster/data/gad
# Receive "all.txt" from GAD
# contact person: Garner, John (NIH/NIA/IRP) [F] [garnerjr@mail.nih.gov]
hgsql hg18 -e 'drop table gadAll'
hgsql hg18 <~/src/hg/lib/gadAll.sql
hgsql hg18 -e 'load data local infile "all.txt" into table gadAll ignore 3 lines'
# create gad table
gadPos hg18 j18.tmp
cat j18.tmp |sort -u >hg18.gad.tab
hgLoadBed hg18 gad hg18.gad.tab
rm j18.tmp
#########################################################################
# HAPMAP LD (DONE 10/26/07 angie -- phased REDONE 1/30/08)
# Based on Daryl's hg17 work. Data version here is release #22,
# March 2007 (2007-03).
# 1/30/08: HapMap re-released the phased genotypes 1/22/08 -- re-run,
# but without the removal of question marks that we had to do the
# first time around.
# hapmap.org offers ld_data downloads that look like the output of
# makeDcc -- but only for older versions. To get LD for the latest
# release (and for hg18 coords), compute LD from genotype as Daryl did.
############################# unphased ##############################
#*** NOTE FOR NEXT TIME: don't bother with individual CHB and JPT subsets,
#*** {CEU, CHB+JPT, YRI} is what we display.
#*** Actually, if there is a next time, we'll probably just start with
#*** phased and ignore unphased.
ssh kolossus
mkdir -p /san/sanvol1/scratch/hg18/bed/hapmapLd/genotypes_2007-03/run.Haploview
cd /san/sanvol1/scratch/hg18/bed/hapmapLd/genotypes_2007-03
# wget all genotype data:
wget ftp://ftp.hapmap.org/pub/hapmap/public/00README.releasenotes_rel22
wget ftp://ftp.hapmap.org/pub/hapmap/public/genotypes/2007-03/fwd_strand/non-redundant/genotypes_chr\*.txt.gz
# Use latest Haploview to compute LD scores:
wget http://www.broad.mit.edu/mpg/haploview/downloads/Haploview.jar
# Haploview cluster run on whole-chrom genotype files was a bust.
# Even on kki nodes, with java memory maxed out, 47 of 120 jobs crashed
# and one was still running after 5.5 days so I killed it.
# Meanwhile, Daryl suggested using the phased data instead. It is
# not yet available for all chrom/pops, but start with what's there
# to iron out the flow.
# New approach to unphased -> LD -- split, run Haploview, merge.
ssh pk
# Note: although the genotypes_ files are *mostly* sorted by position,
# they're not completely sorted! That can cause splitGenotype.pl to
# screw up (as well as other downstream stuff), so sort them on the way
# into splitGenotype.
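# (Quick illustration of the sort problem described above, not part of the
# original run; position is column 4, the same column runSplit.csh sorts
# on below, and the first line of each file is assumed to be a header:)
foreach f (/san/sanvol1/scratch/hg18/bed/hapmapLd/genotypes_2007-03/genotypes_chr*.txt.gz)
    echo $f
    zcat $f | tail +2 | awk '{print $4;}' | sort -c -n |& tail -1
end
# any "disorder" message from sort -c confirms a file is not fully sorted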
mkdir /san/sanvol1/scratch/hg18/bed/hapmapLd/run.splitUnphased cd /san/sanvol1/scratch/hg18/bed/hapmapLd/run.splitUnphased cat > runSplit.csh <<'_EOF_' #!/bin/csh -ef set f = $1 set base = $f:t:r:r set scriptBin = ~/kent/src/hg/snp/hapmapLd set tmpDir = `mktemp -d -p /scratch/tmp runSplit.XXXXXX` zcat $f \ | sort -k4n,4n \ | $scriptBin/splitGenotype.pl -suffix .txt.gz \ 10000000 250000 $tmpDir/$base mv $tmpDir/$base.* ../splitUnphased/$base/ rmdir $tmpDir '_EOF_' # << emacs chmod a+x runSplit.csh cp /dev/null jobList foreach f (../genotypes_2007-03/genotypes_chr*.txt.gz) mkdir -p ../splitUnphased/$f:t:r:r echo ./runSplit.csh $f >> jobList end para make jobList para time #Completed: 120 of 120 jobs #CPU time in finished jobs: 826s 13.77m 0.23h 0.01d 0.000 y #IO & Wait Time: 457s 7.61m 0.13h 0.01d 0.000 y #Average job time: 11s 0.18m 0.00h 0.00d #Longest finished job: 22s 0.37m 0.01h 0.00d #Submission to last job: 29s 0.48m 0.01h 0.00d # Run Haploview on split files. ssh pk mkdir /san/sanvol1/scratch/hg18/bed/hapmapLd/run.HaploviewSplitUnphased cd /san/sanvol1/scratch/hg18/bed/hapmapLd/run.HaploviewSplitUnphased set scriptBin = ~/kent/src/hg/snp/hapmapLd set hvPath = /san/sanvol1/scratch/hg18/bed/hapmapLd/Haploview.jar # Latest installed java on the cluster nodes (not on the para hub machine): set javaPath = /usr/java/jre1.5.0_12/bin/java set javaMemSize = 1500M find /san/sanvol1/scratch/hg18/bed/hapmapLd/splitUnphased \ -name \*.txt.gz -ls \ | awk '{print $7, $11;}' | sort -nr > filesBySize cp /dev/null jobList foreach f (`awk '{print $2;}' filesBySize`) echo $scriptBin/runHaploview.csh $f $javaPath $hvPath $javaMemSize \ >> jobList end para make jobList para time #Completed: 1493 of 1493 jobs #CPU time in finished jobs: 582015s 9700.25m 161.67h 6.74d 0.018 y #IO & Wait Time: 6558s 109.30m 1.82h 0.08d 0.000 y #Average job time: 394s 6.57m 0.11h 0.00d #Longest finished job: 1711s 28.52m 0.48h 0.02d #Submission to last job: 1740s 29.00m 0.48h 0.02d # Merge Haploview results. 
ssh pk mkdir /san/sanvol1/scratch/hg18/bed/hapmapLd/run.mergeSplitHapLD cd /san/sanvol1/scratch/hg18/bed/hapmapLd/run.mergeSplitHapLD cat > runMerge.csh <<'_EOF_' #!/bin/csh -ef set mapFile = $1 set outFile = $2 set scriptBin = ~/kent/src/hg/snp/hapmapLd set tmpOut = `mktemp -p /scratch/tmp runMerge.XXXXXX` $scriptBin/mergeHaploviewLD.pl $mapFile $tmpOut mv $tmpOut $outFile '_EOF_' # << emacs chmod a+x runMerge.csh mkdir ../mergedUnphasedLD cp /dev/null jobList foreach f (`ls -1S ../splitUnphased/genotypes_chr*/genotypes_chr*.map`) set base = $f:t:r echo ./runMerge.csh $f ../mergedUnphasedLD/$base.txt.LD.gz >> jobList end para make jobList para time #Completed: 120 of 120 jobs #CPU time in finished jobs: 16035s 267.25m 4.45h 0.19d 0.001 y #IO & Wait Time: 17282s 288.03m 4.80h 0.20d 0.001 y #Average job time: 278s 4.63m 0.08h 0.00d #Longest finished job: 737s 12.28m 0.20h 0.01d #Submission to last job: 738s 12.30m 0.20h 0.01d # Compare results of unsplit run with split/merge: ssh kolossus cd /san/sanvol1/scratch/hg18/bed/hapmapLd # Compare SNP pairs: zcat genotypes_2007-03/genotypes_chr9_JPT_r22_nr.b36_fwd.txt.LD.gz \ | awk '{print $1, $2;}' > /tmp/1 zcat mergedUnphasedLD/genotypes_chr9_JPT_r22_nr.b36_fwd.txt.LD.gz \ | awk '{print $1, $2;}' > /tmp/2 wc -l /tmp/1 /tmp/2 # 32514982 /tmp/1 # 32514982 /tmp/2 cmp /tmp/1 /tmp/2 # Compare entire files: zcat genotypes_2007-03/genotypes_chr9_JPT_r22_nr.b36_fwd.txt.LD.gz > /tmp/1 zcat mergedUnphasedLD/genotypes_chr9_JPT_r22_nr.b36_fwd.txt.LD.gz > /tmp/2 head /tmp/1 /tmp/2 cmp /tmp/1 /tmp/2 # Woohoo! ############################# phased ############################## # For this build, Daryl suggested using the phased data (output of # Jonathan Marchini's PHASE program) instead of raw genotype data ssh kolossus mkdir -p /san/sanvol1/scratch/hg18/bed/hapmapLd/phased_2007-08_rel22 cd /san/sanvol1/scratch/hg18/bed/hapmapLd cd /san/sanvol1/scratch/hg18/bed/hapmapLd/phased_2007-08_rel22 # 1/30/08: re-run from this point on, to pick up re-release (same URL) wget --timestamping \ ftp://ftp.hapmap.org/pub/hapmap/public/phasing/2007-08_rel22/phased/\*.gz # Downstream stuff depends on the inputs being sorted by position -- check: cd /san/sanvol1/scratch/hg18/bed/hapmapLd foreach f (phased_2007-08_rel22/*_legend.txt.gz) echo $f zcat $f | tail +2 | awk '{print $2;}' > /tmp/1 sort -n /tmp/1 > /tmp/2 cmp /tmp/1 /tmp/2 end rm -f /tmp/1 /tmp/2 # kki cluster run -- need lots of memory! more than pk's 2G hard limit. 
# (would use memk but it doesn't have java and kki is sufficient) ssh kki mkdir /san/sanvol1/scratch/hg18/bed/hapmapLd/run.HaploviewPhased cd /san/sanvol1/scratch/hg18/bed/hapmapLd/run.HaploviewPhased set scriptBin = $HOME/kent/src/hg/snp/hapmapLd set hv = $scriptBin/runHaploviewPhased.csh set phaseDir = /san/sanvol1/scratch/hg18/bed/hapmapLd/phased_2007-08_rel22 set hvPath = /san/sanvol1/scratch/hg18/bed/hapmapLd/Haploview.jar # Latest installed java on the cluster nodes (not on the para hub machine): set javaPath = /usr/java/jre1.5.0_12/bin/java set javaMemSize = 4G # Sort by size (descending) to kick off the biggest jobs first: cp /dev/null jobList foreach f (`ls -1S $phaseDir/genotypes_chr*.phase.gz`) echo $hv $f:r:r $javaPath $hvPath $javaMemSize >> jobList end para make jobList para time #Completed: 66 of 66 jobs #CPU time in finished jobs: 406845s 6780.76m 113.01h 4.71d 0.013 y #IO & Wait Time: 1517s 25.28m 0.42h 0.02d 0.000 y #Average job time: 6187s 103.12m 1.72h 0.07d #Longest finished job: 15667s 261.12m 4.35h 0.18d #Submission to last job: 29868s 497.80m 8.30h 0.35d # Our software assumes that LD scores are given for consecutive SNPs # without gaps in between, so scores in the encoded lists can be # associated with other SNPs just by their position in the list. # Make sure that's the case! I suspect this also depends on the # inputs to Haploview being sorted by position -- checked those above. ssh kolossus cd /san/sanvol1/scratch/hg18/bed/hapmapLd cp /dev/null checkLD.log foreach f ( mergedUnphasedLD/*.LD.gz phased_2007-08_rel22/*.LD.gz ) echo $f >> checkLD.log $scriptBin/checkLDSnpOrder.pl $f >>& checkLD.log echo "" >> checkLD.log date end # Takes a long time (~4 minutes for 184 files -> 11-12 hours) -- # left to run overnight. # Cluster run to translate Haploview .LD output into the DCC's # ld_data downloads format, and in turn into our bed4+ format. 
ssh pk mkdir /san/sanvol1/scratch/hg18/bed/hapmapLd/{dcc,bed}{Phased,Unphased} mkdir /san/sanvol1/scratch/hg18/bed/hapmapLd/run.formatUnphased cd /san/sanvol1/scratch/hg18/bed/hapmapLd/run.formatUnphased cat > runFormatsUnphased.csh <<'_EOF_' #!/bin/csh -ef set base = $1 set db = hg18 set scriptBin = ~/kent/src/hg/snp/hapmapLd set hapDir = /san/sanvol1/scratch/$db/bed/hapmapLd set unphDir = $hapDir/genotypes_2007-03 set unphLDDir = $hapDir/mergedUnphasedLD set dccOut = `echo $base | sed -e 's/^genotypes_/ld_/; s/$/.txt.gz/;'` set chr = `echo $base | perl -wpe 's/^.*_(chr[0-9MXY]+)_.*/$1/'` set pop = `echo $base | perl -wpe 's/^.*_chr[0-9MXY]+_([A-Z+]+)_.*/$1/'` set bedOut = $db.${pop}_$chr.bed.gz $scriptBin/makeDccAndLdBed.pl \ $unphDir/$base.txt.gz $unphLDDir/$base.txt.LD.gz \ $hapDir/dccUnphased/$dccOut $hapDir/bedUnphased/$bedOut '_EOF_' # << emacs chmod a+x runFormatsUnphased.csh cp /dev/null jobList foreach f (`ls -1S ../mergedUnphasedLD/genotypes_chr*.txt.LD.gz`) echo ./runFormatsUnphased.csh $f:t:r:r:r >> jobList end para make jobList para time #Completed: 120 of 120 jobs #CPU time in finished jobs: 101968s 1699.46m 28.32h 1.18d 0.003 y #IO & Wait Time: 0s 0.00m 0.00h 0.00d 0.000 y #Average job time: 847s 14.11m 0.24h 0.01d #Longest finished job: 2276s 37.93m 0.63h 0.03d #Submission to last job: 2276s 37.93m 0.63h 0.03d mkdir /san/sanvol1/scratch/hg18/bed/hapmapLd/run.formatPhased cd /san/sanvol1/scratch/hg18/bed/hapmapLd/run.formatPhased cat > runFormatsPhased.csh <<'_EOF_' #!/bin/csh -ef set basePath = $1 set base = $basePath:t set db = hg18 set scriptBin = ~/kent/src/hg/snp/hapmapLd set hapDir = /san/sanvol1/scratch/$db/bed/hapmapLd set dccOut = `echo $base | sed -e 's/^genotypes_/ld_/; s/$/.txt.gz/;'` set chr = `echo $base | perl -wpe 's/^.*_(chr[0-9MXY]+)_.*/$1/'` set pop = `echo $base | perl -wpe 's/^.*_chr[0-9MXY]+_([A-Z+]+)_.*/$1/'` set bedOut = $db.${pop}_$chr.bed.gz $scriptBin/makeDccAndLdBed.pl ${basePath}_legend.txt.gz $basePath.LD.gz \ $hapDir/dccPhased/$dccOut $hapDir/bedPhased/$bedOut '_EOF_' # << emacs chmod a+x runFormatsPhased.csh cp /dev/null jobList foreach f (`ls -1S ../phased_2007-08_rel22/genotypes_chr*.LD.gz`) echo ./runFormatsPhased.csh $f:r:r >> jobList end para make jobList para time #Completed: 66 of 66 jobs #CPU time in finished jobs: 66155s 1102.58m 18.38h 0.77d 0.002 y #IO & Wait Time: 0s 0.00m 0.00h 0.00d 0.000 y #Average job time: 972s 16.20m 0.27h 0.01d #Longest finished job: 2292s 38.20m 0.64h 0.03d #Submission to last job: 2292s 38.20m 0.64h 0.03d # Create empty tables, then load one pop_chr at a time in order # to avoid thrashing. # hg17 took about half an hour to an hour per population on hgwdev. # Load on kolossus, then ask cluster-admin to rsync to hgwdev. ssh kolossus cd /san/sanvol1/scratch/hg18/bed/hapmapLd cat > loadOne.csh <<'_EOF_' #!/bin/csh -ef set tableBase = $1 set Pop = $2 set bedDir = $3 set table = $tableBase$Pop hgsql hg18 -e "drop table if exists $table;" sed "s/ld2/$table/" $HOME/kent/src/hg/lib/ld2.sql \ | hgsql hg18 set pop = `echo $Pop | perl -wpe 's/ChbJpt/JPT+CHB/; tr/a-z/A-Z/;'` foreach c (1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 X Y) set bed = $bedDir/hg18.${pop}_chr$c.bed.gz if (-e $bed) then echo $bed hgLoadBed -noSort -oldTable hg18 $table $bed else echo "\n$bed does not exist\n" endif echo "" end echo -n "\nDone with $table. 
"; date '_EOF_' # << emacs chmod a+x loadOne.csh # phased: cp /dev/null loadPhased.log foreach Pop (Ceu ChbJpt Yri) ./loadOne.csh hapmapLdPh $Pop bedPhased >>& loadPhased.log end # ~16 minutes for all phased on kolossus # 1/30/08: ~11 minutes for all phased on hgwdev! bg load ~1.25 # unphased: cp /dev/null loadUnphased.log foreach Pop (Ceu Chb ChbJpt Jpt Yri) ./loadOne.csh hapmapLd $Pop bedUnphased >>& loadUnphased.log end # ~21 minutes -- got segfaults for empty gzipped chrY files, debug later. rm -f bed.tab # Repeat hg17 sanity checks on the unphased results. ssh pk mkdir /san/sanvol1/scratch/hg18/bed/hapmapLd/run.maxDist cd /san/sanvol1/scratch/hg18/bed/hapmapLd/run.maxDist # Find the largest distance between any paired SNPs in DCC ld_* files. # Should be 249999 or less. Also count the number of unique starting # coords. We can compare those to the SNP counts in checkLD.log. cat > runMaxDist.csh <<'_EOF_' #!/bin/csh -ef set dccIn = $1 set out = $dccIn:r:r.check echo -n "$dccIn:t " > $out zcat $dccIn \ | awk '{if ($2-$1>max) max=$2-$1} \ {if (prevStart && $1 != prevStart) count++; prevStart = $1;} \ END {print max "\t" count; \ if (max > 249999) print "ERROR: maxDistance too large!";}' \ >> $out '_EOF_' # << emacs chmod a+x runMaxDist.csh cp /dev/null jobList foreach f (../dccUnphased/ld_*.txt.gz) echo ./runMaxDist.csh $f >> jobList end para make jobList para time #Completed: 120 of 120 jobs #CPU time in finished jobs: 12274s 204.56m 3.41h 0.14d 0.000 y #IO & Wait Time: 4137s 68.96m 1.15h 0.05d 0.000 y #Average job time: 137s 2.28m 0.04h 0.00d #Longest finished job: 365s 6.08m 0.10h 0.00d #Submission to last job: 365s 6.08m 0.10h 0.00d cd .. cat dccUnphased/*.check > maxDist.txt grep -B1 ERROR maxDist.txt # Other cleanup: rm -r splitUnphased ######################################################################### # University of Uppsala, Sweden Chip-chip (2007-10-18 kate) # 3 datasets (Usf1, Usf2, H3ac) -- wiggle and bed for each, in hg16 coords # Submitted by Adam Ameur ssh kkstore02 cd /cluster/data/hg18/bed mkdir uppsalaChip cd uppsalaChip foreach f (H3ac Usf1 Usf2) #wget -nd http://www.lcb.uu.se/~mada/UUtracks_hg16/${f}_hg16.wig.gz wget -nd http://www.lcb.uu.se/~mada/UUtracks_hg16/${f}_hg16.bed end wget -nd http://www.lcb.uu.se/~mada/UUtracks_hg16/UCSCdescription.html # lift to hg18 foreach f (lab/*hg16.bed) set b = `echo $f:t | sed 's/_.*//'` echo $b tail +2 $f | \ liftOver stdin \ /cluster/data/hg16/bed/liftOver/hg16ToHg18.over.chain.gz \ $b.bed $b.bed.unmapped end ssh kolossus cd /cluster/data/hg18/bed cd uppsalaChip # remove duplicate regions resulting from liftOver cat > trimDups.awk << 'EOF' BEGIN {chr=""; start="";} { if (!(($1 == chr) && ($2 == start))) print; chr = $1; start = $2; } 'EOF' # process in 2 unix pipelines, so as not to overload machine cat > load.csh << 'EOF' foreach f (lab/*hg16.wig.gz) set b = `echo $f:t | sed 's/_.*//'` echo $b date nice zcat $f | tail +2 | \ nice varStepToBedGraph.pl stdin | \ nice liftOver stdin \ /cluster/data/hg16/bed/liftOver/hg16ToHg18.over.chain.gz \ $b.wigBed $b.wigBed.unmapped nice bedSort $b.wigBed stdout | \ nice awk -f trimDups.awk | \ nice wigEncode stdin $b.wig $b.wib date end 'EOF' csh load.csh >&! load.log & # approx. 
50 minutes to process the 3 datasets # load bed and wiggles into database ssh hgwdev cd /cluster/data/hg18/bed/uppsalaChip cat > load2.csh << 'EOF' foreach f (*.wig) set b = $f:r echo $b date set table = uppsalaChip${b}Sites hgLoadBed hg18 $table $b.bed set table = uppsalaChip${b}Signal ln -s /cluster/data/hg18/bed/uppsalaChip/$b.wib /gbdb/hg18/wib/uppsalaChip${b}Signal.wib hgLoadWiggle hg18 $table $f date end 'EOF' csh load2.csh >&! load2.log & # just a few minutes runtime # somehow 2 beds were left out above (lifted files were missing) cat > loadBed.csh << 'EOF' foreach f (*.bed) set b = $f:r echo $b hgLoadBed hg18 uppsalaChip${b}Sites $f end 'EOF' # << emacs csh loadBed.csh >& loadBed.log & # data distribution textHistogram H3ac.wigBed -minVal=-2 -real -col=4 -binSize=.5 -2.000000 611 -1.500000 5711 -1.000000 * 391229 -0.500000 ************************************************************ 21240336 0.000000 ******************************************************* 19325712 0.500000 ** 689267 1.000000 99083 1.500000 24453 2.000000 4635 2.500000 635 3.000000 49 3.500000 3 =4.000000 562 ######################################################################### # BLASTZ Zebrafish danRer5 (DONE - 2007-10-18 - Hiram) ssh kkstore02 mkdir /cluster/data/hg18/bed/blastzDanRer5.2007-10-17 cd /cluster/data/hg18/bed/blastzDanRer5.2007-10-17 cat << '_EOF_' > DEF # Human (hg18) vs zebrafish (danRer5) BLASTZ_Y=3400 BLASTZ_L=6000 BLASTZ_K=2200 BLASTZ_M=50 BLASTZ_Q=/cluster/data/blastz/HoxD55.q # TARGET: Human Hg18 SEQ1_DIR=/scratch/data/hg18/nib SEQ1_LEN=/cluster/data/hg18/chrom.sizes SEQ1_CHUNK=10000000 SEQ1_LAP=0 # QUERY - zebrafish (danRer5) SEQ2_DIR=/scratch/data/danRer5/danRer5.2bit SEQ2_LEN=/cluster/data/danRer5/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LIMIT=50 SEQ2_LAP=0 BASE=/cluster/data/hg18/bed/blastzDanRer5.2007-10-17 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs time nice -n +19 doBlastzChainNet.pl DEF -chainMinScore=5000 \ -chainLinearGap=loose -bigClusterHub=pk -verbose=2 > do.log 2>&1 & # real 369m20.490s cat fb.hg18.chainDanRer5Link.txt # 73923439 bases of 2881515245 (2.565%) in intersection mkdir /cluster/data/danRer5/bed/blastz.hg18.swap cd /cluster/data/danRer5/bed/blastz.hg18.swap time nice -n +19 doBlastzChainNet.pl \ -chainMinScore=5000 \ /cluster/data/hg18/bed/blastzDanRer5.2007-10-17/DEF \ -swap -chainLinearGap=loose -bigClusterHub=pk -verbose=2 \ > swap.log 2>&1 & # real 11m35.536s cat fb.danRer5.chainHg18Link.txt # 74166352 bases of 1435609608 (5.166%) in intersection ######################################################################### # Vista Enhancers (2007-10-18, conodera) # see also /projects/compbiousr/wet/browser/vista_enhancer/17Oct2007/Makefile # # download data file from the vista browser (coordinates are for hg17) # http://enhancer.lbl.gov/cgi-bin/imagedb.pl?show=1;search.result=yes;form=search;search.form=no;action=search;search.sequence=1 # save as enhancerbrowser.datadownload.txt cd /projects/compbiousr/wet/browser/vista_enhancer/ # liftOver hg17 file liftOver vista_enhancer.hg17.bed /cluster/data/hg17/bed/liftOver/hg17ToHg18.over.chain.gz vista_enhancer.hg18.bed vista_enhancer.hg17ToHg18.unMapped hgLoadBed hg18 vistaEnhancers vista_enhancer.hg18.bed ############################################################################ # Reload CCDS (ccdsInfo, ccdsGene, ccdsKgMap) (2007-10-30 markd) cd /cluster/data/genbank/data/ccds/ ftp ftp-private.ncbi.nih.gov (user ccds, needs password) get CCDS.20071030.tar.gz mkdir /scratch/tmp/ccds cd /scratch/tmp/ccds tar -zxf 
/cluster/data/genbank/data/ccds/CCDS.20071030.tar.gz # import ccds database tables /cluster/data/genbank/bin/x86_64/ccdsImport ccds data/*.txt # create and load ccdsGene and ccdsInfo tables from imported database /cluster/data/genbank/bin/x86_64/ccdsMkTables -loadDb ccds hg18 ccdsInfo ccdsGene /cluster/data/genbank/bin/x86_64/mkCcdsGeneMap -db=hg18 -loadDb ccdsGene knownGene ccdsKgMap checkTableCoords hg18 -verbose=2 ccdsGene # update all.jointer to include hg18 in ccdsDb joinerCheck -database=hg18 -identifier=ccdsGeneId ~/compbio/genbank/kent/src/hg/makeDb/schema/all.joiner # build initial version of ccdsMgcMap table, updated by nightly genbank update /cluster/data/genbank/bin/x86_64/mkCcdsGeneMap -loadDb -db=hg18 ccdsGene mgcGenes ccdsMgcMap # request push of ccdsGene ccdsInfo ccdsKgMap ccdsMgcMap # << emacs ######################################################################### # Load ENSEMBL ver 45 (2007-09-5 markd) mkdir /cluster/data/hg18/bed/ensembl45 cd /cluster/data/hg18/bed/ensembl45 ## # need to find bounds of haplotype chromosomes ## # get unmasked haplotype pseudochroms from ensemble (dna, NOT dna_rm) wget ftp://ftp.ensembl.org/pub/current_homo_sapiens/data/fasta/dna/ Homo_sapiens.NCBI36.46.dna.chromosome.c22_H2.fa.gz Homo_sapiens.NCBI36.46.dna.chromosome.c5_H2.fa.gz Homo_sapiens.NCBI36.46.dna.chromosome.c6_COX.fa.gz Homo_sapiens.NCBI36.46.dna.chromosome.c6_QBL.fa.gz # get gap locations and create hap.lift foreach f ( *.fa.gz ) faGapLocs $f $f:r:r.lift end # build lift file for randons and haps (mkRandomNTLift hg18 && cat hap.lift) > randHap.lift # load ensembl genes hgLoadEnsembl -l randHap.lift -p homo_sapiens core_45_36g hg18>&log # got 1 genes with CDS exons with no frame: ENST00000374459 # add this to problem ids and rerun hgLoadEnsembl -l randHap.lift -f problem.ids homo_sapiens core_45_36g hg18>&log # load pseudogenes hgLoadEnsembl -l randHap.lift -p homo_sapiens core_45_36g hg18>&log # got 3 pseudogenes with CDS bounds outside of exons ENST00000342841 ENST00000361218 ENST00000388856 # add this to problem ids and rerun hgLoadEnsembl -l randHap.lift -f problem.ids -p homo_sapiens core_45_36g hg18>&log # vega code is not working in robert's scripts. 
# done to support CCDS; push not requested awaiting resolution of vega # stuff ######################################################################### # AFFY TRANSCRIPTOME PHASE 3 (2007-11-06, Andy) ssh hgwdev bash cd /san/sanVol1/scratch/andy/transcriptome mkdir splits cd originalWigs/ for f in *.wigVar; do table=${f%.wigVar}; mkdir ../splits/$table grep -v "^track" $f | splitWig stdin 1000000 ../splits/${table}/split echo Done with $table done # Done with cluster run ssh kolossus cd /san/sanVol1/scratch/andy/transcriptome/lift/bed for tab in *; do for split in ${tab}/*; do cat $split >> ${tab}.bed done echo done catting $tab done # Split into chrom beds (with a cluster run) for f in `ls -1 hg18.bed`; do tab=${f%.bed}; for c in `cut -f1 chrom.sizes`; do cfile=hg18.bed.chromSplit/${tab}.${c}.bed; outFile=hg18.wigVar/${tab}.wigVar; if [ -e $cfile ]; then echo variableStep chrom=${c} span=1 >> $outFile; bedSort $cfile stdout | awk 'BEGIN{FS="\t"}{print $2+1, $4;}' | awk -f noDupe.awk >> $outFile; echo Added $cfile to $outFile >> the.log; fi; done; echo DONE with $tab >> the.log; wigEncode $outFile hg18.wigVar/${tab}.wig hg18.wigVar/${tab}.wib >> the.log; gzip $outFile done cd hg18.wigVar/ mkdir -p /cluster/data/hg18/bed/affyTxnPhase3/wib for f in *.wib; do echo copying $f...; cp $f /cluster/data/hg18/bed/affyTxnPhase3/wib/; done pushd /gbdb/hg18/wib ln -s /cluster/data/hg18/bed/affyTxnPhase3/wib/* . popd mkdir -p /cluster/data/hg18/bed/affyTxnPhase3/downloads cp *.wigVar.gz /cluster/data/hg18/bed/affyTxnPhase3/downloads mkdir -p /usr/local/apache/htdocs/goldenPath/hg18/affyTxnPhase3 pushd /usr/local/apache/htdocs/goldenPath/hg18/affyTxnPhase3 ln -s /cluster/data/hg18/bed/affyTxnPhase3/downloads/* . for f in *Strand*; do mv $f sRNA.$f; done for f in affyTxnPhase3*; do mv $f lRNA.$f; done ######################################################################### # Blastz Marmoset calJac1 (DONE - 2007-11-09 - Hiram) ## this is not necessary - already done by Kate in October ssh kkstore06 screen # use screen to control this job mkdir /cluster/data/hg18/bed/blastzCalJac1.2007-11-09 cd /cluster/data/hg18/bed/blastzCalJac1.2007-11-09 cat << '_EOF_' > DEF # Human vs marmoset BLASTZ_M=50 # TARGET: Human Hg18 SEQ1_DIR=/cluster/bluearc/scratch/data/hg18/nib SEQ1_LEN=/cluster/data/hg18/chrom.sizes SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY: Marmoset calJac1 SEQ2_DIR=/cluster/bluearc/scratch/data/calJac1/calJac1.2bit SEQ2_LEN=/cluster/data/calJac1/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 BASE=/cluster/data/hg18/bed/blastzCalJac1.2007-11-09 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs time nice -n +19 doBlastzChainNet.pl -verbose=2 `pwd`/DEF \ -chainMinScore=3000 -chainLinearGap=medium \ -bigClusterHub=pk > blastz.log 2>&1 & # real 542m2.359s # Completed: 230805 of 230805 jobs # CPU time in finished jobs: 7279638s 121327.30m 2022.12h 84.26d 0.231 y # IO & Wait Time: 831303s 13855.05m 230.92h 9.62d 0.026 y # Average job time: 35s 0.59m 0.01h 0.00d # Longest finished job: 972s 16.20m 0.27h 0.01d # Submission to last job: 20572s 342.87m 5.71h 0.24d cat fb.hg18.chainCalJac1Link.txt # 2236493373 bases of 2881515245 (77.615%) in intersection ########################################################################### # LIFT RM ALIGN FILES, MAKE PER-CHROM DOWNLOADS (DONE 12/7/07 angie) # Lifting of .align files is now automated by doRepeatMasker.pl, but we # got a user request for .align files from this pre-automation db. ssh kkstore02 cd /cluster/data/hg18 mkdir downloads/RMalign foreach c (?{,?} ?{,?}_*hap?) 
echo linking/lifting to contigs of $c:t foreach ctgdir ($c/N[TC]_??????) set nt = $ctgdir:t if (! -f $ctgdir/$nt.fa.align) then pushd $ctgdir liftRMAlign.pl $nt.lft > $nt.fa.align popd endif ln -s $nt/$nt.fa.align $c/ end set chr = chr$c:t if (-e $c/lift/ordered.lft && ! -z $c/lift/ordered.lft) then echo lifting contigs to chr$c liftRMAlign.pl $c/lift/ordered.lft \ | gzip -c > downloads/RMalign/$chr.fa.align.gz endif if (-e $c/lift/random.lft && ! -z $c/lift/random.lft) then echo lifting contigs to chr${c}_random liftRMAlign.pl $c/lift/random.lft \ | gzip -c > downloads/RMalign/${chr}_random.fa.align.gz endif end md5sum downloads/RMalign/*.gz > downloads/RMalign/md5sum.txt ssh hgwdev ln -s /cluster/data/hg18/downloads/RMalign \ /usr/local/apache/htdocs/goldenPath/hg18/ ######################################################################### # ADD LINKS TO GENETESTS ON hgGene DETAILS PAGE (DONE 12/12/07 Fan) ssh hgwdev cd /cluster/store11/gs.19/build36/bed mkdir geneTests cd geneTests # paste the 3 cols gene list from GeneTest web site into file geneTests.lis cut -f 1 geneTests.lis >j1 cut -f 2 geneTests.lis >j2 cut -f 3 geneTests.lis >j3 cat j1 j2 j3 |sort -u >geneTests.tab rm j1 j2 j3 hgsql hg18 -e 'drop table geneTests' hgsql hg18 < ~/src/hg/lib/geneTests.sql hgsql hg18 -e 'load data local infile "geneTests.tab" into table geneTests ignore 1 lines' # the list is independent of hg18, so load it into hg17 too. hgsql hg17 -e 'drop table geneTests' hgsql hg17 < ~/src/hg/lib/geneTests.sql hgsql hg17 -e 'load data local infile "geneTests.tab" into table geneTests ignore 1 lines' ########################################################################### # ADD SeattleSNPs PGA GENES ON hgGene DETAILS PAGE. (DONE, Fan, 12/13/07). cd /cluster/store12/snp mkdir pga cd pga # download data from SeattleSNPs wget --timestamping http://pga.gs.washington.edu/data.tar.gz gzip -d *.gz tar -xvf *.tar # create SeattleSNPs PGA gene list cut -f 1 FinishedGenes.txt >j1 cut -f 2 FinishedGenes.txt >j2 cat j1 j2 |sort -u >pga.tab rm j1 j2 # load the data into the pga table. hgsql hg18 -e 'drop table pga' hgsql hg18 < ~/src/hg/lib/pga.sql hgsql hg18 -e 'load data local infile "pga.tab" into table pga' ########################################################################### # Reload CCDS (2007-12-12 markd) # import ccds database as described in ccds.txt set db=hg18 # create and load ccdsGene and ccdsInfo tables from imported database /cluster/data/genbank/bin/x86_64/ccdsMkTables -loadDb ccds ${db} ccdsInfo ccdsGene # ccdsKgMap /cluster/data/genbank/bin/x86_64/mkCcdsGeneMap -db=${db} -loadDb ccdsGene knownGene ccdsKgMap # build initial version of ccdsMgcMap table, updated by nightly genbank update /cluster/data/genbank/bin/x86_64/mkCcdsGeneMap -db=${db} -loadDb ccdsGene mgcGenes ccdsMgcMap checkTableCoords ${db} -verbose=2 ccdsGene # update all.jointer to include ${db} in ccdsDb joinerCheck -database=${db} -identifier=ccdsGeneId ~/compbio/genbank/kent/src/hg/makeDb/schema/all.joiner # request push of ccdsGene ccdsInfo ccdsKgMap ccdsMgcMap ############################################################################ # dbSNP BUILD 128 (DONE 1/22/08 angie) # updated snp128ExceptionDesc (tweaked wording) 3/7/08 # 8/7/08: Regenerated snp128.sql with only those enum/set values that are # actually used (except always keep unknown, the default) and reloaded snp128. # No data change -- just the sql field definitions for enums and sets. 
# QA NOTE: used sudo mytouch on the snp128 table to reset the timestamp to # .2008-01-22 00:00:00 (was .2008-08-07 16:08:27 after Angie's re-load) in # order to keep joinerCheck happy and avoid confusion. (8/8/08 brooke) # Set up build directory ssh kkstore06 mkdir -p /cluster/store3/dbSNP128/{human,shared} ln -s /cluster/store3/dbSNP128 /cluster/data/dbSNP/128 # Get field encodings -- if there are changes or additions to the # encoding of the corresponding fields, you might need to update # snpNcbiToUcsc, hgTracks, hgc and hgTrackUi (see also # hg/lib/snp125Ui.c). cd /cluster/data/dbSNP/128/shared set ftpSnpDb = ftp://ftp.ncbi.nih.gov/snp/organisms/human_9606/database wg $ftpSnpDb/shared_data/LocTypeCode.bcp.gz wg $ftpSnpDb/shared_data/SnpClassCode.bcp.gz wg $ftpSnpDb/shared_data/SnpFunctionCode.bcp.gz wg $ftpSnpDb/shared_data/SnpValidationCode.bcp.gz # Here is another source -- it is not as up-to-date as the above, but # our encodings (enums and sets in snp128.sql) are named more similar # to those in the 2005 ASN: # ftp://ftp.ncbi.nih.gov/snp/specs/docsum_2005.asn ########################## DOWNLOAD ############################# cd /cluster/data/dbSNP/128/human mkdir data schema rs_fasta # Get data from NCBI (anonymous FTP) wget ftp://ftp.ncbi.nih.gov/snp/00readme.txt cd /cluster/data/dbSNP/128/human/data alias wg wget --timestamping set ftpSnpDb = ftp://ftp.ncbi.nih.gov/snp/organisms/human_9606/database # ContigLoc table has coords, orientation, loc_type, and refNCBI allele wg $ftpSnpDb/organism_data/b128_SNPContigLoc_36_2.bcp.gz wg $ftpSnpDb/organism_data/b128_SNPContigLocusId_36_2.bcp.gz wg $ftpSnpDb/organism_data/b128_ContigInfo_36_2.bcp.gz # MapInfo has alignment weights wg $ftpSnpDb/organism_data/b128_SNPMapInfo_36_2.bcp.gz # SNP has univar_id, validation status and heterozygosity wg $ftpSnpDb/organism_data/SNP.bcp.gz # Get schema cd /cluster/data/dbSNP/128/human/schema wg $ftpSnpDb/organism_schema/human_9606_table.sql.gz # Get fasta files # using headers of fasta files for molType, class, observed cd /cluster/data/dbSNP/128/human/rs_fasta wg ftp://ftp.ncbi.nih.gov/snp/organisms/human_9606/rs_fasta/\*.gz ########################## LOAD NCBI TABLES ############################# # Simplify names of data files -- strip version & extras to get # local canonical table names. cd /cluster/data/dbSNP/128/human/data foreach f (*.bcp.gz) set new = `echo $f \ | sed -e 's/^b128_SNP//; s/^b128_//; s/_36_2//; s/.bcp//;'` mv $f $new echo $new end # Extract just the tables that we need from the NCBI msSQL table # creation file, and get CREATE statements from # human_9606_table.sql for our 5 tables cd /cluster/data/dbSNP/128/human/schema zcat human_9606_table.sql.gz \ | perl -we '$/ = "\nGO\n\n\n\n"; \ while (<>) { \ next unless /^CREATE TABLE \[(b128_(SNP)?)?(ContigInfo|ContigLoc|ContigLocusId|MapInfo|SNP)(_36_2)?\]/; \ s/b128_(SNP)?//; s/_36_2//; \ s/[\[\]]//g; s/GO\n\n\n/;/; s/smalldatetime/datetime/g; \ s/ON PRIMARY//g; s/COLLATE//g; s/Latin1_General_BIN//g; \ s/IDENTITY (1, 1) NOT NULL /NOT NULL AUTO_INCREMENT, PRIMARY KEY (id)/g; \ s/nvarchar/varchar/g; s/set quoted/--set quoted/g; \ s/(image|varchar\s+\(\d+\))/BLOB/g; \ print; \ }' \ > table.sql # load on kolossus or a small cluster machine (mysql5 is OK for this). 
ssh kolossus hgsql '' -e 'create database hg18snp128' cd /cluster/data/dbSNP/128/human/schema hgsql hg18snp128 < table.sql cd ../data foreach t (ContigInfo ContigLoc ContigLocusId MapInfo SNP) zcat $t.gz \ | perl -wpe 's/(\d\d:\d\d:\d\d)\.0/$1/g;' \ | hgLoadSqlTab -oldTable hg18snp128 $t placeholder stdin end # There were some warnings (many cleared up by the perl substitution) # but no rows were dropped. I eyeballed a few examples, seemed OK. foreach t (ContigInfo ContigLoc ContigLocusId MapInfo SNP) echo -n "${t}:\t" hgsql -N -B hg18snp128 -e 'select count(*) from '$t end #ContigInfo: 7067 #ContigLoc: 24685256 #ContigLocusId: 13129868 #MapInfo: 24132236 #SNP: 11833664 # these counts (except for MapInfo which has ~doubled) are # slightly down from 126. MapInfo has a lot of alternate assembly # mappings, esp. the celera assembly; maybe that's new? # load hg18.ctgPos into dbSnpHumanBuild128, compare contig list between # ctgPos and ContigInfo # NOTE FOR NEXT TIME: instead of going through mysql, just make a # tab-sep dump file of ctgPos. ssh hgwdev hgsql hg18 -N -B -e '"select * from ctgPos;"' \ | hgLoadSqlTab hg18snp128 ctgPos ~/kent/src/hg/lib/ctgPos.sql stdin hgsql hg18snp128 -N -B -e 'select contig from ctgPos;' | sort > /tmp/1 # Note: we used to look for group_term = "ref_assembly", but that leaves # behind some contigs that we include. So use a list of group_label: hgsql hg18snp128 -NBe 'select distinct(group_label) from ContigInfo' # --> ("reference", "c5_H2", "c6_COX", "c6_QBL", "c22_H2") hgsql hg18snp128 -N -B -e 'select contig_acc from ContigInfo \ where group_label in \ ("reference", "c5_H2", "c6_COX", "c6_QBL", "c22_H2");' | sort > /tmp/2 diff /tmp/1 /tmp/2 # No diff. #################### EXTRACT INFO FROM NCBI TABLES #################### mkdir -p /scratch/snp/128/human cd /scratch/snp/128/human # Fields of the SNP table and their NCBI source table/file: # chrom ContigLoc / contigInfo / liftUp # chromStart ContigLoc / liftUp; check vs phys_pos_from # chromEnd ContigLoc / liftUp # name rs + numeric ID that joins all the other sources # score 0 # strand ContigLoc.orientation # refNCBI ContigLoc.allele # refUCSC ContigLoc.allele if insertion, othw. from genomic # observed fasta headers # molType fasta headers # class fasta headers # valid SNP # avHet SNP # avHetSE SNP # func ContigLocusId # locType ContigLoc # weight MapInfo time hgsql hg18snp128 -e \ 'alter table ContigLoc add index (ctg_id); \ alter table ContigInfo add index (ctg_id);' #kolossus load was already 1.0. #0.001u 0.002s 4:04.73 0.0% 0+0k 0+0io 0pf+0w time hgsql hg18snp128 -e \ 'alter table ContigInfo add index (group_label(9));' #0.001u 0.001s 0:00.07 0.0% 0+0k 0+0io 0pf+0w # Make sure there are no orient != 0 contigs among those selected. hgsql hg18snp128 -NBe \ 'select count(*) from ContigInfo where orient != 0 and \ group_label in ("reference", "c5_H2", "c6_COX", "c6_QBL", "c22_H2");' #0 # For joining files by shared column, we need a unique identifier in # that shared column. snp_id is not unique -- the same rsID can appear # in both the reference assembly and on one of the others e.g. c6_COX. # So concatenate the assembly identifier and snp_id to get hopefully # unique label. 
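# The combined keys come out looking like "reference.3000001" or
# "c6_COX.10035195" (hypothetical rs numbers here, just to show the format).
# Once ucscContigLoc.txt is built below, a quick way to confirm that only the
# expected assembly labels appear in the keys (a sketch):
#   cut -d. -f1 ucscContigLoc.txt | sort -u
# should list only: c22_H2, c5_H2, c6_COX, c6_QBL, reference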
time hgsql hg18snp128 -NBe \ 'select concat(ContigInfo.group_label, ".", snp_id), \ ContigInfo.contig_acc, asn_from, asn_to, \ loc_type, orientation, allele, phys_pos_from \ from ContigLoc, ContigInfo \ where ContigLoc.ctg_id = ContigInfo.ctg_id and ContigInfo.group_label \ in ("reference", "c5_H2", "c6_COX", "c6_QBL", "c22_H2");' \ | sort \ > ucscContigLoc.txt # no time output because of the pipe... took 4 minutes (load was 3 or 4). # Make sure these IDs are unique. wc -l ucscContigLoc.txt #12275300 ucscContigLoc.txt awk '{print $1;}' ucscContigLoc.txt | uniq | wc -l #11863799 # Doh! Find non-unique IDs: awk 'prev == $1 {print;} {prev = $1;}' ucscContigLoc.txt | head grep ^c5_H2.10035195 ucscContigLoc.txt #c5_H2.10035195 NT_113801 639954 639954 2 0 G 69605321 #c5_H2.10035195 NT_113801 660407 660407 2 0 G 69625774 #c5_H2.10035195 NT_113801 911780 911780 2 1 C 69877147 # OK, they can be duplicated within the same contig. See if we can # get by with anchoring everything to ucscContigLoc.txt. But everybody # else better have unique IDs! # SNP -> valid, avHet, avHetSE # SNP has only snp_id as identifier, nothing relating to assembly. hgsql hg18snp128 -NBe \ 'select snp_id, validation_status, avg_heterozygosity, het_se \ from SNP;' \ | sort \ > ucscSNP.txt # Check ID uniqueness: wc -l ucscSNP.txt #11833664 ucscSNP.txt awk '{print $1;}' ucscSNP.txt | uniq | wc -l #11833664 # ContigLocusId -> func # ContigLocusId has only snp_id as an identifier (it gives one # example contig if the SNP is on multiple contigs). # The sort options and awk are to convert multiple entries with different # function classes for the same SNP into one entry per SNP with a list # of function classes. hgsql hg18snp128 -NBe \ 'select snp_id, fxn_class from ContigLocusId;' \ | sort -u -k1,1 -k2,2n \ | awk '{if (prevId == $1) { prevFunc = prevFunc $2 ","; } \ else { if (prevId) {print prevId "\t" prevFunc;} \ prevFunc = $2 ","; }} \ {prevId = $1;} \ END {print prevId "\t" prevFunc;}' \ > ucscFunc.txt # Check ID uniqueness: wc -l ucscFunc.txt #4676589 ucscFunc.txt awk '{print $1;}' ucscFunc.txt | sort -u | wc -l #4676589 # MapInfo -> weight # MapInfo needs assembly+snp_ids in order to have unique IDs. time hgsql hg18snp128 -e \ 'alter table MapInfo add index (assembly(9));' #0.000u 0.004s 2:22.64 0.0% 0+0k 0+0io 0pf+0w hgsql hg18snp128 -NBe \ 'select concat(assembly, ".", snp_id), weight \ from MapInfo where assembly \ in ("reference", "c5_H2", "c6_COX", "c6_QBL", "c22_H2");' \ | sort \ > weight.txt # ~1 minute # Check ID uniqueness: wc -l weight.txt #11863799 weight.txt awk '{print $1;}' weight.txt | uniq | wc -l #11863799 awk '{print $2;}' weight.txt | sort -n | uniq -c # 47454 0 #11621954 1 # 91766 2 # 100142 3 # 2483 10 # SNPs w/weight 0 and 10 will be discarded later. # fasta headers -> observed, molType, class zcat /cluster/data/dbSNP/128/human/rs_fasta/rs_ch*.fas.gz \ | grep '^>gnl' \ | perl -wpe 's/^\S+rs(\d+) .*mol="(\w+)"\|class=(\d+)\|alleles="([^"]+)"\|build.*/$1\t$4\t$2\t$3/ || die "Parse error line $.:\n$_\n\t";' \ | sort \ > ucscGnl.txt # ~4 minutes wc -l ucscGnl.txt #11833664 ucscGnl.txt awk '{print $1;}' ucscGnl.txt | uniq | wc -l #11833664 ############### JOIN NCBI COLUMNS TO GET UCSC SNP COLUMNS ################ # Join files by ID. Start with ContigLoc and MapInfo because they # share the concatenated assembly+snp_id IDs. 
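# Before joining, a quick look at how the two key sets line up (a sketch;
# both files are already sorted on the assembly.snp_id key):
awk '{print $1;}' ucscContigLoc.txt | uniq > /tmp/contigLoc.keys
awk '{print $1;}' weight.txt | uniq > /tmp/weight.keys
comm -3 /tmp/contigLoc.keys /tmp/weight.keys | wc -l
# 0 here means ContigLoc and MapInfo cover exactly the same keys; the join
# below double-checks this anyway via the MISSING sentinel.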
time join -a 1 -e MISSING -t ' ' ucscContigLoc.txt weight.txt \ > ucscCL+w.txt #25.408u 3.551s 0:29.26 98.9% 0+0k 0+0io 0pf+0w wc -l ucscCL+w.txt #12275300 ucscCL+w.txt # Same as ucscContigLoc.txt above, good. # Any missing weights? grep MISSING ucscCL+w.txt | head # No output, good. # Join the files with SNP-only IDs. time join -e MISSING -t ' ' ucscGnl.txt ucscSNP.txt \ > ucscG+S.txt #16.805u 1.996s 0:19.04 98.6% 0+0k 0+0io 0pf+0w wc -l ucscG+S.txt #11833664 ucscG+S.txt # Same as ucscSNP.txt and ucscGnl.txt above. grep MISSING ucscG+S.txt | wc -l #0 time join -a 1 -e MISSING -o 1.1,1.2,1.3,1.4,1.5,1.6,1.7,2.2 \ -t ' ' ucscG+S.txt ucscFunc.txt \ > ucscG+S+F.txt #17.656u 2.318s 0:20.10 99.3% 0+0k 0+0io 0pf+0w wc -l ucscG+S+F.txt #11833664 ucscG+S+F.txt grep MISSING ucscG+S+F.txt | wc -l #7157075 # Not surprising -- ucscFunc.txt has only 4676589 lines. expr 11833664 - 4676589 #7157075 # Convert assembly+snp_id's to just snp_id (sorted) for final join. perl -wpe 's/^\S+\.(\d+)/$1/;' ucscCL+w.txt \ | sort > ucscCL+w.snp_id.txt awk '{print $1;}' ucscCL+w.snp_id.txt | uniq | wc -l #11727742 # Interesting... which snp_ids are missing from ContigLoc? awk '{print $1;}' ucscCL+w.snp_id.txt | uniq > /tmp/1 awk '{print $1;}' ucscGnl.txt | uniq > /tmp/2 comm -13 /tmp/1 /tmp/2 > notInContigLoc.txt comm -23 /tmp/1 /tmp/2 > notInSNP.txt wc -l notIn*.txt #105994 notInContigLoc.txt # 72 notInSNP.txt expr 11833664 + 72 - 105994 #11727742 # Final join -- treat ContigLoc as authoritative (since it has coords). # Arrange columns in same order as in the SNP table, with extras for # checking at the end (phys_pos_from). # chr chrS chrE name strand refN obs molT cls val aH aHSE fxn locT wt ... time join -a 1 -e MISSING -t ' ' \ -o '1.2 1.3 1.4 1.1 1.6 1.7 2.2 2.3 2.4 2.5 2.6 2.7 2.8 1.5 1.9 1.8' \ ucscCL+w.snp_id.txt ucscG+S+F.txt \ > ucscNcbiSnp.ctg.txt #38.497u 5.536s 2:08.18 34.3% 0+0k 0+0io 0pf+0w wc -l ucscNcbiSnp.ctg.txt #12275300 ucscNcbiSnp.ctg.txt grep MISSING ucscNcbiSnp.ctg.txt | awk '{print $4;}' | uniq | wc -l #7058898 # a bit less than the 7157075 missing FUNC's above -- some overlap with # notInContigLoc would explain. # Lift the map contig coordinates to chrom coordinates (~2m); time liftUp ucscNcbiSnp.bed \ /cluster/data/hg18/jkStuff/liftContigs.lft warn \ ucscNcbiSnp.ctg.txt #98.038u 5.974s 1:45.65 98.4% 0+0k 0+0io 5pf+0w wc -l ucscNcbiSnp.bed #12275300 ucscNcbiSnp.bed # At this point, move back from /scratch to /cluster/data. nice gzip ucscNcbiSnp.bed cp -p ucscNcbiSnp.bed.gz /cluster/data/dbSNP/128/human/ # Drum roll please... translate NCBI's encoding into UCSC's, and # perform a bunch of checks. This is where developer involvement # is most likely as NCBI extends the encodings used in dbSNP. cd /cluster/data/dbSNP/128/human/ gunzip ucscNcbiSnp.bed.gz # Re-ran this command 8/7/08 to get new snp128.sql that includes # only those enum/set values that are actually used. No other output # files changed. time snpNcbiToUcsc ucscNcbiSnp.bed /cluster/data/hg18/hg18.2bit \ snp128 #spaces stripped from observed: #chr12 5963395 5963395 rs41402545 #count of snps with weight 0 = 59123 #count of snps with weight 1 = 11654498 #count of snps with weight 2 = 191647 #count of snps with weight 3 = 335214 #count of snps with weight 10 = 34818 #Skipped 167 snp mappings due to errors -- see snp128Errors.bed #176.712u 17.466s 3:34.82 90.3% 0+0k 0+0io 0pf+0w # The 167 errors are all for SNPs for which we don't have fasta, # so we also don't have observed, class, or molType. 
I spot-checked # a few, and they have been deleted from dbSNP. Nothing to show, # so we skip those 167 -- nothing catastrophic. Watch out for new # types of errors reported, though: awk -F"\t" '{print $5;}' snp128Errors.bed | sort -u | wc -l #1 wc -l snp* # 12181192 snp128.bed # 22 snp128.sql # 167 snp128Errors.bed # 18 snp128ExceptionDesc.tab # 1013020 snp128Exceptions.bed # Make one big fasta file. (note: snp126 skipped chrUn... but it's small # compared to chr1, chr2 etc.) # It's a monster: 14G! Can we split by hashing rsId? zcat rs_fasta/rs_ch*.fas.gz \ | perl -wpe 's/^>gnl\|dbSNP\|(rs\d+) .*/>$1/ || ! /^>/ || die;' \ > snp128.fa # Check for duplicates. grep ^\>rs snp128.fa | sort > /scratch/tmp/seqHeaders wc -l /scratch/tmp/seqHeaders #11833664 /scratch/tmp/seqHeaders uniq /scratch/tmp/seqHeaders | wc -l #11833664 # Use hgLoadSeq to generate .tab output for sequence file offsets, # and keep only the columns that we need: acc and file_offset. # Index it and translate to snpSeq table format. time hgLoadSeq -test placeholder snp128.fa #107.137u 37.140s 2:39.16 90.6% 0+0k 0+0io 0pf+0w cut -f 2,6 seq.tab > snp128Seq.tab rm seq.tab ssh hgwdev # Load up main track tables. cd /cluster/data/dbSNP/128/human # Re-ran this command 8/7/08 to get new snp128.sql that includes # only those enum/set values that are actually used. No data values # changed. Removed -noSort because Brooke had spotted some entries # sorted by chromEnd instead of chromStart. time nice hgLoadBed -tab -onServer -tmpDir=/data/tmp \ hg18 snp128 -sqlTable=snp128.sql snp128.bed #78.060u 13.298s 7:32.71 20.1% 0+0k 0+0io 0pf+0w sed -e 's/snp125/snp128/' ~/kent/src/hg/lib/snp125Exceptions.sql \ > snp128Exceptions.sql time nice hgLoadBed -tab -onServer -tmpDir=/data/tmp \ hg18 snp128Exceptions -sqlTable=snp128Exceptions.sql \ snp128Exceptions.bed #5.915u 0.492s 0:28.69 22.3% 0+0k 0+0io 0pf+0w sed -e 's/snp125/snp128/' ~/kent/src/hg/lib/snp125ExceptionDesc.sql \ > snp128ExceptionDesc.sql # 3/7/08: reloaded snp128ExceptionDesc (tweaked wording) hgLoadSqlTab hg18 snp128ExceptionDesc snp128ExceptionDesc.sql \ snp128ExceptionDesc.tab # Load up sequences. sed -e 's/snpSeq/snp128Seq/' ~/kent/src/hg/lib/snpSeq.sql \ > snp128Seq.sql mkdir -p /gbdb/hg18/snp ln -s /cluster/data/dbSNP/128/human/snp128.fa /gbdb/hg18/snp/snp128.fa time nice hgLoadSqlTab hg18 snp128Seq snp128Seq.sql snp128Seq.tab #0.001u 0.000s 2:31.19 0.0% 0+0k 0+0io 0pf+0w # Put in a link where one would expect to find the track build dir... ln -s /cluster/data/dbSNP/128/human /cluster/data/hg18/bed/snp128 ####################################################################### # SNPMASKED SEQUENCE FOR SNP128 (DONE 2/1/08 angie) ssh kolossus mkdir /cluster/data/hg18/snp128Mask cd /cluster/data/hg18/snp128Mask # Identify rsIds with various problems -- we will exclude those. # MultipleAlignments is kinda broad because anything that maps on # both chrN and chrN_foo_hap1 will be excluded... similarly, extra # matches on chrN_random might disqualify good matches on chrN. # Well, erring on the side of caution is good. 
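# For the record, a quick tally of how many of each exception class there are
# before deciding what to exclude (a sketch; column 5 of the exceptions bed
# is the exception name, as used in the awk below):
awk -F"\t" '{print $5;}' /cluster/data/dbSNP/128/human/snp128Exceptions.bed \
    | sort | uniq -c | sort -rn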
awk '$5 ~ /^MultipleAlignments|ObservedTooLong|ObservedWrongFormat|ObservedMismatch|MixedObserved$/ {print $4;}' \ /cluster/data/dbSNP/128/human/snp128Exceptions.bed \ | sort -u \ > snp128ExcludeRsIds.txt time grep -vFwf snp128ExcludeRsIds.txt \ /cluster/data/dbSNP/128/human/snp128.bed \ > snp128Cleaned.bed #100.027u 11.779s 2:09.61 86.2% 0+0k 0+0io 0pf+0w # Substitutions: mkdir substitutions snpMaskSingle snp128Cleaned.bed /cluster/data/hg18/hg18.2bit stdout \ | faSplit byname stdin substitutions/ #-- 79 warnings about differing observed at same base positions #-- (66 distinct positions) -- send to NCBI. snp-admin@ncbi.nlm.nih.gov # Also this warning about total size -- just means that some chroms # didn't have any SNPS that survived the stringent filtering. #Masked 9146694 snps in 9146642 out of 3091528550 genomic bases #/cluster/data/hg18/hg18.2bit has 3107677273 total bases, but the total number of bases in sequences for which we masked snps is 3091528550 (difference is 16148723) # Make sure that sizes are identical, first diffs are normal -> IUPAC, # and first diffs' case is preserved: foreach f (substitutions/chr*.fa) faCmp -softMask $f ../[1-9MXY]*/$f:t |& grep -v "that differ" end #(output OK) foreach f (substitutions/chr*.fa) echo $f:t:r mv $f $f:r.subst.fa gzip $f:r.subst.fa end # Insertions: mkdir insertions snpMaskAddInsertions snp128Cleaned.bed /cluster/data/hg18/hg18.2bit stdout \ | faSplit byname stdin insertions/ #Added 1332737 snps totaling 2372942 bases to 3085151178 genomic bases #/cluster/data/hg18/hg18.2bit has 3107677273 total bases, but the total number of bases in sequences for which we masked snps is 3085151178 (difference is 22526095) # Again, that just means that some chroms didn't have filtered SNPs. # Make sure that all sizes have increased relative to original: foreach f (insertions/chr*.fa) faCmp -softMask $f ../[1-9MXY]*/$f:t |& grep -v "that differ" \ |& perl -we '$_=<>; \ if (/^\w+ in \S+ has (\d+) bases. \w+ in \S+ has (\d+) bases/) { \ if ($1 > $2) {print "OK: ins size $1 > $2\n";} \ else {die "ERROR: ins size $1 <= $2\n";} \ } else {die $_;}' end #(output OK) foreach f (insertions/chr*.fa) mv $f $f:r.ins.fa gzip $f:r.ins.fa end # Deletions: mkdir deletions snpMaskCutDeletions snp128Cleaned.bed /cluster/data/hg18/hg18.2bit stdout \ | faSplit byname stdin deletions/ #Cut 661637 snps totaling 1248873 bases from 3085167749 genomic bases #/cluster/data/hg18/hg18.2bit has 3107677273 total bases, but the total number of bases in sequences for which we masked snps is 3085167749 (difference is 22509524) # Again, that just means that some chroms didn't have filtered SNPs. # Make sure that all sizes have decreased relative to original: foreach f (deletions/chr*.fa) faCmp -softMask $f ../[1-9MXY]*/$f:t |& grep -v "that differ" \ |& perl -we '$_=<>; \ if (/^\w+ in \S+ has (\d+) bases. \w+ in \S+ has (\d+) bases/) { \ if ($1 < $2) {print "OK: del size $1 < $2\n";} \ else {die "ERROR: del size $1 >= $2\n";} \ } else {die $_;}' end #(output OK) foreach f (deletions/chr*.fa) mv $f $f:r.del.fa gzip $f:r.del.fa end # Clean up and prepare for download: gzip snp128Cleaned.bed foreach d (substitutions insertions deletions) pushd $d md5sum *.gz > md5sum.txt popd end # Make a README.txt in each subdir. # Create download links on hgwdev. # NOTE: I am going to start by offering only the substitutions. # If we get any user requests, then maybe we can put the insertions # and deletions out there. 
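# Before linking anything under goldenPath, one last integrity check of the
# gzipped downloads doesn't hurt (a sketch; md5sum -c prints a line per file,
# so only failures show up here):
foreach d (substitutions insertions deletions)
  pushd $d
  md5sum -c md5sum.txt | grep -v ": OK"
  popd
end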
ssh hgwdev mkdir /usr/local/apache/htdocs/goldenPath/hg18/snp128Mask ln -s /cluster/data/hg18/snp128Mask/substitutions/* \ /usr/local/apache/htdocs/goldenPath/hg18/snp128Mask/ ## If there is user demand for ins & del, then start over with an empty ## goldenPath/snp128Mask and do this: ## foreach type (substitutions insertions deletions) ## mkdir /usr/local/apache/htdocs/goldenPath/hg18/snp128Mask/$type ## ln -s /cluster/data/hg18/snp128Mask/$type/* \ ## /usr/local/apache/htdocs/goldenPath/hg18/snp128Mask/$type/ ## end ####################################################################### # ORTHOLOGOUS ALLELES IN CHIMP AND MACAQUE FOR SNP128 (DONE 2/8/08 angie) # REDONE 2/29/08 (upcase ortho alleles) ssh kolossus mkdir /cluster/data/hg18/bed/snp128Ortho cd /cluster/data/hg18/bed/snp128Ortho # Following Heather's lead in snp126orthos, filter SNPs to to keep # only those with class=single, length=1, chrom!~random; # Exclude those with exceptions MultipleAlignments, # SingleClassTriAllelic or SingleClassQuadAllelic. # Unlike snp masking, we do not filter for weight -- don't know why. awk '$5 ~ /^MultipleAlignments|SingleClassTriAllelic|SingleClassQuadAllelic/ {print $4;}' \ /cluster/data/dbSNP/128/human/snp128Exceptions.bed \ | sort -u \ > snp128ExcludeIds.txt awk '$3-$2 == 1 && $1 !~ /_random/ && $11 == "single" {print;}' \ /cluster/data/dbSNP/128/human/snp128.bed \ | grep -vFwf snp128ExcludeIds.txt \ > snp128Simple.bed # took ~3 minutes wc -l snp128Simple.bed #9133704 snp128Simple.bed # This is the analog of db table snp126simple. # Glom all human info that we need for the final table onto the # name, to sneak it through liftOver: rsId|chr|start|end|obs|ref|strand awk 'BEGIN{OFS="\t";} \ {print $1, $2, $3, \ $4 "|" $1 "|" $2 "|" $3 "|" $9 "|" $8 "|" $6, \ 0, $6;}' \ snp128Simple.bed > snp128ForLiftOver.bed # 2/29/08 -- re-ran from this point on to regenerate cleaned up # cluster run results (oops) and then force ortho alleles to upper # case, for consistency with dbSNP formatting. # Map coords to chimp using liftOver. # I don't know why chimp took so much longer than macaque... the # chimp .over has fewer chains and fewer bytes than the macaque .over. mkdir run.liftOChimp cd run.liftOChimp mkdir split out splitFile ../snp128ForLiftOver.bed 25000 split/chunk cp /dev/null jobList foreach f (split/chunk*) echo liftOver $f \ /cluster/data/hg18/bed/liftOver/hg18ToPanTro2.over.chain.gz \ \{check out exists out/panTro2.$f:t.bed\} out/hg18.$f:t.unmapped \ >> jobList end ssh pk cd /cluster/data/hg18/bed/snp128Ortho/run.liftOChimp para make jobList #Completed: 366 of 366 jobs #CPU time in finished jobs: 71660s 1194.33m 19.91h 0.83d 0.002 y #IO & Wait Time: 5377s 89.62m 1.49h 0.06d 0.000 y #Average job time: 210s 3.51m 0.06h 0.00d #Longest finished job: 518s 8.63m 0.14h 0.01d #Submission to last job: 518s 8.63m 0.14h 0.01d # Map coords to macaque using liftOver. mkdir ../run.liftOMac cd ../run.liftOMac mkdir out ln -s ../run.liftOChimp/split . 
cp /dev/null jobList foreach f (split/chunk*) echo liftOver $f \ /cluster/data/hg18/bed/liftOver/hg18ToRheMac2.over.chain.gz \ \{check out exists out/rheMac2.$f:t.bed\} out/hg18.$f:t.unmapped \ >> jobList end para make jobList #Completed: 366 of 366 jobs #CPU time in finished jobs: 5663s 94.38m 1.57h 0.07d 0.000 y #IO & Wait Time: 12066s 201.10m 3.35h 0.14d 0.000 y #Average job time: 48s 0.81m 0.01h 0.00d #Longest finished job: 102s 1.70m 0.03h 0.00d #Submission to last job: 102s 1.70m 0.03h 0.00d # Average job time was 54s with 50000 chunks, but those made chimp # jobs run too long. ssh kolossus cd /cluster/data/hg18/bed/snp128Ortho # Here is a script that looks up the base value in the ortho species # and swizzles columns to prepare for the joining and re-swizzling # of both ortho species' columns into the final product. If it is # used more than once, should be checked in, perhaps in hg/snp/snpLoad. cat > getOrthoSeq.pl <<'_EOF_' #!/usr/bin/env perl # Dig up orthologous alleles and swizzle columns so the glommed name that # includes human position info etc. is first. It will be used as a key for # joining up multiple other-species' ortho data. Also swizzle columns so # that the remaining columns are in order of appearance in the final result, # snp128OrthoPanTro2RheMac2. Upcase ortho alleles for consistency w/dbSNP. use warnings; use strict; my $twoBitFName = shift @ARGV || die "usage: getOrthoSeq.pl orthoDb.2bit [file(s)]\n"; sub getOChrSeq($$) { # Slurp in fasta sequence using twoBitToFa. my ($twoBitFName, $oChr) = @_; open(P, "twoBitToFa -noMask $twoBitFName -seq=$oChr stdout |") || die "Can't open pipe from twoBitToFa $twoBitFName -seq=$oChr: $!\n";

<P> =~ /^>\w+/ || die "Doesn't look like we got fasta -- first line is this:\n$_"; # From man perlfaq5: trick to slurp entire contents: my $c = 0; my $seq = do { local $/; my $data = <P>

; $c = ($data =~ s/\n//g); $data; }; close(P); return $seq; } my %rc = ( "a" => "t", "c" => "g", "g" => "c", "t" => "a", "A" => "T", "C" => "G", "G" => "C", "T" => "A", ); sub revComp($) { # Reverse-complement fasta input. (Pass through non-agtc chars.) my ($seq) = @_; my $rcSeq = reverse $seq; for (my $i = 0; $i < length($rcSeq); $i++) { my $base = substr($rcSeq, $i, 1); my $cBase = $rc{$base} || $base; substr($rcSeq, $i, 1, $cBase); } return $rcSeq; } my $prevOChr; my ($oChrSeq, $oChrSize); while (<>) { chomp; my ($oChr, $oStart, $oEnd, $nameGlom, undef, $oStrand) = split; if (! defined $prevOChr || $oChr ne $prevOChr) { $oChrSeq = &getOChrSeq($twoBitFName, $oChr); $oChrSize = length($oChrSeq); } die "Coords out of range, input line $.: $oEnd > $oChr size $oChrSize\n\t" if ($oEnd > $oChrSize); my $oAllele = substr($oChrSeq, $oStart, $oEnd - $oStart); $oAllele = &revComp($oAllele) if ($oStrand eq "-"); print join("\t", $nameGlom, $oChr, $oStart, $oEnd, $oAllele, $oStrand) . "\n"; $prevOChr = $oChr; } '_EOF_' # << emacs chmod a+x getOrthoSeq.pl # Concatenate the chimp results, sorting by chimp pos in order to # efficiently access 2bit sequence in ./getOrthoSeq. The output of # that is then sorted by the glommed human info field, so that we # can use join to combine chimp and macaque results in the next step. sort -k1,1 -k2n,2n run.liftOChimp/out/panTro2.chunk*.bed \ | ./getOrthoSeq.pl /cluster/data/panTro2/panTro2.2bit \ | sort > panTro2.orthoGlom.txt # ditto for macaque: sort -k1,1 -k2n,2n run.liftOMac/out/rheMac2.chunk*.bed \ | ./getOrthoSeq.pl /cluster/data/rheMac2/rheMac2.2bit \ | sort > rheMac2.orthoGlom.txt # The whole pipeline takes ~4-6 minutes each. wc -l panTro2.orthoGlom.txt rheMac2.orthoGlom.txt # 8549323 panTro2.orthoGlom.txt # 7324851 rheMac2.orthoGlom.txt # Use the glommed name field as a key to join up chimp and macaque # allele data. Include glommed name from both files because if only # file 2 has a line for the key in 2.1, then 1.1 is empty. Then plop # in the orthoGlom files from each file, which are in the same order # as the chimp and macaque columns of snp128OrthoPanTro2RheMac2. join -o '1.1 2.1 1.2 1.3 1.4 1.5 1.6 2.2 2.3 2.4 2.5 2.6' \ -a 1 -a 2 -e 0 \ panTro2.orthoGlom.txt rheMac2.orthoGlom.txt \ | perl -wpe 'chomp; \ ($glom1, $glom2, $o1Chr, $o1Start, $o1End, $o1Al, $o1Strand, \ $o2Chr, $o2Start, $o2End, $o2Al, $o2Strand) = split; \ $glomKey = ($glom1 ne "0") ? $glom1 : $glom2; \ ($rsId, $hChr, $hStart, $hEnd, $hObs, $hAl, $hStrand) = \ split(/\|/, $glomKey); \ $o1Chr =~ s/^0$/?/; $o2Chr =~ s/^0$/?/; \ $o1Al =~ s/^0$/?/; $o2Al =~ s/^0$/?/; \ $o1Strand =~ s/^0$/?/; $o2Strand =~ s/^0$/?/; \ print join("\t", $hChr, $hStart, $hEnd, $rsId, $hObs, $hAl, $hStrand, \ $o1Chr, $o1Start, $o1End, $o1Al, $o1Strand, \ $o2Chr, $o2Start, $o2End, $o2Al, $o2Strand) . "\n"; \ s/^.*$//;' \ | sort -k1,1 -k2n,2n > snp128OrthoPanTro2RheMac2.bed # took ~5 minutes. 
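# A couple of quick spot-checks on the joined file before loading (a sketch):
# every row should have the full 17 columns, and no row should have "?" for
# both chimp and macaque (a row only exists when at least one species lifted).
awk -F"\t" 'NF != 17' snp128OrthoPanTro2RheMac2.bed | head
awk -F"\t" '$8 == "?" && $13 == "?"' snp128OrthoPanTro2RheMac2.bed | wc -l
# expect no output from the first check and 0 from the second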
wc -l snp128OrthoPanTro2RheMac2.bed #8770301 snp128OrthoPanTro2RheMac2.bed ssh hgwdev cd /cluster/data/hg18/bed/snp128Ortho sed -e 's/snpOrthoPanTroRheMac/snp128OrthoPanTro2RheMac2/' \ ~/kent/src/hg/lib/snpOrthoPanTroRheMac.sql \ > snp128OrthoPanTro2RheMac2.sql time nice hgLoadBed -tab -onServer -tmpDir=/data/tmp \ hg18 snp128OrthoPanTro2RheMac2 -sqlTable=snp128OrthoPanTro2RheMac2.sql \ snp128OrthoPanTro2RheMac2.bed #Loaded 8770301 elements of size 17 #52.659u 8.528s 5:18.68 19.1% 0+0k 0+0io 0pf+0w # Cleanup on fileserver: cd /cluster/data/hg18/bed/snp128Ortho nice gzip snp128Simple.bed snp128ExcludeIds.txt snp128ForLiftOver.bed rm -r run*/split *.orthoGlom.txt ####################################################################### # COMPARE SNP128 TO SNP126 (DONE 2/7/08 angie) # First, do a featureBits venn, on some machine other than hgwdev. # I can't find the file from which snp126 was loaded... but kkr5u00 # has an hg18snp126 database with a snp126 that is a few hours newer, # but apparently the same as, hgwdev's hg18.snp126... so use that # (had to add gap tables too): ssh kkr5u00 time featureBits hg18snp126 snp126 #12451939 bases of 2881515245 (0.432%) in intersection #57.274u 15.283s 1:20.56 90.0% 0+0k 0+0io 0pf+0w # Now make sure we have a file copy of snp126 in case we need it in # the future: hgsql hg18snp126 -NBe 'select * from snp126' \ | cut -f 2-18 \ > /cluster/data/dbSNP/126/human/snp126.bed rsync /cluster/data/dbSNP/128/human/snp128.bed /scratch/tmp/ time featureBits hg18 /scratch/tmp/snp128.bed #12387071 bases of 2881515245 (0.430%) in intersection #636.834u 47.039s 11:24.02 99.9% 0+0k 0+0io 0pf+0w # OK, db is a lot faster! # I am not worried about the drop -- spot-checking, I have seen some # dropped rsIds and some that used to have multiple mappings but now # have only one mapping -- an improvement. pushd /cluster/data/dbSNP/128/human hgLoadBed -tab -noSort -onServer -tmpDir=/scratch/tmp \ hg18snp126 snp128 -sqlTable=snp128.sql snp128.bed popd # How many covered bases in common? time featureBits hg18snp126 snp126 snp128 #11576806 bases of 2881515245 (0.402%) in intersection #114.365u 26.671s 3:15.55 72.1% 0+0k 0+0io 0pf+0w # Base coverage Venn counts: # snp126 snp128 !snp126 !snp128 # snp126 12451939 11576806 0 875133 # snp128 11576806 12387071 810265 0 # Do the same for SNPs (rs* records as opposed to bases): hgsql hg18snp126 -NBe 'select name from snp126' \ | sort -u > /scratch/tmp/1 hgsql hg18snp126 -NBe 'select name from snp128' \ | sort -u > /scratch/tmp/2 wc -l /scratch/tmp/[12] # 11647909 /scratch/tmp/1 # 11677826 /scratch/tmp/2 comm -12 /scratch/tmp/[12] | wc -l #11531282 cd /cluster/data/dbSNP/128/human comm -23 /scratch/tmp/[12] \ > /cluster/data/dbSNP/128/human/ids.inSnp126Not128.txt comm -13 /scratch/tmp/[12] \ > /cluster/data/dbSNP/128/human/ids.inSnp128Not126.txt # rsId Venn counts: # snp126 snp128 !snp126 !snp128 # snp126 11647909 11531282 0 116627 # snp128 11531282 11677826 146544 0 # Interesting that snp128 has more new rsIds but fewer new bases. # It has been 2 versions since 126... also, when spot-checking # exceptions I noticed that a lot of deletion SNPs used to be # mapped to the appropriate span in 126, but in 128 were mapped to # a single base and had some kind of range*tion locType... not an # improvement. But that kind of observation best falls out of an # examination of exception cases... and that is what will be # useful for us to report to NCBI. 
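# Sanity arithmetic on the Venn cells above (a sketch) -- each "only in one
# version" cell is just that version's total minus the intersection:
expr 12451939 - 11576806
#875133   bases only in snp126
expr 12387071 - 11576806
#810265   bases only in snp128
expr 11647909 - 11531282
#116627   rsIds only in snp126
expr 11677826 - 11531282
#146544   rsIds only in snp128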
############################################################################ # BLASTZ SELF chain minScore=2000 (DONE - 2007-12-19 - Hiram) ssh kkstore02 screen # use screen to manage this job mkdir /cluster/data/hg18/bed/blastzSelf.2007-12-17 cd /cluster/data/hg18/bed/blastzSelf.2007-12-17 cat << '_EOF_' > DEF # human vs human BLASTZ_M=400 # TARGET: Human Hg18 SEQ1_DIR=/san/sanvol1/scratch/hg18/selfNib SEQ1_LEN=/san/sanvol1/scratch/hg18/self.sizes SEQ1_CHUNK=10000000 SEQ1_LAP=10000 SEQ1_IN_CONTIGS=0 # QUERY: Human Hg18 SEQ2_DIR=/san/sanvol1/scratch/hg18/selfNib SEQ2_LEN=/san/sanvol1/scratch/hg18/self.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 SEQ2_IN_CONTIGS=0 BASE=/cluster/data/hg18/bed/blastzSelf.2006-01-17 TMPDIR=/scratch/tmp '_EOF_' # happy emacs cd /cluster/data/hg18/bed/blastzSelf.2007-12-17 time nice -n +19 $HOME/kent/src/hg/utils/automation/doBlastzChainNet.pl \ `pwd`/DEF -verbose=2 -chainMinScore=2000 -chainLinearGap=medium \ -stop=net -smallClusterHub=memk -bigClusterHub=pk > do.log 2>&1 & # real 640m37.637s ## crafted a special loadUp.csh to avoid haplotypes and randoms, # and load with normScore ssh hgwdev cd /cluster/data/hg18/bed/blastzSelf.2007-12-17/axtChain time nice -n +19 ./loadUp.csh >loadUp.out 2>&1 # real 24m51.669s cd /cluster/data/hg18/bed/blastzSelf.2007-12-17 time nice -n +19 featureBits hg18 chainSelf2KLink \ -noRandom -noHap > fb.hg18.chainSelf2KLink.txt 2>&1 & # real 11m30.010s cat fb.hg18.chainSelf2KLink.txt # 346885376 bases of 2858034764 (12.137%) in intersection time nice -n +19 $HOME/kent/src/hg/utils/automation/doBlastzChainNet.pl \ `pwd`/DEF -verbose=2 -chainMinScore=2000 -chainLinearGap=medium \ -continue=download \ -stop=download -smallClusterHub=memk -bigClusterHub=pk \ > download.log 2>&1 & ############################################################################ # RE-BUILD GAD TRACK (Done, 1/16/08, Fan) # During previous build, all.txt was corrupted during receiving file from # email. mkdir /cluster/store12/gad080116 rm /cluster/data/gad ln -s /cluster/store12/gad080116 /cluster/data/gad cd /cluster/data/gad # Receive "all.txt" from GAD # contact person: Garner, John (NIH/NIA/IRP) [F] [garnerjr@mail.nih.gov] hgsql hg18 -e 'drop table gadAll' hgsql hg18 <~/src/hg/lib/gadAll.sql hgsql hg18 -e 'load data local infile "all.txt" into table gadAll ignore 3 lines' # create gad table gadPos hg18 j18.tmp cat j18.tmp |sort -u >hg18.gad.tab # removed 1 record from hg18.gad.tab that has multiple words in geneSymbol # field. # use -nobin option to ensure display order is according to genomic position hgLoadBed -nobin hg18 gad hg18.gad.tab rm j18.tmp ####################################################################### # BLASTZ/CHAIN/NET Lamprey petMar1 (DONE - 2008-01-29 - Hiram) # with contigs for Lamprey ssh kkstore02 screen # use screen to control this job mkdir /cluster/data/hg18/bed/blastzPetMar1.2008-01-29 cd /cluster/data/hg18/bed/blastzPetMar1.2008-01-29 cat << '_EOF_' > DEF # Human vs. 
Lamprey # using the "close" genome alignment parameters # see also: http://genomewiki.ucsc.edu/index.php/Mm9_multiple_alignment BLASTZ_Y=3400 BLASTZ_L=6000 BLASTZ_K=2200 BLASTZ_M=50 BLASTZ_Q=/cluster/data/blastz/HoxD55.q # TARGET: Human - WindowMasker sequence SEQ1_DIR=/san/sanvol1/scratch/hg18/hg18.sdTrf.2bit SEQ1_LEN=/cluster/data/hg18/chrom.sizes SEQ1_CHUNK=10000000 SEQ1_LAP=10000 SEQ1_LIMIT=1 # QUERY: Lamprey petMar1 SEQ2_DIR=/cluster/bluearc/scratch/data/petMar1/petMar1.2bit SEQ2_LEN=/cluster/data/petMar1/chrom.sizes SEQ2_CHUNK=20000000 SEQ2_LIMIT=300 SEQ2_LAP=0 BASE=/cluster/data/hg18/bed/blastzPetMar1.2008-01-29 TMPDIR=/scratch/tmp '_EOF_' # << this line keeps emacs coloring happy time doBlastzChainNet.pl -verbose=2 `pwd`/DEF \ -chainMinScore=5000 -chainLinearGap=loose \ -tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \ -bigClusterHub=pk > do.log 2>&1 & # real 414m33.533s cat fb.hg18.chainPetMar1Link.txt # 36042598 bases of 2881515245 (1.251%) in intersection # That is OK, now for the swap: mkdir /cluster/data/petMar1/bed/blastz.hg18.swap cd /cluster/data/petMar1/bed/blastz.hg18.swap time doBlastzChainNet.pl -verbose=2 -swap \ /cluster/data/hg18/bed/blastzPetMar1.2008-01-29/DEF \ -chainMinScore=5000 -chainLinearGap=loose \ -tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \ -bigClusterHub=pk > swap.log 2>&1 & # real 60m1.928s cat fb.petMar1.chainHg18Link.txt # 26751073 bases of 831696438 (3.216%) in intersection ####################################################################### ################### # Build recip-best alignments with calJac1 (DONE 2008-01-25 braney) cd /cluster/data/hg18/bed ln -s blastz.calJac1.2007-10-07 blastz.calJac1 cd blastz.calJac1 screen /cluster/bin/scripts/doRecipBest.pl hg18 calJac1 ################### # Build syntenic net for orang (DONE 2008-01-25 braney) cd /cluster/data/hg18/bed/blastz.ponAbe2 screen /cluster/bin/scripts/doBlastzChainNet.pl -syntenicNet -continue syntenicNet -stop syntenicNet `pwd`/DEF 2>&1 | tee syntenic.out ######################################################################### ## Primate Multiz (Working ## ssh hgwdev mkdir /cluster/data/hg18/bed/multizPrimate cd /cluster/data/hg18/bed/multizPrimate # take the 30-way tree from mm9 and eliminate genomes not in # this alignment # rearrange to get hg18 on the top of the graph # paste this tree into the on-line phyloGif tool: # http://genome.ucsc.edu/cgi-bin/phyloGif # to create the image for the tree diagram /cluster/bin/phast/tree_doctor --prune-all-but Human_hg18,Mouse_mm9,Chimp_panTro2,Orangutan_ponAbe2,Rhesus_rheMac2,Marmoset_calJac1,Bushbaby_otoGar1,TreeShrew_tupBel1,Rat_rn4,Dog_canFam2 /cluster/data/mm9/bed/multiz30way/mm9OnTop.fullNames.nh > primate.fullNames.nh # looks something like this: (((Mouse_mm9:0.076274,Rat_rn4:0.084383):0.249544,((((((Human_hg18:0.005873,Chimp _panTro2:0.007668):0.013037,Orangutan_ponAbe2:0.020000):0.013037,Rhesus_rheMac2: 0.031973):0.036500,Marmoset_calJac1:0.070000):0.036500,Bushbaby_otoGar1:0.151185 ):0.015682,TreeShrew_tupBel1:0.162844):0.006272):0.019763,Dog_canFam2:0.187963); # rearrange to get human at the top: # this leaves us with: cat << _EOF_ > hg18.primate.nh ((((((((Human_hg18:0.005873,Chimp_panTro2:0.007668):0.013037,Orangutan_ponAbe2:0.020000):0.013037,Rhesus_rheMac2:0.031973):0.036500,Marmoset_calJac1:0.070000):0.036500,Bushbaby_otoGar1:0.151185):0.015682,TreeShrew_tupBel1:0.162844):0.006272,(Mouse_mm9:0.076274,Rat_rn4:0.084383):0.249544):0.019763,Dog_canFam2:0.187963); _EOF_ # << happy emacs # create a species list 
from that file: sed -e 's/[()]//g; s/ /\n/g; s/,/\n/g' hg18.primate.nh \ | sed -e "s/[ \t]*//g; /^[ \t]$/d; /^$/d" | sort -u \ | sed -e "s/.*_//; s/:.*//" | sort > species.list # create a stripped down nh file for use in autoMZ run echo \ `sed 's/[a-zA-Z0-9]*_//g; s/:0.[0-9]*//g; s/[,;]/ /g' hg18.primate.nh \ | sed -e "s/ / /g"` > tree.primate.nh # that looks like, as a single line: # ((((((((hg18 panTro2) ponAbe2) rheMac2) calJac1) otoGar1) tupBel1) (mm9 rn4)) canFam2) # verify all blastz's exists cat << '_EOF_' > listMafs.csh #!/bin/csh -fe cd /cluster/data/hg18/bed/multizPrimate foreach db (`cat species.list`) set bdir = /cluster/data/hg18/bed/blastz.$db if (-e $bdir/mafRBestNet/chr1.maf.gz) then echo "$db mafRBestNet" else if (-e $bdir/mafSynNet/chr1.maf.gz) then echo "$db mafSynNet" else if (-e $bdir/mafNet/chr1.maf.gz) then echo "$db mafNet" else echo "$db mafs not found" endif end '_EOF_' # << happy emacs chmod +x ./listMafs.csh # see what it says, the "mafs not found" should only show up on hg18 ./listMafs.csh # calJac1 mafRBestNet # canFam2 mafSynNet # hg18 mafNet # mm9 mafSynNet # otoGar1 mafRBestNet # panTro2 mafSynNet # ponAbe2 mafSynNet # rheMac2 mafSynNet # rn4 mafSynNet # tupBel1 mafRBestNet /cluster/bin/phast/all_dists hg18.primate.nh > Primate.distances.txt grep -i hg18 Primate.distances.txt | sort -k3,3n # Human_hg18 Chimp_panTro2 0.013541 # Human_hg18 Orangutan_ponAbe2 0.038910 # Human_hg18 Rhesus_rheMac2 0.063920 # Human_hg18 Marmoset_calJac1 0.138447 # Human_hg18 Bushbaby_otoGar1 0.256132 # Human_hg18 TreeShrew_tupBel1 0.283473 # Human_hg18 Dog_canFam2 0.334627 # Human_hg18 Mouse_mm9 0.452719 # Human_hg18 Rat_rn4 0.460828 # copy net mafs to cluster-friendly storage, splitting chroms # into 50MB chunks to improve run-time # NOTE: splitting will be different for scaffold-based reference asemblies ssh hgwdev mkdir /cluster/data/hg18/bed/multizPrimate/run.split cd /cluster/data/hg18/bed/multizPrimate/run.split # this works by examining the rmsk table for likely repeat areas # that won't be used in blastz mafSplitPos hg18 50 mafSplit.bed ssh kki cd /cluster/data/hg18/bed/multizPrimate/run.split cat << '_EOF_' > doSplit.csh #!/bin/csh -ef set targDb = "hg18" set db = $1 set sdir = /san/sanvol1/scratch/$targDb/BRsplitStrictMafNet mkdir -p $sdir if (-e $sdir/$db) then echo "directory $sdir/$db already exists -- remove and retry" exit 1 endif set bdir = /cluster/data/$targDb/bed/blastz.$db if (! -e $bdir) then echo "directory $bdir not found" exit 1 endif mkdir -p $sdir/$db if (-e $bdir/mafRBestNet) then set mdir = $bdir/mafRBestNet else if (-e $bdir/mafSynNet) then set mdir = $bdir/mafSynNet else if (-e $bdir/mafNet) then set mdir = $bdir/mafNet else echo "$bdir maf dir not found" exit 1 endif echo $mdir foreach f ($mdir/*) set c = $f:t:r:r echo " $c" nice mafSplit mafSplit.bed $sdir/$db/ $f end echo "gzipping $sdir/$db mafs" nice gzip $sdir/$db/* endif echo $mdir > $db.done '_EOF_' # << happy emacs chmod +x doSplit.csh grep -v hg18 ../species.list > split.list cat << '_EOF_' > template #LOOP doSplit.csh $(path1) {check out line+ $(path1).done} #ENDLOOP '_EOF_' gensub2 split.list single template jobList para create jobList # start these gently, this is a good load on the san filesystem para -maxPush=3 push # wait a while, verify these are running OK para push # let that run to a couple completions, a few minutes, then again: para try # etc ... 
# Completed: 9 of 9 jobs # CPU time in finished jobs: 9090s 151.50m 2.52h 0.11d 0.000 y # IO & Wait Time: 3093s 51.55m 0.86h 0.04d 0.000 y # Average job time: 1354s 22.56m 0.38h 0.02d # Longest finished job: 2134s 35.57m 0.59h 0.02d # Submission to last job: 2153s 35.88m 0.60h 0.02d # ready for the multiz run ssh pk cd /cluster/data/hg18/bed/multizPrimate # actually, the result directory here should be maf.split instead of maf mkdir -p maf run cd run mkdir penn # use latest penn utilities P=/cluster/bin/penn/multiz.v11.2007-03-19/multiz-tba cp -p $P/{autoMZ,multiz,maf_project} penn # list chrom chunks, any db dir will do; better would be for the # splitter to generate this file # We temporarily use __ instead of . to delimit chunk in filename # so we can use $(root) to get basename find /san/sanvol1/scratch/hg18/BRsplitStrictMafNet -type f \ | while read F; do basename $F; done \ | sed -e 's/.maf.gz//' -e 's/\./__/' | sort -u > chromChunks.list wc -l chromChunks.list # 93 chromChunks.list cat > autoMultiz.csh << '_EOF_' #!/bin/csh -ef set db = hg18 set c = $1 set maf = $2 set run = `pwd` set tmp = /scratch/tmp/$db/multiz.$c set pairs = /san/sanvol1/scratch/$db/BRsplitStrictMafNet rm -fr $tmp mkdir -p $tmp cp ../tree.primate.nh ../species.list $tmp pushd $tmp foreach s (`cat species.list`) set c2 = `echo $c | sed 's/__/./'` set in = $pairs/$s/$c2.maf set out = $db.$s.sing.maf if ($s == hg18) then continue endif if (-e $in.gz) then zcat $in.gz > $out else if (-e $in) then cp $in $out else echo "##maf version=1 scoring=autoMZ" > $out endif end set path = ($run/penn $path); rehash $run/penn/autoMZ + T=$tmp E=$db "`cat tree.primate.nh`" $db.*.sing.maf $c.maf popd cp $tmp/$c.maf $maf rm -fr $tmp '_EOF_' # << happy emacs chmod +x autoMultiz.csh cat << '_EOF_' > template #LOOP ./autoMultiz.csh $(root1) {check out line+ /cluster/data/hg18/bed/multizPrimate/maf/$(root1).maf} #ENDLOOP '_EOF_' # << emacs gensub2 chromChunks.list single template jobList para create jobList # Completed: 93 of 93 jobs # CPU time in finished jobs: 302126s 5035.43m 83.92h 3.50d 0.010 y # IO & Wait Time: 3499s 58.32m 0.97h 0.04d 0.000 y # Average job time: 3286s 54.77m 0.91h 0.04d # Longest finished job: 6972s 116.20m 1.94h 0.08d # Submission to last job: 7052s 117.53m 1.96h 0.08d # put the split maf results back together into single chroms ssh kkstore02 cd /cluster/data/hg18/bed/multizPrimate # here is where the result directory maf should have already been maf.split mv maf maf.split mkdir maf # going to sort out the redundant header garbage to leave a cleaner maf for C in `ls maf.split | sed -e "s#__.*##" | sort -u` do echo ${C} head -q -n 1 maf.split/${C}__*.maf | sort -u > maf/${C}.maf grep -h "^#" maf.split/${C}__*.maf | egrep -v "maf version=1|eof maf" | \ sed -e "s#_MZ_[^ ]* # #g; s#__[0-9]##g" | sort -u >> maf/${C}.maf grep -h -v "^#" maf.split/${C}__*.maf >> maf/${C}.maf tail -q -n 1 maf.split/${C}__*.maf | sort -u >> maf/${C}.maf done # load tables for a look ssh hgwdev mkdir -p /gbdb/hg18/multizPrimate/maf ln -s /cluster/data/hg18/bed/multizPrimate/maf/*.maf \ /gbdb/hg18/multizPrimate/maf # this generates a large 1 Gb multizPrimate.tab file in the directory # where it is running. Best to run this over in scratch. 
cd /scratch/tmp time nice -n +19 hgLoadMaf \ -pathPrefix=/gbdb/hg18/multizPrimate/maf hg18 multizPrimate # Loaded 12531777 mafs in 49 files from /gbdb/hg18/multizPrimate/maf # real 8m44.516s # load summary table time nice -n +19 cat /gbdb/hg18/multizPrimate/maf/*.maf \ | hgLoadMafSummary hg18 -minSize=30000 -mergeGap=1500 \ -maxSize=200000 multizPrimateSummary stdin # Created 1417364 summary blocks from 29928557 components # and 6981421 mafs from stdin # real 21m35.057s # Gap Annotation # prepare bed files with gap info ssh kkstore02 mkdir /cluster/data/hg18/bed/multizPrimate/anno cd /cluster/data/hg18/bed/multizPrimate/anno mkdir maf run # these actually already all exist from previous multiple alignments for DB in `cat ../species.list` do CDIR="/cluster/data/${DB}" if [ ! -f ${CDIR}/${DB}.N.bed ]; then echo "creating ${DB}.N.bed" echo twoBitInfo -nBed ${CDIR}/${DB}.2bit ${CDIR}/${DB}.N.bed else ls -og ${CDIR}/${DB}.N.bed fi done cd run rm -f nBeds sizes for DB in `grep -v hg18 ../../species.list` do echo "${DB} " ln -s /cluster/data/${DB}/${DB}.N.bed ${DB}.bed echo ${DB}.bed >> nBeds ln -s /cluster/data/${DB}/chrom.sizes ${DB}.len echo ${DB}.len >> sizes done ssh kki cd /cluster/data/hg18/bed/multizPrimate/anno/run cat << '_EOF_' > doAnno.csh #!/bin/csh -ef set dir = /cluster/data/hg18/bed/multizPrimate set c = $1 cat $dir/maf/${c}.maf | \ nice mafAddIRows -nBeds=nBeds stdin /cluster/data/hg18/hg18.2bit $2 '_EOF_' # << happy emacs chmod +x doAnno.csh cat << '_EOF_' > template #LOOP ./doAnno.csh $(root1) {check out line+ /cluster/data/hg18/bed/multizPrimate/anno/maf/$(root1).maf} #ENDLOOP '_EOF_' # << happy emacs cut -f1 /cluster/data/hg18/chrom.sizes > chrom.list gensub2 chrom.list single template jobList para create jobList para try ... check ... push ... etc. 
# Completed: 49 of 49 jobs # CPU time in finished jobs: 10782s 179.71m 3.00h 0.12d 0.000 y # IO & Wait Time: 3380s 56.33m 0.94h 0.04d 0.000 y # Average job time: 289s 4.82m 0.08h 0.00d # Longest finished job: 751s 12.52m 0.21h 0.01d # Submission to last job: 1479s 24.65m 0.41h 0.02d ssh hgwdev cd /cluster/data/hg18/bed/multizPrimate/anno mkdir -p /gbdb/hg18/multizPrimate/anno/maf ln -s /cluster/data/hg18/bed/multizPrimate/anno/maf/*.maf \ /gbdb/hg18/multizPrimate/anno/maf # by loading this into the table multizPrimate, it will replace the # previously loaded table with the unannotated mafs # huge temp files are made, do them on local disk cd /scratch/tmp time nice -n +19 hgLoadMaf -pathPrefix=/gbdb/hg18/multizPrimate/anno/maf \ hg18 multizPrimate # Loaded 7331265 mafs in 55 files from /gbdb/hg18/multizPrimate/anno/maf # real 8m31.092s cat /cluster/data/hg18/chrom.sizes | \ awk '{if ($2 > 1000000) { print $1 }}' | while read C do echo /gbdb/hg18/multizPrimate/anno/maf/$C.maf done | xargs cat | \ hgLoadMafSummary hg18 -minSize=30000 -mergeGap=1500 \ -maxSize=200000 multizPrimateSummary stdin # Created 1621960 summary blocks from 75794119 components and 12601786 # mafs from stdin # remove the multizPrimate*.tab files in this /scratch/tmp directory rm multizPrimate* ####### ################################################################################ # RE-SEQUENCING TRACE DOWNLOAD (DONE 2008-01-25, Andy) ssh kolossus bash cd /san/sanVol1/scratch/andy mkdir traces cd traces/ cat < "EOF" > getOldTraces.sh #!/bin/bash echo Retrieving sequences before Jan 2008 echo Starting at `date` # Query the database and figure out the total number of pages needed count=`./query_tracedb "query count species_code='HOMO SAPIENS' and strategy='Re-Sequencing' and load_date<'1/1/2008'"` pages=$(( (count/40000) + ((count % 40000) > 0) )) echo echo Total of $count sequences and $pages pages to retrieve echo for ((page=0; page < pages; page++)); do pagenum=`printf "%03d" $((page+1))` ./query_tracedb "query page_size 40000 page_number $page binary species_code='HOMO SAPIENS' and strategy='Re-Sequencing' and load_date<'1/1/2008'" > page.bin echo -n "Retrieving page $((page+1)) of $pages compressed fasta... " (echo -n "retrieve_gz fasta 0b"; cat page.bin ) | ./query_tracedb > page-${pagenum}.fa.gz echo "done at `date +%T`" echo -n "Retrieving page $((page+1)) of $pages compressed quality file... " (echo -n "retrieve_gz quality 0b"; cat page.bin ) | ./query_tracedb > page-${pagenum}.qa.gz echo "done at `date +%T`" echo -n "Retrieving page $((page+1)) of $pages xml file... " (echo -n "retrieve xml_info 0b"; cat page.bin ) | ./query_tracedb > page-${pagenum}.xml gzip page-${pagenum}.xml echo "done at `date +%T`" rm page.bin done echo echo All done at `date`! EOF chmod +x getOldTraces.sh screen ./getOldTraces.sh > download.log # detach screen # tail -f download.log #Retrieving sequences before Jan 2008 #Starting at Wed Jan 23 11:47:04 PST 2008 # #Total of 13978657 sequences and 350 pages to retrieve # #Retrieving page 1 of 350 compressed fasta... done at 11:48:40 #Retrieving page 1 of 350 compressed quality file... done at 11:49:10 #Retrieving page 1 of 350 xml file... done at 11:51:05 #Retrieving page 2 of 350 compressed fasta... done at 11:52:40 #Retrieving page 2 of 350 compressed quality file... done at 11:53:10 # ... #Retrieving page 350 of 350 compressed quality file... done at 07:07:08 #Retrieving page 350 of 350 xml file... done at 07:08:16 # #All done at Fri Jan 25 07:08:16 PST 2008! 
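# Before aligning, check that all 350 pages really arrived in triplicate and
# that the compressed files are intact (a sketch):
ls page-???.fa.gz page-???.qa.gz page-???.xml.gz | wc -l
#1050   (350 pages x 3 files each)
gzip -t page-*.gz
# silent when everything is intact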
################################################################################ # RE-SEQUENCING TRACE ALIGNMENT TO HG18 (DONE 2008-01-31, Andy) ssh kkr12u22 cd /san/sanVol1/scratch/andy/traces mkdir run cd run/ ls -1 /scratch/hg/hg18/nib/* | grep -v hap > nib.lst ls -1 /san/sanVol1/scratch/andy/traces/page-*.fa.gz > traces.lst cat < "EOF" > gsub #LOOP ./doBlat.sh {check in exists $(path1)} $(path2) {check out line+ $(root2)/$(root1).$(root2).maf} #ENDLOOP cat < "EOF" > doBlat.sh #!/bin/bash thisDir=`pwd -P` fa=`basename $1` nib=$2 f=${fa%.fa.gz} n=`basename $2` n=${n%.nib} name=${f}.${n} out=${name}.maf mkdir -p /scratch/tmp/andy/$name mkdir -p $n pushd /scratch/tmp/andy/$name cp $1 . blat -minMatch=12 -ooc=/scratch/hg/hg18/11.ooc -out=maf $nib $fa $out cp $out ${thisDir}/$n popd rm -rf /scratch/tmp/andy/$name EOF chmod +x doBlat.sh ssh pk cd /san/sanVol1/scratch/andy/traces/run gensub2 traces.lst nib.lst gsub spec sed 's/\.fa\.c/.c/' spec > tmp; mv tmp spec para create spec para try, push, check para time #15750 jobs in batch #100 jobs (including everybody's) in Parasol queue. #Checking finished jobs #Completed: 15750 of 15750 jobs #CPU time in finished jobs: 385991s 6433.19m 107.22h 4.47d 0.012 y #IO & Wait Time: 47866s 797.76m 13.30h 0.55d 0.002 y #Average job time: 28s 0.46m 0.01h 0.00d #Longest running job: 0s 0.00m 0.00h 0.00d #Longest finished job: 186s 3.10m 0.05h 0.00d #Submission to last job: 1551s 25.85m 0.43h 0.02d # Cat all the alignments ssh hgwdev cd /san/sanVol1/scratch/andy/traces/run head -n1 chrY/page-112.chrY.maf > maf.header for ((i=0; i < 350; i++)); do echo page $((i+1)) pagenum=`printf "%03d" $((i+1))` prefix=page-$pagenum newfile=cat/${prefix}.maf cp maf.header $newfile for f in `find . -name "${prefix}*"`; do tail +2 $f | sed 's/gnl|ti|//' >> $newfile done done ############################################################################ # Reload CCDS (2008-02-01 markd) # import ccds database as described in ccds.txt set db=hg18 # create and load ccdsGene and ccdsInfo tables from imported database /cluster/data/genbank/bin/x86_64/ccdsMkTables -loadDb ccds ${db} ccdsInfo ccdsGene # ccdsKgMap /cluster/data/genbank/bin/x86_64/mkCcdsGeneMap -db=${db} -loadDb ccdsGene knownGene ccdsKgMap checkTableCoords ${db} -verbose=2 ccdsGene # update all.jointer to include ${db} in ccdsDb joinerCheck -database=${db} -identifier=ccdsGeneId ~/compbio/genbank/kent/src/hg/makeDb/schema/all.joiner # request push of ccdsGene ccdsInfo ccdsKgMap # << emacs ############################################################################# # phastCons multizPrimage ## (DONE - 2008-02-11 braney ) # split mafs into 10M chunks and generate sufficient statistics # files for # phastCons ssh kki mkdir /cluster/data/hg18/bed/multizPrimate/msa.split mkdir -p /san/sanvol1/scratch/hg18/multizPrimate/cons/ss cd /cluster/data/hg18/bed/multizPrimate # just use primates cat << '_EOF_' > primates.list hg18 panTro2 ponAbe2 rheMac2 calJac1 otoGar1 '_EOF_' cd /cluster/data/hg18/bed/multizPrimate/msa.split zcat /san/sanvol1/braney/multizPrimate/chr1.maf.gz | \ perl -wpe 's/^s ([^.]+\.[^. 
]+)\.\S+/s $1/' | \ mafOrder stdin /cluster/data/hg18/bed/multizPrimate/primates.list chr1.maf twoBitToFa -seq=chr1 /scratch/data/hg18/hg18.2bit chr1.fa /cluster/bin/phast/$MACHTYPE/msa_split chr1.maf -i MAF -M chr1.fa \ -o SS -r chr1 -w 300000000,0 -I 1000 -B 5000 time nice -n +19 /cluster/bin/phast.2007-05-04/phyloFit -i SS \ chr1.1-247249719.ss --tree \ "(((((hg18,panTro2),ponAbe2),rheMac2),calJac1),otoGar1)" \ --out-root starting-tree rm chr1.maf chr1.fa chr1.1-247249719.ss mkdir -p /san/sanvol1/scratch/hg18/multizPrimate/cons/estimate cp msa.split/starting-tree.mod /san/sanvol1/scratch/hg18/multizPrimate/cons/estimate cat << '_EOF_' > doSplit.csh #!/bin/csh -ef set MAFS = /san/sanvol1/braney/multizPrimate set WINDOWS = /san/sanvol1/scratch/hg18/multizPrimate/cons/ss pushd $WINDOWS set c = $1 rm -fr $c mkdir $c twoBitToFa -seq=$c /scratch/data/hg18/hg18.2bit /scratch/tmp/hg18.$c.fa set TMP = /scratch/BR.$c.maf zcat $MAFS/$c.maf.gz | perl -wpe 's/^s ([^.]+\.[^. ]+)\.\S+/s $1/' | \ mafOrder stdin /cluster/data/hg18/bed/multizPrimate/primates.list $TMP /cluster/bin/phast/$MACHTYPE/msa_split $TMP \ -i MAF \ -M /scratch/tmp/hg18.$c.fa \ -o SS -r $c/$c -w 10000000,0 -I 1000 -B 5000 rm -f $TMP /scratch/tmp/hg18.$c.fa popd date >> $c.done '_EOF_' # << happy emacs chmod +x doSplit.csh cat << '_EOF_' > template #LOOP doSplit.csh $(root1) {check out line+ $(root1).done} #ENDLOOP '_EOF_' # << happy emacs # do the easy ones first to see some immediate results ls -1S -r ../anno/maf | sed -e "s/.maf//" > maf.list gensub2 maf.list single template jobList para create jobList para try ... check ... etc # Completed: 49 of 49 jobs # CPU time in finished jobs: 3520s 58.66m 0.98h 0.04d 0.000 y # IO & Wait Time: 1200s 20.00m 0.33h 0.01d 0.000 y # Average job time: 96s 1.61m 0.03h 0.00d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 464s 7.73m 0.13h 0.01d # Submission to last job: 723s 12.05m 0.20h 0.01d # XXXX Estimates were attempted, not really very useful, instead, as seen # below, merely take the cons and noncons trees from the mouse 30-way # Estimate phastCons parameters # see also: # http://compgen.bscb.cornell.edu/~acs/phastCons-HOWTO.html # Create a list of .ss files over 3,000,000 in length # this is almost everything cd /san/sanvol1/scratch/hg18/multizPrimate/cons/ss ls -1l chr*/chr*.ss | egrep -v "_hap|chrUn|random" | \ awk '$5 > 3000000 {print $9;}' > ../tuningRun.list # Set up parasol directory to calculate trees on these 50 regions ssh pk mkdir /cluster/data/hg18/bed/multizPrimate/treeRun2 cd /cluster/data/hg18/bed/multizPrimate/treeRun2 mkdir tree log most # Tuning this loop should come back to here to recalculate # Create script that calls phastCons with right arguments cat > makeTree.csh << '_EOF_' #!/bin/csh -fe set SAN="/san/sanvol1/scratch/hg18/multizPrimate/cons" set SS=$1 set C=$1:h set F=$1:t set tmpDir="/scratch/tmp/pA2_$2" rm -fr $tmpDir mkdir $tmpDir mkdir -p log/${C} tree/${C} most/${C} cp -p $SAN/ss/$1 $tmpDir/$F cp -p $SAN/estimate/starting-tree.mod $tmpDir pushd $tmpDir /cluster/bin/phast/$MACHTYPE/phastCons $F starting-tree.mod \ --gc 0.355 --nrates 1,1 --no-post-probs --ignore-missing \ --expected-length 45 --target-coverage 0.3 --most-conserved $F.most \ --quiet --log $F.log --estimate-trees $F.tree popd cp -p $tmpDir/$F.log log/$C cp -p $tmpDir/$F.most most/$C cp -p $tmpDir/$F.tree.*cons.mod tree/$C rm -fr $tmpDir '_EOF_' # << happy emacs chmod a+x makeTree.csh # Create gensub file cat > template << '_EOF_' #LOOP makeTree.csh $(path1) $(num1) #ENDLOOP 
'_EOF_' # << happy emacs # Make cluster job and run it scp -p braney@pk:/san/sanvol1/scratch/hg18/multizPrimate/cons/tuningRun.list . gensub2 tuningRun.list single template jobList para create jobList para try/push/check/etc # Completed: 310 of 310 jobs # CPU time in finished jobs: 226767s 3779.45m 62.99h 2.62d 0.007 y # IO & Wait Time: 1224s 20.40m 0.34h 0.01d 0.000 y # Average job time: 735s 12.26m 0.20h 0.01d # Longest finished job: 908s 15.13m 0.25h 0.01d # Submission to last job: 4948s 82.47m 1.37h 0.06d # Now combine parameter estimates. We can average the .mod files # using phyloBoot. This must be done separately for the conserved # and nonconserved models ls -1 tree/chr*/*.cons.mod > cons.list /cluster/bin/phast/$MACHTYPE/phyloBoot --read-mods '*cons.list' \ --output-average ave.cons.mod > cons_summary.txt ls -1 tree/chr*/*.noncons.mod > noncons.list /cluster/bin/phast/$MACHTYPE/phyloBoot --read-mods '*noncons.list' \ --output-average ave.noncons.mod > noncons_summary.txt sort -k1,1 -k2,2n most/chr*/*.most > mostConserved.bed wc -l mostConserved.bed # 1192414 mostConserved.bed # measuring entropy # consEntopy # ave.cons.mod ave.noncons.mod --NH 9.78 /cluster/bin/phast/$MACHTYPE/consEntropy .3 45 \ ave.cons.mod ave.noncons.mod # Transition parameters: gamma=0.300000, omega=45.000000, mu=0.022222, # nu=0.009524 # Relative entropy: H=0.141789 bits/site # Expected min. length: L_min=98.721504 sites # Expected max. length: L_max=62.917932 sites # Phylogenetic information threshold: PIT=L_min*H=13.997639 bits ssh hgwdev featureBits -noRandom -noHap hg18 `pwd`/mostConserved.bed # 372348946 bases of 2858034764 (13.028%) in intersection ssh hgwdev featureBits -noRandom -noHap -enrichment hg18 genscan:cds \ `pwd`/mostConserved.bed # genscan:cds 1.927%, # mostConserved.bed 13.028%, # both 0.300%, cover 15.57%, enrich 1.20x # Estimates could be made, but more correctly, take the 30-way # .mod file, and re-use it here. ssh hgwdev cd /cluster/data/hg18/bed/multizPrimate # cp -p /cluster/data/mm9/bed/multiz30way/mm9.30way.mod . # add up the C and G: grep BACKGROUND treeRun2/ave.noncons.mod | awk '{printf "%0.3f\n", $3 + $4;}' # 0.355 # This 0.355 is used in the --gc argument below # Run phastCons # This job is I/O intensive in its output files, thus it is all # working over in /scratch/tmp/ ssh pk mkdir -p /cluster/data/hg18/bed/multizPrimate/cons/run.cons cd /cluster/data/hg18/bed/multizPrimate/cons/run.cons # there are going to be several different phastCons runs using # this same script. They trigger off of the current working directory # $cwd:t which is the "grp" in this script. It is one of: # all gliers placentals cat << '_EOF_' > doPhast.csh #!/bin/csh -fe set PHASTBIN = /cluster/bin/phast.2007-05-04 set c = $1 set f = $2 set len = $3 set cov = $4 set rho = $5 set grp = $cwd:t set tmp = /scratch/tmp/$f set cons = /cluster/data/hg18/bed/multizPrimate/cons mkdir -p $tmp set san = /san/sanvol1/scratch/hg18/multizPrimate/cons cp -p $cons/$grp/*.mod . 
cp -p $san/ss/$c/$f.ss $cons/$grp/*.mod $tmp pushd $tmp > /dev/null $PHASTBIN/phastCons $f.ss ave.cons.mod,ave.noncons.mod \ --expected-length $len --target-coverage $cov --quiet \ --seqname $c --idpref $c --most-conserved $f.bed --score > $f.pp # $PHASTBIN/phastCons $f.ss $grp.mod \ # --rho $rho --expected-length $len --target-coverage $cov --quiet \ # --seqname $c --idpref $c --most-conserved $f.bed --score > $f.pp endif popd > /dev/null mkdir -p $san/$grp/pp/$c $san/$grp/bed/$c sleep 4 touch $san/$grp/pp/$c $san/$grp/bed/$c rm -f $san/$grp/pp/$c/$f.pp rm -f $san/$grp/bed/$c/$f.bed mv $tmp/$f.pp $san/$grp/pp/$c mv $tmp/$f.bed $san/$grp/bed/$c rm -fr $tmp '_EOF_' # << happy emacs chmod a+x doPhast.csh cat << '_EOF_' > template #LOOP ../doPhast.csh $(root1) $(file1) 45 .3 .31 {check out line+ /san/sanvol1/scratch/hg18/multizPrimate/cons/all/bed/$(root1)/$(file1).bed} #ENDLOOP '_EOF_' # << happy emacs # Create parasol batch and run it pushd /san/sanvol1/scratch/hg18/multizPrimate/cons ls -1 ss/chr*/chr*.ss | sed 's/.ss$//' > \ /cluster/data/hg18/bed/multizPrimate/cons/ss.list popd # run for all species cd .. mkdir -p all run.cons/all cd all # /cluster/bin/phast.new/tree_doctor ../../mm9.30way.mod \ # --prune-all-but=hg18,hg18,panTro2,rheMac2,calJac1,mm9,monDom4,ornAna1 \ # > all.mod cd ../run.cons/all # root1 == chrom name, file1 == ss file name without .ss suffix # Create template file for "all" run cat << '_EOF_' > template #LOOP ../doPhast.csh $(root1) $(file1) 45 .3 .31 {check out line+ /san/sanvol1/scratch/hg18/multizPrimate/cons/all/bed/$(root1)/$(file1).bed} #ENDLOOP '_EOF_' # << happy emacs gensub2 ../../ss.list single template jobList para create jobList para try ... check ... push ... etc. # crashed jobs are OK methinks since we're checking output in # bed file instead of pp file # Completed: 332 of 337 jobs # Crashed: 5 jobs # CPU time in finished jobs: 11572s 192.86m 3.21h 0.13d 0.000 y # IO & Wait Time: 3189s 53.15m 0.89h 0.04d 0.000 y # Average job time: 44s 0.74m 0.01h 0.00d # Longest finished job: 60s 1.00m 0.02h 0.00d # Submission to last job: 564s 9.40m 0.16h 0.01d # create Most Conserved track ssh kolossus cd /san/sanvol1/scratch/hg18/multizPrimate/cons/all time nice -n +19 cat bed/*/chr*.bed | sort -k1,1 -k2,2n | \ awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", $1, $2, $3, $5, $5;}' | \ /cluster/bin/scripts/lodToBedScore /dev/stdin > mostConserved.bed # ~ 1 minute cp -p mostConserved.bed /cluster/data/hg18/bed/multizPrimate/cons/all # load into database ssh hgwdev cd /cluster/data/hg18/bed/multizPrimate/cons/all time nice -n +19 hgLoadBed hg18 phastConsElementsPrimate mostConserved.bed # Loaded 1431934 elements of size 5 # Try for 5% overall cov, and 70% CDS cov featureBits hg18 phastConsElementsPrimate # 460640890 bases of 2881515245 (15.986%) in intersection # Create merged posterier probability file and wiggle track data files # currently doesn't matter where this is performed, the san is the same # network distance from all machines. # sort by chromName, chromStart so that items are in numerical order # for wigEncode cd /san/sanvol1/scratch/hg18/multizPrimate/cons/all cat << '_EOF_' > gzipAscii.sh #!/bin/sh TOP=`pwd` export TOP mkdir -p phastConsPrimateScores for D in pp/chr* do C=${D/pp\/} out=phastConsPrimateScores/${C}.data.gz echo "${D} > ${C}.data.gz" ls $D/*.pp | sort -n -t\. 
-k2 | xargs cat | \ gzip > ${out} done '_EOF_' # << happy emacs chmod +x gzipAscii.sh time nice -n +19 ./gzipAscii.sh # real 47m46.099s # copy the phastCons8wayScores to: # /cluster/data/hg18/bed/multizPrimate/downloads/phastCons8way/phastConsScores # for hgdownload downloads # Create merged posterier probability file and wiggle track data files # currently doesn't matter where this is performed, the san is the same # network distance from all machines. cd /san/sanvol1/scratch/hg18/multizPrimate/cons/all time nice -n +19 ls phastConsPrimateScores/*.data.gz | xargs zcat \ | wigEncode -noOverlap stdin phastConsPrimate.wig phastConsPrimate.wib # Converted stdin, upper limit 1.00, lower limit 0.00 # real 30m18.821s time nice -n +19 cp -p *.wi? /cluster/data/hg18/bed/multizPrimate/cons/all # real 1m26.426s # Load gbdb and database with wiggle. ssh hgwdev cd /cluster/data/hg18/bed/multizPrimate/cons/all ln -s `pwd`/phastConsPrimate.wib /gbdb/hg18/multizPrimate/phastConsPrimate.wib time nice -n +19 hgLoadWiggle -pathPrefix=/gbdb/hg18/multizPrimate hg18 \ phastConsPrimate phastConsPrimate.wig # real 0m53.686s # Create histogram to get an overview of all the data ssh hgwdev cd /cluster/data/hg18/bed/multizPrimate/cons/all time nice -n +19 hgWiggle -doHistogram \ -hBinSize=0.001 -hBinCount=1000 -hMinVal=0.0 -verbose=2 \ -db=hg18 phastConsPrimate > histogram.data 2>&1 # real 5m10.426s # create plot of histogram: cat << '_EOF_' | gnuplot > histo.png set terminal png small color x000000 xffffff xc000ff x66ff66 xffff00 x00ffff set size 1.4, 0.8 set key left box set grid noxtics set grid ytics set title " Human Histogram phastConsPrimate track" set xlabel " phastConsPrimate score" set ylabel " Relative Frequency" set y2label " Cumulative Relative Frequency (CRF)" set y2range [0:1] set y2tics set yrange [0:0.02] plot "histogram.data" using 2:5 title " RelFreq" with impulses, \ "histogram.data" using 2:7 axes x1y2 title " CRF" with lines '_EOF_' # << happy emacs display histo.png & ############################################################################# ## Annotate multizPrimate multiple alignment with gene annotations ## (DONE - 2008-02-11 braney ) # Gene frames ## survey all genomes to see what type of gene track to use ssh hgwdev mkdir /cluster/data/hg18/bed/multizPrimate/frames cd /cluster/data/hg18/bed/multizPrimate/frames # dbs: eriEur1, cavPor2, sorAra1 do not exist, can not look at them cat << '_EOF_' > showGenes.csh #!/bin/csh -fe foreach db (`cat ../species.list`) echo -n "${db}: " echo -n "Tables: " set tables = `hgsql $db -N -e "show tables like '%Gene%'"` foreach table ($tables) if ($table == "ensGene" || $table == "refGene" || $table == "mgcGenes" || \ $table == "knownGene") then set count = `hgsql $db -N -e "select count(*) from $table"` echo -n "${table}: ${count}, " endif end set orgName = `hgsql hgcentraltest -N -e \ "select scientificName from dbDb where name='$db'"` set orgId = `hgsql hg18 -N -e \ "select id from organism where name='$orgName'"` if ($orgId == "") then echo "Mrnas: 0" else set count = `hgsql hg18 -N -e "select count(*) from gbCdnaInfo where organism=$orgId"` echo "Mrnas: ${count}" endif end '_EOF_' # << happy emacs chmod +x ./showGenes.csh # given this output, manually sorted for this display: # calJac1: Tables: Mrnas: 3558 # canFam2: Tables: ensGene: 25568, refGene: 864, Mrnas: 367629 # hg18: Tables: ensGene: 43569, knownGene: 56722, mgcGenes: 28497, refGene: # 26066, Mrnas: 8354195 # mm9: Tables: ensGene: 43795, knownGene: 49409, mgcGenes: 22368, refGene: # 21395, 
# Mrnas: 5093221
# otoGar1: Tables: Mrnas: 0
# panTro2: Tables: ensGene: 32852, mgcGenes: 4, refGene: 26344, Mrnas: 6346
# ponAbe2: Tables: Mrnas: 0
# rheMac2: Tables: ensGene: 38561, refGene: 445, Mrnas: 61770
# rn4: Tables: ensGene: 33745, knownGene: 8202, mgcGenes: 5704, refGene: 14498,
# Mrnas: 872209
# tupBel1: Tables: Mrnas: 2364
# use knownGene for hg18, mm9
# use ensGene for rn4, canFam2, panTro2, rheMac2
# use Mrnas for calJac1, ponAbe2
# no annotations for
# tupBel1, otoGar1
mkdir genes
# knownGene
for DB in hg18 mm9
do
    hgsql -N -e "select name,chrom,strand,txStart,txEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds from knownGene" ${DB} \
    | genePredSingleCover stdin stdout | gzip -2c \
    > /scratch/tmp/${DB}.tmp.gz
    mv /scratch/tmp/${DB}.tmp.gz genes/$DB.gp.gz
    echo "${DB} done"
done
# ensGene
for DB in rn4 canFam2 panTro2 rheMac2
do
    hgsql -N -e "select name,chrom,strand,txStart,txEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds from ensGene" ${DB} \
    | genePredSingleCover stdin stdout | gzip -2c \
    > /scratch/tmp/${DB}.tmp.gz
    mv /scratch/tmp/${DB}.tmp.gz genes/$DB.gp.gz
    echo "${DB} done"
done
# and finally, using the mrna tables
for DB in calJac1 ponAbe2
do
    tmpExt=`mktemp temp.XXXXXX`
    tmpMrnaCds=${DB}.mrna-cds.${tmpExt}
    tmpMrna=${DB}.mrna.${tmpExt}
    tmpCds=${DB}.cds.${tmpExt}
    hgsql -N -e 'select all_mrna.qName,cds.name,all_mrna.* \
        from all_mrna,gbCdnaInfo,cds \
        where (all_mrna.qName = gbCdnaInfo.acc) and \
        (gbCdnaInfo.cds != 0) and (gbCdnaInfo.cds = cds.id)' \
        $DB > ${tmpMrnaCds}
    cut -f 1-2 ${tmpMrnaCds} > ${tmpCds}
    cut -f 4-100 ${tmpMrnaCds} > ${tmpMrna}
    mrnaToGene -cdsFile=${tmpCds} -smallInsertSize=8 -quiet ${tmpMrna} stdout | \
        genePredSingleCover stdin stdout | gzip -2c > /scratch/tmp/$DB.tmp.gz
    rm ${tmpMrnaCds} ${tmpMrna} ${tmpCds}
    mv /scratch/tmp/$DB.tmp.gz genes/$DB.gp.gz
    rm -f $tmpExt
    echo "${DB} done"
done
ssh kkstore06
cd /cluster/data/hg18/bed/multizPrimate/frames
time (cat ../anno/maf/*.maf | nice -n +19 genePredToMafFrames hg18 stdin stdout rn4 genes/rn4.gp.gz mm9 genes/mm9.gp.gz hg18 genes/hg18.gp.gz rheMac2 genes/rheMac2.gp.gz ponAbe2 genes/ponAbe2.gp.gz panTro2 genes/panTro2.gp.gz canFam2 genes/canFam2.gp.gz calJac1 genes/calJac1.gp.gz | gzip > multizPrimate.mafFrames.gz) > frames.log 2>&1
# see what it looks like in terms of number of annotations per DB:
zcat multizPrimate.mafFrames.gz | cut -f4 | sort | uniq -c | sort -n
#   2732 calJac1
# 190927 hg18
# 195671 panTro2
# 208637 rheMac2
# 230764 mm9
# 231026 rn4
# 248086 canFam2
# load the resulting file
ssh hgwdev
cd /cluster/data/hg18/bed/multizPrimate/frames
time nice -n +19 hgLoadMafFrames hg18 multizPrimateFrames \
    multizPrimate.mafFrames.gz
# real 1m1.893s
# enable the trackDb entries:
# frames multizPrimateFrames
# irows on
#############################################################################
## Add CTD data (DONE - 2008-02-22, updated 2008-03-07, Fan )
mkdir /cluster/store11/gs.19/build36/bed/ctd021508
cd /cluster/store11/gs.19/build36/bed/ctd021508
# Download chem_gene_ixns.tsv from CTD site, http://ctd.mdibl.org/downloads/.
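# Optional sanity check (not part of the original build log): confirm the
# download is tab-separated and non-empty before loading, and compare its
# line count against the table row count after the load below.
#   head -1 chem_gene_ixns.tsv
#   wc -l chem_gene_ixns.tsv
#   hgsql ctd -e 'select count(*) from chem_gene_ixns'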
hgsql hg18 -e 'create database ctd' hgsql ctd < ~/kent/src/hg/lib/chem_gene_ixns.sql hgsql ctd -e 'load data local infile "chem_gene_ixns.tsv" into table chem_gene_ixns' # create sorted data hgsql hg18 -N -e \ 'select x.geneSymbol, ChemicalId, count(distinct Interaction), ChemicalName from kgXref x, ctd.chem_gene_ixns c where x.geneSymbol=c.GeneSymbol group by x.geneSymbol, ChemicalId'|\ sort -k 1,1 -k 3,3nr -k 4,4 >ctdSorted.tab hgsql hgFixed < ~/kent/src/hg/lib/ctdSorted.sql hgsql hgFixed -e 'load data local infile "ctdSorted.tab" into table ctdSorted' ############################################################################# # CREATE huge TABLE FOR HuGE LINK (DONE 3/6/08, Fan) # Get HuGEgeneList.txt (list of HuGE genes from HuGE collaborator). mkdir /cluster/store11/gs.19/build36/bed/HuGE cd /cluster/store11/gs.19/build36/bed/HuGE # put the file there. cp HuGEgeneList.txt huge.tab # get rid of header lines and blank lines at the end. vi huge.tab hgsql hg17 < ~/kent/src/hg/lib/huge.sql hgsql hg18 < ~/kent/src/hg/lib/huge.sql hgsql hg17 -e 'load data local infile "huge.tab" into table huge' hgsql hg18 -e 'load data local infile "huge.tab" into table huge' ############################################################################# ############################################################################# # ULTRACONSERVED TRACKS (LIFT FROM HG17) (DONE 2008-03-10, Andy) ssh hgwdev cd /cluster/data/hg18/bed mkdir ultras cd ultras/ echo "select chrom,chromStart,chromEnd,name from uc16" \ | hgsql hg17 | tail +2 > uc16Hg17.bed echo "select chrom,chromStart,chromEnd,name from ux16" \ | hgsql hg17 | tail +2 > ux16Hg17.bed liftOver uc16Hg17.bed /gbdb/hg17/liftOver/hg17ToHg18.over.chain.gz \ uc16Hg18.bed uc16Hg18.unmapped liftOver ux16Hg17.bed /gbdb/hg17/liftOver/hg17ToHg18.over.chain.gz \ ux16Hg18.bed ux16Hg18.unmapped hgLoadBed hg18 uc16 uc16Hg18.bed hgLoadBed hg18 ux16 ux16Hg18.bed ############################################################################# # TAJIMA'S D (LIFTOVER FROM HG17) (DONE 3/17/08 angie) ssh hgwdev mkdir /cluster/data/hg18/bed/tajdLiftOver cd /cluster/data/hg18/bed/tajdLiftOver # The submitted hg17 bedGraph custom tracks had 1-based start coords, # so correct; also, the tajdSnp* tables used a sql command to set # the rs names, so get the data from SQL not file: set loChain = /cluster/data/hg17/bed/liftOver/hg17ToHg18.over.chain.gz foreach pop (Ad Ed Xd) zcat /cluster/data/hg17/bed/tajdpoly/20050603/hg17.tajd$pop.bedGraph.gz \ | awk '{print $1 "\t" $2-1 "\t" $3 "\t" $4}' \ | liftOver stdin -minMatch=0.5 \ $loChain hg18.tajd$pop.bedGraph hg17.tajd$pop.unmapped hgsql hg17 -NBe "select chrom,chromStart,chromEnd,name from tajdSnp$pop" \ | liftOver stdin \ $loChain hg18.tajdSnp$pop.bed hg17.tajdSnp$pop.unmapped end foreach pop (Ad Ed Xd) hgLoadBed hg18 tajdSnp$pop hg18.tajdSnp$pop.bed hgLoadBed -bedGraph=4 hg18 tajd$pop hg18.tajd$pop.bedGraph end # The hg17 build had some fancy sql to find items overlapping with gaps, # awk'd to make sql to delete those items. Use featureBits to find: foreach pop (Ad Ed Xd) featureBits hg18 -countGaps tajdSnp$pop gap -bed=tajdSnp$pop.gap.bed featureBits hg18 -countGaps tajd$pop gap -bed=tajd$pop.gap.bed end wc -l *.gap.bed # 8 tajdAd.gap.bed # 8 tajdEd.gap.bed # 0 tajdSnpAd.gap.bed # 0 tajdSnpEd.gap.bed # 0 tajdSnpXd.gap.bed # 8 tajdXd.gap.bed diff tajdAd.gap.bed tajdEd.gap.bed diff tajdAd.gap.bed tajdXd.gap.bed # No output from either diff -- same ranges. 
awk '{print $3 - $2;}' tajdAd.gap.bed
#2605
#5000
#5000
#1000
#1199
#1359
#5000
#4100
# Actually, I disagree with removing the items that overlap those.
# As the description page says, each 10kb region is really the center
# of a 100kb window.  Those windows will overlap gaps -- and if the
# center 10k of a window happens to overlap a gap, the whole window is
# no worse than a window that overlaps a gap 1/3 of the way in instead
# of 1/2.
#############################################################################
# ADD ALLEN BRAIN CORTEX LINK (DONE, 2/12/08, Fan)
mkdir -p /cluster/store11/gs.19/build36/bed/allenBrain
cd /cluster/store11/gs.19/build36/bed/allenBrain
# save list of genes from Allen Brain into file allenBrainGene.tab
hgsql hg18 < ~/src/hg/lib/allenBrainGene.sql
hgsql hg18 -e \
    'load data local infile "allenBrainGene.tab" into table allenBrainGene'
#############################################################################
# BLASTZ/CHAIN/NET equCab2 (DONE - 2008-04-10 - larrym)
ssh kkstore04
screen # use screen to control this multi-day job
mkdir /cluster/data/hg18/bed/blastz.equCab2.2008-04-10
cd /cluster/data/hg18/bed/blastz.equCab2.2008-04-10
cat << '_EOF_' > DEF
# Human vs. Horse
BLASTZ_M=50
# TARGET: Human Hg18
SEQ1_DIR=/scratch/data/hg18/nib
SEQ1_LEN=/cluster/data/hg18/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Horse
SEQ2_DIR=/scratch/data/equCab2/equCab2.2bit
SEQ2_LEN=/cluster/data/equCab2/chrom.sizes
SEQ2_CTGDIR=/san/sanvol1/scratch/equCab2/equCab2.UnScaffolds.2bit
SEQ2_CTGLEN=/san/sanvol1/scratch/equCab2/equCab2.UnScaffolds.sizes
SEQ2_LIFT=/cluster/data/equCab2/jkStuff/equCab2.chrUn.lift
SEQ2_CHUNK=20000000
SEQ2_LIMIT=100
SEQ2_LAP=0
BASE=/cluster/data/hg18/bed/blastz.equCab2.2008-04-10
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
time doBlastzChainNet.pl `pwd`/DEF \
    -verbose=2 -bigClusterHub=pk \
    -chainMinScore=3000 -chainLinearGap=medium \
    -blastzOutRoot /cluster/bluearc/equCab2/blastz.hg18 >>& do.log &
# failed so had to rerun stuff manually then, continue thus:
time doBlastzChainNet.pl `pwd`/DEF \
    -verbose=2 -bigClusterHub=pk -syntenicNet -continue=load \
    -chainMinScore=3000 -chainLinearGap=medium \
    -blastzOutRoot /cluster/bluearc/equCab2/blastz.hg18 >>& do.log &
0.157u 0.084s 1:21:15.25 0.0% 0+0k 0+0io 0pf+0w
ln -s blastz.equCab2.2008-04-10 /cluster/data/hg18/bed/blastz.equCab2
featureBits hg18 -chrom=chr1 chainEquCab2Link
# 133103986 bases of 224999719 (59.157%) in intersection
cd /cluster/data/hg18/bed/blastz.equCab2.2008-04-10
cat fb.hg18.chainEquCab2Link.txt
# 1647122438 bases of 2881515245 (57.162%) in intersection
# re-running with the fixed UnScaffolds business and the fixed chr27:
mkdir /hive/data/genomes/hg18/bed/blastzEquCab2.2008-12-01
cd /hive/data/genomes/hg18/bed/blastzEquCab2.2008-12-01
cat << '_EOF_' > DEF
# Human vs.
Horse BLASTZ=blastz BLASTZ_M=50 # TARGET: Human Hg18 SEQ1_DIR=/scratch/data/hg18/nib SEQ1_LEN=/scratch/data/hg18/chrom.sizes SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY: Horse SEQ2_DIR=/scratch/data/equCab2/equCab2.2bit SEQ2_LEN=/scratch/data/equCab2/chrom.sizes SEQ2_CTGDIR=/hive/data/genomes/equCab2/equCab2.UnScaffolds.2bit SEQ2_CTGLEN=/hive/data/genomes/equCab2/equCab2.UnScaffolds.sizes SEQ2_LIFT=/hive/data/genomes/equCab2/jkStuff/equCab2.chrUn.lift SEQ2_CHUNK=20000000 SEQ2_LIMIT=100 SEQ2_LAP=0 BASE=/hive/data/genomes/hg18/bed/blastzEquCab2.2008-12-01 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs time nice -n +19 doBlastzChainNet.pl `pwd`/DEF \ -verbose=2 -workhorse=hgwdev -smallClusterHub=pk -bigClusterHub=pk \ -chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 # broken chain step for chr19, ran manually all day long on swarm, then time nice -n +19 doBlastzChainNet.pl `pwd`/DEF \ -continue=chainMerge -verbose=2 -workhorse=hgwdev \ -stop=net -smallClusterHub=pk -bigClusterHub=pk \ -chainMinScore=3000 -chainLinearGap=medium > chainMerge.log 2>&1 XXX - running Tue Dec 2 15:42:18 PST 2008 time nice -n +19 doBlastzChainNet.pl `pwd`/DEF \ -continue=syntenicNet -syntenicNet -verbose=2 -workhorse=hgwdev \ -stop=syntenicNet -smallClusterHub=pk -bigClusterHub=pk \ -debug -chainMinScore=3000 -chainLinearGap=medium > syntenicNet.log 2>&1 ############################################################################# # MAKE PCR TARGET FOR UCSC GENES (DONE 4/18/08 angie - UPDATED 11/4/08) ssh hgwdev mkdir /cluster/data/hg18/bed/mrnaPcr cd /cluster/data/hg18/bed/mrnaPcr # First, get consistent FA and PSL for UCSC Genes. # Initially I tried to use files from /cluster/data/hg18/bed/ucsc.10/: # subColumn 10 /cluster/data/hg18/bed/ucsc.10/rnaToGenome.psl # /cluster/data/hg18/bed/ucsc.10/txToAcc.tab ucscGenes.hg18.psl # /cluster/data/hg18/bed/ucsc.10/ucscGenes.fa # But the psl was not from exactly the same seq's as in the fa. # Jim's suggestion: use sequenceForBed to get genomic-translated # sequences, and then genePredToFakePsl. sequenceToBed must be # run on hgwdev. genePredToBed /cluster/data/hg18/bed/ucsc.11/ucscGenes.gp > ucscGenes.bed hgsql hg18 -NBe 'select kgId,geneSymbol from kgXref' \ | perl -wpe 's/^(\S+)\t(\S+)/$1\t${1}__$2/ || die;' \ > idSub.txt subColumn 4 ucscGenes.bed idSub.txt ucscGenesIdSubbed.bed sequenceForBed -keepName -db=hg18 -bedIn=ucscGenesIdSubbed.bed \ -fastaOut=stdout \ | faToTwoBit stdin kgTargetSeq.2bit cut -f 1-10 /cluster/data/hg18/bed/ucsc.11/ucscGenes.gp \ | genePredToFakePsl hg18 stdin kgTargetAli.psl /dev/null # Load up the UCSC Genes target PSL table and put 2bit in /gbdb:: cd /cluster/data/hg18/bed/mrnaPcr hgLoadPsl hg18 kgTargetAli.psl mkdir /gbdb/hg18/targetDb ln -s /cluster/data/hg18/bed/mrnaPcr/kgTargetSeq.2bit /gbdb/hg18/targetDb/ # Ask cluster-admin to start an untranslated, -stepSize=5 gfServer on # /gbdb/hg18/targetDb/kgTargetSeq.2bit . 
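# For reference only: the invocation follows the same pattern as the
# hand-started gfServer for mrnaTargetSeq in the next section (any flags
# beyond -stepSize=5 are assumed); host and port match the blatServers
# row inserted below:
#   gfServer -stepSize=5 start blat13 17799 /gbdb/hg18/targetDb/kgTargetSeq.2bit &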
ssh hgwdev # Add records to hgcentraltest blatServers and targetDb: hgsql hgcentraltest -e \ 'INSERT into blatServers values ("hg18KgNov08", "blat13", 17799, 0, 1);' hgsql hgcentraltest -e \ 'INSERT into targetDb values("hg18KgNov08", "UCSC Genes", \ "hg18", "kgTargetAli", "", "", \ "/gbdb/hg18/targetDb/kgTargetSeq.2bit", 1, now(), "");' ############################################################################# # MAKE PCR TARGET FOR SNAPSHOT OF ALL_MRNA (DONE 4/18/08 angie) ssh hgwdev # Load up native mRNA target tables: hgsql hg18 -NBe 'select qName from all_mrna' \ | sort -u > mrnaAccs.txt $HOME/kent/src/hg/makeDb/genbank/bin/$MACHTYPE/gbGetSeqs \ -gbRoot=/gbdb/genbank -accFile=mrnaAccs.txt \ -db=hg18 -native genbank mrna mrnaTargetSeq.fa faToTwoBit mrnaTargetSeq.fa mrnaTargetSeq.2bit ln -s /cluster/data/hg18/bed/mrnaPcr/mrnaTargetSeq.2bit \ /gbdb/hg18/targetDb/ hgsql hg18 -e ' \ create table mrnaTargetAli select * from all_mrna; \ alter table mrnaTargetAli add index (tName,bin); \ alter table mrnaTargetAli add index (qName);' rm *.tab ssh kolossus # Start up gfServer for mrnaTargetSeq: cd /cluster/data/hg18/bed/mrnaPcr faToTwoBit mrnaTargetSeq.fa mrnaTargetSeq.2bit gfServer -stepSize=5 -canStop start localhost 17991 mrnaTargetSeq.2bit & ssh hgwdev # Add records to hgcentraltest blatServers and targetDb: hgsql hgcentraltest -e \ 'INSERT into blatServers values ("hg18MrnaApr08", "kolossus", 17991, 0, 1);' hgsql hgcentraltest -e \ 'INSERT into targetDb values("hg18MrnaApr08", "Human mRNAs", \ "hg18", "mrnaTargetAli", "", "", \ "/gbdb/hg18/targetDb/mrnaTargetSeq.2bit", 2, now(), "");' ############################################################################# # Reload CCDS from CCDS.20080502 dump (2008-05-03 markd) # import ccds database as described in ccds.txt set db=hg18 set ncbiBld=36.3 # create and load ccdsGene and ccdsInfo tables from imported database /cluster/data/genbank/bin/x86_64/ccdsMkTables -loadDb ccds ${db} ${ncbiBld} ccdsInfo ccdsGene # ccdsKgMap /cluster/data/genbank/bin/x86_64/mkCcdsGeneMap -db=${db} -loadDb ccdsGene knownGene ccdsKgMap checkTableCoords ${db} -verbose=2 ccdsGene # update all.jointer to include ${db} in ccdsDb joinerCheck -database=${db} -identifier=ccdsGeneId ~/compbio/genbank/kent/src/hg/makeDb/schema/all.joiner # request push of ccdsGene ccdsInfo ccdsKgMap # << emacs ############################################################################ # update vega genes to version 31 (v49 of Ensembl genes) # (DONE - 2008-05-15 - Hiram) mkdir /cluster/data/hg18/bed/vega31_49 cd /cluster/data/hg18/bed/vega31_49 wget --timestamping \ "ftp://ftp.sanger.ac.uk/pub/vega/human/gtf_file.gz" wget --timestamping \ "ftp://ftp.sanger.ac.uk/pub/vega/human/CHANGELOG.gz" wget --timestamping \ "ftp://ftp.sanger.ac.uk/pub/vega/human/catalog.txt" wget --timestamping \ "ftp://ftp.sanger.ac.uk/pub/vega/human/pep/Homo_sapiens.VEGA.apr.pep.tot.fa.gz" # processing similar to the same processing for Ensembl genes, # from /cluster/data/hg18/bed/ensGene.49/process/doProcess.csh zcat gtf_file.gz \ | sed -e "s/^\([0-9XY][0-9]*\)/chr\1/; s/^MT/chrM/; s/^Un/chrUn/" \ | liftUp -type=.gtf stdout \ /cluster/data/hg18/jkStuff/ensGene.haplotype.lift carry stdin \ | gzip > allGenes.gtf.gz gtfToGenePred -infoOut=infoOut.txt -genePredExt allGenes.gtf.gz stdout \ | gzip > hg18.allGenes.gp.gz /cluster/home/hiram/kent/src/hg/utils/automation/extractGtf.pl \ infoOut.txt > ensGtp.tab genePredCheck -db=hg18 hg18.allGenes.gp.gz # checked: 62418 failed: 0 zcat allGenes.gtf.gz | grep -i pseudo > 
pseudo.gtf zcat allGenes.gtf.gz | grep -v -i pseudo > not.pseudo.gtf gtfToGenePred -genePredExt pseudo.gtf pseudo.gp gtfToGenePred -genePredExt not.pseudo.gtf not.pseudo.gp genePredCheck -db=hg18 pseudo.gp # checked: 5747 failed: 0 genePredCheck -db=hg18 not.pseudo.gp # checked: 56671 failed: 0 hgLoadGenePred -genePredExt hg18 vegaGene not.pseudo.gp hgLoadGenePred -genePredExt hg18 vegaPseudoGene pseudo.gp ############################################################################ # DGV V10 (DATABASE OF GENOMIC VARIANTS) (DONE 11/10/10 angie - color change 2/22/11 #2917) # DGV V9 done 3/26/10 # DGV V8 done 8/12/09 (changed color of inverted 11/05/09 kuhn) # DGV V7 done 3/11/09 # DGV V6 thin regions dropped 2/23/09 # DGV V6 with useless thin regions done 11/12/08 # DGV V5 done 7/16/08 # DGV V4 done 5/9/08 # 11-04-2009 color change from brown to magenta: # old color # 6553700 Inversion (100,0,100) # new: # 13107400 Inversion (200,0,200) # 2/22/11 color change (Bug #2917): swap blue and red; green -> brown # Old DGV format is obsolete; see the following section. ####################################################################### # DGV BETA (DATABASE OF GENOMIC VARIANTS) (DONE 2/11/13 angie) # DGV has changed their data format, and for the time being the data are # served by a beta web site, http://dgvbeta.tcag.ca/ ; in time that will # replace their current site. set today = `date +%y%m%d` mkdir -p /hive/data/genomes/hg18/bed/dgv/$today cd /hive/data/genomes/hg18/bed/dgv/$today wget http://dgvbeta.tcag.ca/dgv/docs/NCBI36_hg18_2012-11-23.txt head -1 NCBI36_hg18*.txt #variantaccession chr start end varianttype variantsubtype reference pubmedid method platform mergeid mergedorsample frequency samplesize cohortdescription genes # It's more complicated than Gain/Loss/Complex or Inversion now (+ stray commas): cut -f 5,6 NCBI36_hg18*.txt | sort | uniq -c | head -100 # 20156 CNV # 1304 CNV "" # 27098 CNV CNV # 2988 CNV Complex # 187319 CNV Deletion # 17673 CNV Duplication # 123436 CNV Gain # 4170 CNV Gain+Loss # 27382 CNV Insertion # 479784 CNV Loss # 280 OTHER # 31 OTHER "" # 44 OTHER Complex # 2519 OTHER Inversion # 663 OTHER Tandem duplication # 1 varianttype variantsubtype # shuffle fields into bed9+ w/itemRgb set purple = "200,0,200" set red = "200,0,0" set blue = "0,0,200" set brown = "139,69,19" tail -n +2 NCBI36_hg18*.txt \ | perl -wpe 'chomp; \ s/""//; \ ($id, $chr, $start, $end, $varType, $varSubType, $ref, $pmid, $method, $platform, \ undef, undef, undef, $sampleSize, $sampleDesc, $genes) = split("\t"); \ $start-- unless ($start == 0); \ $landmark = $genes; \ $landmark =~ s/,/, /g; \ $varSubType =~ s/^,//; $varSubType =~ s/,$//; \ $varTypeOut = "$varType ($varSubType)"; \ $ref =~ s/_/ /g; \ $method =~ s/_/ /g; $method =~ s/,/, /g; \ $sample = $sampleDesc; \ $sample .= " (sample size: $sampleSize)" if ($sampleSize); \ $method .= " ($platform)" if ($platform && $platform ne "Not Provided"); \ $rgb = "0,0,0"; \ if ($varType eq "CNV") { \ if ($varSubType eq "Gain" || $varSubType eq "Insertion" || $varSubType eq "Duplication") {\ $rgb = "'$blue'"; \ } elsif ($varSubType eq "Loss" ||$varSubType eq "Deletion") { \ $rgb = "'$red'"; \ } elsif ($varSubType eq "") { \ $varTypeOut = $varType; \ } else { \ $rgb = "'$brown'"; \ } \ } elsif ($varType eq "OTHER") { \ if ($varSubType eq "Inversion") { \ $rgb = "'$purple'"; \ } elsif ($varSubType eq "Tandem Duplication") { \ $rgb = "'$blue'"; \ } else { \ $varTypeOut = $varType; \ } \ } \ $_ = join("\t", "chr$chr", $start, $end, $id, 0, "+", \ $start, 
$start, $rgb, $landmark, $varTypeOut, \ $ref, $pmid, $method, $sample) . "\n";' \ > dgv.bed hgLoadBed hg18 dgv dgv.bed \ -sqlTable=$HOME/kent/src/hg/lib/dgv.sql -renameSqlTable -tab #Read 894847 elements of size 15 from dgv.bed ############################################################################ # AGILENT CGH PROBES (AND MM8, RN4) (Done 2008-05-13, Andy) ssh hgwdev bash cd /cluster/data/hg18/bed mkdir agilentProbes cd agilentProbes/ cp /usr/local/apache/htdocs/donna/Agilent/Agilent_Human_CGH.zip . # (agilent-provided zips) # what a pain... this zipfile isn't unzippable using linux unzip. # Bob's windows machine didn't do it either. Finally got it using the # mac in Erich and Victoria's office. Extracting creates a directory # called "Agilent_Human_CGH Folder" cp Agilent_Human_CGH\ Folder/* . rmdir Agilent_Human_CGH\ Folder/ tail +3 014693_D_UCSCTrack_20070820.txt | bedSort stdin stdout > agilent244a.bed tail +3 014698_D_UCSCTrack_20070820.txt | bedSort stdin stdout > agilent105a.bed tail +3 014950_D_UCSCTrack_20070820.txt | bedSort stdin stdout > agilent44k.bed for bed in *.bed; do hgLoadBed hg18 ${bed%.bed}{,.bed}; done cd /cluster/data/mm8/bed mkdir agilentCgh cd agilentCgh/ cp /usr/local/apache/htdocs/donna/Agilent/Agilent_Mouse_CGH.zip . # (same crap as before with the zip file) cp Agilent_Mouse_CGH\ Folder/* . rmdir Agilent_Mouse_CGH\ Folder/ tail +3 014695_D_UCSCTrack_20070820.txt | bedSort stdin stdout > agilentCgh244a.bed tail +3 014699_D_UCSCTrack_20070820.txt | bedSort stdin stdout > agilentCgh105a.bed tail +3 015028_D_UCSCTrack_20070820.txt | bedSort stdin stdout > agilentCgh44k.bed for bed in *.bed; do hgLoadBed mm8 ${bed%.bed}{,.bed}; done cd /cluster/data/rn4/bed mkdir agilentCgh cd agilentCgh/ cp /usr/local/apache/htdocs/donna/Agilent/Agilent_Rat_CGH.zip . # (yep, again) cp Agilent_Rat_CGH\ Folder/* . 
rmdir Agilent_Rat_CGH\ Folder/ tail +3 015223_D_UCSCTrack_20070820.txt | bedSort stdin stdout > agilentCgh244a.bed tail +3 015235_D_UCSCTrack_20070820.txt | bedSort stdin stdout > agilentCgh105a.bed for bed in *.bed; do hgLoadBed rn4 ${bed%.bed}{,.bed}; done ############################################################################ # AGILENT HUMAN SUREPRINT G3 ARRAY PROBESETS (DONE 2008-12-09, Andy) ssh hgwdev cd /hive/data/hg18/bed/agilentProbes wget --timestamping --user=microarray --password= \ "ftp://ftp.agilent.com/restricted/UCSC_BED_FILES/*" zcat 021365_D_UCSCTrack_20081204.txt.gz | tail +3 | hgLoadBed hg18 agilentCnv2x400k stdin zcat 021529_D_UCSCTrack_20081204.txt.gz | tail +3 | hgLoadBed hg18 agilentCgh1x1m stdin zcat 021850_D_UCSCTrack_20081204.txt.gz | tail +3 | hgLoadBed hg18 agilentCgh2x400k stdin zcat 021924_D_UCSCTrack_20081204.txt.gz | tail +3 | hgLoadBed hg18 agilentCgh8x60k stdin zcat 022060_D_UCSCTrack_20081204.txt.gz | tail +3 | hgLoadBed hg18 agilentCgh4x180k stdin ############################################################################ # TWO MORE AGILENT HUMAN ARRAYS (DONE, 2009-07-28 Andy) ssh hgwdev cd /hive/data/hg18/bed/agilentProbes wget --timestamping --user=microarray --password= \ "ftp://ftp.agilent.com/restricted/UCSC_BED_FILES/*" tail -n +3 022837_D_UCSCTrack_20090331.txt | hgLoadBed hg18 agilentCnv2x105k stdin tail -n +3 023642_D_BED_20090528.bed | \ awk 'BEGIN{FS="\t";OFS="\t"}{print $0, "1000", "+";}' | \ hgLoadBed hg18 agilentHdd1x1m stdin ############################################################################ # TRANSMAP vertebrate.2008-05-20 build (2008-05-24 markd) vertebrate-wide transMap alignments were built Tracks are created and loaded by a single Makefile. This is available from: svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2008-05-20 see doc/builds.txt for specific details. ############################################################################ ############################################################################ # ILLUMINA WG-6 PROBES (2008-06-13 Andy) # Download the Platform file from GEO here: # http://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GPL6884 # Click on "Download full table" ssh hgwdev bash cd /san/sanVol1/scratch/andy mkdir illumina cd illumina/ cp ~/GPL6884-5803.txt . 
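# Optional check (not in the original log): show the first line kept by the
# "tail +31" commands below, to confirm the column layout they assume
# (field 1 = probe ID, field 11 = GI of the source RNA, field 18 = probe sequence):
#   head -31 GPL6884-5803.txt | tail -1 | cut -f1,11,18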
# Collect GIs for all the RNAs # First download/install Biopython wget http://biopython.org/DIST/biopython-1.45.tar.gz tar xfz biopython-1.45.tar.gz mkdir biopythonLibs cd biopython-1.45/ python setup.py install --home=/san/sanVol1/scratch/andy/illumina/biopythonLibs export PYTHONPATH=/san/sanVol1/scratch/andy/illumina/biopythonLibs # Now get the RNAs mkdir getRna grabbed cd getRna/ tail +31 ../GPL6884-5803.txt | cut -f11 | sort | uniq > gis.txt wc -l gis.txt # 43338 gis.txt split -d -l 100 -a 3 gis.txt gis- rm gis.txt cat < "EOF" > getSeqs.py import Bio from Bio import EUtils from Bio.EUtils import HistoryClient gis = open('gis.txt', 'r').readlines() for i in range(len(gis)): gis[i] = gis[i].rstrip('\n') ids = EUtils.DBIds('nucleotide', gis) client = HistoryClient.HistoryClient() result = client.post(ids) print result.efetch(retmode="text", rettype="fasta").read() EOF # << emacs cat < "EOF" > getSeqs.sh #!/bin/bash for gi in gis-*; do numGot="0"; attempt="1"; while [ $numGot -lt 100 ]; do echo Getting $gi attempt $attempt; cp $gi gis.txt; fa=${gi}.fa python getSeqs.py > $fa numGot=`grep '>' $fa | wc -l`; if [ $numGot = 100 ]; then echo Got all for $gi mv $fa ../grabbed/; rm $gi else rm $fa; sleep 10; fi attempt=$((attempt+1)); done sleep 5; done EOF # << emacs chmod +x getSeqs.sh ./getSeqs.sh # there's a fair bit that retries the download over and over but eventually it # gets to the last one, which doesn't have 100 lines, so I run the python # program on that on by itself. cat ../grabbed/* > probeRna.fa rm -rf ../grabbed/ cd ../ # Now blat RNA to genome mkdir -p blatRna/{splits,out} cd blatRna/ faSplit sequence ../getRna/probeRNA.fa 400 splits/rna- ls -1 splits/* > splits.lst cat < "EOF" > runBlat.sh #!/bin/bash cd -P . fa=`basename $1` chr=`basename $2 .nib` split=`basename $1 .fa` out=${split}.${chr}.psl nibDir=/scratch/hg/hg18/bothMaskedNibs tmpDir=/scratch/tmp/$out mkdir $tmpDir pushd $tmpDir oldDir=`dirs +1` cp ${oldDir}/$1 . blat -noHead -ooc=/scratch/hg/hg18/11.ooc -out=psl ${nibDir}/$2 $fa $out mkdir -p ${oldDir}/out/${chr} cp $out ${oldDir}/out/${chr}/ popd rm -rf $tmpDir EOF # << emacs chmod +x runblat.sh cat < "EOF" > gsub #LOOP ./runBlat.sh {check in line+ $(path1)} $(path2) {check out exists out/$(root2)/$(root1).$(root2).psl} #ENDLOOP EOF # << emacs ls -1 /cluster/data/hg18/nib > nib.lst ssh pk cd /san/sanVol1/scratch/andy/illumina/blatRna gensub2 splits.lst nib.lst gsub spec para create spec para try para push para time #17820 jobs in batch #34457 jobs (including everybody's) in Parasol queue. #Checking finished jobs #Completed: 17820 of 17820 jobs #CPU time in finished jobs: 84196s 1403.26m 23.39h 0.97d 0.003 y #IO & Wait Time: 48448s 807.47m 13.46h 0.56d 0.002 y #Average job time: 7s 0.12m 0.00h 0.00d #Longest running job: 0s 0.00m 0.00h 0.00d #Longest finished job: 270s 4.50m 0.07h 0.00d #Submission to last job: 1515s 25.25m 0.42h 0.02d exit; # back to hgwdev mkdir /tmp/andy pslSort -nohead dirs allSorted.psl /tmp/andy out/* rmdir /tmp/andy pslReps -singleHit allSorted.psl single.ps{l,r} # Blat probes against the RNAs cd ../ mkdir -p blatProbes/out cd blatProbes/ ln -s ../blatRna/splits . ln -s ../blatRna/splits.lst . ln -s ../blatRna/single.psl . 
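# The per-chunk script below (invoked as ./probeBlat.sh by the gsub template
# that follows it) places each probe on the genome in two steps: blat the
# probe against its own RNA, then project that alignment through the
# RNA-to-genome alignment (single.psl) with pslMap.  In outline, for a
# single probe/RNA pair:
#   blat -noHead rna.fa probe.fa probeOnRna.psl
#   pslMap probeOnRna.psl rnaOnGenome.psl probeOnGenome.psl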
tail +31 ../GPL6884-5803.txt | cut -f1,11,18 | \ awk '{printf("%s\tgi|%s\t%s\n", $1, $2, $3);}' > probes.tab cat << "EOF" > #!/bin/bash faFile=`basename $1`; pslFile=${faFile%.fa}.psl probeFile=$2; rnaOnGenomePsl=$3; tmpDir=/scratch/andy/`date +"%T" | tr ':' '_'`.$$ mkdir -p $tmpDir cp $1 $2 $3 $tmpDir pushd $tmpDir for id in `grep '>' $faFile | sed 's/^>//'`; do # make probe fa echo $id awk '{if ($2 == "'"$id"'") printf(">%s\n%s\n", $1, $3);}' $probeFile \ > probe.fa # extract single RNA fa faOneRecord $faFile $id > rna.fa blat -noHead rna.fa probe.fa probeOnRna.psl awk 'BEGIN{FS="\t";OFS="\t";}{if ($10 == "'"$id"'") print;}' \ $rnaOnGenomePsl > rnaOnGenome.psl if [ `find . -size '0b' -type f | wc -l` == 0 ]; then pslMap probeOnRna.psl rnaOnGenome.psl probeOnGenome.psl cat probeOnGenome.psl >> $pslFile fi done popd cp $tmpDir/$pslFile $4 rm -rf $tmpDir EOF # << emacs cat << "EOF" > gsub #LOOP ./probeBlat.sh {check in line+ $(path1)} probes.tab single.psl {check out exists out/$(root1).psl} #ENDLOOP EOF # << emacs ssh pk cd /san/sanVol1/scratch/andy/illumina/blatProbes gensub2 splits.lst single gsub spec para create spec para try para push para time #396 jobs in batch #41977 jobs (including everybody's) in Parasol queue. #Checking finished jobs #Completed: 396 of 396 jobs #CPU time in finished jobs: 11101s 185.02m 3.08h 0.13d 0.000 y #IO & Wait Time: 1361s 22.68m 0.38h 0.02d 0.000 y #Average job time: 31s 0.52m 0.01h 0.00d #Longest running job: 0s 0.00m 0.00h 0.00d #Longest finished job: 121s 2.02m 0.03h 0.00d #Submission to last job: 271s 4.52m 0.08h 0.00d exit # back to hgwdev mkdir /tmp/andy pslSort -nohead dirs sorted.psl /tmp/andy out # Load stuff up pslToBed sorted.psl sorted.bed cd ../ mkdir tables cd tables/ cp ../blatProbes/sorted.{psl,bed} . hgLoadPsl -table=illuminaProbesAlign hg18 sorted.psl hgLoadBed hg18 illuminaProbes sorted.bed cat << "EOF" > CREATE TABLE illuminaProbesSeq ( id varchar(40) NOT NULL, seq varchar(55) NOT NULL, PRIMARY KEY (id) ) TYPE=MyISAM; EOF # << emacs cut -f1,3 ../blatProbes/probes.tab > illuminaProbesSeq.tab hgLoadSqlTab hg18 illuminaProbesSeq{,.sql,.tab} ############################################################################ # dbSNP BUILD 129 (DONE 6/24/08 angie) # 8/6/08: Regenerated snp129.sql with only those enum/set values that are # actually used (except always keep unknown, the default) and reloaded snp129. # No data change -- just the sql field definitions for enums and sets. # 8/7/08: Swapped molType values cDNA <--> genomic in snp129 because they # were swapped in the fasta headers. # QA NOTE: used sudo mytouch to change timestamps on all downstream snp129 # tables (snp129Exceptions, snp129ExceptionDesc, snp129OrthoPt2Pa2Rm2, # snp129Seq) to .2008-08-08 00:00:00 to avoid unwarranted joinerCheck # time discrepancy errors. (8/8/08, brooke) # Set up build directory mkdir -p /cluster/store3/dbSNP129/{human,shared} ln -s /cluster/store3/dbSNP129 /cluster/data/dbSNP/129 # Get field encodings -- if there are changes or additions to the # encoding of the corresponding fields, you might need to update # snpNcbiToUcsc, hgTracks, hgc and hgTrackUi (see also # hg/lib/snp125Ui.c). 
cd /cluster/data/dbSNP/129/shared alias wg wget --timestamping set ftpSnpDb = ftp://ftp.ncbi.nih.gov/snp/organisms/human_9606/database wg $ftpSnpDb/shared_data/LocTypeCode.bcp.gz wg $ftpSnpDb/shared_data/SnpClassCode.bcp.gz wg $ftpSnpDb/shared_data/SnpFunctionCode.bcp.gz wg $ftpSnpDb/shared_data/SnpValidationCode.bcp.gz # Here is another source -- it is not as up-to-date as the above, but # our encodings (enums and sets in snp129.sql) are named more similar # to those in the 2005 ASN: # ftp://ftp.ncbi.nih.gov/snp/specs/docsum_2005.asn ########################## DOWNLOAD ############################# cd /cluster/data/dbSNP/129/human mkdir data schema rs_fasta # Get data from NCBI (anonymous FTP) wg ftp://ftp.ncbi.nih.gov/snp/00readme.txt cd /cluster/data/dbSNP/129/human/data # ContigLoc table has coords, orientation, loc_type, and refNCBI allele wg $ftpSnpDb/organism_data/b129_SNPContigLoc_36_3.bcp.gz wg $ftpSnpDb/organism_data/b129_SNPContigLocusId_36_3.bcp.gz wg $ftpSnpDb/organism_data/b129_ContigInfo_36_3.bcp.gz # MapInfo has alignment weights wg $ftpSnpDb/organism_data/b129_SNPMapInfo_36_3.bcp.gz # SNP has univar_id, validation status and heterozygosity wg $ftpSnpDb/organism_data/SNP.bcp.gz # Get schema cd /cluster/data/dbSNP/129/human/schema wg $ftpSnpDb/organism_schema/human_9606_table.sql.gz # Get fasta files # using headers of fasta files for molType, class, observed cd /cluster/data/dbSNP/129/human/rs_fasta wg ftp://ftp.ncbi.nih.gov/snp/organisms/human_9606/rs_fasta/\*.gz ########################## LOAD NCBI TABLES ############################# # Simplify names of data files -- strip version & extras to get # local canonical table names. cd /cluster/data/dbSNP/129/human/data foreach f (*.bcp.gz) set new = `echo $f \ | sed -e 's/^b129_SNP//; s/^b129_//; s/_36_3//; s/.bcp//;'` mv $f $new echo $new end # Extract just the tables that we need from the NCBI msSQL table # creation file, and get CREATE statements from # human_9606_table.sql for our 5 tables cd /cluster/data/dbSNP/129/human/schema zcat human_9606_table.sql.gz \ | perl -we '$/ = "\nGO\n\n\n"; \ while (<>) { \ next unless /^CREATE TABLE \[(b129_(SNP)?)?(ContigInfo|ContigLoc|ContigLocusId|MapInfo|SNP)(_36_3)?\]/; \ s/b129_(SNP)?//; s/_36_3//; \ s/[\[\]]//g; s/GO\n\n/;/; s/smalldatetime/datetime/g; \ s/ON PRIMARY//g; s/COLLATE//g; s/Latin1_General_BIN//g; \ s/IDENTITY (1, 1) NOT NULL /NOT NULL AUTO_INCREMENT, PRIMARY KEY (id)/g; \ s/nvarchar/varchar/g; s/set quoted/--set quoted/g; \ s/(image|varchar\s+\(\d+\))/BLOB/g; \ print; \ }' \ > table.sql # load on kolossus or a small cluster machine (mysql5 is OK for this; # in fact it's better than 4 because it has 'show warnings'). 
ssh kkr3u00 hgsql '' -e 'create database hg18snp129' cd /cluster/data/dbSNP/129/human/schema hgsql hg18snp129 < table.sql cd ../data # Avoid wasting space by excluding mappings to non-reference contigs: foreach t (ContigInfo MapInfo) zcat $t.gz \ | egrep -vw '(Celera|HuRef|CRA_TCAGchr7v2)' \ | perl -wpe 's/(\d\d:\d\d:\d\d)\.0/$1/g;' \ | hgLoadSqlTab -oldTable hg18snp129 $t placeholder stdin end # Compare contig list between our ctgPos and reference contigs in # ContigInfo: ssh hgwdev-10 hgsql hg18 -N -B -e '"select contig from ctgPos;"' \ | sort > /tmp/1 hgsql hg18snp129 -NBe 'select distinct(group_label) from ContigInfo' # --> reference, c5_H2, c6_COX, c6_QBL, c22_H2, DR53 # (HuRef, Celera, CRA_TCAGchr7v2 grepped out above) hgsql hg18snp129 -N -B -e 'select contig_acc from ContigInfo \ where group_label in \ ("reference", "c5_H2", "c6_COX", "c6_QBL", "c22_H2");' | sort > /tmp/2 diff /tmp/1 /tmp/2 # No diff. # Make sure there are no orient != 0 contigs among those selected. hgsql hg18snp129 -NBe \ 'select count(*) from ContigInfo where orient != 0 and \ group_label in ("reference", "c5_H2", "c6_COX", "c6_QBL", "c22_H2");' #0 # ContigLoc is huge, and we want just the reference contig mappings. # So, based on the reference & haplo ctg_id values in ContigInfo, # filter to get just the mappings for those contigs: zcat ContigLoc.gz \ | awk '$3 <= 377 || $3 == 7015' \ | perl -wpe 's/(\d\d:\d\d:\d\d)\.0/$1/g;' \ | hgLoadSqlTab -oldTable hg18snp129 ContigLoc placeholder stdin foreach t (ContigLocusId SNP) zcat $t.gz \ | perl -wpe 's/(\d\d:\d\d:\d\d)\.0/$1/g;' \ | hgLoadSqlTab -oldTable hg18snp129 $t placeholder stdin end # There were some warnings (many cleared up by the perl substitution) # but no rows were dropped. 'show warnings' after a manual 'load data' # complains about missing values (OK when e.g. position is not known). foreach t (ContigInfo ContigLoc ContigLocusId MapInfo SNP) echo -n "${t}:\t" hgsql -N -B hg18snp129 -e 'select count(*) from '$t end #ContigInfo: 379 #ContigLoc: 15835019 (before filtering: 46913472) #ContigLocusId: 25496815 #MapInfo: 14845535 (before filtering: 44627804) #SNP: 14708770 #################### EXTRACT INFO FROM NCBI TABLES #################### mkdir -p /scratch/snp/129/human cd /scratch/snp/129/human time hgsql hg18snp129 -e \ 'alter table ContigLoc add index (ctg_id); \ alter table ContigInfo add index (ctg_id);' #0.002u 0.002s 2:14.79 0.0% 0+0k 0+0io 1pf+0w # was ~12m on a run without trimming ContigLoc! time hgsql hg18snp129 -e \ 'alter table ContigInfo add index (group_label(9));' #0.005u 0.000s 0:00.16 0.0% 0+0k 0+0io 1pf+0w # For joining files by shared column, we need a unique identifier in # that shared column. snp_id is not unique -- the same rsID can appear # in both the reference assembly and on one of the others e.g. c6_COX. # So concatenate the assembly identifier and snp_id to get hopefully # unique label. time hgsql hg18snp129 -NBe \ 'select concat(ContigInfo.group_label, ".", snp_id), \ ContigInfo.contig_acc, asn_from, asn_to, \ loc_type, orientation, allele, phys_pos_from \ from ContigLoc, ContigInfo \ where ContigLoc.ctg_id = ContigInfo.ctg_id and ContigInfo.group_label \ in ("reference", "c5_H2", "c6_COX", "c6_QBL", "c22_H2");' \ | sort \ > ucscContigLoc.txt # no time output because of the pipe... took 5 minutes. # Are these IDs unique? wc -l ucscContigLoc.txt #15835019 ucscContigLoc.txt awk '{print $1;}' ucscContigLoc.txt | uniq | wc -l #14791529 # Nope. 
Find non-unique IDs: awk 'prev == $1 {print;} {prev = $1;}' ucscContigLoc.txt | head grep ^c5_H2.10035195 ucscContigLoc.txt #c5_H2.10035195 NT_113801 639954 639954 2 0 G 69605321 #c5_H2.10035195 NT_113801 660407 660407 2 0 G 69625774 #c5_H2.10035195 NT_113801 911780 911780 2 1 C 69877147 #c5_H2.10035195 NT_113801 933061 933061 2 1 C 69898428 # OK, they can be duplicated within the same contig. See if we can # get by with anchoring everything to ucscContigLoc.txt. But everybody # else better have unique IDs! # SNP -> valid, avHet, avHetSE # SNP has only snp_id as identifier, nothing relating to assembly. hgsql hg18snp129 -NBe \ 'select snp_id, validation_status, avg_heterozygosity, het_se \ from SNP;' \ | sort \ > ucscSNP.txt # Check ID uniqueness: wc -l ucscSNP.txt #14708770 ucscSNP.txt awk '{print $1;}' ucscSNP.txt | uniq | wc -l #14708770 # ContigLocusId -> func # ContigLocusId has only snp_id as an identifier (it gives one # example contig if the SNP is on multiple contigs). # The sort options and awk are to convert multiple entries with different # function classes for the same SNP into one entry per SNP with a list # of function classes. hgsql hg18snp129 -NBe \ 'select snp_id, fxn_class from ContigLocusId;' \ | sort -u -k1,1 -k2,2n \ | awk '{if (prevId == $1) { prevFunc = prevFunc $2 ","; } \ else { if (prevId) {print prevId "\t" prevFunc;} \ prevFunc = $2 ","; }} \ {prevId = $1;} \ END {print prevId "\t" prevFunc;}' \ > ucscFunc.txt # Check ID uniqueness: wc -l ucscFunc.txt #6136008 ucscFunc.txt awk '{print $1;}' ucscFunc.txt | sort -u | wc -l #6136008 # MapInfo -> weight # MapInfo needs assembly+snp_ids in order to have unique IDs. time hgsql hg18snp129 -e \ 'alter table MapInfo add index (assembly(9));' #0.003u 0.003s 3:40.29 0.0% 0+0k 0+0io 1pf+0w hgsql hg18snp129 -NBe \ 'select concat(assembly, ".", snp_id), weight \ from MapInfo where assembly \ in ("reference", "c5_H2", "c6_COX", "c6_QBL", "c22_H2");' \ | sort \ > weight.txt # ~1 minute # Check ID uniqueness: wc -l weight.txt #14791529 weight.txt awk '{print $1;}' weight.txt | uniq | wc -l #14791529 awk '{print $2;}' weight.txt | sort -n | uniq -c # 40910 0 #14326127 1 # 157402 2 # 256608 3 # 10482 10 # SNPs w/weight 0 and 10 will be discarded later. # fasta headers -> observed, molType, class zcat /cluster/data/dbSNP/129/human/rs_fasta/rs_ch*.fas.gz \ | grep '^>gnl' \ | perl -wpe 's/^\S+rs(\d+) .*mol="(\w+)"\|class=(\d+)\|alleles="([^"]+)"\|build.*/$1\t$4\t$2\t$3/ || die "Parse error line $.:\n$_\n\t";' \ | sort \ > ucscGnl.txt # ~5m wc -l ucscGnl.txt #14708630 ucscGnl.txt awk '{print $1;}' ucscGnl.txt | uniq | wc -l #14708630 ############### JOIN NCBI COLUMNS TO GET UCSC SNP COLUMNS ################ # Join files by ID. Start with ContigLoc and MapInfo because they # share the concatenated assembly+snp_id IDs. time join -a 1 -e MISSING -t ' ' ucscContigLoc.txt weight.txt \ > ucscCL+w.txt #28.334u 4.730s 1:43.47 31.9% 0+0k 0+0io 0pf+0w wc -l ucscCL+w.txt #15835019 ucscCL+w.txt # Same as ucscContigLoc.txt above, good. # Any missing weights? grep MISSING ucscCL+w.txt | head # No output, good. # Join the files with SNP-only IDs. time join -e MISSING -t ' ' ucscGnl.txt ucscSNP.txt \ > ucscG+S.txt #17.375u 2.127s 0:47.40 41.1% 0+0k 0+0io 0pf+0w wc -l ucscG+S.txt #14708630 ucscG+S.txt # Same as ucscGnl.txt -- somewhat less than ucscSNP.txt (14708770)... 
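# (Without -a, join keeps only rs IDs present in both inputs, so the 140
# IDs that are in ucscSNP.txt but missing from the fasta headers are
# simply dropped here.)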
grep MISSING ucscG+S.txt | wc -l #0 time join -a 1 -e MISSING -o 1.1,1.2,1.3,1.4,1.5,1.6,1.7,2.2 \ -t ' ' ucscG+S.txt ucscFunc.txt \ > ucscG+S+F.txt #18.612u 2.334s 0:50.30 41.6% 0+0k 0+0io 0pf+0w wc -l ucscG+S+F.txt #14708630 ucscG+S+F.txt grep MISSING ucscG+S+F.txt | wc -l #8572703 # Not surprising -- ucscFunc.txt has only 6136008 lines. expr 14708630 - 6136008 #8572622 # Not an exact match like in 128, but not too far off. # Convert assembly+snp_id's to just snp_id (sorted) for final join. perl -wpe 's/^\S+\.(\d+)/$1/;' ucscCL+w.txt \ | sort > ucscCL+w.snp_id.txt awk '{print $1;}' ucscCL+w.snp_id.txt | uniq | wc -l #14626025 # Interesting... which snp_ids are missing from ContigLoc? # (note: don't use sort -n | comm, it needs alphabetical sort!) awk '{print $1;}' ucscCL+w.snp_id.txt | sort -u > /tmp/1 awk '{print $1;}' ucscGnl.txt | sort -u > /tmp/2 comm -13 /tmp/1 /tmp/2 > notInContigLoc.txt comm -23 /tmp/1 /tmp/2 > notInSNP.txt wc -l notIn*.txt # 83043 notInContigLoc.txt # 438 notInSNP.txt # notInContigLoc could simply mean that they weren't mapped, which is OK. # notInSNP is more concerning. #Not deleted!: 52789237, 55664014, 61749732, #Invalid (not retired): 63751714, 63751902 # -- sent email to snp-admin at ncbi. # Final join -- treat ContigLoc as authoritative (since it has coords). # Arrange columns in same order as in the SNP table, with extras for # checking at the end (phys_pos_from). # chr chrS chrE name strand refN obs molT cls val aH aHSE fxn locT wt ... time join -a 1 -e MISSING -t ' ' \ -o '1.2 1.3 1.4 1.1 1.6 1.7 2.2 2.3 2.4 2.5 2.6 2.7 2.8 1.5 1.9 1.8' \ ucscCL+w.snp_id.txt ucscG+S+F.txt \ > ucscNcbiSnp.ctg.txt #41.204u 6.274s 1:05.99 71.9% 0+0k 0+0io 0pf+0w wc -l ucscNcbiSnp.ctg.txt #15835019 ucscNcbiSnp.ctg.txt grep MISSING ucscNcbiSnp.ctg.txt | awk '{print $4;}' | uniq | wc -l #8495168 # Lift the map contig coordinates to chrom coordinates (~2m); time liftUp ucscNcbiSnp.bed \ /cluster/data/hg18/jkStuff/liftContigs.lft warn \ ucscNcbiSnp.ctg.txt #123.952u 7.587s 2:22.24 92.4% 0+0k 0+0io 5pf+0w wc -l ucscNcbiSnp.bed #15835019 ucscNcbiSnp.bed # At this point, move back from /scratch to /cluster/data. nice gzip ucscNcbiSnp.bed cp -p ucscNcbiSnp.bed.gz /cluster/data/dbSNP/129/human/ cp -p notIn* /cluster/data/dbSNP/129/human/ # Drum roll please... translate NCBI's encoding into UCSC's, and # perform a bunch of checks. This is where developer involvement # is most likely as NCBI extends the encodings used in dbSNP. cd /cluster/data/dbSNP/129/human/ gunzip ucscNcbiSnp.bed.gz # Re-ran this command 8/6/08 to get new snp129.sql that includes # only those enum/set values that are actually used. No other output # files changed. 
time snpNcbiToUcsc ucscNcbiSnp.bed /cluster/data/hg18/hg18.2bit \ snp129 # 8/7/08: added the awk command to unswap the molType values that # were swapped in dbSNP 129 fasta headers: # DO NOT USE THIS COMMAND NEXT TIME UNLESS NECESSARY AGAIN: awk 'BEGIN{OFS="\t";} \ {if ($8 == "genomic") {$8 = "cDNA";} \ else if ($8 == "cDNA") {$8 = "genomic";} \ print;}' ucscNcbiSnp.bed \ | snpNcbiToUcsc stdin /cluster/data/hg18/hg18.2bit snp129 #spaces stripped from observed: #chr12 5963395 5963395 rs41402545 #count of snps with weight 0 = 63507 #count of snps with weight 1 = 14375595 #count of snps with weight 2 = 325745 #count of snps with weight 3 = 924499 #count of snps with weight 10 = 145673 #Skipped 493 snp mappings due to errors -- see snp129Errors.bed #210.328u 10.793s 4:04.99 90.2% 0+0k 0+0io 0pf+0w # More skipped snps than in 128, but same reason: cut -f 5 snp129Errors.bed | sort | uniq -c # 493 Missing observed value (deleted SNP?). cut -f 4 snp129Errors.bed | sort -u | sed -e 's/^rs//' > errIds.txt comm -13 notInSNP.txt errIds.txt | wc -l #0 # So those are a subset of the notInSNP.txt ids, good. wc -l snp* # 15625346 snp129.bed # 22 snp129.sql # 493 snp129Errors.bed # 18 snp129ExceptionDesc.tab # 2673142 snp129Exceptions.bed # Make one big fasta file. # It's a monster: 16G! Can we split by hashing rsId? # NOTE FOR NEXT TIME: do this on the fileserver! zcat rs_fasta/rs_ch*.fas.gz \ | perl -wpe 's/^>gnl\|dbSNP\|(rs\d+) .*/>$1/ || ! /^>/ || die;' \ > snp129.fa # Check for duplicates. grep ^\>rs snp129.fa | sort > /scratch/tmp/seqHeaders wc -l /scratch/tmp/seqHeaders #14708630 /scratch/tmp/seqHeaders uniq /scratch/tmp/seqHeaders | wc -l #14708630 # Use hgLoadSeq to generate .tab output for sequence file offsets, # and keep only the columns that we need: acc and file_offset. # Index it and translate to snpSeq table format. time hgLoadSeq -test placeholder snp129.fa #114.516u 37.585s 3:13.58 78.5% 0+0k 0+0io 6pf+0w cut -f 2,6 seq.tab > snp129Seq.tab rm seq.tab ssh hgwdev # Load up main track tables. cd /cluster/data/dbSNP/129/human # Re-ran this command 8/6/08 to get new snp129.sql that includes # only those enum/set values that are actually used. No data values # changed. Removed -noSort because Brooke had spotted some entries # sorted by chromEnd instead of chromStart. # Re-ran 8/7/08 to pick up corrected molType column in snp129.bed. time nice hgLoadBed -tab -onServer -tmpDir=/data/tmp \ hg18 snp129 -sqlTable=snp129.sql snp129.bed #100.406u 22.673s 9:44.17 21.0% 0+0k 0+0io 0pf+0w sed -e 's/snp125/snp129/' ~/kent/src/hg/lib/snp125Exceptions.sql \ > snp129Exceptions.sql time nice hgLoadBed -tab -onServer -tmpDir=/data/tmp \ hg18 snp129Exceptions -sqlTable=snp129Exceptions.sql \ snp129Exceptions.bed #13.125u 1.383s 1:15.39 19.2% 0+0k 0+0io 0pf+0w sed -e 's/snp125/snp129/' ~/kent/src/hg/lib/snp125ExceptionDesc.sql \ > snp129ExceptionDesc.sql hgLoadSqlTab hg18 snp129ExceptionDesc snp129ExceptionDesc.sql \ snp129ExceptionDesc.tab # Load up sequences. sed -e 's/snpSeq/snp129Seq/' ~/kent/src/hg/lib/snpSeq.sql \ > snp129Seq.sql mkdir -p /gbdb/hg18/snp ln -s /cluster/data/dbSNP/129/human/snp129.fa /gbdb/hg18/snp/snp129.fa time nice hgLoadSqlTab hg18 snp129Seq snp129Seq.sql snp129Seq.tab #0.007u 0.006s 3:06.83 0.0% 0+0k 0+0io 0pf+0w # Put in a link where one would expect to find the track build dir... 
ln -s /cluster/data/dbSNP/129/human /cluster/data/hg18/bed/snp129 # Look at the breakdown of exception categories: ssh kkr3u00 cd /cluster/data/dbSNP/129/human cut -f 5 snp129Exceptions.bed | sort | uniq -c | sort -nr #1580567 MultipleAlignments # 628933 ObservedMismatch # 387233 SingleClassLongerSpan # 31425 SingleClassTriAllelic # 13247 ObservedTooLong # 11095 FlankMismatchGenomeShorter # 10365 SingleClassZeroSpan # 3345 SingleClassQuadAllelic # 3310 FlankMismatchGenomeLonger # 1397 DuplicateObserved # 1250 MixedObserved # 547 NamedDeletionZeroSpan # 296 FlankMismatchGenomeEqual # 93 ObservedContainsIupac # 35 NamedInsertionNonzeroSpan # 3 RefAlleleMismatch # 1 ObservedWrongFormat ####################################################################### # SNPMASKED SEQUENCE FOR SNP129 (DONE 7/1/08 angie) ssh kolossus mkdir /cluster/data/hg18/snp129Mask cd /cluster/data/hg18/snp129Mask # Identify rsIds with various problems -- we will exclude those. # MultipleAlignments is kinda broad because anything that maps on # both chrN and chrN_foo_hap1 will be excluded... similarly, extra # matches on chrN_random might disqualify good matches on chrN. # Well, erring on the side of caution is good. awk '$5 ~ /^MultipleAlignments|ObservedTooLong|ObservedWrongFormat|ObservedMismatch|MixedObserved$/ {print $4;}' \ /cluster/data/dbSNP/129/human/snp129Exceptions.bed \ | sort -u \ > snp129ExcludeRsIds.txt time grep -vFwf snp129ExcludeRsIds.txt \ /cluster/data/dbSNP/129/human/snp129.bed \ > snp129Cleaned.bed #154.384u 12.550s 3:09.01 88.3% 0+0k 0+0io 0pf+0w # Substitutions: mkdir substitutions snpMaskSingle snp129Cleaned.bed /cluster/data/hg18/hg18.2bit stdout \ | faSplit byname stdin substitutions/ # Also this warning about total size -- just means that some chroms # didn't have any SNPS that survived the stringent filtering. #-- 113 warnings about differing observed at same base positions #-- (113 distinct positions). saved as diffObserved.txt. #-- Spot-checking, I see a case (chr1|1476801|1476802) where two SNPs #-- should have been merged -- their flanking sequences were just from #-- diff. strands. In another case (chr9|10122961|10122962), one of #-- the mappings looks like an insertion instead of a substitution but #-- the SNP's class is single, and one genomic base is mapped. #-- IMO not serious to bother dbSNP about, they want to get on w/130. #Masked 10637395 snps in 10637306 out of 3091528550 genomic bases #/cluster/data/hg18/hg18.2bit has 3107677273 total bases, but the total number of bases in sequences for which we masked snps is 3091528550 (difference is 16148723) # Make sure that sizes are identical, first diffs are normal -> IUPAC, # and first diffs' case is preserved: foreach f (substitutions/chr*.fa) faCmp -softMask $f ../[1-9MXY]*/$f:t |& grep -v "that differ" end #(output OK) foreach f (substitutions/chr*.fa) echo $f:t:r mv $f $f:r.subst.fa gzip $f:r.subst.fa end # Insertions: mkdir insertions snpMaskAddInsertions snp129Cleaned.bed /cluster/data/hg18/hg18.2bit stdout \ | faSplit byname stdin insertions/ #Added 1617522 snps totaling 3251578 bases to 3085167749 genomic bases #/cluster/data/hg18/hg18.2bit has 3107677273 total bases, but the total number of bases in sequences for which we masked snps is 3085167749 (difference is 22509524) # Again, that just means that some chroms didn't have filtered SNPs. 
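# (The ../[1-9MXY]*/ glob in these faCmp checks appears to pick up the
# original per-chromosome fasta files in the /cluster/data/hg18/<chrom>/
# build directories, so each masked file is compared against the unmasked
# chromosome it was derived from.)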
# Make sure that all sizes have increased relative to original: foreach f (insertions/chr*.fa) faCmp -softMask $f ../[1-9MXY]*/$f:t |& grep -v "that differ" \ |& perl -we '$_=<>; \ if (/^\w+ in \S+ has (\d+) bases. \w+ in \S+ has (\d+) bases/) { \ if ($1 > $2) {print "OK: ins size $1 > $2\n";} \ else {die "ERROR: ins size $1 <= $2\n";} \ } else {die $_;}' end #(output OK) foreach f (insertions/chr*.fa) mv $f $f:r.ins.fa gzip $f:r.ins.fa end # Deletions: mkdir deletions snpMaskCutDeletions snp129Cleaned.bed /cluster/data/hg18/hg18.2bit stdout \ | faSplit byname stdin deletions/ #Cut 1046324 snps totaling 2173708 bases from 3085167749 genomic bases #/cluster/data/hg18/hg18.2bit has 3107677273 total bases, but the total number of bases in sequences for which we masked snps is 3085167749 (difference is 22509524) # Again, that just means that some chroms didn't have filtered SNPs. # Make sure that all sizes have decreased relative to original: foreach f (deletions/chr*.fa) faCmp -softMask $f ../[1-9MXY]*/$f:t |& grep -v "that differ" \ |& perl -we '$_=<>; \ if (/^\w+ in \S+ has (\d+) bases. \w+ in \S+ has (\d+) bases/) { \ if ($1 < $2) {print "OK: del size $1 < $2\n";} \ else {die "ERROR: del size $1 >= $2\n";} \ } else {die $_;}' end #(output OK) foreach f (deletions/chr*.fa) mv $f $f:r.del.fa gzip $f:r.del.fa end # Clean up and prepare for download: gzip snp129Cleaned.bed foreach d (substitutions insertions deletions) pushd $d md5sum *.gz > md5sum.txt popd end # Make a README.txt in each subdir. # Create download links on hgwdev. # NOTE: Currently we offer only the substitutions. # If we get any user requests, then maybe we can put the insertions # and deletions out there. ssh hgwdev mkdir /usr/local/apache/htdocs/goldenPath/hg18/snp129Mask ln -s /cluster/data/hg18/snp129Mask/substitutions/* \ /usr/local/apache/htdocs/goldenPath/hg18/snp129Mask/ ## If there is user demand for ins & del, then start over with an empty ## goldenPath/snp129Mask and do this: ## foreach type (substitutions insertions deletions) ## mkdir /usr/local/apache/htdocs/goldenPath/hg18/snp129Mask/$type ## ln -s /cluster/data/hg18/snp129Mask/$type/* \ ## /usr/local/apache/htdocs/goldenPath/hg18/snp129Mask/$type/ ## end ####################################################################### # ORTHOLOGOUS ALLELES IN CHIMP AND MACAQUE FOR SNP129 (DONE 7/2/08 angie) ssh kolossus mkdir /cluster/data/hg18/bed/snp129Ortho cd /cluster/data/hg18/bed/snp129Ortho # Following Heather's lead in snp126orthos, filter SNPs to to keep # only those with class=single, length=1, chrom!~random; # Exclude those with exceptions MultipleAlignments, # SingleClassTriAllelic or SingleClassQuadAllelic. # Unlike snp masking, we do not filter for weight -- don't know why. awk '$5 ~ /^MultipleAlignments|SingleClassTriAllelic|SingleClassQuadAllelic/ {print $4;}' \ /cluster/data/dbSNP/129/human/snp129Exceptions.bed \ | sort -u \ > snp129ExcludeIds.txt awk '$3-$2 == 1 && $1 !~ /_random/ && $11 == "single" {print;}' \ /cluster/data/dbSNP/129/human/snp129.bed \ | grep -vFwf snp129ExcludeIds.txt \ > snp129Simple.bed # took ~3 minutes wc -l snp129Simple.bed #10633840 snp129Simple.bed # Glom all human info that we need for the final table onto the # name, to sneak it through liftOver: rsId|chr|start|end|obs|ref|strand awk 'BEGIN{OFS="\t";} \ {print $1, $2, $3, \ $4 "|" $1 "|" $2 "|" $3 "|" $9 "|" $8 "|" $6, \ 0, $6;}' \ snp129Simple.bed > snp129ForLiftOver.bed # Map coords to chimp using liftOver. # I don't know why chimp took so much longer than macaque... 
the # chimp .over has fewer chains and fewer bytes than the macaque .over. mkdir run.liftOChimp cd run.liftOChimp mkdir split out splitFile ../snp129ForLiftOver.bed 25000 split/chunk cp /dev/null jobList foreach f (split/chunk*) echo liftOver $f \ /cluster/data/hg18/bed/liftOver/hg18ToPanTro2.over.chain.gz \ \{check out exists out/panTro2.$f:t.bed\} out/hg18.$f:t.unmapped \ >> jobList end ssh pk cd /cluster/data/hg18/bed/snp129Ortho/run.liftOChimp para make jobList #Completed: 426 of 426 jobs #CPU time in finished jobs: 83616s 1393.60m 23.23h 0.97d 0.003 y #IO & Wait Time: 1501s 25.02m 0.42h 0.02d 0.000 y #Average job time: 200s 3.33m 0.06h 0.00d #Longest running job: 0s 0.00m 0.00h 0.00d #Longest finished job: 574s 9.57m 0.16h 0.01d #Submission to last job: 939s 15.65m 0.26h 0.01d # Map coords to orangutan using liftOver. mkdir ../run.liftOPon cd ../run.liftOPon mkdir out ln -s ../run.liftOChimp/split . cp /dev/null jobList foreach f (split/chunk*) echo liftOver $f \ /cluster/data/hg18/bed/liftOver/hg18ToPonAbe2.over.chain.gz \ \{check out exists out/ponAbe2.$f:t.bed\} out/hg18.$f:t.unmapped \ >> jobList end para make jobList #Completed: 426 of 426 jobs #CPU time in finished jobs: 171875s 2864.58m 47.74h 1.99d 0.005 y #IO & Wait Time: 1767s 29.45m 0.49h 0.02d 0.000 y #Average job time: 408s 6.79m 0.11h 0.00d #Longest running job: 0s 0.00m 0.00h 0.00d #Longest finished job: 1268s 21.13m 0.35h 0.01d #Submission to last job: 1743s 29.05m 0.48h 0.02d # Map coords to macaque using liftOver. mkdir ../run.liftOMac cd ../run.liftOMac mkdir out ln -s ../run.liftOChimp/split . cp /dev/null jobList foreach f (split/chunk*) echo liftOver $f \ /cluster/data/hg18/bed/liftOver/hg18ToRheMac2.over.chain.gz \ \{check out exists out/rheMac2.$f:t.bed\} out/hg18.$f:t.unmapped \ >> jobList end para make jobList #Completed: 426 of 426 jobs #CPU time in finished jobs: 6356s 105.93m 1.77h 0.07d 0.000 y #IO & Wait Time: 1812s 30.21m 0.50h 0.02d 0.000 y #Average job time: 19s 0.32m 0.01h 0.00d #Longest running job: 0s 0.00m 0.00h 0.00d #Longest finished job: 51s 0.85m 0.01h 0.00d #Submission to last job: 221s 3.68m 0.06h 0.00d ssh kolossus cd /cluster/data/hg18/bed/snp129Ortho # Note: the formerly inlined script getOrthoSeq.pl has been checked in # as kent/src/hg/snp/snpLoad/getOrthoSeq.pl. # Concatenate the chimp results, sorting by chimp pos in order to # efficiently access 2bit sequence in getOrthoSeq. The output of # that is then sorted by the glommed human info field, so that we # can use join to combine chimp and macaque results in the next step. # Ditto for macaque and orangutan. sort -k1,1 -k2n,2n run.liftOChimp/out/panTro2.chunk*.bed \ | ~/kent/src/hg/snp/snpLoad/getOrthoSeq.pl /cluster/data/panTro2/panTro2.2bit \ | sort > panTro2.orthoGlom.txt sort -k1,1 -k2n,2n run.liftOPon/out/ponAbe2.chunk*.bed \ | ~/kent/src/hg/snp/snpLoad/getOrthoSeq.pl /cluster/data/ponAbe2/ponAbe2.2bit \ | sort > ponAbe2.orthoGlom.txt sort -k1,1 -k2n,2n run.liftOMac/out/rheMac2.chunk*.bed \ | ~/kent/src/hg/snp/snpLoad/getOrthoSeq.pl /cluster/data/rheMac2/rheMac2.2bit \ | sort > rheMac2.orthoGlom.txt # The whole pipeline takes ~5-7 minutes each. wc -l panTro2.orthoGlom.txt ponAbe2.orthoGlom.txt rheMac2.orthoGlom.txt # 9909458 panTro2.orthoGlom.txt # 9597270 ponAbe2.orthoGlom.txt # 8467866 rheMac2.orthoGlom.txt # Use the glommed name field as a key to join up chimp and macaque # allele data. Include glommed name from both files because if only # file 2 has a line for the key in 2.1, then 1.1 is empty. 
Then plop # in the orthoGlom fields from each file, which are in the same order # as the chimp and macaque columns of snp129OrthoPanTro2RheMac2. join -o '1.1 2.1 1.2 1.3 1.4 1.5 1.6 2.2 2.3 2.4 2.5 2.6' \ -a 1 -a 2 -e '?' \ panTro2.orthoGlom.txt ponAbe2.orthoGlom.txt \ | awk '{if ($1 != "?") { print $1, $3,$4,$5,$6,$7,$8,$9,$10,$11,$12; } \ else { print $2, $3,$4,$5,$6,$7,$8,$9,$10,$11,$12; }}' \ > tmp.txt join -o '1.1 2.1 1.2 1.3 1.4 1.5 1.6 1.7 1.8 1.9 1.10 1.11 2.2 2.3 2.4 2.5 2.6' \ -a 1 -a 2 -e '?' \ tmp.txt rheMac2.orthoGlom.txt \ | perl -wpe 'chomp; \ ($glom12, $glom3, $o1Chr, $o1Start, $o1End, $o1Al, $o1Strand, \ $o2Chr, $o2Start, $o2End, $o2Al, $o2Strand, \ $o3Chr, $o3Start, $o3End, $o3Al, $o3Strand) = split; \ $glomKey = ($glom12 ne "?") ? $glom12 : $glom3; \ ($rsId, $hChr, $hStart, $hEnd, $hObs, $hAl, $hStrand) = \ split(/\|/, $glomKey); \ $o1Start =~ s/^\?$/0/; $o2Start =~ s/^\?$/0/; $o3Start =~ s/^\?$/0/; \ $o1End =~ s/^\?$/0/; $o2End =~ s/^\?$/0/; $o3End =~ s/^\?$/0/; \ print join("\t", $hChr, $hStart, $hEnd, $rsId, $hObs, $hAl, $hStrand, \ $o1Chr, $o1Start, $o1End, $o1Al, $o1Strand, \ $o2Chr, $o2Start, $o2End, $o2Al, $o2Strand, \ $o3Chr, $o3Start, $o3End, $o3Al, $o3Strand) . "\n"; \ s/^.*$//;' \ | sort -k1,1 -k2n,2n > snp129OrthoPt2Pa2Rm2.bed # took ~6 minutes. wc -l snp129OrthoPt2Pa2Rm2.bed #10325827 snp129OrthoPt2Pa2Rm2.bed ssh hgwdev cd /cluster/data/hg18/bed/snp129Ortho time nice hgLoadBed -tab -onServer -tmpDir=/data/tmp -renameSqlTable \ -sqlTable=$HOME/kent/src/hg/lib/snpOrthoPanPonRhe.sql \ hg18 snp129OrthoPt2Pa2Rm2 snp129OrthoPt2Pa2Rm2.bed #Loaded 10325827 elements of size 22 #73.396u 10.864s 10:14.76 13.7% 0+0k 0+0io 0pf+0w # Cleanup on fileserver: cd /cluster/data/hg18/bed/snp129Ortho nice gzip snp129Simple.bed snp129ExcludeIds.txt snp129ForLiftOver.bed rm -r run*/split tmp.txt *.orthoGlom.txt ############################################################################ # dbSNP BUILD 130 (UPDATED 8/18/09 angie) # Originally done 5/22/09. # Functional annotations restricted by mapping position 7/7. # dbSNP corrections applied to func field 8/18. # Set up build directory mkdir -p /hive/data/outside/dbSNP/130/{human,shared} # Get field encodings -- if there are changes or additions to the # encoding of the corresponding fields, you might need to update # snpNcbiToUcsc, hgTracks, hgc and hgTrackUi (see also # hg/lib/snp125Ui.c). 
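# A quick way to notice such encoding changes (a sketch, not part of the build
# proper; assumes the build-129 downloads are still on disk, e.g. under
# /cluster/data/dbSNP/129/shared) is to diff the freshly fetched encoding
# files below against the previous build's copies:
## foreach t (LocTypeCode SnpClassCode SnpFunctionCode SnpValidationCode)
##   zcat /cluster/data/dbSNP/129/shared/$t.bcp.gz > /tmp/$t.129
##   zcat /hive/data/outside/dbSNP/130/shared/$t.bcp.gz | diff /tmp/$t.129 - \
##     && echo "$t unchanged"
## end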
cd /hive/data/outside/dbSNP/130/shared alias wg wget --timestamping set ftpSnpDb = ftp://ftp.ncbi.nih.gov/snp/organisms/human_9606/database wg $ftpSnpDb/shared_data/LocTypeCode.bcp.gz wg $ftpSnpDb/shared_data/SnpClassCode.bcp.gz wg $ftpSnpDb/shared_data/SnpFunctionCode.bcp.gz wg $ftpSnpDb/shared_data/SnpValidationCode.bcp.gz # Here is another source -- it is not as up-to-date as the above, but # our encodings (enums and sets in snp130.sql) are named more similar # to those in the 2005 ASN: # ftp://ftp.ncbi.nih.gov/snp/specs/docsum_2005.asn ########################## DOWNLOAD ############################# cd /hive/data/outside/dbSNP/130/human mkdir data schema rs_fasta # Get data from NCBI (anonymous FTP) wg ftp://ftp.ncbi.nih.gov/snp/00readme.txt cd /hive/data/outside/dbSNP/130/human/data # ContigLoc table has coords, orientation, loc_type, and refNCBI allele wg $ftpSnpDb/organism_data/b130_SNPContigLoc_36_3.bcp.gz wg $ftpSnpDb/organism_data/b130_SNPContigLocusId_36_3.bcp.gz wg $ftpSnpDb/organism_data/b130_ContigInfo_36_3.bcp.gz # MapInfo has alignment weights wg $ftpSnpDb/organism_data/b130_SNPMapInfo_36_3.bcp.gz # SNP has univar_id, validation status and heterozygosity wg $ftpSnpDb/organism_data/SNP.bcp.gz # Get schema cd /hive/data/outside/dbSNP/130/human/schema wg $ftpSnpDb/organism_schema/human_9606_table.sql.gz wg $ftpSnpDb/shared_schema/dbSNP_main_table.sql.gz # Get fasta files # using headers of fasta files for molType, class, observed cd /hive/data/outside/dbSNP/130/human/rs_fasta wg ftp://ftp.ncbi.nih.gov/snp/organisms/human_9606/rs_fasta/\*.gz # Get 1000 Genomes IDs (unfortunately not in validation field as Sol suggested) cd /hive/data/outside/dbSNP/130/human/data wg -O 1000Genomes_README ftp://ftp.ncbi.nih.gov/snp/organisms/human_9606/1000Genomes/ReadMe.txt wg ftp://ftp.ncbi.nih.gov/snp/organisms/human_9606/1000Genomes/B130_1000G_RsClusterReport.txt.gz zcat B130_1000G_RsClusterReport.txt.gz | wc -l #7512342 # Make a uniquified list of only the numeric portion of the assigned rs IDs: zcat B130_1000G_RsClusterReport.txt.gz \ | cut -d, -f 3 | sed -e 's/^rs//' \ | sort -nu > 1000GenomesRsIds.txt wc -l 1000GenomesRsIds.txt #5611085 1000GenomesRsIds.txt ########################## LOAD NCBI TABLES ############################# # Simplify names of data files -- strip version & extras to get # local canonical table names. 
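# For example, the sed expression below maps the downloaded names like so
# (illustrative; derived from the substitutions in the foreach below):
#   b130_SNPContigLoc_36_3.bcp.gz      -> ContigLoc.gz
#   b130_SNPContigLocusId_36_3.bcp.gz  -> ContigLocusId.gz
#   b130_ContigInfo_36_3.bcp.gz        -> ContigInfo.gz
#   b130_SNPMapInfo_36_3.bcp.gz        -> MapInfo.gz
#   SNP.bcp.gz                         -> SNP.gz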
cd /hive/data/outside/dbSNP/130/human/data foreach f (*.bcp.gz) set new = `echo $f \ | sed -e 's/^b130_SNP//; s/^b130_//; s/_36_3//; s/.bcp//;'` mv $f $new echo $new end cd /hive/data/outside/dbSNP/130/human/schema zcat human_9606_table.sql.gz \ | perl -we '$/ = "\nGO\n\n\n"; \ while (<>) { \ next unless /^CREATE TABLE \[(b130_(SNP)?)?(ContigInfo|ContigLoc|ContigLocusId|MapInfo|SNP)(_36_3)?\]/; \ s/b130_(SNP)?//; s/_36_3//; \ s/[\[\]]//g; s/GO\n\n/;/; s/smalldatetime/datetime/g; \ s/ON PRIMARY//g; s/COLLATE//g; s/Latin1_General_BIN//g; \ s/IDENTITY (1, 1) NOT NULL /NOT NULL AUTO_INCREMENT, PRIMARY KEY (id)/g; \ s/nvarchar/varchar/g; s/set quoted/--set quoted/g; \ s/(image|varchar\s+\(\d+\))/BLOB/g; \ print; \ }' \ > table.sql # load on hgwdev (kolossus disk almost full, no more small cluster mysql5's): hgsql '' -e 'create database hg18snp130' cd /hive/data/outside/dbSNP/130/human/schema hgsql hg18snp130 < table.sql cd ../data # Avoid wasting space by excluding mappings to non-reference contigs: foreach t (ContigInfo MapInfo) zcat $t.gz \ | egrep -vw '(Celera|HuRef|CRA_TCAGchr7v2)' \ | perl -wpe 's/(\d\d:\d\d:\d\d)\.0/$1/g;' \ | hgLoadSqlTab -oldTable hg18snp130 $t placeholder stdin end #load of ContigInfo did not go as planned: 379 record(s), 0 row(s) skipped, 88 warning(s) loading /dev/stdin # Checked ContigInfo visually, looks OK. # Compare contig list between our ctgPos and reference contigs in # ContigInfo: ssh hgwdev-10 hgsql hg18 -N -B -e '"select contig from ctgPos;"' \ | sort > /tmp/1 hgsql hg18snp130 -NBe 'select distinct(group_label) from ContigInfo' # --> reference, c5_H2, c6_COX, c6_QBL, c22_H2, DR53 # (HuRef, Celera, CRA_TCAGchr7v2 grepped out above) hgsql hg18snp130 -N -B -e 'select contig_acc from ContigInfo \ where group_label in \ ("reference", "c5_H2", "c6_COX", "c6_QBL", "c22_H2");' | sort > /tmp/2 diff /tmp/1 /tmp/2 # No diff. # Make sure there are no orient != 0 contigs among those selected. hgsql hg18snp130 -NBe \ 'select count(*) from ContigInfo where orient != 0 and \ group_label in ("reference", "c5_H2", "c6_COX", "c6_QBL", "c22_H2");' #0 # ContigLoc is huge, and we want just the reference contig mappings. # So, based on the reference & haplo ctg_id values in ContigInfo, # filter to get just the mappings for those contigs: zcat ContigLoc.gz \ | awk '$3 <= 377 || $3 == 7015' \ | perl -wpe 's/(\d\d:\d\d:\d\d)\.0/$1/g;' \ | hgLoadSqlTab -oldTable hg18snp130 ContigLoc placeholder stdin zcat SNP.gz \ | perl -wpe 's/(\d\d:\d\d:\d\d)\.0/$1/g;' \ | hgLoadSqlTab -oldTable hg18snp130 SNP placeholder stdin zcat ContigLocusId.gz \ | perl -wpe 's/(\d\d:\d\d:\d\d)\.0/$1/g;' \ | hgLoadSqlTab -oldTable hg18snp130 ContigLocusId placeholder stdin # There were some warnings (many cleared up by the perl substitution) # but no rows were dropped. In mysql5, 'show warnings' after a manual 'load data' # complains about missing values (OK when e.g. position is not known). foreach t (ContigInfo ContigLoc ContigLocusId MapInfo SNP) echo -n "${t}:\t" hgsql -N -B hg18snp130 -e 'select count(*) from '$t end #ContigInfo: 379 #ContigLoc: 19189750 #ContigLocusId: 11790054 #MapInfo: 17928700 #SNP: 17804034 #################### EXTRACT INFO FROM NCBI TABLES #################### # Glom each SNP's function codes together and load up a new hg18Snp130 table. # Also extract NCBI's annotations of coding SNPs' effects on translation. # We extract ContigLocusId info only for reference assembly mapping. 
# Some SNP's functional annotations are for an alternate assembly, so we will # have no NCBI functional annotations to display for those (but our own are # available). cd /hive/data/outside/dbSNP/130/human hgsql hg18snp130 -NBe 'select snp_id, ci.contig_acc, asn_from, asn_to, mrna_acc, \ fxn_class, reading_frame, allele, residue, codon, cli.ctg_id \ from ContigLocusId as cli, ContigInfo as ci \ where cli.ctg_id = ci.ctg_id and \ group_label in ("reference", "c5_H2", "c6_COX", "c6_QBL", "c22_H2")' \ > ncbiFuncAnnotations.txt # Ignore function code 8 (cds-reference, just means that some allele matches reference) # and glom functions for each SNP id: cut -f 1-4,6,11 ncbiFuncAnnotations.txt \ | sort -u -k1n,1n -k6n,6n -k3n,3n -k5n,5n \ | perl -we 'while (<>) { chomp; \ ($id, undef, $s, $e, $f, $c) = split; \ if (defined $prevId && $id == $prevId && $c == $prevC && $s == $prevS) { \ $prevFunc .= "$f," unless ($f == 8); \ } else { \ print "$prevId\t$prevC\t$prevS\t$prevE\t$prevFunc\n" if (defined $prevId); \ $prevFunc = ($f == 8) ? "" : "$f,"; \ } \ ($prevId, $prevC, $prevS, $prevE) = ($id, $c, $s, $e); \ } \ print "$prevId\t$prevC\t$prevS\t$prevE\t$prevFunc\n"' \ > ucscFunc.txt wc -l ucscFunc.txt #7035685 ucscFunc.txt cat > ucscFunc.sql < ncbiFuncInsertions.ctg.bed wc -l ncbiFuncInsertions.ctg.bed #1089086 ncbiFuncInsertions.ctg.bed # Extract observed allele, molType and snp class from FASTA headers gnl zcat /hive/data/outside/dbSNP/130/human/rs_fasta/rs_ch*.fas.gz \ | grep '^>gnl' \ | perl -wpe 's/^\S+rs(\d+) .*mol="(\w+)"\|class=(\d+)\|alleles="([^"]+)"\|build.*/$1\t$4\t$2\t$3/ || die "Parse error line $.:\n$_\n\t";' \ | sort -n \ > ucscGnl.txt #407.555u 57.499s 4:32.89 170.4% 0+0k 0+0io 0pf+0w wc -l ucscGnl.txt #17804034 ucscGnl.txt cut -f 1 ucscGnl.txt | uniq | wc -l #17804034 cat > ucscGnl.sql < ucscNcbiSnp.ctg.bed #on a not-so busy hgwdev: 80.735u 36.958s 8:54.76 22.0% 0+0k 0+0io 0pf+0w #on a busy hgwdev: 78.753u 41.304s 30:19.77 6.5% 0+0k 0+0io 0pf+0w #on hgwdev with giant chains loading in parallel: # 78.213u 33.826s 58:16.41 3.2% 0+0k 0+0io 0pf+0w wc -l ucscNcbiSnp.ctg.bed #19189750 ucscNcbiSnp.ctg.bed liftUp ucscNcbiSnp.bed \ /hive/data/genomes/hg18/jkStuff/liftContigs.lft warn \ ucscNcbiSnp.ctg.bed #119.644u 8.992s 2:36.67 82.1% 0+0k 0+0io 3pf+0w # Drum roll please... translate NCBI's encoding into UCSC's, and # perform a bunch of checks. This is where developer involvement # is most likely as NCBI extends the encodings used in dbSNP. cd /hive/data/outside/dbSNP/130/human/ snpNcbiToUcsc ucscNcbiSnp.bed /cluster/data/hg18/hg18.2bit \ -1000GenomesRsIds=data/1000GenomesRsIds.txt snp130 #spaces stripped from observed: #chr12 5963395 5963395 rs41402545 #Line 8106609 of ucscNcbiSnp.bed: Encountered something that doesn't fit observedMixedFormat: GCAACTTCA #count of snps with weight 0 = 74828 #count of snps with weight 1 = 17254041 #count of snps with weight 2 = 389501 #count of snps with weight 3 = 1189989 #count of snps with weight 10 = 281391 #Found no errors. #163.878u 10.302s 3:33.84 81.4% 0+0k 0+0io 0pf+0w wc -l snp* # 18833531 snp130.bed # 22 snp130.sql # 0 snp130Errors.bed # 18 snp130ExceptionDesc.tab # 2631563 snp130Exceptions.bed # More SNPs but 0 errors and a bit fewer exceptions that snp129, cool! # Make one big fasta file. # It's a monster: 18G! Can we split by hashing rsId? zcat rs_fasta/rs_ch*.fas.gz \ | perl -wpe 's/^>gnl\|dbSNP\|(rs\d+) .*/>$1/ || ! /^>/ || die;' \ > snp130.fa # Check for duplicates. 
grep ^\>rs snp130.fa | sort > /scratch/tmp/seqHeaders wc -l /scratch/tmp/seqHeaders #17804034 /scratch/tmp/seqHeaders uniq /scratch/tmp/seqHeaders | wc -l #17804034 # Use hgLoadSeq to generate .tab output for sequence file offsets, # and keep only the columns that we need: acc and file_offset. # Index it and translate to snpSeq table format. time hgLoadSeq -test placeholder snp130.fa #107.748u 24.338s 6:58.50 31.5% 0+0k 0+0io 0pf+0w cut -f 2,6 seq.tab > snp130Seq.tab rm seq.tab # Load up main track tables. cd /hive/data/outside/dbSNP/130/human time nice hgLoadBed -tab -onServer -tmpDir=/data/tmp -allowStartEqualEnd \ hg18 snp130 -sqlTable=snp130.sql snp130.bed #Loaded 18833531 elements of size 17 #114.088u 12.924s 12:54.18 16.4% 0+0k 0+0io 0pf+0w time nice hgLoadBed -tab -onServer -tmpDir=/data/tmp -allowStartEqualEnd \ hg18 snp130Exceptions -sqlTable=$HOME/kent/src/hg/lib/snp125Exceptions.sql -renameSqlTable \ snp130Exceptions.bed #15.255u 1.257s 1:11.11 23.2% 0+0k 0+0io 0pf+0w hgLoadSqlTab hg18 snp130ExceptionDesc ~/kent/src/hg/lib/snp125ExceptionDesc.sql \ snp130ExceptionDesc.tab # Load up sequences. mkdir -p /gbdb/hg18/snp ln -s /hive/data/outside/dbSNP/130/human/snp130.fa /gbdb/hg18/snp/snp130.fa time nice hgLoadSqlTab hg18 snp130Seq ~/kent/src/hg/lib/snpSeq.sql snp130Seq.tab #0.005u 0.002s 6:02.78 0.0% 0+0k 0+0io 0pf+0w # Put in a link where one would expect to find the track build dir... ln -s /hive/data/outside/dbSNP/130/human /cluster/data/hg18/bed/snp130 # Look at the breakdown of exception categories: cd /hive/data/outside/dbSNP/130/human cut -f 5 snp130Exceptions.bed | sort | uniq -c | sort -nr #1960737 MultipleAlignments # 519222 ObservedMismatch # 38444 ObservedTooLong # 32069 SingleClassTriAllelic # 26351 FlankMismatchGenomeShorter # 19089 SingleClassLongerSpan # 15441 SingleClassZeroSpan # 6583 FlankMismatchGenomeLonger # 4108 DuplicateObserved # 3627 SingleClassQuadAllelic # 3473 MixedObserved # 1369 NamedDeletionZeroSpan # 547 FlankMismatchGenomeEqual # 355 NamedInsertionNonzeroSpan # 136 ObservedContainsIupac # 8 ObservedWrongFormat # 4 RefAlleleMismatch #TODO: go through those above and send some bug reports to dbSNP. # 8/18/09: dbSNP announced a correction to some functional class # annotations (- strand mRNA -> swapped near-gene-3 and near-gene-5). cd /hive/data/outside/dbSNP/130/human # This is a list of affected rs IDs, genes, old funcs and new funcs: wget ftp://ftp.ncbi.nih.gov/snp/organisms/human_9606/database/organism_data/b130_update/b130_SNPContigLocusId_36_3_functionClass_13_15_fix.txt wc -l b130_SNPContigLocusId_36_3_functionClass_13_15_fix.txt #163147 b130_SNPContigLocusId_36_3_functionClass_13_15_fix.txt # The first 19 lines are the header. 
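# (A sketch of a sanity check before running the update: the perl below assumes
# comma-separated records whose 1st, 4th and 6th fields are the rs ID, the old
# function code and the new function code -- eyeball a few data lines first.)
## sed -n '20,25p' b130_SNPContigLocusId_36_3_functionClass_13_15_fix.txt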
# Use the info in that file to make a series of sql update commands: tail -n +20 b130_SNPContigLocusId_36_3_functionClass_13_15_fix.txt \ | perl -we '$fns[6]="intron"; $fns[13]="near-gene-3"; $fns[15]="near-gene-5"; \ $fns[41]="nonsense"; $fns[42]="missense"; \ $fns[53]="untranslated-3"; $fns[55]="untranslated-5"; \ while (<>) { \ ($rs,undef,undef,$old,undef,$new) = split(","); \ $oldF = $fns[$old]; $newF = $fns[$new]; die if (!(defined $oldF && defined $newF)); \ print "UPDATE snp130 set func=(REPLACE(func,\"$oldF\",\"$newF\")) where name=\"rs$rs\";\n"; \ }' \ > snp130.func_13_15_fix.sql wc -l snp130.func_13_15_fix.sql #163128 snp130.func_13_15_fix.sql hgsql hg18 < snp130.func_13_15_fix.sql # The number of rows changed has to be smaller because some of those replacements # are for annotations relative to a different assembly; we have func=unknown for # those. E.g. rs437678. ####################################################################### # ORTHOLOGOUS ALLELES IN CHIMP AND MACAQUE FOR SNP130 (DONE 5/26/09 angie) # Originally done 5/15; reloaded 5/26 after making sure no coords had changed, # reloaded 7/7/09 to bump timestamp mkdir /hive/data/genomes/hg18/bed/snp130Ortho cd /hive/data/genomes/hg18/bed/snp130Ortho # Following Heather's lead in snp126orthos, filter SNPs to to keep # only those with class=single, length=1, chrom!~random; # Exclude those with exceptions MultipleAlignments, # SingleClassTriAllelic or SingleClassQuadAllelic. # Unlike snp masking, we do not filter for weight -- don't know why. awk '$5 ~ /^MultipleAlignments|SingleClassTriAllelic|SingleClassQuadAllelic/ {print $4;}' \ /hive/data/outside/dbSNP/130/human/snp130Exceptions.bed \ | sort -u \ > snp130ExcludeIds.txt awk '$3-$2 == 1 && $1 !~ /_random/ && $11 == "single" {print;}' \ /hive/data/outside/dbSNP/130/human/snp130.bed \ | grep -vFwf snp130ExcludeIds.txt \ > snp130Simple.bed #182.396u 12.388s 2:10.30 149.4% 0+0k 0+0io 0pf+0w wc -l snp130Simple.bed #12141377 snp130Simple.bed # Glom all human info that we need for the final table onto the # name, to sneak it through liftOver: rsId|chr|start|end|obs|ref|strand awk 'BEGIN{OFS="\t";} \ {print $1, $2, $3, \ $4 "|" $1 "|" $2 "|" $3 "|" $9 "|" $8 "|" $6, \ 0, $6;}' \ snp130Simple.bed > snp130ForLiftOver.bed # Map coords to chimp using liftOver. # I don't know why chimp took so much longer than macaque... the # chimp .over has fewer chains and fewer bytes than the macaque .over. mkdir run.liftOChimp cd run.liftOChimp mkdir split out splitFile ../snp130ForLiftOver.bed 25000 split/chunk cp /dev/null jobList foreach f (split/chunk*) echo liftOver $f \ /hive/data/genomes/hg18/bed/liftOver/hg18ToPanTro2.over.chain.gz \ \{check out exists out/panTro2.$f:t.bed\} out/hg18.$f:t.unmapped \ >> jobList end ssh pk cd /hive/data/genomes/hg18/bed/snp130Ortho/run.liftOChimp para make jobList #Completed: 486 of 486 jobs #CPU time in finished jobs: 76679s 1277.99m 21.30h 0.89d 0.002 y #IO & Wait Time: 1828s 30.46m 0.51h 0.02d 0.000 y #Average job time: 162s 2.69m 0.04h 0.00d #Longest finished job: 486s 8.10m 0.14h 0.01d #Submission to last job: 513s 8.55m 0.14h 0.01d # Map coords to orangutan using liftOver. mkdir ../run.liftOPon cd ../run.liftOPon mkdir out ln -s ../run.liftOChimp/split . 
cp /dev/null jobList foreach f (split/chunk*) echo liftOver $f \ /hive/data/genomes/hg18/bed/liftOver/hg18ToPonAbe2.over.chain.gz \ \{check out exists out/ponAbe2.$f:t.bed\} out/hg18.$f:t.unmapped \ >> jobList end para make jobList #Completed: 486 of 486 jobs #CPU time in finished jobs: 165378s 2756.31m 45.94h 1.91d 0.005 y #IO & Wait Time: 2614s 43.56m 0.73h 0.03d 0.000 y #Average job time: 346s 5.76m 0.10h 0.00d #Longest finished job: 1017s 16.95m 0.28h 0.01d #Submission to last job: 1051s 17.52m 0.29h 0.01d # Map coords to macaque using liftOver. mkdir ../run.liftOMac cd ../run.liftOMac mkdir out ln -s ../run.liftOChimp/split . cp /dev/null jobList foreach f (split/chunk*) echo liftOver $f \ /hive/data/genomes/hg18/bed/liftOver/hg18ToRheMac2.over.chain.gz \ \{check out exists out/rheMac2.$f:t.bed\} out/hg18.$f:t.unmapped \ >> jobList end para make jobList #Completed: 486 of 486 jobs #CPU time in finished jobs: 4068s 67.80m 1.13h 0.05d 0.000 y #IO & Wait Time: 1944s 32.40m 0.54h 0.02d 0.000 y #Average job time: 12s 0.21m 0.00h 0.00d #Longest finished job: 38s 0.63m 0.01h 0.00d #Submission to last job: 126s 2.10m 0.04h 0.00d cd /hive/data/genomes/hg18/bed/snp130Ortho # Concatenate the chimp results, sorting by chimp pos in order to # efficiently access 2bit sequence in getOrthoSeq. The output of # that is then sorted by the glommed human info field, so that we # can use join to combine chimp and macaque results in the next step. # Ditto for macaque and orangutan. Each command pipe takes ~5 minutes: sort -k1,1 -k2n,2n run.liftOChimp/out/panTro2.chunk*.bed \ | ~/kent/src/hg/snp/snpLoad/getOrthoSeq.pl /cluster/data/panTro2/panTro2.2bit \ | sort > panTro2.orthoGlom.txt sort -k1,1 -k2n,2n run.liftOPon/out/ponAbe2.chunk*.bed \ | ~/kent/src/hg/snp/snpLoad/getOrthoSeq.pl /cluster/data/ponAbe2/ponAbe2.2bit \ | sort > ponAbe2.orthoGlom.txt sort -k1,1 -k2n,2n run.liftOMac/out/rheMac2.chunk*.bed \ | ~/kent/src/hg/snp/snpLoad/getOrthoSeq.pl /cluster/data/rheMac2/rheMac2.2bit \ | sort > rheMac2.orthoGlom.txt wc -l panTro2.orthoGlom.txt ponAbe2.orthoGlom.txt rheMac2.orthoGlom.txt # 11318466 panTro2.orthoGlom.txt # 10976821 ponAbe2.orthoGlom.txt # 9702063 rheMac2.orthoGlom.txt # Use the glommed name field as a key to join up chimp and macaque # allele data. Include glommed name from both files because if only # file 2 has a line for the key in 2.1, then 1.1 is empty. Then plop # in the orthoGlom fields from each file, which are in the same order # as the chimp and macaque columns of snp130OrthoPanTro2RheMac2. join -o '1.1 2.1 1.2 1.3 1.4 1.5 1.6 2.2 2.3 2.4 2.5 2.6' \ -a 1 -a 2 -e '?' \ panTro2.orthoGlom.txt ponAbe2.orthoGlom.txt \ | awk '{if ($1 != "?") { print $1, $3,$4,$5,$6,$7,$8,$9,$10,$11,$12; } \ else { print $2, $3,$4,$5,$6,$7,$8,$9,$10,$11,$12; }}' \ > tmp.txt join -o '1.1 2.1 1.2 1.3 1.4 1.5 1.6 1.7 1.8 1.9 1.10 1.11 2.2 2.3 2.4 2.5 2.6' \ -a 1 -a 2 -e '?' \ tmp.txt rheMac2.orthoGlom.txt \ | perl -wpe 'chomp; \ ($glom12, $glom3, $o1Chr, $o1Start, $o1End, $o1Al, $o1Strand, \ $o2Chr, $o2Start, $o2End, $o2Al, $o2Strand, \ $o3Chr, $o3Start, $o3End, $o3Al, $o3Strand) = split; \ $glomKey = ($glom12 ne "?") ? 
$glom12 : $glom3; \ ($rsId, $hChr, $hStart, $hEnd, $hObs, $hAl, $hStrand) = \ split(/\|/, $glomKey); \ $o1Start =~ s/^\?$/0/; $o2Start =~ s/^\?$/0/; $o3Start =~ s/^\?$/0/; \ $o1End =~ s/^\?$/0/; $o2End =~ s/^\?$/0/; $o3End =~ s/^\?$/0/; \ print join("\t", $hChr, $hStart, $hEnd, $rsId, $hObs, $hAl, $hStrand, \ $o1Chr, $o1Start, $o1End, $o1Al, $o1Strand, \ $o2Chr, $o2Start, $o2End, $o2Al, $o2Strand, \ $o3Chr, $o3Start, $o3End, $o3Al, $o3Strand) . "\n"; \ s/^.*$//;' \ | sort -k1,1 -k2n,2n > snp130OrthoPt2Pa2Rm2.bed #300.357u 31.419s 4:33.00 121.5% 0+0k 0+0io 0pf+0w wc -l snp130OrthoPt2Pa2Rm2.bed #11797184 snp130OrthoPt2Pa2Rm2.bed cd /hive/data/genomes/hg18/bed/snp130Ortho hgLoadBed -tab -onServer -tmpDir=/data/tmp -renameSqlTable \ -sqlTable=$HOME/kent/src/hg/lib/snpOrthoPanPonRhe.sql \ hg18 snp130OrthoPt2Pa2Rm2 snp130OrthoPt2Pa2Rm2.bed #Loaded 11797184 elements of size 22 #83.624u 9.627s 10:19.26 15.0% 0+0k 0+0io 0pf+0w # Cleanup fileserver: cd /hive/data/genomes/hg18/bed/snp130Ortho nice gzip snp130Simple.bed snp130ExcludeIds.txt snp130ForLiftOver.bed rm -r run*/split tmp.txt *.orthoGlom.txt ####################################################################### # DBSNP CODING ANNOTATIONS (DONE 10/12/10 angie) # Updated 10/12/10 - redone w/corrected genome coords (Redmine Track #1249) # Updated 7/7/09 - redone w/snp130, using mapping locations of dbSNP's func. annos # originally done 6/2/09 cd /hive/data/outside/dbSNP/130/human # ncbiFuncAnnotations.txt has NCBI coords: 0-based, fully closed. # For anything except an insertion (0 bases between flanks), # we need to add 1 to the end coord. For an insertion, we need # to add 1 to the start coord. Make a hash of the insertion IDs, # then look up each ID in ncbiFuncAnnotations.txt to tell which # transform to apply. # Note: sort -u with the keys below is too restrictive -- we need full line uniq. perl -we 'open($IDS, "ncbiFuncInsertions.ctg.bed") || die "ids: $!"; \ while (<$IDS>) { chomp; $ids{$_} = 1; } \ close($IDS); \ while (<>) { \ chomp; @w = split("\t"); # id, ctg, start, end, ... \ next unless ($w[7] || $w[8] || $w[9]); \ $bed4 = join("\t", $w[1], $w[2], $w[3], $w[0]); \ if (exists $ids{$bed4} && $w[3] == $w[2]+1) { \ $w[2]++; # 2-base insertions: increment start coord \ } else { \ $w[3]++; # increment end coord to get half-open \ } \ print join("\t", @w) . "\n"; \ }' ncbiFuncAnnotations.txt \ | sort -k1n,1n -k2,2 -k3n,3n -k5,5 -k6n,6n \ | uniq \ > ncbiCodingAnnotations.txt wc -l ncbiCodingAnnotations.txt #576726 ncbiCodingAnnotations.txt # How many & what kinds of function types? cut -f 6 ncbiCodingAnnotations.txt \ | sort -n | uniq -c # 107963 3 (coding-synon) # 276197 8 (cds-reference) # 4664 41 (nonsense) # 146908 42 (missense) # 40994 44 (frameshift) # Does everybody have a reference annotation? awk '$6 == 8 {print $1 "\t" $5;}' ncbiCodingAnnotations.txt | uniq > tmp1 awk '$6 != 8 {print $1 "\t" $5;}' ncbiCodingAnnotations.txt | uniq > tmp2 wc -l tmp1 tmp2 # 276113 tmp1 # 279647 tmp2 # Doh! not everybody. So hgTracks will sometimes have to process ref itself... 
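# To put a number on that (a sketch, not run as part of the build): count the
# {snp, transcript} pairs that have non-reference annotations but no
# cds-reference row:
## sort -u tmp1 > tmp1.srt ; sort -u tmp2 > tmp2.srt
## comm -13 tmp1.srt tmp2.srt | wc -l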
# Gather up multiple annotation lines into one line per {snp, gene, frame}: perl -e 'while (<>) { chomp; \ my ($rsId, $ctg, $s, $e, $txId, $fxn, $frm, $nt, $aa, $codon) = split("\t"); \ if (defined $lastRs && \ ($lastRs != $rsId || $lastCtg ne $ctg || $lastS != $s || \ $lastTx ne $txId || $lastFrm ne $frm)) { \ if (defined $refRow) { \ $fxns = "$refRow->[0],$fxns"; $nts = "$refRow->[1],$nts"; \ $aas = "$refRow->[2],$aas"; $codons = "$refRow->[3],$codons"; \ } \ print "$lastCtg\t$lastS\t$lastE\trs$lastRs\t$lastTx\t$lastFrm\t" . \ "$count\t$fxns\t$nts\t$codons\t$aas\n"; \ $refRow = undef; @rows = (); ($count, $fxns, $nts, $codons, $aas) = (); \ } \ ($lastRs, $lastCtg, $lastS, $lastE, $lastTx, $lastFrm) = \ ($rsId, $ctg, $s, $e, $txId, $frm); \ $count++; \ if ($fxn == 8) { \ $refRow = [$fxn, $nt, $aa, $codon]; \ } else { \ $fxns .= "$fxn,"; $nts .= "$nt,"; $aas .= "$aa,"; $codons .= "$codon,"; \ } \ } \ if (defined $refRow) { \ $fxns = "$refRow->[0],$fxns"; $nts = "$refRow->[1],$nts"; \ $aas = "$refRow->[2],$aas"; $codons = "$refRow->[3],$codons"; \ } \ print "$lastCtg\t$lastS\t$lastE\trs$lastRs\t$lastTx\t$lastFrm\t" . \ "$count\t$fxns\t$nts\t$codons\t$aas\n";' \ ncbiCodingAnnotations.txt \ > snp130CodingDbSnp.ctg.txt liftUp snp130CodingDbSnp.bed \ /hive/data/genomes/hg18/jkStuff/liftContigs.lft warn snp130CodingDbSnp.ctg.txt hgLoadBed hg18 snp130CodingDbSnp -sqlTable=$HOME/kent/src/hg/lib/snp125Coding.sql \ -renameSqlTable -tab -notItemRgb -allowStartEqualEnd \ snp130CodingDbSnp.bed #Loaded 279815 elements of size 11 ####################################################################### # SNPMASKED SEQUENCE FOR SNP130 (DONE 7/10/09 angie) mkdir /hive/data/genomes/hg18/snp130Mask cd /hive/data/genomes/hg18/snp130Mask # Identify rsIds with various problems -- we will exclude those. # MultipleAlignments is kinda broad because anything that maps on # both chrN and chrN_foo_hap1 will be excluded... similarly, extra # matches on chrN_random might disqualify good matches on chrN. # Well, erring on the side of caution is good. awk '$5 ~ /^MultipleAlignments|ObservedTooLong|ObservedWrongFormat|ObservedMismatch|MixedObserved$/ {print $4;}' \ /hive/data/outside/dbSNP/130/human/snp130Exceptions.bed \ | sort -u \ > snp130ExcludeRsIds.txt time grep -vFwf snp130ExcludeRsIds.txt \ /hive/data/outside/dbSNP/130/human/snp130.bed \ > snp130Cleaned.bed #185.202u 4.847s 3:22.55 93.8% 0+0k 0+0io 0pf+0w # Substitutions: mkdir substitutions snpMaskSingle snp130Cleaned.bed /hive/data/genomes/hg18/hg18.2bit stdout \ | faSplit byname stdin substitutions/ #Masked 12142171 snps in 12141860 out of 3091592211 genomic bases #/hive/data/genomes/hg18/hg18.2bit has 3107677273 total bases, but the total number of bases in sequences for which we masked snps is 3091592211 (difference is 16085062) #94.376u 16.038s 3:10.37 57.9% 0+0k 0+0io 0pf+0w # Check that 16085062 is the total #bases in sequences with nothing in snp130Cleaned: cut -f 1 snp130Cleaned.bed | uniq > /tmp/1 grep -vwf /tmp/1 ../chrom.sizes grep -vwf /tmp/1 ../chrom.sizes \ | awk 'BEGIN {TOTAL = 0 ; } {TOTAL += $2 ; } END {printf "%d\n", TOTAL ; }' #16085062 # 338 warnings about differing observed strings at same base position -- # saved as diffObserved.txt. #TODO: send list to dbSNP. 
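# For reference, the substitution masking writes the IUPAC ambiguity code for
# the observed alleles at each SNP position (e.g. Y for C/T, S for C/G, R for
# A/G), preserving the original upper/lower case -- that is what the faCmp
# "diffs" below are expected to show.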
# Make sure that sizes are identical, first diffs are normal -> IUPAC, # and first diffs' case is preserved: foreach f (substitutions/chr*.fa) faCmp -softMask $f ../[1-9MXY]*/$f:t |& grep -v "that differ" end #chr1 in substitutions/chr1.fa differs from chr1 at ../1/chr1.fa at base 491 (y != c) #chr10 in substitutions/chr10.fa differs from chr10 at ../10/chr10.fa at base 55877 (s != c) #... #(output OK -- ambiguous bases replacing [agct] at SNP positions) foreach f (substitutions/chr*.fa) echo $f:t:r mv $f $f:r.subst.fa gzip $f:r.subst.fa end # Insertions: mkdir insertions snpMaskAddInsertions snp130Cleaned.bed /hive/data/genomes/hg18/hg18.2bit stdout \ | faSplit byname stdin insertions/ #Added 2464798 snps totaling 5891837 bases to 3085167749 genomic bases #/hive/data/genomes/hg18/hg18.2bit has 3107677273 total bases, but the total number of bases in sequences for which we masked snps is 3085167749 (difference is 22509524) #99.269u 17.928s 3:31.80 55.3% 0+0k 0+0io 1pf+0w # Again, that just means that some chroms didn't have filtered SNPs. # Make sure that all sizes have increased relative to original: foreach f (insertions/chr*.fa) faCmp -softMask $f ../[1-9MXY]*/$f:t |& grep -v "that differ" \ |& perl -we '$_=<>; \ if (/^\w+ in \S+ has (\d+) bases. \w+ in \S+ has (\d+) bases/) { \ if ($1 > $2) {print "OK: ins size $1 > $2\n";} \ else {die "ERROR: ins size $1 <= $2\n";} \ } else {die $_;}' end #OK: ins size 247711739 > 247249719 #OK: ins size 135642480 > 135374737 #... #(output OK -- new sizes > old) foreach f (insertions/chr*.fa) mv $f $f:r.ins.fa gzip $f:r.ins.fa end # Deletions: mkdir deletions snpMaskCutDeletions snp130Cleaned.bed /hive/data/genomes/hg18/hg18.2bit stdout \ | faSplit byname stdin deletions/ #Cut 1514798 snps totaling 3554896 bases from 3086962619 genomic bases #/hive/data/genomes/hg18/hg18.2bit has 3107677273 total bases, but the total number of bases in sequences for which we masked snps is 3086962619 (difference is 20714654) #103.312u 31.094s 3:56.12 56.9% 0+0k 0+0io 1pf+0w # Again, that just means that some chroms didn't have filtered SNPs. # Make sure that all sizes have decreased relative to original: foreach f (deletions/chr*.fa) faCmp -softMask $f ../[1-9MXY]*/$f:t |& grep -v "that differ" \ |& perl -we '$_=<>; \ if (/^\w+ in \S+ has (\d+) bases. \w+ in \S+ has (\d+) bases/) { \ if ($1 < $2) {print "OK: del size $1 < $2\n";} \ else {die "ERROR: del size $1 >= $2\n";} \ } else {die $_;}' end #OK: del size 246960459 < 247249719 #OK: del size 135214654 < 135374737 #... #(output OK -- del sizes < old) foreach f (deletions/chr*.fa) mv $f $f:r.del.fa gzip $f:r.del.fa end # Clean up and prepare for download: gzip snp130Cleaned.bed foreach d (substitutions insertions deletions) pushd $d md5sum *.gz > md5sum.txt cp ../../snp129Mask/$d/README.txt . popd end # Edit the README.txt in each subdir. # Create download links on hgwdev. # NOTE: Currently we offer only the substitutions. # If we get any user requests, then maybe we can put the insertions # and deletions out there. 
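# Optional sanity check before creating the download links (a sketch): confirm
# that each subdir's md5sum.txt still matches its *.gz files.
## foreach d (substitutions insertions deletions)
##   ( cd $d ; md5sum -c md5sum.txt )
## end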
mkdir /usr/local/apache/htdocs/goldenPath/hg18/snp130Mask
ln -s /hive/data/genomes/hg18/snp130Mask/substitutions/* \
  /usr/local/apache/htdocs/goldenPath/hg18/snp130Mask/
## If there is user demand for ins & del, then start over with an empty
## goldenPath/snp130Mask and do this:
## foreach type (substitutions insertions deletions)
##   mkdir /usr/local/apache/htdocs/goldenPath/hg18/snp130Mask/$type
##   ln -s /hive/data/genomes/hg18/snp130Mask/$type/* \
##     /usr/local/apache/htdocs/goldenPath/hg18/snp130Mask/$type/
## end
############################################################################
# TRANSMAP vertebrate.2008-06-07 build (2008-06-30 markd)

vertebrate-wide transMap alignments were built.  Tracks are created and
loaded by a single Makefile.  This is available from:
   svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2008-06-30

see doc/builds.txt for specific details.
############################################################################
############################################################################
# Nuclear Lamina (2008-06-16 mikep)
# "Domain organization of human chromosomes revealed by mapping of nuclear lamina interactions"
# We received these files from authors of Guelen et al. Nature 2008
# doi:10.1038/nature06947
ssh hgwdev
mkdir /cluster/data/hg18/bed/nuclearLamina
cd /cluster/data/hg18/bed/nuclearLamina/
mv /var/ftp/encode/LADs_080513.bed.bz2 .
mv /var/ftp/encode/LaminB1_080513.wig.bz2 .
mv /var/ftp/encode/LaminB1_LAD.md5sum .
# to check the md5sum we need to unzip it to its original name,
# done on the NFS host for this directory
df -h .
# Filesystem            Size  Used Avail Use% Mounted on
# kkstore02-10:/export/cluster/store11
#                       1.8T  1.7T   94G  95% /cluster/store11
ssh kkstore02-10
cd /cluster/data/hg18/bed/nuclearLamina/
# check they are not too big to unzip, look ok
ll -h L*bz2
# -rw-r--r--  1 mikep protein  13K Jun 10 00:58 LADs_080513.bed.bz2
# -rw-r--r--  1 mikep protein  16M Jun 10 01:02 LaminB1_080513.wig.bz2
bunzip2 -dk L*bz2
md5sum -c LaminB1_LAD.md5sum
# all ok
# LADs_080513.bed: OK
# LaminB1_080513.wig: OK
# Description files were received via email and copied directly to this dir.
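# A quick look at what was delivered (the .bed file starts with a two-line
# custom-track header, which is why it is loaded with "tail +3" below):
## head -3 LADs_080513.bed
## head -5 LaminB1_080513.wig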
# Needed to convert from mac to unix due to ^M chars: mac2unix L*.html # Checked files looked OK, needed to remove HTML tags such as: DOCTYPE vi L*.html # Now find the min/max/avg range of values from the wiggle file egrep "^[0-9]" LaminB1_080513.wig |ave -col=2 stdin # Q1 -0.509000 # median -0.000000 # Q3 0.514000 # average -0.041192 # min -6.602000 # max 5.678000 # count 2909178 # total -119833.701411 # standard deviation 1.037038 # Now load the tracks on hgwdev ssh hgwdev cd /cluster/data/hg18/bed/nuclearLamina/ # First two lines are custom track header tail +3 LADs_080513.bed | hgLoadBed hg18 laminB1Lads stdin # wigEncode the .wig and .wib files from the supplied wig ascii file, and symlink the .wib file from /gbdb wigEncode LaminB1_080513.wig laminB1.wig laminB1.wib ln -s /cluster/data/hg18/bed/nuclearLamina/laminB1.wib /gbdb/hg18/wib/ # Converted LaminB1_080513.wig, upper limit 5.68, lower limit -6.60 hgLoadWiggle hg18 laminB1 laminB1.wig rm bed.tab wiggle.tab ## Create the track definitions in hg18, copy them over, (these are my paths) and do make ## Make entries for: bed = "track laminB1Lads" wiggle = "track laminB1" ssh hgwdev # vi /cluster/home/mikep/kent/src/hg/makeDb/trackDb/human/hg18/trackDb.ra # cp /cluster/data/hg18/bed/nuclearLamina/laminB1.html /cluster/home/mikep/kent/src/hg/makeDb/trackDb/human/hg18/ # cp /cluster/data/hg18/bed/nuclearLamina/laminB1Lads.html /cluster/home/mikep/kent/src/hg/makeDb/trackDb/human/hg18/ # cp /cluster/data/hg18/bed/nuclearLamina/laminB1Super.html /cluster/home/mikep/kent/src/hg/makeDb/trackDb/human/hg18/ # cp /cluster/data/hg18/bed/nuclearLamina/laminB1Super.gif /cluster/home/mikep/browser/images/ # cd /cluster/home/mikep/kent/src/hg/makeDb/trackDb # make # Add wig ascii track (+readme) to goldenPath so it can be downloaded mkdir /data/apache/htdocs/goldenPath/hg18/nuclearLamina cp /cluster/data/hg18/bed/nuclearLamina/LaminB1_080513.wig.bz2 /data/apache/htdocs/goldenPath/hg18/nuclearLamina/hg18.laminB1.txt.bz2 cp /cluster/data/hg18/bed/nuclearLamina/goldenPath.README.txt /data/apache/htdocs/goldenPath/hg18/nuclearLamina/README.txt # Add both tracks to all.joiner under section: tablesIgnored $hg ############################################################################ ##### Positively Selected Genes (Pos Sel Genes) (braney - DONE - 2008-07-07) # get SQL data (mammalPsq.sql) from Adam Siepel # and Tomas Vinar (acs4@cornell.edu) hgsql hg18 < mammalPsg.sql echo "alter table mammalPsg add index (chrom(7));" | hgsql hg18 #################################################################### # UPDATE UNIGENE/SAGE TRACK (DONE - 2008-08-09 Fan) # Create the uniGene alignments # Download of the latest UniGene version is now automated by a # cron job -- see /cluster/home/angie/crontab , # /cluster/home/angie/unigeneVers/unigene.csh . # If hgwdev gets rebooted, that needs to be restarted... maybe there's # a more stable place to set up that cron job. ssh hgwdev cd /cluster/store11/gs.19/build36/bed cd uniGene mkdir old mv * old set Version = 214 zcat /cluster/store7/uniGene/uniGene.$Version/Hs.seq.uniq.gz|\ sed -e "s#>.*/ug=#>#; s# /len.*##;" > Hs.seq.uniq.simpleHeader.fa ssh pk set Version = 214 mv /san/sanvol1/scratch/hg18/uniGene /san/sanvol1/scratch/hg18/uniGene.old mkdir /san/sanvol1/scratch/hg18/uniGene/ cd /san/sanvol1/scratch/hg18/uniGene/ cp -p /cluster/store11/gs.19/build36/bed/uniGene/Hs.seq.uniq.simpleHeader.fa . 
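# Spot-check that the simplified headers really are bare UniGene cluster IDs
# before clustering (the sed above should have reduced each header to
# something like ">Hs.NNNNN"; exact original header layout may vary by
# UniGene release):
## head -1 Hs.seq.uniq.simpleHeader.fa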
ls -1 /san/sanvol1/scratch/hg18/nib/*.nib > genome.lst
ls -1S \
  /cluster/store11/gs.19/build36/bed/uniGene/Hs.seq.uniq.simpleHeader.fa \
  > uniGene.lst
cat << '_EOF_' > template.sub
#LOOP
/cluster/bin/x86_64/blat -repeats=lower -minIdentity=95 -ooc=/san/sanvol1/scratch/hg18/11.ooc $(path1) $(path2) {check out line+ psl/$(root1)_$(root2).psl}
#ENDLOOP
'_EOF_'
gensub2 genome.lst uniGene.lst template.sub para.spec
para create para.spec
mkdir psl
para try
para check
para push
# Completed: 49 of 49 jobs
# CPU time in finished jobs: 59778s 996.30m 16.60h 0.69d 0.002 y
# IO & Wait Time: 208s 3.47m 0.06h 0.00d 0.000 y
# Average job time: 1224s 20.40m 0.34h 0.01d
# Longest finished job: 4549s 75.82m 1.26h 0.05d
# Submission to last job: 4653s 77.55m 1.29h 0.05d
# Estimated complete: 0s 0.00m 0.00h 0.00d
pslSort dirs raw.psl tmp psl >& pslSort.log
cat raw.psl |\
  pslReps -minCover=0.2 -sizeMatters -minAli=0.965 -nearTop=0.002 \
  stdin hg18.uniGene.pslReps.psl /dev/null
# Processed 553470 alignments
gzip raw.psl
gzip Hs.seq.uniq.simpleHeader.fa
ssh hgwdev
cd /cluster/store11/gs.19/build36/bed/uniGene
cp -p /san/sanvol1/scratch/hg18/uniGene/hg18.uniGene.pslReps.psl .
hgLoadPsl -table=uniGene_3 hg18 hg18.uniGene.pslReps.psl
# load the sequence with -replace option
hgLoadSeq -replace hg18 /gbdb/hg18/uniGene/Hs.seq.uniq.simpleHeader.fa
#############################################################################
# BLASTZ/CHAIN/NET dipOrd1 (DONE - 2008-10-22 - Hiram)
screen # use screen to control this multi-day job
mkdir /hive/data/genomes/hg18/bed/blastzDipOrd1.2008-10-21
cd /hive/data/genomes/hg18/bed/blastzDipOrd1.2008-10-21
cat << '_EOF_' > DEF
# Human vs. Kangaroo rat
BLASTZ_M=50
BLASTZ=lastz
# TARGET: Human Hg18
SEQ1_DIR=/scratch/data/hg18/nib
SEQ1_LEN=/scratch/data/hg18/chrom.sizes
SEQ1_CHUNK=200000000
SEQ1_LAP=10000
# QUERY: Kangaroo rat
SEQ2_DIR=/scratch/data/dipOrd1/dipOrd1.2bit
SEQ2_LEN=/scratch/data/dipOrd1/chrom.sizes
SEQ2_CHUNK=20000000
SEQ2_LIMIT=50
SEQ2_LAP=0
BASE=/hive/data/genomes/hg18/bed/blastzDipOrd1.2008-10-21
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
time doBlastzChainNet.pl -verbose=2 \
  `pwd`/DEF \
  -bigClusterHub=swarm \
  -chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 &
# real 881m33.829s
cat fb.hg18.chainDipOrd1Link.txt
# 786126212 bases of 2881515245 (27.282%) in intersection
# slight difficulty with the makeMd5sum.csh script, fixed in the source
# and completed the copy of the liftOver file, then continuing,
# with -syntenicNet:
time doBlastzChainNet.pl -verbose=2 \
  `pwd`/DEF \
  -syntenicNet -continue=cleanup -bigClusterHub=swarm \
  -chainMinScore=3000 -chainLinearGap=medium > synNet.log 2>&1 &
# real 86m15.646s
cd /cluster/data/hg18/bed/blastzDipOrd1.2008-10-21
time nice -n +19 doRecipBest.pl hg18 dipOrd1 > rbest.log 2>&1 &
# real 327m0.719s
#############################################################################
# BLASTZ/CHAIN/NET pteVam1 (DONE - 2008-10-21,29 - Hiram)
screen # use screen to control this multi-day job
mkdir /hive/data/genomes/hg18/bed/blastzPteVam1.2008-10-21
cd /hive/data/genomes/hg18/bed/blastzPteVam1.2008-10-21
cat << '_EOF_' > DEF
# Human vs.
Megabat BLASTZ_M=50 BLASTZ=lastz # TARGET: Human Hg18 SEQ1_DIR=/scratch/data/hg18/nib SEQ1_LEN=/scratch/data/hg18/chrom.sizes SEQ1_CHUNK=200000000 SEQ1_LAP=10000 # QUERY: Megabat SEQ2_DIR=/scratch/data/pteVam1/pteVam1.2bit SEQ2_LEN=/scratch/data/pteVam1/chrom.sizes SEQ2_CHUNK=20000000 SEQ2_LIMIT=50 SEQ2_LAP=0 BASE=/hive/data/genomes/hg18/bed/blastzPteVam1.2008-10-21 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs time doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -bigClusterHub=pk \ -chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 & # real 595m14.168s # some crashed jobs, finish the batch on pk manually, then, continuing: time doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -continue=cat -bigClusterHub=pk \ -chainMinScore=3000 -chainLinearGap=medium > cat.log 2>&1 & # real 151m54.924s cat fb.hg18.chainPteVam1Link.txt # 1311133709 bases of 2881515245 (45.502%) in intersection cd /cluster/data/hg18/bed/blastzPteVam1.2008-10-21 time nice -n +19 doRecipBest.pl hg18 pteVam1 > rbest.log 2>&1 & # finish manually due to problems: # real 286m25.330s doRecipBest.pl -continue=download hg18 pteVam1 > rbestDownload.log 2>&1 ############################################################################# # BLASTZ/CHAIN/NET turTru1 (DONE - 2008-10-22 - Hiram) screen # use screen to control this multi-day job mkdir /hive/data/genomes/hg18/bed/blastzTurTru1.2008-10-21 cd /hive/data/genomes/hg18/bed/blastzTurTru1.2008-10-21 cat << '_EOF_' > DEF # Human vs. Dolphin BLASTZ_M=50 BLASTZ=lastz # TARGET: Human Hg18 SEQ1_DIR=/scratch/data/hg18/nib SEQ1_LEN=/scratch/data/hg18/chrom.sizes SEQ1_CHUNK=200000000 SEQ1_LAP=10000 # QUERY: Dolphin SEQ2_DIR=/scratch/data/turTru1/turTru1.2bit SEQ2_LEN=/scratch/data/turTru1/chrom.sizes SEQ2_CHUNK=20000000 SEQ2_LIMIT=50 SEQ2_LAP=0 BASE=/hive/data/genomes/hg18/bed/blastzTurTru1.2008-10-21 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs time doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 & # real 702m54.490s cat fb.hg18.chainTurTru1Link.txt # 1398587431 bases of 2881515245 (48.537%) in intersection # slight difficulty with the makeMd5sum.csh script, fixed in the source # and completed the copy of the liftOver file, then continuing, # with -syntenicNet: cd /cluster/data/hg18/bed/blastzTurTru1.2008-10-21 time doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -syntenicNet -continue=cleanup -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > synNet.log 2>&1 & # real 74m4.276s time nice -n +19 doRecipBest.pl hg18 turTru1 > rbest.log 2>&1 & # real 275m19.714s ############################################################################# # BLASTZ/CHAIN/NET tarSyr1 (DONE - 2008-10-21,29 - Hiram) screen # use screen to control this multi-day job mkdir /hive/data/genomes/hg18/bed/blastzTarSyr1.2008-10-21 cd /hive/data/genomes/hg18/bed/blastzTarSyr1.2008-10-21 cat << '_EOF_' > DEF # Human vs. 
Tarsier BLASTZ_M=50 BLASTZ=lastz # TARGET: Human Hg18 SEQ1_DIR=/scratch/data/hg18/nib SEQ1_LEN=/scratch/data/hg18/chrom.sizes SEQ1_CHUNK=200000000 SEQ1_LAP=10000 # QUERY: Tarsier SEQ2_DIR=/scratch/data/tarSyr1/tarSyr1.2bit SEQ2_LEN=/scratch/data/tarSyr1/chrom.sizes SEQ2_CHUNK=20000000 SEQ2_LIMIT=50 SEQ2_LAP=0 BASE=/hive/data/genomes/hg18/bed/blastzTarSyr1.2008-10-21 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs time doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -bigClusterHub=pk \ -chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 & # real 1518m42.776s # recovered the batch on pk, then continuing: time doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -continue=cat -syntenicNet -bigClusterHub=pk \ -chainMinScore=3000 -chainLinearGap=medium > cat.log 2>&1 & # real 526m45.582s cat fb.hg18.chainTarSyr1Link.txt # 1383104827 bases of 2881515245 (47.999%) in intersection cd /cluster/data/hg18/bed/blastzTarSyr1.2008-10-21 time nice -n +19 doRecipBest.pl hg18 tarSyr1 > rbest.log 2>&1 & # failed, finishing manually # real 155m48.855s doRecipBest.pl -continue=download hg18 tarSyr1 > rbest.log 2>&1 ############################################################################# # BLASTZ/CHAIN/NET proCap1 (DONE - 2008-10-22,29 - Hiram) screen # use screen to control this multi-day job mkdir /hive/data/genomes/hg18/bed/blastzProCap1.2008-10-22 cd /hive/data/genomes/hg18/bed/blastzProCap1.2008-10-22 cat << '_EOF_' > DEF # Human vs. Rock Hyrax BLASTZ_M=50 BLASTZ=lastz # TARGET: Human Hg18 SEQ1_DIR=/scratch/data/hg18/nib SEQ1_LEN=/scratch/data/hg18/chrom.sizes SEQ1_CHUNK=200000000 SEQ1_LAP=10000 # QUERY: Rock Hyrax SEQ2_DIR=/scratch/data/proCap1/proCap1.2bit SEQ2_LEN=/scratch/data/proCap1/chrom.sizes SEQ2_CHUNK=20000000 SEQ2_LIMIT=100 SEQ2_LAP=0 BASE=/hive/data/genomes/hg18/bed/blastzProCap1.2008-10-22 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs time doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 & # real 1654m44.904s # finish lastz batch manually after script difficulties, then continuing: time doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -continue=cat -syntenicNet -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > cat.log 2>&1 & # real 227m41.045s cat fb.hg18.chainProCap1Link.txt # 891406629 bases of 2881515245 (30.935%) in intersection cd /cluster/data/hg18/bed/blastzProCap1.2008-10-22 time nice -n +19 doRecipBest.pl hg18 proCap1 > rbest.log 2>&1 & # real 232m9.789s # failed # running the last couple of commands to finish this off # real 561m51.171s doRecipBest.pl -continue=download hg18 proCap1 > rbestDownload.log 2>&1 ############################################################################# # BLASTZ/CHAIN/NET choHof1 (DONE - 2008-10-22,28 - Hiram) screen # use screen to control this multi-day job mkdir /hive/data/genomes/hg18/bed/blastzChoHof1.2008-10-22 cd /hive/data/genomes/hg18/bed/blastzChoHof1.2008-10-22 cat << '_EOF_' > DEF # Human vs. 
Sloth BLASTZ_M=50 BLASTZ=lastz # TARGET: Human Hg18 SEQ1_DIR=/scratch/data/hg18/nib SEQ1_LEN=/scratch/data/hg18/chrom.sizes SEQ1_CHUNK=200000000 SEQ1_LAP=10000 # QUERY: Sloth SEQ2_DIR=/scratch/data/choHof1/choHof1.2bit SEQ2_LEN=/scratch/data/choHof1/chrom.sizes SEQ2_CHUNK=20000000 SEQ2_LIMIT=100 SEQ2_LAP=0 BASE=/hive/data/genomes/hg18/bed/blastzChoHof1.2008-10-22 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs time doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 & # real 1649m6.606s # finish lastz batch manually after script difficulties, then continuing: time doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -continue=cat -syntenicNet -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > cat.log 2>&1 & # real 276m1.827s cat fb.hg18.chainChoHof1Link.txt # 993065598 bases of 2881515245 (34.463%) in intersection cd /cluster/data/hg18/bed/blastz.choHof1.2008-10-22 time nice -n +19 doRecipBest.pl hg18 choHof1 > rbest.log 2>&1 & # real 900m50.222s ############################################################################# # BLASTZ/CHAIN/NET dasNov2 (DONE - 2008-10-22,29 - Hiram) screen # use screen to control this multi-day job mkdir /hive/data/genomes/hg18/bed/blastzDasNov2.2008-10-22 cd /hive/data/genomes/hg18/bed/blastzDasNov2.2008-10-22 cat << '_EOF_' > DEF # Human vs. Armadillo BLASTZ_M=50 BLASTZ=lastz # TARGET: Human Hg18 SEQ1_DIR=/scratch/data/hg18/nib SEQ1_LEN=/scratch/data/hg18/chrom.sizes SEQ1_CHUNK=200000000 SEQ1_LAP=10000 # QUERY: Armadillo SEQ2_DIR=/scratch/data/dasNov2/dasNov2.2bit SEQ2_LEN=/scratch/data/dasNov2/chrom.sizes SEQ2_CHUNK=20000000 SEQ2_LIMIT=100 SEQ2_LAP=0 BASE=/hive/data/genomes/hg18/bed/blastzDasNov2.2008-10-22 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs time doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 & # real 1664m4.331s # finish this batch manually after some code troubles, then: time doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -continue=cat -syntenicNet -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > cat.log 2>&1 & # real 230m4.513s # something broke during chainSplit, try that manuallyo nice -n +19 chainSplit \ /hive/data/genomes/hg18/bed/blastzDasNov2.2008-10-22/axtChain/chain \ /hive/data/genomes/hg18/bed/blastzDasNov2.2008-10-22/axtChain/hg18.dasNov2.all.chain.gz # no problem with that, continuing: time doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -continue=net -syntenicNet -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > net.log 2>&1 & # real 206m54.072s cd /cluster/data/hg18/bed/blastzDasNov2.2008-10-22 time nice -n +19 doRecipBest.pl hg18 dasNov2 > rbest.log 2>&1 & # failed, finishing manually: # real 680m1.703s # the following takes an instant: doRecipBest.pl -continue=download hg18 dasNov2 \ > rbestDownload.log 2>&1 & ############################################################################# # BLASTZ/CHAIN/NET loxAfr2 (DONE - 2008-10-22,29 - Hiram) screen # use screen to control this multi-day job mkdir /hive/data/genomes/hg18/bed/blastzLoxAfr2.2008-10-22 cd /hive/data/genomes/hg18/bed/blastzLoxAfr2.2008-10-22 cat << '_EOF_' > DEF # Human vs. 
Elephant BLASTZ_M=50 BLASTZ=lastz # TARGET: Human Hg18 SEQ1_DIR=/scratch/data/hg18/nib SEQ1_LEN=/scratch/data/hg18/chrom.sizes SEQ1_CHUNK=200000000 SEQ1_LAP=10000 # QUERY: Elephant SEQ2_DIR=/scratch/data/loxAfr2/loxAfr2.2bit SEQ2_LEN=/scratch/data/loxAfr2/chrom.sizes SEQ2_CHUNK=20000000 SEQ2_LIMIT=100 SEQ2_LAP=0 BASE=/hive/data/genomes/hg18/bed/blastzLoxAfr2.2008-10-22 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs time doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 & # 1580m26.439s # problems with batch do to scriping errors, finishing the batch # manually time doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -continue=cat -syntenicNet -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > cat.log 2>&1 & # real 264m46.272s cat fb.hg18.chainLoxAfr2Link.txt # 1014404239 bases of 2881515245 (35.204%) in intersection cd /cluster/data/hg18/bed/blastz.loxAfr2.2008-10-22 time nice -n +19 doRecipBest.pl hg18 loxAfr2 > rbest.log 2>&1 & # real 622m17.655s ############################################################################# # BLASTZ/CHAIN/NET vicPac1 (DONE - 2008-10-28,29 - Hiram) screen # use screen to control this multi-day job mkdir /hive/data/genomes/hg18/bed/blastzVicPac1.2008-10-28 cd /hive/data/genomes/hg18/bed/blastzVicPac1.2008-10-28 cat << '_EOF_' > DEF # Human vs. Alpaca BLASTZ_M=50 BLASTZ=lastz # TARGET: Human Hg18 SEQ1_DIR=/scratch/data/hg18/nib SEQ1_LEN=/scratch/data/hg18/chrom.sizes SEQ1_CHUNK=200000000 SEQ1_LAP=10000 # QUERY: Alpaca SEQ2_DIR=/scratch/data/vicPac1/vicPac1.2bit SEQ2_LEN=/scratch/data/vicPac1/chrom.sizes SEQ2_CHUNK=20000000 SEQ2_LIMIT=100 SEQ2_LAP=0 BASE=/hive/data/genomes/hg18/bed/blastzVicPac1.2008-10-28 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs time doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -bigClusterHub=swarm -syntenicNet \ -chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 & # real 488m36.288s cat fb.hg18.chainVicPac1Link.txt # 1139088501 bases of 2881515245 (39.531%) in intersection cd /cluster/data/hg18/bed/blastzVicPac1.2008-10-28 time nice -n +19 doRecipBest.pl hg18 vicPac1 > rbest.log 2>&1 & # real 380m17.963s ############################################################################# # BLASTZ/CHAIN/NET Gorilla gorGor1 (DONE - 2008-11-04,05 - Hiram) screen # use screen to control this multi-day job mkdir /hive/data/genomes/hg18/bed/blastzGorGor1.2008-11-04 cd /hive/data/genomes/hg18/bed/blastzGorGor1.2008-11-04 cat << '_EOF_' > DEF # Human vs. 
Alpaca BLASTZ_M=50 BLASTZ=lastz # TARGET: Human Hg18 SEQ1_DIR=/scratch/data/hg18/nib SEQ1_LEN=/scratch/data/hg18/chrom.sizes SEQ1_CHUNK=100000000 SEQ1_LAP=10000 # QUERY: Alpaca SEQ2_DIR=/scratch/data/gorGor1/gorGor1.2bit SEQ2_LEN=/scratch/data/gorGor1/chrom.sizes SEQ2_CHUNK=20000000 SEQ2_LIMIT=284 SEQ2_LAP=0 BASE=/hive/data/genomes/hg18/bed/blastzGorGor1.2008-11-04 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs time doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -bigClusterHub=swarm -syntenicNet \ -chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 & # real 644m45.816s cat fb.hg18.chainGorGor1Link.txt # 1778801556 bases of 2881515245 (61.731%) in intersection cd /cluster/data/hg18/bed/blastzGorGor1.2008-11-04 time nice -n +19 doRecipBest.pl hg18 gorGor1 > rbest.log 2>&1 & # real 171m42.585s # failed, need to finish manually cd /hive/data/genomes/hg18/bed/blastz.gorGor1/axtChain # alter the doRecipBest.csh script to finiRecipBest.csh and run: time ./finiRecipBest.csh > finiRecipBest.log 2>&1 # real 1177m37.534s # then, continuing: doRecipBest.pl -continue=download hg18 gorGor1 ############################################################################# # BLASTZ/CHAIN/NET ochPri2 (DONE braney 2008-07-30) ssh kkstore02 screen # use screen to control this multi-day job mkdir /cluster/data/hg18/bed/blastz.ochPri2.2008-07-29 cd /cluster/data/hg18/bed/blastz.ochPri2.2008-07-29 cat << _EOF_ > DEF # Human vs. Pika BLASTZ_M=50 BLASTZ=/cluster/home/braney/bin/x86_64/lastz # TARGET: Human Hg18 SEQ1_DIR=/scratch/data/hg18/nib SEQ1_LEN=/cluster/data/hg18/chrom.sizes SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY: Pika SEQ2_DIR=/san/sanvol1/scratch/ochPri2/ochPri2.2bit SEQ2_LEN=/san/sanvol1/scratch/ochPri2/chrom.sizes SEQ2_CHUNK=20000000 SEQ2_LIMIT=100 SEQ2_LAP=0 BASE=/cluster/data/hg18/bed/blastz.ochPri2.2008-07-29 TMPDIR=/scratch/tmp _EOF_ # << happy emacs time doBlastzChainNet.pl `pwd`/DEF \ -verbose=2 -bigClusterHub=pk \ -chainMinScore=3000 -chainLinearGap=medium \ -blastzOutRoot /cluster/bluearc/ochPri2/blastz.hg18 > do.log 2>&1 & # Completed: 654120 of 654120 jobs # CPU time in finished jobs: 14082913s 234715.22m 3911.92h 163.00d 0.447 y # IO & Wait Time: 2257180s 37619.67m 626.99h 26.12d 0.072 y # Average job time: 25s 0.42m 0.01h 0.00d # Longest finished job: 292s 4.87m 0.08h 0.00d # Submission to last job: 59396s 989.93m 16.50h 0.69d nice doBlastzChainNet.pl `pwd`/DEF \ -verbose=2 -bigClusterHub=pk -syntenicNet -continue=cat \ -chainMinScore=3000 -chainLinearGap=medium \ -blastzOutRoot /cluster/bluearc/ochPri2/blastz.hg18 > do2.log 2>&1 & # memk cluster couldn't find san for chainRun, ran on pk nice doBlastzChainNet.pl `pwd`/DEF \ -verbose=2 -bigClusterHub=pk -syntenicNet -continue=chainMerge \ -chainMinScore=3000 -chainLinearGap=medium \ -blastzOutRoot /cluster/bluearc/ochPri2/blastz.hg18 > do3.log 2>&1 & ln -s `pwd`/blastz.ochPri2.2008-07-29 /cluster/data/hg18/bed/blastz.ochPri2 featureBits hg18 chainOchPri2Link # 806073890 bases of 2881515245 (27.974%) in intersection cd /cluster/data/hg18/bed/blastz.ochPri2.2008-07-29 time nice -n +19 /cluster/bin/scripts/doRecipBest.pl hg18 ochPri2 > rbest.log 2>&1 & ############################################################################# # BLASTZ/CHAIN/NET myoLuc1 (DONE braney 2008-08-02) ssh kkstore02 screen # use screen to control this multi-day job mkdir /cluster/data/hg18/bed/blastz.myoLuc1.2008-07-31 cd /cluster/data/hg18/bed/blastz.myoLuc1.2008-07-31 cat << _EOF_ > DEF # Human vs. 
Microbat BLASTZ_M=50 BLASTZ_T=2 BLASTZ=/cluster/home/braney/bin/x86_64/lastz # TARGET: Human Hg18 (whole chroms) SEQ1_DIR=/scratch/data/hg18/nib SEQ1_LEN=/cluster/data/hg18/chrom.sizes SEQ1_CHUNK=200000000 SEQ1_LAP=0 # QUERY: Microbat SEQ2_DIR=/san/sanvol1/scratch/myoLuc1/myoLuc1.2bit SEQ2_LEN=/san/sanvol1/scratch/myoLuc1/chrom.sizes SEQ2_CHUNK=20000000 SEQ2_LIMIT=100 SEQ2_LAP=0 BASE=/cluster/data/hg18/bed/blastz.myoLuc1.2008-07-31 TMPDIR=/scratch/tmp _EOF_ # << happy emacs time doBlastzChainNet.pl `pwd`/DEF \ -verbose=2 -bigClusterHub=pk \ -chainMinScore=3000 -chainLinearGap=medium \ -blastzOutRoot /cluster/bluearc/myoLuc1/blastz.hg18 > do.log 2>&1 & # Completed: 98879 of 99144 jobs # Crashed: 56 jobs # Other count: 209 jobs # CPU time in finished jobs: 2327505s 38791.75m 646.53h 26.94d 0.074 y # IO & Wait Time: 340164s 5669.40m 94.49h 3.94d 0.011 y # Average job time: 27s 0.45m 0.01h 0.00d # Longest finished job: 1034s 17.23m 0.29h 0.01d # Submission to last job: 56968s 949.47m 15.82h 0.66d # do remaining jobs on kolossus nice doBlastzChainNet.pl `pwd`/DEF \ -verbose=2 -bigClusterHub=pk -syntenicNet -continue=cat \ -chainMinScore=3000 -chainLinearGap=medium \ -blastzOutRoot /cluster/bluearc/myoLuc1/blastz.hg18 > do2.log 2>&1 & ln -s `pwd`/blastz.myoLuc1.2008-07-31 /cluster/data/hg18/bed/blastz.myoLuc1 featureBits hg18 chainMyoLuc1Link # 952177725 bases of 2881515245 (33.044%) in intersection cd /cluster/data/hg18/bed/blastz.myoLuc1.2008-07-31 time nice -n +19 /cluster/bin/scripts/doRecipBest.pl hg18 myoLuc1 > rbest.log 2>&1 & ############################################################################# # BLASTZ/CHAIN/NET loxAfr2 (not done) ssh kkstore02 screen # use screen to control this multi-day job mkdir /cluster/data/hg18/bed/blastz.loxAfr2.2008-08-01 cd /cluster/data/hg18/bed/blastz.loxAfr2.2008-08-01 cat << _EOF_ > DEF # Human vs. 
Elephant BLASTZ_M=50 BLASTZ=/cluster/home/braney/bin/x86_64/lastz # TARGET: Human Hg18 (whole chroms) SEQ1_DIR=/scratch/data/hg18/nib SEQ1_LEN=/cluster/data/hg18/chrom.sizes SEQ1_CHUNK=200000000 SEQ1_LAP=0 # QUERY: Elephant SEQ2_DIR=/san/sanvol1/scratch/loxAfr2/loxAfr2.2bit SEQ2_LEN=/san/sanvol1/scratch/loxAfr2/chrom.sizes SEQ2_CHUNK=20000000 SEQ2_LIMIT=100 SEQ2_LAP=0 BASE=/cluster/data/hg18/bed/blastz.loxAfr2.2008-08-01 TMPDIR=/scratch/tmp _EOF_ # << happy emacs time doBlastzChainNet.pl `pwd`/DEF \ -verbose=2 -bigClusterHub=pk \ -chainMinScore=3000 -chainLinearGap=medium \ -blastzOutRoot /cluster/bluearc/loxAfr2/blastz.hg18 > do.log 2>&1 & # had to run some jobs on memk nice doBlastzChainNet.pl `pwd`/DEF \ -verbose=2 -bigClusterHub=pk -syntenicNet -continue=cat \ -chainMinScore=3000 -chainLinearGap=medium \ -blastzOutRoot /cluster/bluearc/loxAfr2/blastz.hg18 > do2.log 2>&1 & # netChainSubset barfed with memory error (skipped over chains) nice doBlastzChainNet.pl `pwd`/DEF \ -verbose=2 -bigClusterHub=pk -syntenicNet -continue=load \ -chainMinScore=3000 -chainLinearGap=medium \ -blastzOutRoot /cluster/bluearc/loxAfr2/blastz.hg18 > do3.log 2>&1 & ln -s `pwd`/blastz.loxAfr2.2008-08-01 /cluster/data/hg18/bed/blastz.loxAfr2 featureBits hg18 chainLoxAfr2Link # 1025499138 bases of 2881515245 (35.589%) in intersection cd /cluster/data/hg18/bed/blastz.loxAfr2.2008-08-01 time nice -n +19 /cluster/bin/scripts/doRecipBest.pl hg18 loxAfr2 > rbest.log 2>&1 & ############################################################################# # BUILD snpArrayIllumina1M SUB-TRACK (DONE 8/4/08, Fan) # Received raw data file Illumina_Human1M-duoV3_SNPlist_Strand_Location.csv # from Illumina, Luana Galver (lgalver at illumina.com). mkdir -p /cluster/store11/gs.19/build36/bed/snp/illumina/1M cd /cluster/store11/gs.19/build36/bed/snp/illumina/1M cat Illumina_Human1M-duoV3_SNPlist_Strand_Location.csv |\ sed -e 's/,/\t/g' >1M.tab hgsql hg18 < ~src/hg/lib/snpArrayIllumina1MRaw.sql hgsql hg18 -e 'load data local infile "1M.tab" into table snpArrayIllumina1MRaw' ~/src/hg/snp/snpLoad/illuminaLookup1M hg18 snpArrayIllumina1MRaw snp129 # The illuminaLookup1M generate two files: # # illuminaLookup1M.out contains all Illumina 1M probes found in snp129 # illuminaLookup1M.err contains all Illumina 1M probes not found in snp129 mv illuminaLookup.out illuminaLookup1Ma.out cut -f 1 illuminaLookup.err >j.1 cat j.1 |sed -e 's/chrMt/chrM/' |\ sed -e 's/XY/X/' >j.chr cut -f 2-5 illuminaLookup.err >j.2 cut -f 6 illuminaLookup.err >j.3 cat j.3 |sed -e 's/F/+/' |sed -e 's/R/-/' >j.strand cut -f 7 illuminaLookup.err |sed -e "s/\[//" |sed -e "s/\]//" >j.observed paste j.chr j.2 j.strand j.observed >illuminaLookup1Mb.out # combine two parts cat illuminaLookup1Ma.out illuminaLookup1Mb.out >snpArrayIllumina1M.tab # load the table hgLoadBed hg18 snpArrayIllumina1M snpArrayIllumina1M.tab -tab -sqlTable=snpArrayIllumina1M.sql ############################################################################# # BLASTZ/CHAIN/NET micMur1 (DONE braney 2008-08-04 ) ssh kkstore02 screen # use screen to control this multi-day job mkdir /cluster/data/hg18/bed/blastz.micMur1.2008-08-03 cd /cluster/data/hg18/bed/blastz.micMur1.2008-08-03 cat << _EOF_ > DEF # Human vs. 
Mouse lemur BLASTZ_M=50 BLASTZ_T=2 BLASTZ=/cluster/home/braney/bin/x86_64/lastz # TARGET: Human Hg18 (whole chroms) SEQ1_DIR=/scratch/data/hg18/nib SEQ1_LEN=/cluster/data/hg18/chrom.sizes SEQ1_CHUNK=200000000 SEQ1_LAP=0 # QUERY: Mouse lemur SEQ2_DIR=/san/sanvol1/scratch/micMur1/micMur1.2bit SEQ2_LEN=/san/sanvol1/scratch/micMur1/chrom.sizes SEQ2_CHUNK=20000000 SEQ2_LIMIT=100 SEQ2_LAP=0 BASE=/cluster/data/hg18/bed/blastz.micMur1.2008-08-03 TMPDIR=/scratch/tmp _EOF_ # << happy emacs time doBlastzChainNet.pl `pwd`/DEF \ -verbose=2 -bigClusterHub=pk \ -chainMinScore=3000 -chainLinearGap=medium \ -blastzOutRoot /cluster/bluearc/micMur1/blastz.hg18 > do.log 2>&1 & # did remaining jobs on memk nice doBlastzChainNet.pl `pwd`/DEF \ -verbose=2 -bigClusterHub=pk -syntenicNet -continue=cat \ -chainMinScore=3000 -chainLinearGap=medium \ -blastzOutRoot /cluster/bluearc/micMur1/blastz.hg18 > do2.log 2>&1 & ssh hgwdev cd /cluster/data/hg18/bed ln -s `pwd`/blastz.micMur1.2008-08-03 /cluster/data/hg18/bed/blastz.micMur1 featureBits hg18 chainMicMur1Link # 1338330504 bases of 2881515245 (46.445%) in intersection ssh kkstore02 cd /cluster/data/hg18/bed/blastz.micMur1.2008-08-03 time nice -n +19 /cluster/bin/scripts/doRecipBest.pl hg18 micMur1 > rbest.log 2>&1 & ############################################################################# # BLASTZ/CHAIN/NET speTri1 (DONE braney 2008-08-05) ssh kkstore02 screen # use screen to control this multi-day job mkdir /cluster/data/hg18/bed/blastz.speTri1.2008-08-04 cd /cluster/data/hg18/bed/blastz.speTri1.2008-08-04 cat << _EOF_ > DEF # Human vs. Squirrel BLASTZ_M=50 BLASTZ=/cluster/home/braney/bin/x86_64/lastz # TARGET: Human Hg18 (whole chroms) SEQ1_DIR=/scratch/data/hg18/nib SEQ1_LEN=/cluster/data/hg18/chrom.sizes SEQ1_CHUNK=200000000 SEQ1_LAP=0 # QUERY: Squirrel SEQ2_DIR=/san/sanvol1/scratch/speTri1/speTri1.2bit SEQ2_LEN=/san/sanvol1/scratch/speTri1/chrom.sizes SEQ2_CHUNK=20000000 SEQ2_LIMIT=100 SEQ2_LAP=0 BASE=/cluster/data/hg18/bed/blastz.speTri1.2008-08-04 TMPDIR=/scratch/tmp _EOF_ # << happy emacs time doBlastzChainNet.pl `pwd`/DEF \ -verbose=2 -bigClusterHub=pk \ -chainMinScore=3000 -chainLinearGap=medium \ -blastzOutRoot /cluster/bluearc/speTri1/blastz.hg18 > do.log 2>&1 & # did crashed jobs on memk nice doBlastzChainNet.pl `pwd`/DEF \ -verbose=2 -bigClusterHub=pk -syntenicNet -continue=cat \ -chainMinScore=3000 -chainLinearGap=medium \ -blastzOutRoot /cluster/bluearc/speTri1/blastz.hg18 > do2.log 2>&1 & # had to run netChains.csh by hand due to PATH problem nice doBlastzChainNet.pl `pwd`/DEF \ -verbose=2 -bigClusterHub=pk -syntenicNet -continue=load \ -chainMinScore=3000 -chainLinearGap=medium \ -blastzOutRoot /cluster/bluearc/speTri1/blastz.hg18 > do3.log 2>&1 & ssh hgwdev cd /cluster/data/hg18/bed ln -s `pwd`/blastz.speTri1.2008-08-04 /cluster/data/hg18/bed/blastz.speTri1 featureBits hg18 chainSpeTri1Link # 1032377454 bases of 2881515245 (35.828%) in intersection ssh kkstore02 cd /cluster/data/hg18/bed/blastz.speTri1.2008-08-04 nice -n +19 /cluster/bin/scripts/doRecipBest.pl hg18 speTri1 > rbest.log 2>&1 & ####################################################### ## 44-way multiz (braney working.... mkdir /cluster/data/hg18/bed/multiz44way cd /cluster/data/hg18/bed/multiz44way cp /cluster/data/mm9/bed/multiz30way/mm9.guess.30way.nh . 
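    # A minimal sketch (not part of the original run log) for listing the leaf
    # names in the copied Newick tree before it is edited and re-rooted; it only
    # assumes the leaves are written as "name:branchLength".
    sed -e 's/[(),;]/\n/g' mm9.guess.30way.nh | sed -e 's/:.*$//' \
	| grep -v '^ *$' | sort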
# get mammal tree from Michele Clamp (clamp.nh)
# that I re-rooted

#######################################################
# UW nucleosome occupancy predictions (2008-08-13 markd)
#  update due to chr3 being truncated (2009-05-12 markd)
# contact William Stafford Noble

# obtain data:
    mkdir -p /cluster/data/hg18/bed/uwNucOcc
    cd /cluster/data/hg18/bed/uwNucOcc
    wget http://USER:PASS@noble.gs.washington.edu/~noble/proj/dennis/results/2008-08-11/ucsc.tgz
    mkdir wig
    cd wig
    tar -zxf ../ucsc.tgz
    cd ..
    rm ucsc.tgz

# encode and load wiggles
    ssh kkstore02
    cd /cluster/data/hg18/bed/uwNucOcc/wib
    zcat ../wig/a375/a375.chr*.wig.gz|wigEncode stdin uwNucOccA375.wig uwNucOccA375.wib
    # Converted stdin, upper limit 9.88, lower limit -5.19
    zcat ../wig/dennis/dennis.chr*.wig.gz|wigEncode stdin uwNucOccDennis.wig uwNucOccDennis.wib
    # Converted stdin, upper limit 8.26, lower limit -9.68
    zcat ../wig/mec/mec.chr*.wig.gz|wigEncode stdin uwNucOccMec.wig uwNucOccMec.wib
    # Converted stdin, upper limit 5.05, lower limit -9.86

# link-n-load
    ssh hgwdev
    cd /cluster/data/hg18/bed/uwNucOcc/wib
    ln -s /cluster/data/hg18/bed/uwNucOcc/wib/uwNucOccA375.wib /gbdb/hg18/wib
    hgLoadWiggle -pathPrefix=/gbdb/hg18/wib hg18 uwNucOccA375 uwNucOccA375.wig
    ln -s /cluster/data/hg18/bed/uwNucOcc/wib/uwNucOccDennis.wib /gbdb/hg18/wib
    hgLoadWiggle -pathPrefix=/gbdb/hg18/wib hg18 uwNucOccDennis uwNucOccDennis.wig
    ln -s /cluster/data/hg18/bed/uwNucOcc/wib/uwNucOccMec.wib /gbdb/hg18/wib
    hgLoadWiggle -pathPrefix=/gbdb/hg18/wib hg18 uwNucOccMec uwNucOccMec.wig
    rm wiggle.tab

# noble lab supplied update due to chr3 being truncated (2009-05-12 markd)
    cd /cluster/data/hg18/bed/uwNucOcc
    mkdir bad
    mv wig/*/*.chr3.hg18.wig.gz bad/
    mv wib bad/
    wget http://USER:PASS@noble.gs.washington.edu/~wnoble/proj/dennis/results/2008-08-11/a375/a375.chr3.hg18.wig.gz
    wget http://USER:PASS@noble.gs.washington.edu/~wnoble/proj/dennis/results/2008-08-11/mec/mec.chr3.hg18.wig.gz
    wget http://USER:PASS@noble.gs.washington.edu/~wnoble/proj/dennis/results/2008-08-11/dennis/dennis.chr3.hg18.wig.gz
    mv dennis.chr3.hg18.wig.gz wig/dennis/
    mv mec.chr3.hg18.wig.gz wig/mec/
    mv a375.chr3.hg18.wig.gz wig/a375/

    cd /cluster/data/hg18/bed/uwNucOcc/wib
    zcat ../wig/a375/a375.chr*.wig.gz|wigEncode stdin uwNucOccA375.wig uwNucOccA375.wib
    zcat ../wig/dennis/dennis.chr*.wig.gz|wigEncode stdin uwNucOccDennis.wig uwNucOccDennis.wib
    zcat ../wig/mec/mec.chr*.wig.gz|wigEncode stdin uwNucOccMec.wig uwNucOccMec.wib
    cd ..
    hgLoadWiggle -pathPrefix=/gbdb/hg18/wib hg18 uwNucOccA375 uwNucOccA375.wig
    hgLoadWiggle -pathPrefix=/gbdb/hg18/wib hg18 uwNucOccDennis uwNucOccDennis.wig
    hgLoadWiggle -pathPrefix=/gbdb/hg18/wib hg18 uwNucOccMec uwNucOccMec.wig

#########################################################################
# BLASTZ/CHAIN/NET oryLat2 (DONE - 2008-08-19,25 - Hiram)
    ssh kkstore02
    screen # use a screen to manage this longish running job
    mkdir /cluster/data/hg18/bed/blastzOryLat2.2008-08-19
    cd /cluster/data/hg18/bed/blastzOryLat2.2008-08-19
    cat << '_EOF_' > DEF
# Human vs.
Medaka BLASTZ=/cluster/bin/penn/x86_64/lastz # typical parameters for a genome that is distant from human BLASTZ_Y=3400 BLASTZ_L=6000 BLASTZ_K=2200 BLASTZ_M=50 BLASTZ_Q=/scratch/data/blastz/HoxD55.q # TARGET: Human hg18, randoms complete, as they are, no contig confusion SEQ1_DIR=/scratch/data/hg18/hg18.2bit SEQ1_LEN=/scratch/data/hg18/chrom.sizes SEQ1_CHUNK=10000000 SEQ1_LAP=10000 SEQ1_LIMIT=1 # QUERY: Medaka oryLat2 (40M chunks covers the largest chroms in one gulp) SEQ2_DIR=/scratch/data/oryLat2/oryLat2.2bit SEQ2_LEN=/scratch/data/oryLat2/chrom.sizes SEQ2_CHUNK=40000000 SEQ2_LIMIT=200 SEQ2_LAP=0 BASE=/cluster/data/hg18/bed/blastzOryLat2.2008-08-19 TMPDIR=/scratch/tmp '_EOF_' # << this line keeps emacs coloring happy time doBlastzChainNet.pl `pwd`/DEF \ -chainMinScore=5000 -chainLinearGap=loose \ -qRepeats=windowmaskerSdust \ -bigClusterHub=pk -verbose=2 > do.log 2>&1 & cat fb.hg18.chainOryLat2Link.txt # 52713428 bases of 2881515245 (1.829%) in intersection cd /cluster/data/hg18/bed ln -s blastzOryLat2.2008-08-19 blastz.oryLat2 # That is OK, now for the swap: mkdir /cluster/data/oryLat2/bed/blastz.hg18.swap cd /cluster/data/oryLat2/bed/blastz.hg18.swap time doBlastzChainNet.pl -verbose=2 -swap \ /cluster/data/hg18/bed/blastzOryLat2.2008-08-19/DEF \ -chainMinScore=5000 -chainLinearGap=loose \ -qRepeats=windowmaskerSdust \ -bigClusterHub=pk > swap.log 2>&1 & # real 17m9.675s cat fb.oryLat2.chainHg18Link.txt # 46961822 bases of 700386597 (6.705%) in intersection ######################################################################### # BLASTZ/CHAIN/NET TAEGUT1 (DONE braney 2008-09-10) ssh swarm screen mkdir /cluster/data/hg18/bed/blastz.taeGut1.2008-09-09 cd /cluster/data/hg18/bed/blastz.taeGut1.2008-09-09 cat << _EOF_ > DEF # human vs. zebra finch BLASTZ_M=50 # Copied settings from human vs galGal3 BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=10000 BLASTZ_K=2200 BLASTZ_Q=/cluster/data/blastz/HoxD55.q # TARGET: Human hg18 SEQ1_DIR=/scratch/data/hg18/hg18.2bit # SEQ1_SMSK=/hive/data/genomes/hg18/linSpecRep/notInChicken SEQ1_LEN=/scratch/data/hg18/chrom.sizes # one chrom at a time SEQ1_CHUNK=200000000 SEQ1_LAP=0 # QUERY: Zebra finch taeGut1 SEQ2_DIR=/scratch/data/taeGut1/taeGut1.2bit SEQ2_LEN=/scratch/data/taeGut1/chrom.sizes # SEQ2_DIR=/hive/data/genomes/taeGut1/taeGut1.2bit # SEQ2_LEN=/hive/data/genomes/taeGut1/chrom.sizes SEQ2_CTGDIR=/hive/data/genomes/taeGut1/taeGut1.blastz.2bit SEQ2_CTGLEN=/hive/data/genomes/taeGut1/taeGut1.blastz.sizes SEQ2_LIFT=/hive/data/genomes/taeGut1/jkStuff/liftAll.lft SEQ2_CHUNK=20000000 SEQ2_LAP=0 SEQ2_LIMIT=100 BASE=/hive/data/genomes/hg18/bed/blastz.taeGut1.2008-09-09 _EOF_ # << emacs doBlastzChainNet.pl -syntenicNet \ -bigClusterHub=swarm -chainMinScore=5000 -chainLinearGap=loose \ -smallClusterHub=swarm DEF -workhorse=swarm \ -qRepeats=windowmaskerSdust > do.log 2>&1 # Completed: 14910 of 14910 jobs # CPU time in finished jobs: 2744737s 45745.62m 762.43h 31.77d 0.087 y # IO & Wait Time: 1493361s 24889.34m 414.82h 17.28d 0.047 y # Average job time: 284s 4.74m 0.08h 0.00d # Longest finished job: 3678s 61.30m 1.02h 0.04d # Submission to last job: 6687s 111.45m 1.86h 0.08d cd /cluster/data/hg18/bed rm -f blastz.taeGut1 ln -s blastz.taeGut1.2008-09-09 /cluster/data/hg18/bed/blastz.taeGut1 ################################################################ # HUMAN FETAL BRAIN EXON ARRAYS (YALE) (Andy) ssh hgwdev bash mkdir /hive/data/genomes/hg18/bed/yaleMicroarrays cd /hive/data/genomes/hg18/bed/yaleMicroarrays cp /var/ftp/encode/Sestan_fetal_brain_exon_arrays.rar . 
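# Optional sanity check (a sketch, not recorded in this log): list the archive
# contents before extracting, assuming the same "rar" binary used below
# supports the "l" (list) command.
rar l Sestan_fetal_brain_exon_arrays.rar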
rar e Sestan_fetal_brain_exon_arrays.rar tail +2 18_19_21_23_full_SLR_converted.txt | grep -v "\-\-\-" > sestanBrainAtlas.bed hgLoadBed hg18 sestanBrainAtlas{,.bed} # just a little array name organization head -n1 18_19_21_23_full_SLR_converted.txt | \ sed 's/.*expNames=\"//;s/\"\ name=.*//;s/\.CEL//g' | \ tr ',' '\n' | sed '/^$/d' | grep -n '' | tr ':' '\t' | \ awk 'BEGIN{OFS="\t";}{$1=$1 - 1; print;}' \ > arrays.txt awk 'BEGIN{OFS="\t";}{print $1, $2, $2, "n/a", "n/a", "n/a", "103", "n/a,n/a,"$2",";}' \ arrays.txt > sestanBrainAtlasExps.tab ln -s ~/kent/src/hg/lib/expRecord.sql sestanBrainAtlasExps.sql hgLoadSqlTab hgFixed sestanBrainAtlasExps{,.sql,.tab} # Removed some of the arrays... the manual way # something's weird tr '\r' '\n' < sestanBrainAtlas.bed | sed '/^$/d; s/$/,/' > ses.bed cut -f1-14 ses.bed | \ awk 'BEGIN{FS="\t";OFS="\t"}{$2 = $2 - 1; $13 = $13 - 8; print;}' | \ sed 's/95,96,97,98,99,100,101,102//' > ses14.bed cut -f15 ses.bed | cut -d',' -f1-74,77-92,99- > ses15.bed paste ses14.bed ses15.bed > newSestan.bed hgLoadBed hg18 sestanBrainAtlas newSestan.bed ln -s ~/kent/src/hg/makeDb/hgCgiData/Human/microarrayGroups.ra grep -A5 sestanBrainAtlasAll microarrayGroups.ra | grep "^names" | sed 's/names //' | tr ',' '\n' | sed '/^$/d' > namesCol.txt grep -A5 sestanBrainAtlasAll microarrayGroups.ra | grep "^expIds" | sed 's/expIds //' | tr ',' '\n' | sed '/^$/d' >expCol.txt paste expCol.txt namesCol.txt > arrays.txt awk 'BEGIN{OFS="\t";}{print $1, $2, $2, "n/a", "n/a", "n/a", "103", "n/a,n/a,"$2",";}' \ arrays.txt > sestanBrainAtlasExps.tab hgLoadSqlTab hgFixed sestanBrainAtlasExps{,.sql,.tab} ssh kolossus ################################################################ # HUMAN TISSUE EXON ARRAYS (Melissa Cline, cline@biology.ucsc.edu, 10/14/08) # # AffyExonTissue Step 1: download exon array coordinate data from Affy # and extract coordinates Download HuEx-1_0-st-v2 Annotations, Full, # Hg18/Build 36 gff. The file is available at # http://www.affymetrix.com/support/technical/byproduct.affx?product=huexon-st # and the download requires logging in to NetAffx (free, registration # required) Uncompress the GFF files. 
Parse out key fields with the # script below, generating hg18.affy.exon.coords.tab # #--------- #!/usr/bin/env perl =pod =head1 NAME parseGffArrayData.pl =head1 SYNOPSYS cat *gff |parseGffArrayData.pl > array.coords.tab =head1 DESCRIPTION Parses probeset coordinates out of the Affy design data =cut { use strict; use Getopt::Long; use GFF; use GFF::GeneFeature; use FileHandle; print "chr\tstart\tend\tID\tscore\tstrand\n"; while (my $line = <>) { chomp; my @tokens = split /\s/, $line; if ($tokens[2] eq "probeset") { my $gffFeature = new GFF::GeneFeature; my $gffData = $gffFeature->new_from_line($line); my $probesetId = $gffData->group_value('probeset_id'); my $probesetLevel = $gffData->group_value('level'); my $bounded = $gffData->group_value('bounded'); my $cds = $gffData->group_value('cds'); my $score; if ($probesetLevel eq "core") { $score = 900; } elsif ($probesetLevel eq "extended") { $score = 500; } else { $score = 200; } if ($bounded) { $score -= 200; } if ($cds) { $score += 100; } if ($score < 100) { $score = 100; } print($gffData->seqname(), "\t", $gffData->start(), "\t", $gffData->end(), "\t", $probesetId, "\t$score\t", $gffData->strand(), "\n"); } } } #------- # # AffyExonTissue Step 2: download tissue data from Affy, generate bed15 file # # Download Human Exon 1.0 ST APT results from # http://www.affymetrix.com/support/technical/sample_data/exon_array_data.affx # (requires free registration and login, as above) # Uncompress, and get rid of the undesired tissue mixture columns. cut -f 1-34 \ < apt-probeset-summarize-results-exon/quant-norm.pm-gcbg.plier.summary.txt \ > quant-norm.pm-gcbg.plier.nomix.summary.txt # # Generate a bed15 file using the command below, and script below that. # For the purposes of generating a track, ignore the first line. # arrayToBed15.py \ --coordinates hg18.affy.exon.coords.tab \ --plier quant-norm.pm-gcbg.plier.nomix.summary.txt \ --name "humanExon" \ --groups "breast,breast,breast,cerebellum,cerebellum,cerebellum,heart,heart,heart,kidney,kidney,kidney,liver,liver,liver,muscle,muscle,muscle,pancreas,pancreas,pancreas,prostate,prostate,prostate,spleen,spleen,spleen,testes,testes,testes,thyroid,thyroid,thyroid" \ |tail -n +2 > human.exon.headless.bed15 #--- #!/usr/bin/python from optparse import OptionParser import math import re # # get the genomic probeset coordinates # def parseProbesetCoordinates(coordinatesFilename): """Build a dictionary of coordinates from a tab-delmited file""" coordinateData = {} coordinatesFileHandle = open(coordinatesFilename) coordinatesFileHandle.readline() # skip the header line for line in coordinatesFileHandle: line = line.rstrip(); tokens = line.split('\t') id = tokens[3] coordinateData[id] = tokens; return(coordinateData) def median(numbers): """Sort the input list and return the middle element.""" nn = len(numbers) copy = numbers[:] # So that "numbers" keeps its original order copy.sort() if nn & 1: # There is an odd number of elements return copy[nn // 2] else: return (copy[nn // 2] + copy[nn // 2 - 1]) / 2 def medianOfMedians(experimentNames, experimentValues): """Given replicated values, find the median of the replicate medians.""" # Create a dictionary to sort the values by experiment set replicates = {} # # Group the epxeriments into replicate sets by experiment names. # This assumes that experiments in the same replicate set have the # same name. 
# for ii in range(0,len(experimentNames)-1): if not replicates.has_key(experimentNames[ii]): replicates[experimentNames[ii]] = [experimentValues[ii]] else : replicates[experimentNames[ii]].append(experimentValues[ii]) # Make a list containing the median value of each replicate set. medians = list() for replicateSet in replicates.keys() : values = replicates[replicateSet] thisMedian = median(values) medians.append(thisMedian) # Now get the median value of the median list medianValue = median(medians) return(medianValue) def printHeaderData(experimentList, trackName): """Print a header line for a bed15 file""" expNames = ",".join(experimentList) print "track type=\"array\" expScale=3 expStep=0.5 ", print " name=\"" + str(trackName) + "\"", \ " description=\"Microarray custom track\"", print " expNames=" "\"" + expNames + "\"" def printPlierResults(resultsLine, experimentGroups, probesetCoordinates): """median-center a line of expression results, print in bed15 format""" background = 10 plierResultsLine = line.split('\t') probesetId = plierResultsLine[0] del plierResultsLine[0] if probesetCoordinates.has_key(probesetId): coordinates = probesetCoordinates[probesetId] # # Given coordinate data (chr start end ID score strand) # and given experimental data (ID exp1 exp2 exp3 ... expN) # Print as follows: # 1. Basic bedfile stuff: chromosome, start, end, name, score, # strand, thickStart (=start), thickEnd (=end), 0, blocks (=1), # blocklengths (=end-start+1,), blockstarts (=0,) # start = int(coordinates[1]) - 1 end = int(coordinates[2]) length = end - start print str(coordinates[0]) + "\t" + str(start) + "\t" \ + str(end) + "\t" + str(probesetId) + "\t", \ coordinates[4], "\t", coordinates[5], "\t", start, "\t", \ end, "\t0\t1\t", \ str(length) + ",\t", "0,\t", # # Continue with microarray-specific stuff: # - experiment count # - comma-separated list of experiment IDs (0 .. 
experimentCount) # - comma-separated list of experiment scores (log(result)-log(median)) # experimentCount = len(plierResultsLine) experimentValues = list() for value in plierResultsLine: experimentValues.append(float(value)) medianValue = medianOfMedians(experimentGroups, experimentValues) logMedian = math.log(medianValue+background) valuesStrings = list() for thisValue in experimentValues: thisScore = math.log(thisValue+background) - logMedian valuesStrings.append(str(thisScore)) experimentScoreString = ",".join(valuesStrings) ids = list() for ii in range(0, experimentCount): ids.append(str(ii)) experimentIdString = ",".join(ids) print experimentCount, "\t", experimentIdString, "\t", \ experimentScoreString return parser = OptionParser() parser.add_option("--coordinates", dest="coordinatesFile") parser.add_option("--plier", dest="plierResultsFile") parser.add_option("--name", dest="trackName") parser.add_option("--groups", dest="experimentGroups") (parameters, args) = parser.parse_args() experimentGroups = parameters.experimentGroups.split(",") probesetCoordinates = parseProbesetCoordinates(parameters.coordinatesFile) plierResults = open(parameters.plierResultsFile) for line in plierResults: line = line.rstrip() if (re.search("^#", line)) : continue elif (re.search("^probeset_id", line)) : printHeaderData(experimentGroups, parameters.trackName) else : printPlierResults(line, experimentGroups, probesetCoordinates) #--- # # AffyExonTissue Step 3: set up a browser track from the bed15 file # created offline: trackDb.affyExonTissues.ra, # affyExonTissues.html, # microarrayGroups.affyExonTissues.ra # cat $KENT/src/hg/makeDb/trackDb/human/trackDb.ra trackDb.affyExonTissues.ra \ > trackDb.new.ra cp trackDb.new.ra $KENT/src/hg/makeDb/trackDb/human/trackDb.ra cp affyExonTissues.html $KENT/src/hg/makeDb/trackDb/human cat $KENT/src/hg/makeDb/hgCgiData/Human/microarrayGroups.ra \ microarrayGroups.affyExonTissues.ra > microarrayGroups.new.ra hgLoadBed hg18 affyExonTissues human.exon.headless.bed15 cd $KENT/src/hg/makeDb/trackDb make update DBS="hg17 hg18" cd $KENT/src make -j8 cgi >& ~/make.j8.cgi.errout # # AffyExonTissue Step 4: load the appropriate fields into hgFixed # grep -A5 affyExonTissuesAll microarrayGroups.ra | grep "^names" \ | sed 's/names //' | tr ',' '\n' | sed '/^$/d' | sed 's/^\s\+//' > n.txt grep -A5 affyExonTissuesAll microarrayGroups.ra | grep "^expIds" \ | sed 's/expIds //' | tr ',' '\n' | sed '/^$/d' | sed 's/^\s\+//' > e.txt paste e.txt n.txt > a.txt awk 'BEGIN{OFS="\t";} {print $1, $2, $2, "n/a", "n/a", "n/a", "33", "n/a,n/a,"$2",";}' a.txt \ > exps.tab ln -s ../../../lib/expRecord.sql hgLoadSqlTab hgFixed affyMouseExonTissuesAllExps expRecord.sql exps.tab rm a.txt n.txt e.txt exps.tab ############ ######################################################################## ## AFFY ALL EXON PROBESETS (HG18/MM9/RN4) (DONE 2009-01-29, Andy) ## 1. Log into Affymetrix netaffx site. ## 2. 
Use Firefox add-on "Export Cookies" to save a file called cookies.txt ssh hgwdev grep affymetrix.com cookies.txt > affycookies.txt wget --load-cookies affycookies.txt http://www.affymetrix.com/Auth/analysis/downloads/na27/wtexon/HuEx-1_0-st-v2.na27.hg18.probeset.csv.zip wget --load-cookies affycookies.txt http://www.affymetrix.com/Auth/analysis/downloads/na27/wtexon/MoEx-1_0-st-v1.na27.mm9.probeset.csv.zip wget --load-cookies affycookies.txt http://www.affymetrix.com/Auth/analysis/downloads/na27/wtexon/RaEx-1_0-st-v1.na27.rn4.probeset.csv.zip rm affycookies.txt for z in *.zip; do unzip $z; done rm *.zip ln -s HuEx-1_0-st-v2.na27.hg18.probeset.csv hg18.csv ln -s RaEx-1_0-st-v1.na27.rn4.probeset.csv rn4.csv ln -s MoEx-1_0-st-v1.na27.mm9.probeset.csv mm9.csv for csv in {hg18,mm9,rn4}.csv; do bed=${csv%.csv}.bed sed '1,20d' $csv | tr ',' '\t' | sed 's/\"//g' | cut -f1-5,16 \ | grep -v "\-\-\-" \ | awk 'BEGIN{FS="\t";OFS="\t";}{if ($6 == "core") score = 1000; else if ($6 == "extended") score = 700; else if ($6 == "full") score = 300; else score = 100; name = $1"|"$6; print $2, $4-1, $5, name, score, $3}' \ | bedSort stdin $bed done for db in hg18 mm9 rn4; do hgLoadBed $db affyAllExonProbes $db.bed; done rm hg18.csv mm9.csv rn4.csv gzip *.bed *.csv mkdir -p /hive/data/genomes/{hg18,mm9,rn4}/bed/affyAllExonProbes mv HuEx-1_0-st-v2.na27.* hg18.bed.gz /hive/data/genomes/hg18/bed/affyAllExonProbes/ mv MoEx-1_0-st-v1.na27.* mm9.bed.gz /hive/data/genomes/mm9/bed/affyAllExonProbes/ mv * /hive/data/genomes/rn4/bed/affyAllExonProbes/ ## forgot mm8 (see mm8.txt for that one) ################################################ # SPLIT EXPRESSION & REGULATION GROUPS # (2008-09-09 kate) echo "insert into grp (name, label, priority, defaultIsClosed) values ('expression', 'Expression', 4.5, 1)" | hgsql hg18 echo "update grp set label='Regulation' where name='regulation'" | hgsql hg18 ############################################################################ # KIDD/EICHLER DISCORDANT CLONE ENDS (DONE 9/16/08 angie) ssh hgwdev mkdir /cluster/data/hg18/bed/kiddEichlerDiscordant cd /cluster/data/hg18/bed/kiddEichlerDiscordant foreach i (ABC7 ABC8 ABC9 ABC10 ABC11 ABC12 ABC13 ABC14 G248) wget --user=uuuu --password=ppppppp \ http://eichlerlab.gs.washington.edu/kiddj/hg18_fosmidmap/$i.bestdiscordant.sorted.gz end # Load the tracks (translate bacEndPairs-inspired format to bed12): foreach f (*.gz) set track = `echo $f:r:r:r \ | perl -wpe 's/^([AG])(\w+)$/kiddEichlerDisc$1\L$2/ || die;'` if ($status != 0) break echo $track zcat $f \ | perl -wpe 'if (/^chrom\s+chromStart/) {s/^.*\n$//; next;} \ my ($c, $s, $e, $n, $sc, $st, $bSt, $bSz, undef, $t) = split; \ @bSts = split(",", $bSt); @bSzs = split(",", $bSz); \ if ($t =~ /^transchrm_/) { \ @bSts = (0); @bSzs = ($e - $s); \ } elsif ($t =~ /^OEA_/) { \ die "\nERROR: bSts[0] $bSts[0] != s $s\n" if ($bSts[0] != $s); \ $bSzs[0]--; \ $bE = $bSts[0] + $bSzs[0]; \ die "bE $bE != e $e\n" if ($bE != $e); \ $bSts[0] -= $s; \ } elsif ($#bSts == 1) { \ if ($bSts[0] > $bSts[1]) { \ # warn "Swapping $n ($bSts[0] > $bSts[1])\n"; \ $tmp = $bSts[0]; $bSts[0] = $bSts[1]; $bSts[1] = $tmp; \ $tmp = $bSzs[0]; $bSzs[0] = $bSzs[1]; $bSzs[1] = $tmp; \ } \ if ($bSts[0] != $s) { \ die "\nERROR: n=$n,$t: bSts[0]=$bSts[0] but s=$s\n\t"; \ } \ $bSzs[0]--; $bSzs[1]--; \ $bE0 = $bSts[0] + $bSzs[0]; \ $bE1 = $bSts[1] + $bSzs[1]; \ $bE = $bE0 > $bE1 ? 
$bE0 : $bE1; \ if ($bE != $e) { \ warn "n=$n,$t: bE0=$bE0, bE1=$bE1, bE=$bE, e=$e\n"; \ if ($bE1 > $e) { \ warn "n=$n,$t: tweaking bSzs[1] (clip to chromEnd)\n"; \ $bSzs[1] = $e - $bSts[1]; \ } \ } \ $bSts[0] -= $s; $bSts[1] -= $s; \ } else { die "t is $t but \$#bSts is $#bSts"; } \ $bSt = join(",", @bSts) . ","; $bSz = join(",", @bSzs) . ","; \ $rgb = ($t =~ /^deletion/) ? "224,0,0" : \ ($t =~ /^insertion/) ? "0,0,224" : \ ($t =~ /^inversion/) ? "0,224,0" : \ ($t =~ /^OEA/) ? "240,160,64" : "0,0,0"; \ $_ = join("\t", $c, $s, $e, "$n,$t", int($sc+0.5), $st, $s, $e, $rgb, \ scalar(@bSzs), $bSz, $bSt) . "\n";' \ | hgLoadBed -tab hg18 $track stdin checkTableCoords hg18 $track end # Tons of overlapping block and blockEnd[n-1]!=end warnings from # checkTableCoords -- but these are discordant mappings, so we # expect those. Make sure there aren't any other types of errors: foreach f (*.gz) set track = `echo $f:r:r:r \ checkTableCoords hg18 $track |& egrep -v 'overlapping|!= end'` end # No output, good. # Get clone ID -> NCBI acc mapping (same as for hg17; redownloaded to # make sure). mkdir /cluster/data/hg18/bed/kiddEichlerDiscordant/cloneIds cd /cluster/data/hg18/bed/kiddEichlerDiscordant/cloneIds # Saved off emailed file from Jeff Kidd to clones_used_3nov.txt.accessions; # get trace archive trace names for end reads: foreach n (7 9 10 11 12 13 14) wget http://hgsv.washington.edu/general/download/clone_mapping/ABC$n/ABC$n.conversion.gz end # ABC8 has _a and _b files: wget http://hgsv.washington.edu/general/download/clone_mapping/ABC8/ABC8_a.conversion.gz wget http://hgsv.washington.edu/general/download/clone_mapping/ABC8/ABC8_b.conversion.gz # That file is not available for G248. gunzip *.gz # Combine the relevant data from the .conversion files; keep only those # IDs that are used in the tracks. zcat ../[AG]*.gz \ | cut -f 4 \ | egrep -v '^(#chrom|track|name)' \ | sed -e 's/,.*//' \ | sort -u > discIds.txt grep -h -v ^163722_163722- *.conversion \ | perl -wpe 's/^OurClone.*\n// || s/^\d+_(HUMAN|\d+_).*\n$// || \ s/^(\d+_)?(ABC|G)(\d+)(_\d\d?)?(_\d\d?)?_0*(\d+?_[A-Z]\d\d?)\.(F|FORWARD|R|REVERSE)\.\d+\t(\w+)$/$2$3_$6\t$7\t$8/ || \ warn "Parse line $.:\n$_";' \ | sort > allEnds.tab grep -wFf discIds.txt allEnds.tab > discEnds.txt wc -l discIds.txt allEnds.tab discEnds.txt # 352330 discIds.txt # 17490847 allEnds.tab # 781513 discEnds.txt # discEnds.txt has 2 lines (forward & reverse) for most of its ids... # ideally we would see 2*(352330) lines in discEnds.txt. # Get a list of which discordant clone IDs don't have ends in *.conv*: cut -f 1 allEnds.tab | uniq > all.tmp comm -23 discIds.txt all.tmp > discNotInConv.txt wc -l discNotInConv.txt #41853 discNotInConv.txt cat > combine.pl <<'_EOF_' #!/usr/bin/perl -w use strict; my ($cloneFile, $endsFile) = @ARGV; open(CLONES, $cloneFile) || die "Can't open $cloneFile: $!\n"; my %idInfo; while() { (s/^(\d+_)?(ABC|G)(\d+)(_\d\d?)?(_\d\d?)?_0*(\d+?_[A-Z]\d\d?)\t(\w+)$/$2$3_$6\t$7/ && m/^(\w+)\t(\w+)/) || \ m/^(G248\w+)\t(\w+)$/ || die "Parse line $.:$_"; my ($id, $acc) = ($1, $2); $idInfo{$id}->[0] = $acc; } close(CLONES); open(ENDS, $endsFile) || die "Can't open $endsFile: $!\n"; while () { chomp; my ($id, $dir, $traceName) = split("\t"); if ($dir =~ /^F/) { $idInfo{$id}->[1] = $traceName; } elsif ($dir =~ /^R/) { $idInfo{$id}->[2] = $traceName; } else { die "What is this \$dir: $dir ?\n"; } } close(ENDS); foreach my $id (sort keys %idInfo) { my $infoRef = $idInfo{$id}; $infoRef->[0] = '' if (! defined $infoRef->[0]); $infoRef->[1] = 0 if (! 
defined $infoRef->[1]); $infoRef->[2] = 0 if (! defined $infoRef->[2]); print join("\t", $id, @{$infoRef}) . "\n"; } '_EOF_' # << emacs chmod a+x combine.pl combine.pl clones_used_3nov.txt.accessions discEnds.txt \ | sort > kiddEichlerToNcbi.txt # Load table: hgLoadSqlTab hg18 kiddEichlerToNcbi \ $HOME/kent/src/hg/lib/kiddEichlerToNcbi.sql kiddEichlerToNcbi.txt # Add to makeDb/schema/all.joiner, then check: runJoiner.csh hg18 kiddEichlerToNcbi $HOME/kent/src/hg/makeDb/schema ############################################################################ # hgPal downloads 28way refGene, knownGene, knownCanonical ssh hgwdev screen bash rm -rf /cluster/data/hg18/bed/multiz28way/pal mkdir /cluster/data/hg18/bed/multiz28way/pal cd /cluster/data/hg18/bed/multiz28way/pal cat > order.lst < ppredAA/$j.ppredAA.fa.gz" echo "mafGene -chrom=$j -noTrans $db $mz $gp order.lst stdout | \ gzip -c > ppredNuc/$j.ppredNuc.fa.gz" echo "mafGene -chrom=$j -exons -noTrans $db $mz $gp order.lst stdout | \ gzip -c > exonNuc/$j.exonNuc.fa.gz" echo "mafGene -chrom=$j -exons $db $mz $gp order.lst stdout | \ gzip -c > exonAA/$j.exonAA.fa.gz" done > $gp.jobs time sh -x $gp.jobs > $gp.jobs.log 2>&1 & sleep 1 tail -f $gp.jobs.log # real 232m24.611s # user 13m59.669s # sys 5m5.601s zcat exonAA/*.gz | gzip -c > $gp.$mz.exonAA.fa.gz zcat exonNuc/*.gz | gzip -c > $gp.$mz.exonNuc.fa.gz zcat ppredAA/*.gz | gzip -c > $gp.$mz.ppredAA.fa.gz zcat ppredNuc/*.gz | gzip -c > $gp.$mz.ppredNuc.fa.gz rm -rf exonAA exonNuc ppredAA ppredNuc # we're only distributing exons at the moment mz=multiz28way gp=refGene db=hg18 pd=/usr/local/apache/htdocs/goldenPath/$db/$mz ln -s `pwd`/$gp.$mz.exonAA.fa.gz $pd/$gp.exonAA.fa.gz ln -s `pwd`/$gp.$mz.exonNuc.fa.gz $pd/$gp.exonNuc.fa.gz mz=multiz28way gp=knownGene db=hg18 mkdir exonAA exonNuc ppredAA ppredNuc for j in `sort -nk 2 /cluster/data/$db/chrom.sizes | awk '{print $1}'` do echo "date" echo "mafGene -chrom=$j $db $mz $gp order.lst stdout | \ gzip -c > ppredAA/$j.ppredAA.fa.gz" echo "mafGene -chrom=$j -noTrans $db $mz $gp order.lst stdout | \ gzip -c > ppredNuc/$j.ppredNuc.fa.gz" echo "mafGene -chrom=$j -exons -noTrans $db $mz $gp order.lst stdout | \ gzip -c > exonNuc/$j.exonNuc.fa.gz" echo "mafGene -chrom=$j -exons $db $mz $gp order.lst stdout | \ gzip -c > exonAA/$j.exonAA.fa.gz" done > $gp.$mz.jobs time sh -x $gp.$mz.jobs > $gp.$mz.job.log 2>&1 & sleep 1 tail -f $gp.$mz.job.log # real 248m39.293s # user 23m30.788s # sys 8m2.714s zcat exonAA/c*.gz | gzip -c > $gp.$mz.exonAA.fa.gz zcat exonNuc/c*.gz | gzip -c > $gp.$mz.exonNuc.fa.gz zcat ppredAA/c*.gz | gzip -c > $gp.$mz.ppredAA.fa.gz zcat ppredNuc/c*.gz | gzip -c > $gp.$mz.ppredNuc.fa.gz rm -rf exonAA exonNuc ppredAA ppredNuc pd=/usr/local/apache/htdocs/goldenPath/$db/$mz ln -s `pwd`/$gp.$mz.exonAA.fa.gz $pd/$gp.exonAA.fa.gz ln -s `pwd`/$gp.$mz.exonNuc.fa.gz $pd/$gp.exonNuc.fa.gz # now do the canonical set cd /cluster/data/hg18/bed/multiz28way/pal mz=multiz28way gp=knownCanonical db=hg18 for j in `awk '{print $1}' /cluster/data/hg18/chrom.sizes` do echo "select chrom, chromStart, chromEnd, transcript from knownCanonical where chrom='$j'" | hgsql $db | tail -n +2 > $j.known.bed done mkdir exonAA exonNuc ppredAA ppredNuc for j in `sort -nk 2 /cluster/data/$db/chrom.sizes | awk '{print $1}'` do echo "date" echo "mafGene -geneBeds=$j.known.bed $db $mz knownGene order.lst stdout | \ gzip -c > ppredAA/$j.ppredAA.fa.gz" echo "mafGene -geneBeds=$j.known.bed -noTrans $db $mz knownGene order.lst stdout | \ gzip -c > ppredNuc/$j.ppredNuc.fa.gz" echo 
"mafGene -geneBeds=$j.known.bed -exons -noTrans $db $mz knownGene order.lst stdout | \ gzip -c > exonNuc/$j.exonNuc.fa.gz" echo "mafGene -geneBeds=$j.known.bed -exons $db $mz knownGene order.lst stdout | \ gzip -c > exonAA/$j.exonAA.fa.gz" done > $gp.$mz.jobs time sh -x $gp.$mz.jobs > $gp.$mz.job.log 2>&1 & sleep 1 tail -f $gp.$mz.job.log # real 216m41.700s # user 10m22.016s # sys 4m6.917s rm *.known.bed zcat exonAA/c*.gz | gzip -c > $gp.$mz.exonAA.fa.gz zcat exonNuc/c*.gz | gzip -c > $gp.$mz.exonNuc.fa.gz zcat ppredAA/c*.gz | gzip -c > $gp.$mz.ppredAA.fa.gz zcat ppredNuc/c*.gz | gzip -c > $gp.$mz.ppredNuc.fa.gz rm -rf exonAA exonNuc ppredAA ppredNuc pd=/usr/local/apache/htdocs/goldenPath/$db/$mz ln -s `pwd`/$gp.$mz.exonAA.fa.gz $pd/$gp.exonAA.fa.gz ln -s `pwd`/$gp.$mz.exonNuc.fa.gz $pd/$gp.exonNuc.fa.gz ######################################################################### ################################################ # AUTOMATE UPSTREAM FILE CREATION (2008-10-15 markd) update genbank.conf: hg18.upstreamGeneTbl = refGene hg18.upstreamMaf = multiz28way /hive/data/genomes/hg18/bed/multiz28way/species.lst ######################################################################### # BarskiChIPseq tracks Begun: 2008-09-19 Finished: 2008-09-22 Tim # Barski, et al 2007 Paper - High-Resolution Mapping of Histone Modifications in the Human Genome # Solexa high-throughput sequencing: ChIPseq data # http://dir.nhlbi.nih.gov/Papers/lmi/epigenomes/hgtcell.html ssh hgwdev mkdir /hive/data/genomes/hg18/bed/Barski2007/lab cd /hive/data/genomes/hg18/bed/Barski2007/lab wget http://dir.nhlbi.nih.gov/Papers/lmi/epigenomes/data/H3K4me1.bed wget http://dir.nhlbi.nih.gov/Papers/lmi/epigenomes/data/H3K4me2.bed wget http://dir.nhlbi.nih.gov/Papers/lmi/epigenomes/data/H3K4me3.bed wget http://dir.nhlbi.nih.gov/Papers/lmi/epigenomes/data/H3K9me1.bed wget http://dir.nhlbi.nih.gov/Papers/lmi/epigenomes/data/H3K9me2.bed wget http://dir.nhlbi.nih.gov/Papers/lmi/epigenomes/data/H3K9me3.bed wget http://dir.nhlbi.nih.gov/Papers/lmi/epigenomes/data/H3K27me1.bed wget http://dir.nhlbi.nih.gov/Papers/lmi/epigenomes/data/H3K27me2.bed wget http://dir.nhlbi.nih.gov/Papers/lmi/epigenomes/data/H3K27me3.bed wget http://dir.nhlbi.nih.gov/Papers/lmi/epigenomes/data/H3K36me1.bed wget http://dir.nhlbi.nih.gov/Papers/lmi/epigenomes/data/H3K36me3.bed wget http://dir.nhlbi.nih.gov/Papers/lmi/epigenomes/data/H3K79me1.bed wget http://dir.nhlbi.nih.gov/Papers/lmi/epigenomes/data/H3K79me2.bed wget http://dir.nhlbi.nih.gov/Papers/lmi/epigenomes/data/H3K79me3.bed wget http://dir.nhlbi.nih.gov/Papers/lmi/epigenomes/data/H3R2me1.bed wget http://dir.nhlbi.nih.gov/Papers/lmi/epigenomes/data/H3R2me2.bed wget http://dir.nhlbi.nih.gov/Papers/lmi/epigenomes/data/H4K20me1.bed wget http://dir.nhlbi.nih.gov/Papers/lmi/epigenomes/data/H4K20me3.bed wget http://dir.nhlbi.nih.gov/Papers/lmi/epigenomes/data/H4R3me2.bed wget http://dir.nhlbi.nih.gov/Papers/lmi/epigenomes/data/H2BK5me1.bed wget http://dir.nhlbi.nih.gov/Papers/lmi/epigenomes/data/H2AZ.bed wget http://dir.nhlbi.nih.gov/Papers/lmi/epigenomes/data/PolII.bed wget http://dir.nhlbi.nih.gov/Papers/lmi/epigenomes/data/CTCF.bed wget http://dir.nhlbi.nih.gov/Papers/lmi/epigenomes/data/H3K4me1.vstep.gz wget http://dir.nhlbi.nih.gov/Papers/lmi/epigenomes/data/H3K4me2.vstep.gz wget http://dir.nhlbi.nih.gov/Papers/lmi/epigenomes/data/H3K4me3.vstep.gz wget http://dir.nhlbi.nih.gov/Papers/lmi/epigenomes/data/H3K9me1.vstep.gz wget http://dir.nhlbi.nih.gov/Papers/lmi/epigenomes/data/H3K9me2.vstep.gz wget 
http://dir.nhlbi.nih.gov/Papers/lmi/epigenomes/data/H3K9me3.vstep.gz wget http://dir.nhlbi.nih.gov/Papers/lmi/epigenomes/data/H3K27me1.vstep.gz wget http://dir.nhlbi.nih.gov/Papers/lmi/epigenomes/data/H3K27me2.vstep.gz wget http://dir.nhlbi.nih.gov/Papers/lmi/epigenomes/data/H3K27me3.vstep.gz wget http://dir.nhlbi.nih.gov/Papers/lmi/epigenomes/data/H3K36me1.vstep.gz wget http://dir.nhlbi.nih.gov/Papers/lmi/epigenomes/data/H3K36me3.vstep.gz wget http://dir.nhlbi.nih.gov/Papers/lmi/epigenomes/data/H3K79me1.vstep.gz wget http://dir.nhlbi.nih.gov/Papers/lmi/epigenomes/data/H3K79me2.vstep.gz wget http://dir.nhlbi.nih.gov/Papers/lmi/epigenomes/data/H3K79me3.vstep.gz wget http://dir.nhlbi.nih.gov/Papers/lmi/epigenomes/data/H3R2me1.vstep.gz wget http://dir.nhlbi.nih.gov/Papers/lmi/epigenomes/data/H3R2me2.vstep.gz wget http://dir.nhlbi.nih.gov/Papers/lmi/epigenomes/data/H4K20me1.vstep.gz wget http://dir.nhlbi.nih.gov/Papers/lmi/epigenomes/data/H4K20me3.vstep.gz wget http://dir.nhlbi.nih.gov/Papers/lmi/epigenomes/data/H4R3me2.vstep.gz wget http://dir.nhlbi.nih.gov/Papers/lmi/epigenomes/data/H2BK5me1.vstep.gz wget http://dir.nhlbi.nih.gov/Papers/lmi/epigenomes/data/H2AZ.vstep.gz wget http://dir.nhlbi.nih.gov/Papers/lmi/epigenomes/data/PolII.vstep.gz wget http://dir.nhlbi.nih.gov/Papers/lmi/epigenomes/data/CTCF.vstep.gz gunzip *.gz mv H3K4me1.vstep H3K4me1.wig mv H3K4me2.vstep H3K4me2.wig mv H3K4me3.vstep H3K4me3.wig mv H3K9me1.vstep H3K9me1.wig mv H3K9me2.vstep H3K9me2.wig mv H3K9me3.vstep H3K9me3.wig mv H3K27me1.vstep H3K27me1.wig mv H3K27me2.vstep H3K27me2.wig mv H3K27me3.vstep H3K27me3.wig mv H3K36me1.vstep H3K36me1.wig mv H3K36me3.vstep H3K36me3.wig mv H3K79me1.vstep H3K79me1.wig mv H3K79me2.vstep H3K79me2.wig mv H3K79me3.vstep H3K79me3.wig mv H3R2me1.vstep H3R2me1.wig mv H3R2me2.vstep H3R2me2.wig mv H4K20me1.vstep H4K20me1.wig mv H4K20me3.vstep H4K20me3.wig mv H4R3me2.vstep H4R3me2.wig mv H2BK5me1.vstep H2BK5me1.wig mv H2AZ.vstep H2AZ.wig mv PolII.vstep PolII.wig mv CTCF.vstep CTCF.wig head -1 H3K4me1.vstep | sed -e "s/\"CTCF/\"BarskiChIPseqH3K4me1/g" > barskiChIPseqH3K4me1.wigVar head -1 H3K4me2.vstep | sed -e "s/\"CTCF/\"BarskiChIPseqH3K4me2/g" > barskiChIPseqH3K4me2.wigVar head -1 H3K4me3.vstep | sed -e "s/\"CTCF/\"BarskiChIPseqH3K4me3/g" > barskiChIPseqH3K4me3.wigVar head -1 H3K9me1.vstep | sed -e "s/\"CTCF/\"BarskiChIPseqH3K9me1/g" > barskiChIPseqH3K9me1.wigVar head -1 H3K9me2.vstep | sed -e "s/\"CTCF/\"BarskiChIPseqH3K9me2/g" > barskiChIPseqH3K9me2.wigVar head -1 H3K9me3.vstep | sed -e "s/\"CTCF/\"BarskiChIPseqH3K9me3/g" > barskiChIPseqH3K9me3.wigVar head -1 H3K27me1.vstep | sed -e "s/\"CTCF/\"BarskiChIPseqH3K27me1/g" > barskiChIPseqH3K27me1.wigVar head -1 H3K27me2.vstep | sed -e "s/\"CTCF/\"BarskiChIPseqH3K27me2/g" > barskiChIPseqH3K27me2.wigVar head -1 H3K27me3.vstep | sed -e "s/\"CTCF/\"BarskiChIPseqH3K27me3/g" > barskiChIPseqH3K27me3.wigVar head -1 H3K36me1.vstep | sed -e "s/\"CTCF/\"BarskiChIPseqH3K36me1/g" > barskiChIPseqH3K36me1.wigVar head -1 H3K36me3.vstep | sed -e "s/\"CTCF/\"BarskiChIPseqH3K36me3/g" > barskiChIPseqH3K36me3.wigVar head -1 H3K79me1.vstep | sed -e "s/\"CTCF/\"BarskiChIPseqH3K79me1/g" > barskiChIPseqH3K79me1.wigVar head -1 H3K79me2.vstep | sed -e "s/\"CTCF/\"BarskiChIPseqH3K79me2/g" > barskiChIPseqH3K79me2.wigVar head -1 H3K79me3.vstep | sed -e "s/\"CTCF/\"BarskiChIPseqH3K79me3/g" > barskiChIPseqH3K79me3.wigVar head -1 H3R2me1.vstep | sed -e "s/\"CTCF/\"BarskiChIPseqH3R2me1/g" > barskiChIPseqH3R2me1.wigVar head -1 H3R2me2.vstep | sed -e 
"s/\"CTCF/\"BarskiChIPseqH3R2me2/g" > barskiChIPseqH3R2me2.wigVar head -1 H4K20me1.vstep | sed -e "s/\"CTCF/\"BarskiChIPseqH4K20me1/g" > barskiChIPseqH4K20me1.wigVar head -1 H4K20me3.vstep | sed -e "s/\"CTCF/\"BarskiChIPseqH4K20me3/g" > barskiChIPseqH4K20me3.wigVar head -1 H4R3me2.vstep | sed -e "s/\"CTCF/\"BarskiChIPseqH4R3me2/g" > barskiChIPseqH4R3me2.wigVar head -1 H2BK5me1.vstep | sed -e "s/\"CTCF/\"BarskiChIPseqH2BK5me1/g" > barskiChIPseqH2BK5me1.wigVar head -1 H2AZ.vstep | sed -e "s/\"CTCF/\"BarskiChIPseqH2AZ/g" > barskiChIPseqH2AZ.wigVar head -1 PolII.vstep | sed -e "s/\"CTCF/\"BarskiChIPseqPolII/g" > barskiChIPseqPolII.wigVar head -1 CTCF.vstep | sed -e "s/\"CTCF/\"BarskiChIPseqCTCF/g" > barskiChIPseqCTCF.wigVar tail --lines=+2 H3K4me1.vstep >> barskiChIPseqH3K4me1.wigVar tail --lines=+2 H3K4me2.vstep >> barskiChIPseqH3K4me2.wigVar tail --lines=+2 H3K4me3.vstep >> barskiChIPseqH3K4me3.wigVar tail --lines=+2 H3K9me1.vstep >> barskiChIPseqH3K9me1.wigVar tail --lines=+2 H3K9me2.vstep >> barskiChIPseqH3K9me2.wigVar tail --lines=+2 H3K9me3.vstep >> barskiChIPseqH3K9me3.wigVar tail --lines=+2 H3K27me1.vstep >> barskiChIPseqH3K27me1.wigVar tail --lines=+2 H3K27me2.vstep >> barskiChIPseqH3K27me2.wigVar tail --lines=+2 H3K27me3.vstep >> barskiChIPseqH3K27me3.wigVar tail --lines=+2 H3K36me1.vstep >> barskiChIPseqH3K36me1.wigVar tail --lines=+2 H3K36me3.vstep >> barskiChIPseqH3K36me3.wigVar tail --lines=+2 H3K79me1.vstep >> barskiChIPseqH3K79me1.wigVar tail --lines=+2 H3K79me2.vstep >> barskiChIPseqH3K79me2.wigVar tail --lines=+2 H3K79me3.vstep >> barskiChIPseqH3K79me3.wigVar tail --lines=+2 H3R2me1.vstep >> barskiChIPseqH3R2me1.wigVar tail --lines=+2 H3R2me2.vstep >> barskiChIPseqH3R2me2.wigVar tail --lines=+2 H4K20me1.vstep >> barskiChIPseqH4K20me1.wigVar tail --lines=+2 H4K20me3.vstep >> barskiChIPseqH4K20me3.wigVar tail --lines=+2 H4R3me2.vstep >> barskiChIPseqH4R3me2.wigVar tail --lines=+2 H2BK5me1.vstep >> barskiChIPseqH2BK5me1.wigVar tail --lines=+2 H2AZ.vstep >> barskiChIPseqH2AZ.wigVar tail --lines=+2 PolII.vstep >> barskiChIPseqPolII.wigVar tail --lines=+2 CTCF.vstep >> barskiChIPseqCTCF.wigVar mkdir ../signal mv *.wigVar ../signal gzip * mkdir ../tags mv H3K4me1.bed ../tags/barskiChIPseqH3K4me1.bed mv H3K4me2.bed ../tags/barskiChIPseqH3K4me2.bed mv H3K4me3.bed ../tags/barskiChIPseqH3K4me3.bed mv H3K9me1.bed ../tags/barskiChIPseqH3K9me1.bed mv H3K9me2.bed ../tags/barskiChIPseqH3K9me2.bed mv H3K9me3.bed ../tags/barskiChIPseqH3K9me3.bed mv H3K27me1.bed ../tags/barskiChIPseqH3K27me1.bed mv H3K27me2.bed ../tags/barskiChIPseqH3K27me2.bed mv H3K27me3.bed ../tags/barskiChIPseqH3K27me3.bed mv H3K36me1.bed ../tags/barskiChIPseqH3K36me1.bed mv H3K36me3.bed ../tags/barskiChIPseqH3K36me3.bed mv H3K79me1.bed ../tags/barskiChIPseqH3K79me1.bed mv H3K79me2.bed ../tags/barskiChIPseqH3K79me2.bed mv H3K79me3.bed ../tags/barskiChIPseqH3K79me3.bed mv H3R2me1.bed ../tags/barskiChIPseqH3R2me1.bed mv H3R2me2.bed ../tags/barskiChIPseqH3R2me2.bed mv H4K20me1.bed ../tags/barskiChIPseqH4K20me1.bed mv H4K20me3.bed ../tags/barskiChIPseqH4K20me3.bed mv H4R3me2.bed ../tags/barskiChIPseqH4R3me2.bed mv H2BK5me1.bed ../tags/barskiChIPseqH2BK5me1.bed mv H2AZ.bed ../tags/barskiChIPseqH2AZ.bed mv PolII.bed ../tags/barskiChIPseqPolII.bed mv CTCF.bed ../tags/barskiChIPseqCTCF.bed cd .. cd ../signal cat > makeWig.sh << \_EOF_ #!/bin/bash genDir=/gbdb/hg18/barskiChIPseq mkdir \${genDir} for file in *.wigVar do base=\${file%.wigVar} echo "Loading \${file} to \${base}..." 
    time nice -n +19 wigEncode \${base}.wigVar \${base}.wig \${base}.wib
    time nice -n +19 hgLoadWiggle -pathPrefix=\${genDir} hg18 \${base} \${base}.wig
    ln -sf `pwd`/\${base}.wib \${genDir}/\${base}.wib
done
_EOF_
chmod 755 makeWig.sh
./makeWig.sh &

# ................ Got to here
# ................ Got to here
# ................ Got to here
# ................ Got to here

# .............. I have not loaded the tags !!!
# time nice -n +19 hgLoadBed hg18 barskiChIPseqH3K4me1  BarskiChIPseqH3K4me1.bed
# time nice -n +19 hgLoadBed hg18 barskiChIPseqH3K4me2  BarskiChIPseqH3K4me2.bed
# time nice -n +19 hgLoadBed hg18 barskiChIPseqH3K4me3  BarskiChIPseqH3K4me3.bed
# time nice -n +19 hgLoadBed hg18 barskiChIPseqH3K9me1  BarskiChIPseqH3K9me1.bed
# time nice -n +19 hgLoadBed hg18 barskiChIPseqH3K9me2  BarskiChIPseqH3K9me2.bed
# time nice -n +19 hgLoadBed hg18 barskiChIPseqH3K9me3  BarskiChIPseqH3K9me3.bed
# time nice -n +19 hgLoadBed hg18 barskiChIPseqH3K27me1 BarskiChIPseqH3K27me1.bed
# time nice -n +19 hgLoadBed hg18 barskiChIPseqH3K27me2 BarskiChIPseqH3K27me2.bed
# time nice -n +19 hgLoadBed hg18 barskiChIPseqH3K27me3 BarskiChIPseqH3K27me3.bed
# time nice -n +19 hgLoadBed hg18 barskiChIPseqH3K36me1 BarskiChIPseqH3K36me1.bed
# time nice -n +19 hgLoadBed hg18 barskiChIPseqH3K36me3 BarskiChIPseqH3K36me3.bed
# time nice -n +19 hgLoadBed hg18 barskiChIPseqH3K79me1 BarskiChIPseqH3K79me1.bed
# time nice -n +19 hgLoadBed hg18 barskiChIPseqH3K79me2 BarskiChIPseqH3K79me2.bed
# time nice -n +19 hgLoadBed hg18 barskiChIPseqH3K79me3 BarskiChIPseqH3K79me3.bed
# time nice -n +19 hgLoadBed hg18 barskiChIPseqH3R2me1  BarskiChIPseqH3R2me1.bed
# time nice -n +19 hgLoadBed hg18 barskiChIPseqH3R2me2  BarskiChIPseqH3R2me2.bed
# time nice -n +19 hgLoadBed hg18 barskiChIPseqH4K20me1 BarskiChIPseqH4K20me1.bed
# time nice -n +19 hgLoadBed hg18 barskiChIPseqH4K20me3 BarskiChIPseqH4K20me3.bed
# time nice -n +19 hgLoadBed hg18 barskiChIPseqH4R3me2  BarskiChIPseqH4R3me2.bed
# time nice -n +19 hgLoadBed hg18 barskiChIPseqH2BK5me1 BarskiChIPseqH2BK5me1.bed
# time nice -n +19 hgLoadBed hg18 barskiChIPseqH2AZ     BarskiChIPseqH2AZ.bed
# time nice -n +19 hgLoadBed hg18 barskiChIPseqPolII    BarskiChIPseqPolII.bed
# time nice -n +19 hgLoadBed hg18 barskiChIPseqCTCF     BarskiChIPseqCTCF.bed
# .............. I have not loaded the tags !!!
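# If the tag beds are loaded later, a minimal loop sketch (not run as part of
# this build) along the lines of the signal loading above; it assumes the
# per-mark bed files sit uncompressed in ../tags under the barskiChIPseq*
# names given to them by the mv commands.
cd ../tags
for bed in barskiChIPseq*.bed
do
    table=${bed%.bed}
    time nice -n +19 hgLoadBed hg18 ${table} ${bed}
done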
######################################################################### ## 44-Way Multiz (DONE - 2008-11-10,15 - Hiram) ssh hgwdev mkdir /hive/data/genomes/hg18/bed/multiz44way cd /hive/data/genomes/hg18/bed/multiz44way # starting with the 44way tree that Brian made earlier: cp -p ../multiz44way.2008-08-06/44way.db.nh ./44way.nh sed -e "s/oryLat1/hg18/; s/danRer4/danRer5/; s/oryLat1/oryLat2/" \ /cluster/data/oryLat1/bed/multiz44way/44way.nh > 44way.nh # this file looks like: cat << '_EOF_' > 44way.nh (((tetraodon_tetNig1:0.199381,fugu_fr2:0.239894):0.2, (stickleback_gasAcu1:0.2,medaka_hg18:0.2):0.2):0.292961, zebrafish_danRer5:0.782561); '_EOF_' # << happy emacs # Use this specification in the phyloGif tool: # http://genome.ucsc.edu/cgi-bin/phyloGif # to obtain a gif image for htdocs/images/phylo/hg18_44way.gif /cluster/bin/phast/all_dists 44way.nh > 44way.distances.txt # Use this output to create the table below, with this perl script: cat << '_EOF_' > sizeStats.pl #!/usr/bin/env perl use strict; use warnings; open (FH, "grep -y hg18 44way.distances.txt | sort -k3,3n|") or die "can not read 44way.distances.txt"; my $count = 0; while (my $line = ) { chomp $line; my ($hg18, $D, $dist) = split('\s+', $line); my $chain = "chain" . ucfirst($D); my $B="/hive/data/genomes/hg18/bed/blastz.$D/fb.hg18." . $chain . "Link.txt"; my $chainLinkMeasure = `awk '{print \$5}' ${B} 2> /dev/null | sed -e "s/(//; s/)//"`; chomp $chainLinkMeasure; $chainLinkMeasure = 0.0 if (length($chainLinkMeasure) < 1); $chainLinkMeasure =~ s/\%//; my $orgName= `hgsql -N -e "select organism from dbDb where name='$D';" hgcentraltest`; chomp $orgName; if (length($orgName) < 1) { $orgName="N/A"; } ++$count; printf "# %02d %.4f - %s %s\t(%% %.3f)\n", $count, $dist, $orgName, $D, $chainLinkMeasure } close (FH); '_EOF_' # << happy emacs chmod +x ./sizeStats.pl ./sizeStats.pl # # If you can fill in all the numbers in this table, you are ready for # the multiple alignment procedure # # featureBits chainLink measures # chainOryLat1Link chain linearGap # distance on hg18 on other minScore # 01 0.0092 - Chimp panTro2 (% 94.888) # 02 0.0267 - Gorilla gorGor1 (% 61.731) # 03 0.0467 - Orangutan ponAbe2 (% 92.892) # 04 0.0667 - Marmoset calJac1 (% 78.351) # 05 0.0783 - Rhesus rheMac2 (% 85.552) # 06 0.1767 - Tarsier tarSyr1 (% 47.999) # 07 0.2448 - Mouse lemur micMur1 (% 46.445) # 08 0.3061 - Bushbaby otoGar1 (% 44.638) # 09 0.3367 - Rabbit oryCun1 (% 34.015) # 10 0.3507 - TreeShrew tupBel1 (% 37.348) # 11 0.3567 - Squirrel speTri1 (% 35.828) # 12 0.4067 - Guinea Pig cavPor3 (% 43.971) # 13 0.4067 - Alpaca vicPac1 (% 39.531) # 14 0.4098 - Megabat pteVam1 (% 45.502) # 15 0.4099 - Microbat myoLuc1 (% 33.044) # 16 0.4154 - Cat felCat3 (% 35.888) # 17 0.4293 - Elephant loxAfr2 (% 35.204) # 18 0.4314 - Dog canFam2 (% 52.915) # 19 0.4317 - Mouse mm9 (% 35.201) # 20 0.4362 - Rat rn4 (% 32.893) # 21 0.4367 - Pika ochPri2 (% 27.974) # 22 0.4639 - Horse equCab2 (% 57.162) # 23 0.4693 - Rock hyrax proCap1 (% 30.935) # 24 0.4767 - Dolphin turTru1 (% 48.537) # 25 0.5067 - Kangaroo rat dipOrd1 (% 27.282) # 26 0.5187 - Armadillo dasNov2 (% 33.663) # 27 0.5191 - Cow bosTau4 (% 46.689) # 28 0.5298 - hedgehog eriEur1 (% 19.622) # 29 0.5399 - Sloth choHof1 (% 34.463) # 30 0.5605 - Shrew sorAra1 (% 20.056) # 31 0.5815 - Tenrec echTel1 (% 23.645) # 32 0.7309 - Opossum monDom4 (% 12.385) # 33 0.9870 - Platypus ornAna1 (% 7.870) # 34 1.0313 - Zebra finch taeGut1 (% 3.503) # 35 1.0436 - Lamprey petMar1 (% 1.251) # 36 1.1013 - Chicken galGal3 (% 3.589) # 37 
1.2253 - Lizard anoCar1 (% 4.774) # 38 1.5473 - X. tropicalis xenTro2 (% 2.623) # 39 1.8337 - Stickleback gasAcu1 (% 1.923) # 40 1.8482 - Zebrafish danRer5 (% 2.565) # 41 1.8721 - Tetraodon tetNig1 (% 2.001) # 42 1.9077 - Fugu fr2 (% 1.766) # 43 2.0215 - Medaka oryLat2 (% 1.829) # create species list and stripped down tree for autoMZ sed 's/[a-z][a-z]*_//g; s/:[0-9\.][0-9\.]*//g; s/;//; /^ *$/d' \ 44way.nh > tmp.nh echo `cat tmp.nh` > tree-commas.nh echo `cat tree-commas.nh` | sed 's/ //g; s/,/ /g' > tree.nh sed 's/[()]//g; s/,/ /g' tree.nh > species.list cd /hive/data/genomes/hg18/bed/multiz44way # bash shell syntax here ... export H=/hive/data/genomes/hg18/bed mkdir mafLinks for G in `sed -e "s/hg18 //" species.list` do mkdir mafLinks/$G if [ -s ${H}/blastz.${G}/mafRBestNet/chr1.maf.gz ]; then echo "$G - recipBest" ln -s ${H}/blastz.$G/mafRBestNet/*.maf.gz ./mafLinks/$G else if [ -s ${H}/blastz.${G}/mafSynNet/chr1.maf.gz ]; then echo "$G - synNet" ln -s ${H}/blastz.$G/mafSynNet/*.maf.gz ./mafLinks/$G else if [ -s ${H}/blastz.${G}/mafNet/chr1.maf.gz ]; then echo "$G - mafNet" ln -s ${H}/blastz.$G/mafNet/*.maf.gz ./mafLinks/$G else echo "missing directory blastz.${G}/*Net" fi fi fi done # need to split these things up into smaller pieces for # efficient kluster run. Using the new hive architecture. ssh hgwdev cd /hive/data/genomes/hg18/bed/multiz44way mkdir mafSplit # mafSplitPos splits on repeat areas that will not have any chains mafSplitPos -minGap=50000 hg18 10 mafSplit.bed for G in `sed -e "s/hg18 //" species.list` do echo -n "working ${G} ..." rm -fr mafSplit/${G} mkdir mafSplit/${G} cd mafSplit/${G} mafSplit ../../mafSplit.bed hg18_ ../../mafLinks/${G}/chr*.maf.gz \ -verbose=2 cd /hive/data/genomes/hg18/bed/multiz44way echo " done" done # create a run-time list of files to operate on, not all file names # exist for all assemblies cd mafSplit for D in * do cd "${D}" find . -type f cd .. 
done | sort -u | sed -e "s#./##" > ../44-way.split.list wc -l ../44-way.split.list # 267 ../44-way.split.list # the autoMultiz cluster run ssh swarm cd /hive/data/genomes/hg18/bed/multiz44way/ mkdir splitRun cd splitRun mkdir maf run cd run mkdir penn cp -p /cluster/bin/penn/multiz.v11.2007-03-19/multiz-tba/multiz penn cp -p /cluster/bin/penn/multiz.v11.2007-03-19/multiz-tba/maf_project penn cp -p /cluster/bin/penn/multiz.v11.2007-03-19/multiz-tba/autoMZ penn # set the db and pairs directories here cat > autoMultiz.csh << '_EOF_' #!/bin/csh -ef set db = hg18 set c = $1 set result = $2 set run = `pwd` set tmp = $run/tmp/$db/multiz.$c set pairs = /hive/data/genomes/hg18/bed/multiz44way/mafSplit /bin/rm -fr $tmp /bin/mkdir -p $tmp /bin/cp -p ../../tree.nh ../../species.list $tmp pushd $tmp foreach s (`sed -e "s/ $db//" species.list`) set in = $pairs/$s/$c.maf set out = $db.$s.sing.maf if (-e $in.gz) then /bin/zcat $in.gz > $out else if (-e $in) then ln -s $in $out else echo "##maf version=1 scoring=autoMZ" > $out endif end set path = ($run/penn $path); rehash $run/penn/autoMZ + T=$tmp E=$db "`cat tree.nh`" $db.*.sing.maf $c.maf popd /bin/rm -f $result /bin/cp -p $tmp/$c.maf $result /bin/rm -fr $tmp /bin/rmdir --ignore-fail-on-non-empty $run/tmp/$db /bin/rmdir --ignore-fail-on-non-empty $run/tmp '_EOF_' # << happy emacs chmod +x autoMultiz.csh cat << '_EOF_' > template #LOOP ./autoMultiz.csh $(root1) {check out line+ /hive/data/genomes/hg18/bed/multiz44way/splitRun/maf/$(root1).maf} #ENDLOOP '_EOF_' # << happy emacs gensub2 ../../44-way.split.list single template jobList para create jobList # initial run experience suggest some of the big jobs reach 8 Gb # of memory usage, so, tell parasol to limit the number of jobs per # node to avoid thrashing para -ram=6g push # 8 jobs were finished manually on hgwdev, kolossus and memk nodes XXXX - running 2008-11-12 - Wed Nov 12 15:29:39 PST 2008 # Completed: 792 of 792 jobs # CPU time in finished jobs: 5423s 90.38m 1.51h 0.06d 0.000 y # IO & Wait Time: 138287s 2304.79m 38.41h 1.60d 0.004 y # Average job time: 181s 3.02m 0.05h 0.00d # Longest finished job: 404s 6.73m 0.11h 0.00d # Submission to last job: 436s 7.27m 0.12h 0.01d # Estimated complete: 0s 0.00m 0.00h 0.00d # put the split maf results back together into a single maf file # eliminate duplicate comments ssh hgwdev cd /hive/data/genomes/hg18/bed/multiz44way/splitRun mkdir ../maf # the sed edits take out partitioning name information from the comments # so the multiple parts will condense to smaller number of lines # this takes almost 2 hours of time, resulting in a bit over 150 Gb, # almost all chrom files over 1 Gb, up to almost 10 Gb for chr2 # HOWEVER, this is actually not necessary to maintain these comments, # they are lost during the mafAddIRows ls maf | sed -e "s/hg18_//; s/\..*//" | sort -u | while read C do echo "========== $C ==============" rm -f ../maf/${C}.maf.gz head -q -n 1 maf/hg18_${C}.*.maf | sort -u > ../maf/${C}.maf grep -h "^#" maf/hg18_${C}.*.maf | egrep -v "maf version=1|eof maf" | \ sed -e "s#${C}.[0-9][0-9]*#${C}#g; s#_MZ_[^ ]* # #g;" \ | sort -u >> ../maf/${C}.maf grep -h -v "^#" `ls maf/hg18_${C}.*.maf | sort -t. -k2,2n` \ >> ../maf/${C}.maf tail -q -n 1 maf/hg18_${C}.*.maf | sort -u >> ../maf/${C}.maf done # load tables for a look ssh hgwdev mkdir -p /gbdb/hg18/multiz44way/maf cd /hive/data/genomes/hg18/bed/multiz44way/maf ln -s `pwd`/*.maf /gbdb/hg18/multiz44way/maf # this generates an immense multiz44way.tab file in the directory # where it is running. 
Best to run this over in scratch. cd /data/tmp time nice -n +19 hgLoadMaf \ -pathPrefix=/gbdb/hg18/multiz44way/maf hg18 multiz44way # real 1m10.380s # Loaded 1366931 mafs in 1 files from /gbdb/hg18/multiz44way/maf # load summary table time nice -n +19 cat /gbdb/hg18/multiz44way/maf/*.maf \ | hgLoadMafSummary hg18 -minSize=30000 -mergeGap=1500 \ -maxSize=200000 multiz44waySummary stdin # real 2m39.822 # Created 353577 summary blocks from 2852890 components and 1197504 mafs # from stdin # Gap Annotation # prepare bed files with gap info mkdir /hive/data/genomes/hg18/bed/multiz44way/anno cd /hive/data/genomes/hg18/bed/multiz44way/anno mkdir maf run # most of these will already exist from previous multiple alignments # remove the echo from in front of the twoBitInfo command to get them # to run if this loop appears to be correct for DB in `cat ../species.list` do CDIR="/hive/data/genomes/${DB}" if [ ! -f ${CDIR}/${DB}.N.bed ]; then echo "creating ${DB}.N.bed" echo twoBitInfo -nBed ${CDIR}/${DB}.2bit ${CDIR}/${DB}.N.bed else ls -og ${CDIR}/${DB}.N.bed fi done cd run rm -f nBeds sizes for DB in `sed -e "s/hg18 //" ../../species.list` do echo "${DB} " ln -s /hive/data/genomes/${DB}/${DB}.N.bed ${DB}.bed echo ${DB}.bed >> nBeds ln -s /hive/data/genomes/${DB}/chrom.sizes ${DB}.len echo ${DB}.len >> sizes done # the annotation step requires large memory, run on memk nodes ssh memk cd /hive/data/genomes/hg18/bed/multiz44way/anno/run ls ../../maf | sed -e "s/.maf//" > chr.list cat << '_EOF_' > template #LOOP ./anno.csh $(root1) {check out line+ ../maf/$(root1).maf} #ENDLOOP '_EOF_' # << happy emacs cat << '_EOF_' > anno.csh #!/bin/csh -fe set inMaf = ../../maf/$1.maf set outMaf = ../maf/$1.maf rm -f $outMaf mafAddIRows -nBeds=nBeds $inMaf /hive/data/genomes/hg18/hg18.2bit $outMaf '_EOF_' # << happy emacs chmod +x anno.csh gensub2 chr.list single template jobList para create jobList # specify lots of ram to get one job per node para -ram=30g push ssh hgwdev rm -fr /gbdb/hg18/multiz44way/maf mkdir /gbdb/hg18/multiz44way/maf cd /hive/data/genomes/hg18/bed/multiz44way/anno/maf ln -s `pwd`/*.maf /gbdb/hg18/multiz44way/maf/ # by loading this into the table multiz44way, it will replace the # previously loaded table with the unannotated mafs # huge temp files are made, do them on local disk cd /data/tmp time nice -n +19 hgLoadMaf \ -pathPrefix=/gbdb/hg18/multiz44way/maf hg18 multiz44way # with final set of quality annotated files: # Loaded 33320838 mafs in 49 files from /gbdb/hg18/multiz44way/maf # real 91m46.889s # running on Irow annotated mafs Fri Nov 28 00:28:09 PST 2008 # Loaded 33320675 mafs in 49 files from /gbdb/hg18/multiz44way/maf # real 236m15.279s # running on bare bones mafs Thu Nov 27 19:29:44 PST 2008 # Loaded 33273351 mafs in 49 files from /gbdb/hg18/multiz44way/maf # real 198m55.761s - while swarm busy with rebalancing # from before the fixed multiz: # Loaded 35154852 mafs in 49 files from /gbdb/hg18/multiz44way/maf # real 71m5.594s time nice -n +19 cat /gbdb/hg18/multiz44way/maf/*.maf \ | hgLoadMafSummary hg18 -minSize=30000 -mergeGap=1500 \ -maxSize=200000 multiz44waySummary stdin # with the quality annotated mafs, and mem interference on hgwdev: # Created 8514381 summary blocks from 600504256 components \ # and 33320838 mafs from stdin # real 169m56.936s # with the Irow annotations after the multiz fix: # Created 8514380 summary blocks from 600499937 # components and 33298894 mafs from stdin # real 184m42.893s # user 70m44.431s # sys 8m7.970s # Created 8514078 summary blocks from 604683213 
components # and 35125649 mafs from stdin # real 130m55.115s # user 71m37.409s # sys 8m5.110s # by loading this into the table multiz44waySummary, it will replace # the previously loaded table with the unannotated mafs # remove the multiz44way*.tab files in this /data/tmp directory # -rw-rw-r-- 1 1949221892 Nov 15 14:04 multiz44way.tab # -rw-rw-r-- 1 417994189 Nov 15 20:57 multiz44waySummary.tab wc -l multiz44way*.tab # 33964377 multiz44way.tab # 8514078 multiz44waySummary.tab # 42478455 total rm multiz44way*.tab # create some downloads mkdir -p /hive/data/genomes/hg18/bed/multiz44way/download/maf cd /hive/data/genomes/hg18/bed/multiz44way/download/maf time cp -p ../../anno/maf/chr*.maf . # real 72m46.514s # user 0m1.293s # sys 5m15.981s time gzip --rsyncable *.maf time gzip --rsyncable *.maf # real 185m37.884s # user 179m51.161s # sys 3m48.016s time md5sum *.gz > md5sum.txt # real 3m59.009s # user 1m19.338s # sys 0m18.976s ############################################################################# ## Annotate 44-way multiple alignment with gene annotations ## (DONE - 2008-12-08,23 - Hiram) # Gene frames ## survey all genomes to see what type of gene track to use ssh hgwdev mkdir /hive/data/genomes/hg18/bed/multiz44way/frames cd /hive/data/genomes/hg18/bed/multiz44way/frames # cat << '_EOF_' > showGenes.csh #!/bin/csh -fe foreach db (`cat ../species.list`) echo -n "${db}: " set tables = `hgsql $db -N -e "show tables like '%Gene%'"` foreach table ($tables) if ($table == "ensGene" || $table == "refGene" || $table == "mgcGenes" || \ $table == "knownGene") then set count = `hgsql $db -N -e "select count(*) from $table"` echo -n "${table}: ${count}, " endif end set orgName = `hgsql hgcentraltest -N -e \ "select scientificName from dbDb where name='$db'"` set orgId = `hgsql hg18 -N -e \ "select id from organism where name='$orgName'"` if ($orgId == "") then echo "Mrnas: 0" else set count = `hgsql hg18 -N -e "select count(*) from gbCdnaInfo where organism=$orgId"` echo "Mrnas: ${count}" endif end '_EOF_' # << happy emacs chmod +x ./showGenes.csh # rearrange that output to create four sections: # 1. knownGenes for hg18, mm9 # 2. ensGene for almost everything else # 3. Mrnas for taeGut1, anoCar1, petMar1, calJac1 # 4. 
nothing for loxAfr2, dasNov2, choHof1 mkdir genes # knownGene for DB in hg18 mm9 do hgsql -N -e "select name,chrom,strand,txStart,txEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds from knownGene" ${DB} \ | genePredSingleCover stdin stdout | gzip -2c \ > /scratch/tmp/${DB}.tmp.gz mv /scratch/tmp/${DB}.tmp.gz genes/$DB.gp.gz echo "${DB} done" done # ensGene for DB in bosTau4 canFam2 cavPor3 danRer5 dipOrd1 echTel1 equCab2 \ eriEur1 felCat3 fr2 galGal3 gasAcu1 gorGor1 micMur1 monDom4 myoLuc1 \ ochPri2 ornAna1 oryCun1 oryLat2 otoGar1 panTro2 ponAbe2 proCap1 \ pteVam1 rheMac2 rn4 sorAra1 speTri1 tarSyr1 tetNig1 tupBel1 \ turTru1 vicPac1 xenTro2 do hgsql -N -e "select name,chrom,strand,txStart,txEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds from ensGene" ${DB} \ | genePredSingleCover stdin stdout | gzip -2c \ > /scratch/tmp/${DB}.tmp.gz mv /scratch/tmp/${DB}.tmp.gz genes/$DB.gp.gz echo "${DB} done" done # use Mrnas for taeGut1 anoCar1 petMar1 calJac1 for DB in taeGut1 anoCar1 petMar1 calJac1 do tmpExt=`mktemp temp.XXXXXX` tmpMrnaCds=${DB}.mrna-cds.${tmpExt} tmpMrna=${DB}.mrna.${tmpExt} tmpCds=${DB}.cds.${tmpExt} hgsql -N -e 'select all_mrna.qName,cds.name,all_mrna.* \ from all_mrna,gbCdnaInfo,cds \ where (all_mrna.qName = gbCdnaInfo.acc) and \ (gbCdnaInfo.cds != 0) and (gbCdnaInfo.cds = cds.id)' \ $DB > ${tmpMrnaCds} cut -f 1-2 ${tmpMrnaCds} > ${tmpCds} cut -f 4-100 ${tmpMrnaCds} > ${tmpMrna} mrnaToGene -cdsFile=${tmpCds} -smallInsertSize=8 -quiet ${tmpMrna} stdout | \ genePredSingleCover stdin stdout | gzip -2c > /scratch/tmp/$DB.tmp.gz rm ${tmpMrnaCds} ${tmpMrna} ${tmpCds} mv /scratch/tmp/$DB.tmp.gz genes/$DB.gp.gz rm -f $tmpExt echo "${DB} done" done # leaving out loxAfr2, dasNov2, choHof1 since no gene preds there # Create this command with this script: cat << '_EOF_' > mkCmd.sh #!/bin/sh echo "time (cat ../quals/maf/*.maf | nice -n +19 genePredToMafFrames hg18 stdin stdout \\" if [ ! -s genes/mm9.gp.gz ]; then echo "missing genes/mm9.gp.gz" exit 255 fi echo "mm9 genes/mm9.gp.gz \\" for D in `sort ensGene.list` do if [ ! -s genes/${D}.gp.gz ]; then echo "missing genes/${D}.gp.gz" exit 255 fi echo -n "${D} genes/${D}.gp.gz " done echo "\\" for D in `sort mrna.list` do if [ ! 
-s genes/${D}.gp.gz ]; then echo "missing genes/${D}.gp.gz" exit 255 fi echo -n "${D} genes/${D}.gp.gz " done echo "\\" echo " | gzip > multiz44way.mafFrames.gz) > frames.log 2>&1" '_EOF_' # << happy emacs chmod +x ./mkCmd.sh time (cat ../quals/maf/*.maf | nice -n +19 genePredToMafFrames hg18 stdin stdout \ mm9 genes/mm9.gp.gz \ bosTau4 genes/bosTau4.gp.gz canFam2 genes/canFam2.gp.gz cavPor3 genes/cavPor3.gp.gz danRer5 genes/danRer5.gp.gz dipOrd1 genes/dipOrd1.gp.gz echTel1 genes/echTel1.gp.gz equCab2 genes/equCab2.gp.gz eriEur1 genes/eriEur1.gp.gz felCat3 genes/felCat3.gp.gz fr2 genes/fr2.gp.gz galGal3 genes/galGal3.gp.gz gasAcu1 genes/gasAcu1.gp.gz micMur1 genes/micMur1.gp.gz monDom4 genes/monDom4.gp.gz myoLuc1 genes/myoLuc1.gp.gz ochPri2 genes/ochPri2.gp.gz ornAna1 genes/ornAna1.gp.gz oryCun1 genes/oryCun1.gp.gz oryLat2 genes/oryLat2.gp.gz otoGar1 genes/otoGar1.gp.gz panTro2 genes/panTro2.gp.gz ponAbe2 genes/ponAbe2.gp.gz proCap1 genes/proCap1.gp.gz pteVam1 genes/pteVam1.gp.gz rheMac2 genes/rheMac2.gp.gz rn4 genes/rn4.gp.gz sorAra1 genes/sorAra1.gp.gz speTri1 genes/speTri1.gp.gz tarSyr1 genes/tarSyr1.gp.gz tetNig1 genes/tetNig1.gp.gz tupBel1 genes/tupBel1.gp.gz turTru1 genes/turTru1.gp.gz vicPac1 genes/vicPac1.gp.gz xenTro2 genes/xenTro2.gp.gz \ anoCar1 genes/anoCar1.gp.gz calJac1 genes/calJac1.gp.gz petMar1 genes/petMar1.gp.gz taeGut1 genes/taeGut1.gp.gz \ | gzip > multiz44way.mafFrames.gz) > frames.log 2>&1 # that doesn't work on any 32 Gb computer, requires much more memory # turn it into a kluster job ssh swarm cd /hive/data/genomes/hg18/bed/multiz44way/frames cat << '_EOF_' > runOne #!/bin/csh -fe set C = $1 set G = $2 cat ../quals/maf/${C}.maf | genePredToMafFrames hg18 stdin stdout \ ${G} genes/${G}.gp.gz | gzip > parts/${C}.${G}.mafFrames.gz '_EOF_' # << happy emacs chmod +x runOne ls ../quals/maf | sed -e "s/.maf//" > chr.list ls genes | sed -e "s/.gp.gz//" | grep -v hg18 > gene.list cat << '_EOF_' > template #LOOP runOne $(root1) $(root2) {check out exists+ parts/$(root1).$(root2).mafFrames.gz} #ENDLOOP '_EOF_' # << happy emacs mkdir parts gensub2 chr.list gene.list template jobList para -ram=8g create jobList para try ... check ... 
push # Completed: 1911 of 1911 jobs # CPU time in finished jobs: 126751s 2112.52m 35.21h 1.47d 0.004 y # IO & Wait Time: 2573543s 42892.38m 714.87h 29.79d 0.082 y # Average job time: 1413s 23.55m 0.39h 0.02d # Longest finished job: 6490s 108.17m 1.80h 0.08d # Submission to last job: 11310s 188.50m 3.14h 0.13d # see what it looks like in terms of number of annotations per DB: find ./parts -type f | while read F do zcat ${F} done | cut -f4 | sort | uniq -c | sort -n 165 anoCar1 2807 calJac1 3306 taeGut1 5416 petMar1 141256 tarSyr1 142346 vicPac1 163854 sorAra1 164475 galGal3 174150 felCat3 178531 oryCun1 178744 ornAna1 179511 turTru1 190622 eriEur1 191477 tupBel1 197338 panTro2 198063 speTri1 199541 micMur1 207391 ponAbe2 208629 rheMac2 208850 otoGar1 212751 myoLuc1 212857 dipOrd1 213343 proCap1 214972 echTel1 216367 monDom4 220724 ochPri2 223159 equCab2 227928 bosTau4 231351 cavPor3 231553 pteVam1 233980 mm9 234268 rn4 249016 canFam2 258191 xenTro2 315098 danRer5 365824 oryLat2 387739 fr2 423941 gasAcu1 549846 tetNig1 # load the resulting file ssh hgwdev cd /cluster/data/hg18/bed/multiz44way/frames find ./parts -type f | while read F do zcat ${F} done | sort -k1,1 -k2,2n | hgLoadMafFrames hg18 multiz44wayFrames stdin find ./parts -type f | while read F do zcat ${F} done | sort -k1,1 -k2,2n > multiz44wayFrames.bed featureBits -countGaps hg18 multiz44wayFrames.bed # 62315198 bases of 3107677273 (2.005%) in intersection featureBits -countGaps hg18 multiz28wayFrames # 48236360 bases of 3107677273 (1.552%) in intersection # enable the trackDb entries: # frames multiz44wayFrames # irows on # appears to work OK ######################################################################### # Phylogenetic tree from 44-way (2008-12-06 kate) # Extract 4-fold degenerate sites based on # of RefSeq Reviewed, coding ssh pk cd /hive/data/genomes/hg18/bed/multiz44way mkdir 4d cd 4d hgsql hg18 -Ne \ "select * from refGene,refSeqStatus where refGene.name=refSeqStatus.mrnaAcc and refSeqStatus.status='Reviewed' and mol='mRNA'" | cut -f 2-20 > refSeqReviewed.gp wc -l refSeqReviewed.gp #12684 refSeqReviewed.gp genePredSingleCover refSeqReviewed.gp stdout | sort > refSeqReviewedNR.gp wc -l refSeqReviewedNR.gp #7365 refSeqReviewedNR.gp mkdir run cd run # chopped up mafs version # run on swarm with -ram=8g cat > 4d.csh << 'EOF' set infile = $1 set outfile = $2 set c = `echo $1 | sed 's/^.*hg18_\(chr[^.][^.]*\).*.maf/\1/'` echo $c cd /scratch/tmp # 'clean' maf perl -wpe 's/^s ([^.]+)\.\S+/s $1/' $infile > $c.maf awk -v C=$c '$2 == C {print}' /cluster/data/hg18/bed/multiz44way/4d/refSeqReviewedNR.gp > $c.gp set PHASTBIN=/cluster/bin/phast.2008-11-30 $PHASTBIN/msa_view --4d --features $c.gp -i MAF $c.maf -o SS > $c.ss $PHASTBIN/msa_view -i SS --tuple-size 1 $c.ss > $outfile #rm -f $c.gp $c.maf $c.ss 'EOF' # whole chrom mafs version, using new version of # uses memory-efficient version of phast, from Melissa Hubisz at Cornell (mjhubisz@gmail.com) cat > 4d.csh << 'EOF' set c = $1 set infile = $2 set outfile = $3 echo $c cd /scratch/tmp # 'clean' maf perl -wpe 's/^s ([^.]+)\.\S+/s $1/' $infile > $c.maf awk -v C=$c '$2 == C {print}' /cluster/data/hg18/bed/multiz44way/4d/refSeqReviewedNR.gp > $c.gp set PHASTBIN=/cluster/bin/phast.2008-12-18 $PHASTBIN/msa_view --4d --features --do-cats 3 $c.gp -i MAF $c.maf -o SS > $c.ss $PHASTBIN/msa_view -i SS --tuple-size 1 $c.ss > $outfile rm -f $c.gp $c.maf $c.ss 'EOF' ls -1S /hive/data/genomes/hg18/bed/multiz44way/maf/*.maf | \ grep -v random | grep -v chrM | grep -v hap > in.lst cat << 
'EOF' > template #LOOP csh 4d.csh $(root1) {check in line+ $(path1)} {check out line+ /cluster/data/hg18/bed/multiz44way/4d/mfa/$(root1).mfa} #ENDLOOP 'EOF' # << this line makes emacs coloring happy cat << 'EOF' > template #LOOP csh 4d.csh $(root1) {check in line+ $(path1)} {check out line+ /cluster/data/hg18/bed/multiz44way/4d/mfa2/$(root1).mfa} #ENDLOOP 'EOF' # << this line makes emacs coloring happy gensub2 in.lst single template stdout | tac > jobList rm -fr /cluster/data/hg18/bed/multiz44way/4d/mfa mkdir /cluster/data/hg18/bed/multiz44way/4d/mfa para create jobList para try para check para push # combine mfa files cd .. sed -e "s/ /,/g" ../species.list > species.lst /cluster/bin/phast/msa_view --aggregate `cat species.lst` mfa/*.mfa | \ sed s/"> "/">"/ > 4d.all.mfa sed -e 's/,monDom4.*//' species.lst > placentals.lst /cluster/bin/phast/msa_view --aggregate `cat placentals.lst` mfa/*.mfa | \ sed s/"> "/">"/ > 4d.placentals.mfa # use phyloFit to create tree model (output is phyloFit.mod) set PHASTBIN=/cluster/bin/phast.2008-12-18 $PHASTBIN/phyloFit --EM --precision MED --msa-format FASTA --subst-mod REV --tree ../tree-commas.nh 4d.all.mfa # started at 5:50pm # ended at 7:27 => ~90 min on swarm mv phyloFit.mod phyloFit.all.mod grep TREE phyloFit.all.mod | sed 's/TREE\:\ //' > tree_4d.44way.nh $PHASTBIN/tree_doctor \ --prune=monDom4,ornAna1,taeGut1,petMar1,galGal3,anoCar1,xenTro2,gasAcu1,danRer5,tetNig1,fr2,oryLat2 \ tree_4d.44way.nh > tree_4d.44way.placental.nh # chrX-only for placental subset (requested by 2X project) set PHASTBIN=/cluster/bin/phast.2008-12-18 $PHASTBIN/phyloFit --EM --precision MED --msa-format FASTA --subst-mod REV --tree ../tree-commas.nh --out-root 4d.chrX mfa/chrX.mfa ############################################################################# # phastCons 44-way (DONE - 2008-12-23 - 2009-01-02 - Hiram) # split 44way mafs into 10M chunks and generate sufficient statistics # files for # phastCons ssh memk mkdir -p /hive/data/genomes/hg18/bed/multiz44way/cons/msa.split mkdir /hive/data/genomes/hg18/bed/multiz44way/cons/ss cd /hive/data/genomes/hg18/bed/multiz44way/cons/msa.split cat << '_EOF_' > doSplit.csh #!/bin/csh -ef set c = $1 set MAF = /hive/data/genomes/hg18/bed/multiz44way/maf/$c.maf set WINDOWS = /hive/data/genomes/hg18/bed/multiz44way/cons/ss/$c rm -fr $WINDOWS mkdir $WINDOWS pushd $WINDOWS > /dev/null twoBitToFa -seq=$c /hive/data/genomes/hg18/hg18.2bit hg18.$c.fa /cluster/bin/phast/$MACHTYPE/msa_split $MAF -i MAF \ -M hg18.$c.fa -o SS -r $WINDOWS/$c -w 10000000,0 -I 1000 -B 5000 rm -f hg18.$c.fa popd > /dev/null date >> $c.done '_EOF_' # << happy emacs chmod +x doSplit.csh cat << '_EOF_' > template #LOOP doSplit.csh $(root1) {check out line+ $(root1).done} #ENDLOOP '_EOF_' # << happy emacs # do the easy ones first to see some immediate results ls -1S -r ../maf | sed -e "s/.maf//" > maf.list gensub2 maf.list single template jobList para -ram=32g create jobList para try ... check ... etc # this takes a really long time. memk was down to 2 usable # machines - got it finished manually on a combination of hgwdevnew CPUs # and other machines # Estimate phastCons parameters # experimented with this as a parasol job on hgwdevnew to try a number # of SS files. 
With a command of: /cluster/bin/phast/x86_64/phyloFit -i SS ${SS} \ --tree "(((((((((((((((((hg18,panTro2),gorGor1),ponAbe2),rheMac2),calJac1),tarSyr1),(micMur1,otoGar1)),tupBel1),(((((mm9,rn4),dipOrd1),cavPor3),speTri1),(oryCun1,ochPri2))),(((vicPac1,(turTru1,bosTau4)),((equCab2,(felCat3,canFam2)),(myoLuc1,pteVam1))),(eriEur1,sorAra1))),(((loxAfr2,proCap1),echTel1),(dasNov2,choHof1))),monDom4),ornAna1),((galGal3,taeGut1),anoCar1)),xenTro2),(((tetNig1,fr2),(gasAcu1,oryLat2)),danRer5)),petMar1)" \ --out-root=$OUT/starting_tree # running over the input files ../ss/*/*.ss results to #.../genomes/hg18/bed/multiz44way/cons/startingTree/result/*/starting-tree.mod # add up the C and G: find ./result -type f | xargs ls -rt | while read F do D=`dirname $F` echo -n `basename $D`" - " grep BACKGROUND ${F} | awk '{printf "%0.3f\n", $3 + $4;}' done # counting number of species seen in the maf file: find ./result -type f | xargs ls -rt | while read F do D=`dirname $F` echo -n `basename $D`" - " grep TREE $F | sed -e \ "s/TREE: //; s/(//g; s/)//g; s/[0-9].[0-9][0-9][0-9][0-9][0-9][0-9]//g; s/://g" | tr ',' '\n' | wc -l done # Run phastCons # This job is I/O intensive in its output files, thus it is all # working over in /scratch/tmp/ ssh swarm mkdir -p /hive/data/genomes/hg18/bed/multiz44way/cons/run.cons cd /hive/data/genomes/hg18/bed/multiz44way/cons/run.cons # there are going to be several different phastCons runs using # this same script. They trigger off of the current working directory # $cwd:t which is the "grp" in this script. It is one of: # all euarchontogliers placentals cat << '_EOF_' > doPhast.csh #!/bin/csh -fe set PHASTBIN = /cluster/bin/phast/x86_64 set c = $1 set f = $2 set len = $3 set cov = $4 set rho = $5 set grp = $cwd:t set cons = /hive/data/genomes/hg18/bed/multiz44way/cons set tmp = $cons/tmp/$f mkdir -p $tmp set ssSrc = $cons if (-s $cons/$grp/$grp.non-inf) then ln -s $cons/$grp/$grp.mod $tmp ln -s $cons/$grp/$grp.non-inf $tmp ln -s $ssSrc/ss/$c/$f.ss $tmp ln -s $cons/$grp/$grp.mod $tmp ln -s $cons/$grp/$grp.non-inf $tmp else ln -s $ssSrc/ss/$c/$f.ss $tmp ln -s $cons/$grp/$grp.mod $tmp endif pushd $tmp > /dev/null if (-s $grp.non-inf) then $PHASTBIN/phastCons $f.ss $grp.mod \ --rho $rho --expected-length $len --target-coverage $cov --quiet \ --not-informative `cat $grp.non-inf` \ --seqname $c --idpref $c --most-conserved $f.bed --score > $f.pp else $PHASTBIN/phastCons $f.ss $grp.mod \ --rho $rho --expected-length $len --target-coverage $cov --quiet \ --seqname $c --idpref $c --most-conserved $f.bed --score > $f.pp endif popd > /dev/null mkdir -p pp/$c bed/$c sleep 4 touch pp/$c bed/$c rm -f pp/$c/$f.pp rm -f bed/$c/$f.bed mv $tmp/$f.pp pp/$c mv $tmp/$f.bed bed/$c rm -fr $tmp '_EOF_' # << happy emacs chmod a+x doPhast.csh # this template will serve for all runs # root1 == chrom name, file1 == ss file name without .ss suffix cat << '_EOF_' > template #LOOP ../run.cons/doPhast.csh $(root1) $(file1) 45 0.3 0.3 {check out line+ bed/$(root1)/$(file1).bed} #ENDLOOP '_EOF_' # << happy emacs # Create parasol batch and run it ls -1 ../ss/chr*/chr*.ss | sed 's/.ss$//' > ss.list # run for all species cd /hive/data/genomes/hg18/bed/multiz44way/cons mkdir -p all cd all # Using Kate's .mod tree cp -p ../../4d/44way.all.mod ./all.mod gensub2 ../run.cons/ss.list single ../run.cons/template jobList para -ram=8g create jobList para try ... check ... push ... etc. 
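    # optional sanity check (a sketch, not in the original run log; assumes the
    # pp/ and bed/ output directories that doPhast.csh creates in this batch dir)
    # every chunk in ss.list should have produced one .pp and one .bed file:
    wc -l ../run.cons/ss.list
    find pp -type f -name '*.pp' | wc -l
    find bed -type f -name '*.bed' | wc -l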
XXX - running Tue Jan 13 22:19:21 PST 2009 # Completed: 322 of 322 jobs # CPU time in finished jobs: 47406s 790.10m 13.17h 0.55d 0.002 y # IO & Wait Time: 29902s 498.37m 8.31h 0.35d 0.001 y # Average job time: 240s 4.00m 0.07h 0.00d # Longest finished job: 354s 5.90m 0.10h 0.00d # Submission to last job: 536s 8.93m 0.15h 0.01d # create Most Conserved track cd /hive/data/genomes/hg18/bed/multiz44way/cons cat bed/*/chr*.bed | sort -k1,1 -k2,2n | \ awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", $1, $2, $3, $5, $5;}' | \ /cluster/bin/scripts/lodToBedScore /dev/stdin > mostConserved.bed # ~ 1 minute # load into database ssh hgwdev cd /hive/data/genomes/hg18/bed/multiz44way/cons/all time nice -n +19 hgLoadBed hg18 phastConsElements44way mostConserved.bed # Loaded 4878296 elements of size 5 # real 2m3.414s # Try for 5% overall cov, and 70% CDS cov # --rho 0.3 --expected-length 45 --target-coverage 0.3 featureBits hg18 -enrichment refGene:cds phastConsElements44way # refGene:cds 1.144%, mostConserved.bed 4.973%, # both 0.854%, cover 74.62%, enrich 15.01x # --rho .31 --expected-length 45 --target-coverage .3 # refGene:cds 1.144%, phastConsElements44way 4.706%, # both 0.824%, cover 72.07%, enrich 15.31x # --rho 0.3 --expected-length 45 --target-coverage 0.3 featureBits hg18 -enrichment knownGene:cds phastConsElements44way # knownGene:cds 1.205%, mostConserved.bed 4.973%, # both 0.874%, cover 72.55%, enrich 14.59x # --rho .31 --expected-length 45 --target-coverage .3 # knownGene:cds 1.205%, phastConsElements44way 4.706%, # both 0.844%, cover 70.05%, enrich 14.88x featureBits hg18 -enrichment refGene:cds phastConsElements28way # refGene:cds 1.144%, phastConsElements28way 4.920%, # both 0.858%, cover 74.96%, enrich 15.24x featureBits hg18 -enrichment knownGene:cds phastConsElements28way # knownGene:cds 1.205%, phastConsElements28way 4.920%, # both 0.878%, cover 72.88%, enrich 14.81x # Create merged posterier probability file and wiggle track data files cd /hive/data/genomes/hg18/bed/multiz44way/cons/all cat << '_EOF_' > gzipAscii.sh #!/bin/sh TOP=`pwd` export TOP mkdir -p downloads for D in pp/chr* do C=${D/pp\/} out=downloads/${C}.phastCons44way.wigFix.gz echo "${D} > ${C}.phastCons44way.wigFix.gz" ls $D/*.pp | sort -n -t\. -k2 | xargs cat | \ gzip > ${out} done '_EOF_' # << happy emacs chmod +x gzipAscii.sh time nice -n +19 ./gzipAscii.sh # real 30m7.228s # encode those files into wiggle data zcat downloads/*.wigFix.gz \ | wigEncode stdin phastCons44way.wig phastCons44way.wib # Converted stdin, upper limit 1.00, lower limit 0.00 # real 22m54.291s # Load gbdb and database with wiggle. 
ssh hgwdev cd /hive/data/genomes/hg18/bed/multiz44way/cons/all ln -s `pwd`/phastCons44way.wib /gbdb/hg18/multiz44way/phastCons44way.wib time nice -n +19 hgLoadWiggle -pathPrefix=/gbdb/hg18/multiz44way hg18 \ phastCons44way phastCons44way.wig # real 1m13.681s # Create histogram to get an overview of all the data ssh hgwdev cd /hive/data/genomes/hg18/bed/multiz44way/cons/all time nice -n +19 hgWiggle -doHistogram \ -hBinSize=0.001 -hBinCount=1000 -hMinVal=0.0 -verbose=2 \ -db=hg18 phastCons44way > histogram.data 2>&1 # real 8m6.841s # create plot of histogram: cat << '_EOF_' | gnuplot > histo.png set terminal png small color x000000 xffffff xc000ff x66ff66 xffff00 x00ffff set size 1.4, 0.8 set key left box set grid noxtics set grid ytics set title " Human Hg18 Histogram phastCons44way track" set xlabel " phastCons44way score" set ylabel " Relative Frequency" set y2label " Cumulative Relative Frequency (CRF)" set y2range [0:1] set y2tics set yrange [0:0.02] plot "histogram.data" using 2:5 title " RelFreq" with impulses, \ "histogram.data" using 2:7 axes x1y2 title " CRF" with lines '_EOF_' # << happy emacs display histo.png & ######################################################################## ### Create a phastCons data set for Primates # setup primates-only run ssh swarm mkdir /hive/data/genomes/hg18/bed/multiz44way/cons/primates cd /hive/data/genomes/hg18/bed/multiz44way/cons/primates # primates-only: exclude all but these for phastCons tree: /cluster/bin/phast/x86_64/tree_doctor ../all/all.mod \ --prune-all-but=hg18,panTro2,gorGor1,ponAbe2,rheMac2,calJac1,tarSyr1,micMur1,otoGar1 \ > primates.mod # and place the removed ones in the non-inf file so phastCons will # truly ignore them: echo "tupBel1,mm9,rn4,dipOrd1,cavPor3,speTri1,oryCun1,ochPri2,vicPac1,turTru1,bosTau4,equCab2,felCat3,canFam2,myoLuc1,pteVam1,eriEur1,sorAra1,loxAfr2,proCap1,echTel1,dasNov2,choHof1,monDom4,ornAna1,galGal3,taeGut1,anoCar1,xenTro2,tetNig1,fr2,gasAcu1,oryLat2,danRer5,petMar1" \ > primates.non-inf gensub2 ../run.cons/ss.list single ../run.cons/template jobList para -ram=8g create jobList para try ... check ... push ... etc. # bed/chr18_random/chr18_random.1-4262.bed is empty # bed/chr19_random/chr19_random.1-301858.bed is empty # bed/chr21/chr21.1-10000000.bed is empty # bed/chrM/chrM.1-16571.bed is empty # the jobs that fail have messages like this: # bed/chrM/chrM.1-16571.bed is empty # WARNING: No match for name "tupBel1" in alignment. # WARNING: No match for name "sorAra1" in alignment. 
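    # (sketch, not part of the original log) the crashed jobs are the chunks where
    # none of the primate sequences align; assuming they leave zero-length output,
    # they can be listed with:
    find bed -type f -size 0 -name '*.bed'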
# Completed: 318 of 322 jobs
# Crashed: 4 jobs
# CPU time in finished jobs: 20253s 337.54m 5.63h 0.23d 0.001 y
# IO & Wait Time: 33093s 551.56m 9.19h 0.38d 0.001 y
# Average job time: 168s 2.80m 0.05h 0.00d
# Longest finished job: 249s 4.15m 0.07h 0.00d
# Submission to last job: 282s 4.70m 0.08h 0.00d

    # create Most Conserved track
    cd /hive/data/genomes/hg18/bed/multiz44way/cons/primates
    cat bed/*/chr*.bed | sort -k1,1 -k2,2n | \
        awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", $1, $2, $3, $5, $5;}' | \
        /cluster/bin/scripts/lodToBedScore /dev/stdin > mostConserved.bed

    # load into database
    ssh hgwdev
    cd /hive/data/genomes/hg18/bed/multiz44way/cons/primates
    time nice -n +19 hgLoadBed hg18 phastConsElements44wayPrimates \
        mostConserved.bed
    # Loaded 808218 elements of size 5
    # real 0m16.817s

    # verify coverage
    featureBits hg18 phastConsElements44wayPrimates
    # 113268574 bases of 2881515245 (3.931%) in intersection
    # --rho 0.3 --expected-length 45 --target-coverage 0.3
    featureBits hg18 -enrichment refGene:cds phastConsElements44wayPrimates
    # refGene:cds 1.144%, phastConsElements44wayPrimates 4.222%,
    # both 0.756%, cover 66.07%, enrich 15.65x
    featureBits hg18 -enrichment knownGene:cds phastConsElements44wayPrimates
    # knownGene:cds 1.205%, phastConsElements44wayPrimates 4.222%,
    # both 0.769%, cover 63.84%, enrich 15.12x

    # Create the downloads .pp files, from which the phastCons wiggle data
    # is calculated
    # sort by chromName, chromStart so that items are in numerical order
    # for wigEncode
    cd /hive/data/genomes/hg18/bed/multiz44way/cons/primates
    mkdir downloads
    cat << '_EOF_' > gzipAscii.sh
#!/bin/sh
for D in pp/chr*
do
    C=${D/pp\//}
    ls $D/*.pp | sort -n -t\. -k2 | xargs cat | gzip -c \
        > downloads/${C}.primates.wigFix.gz
    echo $D $C
done
'_EOF_'
    # << happy emacs
    time nice -n +19 ./gzipAscii.sh
    # real 36m13.492s

    # Create merged posterior probability file and wiggle track data files
    zcat downloads/chr*.wigFix.gz \
        | wigEncode stdin phastCons44wayPrimates.wig phastCons44wayPrimates.wib
    # Converted stdin, upper limit 1.00, lower limit 0.00
    # real 24m15.688s

    ## load table with wiggle data
    ssh hgwdev
    cd /hive/data/genomes/hg18/bed/multiz44way/cons/primates
    ln -s `pwd`/phastCons44wayPrimates.wib \
        /gbdb/hg18/multiz44way/phastCons44wayPrimates.wib
    time nice -n +19 hgLoadWiggle -pathPrefix=/gbdb/hg18/multiz44way hg18 \
        phastCons44wayPrimates phastCons44wayPrimates.wig
    # real 0m48.942s

    # Create histogram to get an overview of all the data
    time nice -n +19 hgWiggle -doHistogram \
        -hBinSize=0.001 -hBinCount=1000 -hMinVal=0.0 -verbose=2 \
        -db=hg18 phastCons44wayPrimates > histogram.data 2>&1
    # real 5m50.154s

    # create plot of histogram:
    cat << '_EOF_' | gnuplot > histo.png
set terminal png small color \
    x000000 xffffff xc000ff x66ff66 xffff00 x00ffff xff0000
set size 1.4, 0.8
set key left box
set grid noxtics
set grid ytics
set title " Human Hg18 Histogram phastCons44wayPrimates track"
set xlabel " phastCons44wayPrimates score"
set ylabel " Relative Frequency"
set y2label " Cumulative Relative Frequency (CRF)"
set y2range [0:1]
set y2tics
set yrange [0:0.02]
plot "histogram.data" using 2:5 title " RelFreq" with impulses, \
     "histogram.data" using 2:7 axes x1y2 title " CRF" with lines
'_EOF_'
    # << happy emacs
    display histo.png &

########################################################################
### Create a phastCons data set for Euarchontoglires

    # setup euarchontoglires-only run
    ssh swarm
    cd /hive/data/genomes/hg18/bed/multiz44way/cons
    mkdir euarchontoglires
    cd euarchontoglires
    # euarchontoglires-only: exclude
all but these for phastCons tree: /cluster/bin/phast/x86_64/tree_doctor ../all/all.mod \ --prune-all-but=hg18,panTro2,gorGor1,ponAbe2,rheMac2,calJac1,tarSyr1,micMur1,otoGar1,tupBel1,mm9,rn4,dipOrd1,cavPor3,speTri1,oryCun1,ochPri2 \ > euarchontoglires.mod # and place the removed ones in the non-inf file so phastCons will # truly ignore them: echo "vicPac1,turTru1,bosTau4,equCab2,felCat3,canFam2,myoLuc1,pteVam1,eriEur1,sorAra1,loxAfr2,proCap1,echTel1,dasNov2,choHof1,monDom4,ornAna1,galGal3,taeGut1,anoCar1,xenTro2,tetNig1,fr2,gasAcu1,oryLat2,danRer5,petMar1" \ > euarchontoglires.non-inf gensub2 ../run.cons/ss.list single ../run.cons/template jobList para -ram=8g create jobList para try ... check ... push ... etc. # Two of these jobs fail to produce any output in the bed file: # I believe this is because there is a missing sequence in these files # compared to the ones specified in euarchontoglires.mod: # bed/chr18_random/chr18_random.1-4262.bed is empty # bed/chr19_random/chr19_random.1-301858.bed is empty # Completed: 320 of 322 jobs # Crashed: 2 jobs # CPU time in finished jobs: 25869s 431.14m 7.19h 0.30d 0.001 y # IO & Wait Time: 34404s 573.41m 9.56h 0.40d 0.001 y # Average job time: 188s 3.14m 0.05h 0.00d # Longest finished job: 272s 4.53m 0.08h 0.00d # Submission to last job: 309s 5.15m 0.09h 0.00d # create Most Conserved track cd /hive/data/genomes/hg18/bed/multiz44way/cons/euarchontoglires cat bed/*/chr*.bed | sort -k1,1 -k2,2n | \ awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", $1, $2, $3, $5, $5;}' | \ /cluster/bin/scripts/lodToBedScore /dev/stdin > mostConserved.bed # load into database ssh hgwdev cd /hive/data/genomes/hg18/bed/multiz44way/cons/euarchontoglires time nice -n +19 hgLoadBed hg18 phastConsElements44wayEuarch \ mostConserved.bed # Loaded 1623656 elements of size 5 # real 4m15.125s # verify coverage featureBits hg18 phastConsElements44wayEuarch # 109221588 bases of 2881515245 (3.790%) in intersection # --rho 0.3 --expected-length 45 --target-coverage 0.3 featureBits hg18 -enrichment refGene:cds phastConsElements44wayEuarch # refGene:cds 1.144%, mostConserved.bed 3.696%, # both 0.822%, cover 71.87%, enrich 19.45x # --rho 0.31 --expected-length 45 --target-coverage 0.3 # refGene:cds 1.144%, phastConsElements44wayEuarch 3.790%, # both 0.822%, cover 71.79%, enrich 18.94x # --rho 0.3 --expected-length 45 --target-coverage 0.3 featureBits hg18 -enrichment knownGene:cds phastConsElements44wayEuarch # knownGene:cds 1.205%, mostConserved.bed 3.696%, # both 0.839%, cover 69.59%, enrich 18.83x # --rho 0.31 --expected-length 45 --target-coverage 0.3 # knownGene:cds 1.205%, phastConsElements44wayEuarch 3.790%, # both 0.838%, cover 69.51%, enrich 18.34x # Create the downloads .pp files, from which the phastCons wiggle data # is calculated # sort by chromName, chromStart so that items are in numerical order # for wigEncode cd /hive/data/genomes/hg18/bed/multiz44way/cons/euarchontoglires mkdir downloads cat << '_EOF_' > gzipAscii.sh #!/bin/sh for D in pp/chr* do C=${D/pp\//} ls $D/*.pp | sort -n -t\. 
-k2 | xargs cat | gzip -c \
        > downloads/${C}.euarchontoglires.wigFix.gz
    echo $D $C
done
'_EOF_'
    # << happy emacs
    time nice -n +19 ./gzipAscii.sh
    # real 26m54.263s

    # Create merged posterior probability file and wiggle track data files
    zcat downloads/chr*.wigFix.gz \
        | wigEncode stdin phastCons44wayEuarch.wig phastCons44wayEuarch.wib
    # Converted stdin, upper limit 1.00, lower limit 0.00
    # real 18m15.693s

    ## load table with wiggle data
    ssh hgwdev
    cd /hive/data/genomes/hg18/bed/multiz44way/cons/euarchontoglires
    ln -s `pwd`/phastCons44wayEuarch.wib \
        /gbdb/hg18/multiz44way/phastCons44wayEuarch.wib
    time nice -n +19 hgLoadWiggle -pathPrefix=/gbdb/hg18/multiz44way hg18 \
        phastCons44wayEuarch phastCons44wayEuarch.wig
    # real 0m57.590s

    # Create histogram to get an overview of all the data
    time nice -n +19 hgWiggle -doHistogram \
        -hBinSize=0.001 -hBinCount=1000 -hMinVal=0.0 -verbose=2 \
        -db=hg18 phastCons44wayEuarch > histogram.data 2>&1
    # real 6m37.512s

    # create plot of histogram:
    cat << '_EOF_' | gnuplot > histo.png
set terminal png small color \
    x000000 xffffff xc000ff x66ff66 xffff00 x00ffff xff0000
set size 1.4, 0.8
set key left box
set grid noxtics
set grid ytics
set title " Human Hg18 Histogram phastCons44wayEuarch track"
set xlabel " phastCons44wayEuarch score"
set ylabel " Relative Frequency"
set y2label " Cumulative Relative Frequency (CRF)"
set y2range [0:1]
set y2tics
set yrange [0:0.02]
plot "histogram.data" using 2:5 title " RelFreq" with impulses, \
     "histogram.data" using 2:7 axes x1y2 title " CRF" with lines
'_EOF_'
    # << happy emacs
    display histo.png &

########################################################################
### Create a phastCons data set for Placentals

    # setup placental-only run
    ssh swarm
    mkdir /hive/data/genomes/hg18/bed/multiz44way/cons/placental
    cd /hive/data/genomes/hg18/bed/multiz44way/cons/placental
    # placental-only: exclude all but these for phastCons tree:
    /cluster/bin/phast/x86_64/tree_doctor ../all/all.mod \
        --prune-all-but=hg18,panTro2,gorGor1,ponAbe2,rheMac2,calJac1,tarSyr1,micMur1,otoGar1,tupBel1,mm9,rn4,dipOrd1,cavPor3,speTri1,oryCun1,ochPri2,vicPac1,turTru1,bosTau4,equCab2,felCat3,canFam2,myoLuc1,pteVam1,eriEur1,sorAra1,loxAfr2,proCap1,echTel1,dasNov2,choHof1 \
        > placental.mod
    # and place the removed ones in the non-inf file so phastCons will
    # truly ignore them:
    echo "monDom4,ornAna1,galGal3,taeGut1,anoCar1,xenTro2,tetNig1,fr2,gasAcu1,oryLat2,danRer5,petMar1" \
        > placental.non-inf
    gensub2 ../run.cons/ss.list single ../run.cons/template jobList
    para -ram=8g create jobList
    para try ... check ... push ... etc.
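    # (hedged sanity check, a sketch not in the original log) the species kept
    # in placental.mod plus the names placed in placental.non-inf should
    # account for all 44 assemblies:
    tr ',' '\n' < placental.non-inf | wc -l
    # 12 excluded here; 32 names in the --prune-all-but list above; 32 + 12 = 44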
# Two of these jobs fail to produce any output: # bed/chr18_random/chr18_random.1-4262.bed is empty # bed/chr19_random/chr19_random.1-301858.bed is empty # Completed: 320 of 322 jobs # Crashed: 2 jobs # CPU time in finished jobs: 38258s 637.63m 10.63h 0.44d 0.001 y # IO & Wait Time: 34704s 578.40m 9.64h 0.40d 0.001 y # Average job time: 228s 3.80m 0.06h 0.00d # Longest finished job: 313s 5.22m 0.09h 0.00d # Submission to last job: 1030s 17.17m 0.29h 0.01d # create Most Conserved track cat bed/*/chr*.bed | sort -k1,1 -k2,2n | \ awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", $1, $2, $3, $5, $5;}' | \ /cluster/bin/scripts/lodToBedScore /dev/stdin > mostConserved.bed # load into database ssh hgwdev cd /hive/data/genomes/hg18/bed/multiz44way/cons/placental time nice -n +19 hgLoadBed hg18 phastConsElements44wayPlacental \ mostConserved.bed # Loaded 3962527 elements of size 5 # real 3m28.564s # verify coverage featureBits hg18 phastConsElements44wayPlacental # 119635433 bases of 2881515245 (4.152%) in intersection # --rho 0.3 --expected-length 45 --target-coverage 0.3 featureBits hg18 -enrichment refGene:cds phastConsElements44wayPlacental # refGene:cds 1.144%, phastConsElements44wayPlacental 4.329%, # both 0.840%, cover 73.41%, enrich 16.96x featureBits hg18 -enrichment knownGene:cds phastConsElements44wayPlacental # knownGene:cds 1.205%, phastConsElements44wayPlacental 4.329%, # both 0.858%, cover 71.17%, enrich 16.44x # Create the downloads .pp files, from which the phastCons wiggle data # is calculated # sort by chromName, chromStart so that items are in numerical order # for wigEncode cd /hive/data/genomes/hg18/bed/multiz44way/cons/placental mkdir downloads cat << '_EOF_' > gzipAscii.sh #!/bin/sh for D in pp/chr* do C=${D/pp\//} ls $D/*.pp | sort -n -t\. -k2 | xargs cat | gzip -c \ > downloads/${C}.placental.wigFix.gz echo $D $C done done '_EOF_' # << happy emacs time nice -n +19 ./gzipAscii.sh # real 22m12.762s # Create merged posterier probability file and wiggle track data files zcat downloads/chr*.wigFix.gz \ | wigEncode stdin phastCons44wayPlacental.wig \ phastCons44wayPlacental.wib # Converted stdin, upper limit 1.00, lower limit 0.00 # real 37m20.176s ## load table with wiggle data ssh hgwdev cd /hive/data/genomes/hg18/bed/multiz44way/cons/placental ln -s `pwd`/phastCons44wayPlacental.wib \ /gbdb/hg18/multiz44way/phastCons44wayPlacental.wib time nice -n +19 hgLoadWiggle -pathPrefix=/gbdb/hg18/multiz44way hg18 \ phastCons44wayPlacental phastCons44wayPlacental.wig # real 1m16.900s # Create histogram to get an overview of all the data time nice -n +19 hgWiggle -doHistogram \ -hBinSize=0.001 -hBinCount=1000 -hMinVal=0.0 -verbose=2 \ -db=hg18 phastCons44wayPlacental > histogram.data 2>&1 # real 8m15.623s # create plot of histogram: cat << '_EOF_' | gnuplot > histo.png set terminal png small color x000000 xffffff xc000ff x66ff66 xffff00 x00ffff set size 1.4, 0.8 set key left box set grid noxtics set grid ytics set title " Human Hg18 Histogram phastCons44wayPlacental track" set xlabel " phastCons44wayPlacental score" set ylabel " Relative Frequency" set y2label " Cumulative Relative Frequency (CRF)" set y2range [0:1] set y2tics set yrange [0:0.02] plot "histogram.data" using 2:5 title " RelFreq" with impulses, \ "histogram.data" using 2:7 axes x1y2 title " CRF" with lines '_EOF_' # << happy emacs display histo.png & ######################################################################### # Update phastCons44way tables from Adam (DONE - 2009-05-22 - Hiram) mkdir 
/hive/data/genomes/hg18/bed/multiz44way/chrX.phastCons cd /hive/data/genomes/hg18/bed/multiz44way/chrX.phastCons mkdir primates cd primates wget --timestamping \ ftp://siepellab:siepellab@ftp.biotech.cornell.edu/2x/phastCons/primates/* cd .. mkdir placental cd placental wget --timestamping \ ftp://siepellab:siepellab@ftp.biotech.cornell.edu/2x/phastCons/placental/* cd .. mkdir all cd all wget --timestamping \ ftp://siepellab:siepellab@ftp.biotech.cornell.edu/2x/phastCons/all/* zcat all/*.wigFix.gz \ | wigEncode stdin phastCons44way_v2.wig phastCons44way_v2.wib zcat primates/*.wigFix.gz \ | wigEncode stdin phastCons44wayPrimates_v2.wig phastCons44wayPrimates_v2.wib zcat placental/*.wigFix.gz \ | wigEncode stdin phastCons44wayPlacental_v2.wig phastCons44wayPlacental_v2.wib ln -s `pwd`/*.wib /gbdb/hg18/multiz44way time nice -n +19 hgLoadWiggle -pathPrefix=/gbdb/hg18/multiz44way hg18 \ phastCons44way_v2 phastCons44way_v2.wig # real 0m43.022s time nice -n +19 hgLoadWiggle -pathPrefix=/gbdb/hg18/multiz44way hg18 \ phastCons44wayPrimates_v2 phastCons44wayPrimates_v2.wig # real 0m43.660s time nice -n +19 hgLoadWiggle -pathPrefix=/gbdb/hg18/multiz44way hg18 \ phastCons44wayPlacental_v2 phastCons44wayPlacental_v2.wig # real 0m44.607s time nice -n +19 hgLoadBed hg18 phastConsElements44way_v2 \ all/mostConserved.bed # Loaded 4779670 elements of size 5 # real 2m10.975s time nice -n +19 hgLoadBed hg18 phastConsElements44wayPrimates_v2 \ primates/mostConserved.bed # Loaded 785075 elements of size 5 # real 0m21.619s time nice -n +19 hgLoadBed hg18 phastConsElements44wayPlacental_v2 \ placental/mostConserved.bed # Loaded 3862854 elements of size 5 # real 1m41.223s # Create histogram to get an overview of all the data time nice -n +19 hgWiggle -doHistogram \ -hBinSize=0.001 -hBinCount=1000 -hMinVal=0.0 -verbose=2 \ -db=hg18 phastCons44wayPlacental_v2 > placental.histogram.data 2>&1 time nice -n +19 hgWiggle -doHistogram \ -hBinSize=0.001 -hBinCount=1000 -hMinVal=0.0 -verbose=2 \ -db=hg18 phastCons44wayPrimates_v2 > primates.histogram.data 2>&1 time nice -n +19 hgWiggle -doHistogram \ -hBinSize=0.001 -hBinCount=1000 -hMinVal=0.0 -verbose=2 \ -db=hg18 phastCons44way_v2 > vertebrate.histogram.data 2>&1 cat << '_EOF_' | gnuplot > placental.histo.png set terminal png small color x000000 xffffff xc000ff x66ff66 xffff00 x00ffff set size 1.4, 0.8 set key left box set grid noxtics set grid ytics set title " Human Hg18 Histogram phastCons44wayPlacental_v2 track" set xlabel " phastCons44wayPlacental_v2 score" set ylabel " Relative Frequency" set y2label " Cumulative Relative Frequency (CRF)" set y2range [0:1] set y2tics set yrange [0:0.02] plot "placental.histogram.data" using 2:5 title " RelFreq" with impulses, \ "placental.histogram.data" using 2:7 axes x1y2 title " CRF" with lines '_EOF_' # << happy emacs display placental.histo.png & cat << '_EOF_' | gnuplot > primates.histo.png set terminal png small color x000000 xffffff xc000ff x66ff66 xffff00 x00ffff set size 1.4, 0.8 set key left box set grid noxtics set grid ytics set title " Human Hg18 Histogram phastCons44wayPrimates_v2 track" set xlabel " phastCons44wayPrimates_v2 score" set ylabel " Relative Frequency" set y2label " Cumulative Relative Frequency (CRF)" set y2range [0:1] set y2tics set yrange [0:0.02] plot "primates.histogram.data" using 2:5 title " RelFreq" with impulses, \ "primates.histogram.data" using 2:7 axes x1y2 title " CRF" with lines '_EOF_' # << happy emacs display primates.histo.png & cat << '_EOF_' | gnuplot > vertebrate.histo.png set 
terminal png small color x000000 xffffff xc000ff x66ff66 xffff00 x00ffff set size 1.4, 0.8 set key left box set grid noxtics set grid ytics set title " Human Hg18 Histogram phastCons44way_v2 track" set xlabel " phastCons44way_v2 score" set ylabel " Relative Frequency" set y2label " Cumulative Relative Frequency (CRF)" set y2range [0:1] set y2tics set yrange [0:0.02] plot "vertebrate.histogram.data" using 2:5 title " RelFreq" with impulses, \ "vertebrate.histogram.data" using 2:7 axes x1y2 title " CRF" with lines '_EOF_' # << happy emacs display placental.histo.png & ######################################################################### # phyloP conservation for 44-way (2009-01-05 kate) # # Vertebrate, Placental # Also doing Euarchontoglire, since Hiram did # # Using newer scoring method LRT (replaces SPH), based # on scoring method experiments, above (compared to SCORE method). # Using phast from Adam's student Melissa Hubisz, with fixes needed for LRT scoring # Will replace with version from CVS if/when these fixes are integrated # PHAST version is 0.9.9.9b # split SS files into 1M chunks (tried 10M used for phastCons, and these # took 5hrs/chunk w/ LRT scoring) ssh swarm cd /cluster/data/hg18/bed/multiz44way mkdir consPhyloP cd consPhyloP mkdir ss run.split cd run.split cat << 'EOF' > doSplit.csh set c = $1 set d = /cluster/data/hg18/bed/multiz44way set in = $d/cons/ss set out = $d/consPhyloP/ss set PHASTBIN = /cluster/bin/phast.2008-12-18 @ i=0 foreach f (`ls $in/$c/*.ss | sort -n -t\. -k2`) @ i++ mkdir -p $out/$c/$i $PHASTBIN/msa_split $f -i SS -o SS \ -r $out/$c/$i/$c.$i -w 1000000,0 -I 1000 -B 5000 end echo "Done" >> $out/$c.done 'EOF' # << happy emacs set d = /cluster/data/hg18/bed/multiz44way/consPhyloP set JOBS = $d/run.split/jobList rm -f $JOBS touch $JOBS foreach c (`awk '{print $1}' /cluster/data/hg18/chrom.sizes`) echo "csh doSplit.csh $c {check out line+ $d/ss/$c.done}" >> $JOBS end para create jobList # 49 jobs para try para check para push para time # run phyloP with score=LRT ssh swarm cd /cluster/data/hg18/bed/multiz44way/consPhyloP mkdir run.phyloP cd run.phyloP # Adjust model file base composition background and rate matrix to be # representative of whole-genome (.41 -- as was done for ENCODE) # using utility, 'modFreqs' from PHAST package set PHASTBIN = /cluster/bin/phast.2008-12-18 set gc = `grep BACKGROUND /cluster/data/hg18/bed/multiz17way/cons/elliotsEncode.mod | awk '{printf "%0.3f\n", $3 + $4}'` echo $gc # .410 # NOTE: this corresponds well to Hiram's GC values from his phyloFit runs # on the 44-way ss files $PHASTBIN/modFreqs ../../4d/phyloFit.all.mod $gc > ../../4d/44way.all.mod # repeat for chrX only tree cd /cluster/data/hg18/bed/multiz44way/4d $PHASTBIN/modFreqs 4d.chrX.mod $gc > 44way.chrX.mod ln -s `pwd`/44way.chrX.mod /usr/local/apache/golenPath/hg18/phastCons44way cat > doPhyloP.csh << 'EOF' set f = $1 set out = $2 set c = $f:r:r set n = $f:r:e set tmp = /scratch/tmp/$f rm -fr $tmp mkdir -p $tmp cp -p /cluster/data/hg18/bed/multiz44way/consPhyloP/ss/$c/$n/$f.ss $tmp cp -p tree.mod $tmp pushd $tmp > /dev/null set PHASTBIN = /cluster/bin/phast.2008-12-18 $PHASTBIN/phyloP --method LRT --mode CONACC --wig-scores --chrom $c \ -i SS tree.mod $f.ss > $f.wig popd > /dev/null mkdir -p $out:h mv $tmp/$f.wig $out rm -fr $tmp 'EOF' # Create list of chunks pushd /cluster/data/hg18/bed/multiz44way/consPhyloP/ss ls chr*/*/chr*.*.ss | sed -e 's/.ss$//' -e 's/^\.\///' > \ /cluster/data/hg18/bed/multiz44way/consPhyloP/run.phyloP/in.list popd > /dev/null # need to fill 
in chr8, neglected in main run pushd /cluster/data/hg18/bed/multiz44way/consPhyloP/ss ls chr8/*/chr*.*.ss | sed -e 's/.ss$//' -e 's/^\.\///' > \ /cluster/data/hg18/bed/multiz44way/consPhyloP/run.phyloP/in.chr8.list popd > /dev/null # Create template file # file1 == $chr/$chunk/file name without .ss suffix cat > template << 'EOF' #LOOP csh ../doPhyloP.csh $(file1) {check out line+ wig/$(dir1)/$(file1).wig} #ENDLOOP 'EOF' # setup run for all species mkdir all cd all cp ../../../4d/44way.all.mod tree.mod rm -fr wig mkdir wig # << happy emacs gensub2 ../in.list single ../template jobList # 2823 jobs para create jobList para try para check para push para time #Completed: 2823 of 2823 jobs #CPU time in finished jobs: 4691641s 78194.02m 1303.23h 54.30d 0.149 y #IO & Wait Time: 171343s 2855.71m 47.60h 1.98d 0.005 y #Average job time: 1723s 28.71m 0.48h 0.02d #Longest finished job: 2451s 40.85m 0.68h 0.03d #Submission to last job: 6055s 100.92m 1.68h 0.07d ssh hgwdev cd /cluster/data/hg18/bed/multiz44way/consPhyloP/run.phyloP # check for clean dir here -- chr* will match garbage if it's there cat > listWig.csh << 'EOF' foreach c (`ls -d chr*`) foreach d (`ls -d $c/[1-9]* | sort -t/ -k2 -n`) ls -1 $d/*.wig | sort -n -t\. -k3 end end 'EOF' cd all/wig csh ../../listWig.csh | xargs cat | nice wigEncode stdin phyloP44wayAll.wig phyloP44wayAll.wib # Reloaded to include chr8 (2008-01-15 kate) #Converted stdin, upper limit 7.13, lower limit -15.41 # Load gbdb and database with wiggle. ln -s \ /cluster/data/hg18/bed/multiz44way/consPhyloP/run.phyloP/all/wig/phyloP44wayAll.wib \ /gbdb/hg18/multiz44way/phyloP44wayAll.wib hgLoadWiggle -pathPrefix=/gbdb/hg18/multiz44way hg18 phyloP44wayAll phyloP44wayAll.wig # placental-only: exclude all but these: cd /cluster/data/hg18/bed/multiz44way/4d set PHASTBIN = /cluster/bin/phast.2008-12-18 $PHASTBIN/tree_doctor 44way.all.mod \ --prune-all-but=hg18,panTro2,gorGor1,ponAbe2,rheMac2,calJac1,tarSyr1,\ micMur1,otoGar1,tupBel1,mm9,rn4,dipOrd1,cavPor3,speTri1,oryCun1,ochPri2,\ vicPac1,turTru1,bosTau4,equCab2,felCat3,canFam2,myoLuc1,pteVam1,eriEur1,\ sorAra1,loxAfr2,proCap1,echTel1,dasNov2,choHof1 \ > 44way.placental.mod cd ../consPhyloP/run.phyloP mkdir placental cd placental cp ../../../4d/44way.placental.mod tree.mod mkdir wig gensub2 ../in.list single ../template jobList # 2823 jobs para create jobList para try para check para push para time #Completed: 2823 of 2823 jobs #CPU time in finished jobs: 3358003s 55966.71m 932.78h 38.87d 0.106 y #IO & Wait Time: 142664s 2377.74m 39.63h 1.65d 0.005 y #Average job time: 1240s 20.67m 0.34h 0.01d #Longest finished job: 1781s 29.68m 0.49h 0.02d #Submission to last job: 4383s 73.05m 1.22h 0.05d # load wiggle ssh hgwdev cd /cluster/data/hg18/bed/multiz44way/consPhyloP/run.phyloP/placental/wig csh ../../listWig.csh | xargs cat | nice wigEncode stdin phyloP44wayPlacMammal.wig phyloP44wayPlacMammal.wib #Converted stdin, upper limit 3.46, lower limit -14.42 # Load gbdb and database with wiggle. 
ln -s \ /cluster/data/hg18/bed/multiz44way/consPhyloP/run.phyloP/placental/wig/phyloP44wayPlacMammal.wib \ /gbdb/hg18/multiz44way/phyloP44wayPlacMammal.wib hgLoadWiggle -pathPrefix=/gbdb/hg18/multiz44way hg18 phyloP44wayPlacMammal phyloP44wayPlacMammal.wig cd /cluster/data/hg18/bed/multiz44way/4d set PHASTBIN = /cluster/bin/phast.2008-12-18 $PHASTBIN/tree_doctor 44way.all.mod \ --prune-all-but=hg18,panTro2,gorGor1,ponAbe2,rheMac2,calJac1,tarSyr1,micMur1,otoGar1,tupBel1,mm9,rn4,dipOrd1,cavPor3,speTri1,oryCun1,ochPri2 \ > 44way.euarchontoglires.mod # euarchontoglires only: exclude all but these: cd ../consPhyloP/run.phyloP mkdir euarch cd euarch cp ../../../4d/44way.euarchontoglires.mod tree.mod mkdir wig gensub2 ../in.list single ../template jobList # 2823 jobs para create jobList para try para check para push para time #Completed: 2823 of 2823 jobs #CPU time in finished jobs: 1646910s 27448.49m 457.47h 19.06d 0.052 y #IO & Wait Time: 94310s 1571.84m 26.20h 1.09d 0.003 y #Average job time: 617s 10.28m 0.17h 0.01d #Longest finished job: 901s 15.02m 0.25h 0.01d #Submission to last job: 2127s 35.45m 0.59h 0.02d # process results and load wiggle ssh hgwdev cd /cluster/data/hg18/bed/multiz44way/consPhyloP/run.phyloP/euarch/wig csh ../../listWig.csh | xargs cat | nice wigEncode stdin phyloP44wayEuarch.wig phyloP44wayEuarch.wib #Converted stdin, upper limit 2.03, lower limit -9.78 ln -s \ /cluster/data/hg18/bed/multiz44way/consPhyloP/run.phyloP/euarch/wig/phyloP44wayEuarch.wib \ /gbdb/hg18/multiz44way/phyloP44wayEuarch.wib hgLoadWiggle -pathPrefix=/gbdb/hg18/multiz44way hg18 phyloP44wayEuarch phyloP44wayEuarch.wig # primates only: exclude all but these: cd /cluster/data/hg18/bed/multiz44way/4d set PHASTBIN = /cluster/bin/phast.2008-12-18 $PHASTBIN/tree_doctor 44way.all.mod \ --prune-all-but=hg18,panTro2,gorGor1,ponAbe2,rheMac2,calJac1,tarSyr1,micMur1,otoGar1 \ > 44way.primate.mod cd ../consPhyloP/run.phyloP mkdir primate cd primate cp ../../../4d/44way.primate.mod tree.mod mkdir wig gensub2 ../in.list single ../template jobList para create jobList # 2823 jobs para try para check para push # quick! para time #Completed: 2823 of 2823 jobs #CPU time in finished jobs: 895998s 14933.30m 248.89h 10.37d 0.028 y #IO & Wait Time: 66654s 1110.90m 18.52h 0.77d 0.002 y #Average job time: 341s 5.68m 0.09h 0.00d #Longest finished job: 503s 8.38m 0.14h 0.01d #Submission to last job: 1190s 19.83m 0.33h 0.01d # process results and load wiggle ssh hgwdev cd /cluster/data/hg18/bed/multiz44way/consPhyloP/run.phyloP/primate/wig csh ../../listWig.csh | xargs cat | nice wigEncode stdin phyloP44wayPrimate.wig phyloP44wayPrimate.wib #Converted stdin, upper limit 0.99, lower limit -8.17 ln -s /cluster/data/hg18/bed/multiz44way/consPhyloP/run.phyloP/primate/wig/phyloP44wayPrimate.wib /gbdb/hg18/multiz44way/phyloP44wayPrimate.wib hgLoadWiggle -pathPrefix=/gbdb/hg18/multiz44way hg18 phyloP44wayPrimate phyloP44wayPrimate.wig # get stats cd run.phyloP/all hgWiggle -db=hg18 -verbose=2 -doStats phyloP44wayAll > stats.out hgWiggle -db=hg18 -chr=chr20 -rawDataOut phyloP44wayAll | textHistogram -real stdin -minVal=-20 -maxBinCount=30 >&! histo.out cd ../placental hgWiggle -db=hg18 -verbose=2 -doStats phyloP44wayPlacMammal > stats.out hgWiggle -db=hg18 -chr=chr20 -rawDataOut phyloP44wayPlacMammal | textHistogram -real stdin -minVal=-20 -maxBinCount=30 >&! 
histo.out
cd ../euarch
hgWiggle -db=hg18 -verbose=2 -doStats phyloP44wayEuarch > stats.out
hgWiggle -db=hg18 -chr=chr20 -rawDataOut phyloP44wayEuarch | textHistogram -real stdin -minVal=-20 -maxBinCount=30 >&! histo.out
cd ../primate
hgWiggle -db=hg18 -verbose=2 -doStats phyloP44wayPrimate > stats.out
hgWiggle -db=hg18 -chr=chr20 -rawDataOut phyloP44wayPrimate | textHistogram -real stdin -minVal=-20 -maxBinCount=30 >&! histo.out

# Downloads
cd /cluster/data/hg18/bed/multiz44way/consPhyloP/run.phyloP
cat > listWigsByChrom.csh << 'EOF'
set c = $1
foreach d (`ls -d $c/[1-9]* | sort -t/ -k2 -n`)
    ls -1 $d/*.wig | sort -n -t\. -k3
end
'EOF'
cat > downloads.csh << 'EOF'
mkdir ../downloads
foreach c (`ls -d chr*`)
    echo $c
    csh ../../listWigsByChrom.csh $c > ../downloads/$c.lst
    csh ../../listWigsByChrom.csh $c | xargs cat | gzip -c > ../downloads/$c.$1.wigFix.gz
end
cd ../downloads
md5sum *.wigFix.gz > md5sum.txt
'EOF'
cd all/wig
csh ../../downloads.csh phyloP44way >&! downloads.log &
cd ../../placental/wig
csh ../../downloads.csh phyloP44way.placental >&! downloads.log &
cd ../../primate/wig
csh ../../downloads.csh phyloP44way.primate >&! downloads.log &

# create web downloads dir and add symlinks to files
cd ../../
mkdir downloads
cp /usr/local/apache/htdocs/goldenPath/hg18/phastCons44way/README.txt downloads
# edit
cd /usr/local/apache/htdocs/goldenPath/hg18/
mkdir phyloP44way
cd phyloP44way
ln -s /cluster/data/hg18/bed/multiz44way/consPhyloP/run.phyloP/downloads/README.txt .
mkdir vertebrate
ln -s /cluster/data/hg18/bed/multiz44way/consPhyloP/run.phyloP/all/downloads/{*.gz,md5sum.txt} vertebrate
mkdir placentalMammals
ln -s /cluster/data/hg18/bed/multiz44way/consPhyloP/run.phyloP/placental/downloads/{*.gz,md5sum.txt} placentalMammals
mkdir primates
ln -s /cluster/data/hg18/bed/multiz44way/consPhyloP/run.phyloP/primate/downloads/{*.gz,md5sum.txt} primates

# Lineage-specific runs
# uses --subtree option of phyloP
# name ancestor nodes
cd /cluster/data/hg18/bed/multiz44way/4d
set PHASTBIN = /cluster/bin/phast.2008-12-18
$PHASTBIN/tree_doctor 44way.all.mod --name-ancestors >44way.all-ancestors.mod
cd ../consPhyloP/run.phyloP
# built new PHAST package with fix from Adam for --subtree problems
sed -e 's/phyloP/phyloP --subtree=$3/' -e 's/phast.2008-12-18/phast.2009-01-26/' doPhyloP.csh > doPhyloPSubtree.csh
# visually inspect shell script
cat > template.subtree << 'EOF'
#LOOP
csh ../doPhyloPSubtree.csh $(file1) {check out line+ wig/$(dir1)/$(file1).wig} SUBTREE
#ENDLOOP
'EOF'

# primate lineage-specific
cd /cluster/data/hg18/bed/multiz44way/consPhyloP/run.phyloP
mkdir primate-ls
cd primate-ls
cp ../../../4d/44way.all-ancestors.mod tree.mod
mkdir wig
sed 's/SUBTREE/hg18-micMur1/' ../template.subtree > template.ls
gensub2 ../in.list single template.ls jobList
para create jobList
# 2823 jobs
para try
para check
para push
para time
#CPU time in finished jobs: 4949300s 82488.33m 1374.81h 57.28d 0.157 y
#IO & Wait Time: 143956s 2399.27m 39.99h 1.67d 0.005 y
#Average job time: 1805s 30.08m 0.50h 0.02d
#Longest finished job: 2780s 46.33m 0.77h 0.03d
#Submission to last job: 6447s 107.45m 1.79h 0.07d

# process results and load wiggle
ssh hgwdev
cd /cluster/data/hg18/bed/multiz44way/consPhyloP/run.phyloP/primate-ls/wig
csh ../../listWig.csh | xargs cat | nice wigEncode stdin phyloP44wayPrimateLs.wig phyloP44wayPrimateLs.wib
#Converted stdin, upper limit 3.91, lower limit -9.28
ln -s /cluster/data/hg18/bed/multiz44way/consPhyloP/run.phyloP/primate-ls/wig/phyloP44wayPrimateLs.wib /gbdb/hg18/multiz44way/phyloP44wayPrimateLs.wib
hgLoadWiggle -pathPrefix=/gbdb/hg18/multiz44way hg18 phyloP44wayPrimateLs phyloP44wayPrimateLs.wig

# glire lineage-specific
cd /cluster/data/hg18/bed/multiz44way/consPhyloP/run.phyloP
mkdir glire-ls
cd glire-ls
cp ../../../4d/44way.all-ancestors.mod tree.mod
mkdir wig
sed 's/SUBTREE/mm9-oryCun1/' ../template.subtree > template.ls
gensub2 ../in.list single template.ls jobList
para create jobList
# 2823 jobs
para try
para check
para push
para time
#CPU time in finished jobs: 5173192s 86219.87m 1437.00h 59.87d 0.164 y
#IO & Wait Time: 145615s 2426.91m 40.45h 1.69d 0.005 y
#Average job time: 1884s 31.40m 0.52h 0.02d
#Longest finished job: 2721s 45.35m 0.76h 0.03d
#Submission to last job: 6883s 114.72m 1.91h 0.08d

# process results and load wiggle
ssh hgwdev
cd /cluster/data/hg18/bed/multiz44way/consPhyloP/run.phyloP/glire-ls/wig
csh ../../listWig.csh | xargs cat | nice wigEncode stdin phyloP44wayGlireLs.wig phyloP44wayGlireLs.wib
#Converted stdin, upper limit 5.95, lower limit -6.99
ln -s /cluster/data/hg18/bed/multiz44way/consPhyloP/run.phyloP/glire-ls/wig/phyloP44wayGlireLs.wib /gbdb/hg18/multiz44way/phyloP44wayGlireLs.wib
hgLoadWiggle -pathPrefix=/gbdb/hg18/multiz44way hg18 phyloP44wayGlireLs phyloP44wayGlireLs.wig

#########################################################################
# Update phyloP44way tables from Adam Siepel, Melissa Hubisz at Cornell
# This version uses a different neutral tree model for chrX
# and will replace the original version as default view on the Conservation track
# ( 2009-06-30 kate)
    mkdir /hive/data/genomes/hg18/bed/multiz44way/chrX.phyloP
    cd /hive/data/genomes/hg18/bed/multiz44way/chrX.phyloP
    mkdir primates
    cd primates
    wget --timestamping ftp://siepellab:siepellab@ftp.biotech.cornell.edu/2x/phyloP/44way/primates/\*
    cd ..
    mkdir placental
    cd placental
    wget --timestamping ftp://siepellab:siepellab@ftp.biotech.cornell.edu/2x/phyloP/44way/placental/\*
    cd ..
    mkdir all
    cd all
    wget --timestamping ftp://siepellab:siepellab@ftp.biotech.cornell.edu/2x/phyloP/44way/all/\*
    cd ..
    zcat all/*.wigFix.gz | wigEncode stdin phyloP44way_v2.wig phyloP44way_v2.wib
    zcat primates/*.wigFix.gz | wigEncode stdin phyloP44wayPrimates_v2.wig phyloP44wayPrimates_v2.wib
    zcat placental/*.wigFix.gz | wigEncode stdin phyloP44wayPlacental_v2.wig phyloP44wayPlacental_v2.wib
    ln -s `pwd`/*.wib /gbdb/hg18/multiz44way
    time nice -n +19 hgLoadWiggle -pathPrefix=/gbdb/hg18/multiz44way hg18 phyloP44way_v2 phyloP44way_v2.wig
    time nice -n +19 hgLoadWiggle -pathPrefix=/gbdb/hg18/multiz44way hg18 phyloP44wayPrimates_v2 phyloP44wayPrimates_v2.wig
    time nice -n +19 hgLoadWiggle -pathPrefix=/gbdb/hg18/multiz44way hg18 phyloP44wayPlacental_v2 phyloP44wayPlacental_v2.wig

# Lineage specific phyloP
# These updated tables will appear in the Lineage Cons track
    mkdir glires-ls
    cd glires-ls
    wget --timestamping ftp://siepellab:siepellab@ftp.biotech.cornell.edu/2x/phyloP/44way/glires-ls/\*
    cd ..
    mkdir primates-ls
    cd primates-ls
    wget --timestamping ftp://siepellab:siepellab@ftp.biotech.cornell.edu/2x/phyloP/44way/primates-ls/\*
    cd ..
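    # optional integrity check before encoding (a sketch; only the directory
    # names above are from the build, everything else is assumed):
    for D in glires-ls primates-ls
    do
        echo "${D}: `ls ${D}/*.wigFix.gz | wc -l` wigFix files"
        gzip -t ${D}/*.wigFix.gz || echo "corrupt download in ${D}"
    done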
zcat glires-ls/*.wigFix.gz | wigEncode stdin phyloP44wayGliresLs_v2.wig phyloP44wayGliresLs_v2.wib zcat primates-ls/*.wigFix.gz | wigEncode stdin phyloP44wayPrimatesLs_v2.wig phyloP44wayPrimatesLs_v2.wib ln -s `pwd`/phyloP44wayGliresLs_v2.wib /gbdb/hg18/multiz44way nice -n +19 hgLoadWiggle -pathPrefix=/gbdb/hg18/multiz44way hg18 phyloP44wayGliresLs_v2 phyloP44wayGliresLs_v2.wig ln -s `pwd`/phyloP44wayPrimatesLs_v2.wib /gbdb/hg18/multiz44way nice -n +19 hgLoadWiggle -pathPrefix=/gbdb/hg18/multiz44way hg18 phyloP44wayPrimatesLs_v2 phyloP44wayPrimatesLs_v2.wig ###################################################################### # downloads for 44-way (DONE - 2009-01-09 - Hiram) mkdir -p /hive/data/genomes/hg18/bed/multiz44way/downloads/maf cd /hive/data/genomes/hg18/bed/multiz44way/downloads/maf # bash script #!/bin/sh for S in 1000 2000 5000 do echo "making upstream${S}.maf" featureBits hg18 refGene:upstream:${S} -fa=/dev/null -bed=stdout \ | perl -wpe 's/_up[^\t]+/\t0/' | sort -k1,1 -k2,2n \ | /cluster/bin/$MACHTYPE/mafFrags hg18 multiz44way \ stdin stdout \ -orgs=/hive/data/genomes/hg18/bed/multiz44way/species.list \ | gzip -c > upstream${S}.maf.gz echo "done upstream${S}.maf.gz" done cd /usr/local/apache/htdocs/goldenPath/hg18/multiz44way/maf ln -s /hive/data/genomes/hg18/bed/multiz44way/downloads/maf/up*.gz . md5sum up*.gz >> md5sum.txt mkdir /usr/local/apache/htdocs/goldenPath/hg18/phastCons44way cd /usr/local/apache/htdocs/goldenPath/hg18/phastCons44way mkdir placentalMammals primates vertebrate cd vertebrate ln -s /hive/data/genomes/hg18/bed/multiz44way/cons/all/downloads/* . cd ../placentalMammals ln -s /hive/data/genomes/hg18/bed/multiz44way/cons/placental/downloads/* . cd ../primates ln -s /hive/data/genomes/hg18/bed/multiz44way/cons/primates/downloads/* . cd .. ln -s /hive/data/genomes/hg18/bed/multiz44way/cons/all/all.mod \ vertebrate.mod ln -s /hive/data/genomes/hg18/bed/multiz44way/cons/primates/primates.mod . ln -s /hive/data/genomes/hg18/bed/multiz44way/cons/placental/placental.mod \ ./placentalMammals.mod ln -s \ /hive/data/genomes/hg18/bed/multiz44way/downloads/phastCons44way/README.txt . 
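    # (sketch, not in the original log) before filling out the pushQ entry
    # below, confirm none of the download symlinks just created are dangling:
    find /usr/local/apache/htdocs/goldenPath/hg18/phastCons44way \
        /usr/local/apache/htdocs/goldenPath/hg18/multiz44way \
        -type l ! -exec test -e {} \; -print
    # no output expected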
# pushQ MySQL tables: phastCons44way, phastCons44wayPlacental, phastCons44wayPrimates, multiz44way, multiz44wayFrames, multiz44waySummary, phastConsElements44way, phastConsElements44wayPlacental, phastConsElements44wayPrimates, phyloP44wayAll, phyloP44wayPlacMammal, phyloP44wayPrimate # pushQ files: /gbdb/hg18/multiz44way/maf/* /gbdb/hg18/multiz44way/phastCons44way.wib /gbdb/hg18/multiz44way/phastCons44wayPlacental.wib /gbdb/hg18/multiz44way/phastCons44wayPrimates.wib /gbdb/hg18/multiz44way/phyloP44wayAll.wib /gbdb/hg18/multiz44way/phyloP44wayPlacMammal.wib /gbdb/hg18/multiz44way/phyloP44wayPrimate.wib /usr/local/apache/htdocs/goldenPath/hg18/phastCons44way/vertebrate/* /usr/local/apache/htdocs/goldenPath/hg18/phastCons44way/primates/* /usr/local/apache/htdocs/goldenPath/hg18/phastCons44way/placentalMammals/* /usr/local/apache/htdocs/goldenPath/hg18/phastCons44way/*.mod /usr/local/apache/htdocs/goldenPath/hg18/phastCons44way/README.txt /usr/local/apache/htdocs/goldenPath/hg18/multiz44way/maf/* /usr/local/apache/htdocs/goldenPath/hg18/multiz44way/alignments/ /usr/local/apache/htdocs/goldenPath/hg18/multiz44way/*.nh /usr/local/apache/htdocs/goldenPath/hg18/multiz44way/README.txt /usr/local/apache/htdocs/goldenPath/hg18/phyloP44way/vertebrate/* /usr/local/apache/htdocs/goldenPath/hg18/phyloP44way/placentalMammals/* /usr/local/apache/htdocs/goldenPath/hg18/phyloP44way/primate/* # MySQL tables: 5,624,932,756 = 5,364 Mb # gbdb files: 271,318,361,985 = 258,749 Mb # apache htdocs: 58,767,852,372 = 56,045 Mb # Total 335,711,147,113 = 320,159 Mb # An extra set of error corrected MAF's from the Siepel lab: mkdir /hive/data/genomes/hg18/bed/multiz44way/errorCorrectedMafs wget --timestamping \ "ftp://siepellab:XXXXXX@ftp.biotech.cornell.edu/2x/maf-ec/*" # not showing the password here on purpose # verify md5sums: md5sum *.maf.gz > md5sum.here diff md5sum.txt md5sum.here # no difference rm md5sum.here mkdir \ /usr/local/apache/htdocs/goldenPath/hg18/multiz44way/SiepelLabCorrectedMafs cd \ /usr/local/apache/htdocs/goldenPath/hg18/multiz44way/SiepelLabCorrectedMafs ln -s /hive/data/genomes/hg18/bed/multiz44way/errorCorrectedMafs/* . ######################################################################### # Create Syntenic and Recip Best net files to load into tracks to view # on the browser to see what was used during the multiple alignment cd /hive/data/genomes/hg18/bed/blastz.gorGor1/axtChain netClass -verbose=0 -noAr hg18.gorGor1.rbest.net.gz hg18 gorGor1 stdout \ | gzip -c > netRBestGorGor1.net.gz hgLoadNet hg18 netRBestGorGor1 netRBestGorGor1.net.gz cd /hive/data/genomes/hg18/bed/blastz.ponAbe2/axtChain hgLoadNet hg18 netSyntenyPonAbe2 hg18.ponAbe2.syn.net.gz cd /hive/data/genomes/hg18/bed/blastz.calJac1/axtChain netClass -verbose=0 -noAr hg18.calJac1.rbest.net.gz hg18 calJac1 stdout \ | gzip -c > netRBestCalJac1.net.gz hgLoadNet hg18 netRBestCalJac1 netRBestCalJac1.net.gz cd /hive/data/genomes/hg18/bed/blastz.tarSyr1/axtChain netClass -verbose=0 -noAr hg18.tarSyr1.rbest.net.gz hg18 tarSyr1 stdout \ | gzip -c > netRBestTarSyr1.net.gz hgLoadNet hg18 netRBestTarSyr1 netRBestTarSyr1.net.gz ######################################################################### # EIO/JCVI NAS TRACK (2008-11-25 Fan) # Contact: Gaetano Gargiulo [gaetano.gargiulo@ifom-ieo-campus.it] cd /hive/data/genomes/hg18/bed mkdir eioJcviNAS cd eioJcviNAS # receive the doc and two bed files and put them there. 
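    # (sketch) quick sanity check of the received files before loading; the two
    # file names follow the hgLoadBed commands below:
    for F in HG18_NAS_CD34_neg.bed HG18_NAS_CD34_pos.bed
    do
        echo "${F}: `grep -vc description ${F}` data lines"
        head -2 ${F}
    done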
fgrep -v description HG18_NAS_CD34_neg.bed| \ cut -f 1-3 |hgLoadBed -noBin hg18 eioJcviNASNeg stdin checkTableCoords -table=eioJcviNASNeg hg18 fgrep -v description HG18_NAS_CD34_pos.bed| \ cut -f 1-3 |hgLoadBed -noBin hg18 eioJcviNASPos stdin checkTableCoords -table=eioJcviNASPos hg18 # Create the description file, eioJcviNAS.html, according to # the latest doc file from Gaetano. # # Add the two composite sub-tracks to human/hg18/trackDb.ra. ######################################################################### # hgPal downloads (DONE braney 2008-12-07) # FASTA from 44way for refGene, knownGene, knownCanonical ssh hgwdev screen bash rm -rf /cluster/data/hg18/bed/multiz44way/pal mkdir /cluster/data/hg18/bed/multiz44way/pal cd /cluster/data/hg18/bed/multiz44way/pal echo hg18 | cat - /cluster/data/hg18/bed/multiz44way/ordered.list > order.lst mz=multiz44way gp=refGene db=hg18 mkdir exonAA exonNuc ppredAA ppredNuc for j in `sort -nk 2 /cluster/data/$db/chrom.sizes | awk '{print $1}'` do echo "date" echo "mafGene -chrom=$j $db $mz $gp order.lst stdout | \ gzip -c > ppredAA/$j.ppredAA.fa.gz" echo "mafGene -chrom=$j -noTrans $db $mz $gp order.lst stdout | \ gzip -c > ppredNuc/$j.ppredNuc.fa.gz" echo "mafGene -chrom=$j -exons -noTrans $db $mz $gp order.lst stdout | \ gzip -c > exonNuc/$j.exonNuc.fa.gz" echo "mafGene -chrom=$j -exons $db $mz $gp order.lst stdout | \ gzip -c > exonAA/$j.exonAA.fa.gz" done > $gp.jobs time sh -x $gp.jobs > $gp.jobs.log 2>&1 & sleep 1 tail -f $gp.jobs.log # real 525m57.376s # user 25m36.072s # sys 7m41.565s ssh kolossus mz=multiz44way gp=refGene db=hg18 zcat exonAA/*.gz | gzip -c > $gp.$mz.exonAA.fa.gz zcat exonNuc/*.gz | gzip -c > $gp.$mz.exonNuc.fa.gz zcat ppredAA/*.gz | gzip -c > $gp.$mz.ppredAA.fa.gz zcat ppredNuc/*.gz | gzip -c > $gp.$mz.ppredNuc.fa.gz rm -rf exonAA exonNuc ppredAA ppredNuc # we're only distributing exons at the moment mz=multiz44way gp=refGene db=hg18 pd=/usr/local/apache/htdocs/goldenPath/$db/$mz/alignments mkdir -p $pd ln -s `pwd`/$gp.$mz.exonAA.fa.gz $pd/$gp.exonAA.fa.gz ln -s `pwd`/$gp.$mz.exonNuc.fa.gz $pd/$gp.exonNuc.fa.gz mz=multiz44way gp=knownGene db=hg18 mkdir exonAA exonNuc ppredAA ppredNuc for j in `sort -nk 2 /cluster/data/$db/chrom.sizes | awk '{print $1}'` do echo "date" echo "mafGene -chrom=$j $db $mz $gp order.lst stdout | \ gzip -c > ppredAA/$j.ppredAA.fa.gz" echo "mafGene -chrom=$j -noTrans $db $mz $gp order.lst stdout | \ gzip -c > ppredNuc/$j.ppredNuc.fa.gz" echo "mafGene -chrom=$j -exons -noTrans $db $mz $gp order.lst stdout | \ gzip -c > exonNuc/$j.exonNuc.fa.gz" echo "mafGene -chrom=$j -exons $db $mz $gp order.lst stdout | \ gzip -c > exonAA/$j.exonAA.fa.gz" done > $gp.$mz.jobs time sh -x $gp.$mz.jobs > $gp.$mz.job.log 2>&1 & sleep 1 tail -f $gp.$mz.job.log # real 442m46.735s # user 43m3.060s # sys 10m45.635s mz=multiz44way gp=knownGene db=hg18 zcat exonAA/c*.gz | gzip -c > $gp.$mz.exonAA.fa.gz zcat exonNuc/c*.gz | gzip -c > $gp.$mz.exonNuc.fa.gz zcat ppredAA/c*.gz | gzip -c > $gp.$mz.ppredAA.fa.gz zcat ppredNuc/c*.gz | gzip -c > $gp.$mz.ppredNuc.fa.gz rm -rf exonAA exonNuc ppredAA ppredNuc mz=multiz44way gp=knownGene db=hg18 pd=/usr/local/apache/htdocs/goldenPath/$db/$mz/alignments mkdir -p $pd ln -s `pwd`/$gp.$mz.exonAA.fa.gz $pd/$gp.exonAA.fa.gz ln -s `pwd`/$gp.$mz.exonNuc.fa.gz $pd/$gp.exonNuc.fa.gz # now do the canonical set cd /cluster/data/hg18/bed/multiz44way/pal mz=multiz44way gp=knownCanonical db=hg18 for j in `awk '{print $1}' /cluster/data/hg18/chrom.sizes` do echo "select chrom, chromStart,
chromEnd, transcript from knownCanonical where chrom='$j'" | hgsql $db | tail -n +2 > $j.known.bed done mkdir exonAA exonNuc ppredAA ppredNuc for j in `sort -nk 2 /cluster/data/$db/chrom.sizes | awk '{print $1}'` do echo "date" echo "mafGene -geneBeds=$j.known.bed $db $mz knownGene order.lst stdout | \ gzip -c > ppredAA/$j.ppredAA.fa.gz" echo "mafGene -geneBeds=$j.known.bed -noTrans $db $mz knownGene order.lst stdout | \ gzip -c > ppredNuc/$j.ppredNuc.fa.gz" echo "mafGene -geneBeds=$j.known.bed -exons -noTrans $db $mz knownGene order.lst stdout | \ gzip -c > exonNuc/$j.exonNuc.fa.gz" echo "mafGene -geneBeds=$j.known.bed -exons $db $mz knownGene order.lst stdout | \ gzip -c > exonAA/$j.exonAA.fa.gz" done > $gp.$mz.jobs time sh -x $gp.$mz.jobs > $gp.$mz.job.log 2>&1 & sleep 1 tail -f $gp.$mz.job.log # real 326m12.849s # user 17m40.850s # sys 3m59.648s rm *.known.bed mz=multiz44way gp=knownCanonical db=hg18 zcat exonAA/c*.gz | gzip -c > $gp.$mz.exonAA.fa.gz zcat exonNuc/c*.gz | gzip -c > $gp.$mz.exonNuc.fa.gz zcat ppredAA/c*.gz | gzip -c > $gp.$mz.ppredAA.fa.gz zcat ppredNuc/c*.gz | gzip -c > $gp.$mz.ppredNuc.fa.gz rm -rf exonAA exonNuc ppredAA ppredNuc mz=multiz44way gp=knownCanonical db=hg18 pd=/usr/local/apache/htdocs/goldenPath/$db/$mz/alignments mkdir -p $pd ln -s `pwd`/$gp.$mz.exonAA.fa.gz $pd/$gp.exonAA.fa.gz ln -s `pwd`/$gp.$mz.exonNuc.fa.gz $pd/$gp.exonNuc.fa.gz ######################################################################### # BUILD OMIM RELATED GENES TRACK (complete rebuild, 10/13/09 JK) ssh hgwdev cd /hive/data/genomes/gs.19/build36/bed mkdir omimGene cd omimGene # download the file morbidmap and genemap from OMIM mkdir omim cd omim wget --timestamping ftp://ftp.ncbi.nih.gov/repository/OMIM/morbidmap wget --timestamping ftp://ftp.ncbi.nih.gov/repository/OMIM/genemap cat genemap|sed -e 's/|/\t/g' > genemap.tab autoSql ~/src/hg/lib/omimGeneMap.as x cat x.sql |sed -e 's/PRIMARY KEY(numbering)/KEY(omimId)/' >omimGeneMap.sql hgLoadSqlTab -warn hg18 omimGeneMap omimGeneMap.sql genemap.tab # got warning on 3 records, just ignore them # Warning: load of omimGeneMap did not go as planned: 12216 record(s), 0 row(s) rm x.c x.h cd .. cat omim/morbidmap|sed -e 's/|/\t/g' > mobidmap.tab autoSql ~/src/hg/lib/omimMorbidMap.as x cat x.sql |sed -e 's/PRIMARY KEY(description)/KEY(omimId)/' >omimMorbidMap.sql hgLoadSqlTab -warn hg18 omimMorbidMap omimMorbidMap.sql mobidmap.tab # get all UCSC genes (from the knownGene table) that cross-reference to a RefSeq gene # that has a non-empty OMIM ID according to the refLink table. And use OMIM ID as # the gene name for this new table. Please note the alignId field still holds the KG ID. hgsql hg18 -N -e \ 'select omimId, kg.* from knownGene kg, knownToRefSeq kr, refLink l where omimId != 0 and mrnaAcc=kr.value and kg.name=kr.name ' \ |cut -f 1,3-13 >o1.tab # collect more OMIM related genes via the MIM external DB links from UniProt hgsql hg18 -N -e \ 'select extAC, kg.* from knownGene kg, kgXref k, proteome.spXref2 p where spId=p.accession and extDB="MIM" and kg.name=kgId ' \ |cut -f 1,3-13 >o2.tab # concatenate the above two gene sets and remove duplications. 
cat o1.tab o2.tab |sort -u >o3.tab # load the result into a temp table, fanO3 hgLoadSqlTab hg18 fanO3 ~/src/hg/lib/knownGene.sql o3.tab # while holding onto the OMIM ID, get the canonical gene (via the knownGene, knowIsoforms, # and knownCanonical tables) that represent a cluster which contains # initial OMIM gene in the fanO3 table hgsql hg18 -N -e \ 'select f3.name, kg.* from fanO3 f3, knownGene kg, knownCanonical c, knownIsoforms i where f3.alignId=i.transcript and kg.name=c.transcript and c.clusterId=i.clusterId'\ > o4.tab # first column is the OMIM ID cut -f 1 o4.tab >j1.tmp # col 3-13 is the gene structure of the canonical KG cut -f 3-13 o4.tab >j2.tmp # stitch them together and remove duplicates, load the result into fanO4 table paste j1.tmp j2.tmp |sort -u >fanO4.tab hgLoadSqlTab hg18 fanO4 ~/src/hg/lib/knownGene.sql fanO4.tab # finally sort the table and create bed 4 file and load it as the omimGene table hgsql hg18 -N -e 'select chrom, txStart, txEnd, name from fanO4 order by chrom, txStart, txEnd' |sort -u >omimGene.bed hgLoadBed hg18 omimGene omimGene.bed # create and load the omimToKnownCanonical table. hgsql hg18 -N -e 'select name, alignId from fanO4 order by name'\ > omimToKnownCanonical.tab hgLoadSqlTab hg18 omimToKnownCanonical \ ~/src/hg/lib/omimToKnownCanonical.sql omimToKnownCanonical.tab # The following clean up could be done. # hgsql hg18 -e 'drop table fanO3' # hgsql hg18 -e 'drop table fanO4' # rm j*.tmp # rm o1.tab o2.tab o3.tab o4.tab ############################################################################# # fox2ClipSeq from Gene Yeo (DONE - 2009-01-08 - Hiram) mkdir /hive/data/genomes/hg18/bed/fox2ClipSeq cd /hive/data/genomes/hg18/bed/fox2ClipSeq # lift the hg17 data to here liftOver -bedPlus=9 \ /hive/data/genomes/hg17/bed/fox2ClipSeq/forwardStrand.bed.gz \ /usr/local/apache/htdocs/goldenPath/hg17/liftOver/hg17ToHg18.over.chain.gz \ stdout forwardStrand.unMapped | gzip -c > forwardStrand.bed.gz liftOver -bedPlus=9 \ /hive/data/genomes/hg17/bed/fox2ClipSeq/reverseStrand.bed.gz \ /usr/local/apache/htdocs/goldenPath/hg17/liftOver/hg17ToHg18.over.chain.gz \ stdout reverseStrand.unMapped | gzip -c > reverseStrand.bed.gz # turn into wiggle density plot zcat forwardStrand.bed.gz | bedItemOverlapCount hg18 stdin \ | wigEncode stdin fox2ClipSeqDensityForwardStrand.wig \ fox2ClipSeqDensityForwardStrand.wib # Converted stdin, upper limit 2401.00, lower limit 1.00 zcat reverseStrand.bed.gz | bedItemOverlapCount hg18 stdin \ | wigEncode stdin fox2ClipSeqDensityReverseStrand.wig \ fox2ClipSeqDensityReverseStrand.wib # Converted stdin, upper limit 1406.00, lower limit 1.00 # and load tables zcat forwardStrand.bed.gz reverseStrand.bed.gz \ | hgLoadBed hg18 fox2ClipSeq stdin # Loaded 4418298 elements of size 9 ln -s `pwd`/*.wib /gbdb/hg18/wib hgLoadWiggle hg18 fox2ClipSeqDensityForwardStrand \ fox2ClipSeqDensityForwardStrand.wig hgLoadWiggle hg18 fox2ClipSeqDensityReverseStrand \ fox2ClipSeqDensityReverseStrand.wig # add composite track definitions to makeDb/trackDb/human/trackDb.ra ############################################################################# # REPEATMASKER - LATEST VERSION, 3.2.7 (DONE 1/30/09 rhubley and angie) # Robert Hubley ran the new and improved version (3.2.7) of RepeatMasker # but politely deferred to staff to load the results: mkdir /hive/data/genomes/hg18/bed/RMRunRMH cd /hive/data/genomes/hg18/bed/RMRunRMH doRepeatMasker.pl -stop mask -buildDir `pwd` hg18 # see do.log, cat.log # Angie loaded with new table name, chr*_rmskRM327. 
Used -debug to # make scripts, edited those. cd /hive/data/genomes/hg18/bed/RMRunRMH doRepeatMasker.pl -debug \ -continue install -buildDir `pwd` hg18 # Edit doLoad.csh: change table names: rmsk -> rmskRM327, # nestedRepeats -> nestedRepeatsRM327 ./doLoad.csh >& load.log & tail -f load.log # Edit doSplit.csh: change -ending to .RM327.fa.out ./doSplit.csh >& split.log & tail -f split.log doRepeatMasker.pl -continue cleanup -buildDir `pwd` \ -fileServer hgwdev hg18 >& cleanup.log & tail -f cleanup.log # Compare coverage to original RepeatMasker run: featureBits hg18 rmskRM327 #1457032101 bases of 2881515245 (50.565%) in intersection featureBits hg18 rmsk #1406290513 bases of 2881515245 (48.804%) in intersection # Wow, Arian got his 50%! :) # Compare Alu counts, since that is supposed to be an area of improvement: grep SINE/Alu hg18.fa.out | wc -l #1186885 ls /hive/data/genomes/hg18/?{,?}{,_*_hap[12]}/chr[0-9XYM]{,[0-9]}{,_random,*_hap[12]}.fa.out \ | uniq | xargs grep SINE/Alu | wc -l #1189976 # A decrease... weird. OK, breaking it down chrom-by-chrom, the _random's # have fewer and the regular chrom's have more Alu's. Sounds OK to me :) featureBits hg18 rmsk \!rmskRM327 #12318974 bases of 2881515245 (0.428%) in intersection featureBits hg18 rmskRM327 \!rmsk #63060562 bases of 2881515245 (2.188%) in intersection # hgTables: 49,804 rmskRM327 items (4,805,535 bases) have no overlap with rmsk # Added download file 2/5/09: cd /hive/data/genomes/hg18 zip -j bigZips/chromOut.RM3.2.7.zip */chr*.RM327.fa.out ln -s /hive/data/genomes/hg18/bigZips/chromOut.RM3.2.7.zip \ /usr/local/apache/htdocs/goldenPath/hg18/bigZips/ ############################################################################# # GENOME VARIANTS - adding AK1, Saqqaq(Eskimo), Quake, Tutu, + Bushmen # also adding phenotype information for those from PSU # Mar 8, 2010 Load from exports from PSU Browser. Merge needed code changes from PSU Browser for phenotype. ############################################################################# # GENOME VARIANTS - 1000 GENOMES (DONE 1/7/2009 giardine, adapted from an email to angie) # December release from 1000 Genomes: SNP calls on four of the 6 high-cov # individuals: a CEU trio and a YRI daughter. 
# see ftp://ftp-trace.ncbi.nih.gov/1000genomes/release/2008_12/README_December2008_release cd /hive/data/genomes/hg18/bed/pgSnp/ cat > trio2pg.pl <<'EOF' #!/usr/bin/perl -w use strict; #split out individual SNPs from trio file #format:chr loc ref alleles snp.Q av.max.map.Q depth.cov NA12891 NA12891.Q NA12892 NA12892.Q NA12878 NA12878.Q hwe maf tdt display my $ac = shift @ARGV; #allele column, zero based if (!$ac) { print "Usage: trio2pg.pl alleleColumn# < infile > outfile\n"; exit; } while (<>) { chomp; my @f = split(/\t/); if ($f[0] eq 'chr') { next; } $f[$ac] =~ s/([ATGC])\/\1/$1/; if ($f[$ac] eq uc($f[2])) { next; } #reference allele only print "chr$f[0]\t", ($f[1]-1), "\t$f[1]\t$f[$ac]\t"; my $c = ($f[$ac] =~ tr/\//\//) + 1; my $s = $f[$ac+1]; if ($s !~ /\//) { for (my $i = 1; $c > $i; $i++) { $s .= ",$f[$ac+1]"; } }else { $s =~ s/\//,/g; if ($c == 1) { $s =~ s/,.*//; } } my $n = "0"; for (my $i = 1; $c > $i; $i++) { $n .= ",0"; } #allele count print "$c\t$n\t$s\n"; } exit; 'EOF' # << emacs chmod a+x trio2pg.pl #convert to pgSnp set relDir = /hive/data/outside/1000genomes/ncbi/ftp-trace.ncbi.nih.gov/1000genomes/release/2008_12/ zcat $relDir/CEU.trio.dec.with.x.with.rs.calls.gz | trio2pg.pl 7 > NA12891.pgSnp zcat $relDir/CEU.trio.dec.with.x.with.rs.calls.gz | trio2pg.pl 9 > NA12892.pgSnp zcat $relDir/CEU.trio.dec.with.x.with.rs.calls.gz | trio2pg.pl 11 > NA12878.pgSnp zcat $relDir/YRI.child.dec.intersect.calls.gz | trio2pg.pl 7 > NA19240.pgSnp #gff for indels does not give nts, can't put in pgSnp format # 9/25/09: use samtools pileup to add base counts back in to those files. cat > addCounts.pl <<'_EOF_' #!/usr/bin/env perl use warnings; use strict; my $sample = shift @ARGV; my $bamTemplate = shift @ARGV; if (! (defined $sample && defined $bamTemplate)) { die "Usage: $0 sampleId bamTemplate [pgSnpFile]\n"; } my $prevChr; my ($bamFile, $PLUP); while (<>) { my ($chr, $s, $e, $alleles, $aCount, $baseCounts, $quals) = split("\t"); # New chrom? open pipe from samtools pileup: if (!defined $prevChr || $prevChr ne $chr) { close ($PLUP) if (defined $PLUP); (my $c = $chr) =~ s/^chr//; ($bamFile = $bamTemplate) =~ s/__S__/$sample/g; $bamFile =~ s/__C__/$c/; if (-e $bamFile) { my $pileupPipe = "samtools pileup $bamFile |"; warn "Opening '$pileupPipe'\n"; open($PLUP, $pileupPipe) || die "Can't open pipe '$pileupPipe': $!\n"; } else { warn "bamFile '$bamFile' does not exist"; $PLUP = undef; } } # Fast-forward to pileup line corresponding to this pgSnp line: if (defined $PLUP) { my ($pc, $ps, undef, $depth, $bases); do { ($pc, $ps, undef, $depth, $bases) = split("\t", <$PLUP>); if (defined $pc) { die "Unexpected chrom '$pc' (!~ '$chr') in $bamFile" if ("chr$pc" ne $chr); $ps--; } else { $ps = $s+1; close($PLUP); $PLUP = undef; } } while ($ps < $s); if (defined $pc && $ps == $s) { $bases =~ s/\^.//g; $bases =~ s/\$//g; # ignore begin/end-of-read markers while ($bases =~ /[-+](\d+)\w+/) { # ignore indels my $count = $1; $bases =~ s/[-+]$count\w{$count}//; } die "length of $bases (" . length($bases) . ") != $depth" if (length($bases) != $depth); $bases =~ tr/acgtn/ACGTN/; my @origBaseCounts = split(',', $baseCounts); $baseCounts = ""; foreach my $al (split("/", $alleles)) { my $alCt = ($bases =~ s/$al//g) + shift @origBaseCounts; $baseCounts .= ',' if ($baseCounts ne ""); $baseCounts .= $alCt; } #warn "Leftover bases: $bases ($alleles)" if (length($bases) > 10); # Sometimes the allele is given as homozygous but there are many other # copies of some other base detected...? 
And sometimes lots of "*" # characters, not described on http://samtools.sourceforge.net/pileup.shtml } # end if we found the pileup line for this pgSnp line } # end if there is a $bamFile for this template and chrom. print join("\t", $chr, $s, $e, $alleles, $aCount, $baseCounts, $quals); $prevChr = $chr; } '_EOF_' # << emacs chmod a+x addCounts.pl foreach f (NA*.pgSNP) set s = $f:r cat $f \ | ./addCounts.pl $s \ /hive/data/outside/1000genomes/ncbi/ftp-trace.ncbi.nih.gov/1000genomes/__S__/alignment/__S__.chrom__C__.SLX.maq.SRP000032.2009_07.bam \ | ./addCounts.pl $s \ /hive/data/outside/1000genomes/ncbi/ftp-trace.ncbi.nih.gov/1000genomes/__S__/alignment/__S__.chrom__C__.SOLID.corona.SRP000032.2009_08.bam \ | ./addCounts.pl $s \ /hive/data/outside/1000genomes/ncbi/ftp-trace.ncbi.nih.gov/1000genomes/__S__/alignment/__S__.chrom__C__.454.ssaha.SRP000032.2009_07.bam \ > $f.counts end # NA12878 and NA19240 have all 3 platforms; just SLX.maq for NA12891, NA12892 hgLoadBed hg18 pgNA12878 NA12878.pgSnp.counts \ -sqlTable=$HOME/kent/src/hg/lib/pgSnp.sql -renameSqlTable -tab #Loaded 3049749 elements of size 7 hgLoadBed hg18 pgNA12891 NA12891.pgSnp.counts \ -sqlTable=$HOME/kent/src/hg/lib/pgSnp.sql -renameSqlTable -tab #Loaded 2968312 elements of size 7 hgLoadBed hg18 pgNA12892 NA12892.pgSnp.counts \ -sqlTable=$HOME/kent/src/hg/lib/pgSnp.sql -renameSqlTable -tab #Loaded 2972120 elements of size 7 hgLoadBed hg18 pgNA19240 NA19240.pgSnp.counts \ -sqlTable=$HOME/kent/src/hg/lib/pgSnp.sql -renameSqlTable -tab #Loaded 3586490 elements of size 7 ############################################################################# # GENOME VARIANTS - (DONE 1/7/09 giardine, adapted by angie from pgSnp/README) # File pgVenter.bed placed in /hive/data/genomes/hg18/bed/pgSnp/ by # Belinda. cd /hive/data/genomes/hg18/bed/pgSnp/ grep "^chr" pgVenter.bed | sort -k1,1 -k2,2n \ | hgLoadBed hg18 pgVenter stdin \ -noSort -sqlTable=$HOME/kent/src/hg/lib/pgSnp.sql -renameSqlTable -tab # 3/11/09: fetching this file because I think it's the original data (angie) wget ftp://ftp.jcvi.org/pub/data/huref/HuRef.InternalHuRef-NCBI.gff ############################################################################# # GENOME VARIANTS - YRI NA18507 (DONE 1/9/07 giardine, adapted by angie from pgSnp/README) # SNP calls made by Aakrosh Ratan at PSU. # Files pgYri{2,3}.txt placed in /hive/data/genomes/hg18/bed/pgSnp/ by # Belinda. # yoruban snp calls (using solid software instead of maq) # Loaded 11/4/08 according to hg18.history, but table status says created # 1/7/09: cd /hive/data/genomes/hg18/bed/pgSnp/ grep "^chr" pgYri2.txt | sort -k1,1 -k2,2n \ | hgLoadBed hg18 pgYoruban2 stdin \ -noSort -sqlTable=$HOME/kent/src/hg/lib/pgSnp.sql -renameSqlTable -tab #Another yoruban SNP set, same individual, Solexa reads, includes indels # Loaded 11/7/08 according to hg18.history, but table status says created # 1/7/09: grep "^chr" pgYri3.txt | sort -k1,1 -k2,2n \ | hgLoadBed hg18 pgYoruban3 stdin \ -noSort -sqlTable=$HOME/kent/src/hg/lib/pgSnp.sql -renameSqlTable -tab ############################################################################# # GENOME VARIANTS - YH (DONE 2/24/09 giardine, adapted by angie from pgSnp/README) #Asian individual (YH1) from Nature paper #http://yh.genomics.org.cn/index.jsp # File pgSnpYh.txt placed in /hive/data/genomes/hg18/bed/pgSnp/ by # Belinda. 
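# (added orientation note, not from the original log)  The pgSnp files loaded
# throughout these GENOME VARIANTS sections are, roughly, 7-column BED-like
# rows: chrom, chromStart, chromEnd, observed alleles (e.g. "A/C"), allele
# count, comma-separated per-allele counts/frequencies, comma-separated
# per-allele quality scores.  A made-up example row:
#   chr1    10433   10434   A/C     2       12,7    90,85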
cd /hive/data/genomes/hg18/bed/pgSnp/ grep "^chr" pgSnpYh.txt | sort -k1,1 -k2,2n \ | hgLoadBed hg18 pgYh1 stdin \ -noSort -sqlTable=$HOME/kent/src/hg/lib/pgSnp.sql -renameSqlTable -tab # 3/11/09: fetching this file because I think it's the original data (angie) wget -O "yhsnp_add.gff" \ 'http://yh.genomics.org.cn/do.downServlet?file=data/snps/yhsnp_add.gff' ############################################################################# # GENOME VARIANTS - KOREF (DONE 9/17/09 angie) # Korean individual (Seong-Jin Kim) from Genome Research paper cd /hive/data/genomes/hg18/bed/pgSnp/ # Download Belinda's file from PSU, use same table name (pgSjk) as on # http://main.genome-browser.bx.psu.edu/ : wget http://www.bx.psu.edu/~giardine/tests/tmp/koref.sub.pgSnp hgLoadBed hg18 pgSjk koref.sub.pgSnp \ -sqlTable=$HOME/kent/src/hg/lib/pgSnp.sql -renameSqlTable -tab #Loaded 3439107 elements of size 7 # Downloading because I think it's the original data: wget ftp://ftp.kobic.kr/pub/KOBIC-KoreanGenome/genetic_variations/KOREF-solexa-snp-X30_Q40d4D100.gff ############################################################################# # Initial import of LSSNP data for SNP and hgGene linking (2009-02-02 markd) ############################################################################# # dump and load LSSNP databases from Johns Hopkins. This will be automated # soon. # download dump into tmp directory LSSNP; must load on bugle as the # database is mysql 5 ssh bugle hgsql -e 'create database LSSNP' cat LSSNP/*.sql |hgsql LSSNP hgsqlimport LSSNP `pwd`/LSSNP/*.txt ssh hgwdev hgLsSnpPdbLoad fetch bugle:LSSNP lsSnpPdb.tab hgLsSnpPdbLoad load hg18 lsSnpPdb lsSnpPdb.tab ############################################################################# ############################################################################# # HGDP GEOGRAPHIC SNP MAPS (DONE 2/5/09 angie - UPDATED 9/15/10) # Project data downloaded and parsed in /hive/data/outside/hgdpGeo, # see makeDb/doc/hgdpGeo.txt. mkdir /hive/data/genomes/hg18/bed/hgdpGeo cd /hive/data/genomes/hg18/bed/hgdpGeo # Make an rsId-sorted snp coords file for joining with the hgdpGeo data. grep -Fwf /hive/data/outside/hgdpGeo/rsIDs.lst \ ../snp129/snp129.bed \ | awk 'BEGIN{OFS="\t";} {print $4, $1, $2, $3;}' \ | sort > snp129Coords.txt wc -l snp129Coords.txt #660280 snp129Coords.txt # How many distinct SNPs in there? 
(compare to 657000 from HGDP): cut -f 1 snp129Coords.txt |uniq | wc -l #656496 # Join files to make a track table: join -e ERROR -t' ' -o 1.2,1.3,1.4,1.1,2.2,2.3,2.4 \ snp129Coords.txt /hive/data/outside/hgdpGeo/hgdpGeoCoordless.txt \ | sed -re 's/([AGTC])\*/\1/' \ | sort -k1,1 -k2n,2n \ > hgdpGeo.tab wc -l hgdpGeo.tab #660280 hgdpGeo.tab grep ERROR hgdpGeo.tab | wc -l #0 hgLoadBed hg18 hgdpGeo hgdpGeo.tab \ -sqlTable=$HOME/kent/src/hg/lib/hgdpGeo.sql #Loaded 660280 elements of size 7 # Correcting strand and remapping to snp130 9/15/10: mkdir /hive/data/genomes/hg18/bed/hgdpGeo/100915 cd /hive/data/genomes/hg18/bed/hgdpGeo/100915 grep -Fwf /hive/data/outside/hgdpGeo/rsIDs.lst \ ../../snp130/snp130.bed \ | awk 'BEGIN{OFS="\t";} {print $4, $1, $2, $3, $8;}' \ | sort > snp130CoordsAndRef.txt cut -f 1 snp130CoordsAndRef.txt | uniq | wc -l #656484 join -e ERROR -t' ' -o 1.2,1.3,1.4,1.1,2.2,2.3,2.4,1.5 \ snp130CoordsAndRef.txt /hive/data/outside/hgdpGeo/hgdpGeoCoordless.txt \ | sed -re 's/([AGTC])\*/\1/' \ | sort -k1,1 -k2n,2n \ > hgdpGeo.fixme wc -l hgdpGeo.fixme #660265 hgdpGeo.fixme # Use the snp130 reference allele to detect when we need to rev-comp # the alleles to match the + strand. Also, throw out SNPs for which # the ref allele is multi-base -- it's questionable whether we're giving # the right coords (some funny things happen with dbSNP's clustering...): cat > fixAlleles.pl <<'_EOF_' #!/usr/bin/env perl use warnings; use strict; my %rc = ('A' => 'T', 'C' => 'G', 'G' => 'C', 'T' => 'A'); while (<>) { chomp; my ($c, $s, $e, $rs, $ancAl, $derAl, $freqs, $ref) = split; next unless ($ref =~ /^[ACGT]$/); if ($ancAl ne $ref && $derAl ne $ref) { $ancAl = $rc{$ancAl}; $derAl = $rc{$derAl}; } print join("\t", $c, $s, $e, $rs, $ancAl, $derAl, $freqs) . "\n"; } '_EOF_' # << emacs chmod a+x fixAlleles.pl ./fixAlleles.pl hgdpGeo.fixme > hgdpGeo.tab wc -l hgdpGeo.tab #660221 hgdpGeo.tab hgLoadBed hg18 hgdpGeo hgdpGeo.tab \ -sqlTable=$HOME/kent/src/hg/lib/hgdpGeo.sql #Loaded 660219 elements of size 7 ############################################################################# # HGDP HETEROZYGOSITY (DONE 2/12/09 angie, except for Bantu 3/12/09) mkdir /hive/data/genomes/hg18/bed/hgdpHzy cd /hive/data/genomes/hg18/bed/hgdpHzy foreach continent (african americas easia european mideast oceania sasia) wget --timestamping http://hgdp.uchicago.edu/data/hzy/$continent.gff.gz end wget --timestamping http://hgdp.uchicago.edu/data/hzy/allbantu.hzy.gff.gz foreach continent (african allbantu americas easia european mideast oceania sasia) set bedGraph = `echo $continent \ | sed -re 's/can$/ca/; s/pean$/pe/; s/asia/Asia/; s/allbantu/bantu/; \ s/(.*)/hgdpHzy\u\1.bedGraph/'` echo $bedGraph zcat $continent.gff.gz \ | awk 'BEGIN{OFS="\t";} {print $1, ($4-1), $5, $6;}' \ > $bedGraph end # 3/12/09: All of the original files' coords were intervals between SNPs, # but the Bantu file had SNP coordinates, and one more line per chrom than # the others. So (after getting OK from Joe) I am going to transform the # Bantu SNP coords to intervals like the others. perl -we 'while (<>) { \ chomp; ($c, $s, undef, $h) = split; \ if (defined $lastC) { \ if ($lastC eq $c) { \ print "$c\t$lastS\t$s\t$lastH\n"; \ } # Discarding last SNP on each chrom \ } \ ($lastC, $lastS, $lastH) = ($c, $s, $h); \ }' \ hgdpHzyBantu.bedGraph > tmp mv tmp hgdpHzyBantu.bedGraph # Using bedGraph, not wig, because there are only 640k datapoints and # some are over the 10Mbase wiggle item size limit. 
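# (illustrative check, not in the original log)  To see how many items would
# have exceeded the wiggle item size limit:
awk '$3 - $2 > 10000000' hgdpHzy*.bedGraph | wc -l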
foreach f (*.bedGraph) hgLoadBed hg18 $f:r $f -bedGraph=4 end # All have same size: #Loaded 640676 elements of size 4 ############################################################################# # HGDP FST (DONE 2/12/09 angie) mkdir /hive/data/genomes/hg18/bed/hgdpFst cd /hive/data/genomes/hg18/bed/hgdpFst wget --timestamping \ http://hgdp.uchicago.edu/data/FST/autosomal_illuminasnps7_pval.gff.gz zcat autosomal_illuminasnps7_pval.gff.gz \ | awk 'BEGIN{OFS="\t";} {print $1, ($4-1), $5, $6;}' \ > hgdpFst.bedGraph hgLoadBed hg18 hgdpFst hgdpFst.bedGraph -bedGraph=4 #Loaded 640676 elements of size 4 ############################################################################# # HGDP IHS (DONE 2/13/09 angie) mkdir /hive/data/genomes/hg18/bed/hgdpIhs cd /hive/data/genomes/hg18/bed/hgdpIhs foreach continent (Bantu Americas E.Asia European MiddleEast Oceania S.Asian) wget --timestamping \ http://hgdp.uchicago.edu/data/iHS/smoothed$continent.iHS.gff.gz set bedGraph = `echo $continent \ | sed -re 's/pean$/pe/; s/\.Asian?/Asia/; \ s/MiddleEast/Mideast/; s/(.*)/hgdpIhs\1.bedGraph/'` echo $bedGraph zcat smoothed$continent.iHS.gff.gz \ | sed -e 's/^chr23/chrX/' \ | awk 'BEGIN{OFS="\t";} {print $1, ($4-1), $5, $6;}' \ > $bedGraph end foreach f (*.bedGraph) hgLoadBed hg18 $f:r $f -bedGraph=4 end #Reading hgdpIhsBantu.bedGraph #Loaded 540438 elements of size 4 #Reading hgdpIhsAmericas.bedGraph #Loaded 422167 elements of size 4 #Reading hgdpIhsEAsia.bedGraph #Loaded 487801 elements of size 4 #Reading hgdpIhsEurope.bedGraph #Loaded 543875 elements of size 4 #Reading hgdpIhsMideast.bedGraph #Loaded 552277 elements of size 4 #Reading hgdpIhsOceania.bedGraph #Loaded 425340 elements of size 4 #Reading hgdpIhsSAsia.bedGraph #Loaded 550231 elements of size 4 ############################################################################# # HGDP XP-EHH (DONE 2/12/09 angie) mkdir /hive/data/genomes/hg18/bed/hgdpXpehh cd /hive/data/genomes/hg18/bed/hgdpXpehh foreach continent (Bantu Americas E.Asia Europe Mideast Oceania S.Asia) wget --timestamping \ http://hgdp.uchicago.edu/data/XPEHH/$continent.xpehh.forbrowser.gff.gz set bedGraph = `echo $continent \ | sed -re 's/\.Asia?/Asia/; s/(.*)/hgdpXpehh\1.bedGraph/'` echo $bedGraph zcat $continent.xpehh.forbrowser.gff.gz \ | awk 'BEGIN{OFS="\t";} {print $1, ($4-1), $5, $6;}' \ > $bedGraph end foreach f (*.bedGraph) hgLoadBed hg18 $f:r $f -bedGraph=4 end #Reading hgdpXpehhBantu.bedGraph #Loaded 636680 elements of size 4 #Reading hgdpXpehhAmericas.bedGraph #Loaded 636143 elements of size 4 #Reading hgdpXpehhEAsia.bedGraph #Loaded 635799 elements of size 4 #Reading hgdpXpehhEurope.bedGraph #Loaded 636680 elements of size 4 #Reading hgdpXpehhMideast.bedGraph #Loaded 636849 elements of size 4 #Reading hgdpXpehhOceania.bedGraph #Loaded 637418 elements of size 4 #Reading hgdpXpehhSAsia.bedGraph #Loaded 636773 elements of size 4 ############################################################################# # LIFTOVER TO Hg19 (DONE - 2009-03-06 - Hiram ) mkdir /hive/data/genomes/hg18/bed/blat.hg19.2009-03-06 cd /hive/data/genomes/hg18/bed/blat.hg19.2009-03-06 # -debug run to create run dir, preview scripts... 
doSameSpeciesLiftOver.pl -debug hg18 hg19 # Real run: time nice -n +19 doSameSpeciesLiftOver.pl -verbose=2 \ -bigClusterHub=pk -dbHost=hgwdev -workhorse=hgwdev \ hg18 hg19 > do.log 2>&1 # real 85m8.064s ############################################################################# # HAPMAP REL22 RECOMBINATION RATES (PHASE II) (DONE 2/24/09 angie) mkdir -p /hive/data/outside/hapmap/recombination/2008-03_rel22_B36/rates cd /hive/data/outside/hapmap/recombination/2008-03_rel22_B36/ wget --timestamping \ ftp://ftp.hapmap.org/pub/hapmap/public/recombination/2008-03_rel22_B36/00README.txt cd rates wget --timestamping \ ftp://ftp.hapmap.org/pub/hapmap/public/recombination/2008-03_rel22_B36/rates/\* # Make bedGraph-formatted files. mkdir -p /hive/data/genomes/hg18/bed/hapmap/recombination/2008-03_rel22_B36 cd /hive/data/genomes/hg18/bed/hapmap/recombination/2008-03_rel22_B36 cp /dev/null hapmapRecombRate.bed foreach f (/hive/data/outside/hapmap/recombination/2008-03_rel22_B36/rates/*.txt) set chr = `echo $f:t:r | sed -e 's/^.*chr/chr/; s/_b36.*//;'` echo $f $chr perl -wpe 's/^position .*\n// && next; \ m/^(\d+) (\d+\.?\d*) .*/ || die $_; $end=$1; $rate=$2; \ $start=$end-100 unless (defined $start); \ $_ = "'$chr'\t$start\t$end\t$rate\n"; $start = $end;' \ $f >> hapmapRecombRate.bedGraph end # Some items are over the 10Mbase wiggle item size limit, so use bedGraph. time hgLoadBed hg18 hapmapRecombRate hapmapRecombRate.bedGraph -bedGraph=4 #Loaded 3281323 elements of size 4 #14.688u 1.796s 0:31.99 51.4% 0+0k 0+0io 0pf+0w # There are >3M items... try bigWig! :) wigToBigWig hapmapRecombRate.bedGraph /hive/data/genomes/hg18/chrom.sizes \ hapmapRecombRate.bw ln -s `pwd`/hapmapRecombRate.bw /gbdb/hg18/bbi/ hgsql hg18 -e 'drop table if exists hapmapRecombRateBW; \ create table hapmapRecombRateBW (fileName varchar(255) not null); \ insert into hapmapRecombRateBW values ("/gbdb/hg18/bbi/hapmapRecombRate.bw");' ############################################################################# # HAPMAP REL27 GENOTYPES (MERGED PHASE II+III) (DONE 2/25/09 angie) # First, download release to /hive/data/outside... mkdir -p /hive/data/outside/hapmap/genotypes/2009-02_phaseII+III/{excluded,forward} cd /hive/data/outside/hapmap/genotypes/2009-02_phaseII+III wget --timestamping \ ftp://ftp.hapmap.org/pub/hapmap/public/genotypes/2009-02_phaseII+III/00README.txt cd excluded wget --timestamping \ ftp://ftp.hapmap.org/pub/hapmap/public/genotypes/2009-02_phaseII+III/excluded/\* cd ../forward wget --timestamping \ ftp://ftp.hapmap.org/pub/hapmap/public/genotypes/2009-02_phaseII+III/forward/\* # This directory's README refers to the README from the # phaseIII-only 2009_01, which gives the file format and explains # the population codes: wget --timestamping -o 00README_2009-01_phaseIII.txt \ ftp://ftp.hapmap.org/pub/hapmap/public/genotypes/2009-01_phaseIII/00README.txt # For details page... this is Coriell's NHGRI panel (all HapMap except # CEPH): http://ccr.coriell.org/Sections/Collections/NHGRI/?SsId=11 # http://www.broad.mit.edu/mpg/hapmap3/ # Broad, BCM and Sanger have a nice phase3 writeup. Here is Broad's # copy: http://www.broad.mit.edu/mpg/hapmap3/ # Now translate those into hapmapSnps* tables. # NOTE FOR NEXT TIME: make this a cluster job. It takes ~half hour each pop! # Could run the script on each downloaded file as a separate job, and then # concatenate results (or just feed chr*_$pop to hgLoadBed). 
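# (hypothetical sketch of the per-file cluster approach suggested in the NOTE
# above; the wrapper script name oneHapmapPop.csh is made up -- it would just
# apply the perl conversion below to one downloaded file.  The serial run
# that was actually done follows.)
#   cd /hive/data/genomes/hg18/bed/hapmap/genotypes/2009-02_phaseII+III
#   cp /dev/null jobList
#   foreach f (/hive/data/outside/hapmap/genotypes/2009-02_phaseII+III/forward/genotypes_chr*_r27_nr.b36_fwd.txt.gz)
#     echo "./oneHapmapPop.csh $f {check out exists out/$f:t:r:r.bed}" >> jobList
#   end
#   para make jobList
#   # then concatenate the out/ chunks per population into hapmapSnps$pop.bed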
mkdir -p /hive/data/genomes/hg18/bed/hapmap/genotypes/2009-02_phaseII+III cd /hive/data/genomes/hg18/bed/hapmap/genotypes/2009-02_phaseII+III set sourceDir = /hive/data/outside/hapmap/genotypes/2009-02_phaseII+III/forward foreach pop (ASW CEU CHB CHD GIH JPT LWK MEX MKK TSI YRI) echo $pop zcat $sourceDir/genotypes_chr*_${pop}_r27_nr.b36_fwd.txt.gz \ | perl -wpe 'chomp; \ if (/^rs# alleles c\w+ pos s\w+ a\w+# c\w+ protLSID assayLSID panelLSID QCcode NA/) { \ $_ = ""; # skip header lines \ } elsif (s/^(rs\d+) ([ACGT])\/([ACGT]) (chr\w+) (\d+) \+ ncbi_[bB]?36 .* QC\+ //) { \ ($rsId, $obs1, $obs2, $chr, $end) = ($1, $2, $3, $4, $5); \ %compl = (A=>"T", C=>"G", G=>"C", T=>"A"); \ %hom = (); %het = (); \ # NOTE: one trouble-maker (other pop files have A/C with AC genotypes): \ if ($rsId eq "rs7059622" && "'$pop'" eq "YRI") { warn "Tweaking YRI rs7059622.\n"; } \ foreach my $al (split()) { \ next if ($al eq "NN"); \ $al =~ /^([ACGT])([ACGT])$/ || die "Unrecognized allele string $al"; \ ($a1, $a2) = ($1, $2); \ # NOTE: one trouble-maker (other pop files have A/C with AC genotypes): \ if ($rsId eq "rs7059622" && "'$pop'" eq "YRI") \ { $a1 = $compl{$a1}; $a2 = $compl{$a2}; } \ # The error that the trouble-maker triggered: \ if (($a1 !~ /^[$obs1$obs2]$/) || ($a2 !~ /^[$obs1$obs2]$/)) \ { die "$rsId (${chr}_'$pop'): obs $obs1/$obs2 !~ $a1$a2!\n\t"; } \ if ($a1 eq $a2) { $hom{$a1}++; } else { $het{$a1}++; $het{$a2}++; } \ } \ $start = $end - 1; \ $hom1 = $hom{$obs1} || 0; $hom2 = $hom{$obs2} || 0; \ $het = $het{$obs1} || 0; $het2 = $het{$obs2} || 0; \ $score = (1000 * (2*$hom2 + $het) / (2*($hom1 + $hom2 + $het))); \ if ($score >= 500) { $score = 1000 - $score; } \ $score = int($score + 0.5); \ if ($het != $het2) { die "het{$obs1} ($het{$obs1}) != het{$obs2} ($het{$obs2})"; } \ $_ = "$chr\t$start\t$end\t$rsId\t$score\t+\t$obs1/$obs2\t$obs1\t$hom1\t$obs2\t$hom2\t$het\n"; \ } else { \ die "Unrecognized format:\n$_\n\t"; \ }' > hapmapSnps$pop.bed end wc -l hapmapSnps*.bed # 1561453 hapmapSnpsASW.bed # 4030774 hapmapSnpsCEU.bed # 4052336 hapmapSnpsCHB.bed # 1306196 hapmapSnpsCHD.bed # 1407877 hapmapSnpsGIH.bed # 4052423 hapmapSnpsJPT.bed # 1529764 hapmapSnpsLWK.bed # 1410265 hapmapSnpsMEX.bed # 1537638 hapmapSnpsMKK.bed # 1419921 hapmapSnpsTSI.bed # 3984356 hapmapSnpsYRI.bed foreach pop (ASW CEU CHB CHD GIH JPT LWK MEX MKK TSI YRI) hgLoadBed hg18 hapmapSnps$pop hapmapSnps$pop.bed -renameSqlTable \ -sqlTable=$HOME/kent/src/hg/lib/hapmapSnps.sql end #Reading hapmapSnpsASW.bed #Loaded 1561453 elements of size 12 #Reading hapmapSnpsCEU.bed #Loaded 4030774 elements of size 12 #Reading hapmapSnpsCHB.bed #Loaded 4052336 elements of size 12 #Reading hapmapSnpsCHD.bed #Loaded 1306196 elements of size 12 #Reading hapmapSnpsGIH.bed #Loaded 1407877 elements of size 12 #Reading hapmapSnpsJPT.bed #Loaded 4052423 elements of size 12 #Reading hapmapSnpsLWK.bed #Loaded 1529764 elements of size 12 #Reading hapmapSnpsMEX.bed #Loaded 1410265 elements of size 12 #Reading hapmapSnpsMKK.bed #Loaded 1537638 elements of size 12 #Reading hapmapSnpsTSI.bed #Loaded 1419921 elements of size 12 #Reading hapmapSnpsYRI.bed #Loaded 3984356 elements of size 12 rm bed.tab; nice gzip *.bed ############################################################################# # HAPMAP REL27 ORTHOLOGOUS ALLELES (DONE 3/4/09 angie) # Similar procedure to snp129Ortho, but we make one table per species # because they are independent subtracks of HapMap SNPs. 
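# (added illustration, not from the original log)  The awk step below packs
# the human-side fields into the BED name so they ride through liftOver
# untouched; e.g. a (made-up) hapmapSnps row
#   chr1  10433  10434  rs12345  500  +  A/C  A  30  C  25  10
# becomes the liftOver input row
#   chr1  10433  10434  rs12345|chr1|10433|10434|A/C|+  0  +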
cd /hive/data/genomes/hg18/bed/hapmap/genotypes/2009-02_phaseII+III # Glom all human info that we need for the final table onto the # name, to sneak it through liftOver: rsId|chr|start|end|obs|strand awk 'BEGIN{OFS="\t";} \ {print $1, $2, $3, \ $4 "|" $1 "|" $2 "|" $3 "|" $7 "|" $6, \ 0, $6;}' \ hapmapSnps???.bed \ | sort -u -k1,1 -k2n,2n \ > hapmapSnpsForLiftOver.bed wc -l hapmapSnpsForLiftOver.bed #4165831 hapmapSnpsCombined.bed # Orthologous allele locations: mkdir run.liftOChimp cd run.liftOChimp mkdir split out splitFile ../hapmapSnpsForLiftOver.bed 25000 split/chunk cp /dev/null jobList foreach f (split/chunk*) echo liftOver $f \ /hive/data/genomes/hg18/bed/liftOver/hg18ToPanTro2.over.chain.gz \ \{check out exists out/panTro2.$f:t.bed\} out/hg18.$f:t.unmapped \ >> jobList end ssh pk cd /hive/data/genomes/hg18/bed/hapmap/genotypes/2009-02_phaseII+III/run.liftOChimp para make jobList #Completed: 167 of 167 jobs #CPU time in finished jobs: 31364s 522.74m 8.71h 0.36d 0.001 y #IO & Wait Time: 800s 13.33m 0.22h 0.01d 0.000 y #Average job time: 193s 3.21m 0.05h 0.00d #Longest finished job: 431s 7.18m 0.12h 0.00d #Submission to last job: 442s 7.37m 0.12h 0.01d mkdir ../run.liftOMac cd ../run.liftOMac mkdir out ln -s ../run.liftOChimp/split . cp /dev/null jobList foreach f (split/chunk*) echo liftOver $f \ /hive/data/genomes/hg18/bed/liftOver/hg18ToRheMac2.over.chain.gz \ \{check out exists out/rheMac2.$f:t.bed\} out/hg18.$f:t.unmapped \ >> jobList end para make jobList #Completed: 167 of 167 jobs #CPU time in finished jobs: 2482s 41.36m 0.69h 0.03d 0.000 y #IO & Wait Time: 1361s 22.69m 0.38h 0.02d 0.000 y #Average job time: 23s 0.38m 0.01h 0.00d #Longest finished job: 33s 0.55m 0.01h 0.00d #Submission to last job: 97s 1.62m 0.03h 0.00d # Concatenate the liftOver results, sorting by ortho pos in order to # efficiently access 2bit sequence in getOrthoSeq. The output of # that is swizzled so that a glom of ortho coords is the first column, # and then we sort by that for joining with base quality info. # Ditto for macaque. ~5 minutes per species: cd /hive/data/genomes/hg18/bed/hapmap/genotypes/2009-02_phaseII+III sort -k1,1 -k2n,2n run.liftOChimp/out/panTro2.chunk*.bed \ | ~/kent/src/hg/snp/snpLoad/getOrthoSeq.pl /hive/data/genomes/panTro2/panTro2.2bit \ | awk 'BEGIN{OFS="\t";} {print $2 ":" $3 ":" $4, $5, $6, $1;}' \ | sort > panTro2.orthoGlom.txt sort -k1,1 -k2n,2n run.liftOMac/out/rheMac2.chunk*.bed \ | ~/kent/src/hg/snp/snpLoad/getOrthoSeq.pl /hive/data/genomes/rheMac2/rheMac2.2bit \ | awk 'BEGIN{OFS="\t";} {print $2 ":" $3 ":" $4, $5, $6, $1;}' \ | sort > rheMac2.orthoGlom.txt wc -l panTro2.orthoGlom.txt rheMac2.orthoGlom.txt # 4057739 panTro2.orthoGlom.txt # 3750076 rheMac2.orthoGlom.txt # Get base qualities -- ~12-16min per species. 
cut -f 1 panTro2.orthoGlom.txt | sed -e 's/:/\t/g' \ | hgWiggle -db=panTro2 -lift=1 -doAscii -bedFile=stdin quality \ | varStepToBedGraph.pl stdin \ | awk 'BEGIN{OFS="\t";} {print $1 ":" $2 ":" $3, int($4+0.5);}' \ | sort > panTro2.baseQuals.txt #Processed 4003968 lines input, 4003685 data lines, 47 variable step declarations cut -f 1 rheMac2.orthoGlom.txt | sed -e 's/:/\t/g' \ | hgWiggle -db=rheMac2 -lift=1 -doAscii -bedFile=stdin quality \ | varStepToBedGraph.pl stdin \ | awk 'BEGIN{OFS="\t";} {print $1 ":" $2 ":" $3, int($4+0.5);}' \ | sort > rheMac2.baseQuals.txt #Processed 3749772 lines input, 3749645 data lines, 21 variable step declarations # Join the allele-glom with the base qual-glom and swizzle columns into # the right order for a hapmapAllelesOrtho table. join -a 1 -e 0 panTro2.orthoGlom.txt panTro2.baseQuals.txt \ | perl -wpe 'chomp; ($oG, $oA, $oStr, $hG, $bQ) = split; \ ($oC, $oS, $oE) = split(":", $oG); \ ($rs, $hC, $hS, $hE, $hO, $hStr) = split(/\|/, $hG); \ unless (defined $bQ) { \ if ($oC =~ /^chr(21|Y|Y_random)$/) { $bQ = 98; } # per panTro2 quality track desc \ elsif ($oC eq "chrM") { $bQ = 0; } \ else { die "missing qual for $oC: $_\n\t"; } } \ $_ = "$hC\t$hS\t$hE\t$rs\t$bQ\t$hStr\t\t$hO\t$oC\t$oS\t$oE\t$oStr\t$oA\n";' \ | sort -k1,1 -k2n,2n \ > hapmapAllelesChimp.bed wc -l hapmapAllelesChimp.bed #4057739 hapmapAllelesChimp.bed join -a 1 -e 0 rheMac2.orthoGlom.txt rheMac2.baseQuals.txt \ | perl -wpe 'chomp; ($oG, $oA, $oStr, $hG, $bQ) = split; \ ($oC, $oS, $oE) = split(":", $oG); \ ($rs, $hC, $hS, $hE, $hO, $hStr) = split(/\|/, $hG); \ unless (defined $bQ) { die "missing qual for $oC: $_\n\t"; } \ $_ = "$hC\t$hS\t$hE\t$rs\t$bQ\t$hStr\t\t$hO\t$oC\t$oS\t$oE\t$oStr\t$oA\n";' \ | sort -k1,1 -k2n,2n \ > hapmapAllelesMacaque.bed wc -l hapmapAllelesMacaque.bed #3750076 hapmapAllelesMacaque.bed # Load tables. cd /hive/data/genomes/hg18/bed/hapmap/genotypes/2009-02_phaseII+III hgLoadBed hg18 hapmapAllelesChimp hapmapAllelesChimp.bed \ -tab -renameSqlTable -sqlTable=$HOME/kent/src/hg/lib/hapmapAllelesOrtho.sql #Loaded 4057739 elements of size 13 hgLoadBed hg18 hapmapAllelesMacaque hapmapAllelesMacaque.bed \ -tab -renameSqlTable -sqlTable=$HOME/kent/src/hg/lib/hapmapAllelesOrtho.sql ############################################################################# # HAPMAP REL27 SUMMARY FOR HGTRACKS FILTERING (DONE 3/5/09 angie) cd /hive/data/genomes/hg18/bed/hapmap/genotypes/2009-02_phaseII+III time hapmapPhaseIIISummary . #115.244u 5.009s 2:10.08 92.4% 0+0k 0+0io 2pf+0w time hgLoadBed hg18 hapmapPhaseIIISummary hapmapPhaseIIISummary.bed \ -tab -renameSqlTable -sqlTable=$HOME/kent/src/hg/lib/hapmapPhaseIIISummary.sql #Loaded 4166007 elements of size 18 #33.401u 3.275s 1:46.95 34.2% 0+0k 0+0io 0pf+0w ############################################################################# # DOWNLOAD HAPMAP PHASED GENOTYPES (PHASE III) (DONE 2/23/09 angie) mkdir -p /hive/data/outside/hapmap/phasing/2009-02_phaseIII/HapMap3_r2 cd /hive/data/outside/hapmap/phasing/2009-02_phaseIII/HapMap3_r2 wget --timestamping \ ftp://ftp.hapmap.org/pub/hapmap/public/phasing/2009-02_phaseIII/HapMap3_r2/\* foreach pop (ASW CEU CHD GIH JPT+CHB LWK MEX MKK TSI YRI) foreach type (DUOS TRIOS UNRELATED) mkdir -p $pop/$type pushd $pop/$type wget --timestamping \ ftp://ftp.hapmap.org/pub/hapmap/public/phasing/2009-02_phaseIII/HapMap3_r2/$pop/$type/\* popd end end # Looks like phased genotypes are given only for the populations with # family structure: ASW, CEU, MEX, MKK, and YRI. # Next: use these data to make LD tracks. 
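# (hypothetical check, file naming assumed)  One quick way to confirm which
# populations actually shipped phased genotype files, supporting the
# observation above:
#   cd /hive/data/outside/hapmap/phasing/2009-02_phaseIII/HapMap3_r2
#   find . -name '*phased*' | awk -F/ '{print $2}' | sort | uniq -c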
############################################################################# # HAPMAP LD COMPUTED ON PHASED & UNPHASED GENOTYPES (TODO angie) ############################################################################# # GERP Conservation scoring and elements for Ensembl 31-way alignments # From Javier Guerroro # ENCODE-related data (equested by Margulies, for use by ENCODE analysis group) # (2009-03-05 kate) ssh hgwdev cd /cluster/data/hg18/bed mkdir -p ensembl31wayGerp/lab cd ensembl31wayGerp/lab wget -r ftp://ftp.ebi.ac.uk/pub/databases/ensembl/encode/31way_msa/ cd .. bzcat lab/31way_gerp_elements.bed.bz2 | \ tail -n +2 | \ sed 's/31way_gerp_elem_365000000/gerp31./' | \ hgLoadBed hg18 ensembl31wayGerpElements stdin \ -sqlTable=$HOME/kent/src/hg/lib/encode/broadPeak.sql -renameSqlTable # Loaded 1464897 elements of size 9 cat > we.csh << 'EOF' foreach f (lab/*.wig.bz2) echo $f bzcat $f | tail -n +2 | wigEncode stdin temp.wig temp.wib end 'EOF' # << emacs bzcat lab/*.wig.bz2 | tail -n +2 | \ wigEncode stdin ensembl31wayGerpScores.wig ensembl31wayGerpScores.wib # load database mkdir /gbdb/hg18/wib ln -s `pwd`/ensembl31wayGerpScores.wib /gbdb/hg18/wib hgLoadWiggle -pathPrefix=/gbdb/hg18/wib hg18 ensembl31wayGerpScores ensembl31wayGerpScores.wig ############################################################################ # VEGA GENES UPDATE (BUILD 33) (DONE 2008-03-11 Andy) mkdir /cluster/data/hg18/bed/vega33 cd /cluster/data/hg18/bed/vega33 wget --timestamping "ftp://ftp.sanger.ac.uk/pub/vega/human/*" \ "ftp://ftp.sanger.ac.uk/pub/vega/human/pep/*.tot.fa.gz" zcat gtf_file.gz | sed -e "s/^\([0-9XY][0-9]*\)/chr\1/; s/^MT/chrM/; s/^Un/chrUn/" \ | grep "^chr" > nonHaps.gtf zcat gtf_file.gz | sed -e "s/^\([0-9XY][0-9]*\)/chr\1/; s/^MT/chrM/; s/^Un/chrUn/" \ | grep -v "^chr" > haps.gtf awk 'BEGIN{OFS="\t";FS="\t";}{ if ($1 == "c6_COX") { if (($4 >= 28688544) && ($5 <= 33420241)) print; } else if ($1 == "c6_QBL") { if (($4 >= 28885510) && ($5 <= 33451440)) print;}}' haps.gtf > keeptHaps.gtf liftUp -type=.gtf lifted.gtf /cluster/data/hg18/jkStuff/ensGene.haplotype.lift carry keeptHaps.gtf cat nonHaps.gtf lifted.gtf > all.gtf gzip all.gtf rm *.gtf gtfToGenePred -infoOut=infoOut.txt -genePredExt all.gtf.gz stdout | gzip > all.gp.gz /cluster/home/hiram/kent/src/hg/utils/automation/extractGtf.pl infoOut.txt > ensGtp.tab genePredCheck -db=hg18 all.gp.gz #checked: 69859 failed: 0 zcat all.gtf.gz | grep -i pseudo > pseudo.gtf zcat all.gtf.gz | grep -v -i pseudo > not.pseudo.gtf gtfToGenePred -genePredExt pseudo.gtf pseudo.gp gtfToGenePred -genePredExt not.pseudo.gtf not.pseudo.gp genePredCheck -db=hg18 pseudo.gp #checked: 6901 failed: 0 genePredCheck -db=hg18 not.pseudo.gp #checked: 62958 failed: 0 hgLoadGenePred -genePredExt hg18 vegaGene not.pseudo.gp hgLoadGenePred -genePredExt hg18 vegaPseudoGene pseudo.gp ############################################################################# # COVERAGE FOR 1000 GENOMES HIGH-COV INDIVIDS (IN PROGRESS 6/10/09 angie) #TODO: try again now that wigToBigWig is more mem-efficient # also, new alignments have probably become available since then. # wigBedToStep ran out of memory on hgwdev (w/limit of 32G)... 
roll own: cd /hive/data/outside/1000genomes/ncbi/ftp-trace.ncbi.nih.gov/1000genomes foreach s (NA12878 NA12891 NA12892 NA19238 NA19239 NA19240) pushd data/$s/alignment foreach p (454 SLX) echo "==== $s $p ====" ls -1 $s.chrom*.$p.SRP000032.2009_04.bam \ | grep -v chromMT \ | xargs -L 1 samtools pileup \ | perl -pe '($c, $start, undef, $depth) = split; \ if ($c ne $lastC || $start != $lastStart+1) { \ print "fixedStep chrom=chr$c start=$start step=1 span=1\n"; \ } \ $_ = "$depth\n"; \ ($lastC, $lastStart) = ($c, $start);' \ | gzip -c > cov${s}By{$p}.fixedStep.gz echo "" end popd end #TODO # Killing memory -- run separately: | wigToBigWig -clip stdin /hive/data/genomes/hg18/chrom.sizes cov${s}By$p.bw #[bam_pileup] fail to read the header of NA12878.chromY.454.SRP000032.2009_04.bam: non-exisiting file or wrong format. # NA12878.chromY.454.SRP000032.2009_04.bam is an empty file. # Load tables foreach bw (`find /hive/data/outside/1000genomes/ncbi/ftp-trace.ncbi.nih.gov/1000genomes \ -name cov\*.bw`) ln -s $bw /gbdb/hg18/bbi/ hgsql hg18 -e "drop table if exists $bw:t:r; \ create table $bw:t:r (fileName varchar(255) not null); \ insert into $bw:t:r values ('/gbdb/hg18/bbi/$bw:t');" end ############################################################################# # 1000 GENOMES HIGH-COV INDIVIDS READ ALIGNMENTS (DONE 11/30/09 angie) # one-off to test BAM as track type: cd /hive/data/outside/1000genomes/ncbi/ftp-trace.ncbi.nih.gov/1000genomes set testBam = NA12878/alignment/NA12878.chrom22.SRP000032.2009_02.bam ln -s `pwd`/$testBam{,.bai} \ /gbdb/hg18/bbi/ hgsql hg18 -e "drop table if exists bamNA12878; \ create table bamNA12878 (fileName varchar(255) not null); \ insert into bamNA12878 values ('/gbdb/hg18/bbi/$testBam:t');" # 9/14/09: update bamNA12878 to use new seqName column and try samtools' # capability to fetch ftp sparsely: hgsql hg18 -e "drop table if exists bamNA12878; \ create table bamNA12878 (fileName varchar(255) not null, \ seqName varchar(255) not null); \ insert into bamNA12878 values ('ftp://ftp-trace.ncbi.nih.gov/1000genomes/ftp/pilot_data/data/NA12878/alignment/NA12878.chrom21.SLX.maq.SRP000032.2009_07.bam', '21'); \ insert into bamNA12878 values ('/gbdb/hg18/bbi/NA12878.chrom22.SLX.SRP000032.2009_04.bam', '22');" # 11/30/09: Add more remote files: foreach c (1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 X Y) hgsql hg18 -e "insert into bamNA12878 values ('ftp://ftp-trace.ncbi.nih.gov/1000genomes/ftp/pilot_data/data/NA12878/alignment/NA12878.chrom$c.SLX.maq.SRP000032.2009_07.bam', '$c');" end # Add an all-remote NA12891 for testing composite track: hgsql hg18 -e "create table bamNA12891 (fileName varchar(255) not null, \ seqName varchar(255) not null);" foreach c (1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 X Y) hgsql hg18 -e "insert into bamNA12891 values ('ftp://ftp-trace.ncbi.nih.gov/1000genomes/ftp/pilot_data/data/NA12891/alignment/NA12891.chrom$c.SLX.maq.SRP000032.2009_07.bam', '$c');" end ############################################################################## # UCSC to Ensembl chr name mapping (DONE - 2009-05-08 - Hiram) mkdir /hive/data/genomes/hg18/bed/ucscToEnsembl cd /hive/data/genomes/hg18/bed/ucscToEnsembl awk '{printf "%s\t%s\n", $4, $2}' ../../jkStuff/ensGene.haplotype.lift \ > ucscToEnsembl.tab cat << '_EOF_' > ucscToEnsembl.sql # UCSC to Ensembl chr name translation CREATE TABLE ucscToEnsembl ( ucsc varchar(255) not null, # UCSC chromosome name ensembl varchar(255) not null, # Ensembl chromosome name #Indices PRIMARY KEY(ucsc(21)) ); 
'_EOF_' hgsql hg18 < ucscToEnsembl.sql hgsql hg18 \ -e 'LOAD DATA LOCAL INFILE "ucscToEnsembl.tab" INTO TABLE ucscToEnsembl' awk '{printf "%s\t%d\n", $2, -$1}' ../../jkStuff/ensGene.haplotype.lift \ > ensemblLift.tab cat << '_EOF_' > ensemblLift.sql # UCSC offset to Ensembl coordinates CREATE TABLE ensemblLift ( chrom varchar(255) not null, # Ensembl chromosome name offset int unsigned not null, # offset to add to UCSC position #Indices PRIMARY KEY(chrom(6)) ); '_EOF_' hgsql hg18 < ensemblLift.sql hgsql hg18 \ -e 'LOAD DATA LOCAL INFILE "ensemblLift.tab" INTO TABLE ensemblLift' ############################################################################## # FOX2 CLUSTERS (DONE 2009-04-08, Andy) cp cluster.combine.bed /hive/data/genomes/hg18/bed/fox2ClipSeq ## (got the data as an attachment from Gene Yeo) cd /hive/data/genomes/hg18/bed/fox2ClipSeq grep chr cluster.combine.bed | cut -f1-4 | \ bedSort stdin fox2ClipClusters.hg17.bed liftOver fox2ClipClusters.hg17.bed \ /gbdb/hg17/liftOver/hg17ToHg18.over.chain.gz \ fox2ClipClusters.bed unmapped.bed hgLoadBed hg18 fox2ClipClusters{,.bed} ############################################################################## # RE-BUILD sno/miRNA TRACK (DONE, 2009-06-11 - 2009-06-13, hartera) # The data in this track is out of date so update the track. mkdir -p /hive/data/genomes/hg18/bed/wgRna-2009-06-11 cd /hive/data/genomes/hg18/bed/wgRna-2009-06-11 # Download GFF file of latest miRNA annotations from miRBase at the # Wellcome Trust Sanger Institute (WTSI). This is Release 13.0 (March # 2009) wget --timestamping \ ftp://ftp.sanger.ac.uk/pub/mirbase/sequences/CURRENT/genomes/hsa.gff # Re-format, need to add "chr" to the beginning of each line. sed -e 's/^/chr/' hsa.gff > hsMirBaseFormat.gff # Remove extra "chr" in comment lines perl -pi.bak -e 's/chr#/#/' hsMirBaseFormat.gff # Change chrMT to chrM perl -pi.bak -e 's/chrMT/chrM/' hsMirBaseFormat.gff # Remove all but ID name in last field sed -e 's/\";//g' hsMirBaseFormat.gff | sed -e 's/ID=\"//g' \ | sed -e 's/ACC=\"MI[0-9]*\s//' > hsMirBaseFormatIdOnly.gff # use score 960 for + strand and 480 for - strand. This will show # up black on the track for + strand and grey for - strand. # Starts appear to be 1-based when compared to miRNAs in current track # and those in Ensembl. # Confirmed with Sam Griffith-Jones (one of the authors of miRBase, # sam.griffith-jones@manchester.ac.uk) that these GFF coordinates # are 1-based. # Also add thickStart and thickEnd columns and "miRNA" for type. awk 'BEGIN {FS="\t"} {OFS="\t"} \ {if ($0 !~ /#/ && $7 == "+") \ print $1, $4-1, $5, $9, 960, $7, 0, 0, "miRNA"; \ else if ($0 !~ /#/ && $7 == "-") \ print $1, $4-1, $5, $9, 480, $7, 0, 0, "miRNA";}' \ hsMirBaseFormatIdOnly.gff > hsMirBaseFormatIdOnly.bed # 2009-06-12 # snoRNAs are from snoRNABase at http://www-snorna.biotoul.fr/ # Download coordinates for hg18 from # http://www-snorna.biotoul.fr/coordinates.php # This is version 3 of the database. # save as tab-separated file: snoRNABaseVersion3Coords.txt and remove # first and last lines. perl -pi.bak -e 's/\"//g' snoRNABaseVersion3Coords.txt # Reformat to BED format with thickStart and thickEnd set to 0.
awk 'BEGIN {FS="\t"} {OFS="\t"} \ {if ($4 == "+") \ print $1, $2-1, $3, $5, 960, $4, 0, 0,$6; \ else if ($4 == "-") \ print $1, $2-1, $3, $5, 480, $4, 0, 0,$6;}' \ snoRNABaseVersion3Coords.txt > snoRNABaseVersion3Coords.bed # Merge the miRNA and snoRNA files together cat hsMirBaseFormatIdOnly.bed snoRNABaseVersion3Coords.bed \ > wgRna20090611.bed # Load into separate table rather than overwriting wgRna cp -p /cluster/home/hartera/src/hg/lib/wgRna.sql wgRnaJun09.sql perl -pi.bak -e 's/TABLE wgRna/TABLE wgRnaJun09/' wgRnaJun09.sql hgLoadBed -sqlTable=wgRnaJun09.sql hg18 wgRnaJun09 wgRna20090611.bed # Reading wgRna20090611.bed # Loaded 1120 elements of size 9 # Sorted # Creating table definition for wgRnaJun09 # Saving bed.tab # Loading hg18 # Clean up rm *.bak hgsql -e 'select count(*) from wgRna;' hg18 # 1059 # for miRNAs: 685 (676 unique names) # and others: 374 including 21 scaRNA hgsql -e 'select count(*) from wgRnaJun09;' hg18 # 1120 # for miRNAs: 718 (705 unique) # and others: 402 including 21 scaRNA # 2009-06-13 # Renamed the old wgRna track to wgRnaOld and renamed the new wgRnaJun09 # track to wgRna. Will keep the old track around for a while until # new track checked and QA'd. hgsql -e 'alter table wgRna rename wgRnaOld;' hg18 hgsql -e 'alter table wgRnaJun09 rename wgRna;' hg18 ################## ## Uniqueness Track: Step one (courtesy of John Castle, Rosetta) ## Make oligos of length XX # Perl one-liner to make a batch file # I've included the perl files CNV_makereads2.pl (simply uses substr on a chromosome) and fastagrep.pl (to remove sequences with Ns # The files chr$x.fa are the individual chromosomes perl -e 'for ($i = 1;$i<= 25; $i++) {$x = $i; if ($i == 23) {$x = 'X';} if ($i == 24) {$x = 'Y';} if ($i == 25) {$x = 'M';} print "~/DTcode/CNV_makereads2.pl 100 /info/genome/Projects/721/ref/chr$x.fa | fastagrep.pl -v n > chr$x.fa\n";}' > batch_chr_get #!/usr/bin/perl -w #--------------------------------------------------------------------- # C O P Y R I G H T N O T I C E #--------------------------------------------------------------------- # Copyright (c) 2001 Rosetta Inpharmatics, Inc. # 12040 115th Avenue NE, Kirkland, WA 98034-6900 # All Rights Reserved. Reproduction, adaptation, or # translation without prior written permission of # Rosetta Inpharmatics, Inc. is prohibited. #--------------------------------------------------------------------- # CNV_makereads.pl # $Id: hg18.txt,v 1.422 2010/06/02 23:00:02 angie Exp $ #use lib ('/home/castlej/perl/','/home/castlej/OSDTools/','/home/castlej/DTcode/'); #use strict; my $oligo_length = $ARGV[0]; my $file = $ARGV[1]; open(IN,$file); $/ = "\n>";# change input line separator to '>' to suck up FASTA sequences while ($line= ) { $line =~ s/^>//m; # remove '>' from end of $line $line =~ s/>$//m; # remove Unigene lines starting with '#' $line =~ s/\n\#.*$//m; # get sequence id $line =~ /^\s*(\S+).*([^\0]*)/; $id = $1; $seq = $2; $seq =~ s/\n//g; } if ($id =~ /(chr\S+)\.nib/) { $chr = $1; } elsif ($id =~ /(chr\S+)/) { $chr = $1; } for ($i = 0; $i $chr:$i-$j\n$a\n"; } #!/usr/bin/perl -w #--------------------------------------------------------------------- # C O P Y R I G H T N O T I C E #--------------------------------------------------------------------- # Copyright (c) 2000,2001,2002 Rosetta Inpharmatics, Inc. # 12040 115th Avenue NE, Kirkland, WA 98034-6900 # All Rights Reserved. Reproduction, adaptation, or # translation without prior written permission of # Rosetta Inpharmatics, Inc. is prohibited. 
#--------------------------------------------------------------------- # # $Id: hg18.txt,v 1.422 2010/06/02 23:00:02 angie Exp $ # # finds selected sequences in FASTA by regex matching in defline or sequence use strict; my( $option, $regex, @regexes, %tofind, $exceptflag, $key, $value, $line, ); $exceptflag = 0; unless (scalar(@ARGV)) { print "\nUsage: $0 [OPTION] PATTERN [FASTAFILE]\n"; print "$0 finds sequences by pattern matching in FASTA format data\n\n"; exit; } while ((scalar(@ARGV)) && ($ARGV[0] =~ /^-(\w+)/)) { $option = $1; shift(@ARGV); if ($option =~ /v/) { # user wants sequences NOT matching regex(es) $exceptflag = 1; } if ($option =~ /s/) { # regex on command line push(@regexes, shift(@ARGV)); } if ($option =~ /f/) { # user wants list of regexes from file open(INHANDLE, "<$ARGV[0]") || die "$0: error, can't open regex list file $ARGV[0]\n"; while (defined($regex = )) { chomp $regex; push(@regexes, $regex); } shift(@ARGV); } } if (scalar(@regexes) < 1) { push(@regexes, shift(@ARGV)); } $/ = "\n>"; # change input line separator to suck up FASTA sequences SEQUENCE: while (defined($line = <>)) { # remove '>' from start of first $line $line =~ s/^>//m; # stick '>' back on all $lines $line = '>'.$line; # remove '>' from end of $line $line =~ s/>$//m; # remove Unigene lines starting with '#' $line =~ s/\n\#.*$//m; foreach $regex (@regexes) { if ($line =~ /$regex/) { unless ($exceptflag) { print $line; } next SEQUENCE; } } if ($exceptflag) { print $line; } } # Submit batch file to cluster (we use LSF), each line is a submission perl -ne 'chomp; $a = "bsub -q short64 \"$_\"\n"; system($a);' batch_chr_get #################### # Uniqueness Step two # I've used an older version of BWA. The newer version from sourceforge outputs a binary file which then must be converted to a text file # HG18 is the human genome # I could include banything_2GBNew.pl but it is simply a cluster "chunk and submit" code # Method 1 perl -e 'for ($i =1;$i<= 25; $i++) {$x = $i; if ($i == 23) {$x = 'X';} elsif ($i == 24) {$x = 'Y';} elsif ($i == 25) {$x = 'M';} print "banything_2GbNew2.pl -a /ifs65/dtap/bin/bwa/bwa-0.2.0/bwa -z 1000000 -in chr$x.fa -o chr$x.bwa -stdout chr$x.bwa -pre \"aln -o 0 /info/dtap/projects/1057_CNV/HG18/HG18 \" -suf \" \" \n";}' >! batch_banything chmod +777 batch_banything batch_banything # Method 2 perl -e 'for ($i =1;$i<= 25; $i++) {$x = $i; if ($i == 23) {$x = 'X';} elsif ($i == 24) {$x = 'Y';} elsif ($i == 25) {$x = 'M';} print "/ifs65/dtap/bin/bwa/bwa-0.2.0/bwa aln -o 0 /info/dtap/projects/1057_CNV/HG18/HG18 chr$x.fa > chr$x.bwa\n"}' >! 
chmod +777 batch_banything
perl -ne 'chomp; $a = "bsub -q long64 \"$_\"\n"; system($a);' batch_banything

#####################
# Uniqueness Step three
# I ran this one-liner from a higher level directory
perl -e '$pwd = `pwd`; chomp($pwd); @a = `ls`; foreach $dir (@a) {chomp ($dir); unless ($dir =~ /(\d+)mer_2nd/) {next;}; @b = `ls $dir/*fa.bwa`; foreach $file (@b) {chomp($file); $f = "$pwd/$file"; $f =~ /^(\S+chr[^\.]+)\.*/; $e = $1; print "~/DTcode/CNV_parseBWA_wiggle.pl 100 1 $f\* > $e.quality.100.wiggle\n";}}' > batch_wiggle
# Submit batch file to cluster (we use LSF), each line is a submission
perl -ne 'chomp; $a = "bsub -q long64 \"$_\"\n"; system($a);' batch_wiggle

#!/usr/bin/perl -w
# John Castle
# May 19, 2009
# $Cap       a maximum value to clip data with
# $Use_score whether to output the uniqueness score or the number of hits
# @FilesIn   the BWA text output files to scan
# ** NOTE ** The newer BWA algorithm outputs a binary file that is then made
# into a text file using BWA again. However, the text file output has a
# slightly different format so the parsing will need to change.
($Cap, $Use_score, @FilesIn) = @ARGV;
if ($FilesIn[0] =~ /\.gz/) { open(IN,"gzip -dc $FilesIn[0] |") }
else { open(IN,$FilesIn[0]); }
#### Description
@a = split("\t", <IN>);
$a[6] =~ /(\d+)/;
$len = $1;
close(IN);
### Wiggle header text
if ($Use_score == 0) {
    print "track type=wiggle_0 name=\"Alignment scores of $len\mer as\" description=\"Unique $len mer alignments\" color=100,50,150 gridDefault=on yLineOnOff=on visibility=full maxHeightPixels=40:40:12\n";
} else {
    print "track type=wiggle_0 name=\"$len\mer alignment scores\" description=\"$len\mer alignment scores from BWA/MAQ, where 37 indicates a unique alignment\" color=100,50,150 gridDefault=on yLineOnOff=on visibility=full maxHeightPixels=40:40:12\n";
}
### Parse through file(s)
foreach $file (@FilesIn) {
    if ($file =~ /\.gz/) { open(IN,"gzip -dc $file |"); }
    else { open(IN,$file); }
    @a = split("\t", <IN>);
    $a[0] =~ /(chr\S+):(\d+)/;
    $Chr = $1;
    $start = $2;
    $score = $a[5];
    $hits = $a[11];
    if ($hits > $Cap) {$hits = $Cap;}
    if ($Use_score == 1) {$value = $score;} else {$value = $hits;}
    while (<IN>) {
        # Make wiggle track, with start and end coordinates for same scoring regions
        @a = split("\t",$_);
        if ($#a < 15) { next; }
        $a[0] =~ /(chr\S+):(\d+)/;
        $chr = $1;
        $pos = $2;
        $score = $a[5];
        $hits = $a[11];
        if ($hits > $Cap) {$hits = $Cap;}
        if ($Use_score == 1) {$x = $score; } else {$x = $hits;}
        if ($x != $value) {
            print "$Chr\t$start\t$pos\t$value\n";
            $Chr = $chr;
            $value = $x;
            $start = $pos;
        }
    }
    print "$Chr\t$start\t$pos\t$value\n";
    close(IN);
}

############################################################################
# Re-Run equCab2 alignment (DONE - 2009-06-29,07-02 - Hiram)
mkdir /hive/data/genomes/hg18/bed/lastzEquCab2.2009-06-29
cd /hive/data/genomes/hg18/bed/lastzEquCab2.2009-06-29
cat << '_EOF_' > DEF
# Human vs.
Horse BLASTZ_M=50 # TARGET: Human hg18 SEQ1_DIR=/scratch/data/hg18/bothMaskedNibs SEQ1_LEN=/scratch/data/hg18/chrom.sizes SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY: Horse SEQ2_DIR=/scratch/data/equCab2/equCab2.2bit SEQ2_LEN=/scratch/data/equCab2/chrom.sizes SEQ2_CTGDIR=/hive/data/genomes/equCab2/equCab2.UnScaffolds.2bit SEQ2_CTGLEN=/hive/data/genomes/equCab2/equCab2.UnScaffolds.sizes SEQ2_LIFT=/hive/data/genomes/equCab2/jkStuff/equCab2.chrUn.lift SEQ2_CHUNK=20000000 SEQ2_LIMIT=100 SEQ2_LAP=0 BASE=/hive/data/genomes/hg18/bed/lastzEquCab2.2009-06-29 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs time doBlastzChainNet.pl `pwd`/DEF \ -noLoadChainSplit -verbose=2 -bigClusterHub=swarm \ -workhorse=hgwdev \ -chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 & # real 582m47.015s # failed due to power failure - Mon Jun 29 23:32:54 PDT 2009 time doBlastzChainNet.pl `pwd`/DEF \ -noLoadChainSplit -verbose=2 -bigClusterHub=swarm \ -continue=chainRun -workhorse=hgwdev \ -chainMinScore=3000 -chainLinearGap=medium > chainRun.log 2>&1 & # real 430m13.886s cat fb.hg18.chainEquCab2Link.txt # 1647122438 bases of 2881515245 (57.162%) in intersection mkdir /hive/data/genomes/equCab2/bed/blastz.hg18.swap cd /hive/data/genomes/equCab2/bed/blastz.hg18.swap time doBlastzChainNet.pl \ /hive/data/genomes/hg18/bed/lastzEquCab2.2009-06-29/DEF \ -noLoadChainSplit -verbose=2 -bigClusterHub=swarm \ -swap -workhorse=hgwdev \ -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 & # real 238m42.004s cat fb.equCab2.chainHg18Link.txt # 1622340736 bases of 2428790173 (66.796%) in intersection ############################################################################ # Fantom Cage 4 Track (2009-07-16) cd /projects/compbiousr/sugnet/projects/cage-20090428 mkdir data cd data # Get the Human tags from Riken's download site. 
wget -r -l 3 http://fantom.gsc.riken.jp/4/download/Tables/human/CAGE/mapping/ # Apparently time series with hours at: # 4,5,6,8,10,11,15,21,22,27,28,33,34,35,37,40,42,43,45,47,48,49,51,52,53,57,59,61,62,63,64,65,69,73,74,91,92,93,h95 ctrls, i02, i03 # Goto the data directory cd /projects/compbiousr/sugnet/projects/cage-20090428/data/fantom.gsc.riken.jp/4/download/Tables/human/CAGE/mapping/ # Unzip data for bz in `ls *.bz2`; do \ echo "Unzipping $bz"; \ bunzip2 $bz; \ done # From column headers it looks like the values of interest are: # 0 = id # 1 = library_count # 2 = edit_string # 3 = chrom # 4 = strand # 5 = start # 6 = end # Pull the raw scores into a single file cat h*_mapping.tbl.txt | grep -v '^#' | grep -v 'library_count' | grep 'chr' | perl -ne '$l=$_; @w = split /\t/, $l; print "$w[3]\t$w[5]\t$w[6]\t$w[0]\t$w[1]\t$w[4]\n";' > all.wscores.bed cat << '_EOF_' > toBed.pl #!/usr/bin/perl $prefix = shift(@ARGV); $prefix =~ s/h/H/g; while($l = <>) { if(!($l=~ /^\#/) && !($l=~/^id/)) { chomp($l); @w = split /\t/, $l; $score = 100 * $w[1]; if($score > 1000) { $score = 1000; } $name = $prefix; $size = $w[6] - $w[5]; print "$w[3]\t$w[5]\t$w[6]\t$prefix\t$score\t$w[4]\t$w[5]\t$w[6]\t0\t1\t$size,\t0,\n"; } } '_EOF_' # << happy emacs chmod 755 toBed.pl # Make the top level bed track for f in `ls *mapping.tbl.txt`; do root=`basename $f .txt`; prefix=`basename $f _mapping.tbl.txt`; bed=$root.bed; echo "Reading from $f into $bed with prefix $prefix"; toBed.pl $prefix < $f > $bed; done; # Call program in stats mode to generate summary statistics about how many reads there are in a sliding window around # sites with tags cageSingleTrack -input=all.wscores.bed -forward=all.forward.plaw.scores -reverse=all.reverse.plaw.scores -stats-only # Grab every 100th record to make a bite (byte?) 
sized chunk for R cat all.forward.plaw.scores | perl -e '$c = 0; while($l=<>) { if($c++ % 100 == 0) { print "$l"; } }' > sample.txt # Some R code to fit a power law model and get coefficient via log/log line fit d = read.table('sample.txt'); # Grab all the data less than 200 counts (81% of data) as that is where the model really fits dd = d$V4[d$V4 < 200] # Use hist command to find counts at each bucket size h = hist(dd, 200, plot=F) # Take the logs y = log10(h$counts) x = log10(h$breaks[1:198]) # Fit a robust line library(MASS) r = rlm(y~x) # Call: # rlm(formula = y ~ x) # Converged in 5 iterations # # Coefficients: #(Intercept) x # 3.987744 -1.196954 # Visually note that the data fits a power law nicely plot(log10(h$breaks[1:198]),log10(h$counts), xlab="Log10 Tags In Window", ylab="Log10 Number of Times Occuring", main="Distribution of CAGE Tags in Sliding 35bp Window") abline(r) # Using the coefficient learned above predict the posterior probability of seeing this observation cageSingleTrack -input=all.wscores.bed -forward=all.forward.plaw.bg2 -reverse=all.reverse.plaw.bg2 -alpha=1.196954 -xmax=198 # Load up the bed graph tracks hgLoadBed -bedGraph=4 hg18 FantomCageForwardPowerLawGraph all.forward.plaw.bg2 hgLoadBed -bedGraph=4 hg18 FantomCageReversePowerLawGraph all.reverse.plaw.bg2 ############################################################################ # ENCODE PHASED GENOTYPES for NA12878 (DONE 7/22/09 angie) mkdir /hive/data/genomes/hg18/bed/phasedGenotypesNA12878 cd /hive/data/genomes/hg18/bed/phasedGenotypesNA12878 wget http://illumina-mac.stanford.edu/NA12878_Reference_Genome/code/CEU.trio.dec.with.x.with.rs.calls wget http://illumina-mac.stanford.edu/NA12878_Reference_Genome/code/{Makefile,PhaseSNPs.pm} #TODO: strip homozyg-same-as-reference SNPs from CEU.trio, then make: make NA12878_SNPs_Phased.bed perl -wpe '/^(\w+)\t(\d+)\t(\d+)\t([ACGT])\/([ACGT])\t([MP\/HA]+)$/ || die "parse\n$_\t"; \ ($c, $s, $e, $a1, $a2, $t) = ($1, $2, $3, $4, $5, $6); \ if ($t eq "M/P") { \ $_ = "$c\t$s\t$e\tM:$a1\n" . "$c\t$s\t$e\tP:$a2\n"; \ } elsif ($t eq "P/M") { \ $_ = "$c\t$s\t$e\tM:$a2\n" . "$c\t$s\t$e\tP:$a1\n"; \ } elsif ($t eq "H") { \ $_ = "$c\t$s\t$e\t$a1\n"; \ } elsif ($t eq "A") { \ $_ = "$c\t$s\t$e\tA:$a1/$a2\n"; \ } else { die "unrec type $t"; } \ ' NA12878_SNPs_Phased.bed \ > phasedGenotypesNA12878.bed hgLoadBed -noNameIx hg18 phasedGenotypesNA12878 phasedGenotypesNA12878.bed #Loaded 5469032 elements of size 4 ############################################################################ # TRANSMAP vertebrate.2009-07-01 build (2009-07-21 markd) vertebrate-wide transMap alignments were built Tracks are created and loaded by a single Makefile. This is available from: svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2009-07-01 see doc/builds.txt for specific details. 
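A minimal sketch of pulling that tagged tree for inspection (assuming read access
to the compbio svn repository; the local directory name below is arbitrary):
    svn co svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2009-07-01 \
        transMap.vertebrate.2009-07-01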
############################################################################ # rnaBinding RNA Binding Proteins (2009-07-28 markd) # contributor: Jeremy Sanford # sfrs1Input BED table: # need to drop color, as it's in the wrong column # skip header tawk 'NR>1{print $1,$2,$3,$4,$5,$6}' Input_sequence_blocks.bed | hgLoadBed hg18 sfrs1Input stdin # sfrs1Clip BED table: # skip header tawk 'NR>1{print $1,$2,$3,$4,$5,$6}' SFRS1_CLIP_sequence_blocks.bed | hgLoadBed hg18 sfrs1Clip stdin # SFRS1_consensus_sites.wig tawk 'NR>1' SFRS1_consensus_sites.wig | wigEncode stdin sfrs1ConsensusSites.wig sfrs1ConsensusSites.wib # Converted stdin, upper limit 11.63, lower limit -28.64 hgLoadWiggle -pathPrefix=/gbdb/hg18/wib hg18 sfrs1ConsensusSites sfrs1ConsensusSites.wig ln -s $(pwd -P)/sfrs1ConsensusSites.wib /gbdb/hg18/wib/ ############################################################################ # VEGA GENES UPDATE TO BUILD 35 (DONE, 2009-08-04 - 2009-09-09, hartera) # Needs updating as the current version is build 33. # Download the human VEGA Genes posted on ftp site on 2009-03-31 # 2009-08-03 (hartera) - Added code to register track handler for # vegaGeneComposite. # 2009-08-15 - 2009-08-16 (hartera) - Added code to allow use of radio buttons # on the configuratio page for the track item labels. Modified code so it # can be shared with Ensembl to create the links to Vega transcript, gene # and protein reports on the details pages. # 2009-08-22 - Finished code for adding Vega report URLs to the details pages. # Loaded the vegaGtp table. # 2009-09-01 - 2009-09-02 (hartera). Loaded a vegaPep table for the protein # sequence link on the details pages. # 2009-09-04 Re-load all tables as some reverted to the older version during # mySQL 5 upgrade. # 2009-09-08 - 2009-09-09 Code change to change message on details page when # no protein is available and change to trackDb to make vegaGene items a # darker blue colour. Reloaded vegaPep after removing proteins whose # transcripts are not in vegaGtp to make all.joiner happy. mkdir /hive/data/genomes/hg18/bed/vega35 cd /hive/data/genomes/hg18/bed/vega35 wget --timestamping "ftp://ftp.sanger.ac.uk/pub/vega/human/*" \ "ftp://ftp.sanger.ac.uk/pub/vega/human/pep/*.tot.fa.gz" zcat gtf_file.gz | sed -e "s/^\([0-9XY][0-9]*\)/chr\1/; s/^MT/chrM/; s/^Un/chrUn/" \ | grep "^chr" > nonHaps.gtf zcat gtf_file.gz | sed -e "s/^\([0-9XY][0-9]*\)/chr\1/; s/^MT/chrM/; s/^Un/chrUn/" \ | grep -v "^chr" > haps.gtf awk 'BEGIN{OFS="\t";FS="\t";}{ if ($1 == "c6_COX") { if (($4 >= 28688544) && ($5 <= 33420241)) print; } else if ($1 == "c6_QBL") { if (($4 >= 28885510) && ($5 <= 33451440)) print;}}' haps.gtf > keptHaps.gtf liftUp -type=.gtf lifted.gtf /cluster/data/hg18/jkStuff/ensGene.haplotype.lift carry keptHaps.gtf cat nonHaps.gtf lifted.gtf > all.gtf # Do this to create the infoOut.txt file and extract the extra information gtfToGenePred -infoOut=infoOut.txt -genePredExt all.gtf stdout | gzip > tempAll.gp.gz ~/kent/src/hg/utils/automation/extractGtf.pl infoOut.txt > vegaGtp.tab # Change the gene name to have the gene_id label so that this is in the # name2 field of the extended genePred table. This can then be displayed # at the track item label. 
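# For example, an attribute pair like (hypothetical values):
#   gene_id "OTTHUMG00000012345"; gene_name "ABC1";
# becomes, after the two substitutions below:
#   other_gene_id "OTTHUMG00000012345"; gene_id "ABC1";
# so gtfToGenePred -genePredExt carries the gene symbol through to name2.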
perl -pi.bak -e 's/gene_id/other_gene_id/' all.gtf perl -pi.bak -e 's/gene_name/gene_id/' all.gtf gzip all.gtf rm *.gtf tempAll.gp.gz # create genePred files for loading into database gtfToGenePred -genePredExt all.gtf.gz stdout | gzip > all.gp.gz genePredCheck -db=hg18 all.gp.gz # checked: 81244 failed: 0 zcat all.gtf.gz | grep -i pseudo > pseudo.gtf zcat all.gtf.gz | grep -v -i pseudo > not.pseudo.gtf gtfToGenePred -genePredExt pseudo.gtf pseudo.gp gtfToGenePred -genePredExt not.pseudo.gtf not.pseudo.gp genePredCheck -db=hg18 pseudo.gp # checked: 8331 failed: 0 genePredCheck -db=hg18 not.pseudo.gp # checked: 72913 failed: 0 hgLoadGenePred -genePredExt hg18 vegaGene not.pseudo.gp hgLoadGenePred -genePredExt hg18 vegaPseudoGene pseudo.gp # Added code to src/hg/hgTracks/simpleTracks.c to register a track handler # for vegaGeneComposite that is now used for this data. This used # vegaGeneMethods to display the name2 field (gene) as the item label in # the track. ############################################################################ # EPO ANCESTRAL REGIONS (DONE 8/5/09 angie) # Use Aspera client to download 1000Genomes' Enredo-Pecan-Ortheus # four-catarrhini ancestral-tree calls for genome regions, as well as # their annotated fasta (requested by Sol Katzman): cd /hive/data/outside/1000genomes/ncbi/ftp-trace.ncbi.nih.gov/1000genomes/ set asperaInst = /opt/aspera/connect set ascp = $asperaInst/bin/ascp set aKey = $asperaInst/etc/asperaweb_id_dsa.putty set aOpts = "-i $aKey -QTr -l300M -q" set server = anonftp@ftp-private.ncbi.nlm.nih.gov set aliDir = technical/reference/ancestral_alignments mkdir -p $aliDir cd $aliDir foreach f (MD5SUM README README.ancestral_alignments summary.txt \ $ascp $aOpts $server\:/1000genomes/ftp/$aliDir/$f . end foreach c (1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 X Y) $ascp $aOpts $server\:/1000genomes/ftp/$aliDir/human_ancestor$c.bed $ascp $aOpts $server\:/1000genomes/ftp/$aliDir/human_ancestor$c.fa.bz2 end chmod 444 * # Check md5sums: perl -wpe 'chomp; ($expSum, $f) = split; $actSum = `md5sum $f`; $actSum =~ s/ .*\n//; \ $_ = ""; \ if ($expSum ne $actSum) { warn "MISMATCH: $f exp=$expSum, actual=$actSum\n"; } \ else {print "$f OK\n";}' MD5SUM # Shortcut requested by Sol: ln -s `pwd` /hive/data/outside/ancestral.epo.hg18 # Load up the regions: mkdir /hive/data/genomes/hg18/bed/epoAncestralRegions cd /hive/data/genomes/hg18/bed/epoAncestralRegions set aliPath = /hive/data/outside/1000genomes/ncbi/ftp-trace.ncbi.nih.gov/1000genomes/$aliDir sed -e 's/^/chr/' $aliPath/human_ancestor_*.bed > epoAncestralRegions.bed hgLoadBed hg18 epoAncestralRegions epoAncestralRegions.bed -tab -allowStartEqualEnd #Loaded 10195 elements of size 4 featureBits hg18 epoAncestralRegions #2778857014 bases of 2881515245 (96.437%) in intersection featureBits hg18 -countGaps gap epoAncestralRegions #6232933 bases of 3107677273 (0.201%) in intersection # 2009-08-16 (hartera) # ensGtp table definition is in ~/kent/src/hg/lib/ensGtp.sql # There is an index on the protein field so it can not be NULL. # If there is no protein, the gene name is given. # Added code to hgTracks.c and hgTrackUi.c to allow the use of # radio buttons on the track configuratioin page to select the # gene name, accession or both to be displayed in the track. # The gene name is displayed by default. # Added code to hgc.c so that Ensembl and Vega can share code to # create links on the details pages to the Vega reports for transcript, # gene and protein through these IDs. 
Created new function # printEnsemblOrVegaCustomUrl(). # 2009-08-22 (hartera) # Create a vegaGtp table using the vegaGtp.tab file above. Use ensGtp.sql # to create the table. vegaGtp associates geneId/transcriptId/proteinId # for the links to Vega reports from the details page. If there is no # protein ID because the transcript is noncoding, the gene name is used # instead. This field can not be NULL in the table as there is an index # on it. cd /hive/data/genomes/hg18/bed/vega35 cp ~/kent/src/hg/lib/ensGtp.sql . # One of the gene names is long for a noncoding gene so it does not fit # in the protein ID field so change the protein field in ensGtp.sql # to allow 40 chars instead of 20 and re-load the table. hgsql -e 'drop table vegaGtp;' hg18 hgLoadSqlTab hg18 vegaGtp ensGtp.sql vegaGtp.tab # Loaded succesfully # Added code to hgc.c to use printEnsemblOrVegaCustomUrl() in # doVegaGene() to add the links to Vega reports on the details pages. # Code was added so that there is no protein sequence link on the details # page if it there is none available e.g. noncoding. # 2009-09-01 - 2009-09-02 (hartera) # Coding genes are displaying the message that there is no protein # prediction available. Need to add a vegaPep table. cd /hive/data/genomes/hg18/bed/vega35 # from the Ensembl process: zcat Homo_sapiens.VEGA.mar.pep.tot.fa.gz \ | sed -e 's/^>.* Transcript:/>/;' | gzip > vegaPep.txt.gz zcat vegaPep.txt.gz \ | ~/kent/src/utils/faToTab/faToTab.pl /dev/null /dev/stdin \ | sed -e '/^$/d; s/*$//' | sort > vegaPep.hg18.fa.tab # Load table hgPepPred hg18 tab vegaPep vegaPep.hg18.fa.tab # Add vegaPep to the trackDb.ra entry for the vegaGeneComposite track # in the type line for src/hg/makeDb/trackDb/human/hg18/trackDb.ra. # Check that the vegaPep table looks ok and then check protein-coding and # noncoding transcript details pages for protein links. # 2009-09-04, hartera # Re-load tables after upgrade to mySQL 5 as they had reverted back to # tables with the previous Vega dataset. cd /hive/data/genomes/hg18/bed/vega35 hgsql -e 'drop table vegaGene;' hg18 hgsql -e 'drop table vegaPseudoGene;' hg18 hgLoadGenePred -genePredExt hg18 vegaGene not.pseudo.gp hgLoadGenePred -genePredExt hg18 vegaPseudoGene pseudo.gp hgsql -e 'drop table vegaGtp;' hg18 hgLoadSqlTab hg18 vegaGtp ensGtp.sql vegaGtp.tab hgsql -e 'drop table vegaPep;' hg18 hgPepPred hg18 tab vegaPep vegaPep.hg18.fa.tab # 2009-09-08 (hartera). Changed message in code for details page when no # protein sequence is available to be more explanatory. "Non-protein # coding gene or gene fragment, no protein prediction available." Changed # the colouring for the vegaGene subtrack to be darker blue so there is # more of a contrast between vegaGene and vegaPseudoGene subtracks. # 2009-09-09 (hartera) - re-loaded vegaPep table with only those proteins # that have a transcript ID in vegaGtp. # all.joiner is complaining as there are about 23,000 extra proteins in # vegaPep that do not have transcripts in vegaGtp. Decided to remove these # and e-mailed the HAVANA group to ask about the discrepancy. 
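# The ID bookkeeping below confirms the overlap with sorted ID lists and comm;
# an equivalent filter could also be written without SQL by joining the peptide
# tab file against the transcripts that have an OTTHUMP protein in vegaGtp
# (a sketch only, bash syntax, output file name arbitrary; not what was run):
#   awk -F'\t' '$3 ~ /^OTTHUMP/ {print $2}' vegaGtp.tab | sort -u > txWithProt.ids
#   sort -k1,1 vegaPep.hg18.fa.tab | join -t $'\t' txWithProt.ids - > vegaPepFiltered.tab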
cd /hive/data/genomes/hg18/bed/vega35 awk '{print $2}' vegaGtp.tab | sort | uniq > vegaGtp.tx.ids awk '{print $1}' vegaPep.hg18.fa.tab | sort | uniq > vegaPep.tx.ids wc -l *.tx.ids # 81244 vegaGtp.tx.ids # 60003 vegaPep.tx.ids # Number of transcripts that have a protein ID: hgsql -Ne 'select transcript from vegaGtp where protein like "OTTHUMP%";' \ hg18 | sort | uniq > vegaGtpWithProt.tx.ids wc -l vegaGtpWithProt.tx.ids # 36747 vegaGtpWithProt.tx.ids # find those that are common to both. comm -12 vegaGtp.tx.ids vegaPep.tx.ids > pepandGtp.tx.ids wc -l pepandGtp.tx.ids # 36747 pepandGtp.tx.ids comm -12 pepandGtp.tx.ids vegaGtpWithProt.tx.ids | wc -l # 36747 # Therefore all the vegaGtp transcripts with a protein ID are in the # protein FASTA file. hgsql -Ne 'select * from vegaPep as p, vegaGtp as g where g.protein \ like "OTTHUMP%" and p.name = g.transcript;' hg18 \ > vegaPepOnlyInGtp.hg18.fa.tab wc -l vegaPepOnlyInGtp.hg18.fa.tab # 36747 vegaPepOnlyInGtp.hg18.fa.tab hgsql -e 'drop table vegaPep;' hg18 hgPepPred hg18 tab vegaPep vegaPepOnlyInGtp.hg18.fa.tab ############################################################################# # lsSnpPdb: import of LS-SNP/PDB data for SNP 130 (2009-02-02 markd) # down load from JHU ssh genbank sudo su - genbank cd /cluster/data/genbank ./bin/lsSnpPdbDownloadStep hg18 # load into hgwdev database ssh hgwdev cd /cluster/data/genbank ./bin/lsSnpPdbDbLoadStep hg18 # once this has been QAed, will auto-update from genbank scripts ############################################################################# # BURGE LAB DATA MAPPED WITH GEMMAPPER. PROVIDED BY THOMAS DERRIEN FROM RODERIC # GUIGO'S LAB AT CRG. (E-MAIL: thomas.derrien@crg.es) # (hartera, 2009-09-13 - 2009-09-16, DONE) # 2009-12-14, hartera. Set cdsStart = cdsEnd = 0. Moved track data directory to # /hive/data/genomes/hg18/bed. # 2010-01-04, hartera. Change the data to BED format and re-loaded tables. BED # is more appropriate for this data type. # The data is too dense in places (feedback from QA) so it would be more # appropriate to have a Signal track as for the ENCODE RNA-seq data tracks. # 2010-02-09, hartera. Create bedGraph Signal subtracks for each tissue/cell # using reads/per million mapped reads as the data value. # 2010-02-17, hartera. Updated trackDb.ra entry to include views. # 2010-05-15 and 2010-05-16, hartera. Re-created the Signal subtracks using # the -bed12 option of bedItemOverlapCount so that blocks are used. 
mkdir /hive/groups/gencode/browser/hg18/burgeRnaSeqGemMapperAlign cd /hive/groups/gencode/browser/hg18/burgeRnaSeqGemMapperAlign wget --timestamping "http://genome.imim.es/~tderrien/Gencode/For_Rachel/GFF_Burge_32bp_Gencode2b/GSM325476_brain_HCT168_hg18.fa.GEM_index.hg18.gencode_data.rel2b.length_32.gff.gz" wget --timestamping "http://genome.imim.es/~tderrien/Gencode/For_Rachel/GFF_Burge_32bp_Gencode2b/GSM325477_liver_HCT169_hg18.fa.GEM_index.hg18.gencode_data.rel2b.length_32.gff.gz" wget --timestamping "http://genome.imim.es/~tderrien/Gencode/For_Rachel/GFF_Burge_32bp_Gencode2b/GSM325478_heart_HCT170_hg18.fa.GEM_index.hg18.gencode_data.rel2b.length_32.gff.gz" wget --timestamping "http://genome.imim.es/~tderrien/Gencode/For_Rachel/GFF_Burge_32bp_Gencode2b/GSM325479_skelMuscle_HCT171_hg18.fa.GEM_index.hg18.gencode_data.rel2b.length_32.gff.gz" wget --timestamping "http://genome.imim.es/~tderrien/Gencode/For_Rachel/GFF_Burge_32bp_Gencode2b/GSM325480_colon_hg18.fa.GEM_index.hg18.gencode_data.rel2b.length_32.gff.gz" wget --timestamping "http://genome.imim.es/~tderrien/Gencode/For_Rachel/GFF_Burge_32bp_Gencode2b/GSM325481_adipose_hg18.fa.GEM_index.hg18.gencode_data.rel2b.length_32.gff.gz" wget --timestamping "http://genome.imim.es/~tderrien/Gencode/For_Rachel/GFF_Burge_32bp_Gencode2b/GSM325482_testes_hg18.fa.GEM_index.hg18.gencode_data.rel2b.length_32.gff.gz" wget --timestamping "http://genome.imim.es/~tderrien/Gencode/For_Rachel/GFF_Burge_32bp_Gencode2b/GSM325483_lymphNode_hg18.fa.GEM_index.hg18.gencode_data.rel2b.length_32.gff.gz" wget --timestamping "http://genome.imim.es/~tderrien/Gencode/For_Rachel/GFF_Burge_32bp_Gencode2b/GSM325484_HCT204_bt474_.fa.GEM_index.hg18.gencode_data.rel2b.length_32.gff.gz" wget --timestamping "http://genome.imim.es/~tderrien/Gencode/For_Rachel/GFF_Burge_32bp_Gencode2b/GSM325485_HCT205_HME_.fa.GEM_index.hg18.gencode_data.rel2b.length_32.gff.gz" wget --timestamping "http://genome.imim.es/~tderrien/Gencode/For_Rachel/GFF_Burge_32bp_Gencode2b/GSM325486_HCT202_s2468_.fa.GEM_index.hg18.gencode_data.rel2b.length_32.gff.gz" wget --timestamping "http://genome.imim.es/~tderrien/Gencode/For_Rachel/GFF_Burge_32bp_Gencode2b/GSM325487_HCT203_s2468.fa.GEM_index.hg18.gencode_data.rel2b.length_32.gff.gz" wget --timestamping "http://genome.imim.es/~tderrien/Gencode/For_Rachel/GFF_Burge_32bp_Gencode2b/GSM325488_HCT206_s2468_.fa.GEM_index.hg18.gencode_data.rel2b.length_32.gff.gz" wget --timestamping "http://genome.imim.es/~tderrien/Gencode/For_Rachel/GFF_Burge_32bp_Gencode2b/GSM325489_HCT207_s2468_.fa.GEM_index.hg18.gencode_data.rel2b.length_32.gff.gz" cd /hive/groups/gencode/browser/hg18/burgeRnaSeqGemMapperAlign # Load this data into tables for hg18. # Unzip the files: gunzip *.gff.gz # Create a file with the list of file names and tissues. ls *.gff > burgeDataFiles.txt GSM325486_HCT202_s2468 breast GSM325487_HCT203_s2468 MCF-7 GSM325488_HCT206_s2468 MB435 GSM325489_HCT207_s2468 T47D # Did not map these two as they are not 32 bp. GSM325490_brain_s1368 MAQC mixed human brain tissue/cell lines GSM325491_UHR_s247 MAQC_UHR mixed human cell lines # Edit the file above to add a tab separation between file name and tissue # name. Then remove the "read_name: " from the last field in each # file otherwise it gets included in the name and load the data into hg18. 
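# For example, a GFF attribute column ending in (hypothetical read id):
#   read_name: HWI-EAS229_1:2:1:12:345
# should be reduced to just:
#   HWI-EAS229_1:2:1:12:345
# so that only the read identifier is used as the item name when loading.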
# Write a script to do this: cat << '_EOF_' > formatAndLoadData #!/bin/bash -e # Assign variables # Tab-separated file of file names and tissue/cell line names DATAFILES=$1 # track name used as prefix for subtracks TRACK=$2 # database DATABASE=$3 cat $DATAFILES | while read file tissue; do subTrack=`echo $TRACK$tissue` echo $subTrack sed -e 's/read_name:\s//' $file > ${subTrack}.gff ldHgGene -exon=read $DATABASE ${subTrack} ${subTrack}.gff done '_EOF_' # << emacs chmod +x formatAndLoadData ./formatAndLoadData burgeDataFiles.txt burgeRnaSeqGemMapperAlign hg18 \ > load.log & # Added a trackDb entry in # ccds/trunk/gencode/browser/trackDb/human/hg18/trackDb.ra # 2009-12-14, Need to change cdsStart = cdsEnd = 0 in the table as this # data should have no CDS defined. Currently cdsStart = cdsEnd = txEnd. cd /hive/groups/gencode/browser/hg18/burgeRnaSeqGemMapperAlign hgsql -Ne 'show tables like "burge%";' hg18 > burgeTables foreach t (`cat burgeTables`) echo $t hgsql -e "update $t set cdsStart = 0;" hg18 hgsql -e "update $t set cdsEnd = 0;" hg18 end # Then move data to directory in hg18 genome bed directory cd /hive/data/genomes/hg18/bed mv /hive/groups/gencode/browser/hg18/burgeRnaSeqGemMapperAlign ./ # 2010-01-04 Change the data to BED format. For genePred format, # there is always a track configuration added for colouring tracks by # genomic codons which does not make sense for this data. Also, BED is # more appropriate for this data type. cd /hive/data/genomes/hg18/bed/burgeRnaSeqGemMapperAlign # Convert gff to genePred and then genePred to BED, drop old table and # then load database with BED format data. Need to fix the cdStart and # cdsEnd fields to be 0. foreach f (`ls burgeRnaSeqGemMapperAlign*.gff`) echo $f >> bed.log set g=$f:r echo $g ldHgGene -exon=read -nobin -out=${g}.gp hg18 $g $f >>& bed.log awk 'BEGIN {FS="\t"} {OFS="\t"} {print $1,$2,$3,$4,$5,0,0,$8,$9,$10}' \ ${g}.gp > ${g}Fixed.gp genePredToBed ${g}Fixed.gp > ${g}.bed echo "Dropping table $g" hgsql -e "drop table ${g};" hg18 hgLoadBed hg18 $g ${g}.bed >>& bed.log end # Changed track type in trackDb/human/trackDb.ra to bed 12 and # then did make alpha in trackDb directory. # 2010-02-17 # trackDb.ra entry in trackDb/human was updated to include views for the # Raw Signal and Alignment subtracks. # 2010-05-15 and 2010-05-16. Add Signal tracks so it is easier to view the # data in regions where there is a high density of reads. cd /hive/data/genomes/hg18/bed/burgeRnaSeqGemMapperAlign # Use bedItemOverlapCount to get counts of overlapping items for each base. # Need to sort the bed files and then get the number of reads mapped for # that tissue. Divide the counts by the number of million mapped reads to # get reads per million mapped reads as the data value. # Re-make the subtracks using the -bed12 option so that blocks are used # instead of just the first three fields of the BED file as is the default. rm *.count *.bedGraph foreach f (`ls *.bed`) echo $f set g=$f:r sort ${f} | bedItemOverlapCount -bed12 hg18 stdin > ${f}.count set size=`hgsql -Ne "select count(distinct name) from ${g};" hg18` awk -v size=${size} 'BEGIN {OFS="\t"} {print $1,$2,$3,($4 / (size/1000000));}' ${f}.count > ${g}.bedGraph end # Load the bedGraph tables into the database as Raw Signal tracks. 
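# (Each bedGraph value computed above is reads per million mapped reads: the
# per-base overlap count divided by (number of distinct mapped reads / 1e6).
# For example, hypothetically, 12 overlapping reads in a library of 4,000,000
# distinct mapped reads gives 12 / (4000000/1000000) = 3.0.)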
foreach f (`ls *.bedGraph`) echo $f set g=$f:r hgsql -e "drop table ${g}AllRawSignal;" hg18 hgLoadBed -bedGraph=4 hg18 ${g}AllRawSignal $f >>& load.log end ############################################################################ # TRANSMAP vertebrate.2009-09-13 build (2009-09-20 markd) vertebrate-wide transMap alignments were built Tracks are created and loaded by a single Makefile. This is available from: svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2009-09-13 see doc/builds.txt for specific details. ############################################################################ # ADD LINK TO GENENETWORK (DONE. 11/06/09 Fan). # Received geneNetwork ID list file, GN_human_RefSeq.txt, for hg18 from GeneNetwork, Zhou Xiaodong [xiaodong.zhou@gmail.com]. ssh hgwdev mkdir -p /cluster/data/hg18/bed/geneNetwork cd /cluster/data/hg18/bed/geneNetwork hgsql hg18 < ~/src/hg/lib/geneNetworkId.sql hgsql hg18 -e \ 'load data local infile "GN_human_RefSeq.txt" into table geneNetworkId' ######################################################################### # BUILD snpArrayIllumina HumanCytoSNP-12 SUB-TRACK (DONE 12/4/09, Fan) # Received raw data file HumanCytoSNP-12_forUCSC.csv # from Illumina, Jennifer L. Stone Ph.D., jstone@illumina.com # mkdir -p /hive/data/genomes/gs.19/build36/bed/snp/illumina/120309 # cd /hive/data/genomes/gs.19/build36/bed/snp/illumina/120309 cat HumanCytoSNP-12_forUCSC.csv |\ sed -e 's/,/\t/g' >HumanCytoSNP.tab hgsql hg18 -e 'drop table snpArrayIlluminaHumanCytoSNP_12Raw' hgsql hg18 < ~/src/hg/lib/snpArrayIlluminaHumanCytoSNP_12Raw.sql hgsql hg18 -e 'load data local infile "HumanCytoSNP.tab" into table snpArrayIlluminaHumanCytoSNP_12Raw ignore 1 lines' ~/src/hg/snp/snpLoad/illuminaLookup1M hg18 snpArrayIlluminaHumanCytoSNP_12Raw snp130 # The illuminaLookup1M generate two files: # # illuminaLookup.out contains all probes found in snp130 # illuminaLookup.err contains all probes not found in snp130 mv illuminaLookup.out illuminaLookupHumanCytoSNP_12a.out cut -f 1 illuminaLookup.err >j.1 cat j.1 |sed -e 's/chrMt/chrM/' |\ sed -e 's/XY/X/' >j.chr cut -f 2-5 illuminaLookup.err >j.2 cut -f 6 illuminaLookup.err >j.3 cat j.3 |sed -e 's/F/+/' |sed -e 's/R/-/' >j.strand cut -f 7 illuminaLookup.err |sed -e "s/\[//" |sed -e "s/\]//" >j.observed paste j.chr j.2 j.strand j.observed >illuminaLookupHumanCytoSNP_12b.out # combine two parts cat illuminaLookupHumanCytoSNP_12a.out illuminaLookupHumanCytoSNP_12b.out >snpArrayIlluminaHumanCytoSNP_12.tab # load the table hgLoadBed -allowStartEqualEnd hg18 snpArrayIlluminaHumanCytoSNP_12 snpArrayIlluminaHumanCytoSNP_12.tab -tab -sqlTable=snpArrayIlluminaHumanCytoSNP_12.sql ############################################################################# # BUILD snpArrayIllumina Human660W-Quad SUB-TRACK (DONE 12/9/09, Fan) # Received raw data file Human660W.ucsc.csv # from Illumina, Jennifer L. 
Stone Ph.D., jstone@illumina.com mkdir -p /hive/data/genomes/gs.19/build36/bed/snp/illumina/120809 cd /hive/data/genomes/gs.19/build36/bed/snp/illumina/120809 cat Human660W.ucsc.csv|\ sed -e 's/,/\t/g' >Human660W.tab hgsql hg18 -e 'drop table snpArrayIlluminaHuman660W_QuadRaw' hgsql hg18 < ~/src/hg/lib/snpArrayIlluminaHuman660W_QuadRaw.sql hgsql hg18 -e 'load data local infile "Human660W.tab" into table snpArrayIlluminaHuman660W_QuadRaw ignore 1 lines' ~/src/hg/snp/snpLoad/illuminaLookup1M hg18 snpArrayIlluminaHuman660W_QuadRaw snp130 # The illuminaLookup1M generate two files: # # illuminaLookup.out contains all probes found in snp130 # illuminaLookup.err contains all probes not found in snp130 mv illuminaLookup.out illuminaLookupHuman660W_Quada.out cut -f 1 illuminaLookup.err >j.1 cat j.1 |sed -e 's/chrMt/chrM/' |\ sed -e 's/XY/X/' >j.chr cut -f 2-5 illuminaLookup.err >j.2 cut -f 6 illuminaLookup.err >j.3 cat j.3 |sed -e 's/F/+/' |sed -e 's/R/-/' >j.strand cut -f 7 illuminaLookup.err |sed -e "s/\[//" |sed -e "s/\]//" >j.observed paste j.chr j.2 j.strand j.observed >illuminaLookupHuman660W_Quadb.out # combine two parts cat illuminaLookupHuman660W_Quada.out illuminaLookupHuman660W_Quadb.out >snpArrayIlluminaHuman660W_Quad.tab # load the table hgLoadBed -allowStartEqualEnd hg18 snpArrayIlluminaHuman660W_Quad snpArrayIlluminaHuman660W_Quad.tab -tab -sqlTable=snpArrayIlluminaHuman660W_Quad.sql ############################################################################# # BUILD snpArrayIllumina Human Omni1-Quad SUB-TRACK (DONE 12/9/09, Fan) # Received raw data file Omni.ucsc.txt # from Illumina, Jennifer L. Stone Ph.D., jstone@illumina.com # mkdir -p /hive/data/genomes/gs.19/build36/bed/snp/illumina/120309 # cd /hive/data/genomes/gs.19/build36/bed/snp/illumina/120309 cat Omni.ucsc.txt |\ sed -e 's/,/\t/g' >HumanOmni1.tab hgsql hg18 -e 'drop table snpArrayIlluminaHumanOmni1_QuadRaw' hgsql hg18 < ~/src/hg/lib/snpArrayIlluminaHumanOmni1_QuadRaw.sql hgsql hg18 -e 'load data local infile "HumanOmni1.tab" into table snpArrayIlluminaHumanOmni1_QuadRaw ignore 1 lines' ~/src/hg/snp/snpLoad/illuminaLookup1M hg18 snpArrayIlluminaHumanOmni1_QuadRaw snp130 # The illuminaLookup1M generate two files: # # illuminaLookup.out contains all probes found in snp130 # illuminaLookup.err contains all probes not found in snp130 mv illuminaLookup.out illuminaLookupHumanOmni1_Quada.out cut -f 1 illuminaLookup.err >j.1 cat j.1 |sed -e 's/chrMt/chrM/' |\ sed -e 's/XY/X/' >j.chr cut -f 2-5 illuminaLookup.err >j.2 cut -f 6 illuminaLookup.err >j.3 cat j.3 |sed -e 's/F/+/' |sed -e 's/R/-/' >j.strand cut -f 7 illuminaLookup.err |sed -e "s/\[//" |sed -e "s/\]//" >j.observed paste j.chr j.2 j.strand j.observed >illuminaLookupHumanOmni1_Quadb.out # combine two parts cat illuminaLookupHumanOmni1_Quada.out illuminaLookupHumanOmni1_Quadb.out >snpArrayIlluminaHumanOmni1_Quad.tab # load the table hgLoadBed -allowStartEqualEnd hg18 snpArrayIlluminaHumanOmni1_Quad snpArrayIlluminaHumanOmni1_Quad.tab -tab -sqlTable=snpArrayIlluminaHumanOmni1_Quad.sql ############################################################################# # NHGRI GWAS CATALOG (DONE 2/4/13 angie) # 2013 updates: 2/4 # 2012 updates: 12/10, 10/4, 8/1, 6/4, 4/4, 2/21 (remove extra whitespace, translate non-ASCII to html), 2/6 # Updated 12/7/11, 11/2/11, 10/3/11, 9/2/11, 8/1/11, 6/9/11, 4/1/11, 3/1/11, 2/1/11 # Updated 12/7/10, 11/1/10, 10/6/10, 9/1/10, 8/2/10, 6/2/10, 5/12/10, 4/1/10, 3/1/10 # Originally done 1/19/10 # Area of possible future improvement: for SNPs that 
can't be mapped via our SNP track, # could some of them be obsolete IDs that have been merged into current IDs? mkdir /hive/data/genomes/hg18/bed/gwasCatalog cd /hive/data/genomes/hg18/bed/gwasCatalog # Done once, don't need to redo: cut -f 1-4 ../snp130/snp130.bed \ | sort -k4,4 \ > snp130Coords.bed set today = `date +%y%m%d` mkdir /hive/data/genomes/hg18/bed/gwasCatalog/$today cd /hive/data/genomes/hg18/bed/gwasCatalog/$today wget http://www.genome.gov/admin/gwascatalog.txt head -1 gwascatalog.txt | sed -re 's/\t/\n/g' # Compare to original column headers -- some additions in June 2011 (2nd column): # 1 1 Date Added to Catalog # 2 2 PubMedID # 3 3 First Author # 4 4 Date # 5 5 Journal # 6 6 Link # 7 7 Study # 8 8 Disease/Trait # 9 9 Initial Sample Size # 10 10 Replication Sample Size # 11 11 Region # 12 Chr_id # 13 Chr_pos # 12 14 Reported Gene(s) # 15 Mapped Gene # 16 Upstream_gene_id # 17 Downstream_gene_id # 18 Snp_gene_ids # 19 Upstream_gene_distance # 20 Downstream_gene_distance # 13 21 Strongest SNP-Risk Allele # 14 22 SNPs # 23 Merged # 24 Snp_id_current # 25 Context # 26 Intergenic # 15 27 Risk Allele Frequency # 16 28 p-Value # 29 Pvalue_mlog # 17 30 p-Value (text) # 18 31 OR or beta # 19 32 95% CI (text) # 20 33 Platform [SNPs passing QC] # 21 34 CNV # Original columns of interest: pretty much all except for Date Added to the Catalog, # and Link which can be generated from PubMedID. Watch out for these: # * Some rows don't name a SNP ("" or "NR") -- in that case, skip. # * Risk allele is not always just a number, may have desc # * Missing data may be "", "NR", "NS" or "Pending" # June 2011 new columns: ignore for now; make new table format if user demand # Use SNPs (comma-sep list) to map to genome coords, and strongest SNP-Risk Allele # as bed 4+ name. perl -MEncode -we 'while (<>) { \ next if (/^\s*$/); \ s/\r$//; \ @w = split("\t"); \ next if ($w[21] !~ /^rs\d+/); \ if ($w[3] =~ /^(\d+)\/(\d+)\/(\d+)$/) { # transform to mysql DATE \ ($month, $day, $year) = ($1, $2, $3); \ $w[3] = "$year-$month-$day"; \ } else { die "Cant parse date ($w[3])\t" } \ $w[21] =~ s/ //g; \ my @snps = split(",", $w[21]); \ # discard columns (use descending order): \ foreach $i (28, 25, 24, 23, 22, 21, 19, 18, 17, 16, 15, 14, 12, 11, 5, 0) { \ splice(@w, $i, 1); \ } \ # trim leading/trailing spaces if any; \ # convert the Unicode in titles to HTML because non-ASCII gives Galaxy trouble. \ foreach $i (0 .. $#w) { \ $w[$i] =~ s/^\s*//; $w[$i] =~ s/\s*$//; \ # ugh, clean out non-utf8 stuff before decoding utf8 into unicode: \ $w[$i] =~ s/\226/-/g; $w[$i] =~ s/\327/x/g; $w[$i] =~ s/\317\?/τ/g; \ $w[$i] =~ s/\342\?\?/1<\/sub>/g; $w[$i] =~ s/\347/c/g; \ $w[$i] =~ s/\351/e/g; $w[$i] =~ s/\353/e/g; \ $w[$i] = decode_utf8($w[$i], Encode::FB_CROAK); \ @chars = split(//, $w[$i]); \ $w[$i] = ""; \ foreach $c (@chars) { \ if (ord($c) > 127) { \ $c = sprintf "&#%d;", ord($c); \ } \ $w[$i] .= $c; \ } \ } \ foreach $s (@snps) { \ print join("\t", $s, @w) . 
"\n"; \ } \ }' \ gwascatalog.txt \ | sort > noCoords.tab join -t " " -1 4 ../snp130Coords.bed noCoords.tab \ -o 1.1,1.2,1.3,1.4,2.2,2.3,2.4,2.5,2.6,2.7,2.8,2.9,2.10,2.11,2.12,2.13,2.14,2.15,2.16,2.17,2.18,2.19 \ | sort -k1,1 -k2n,2n \ > gwasCatalog.bed hgLoadBed hg18 gwasCatalog gwasCatalog.bed \ -tab -sqlTable=$HOME/kent/src/hg/lib/gwasCatalog.sql -notItemRgb -allowStartEqualEnd #Read 10796 elements of size 22 from gwasCatalog.bed # For David: find examples of risk alleles for which dbSNP observed # alleles are complementary (A/T or C/G) -- how do we know what strand the # risk allele is on?? -- asked corresp. author Teri Manolio. Info is not # always available in the original publication, so sadly there is not always # a way to resolve these. GWAS catalog folks aren't going to modify their # database to add a column for these cases. hgsql hg18 -NBe 'select snp.name,gc.riskAllele,snp.strand,snp.refNcbi,snp.observed \ from gwasCatalog as gc, snp130 as snp \ where gc.riskAllele rlike "^rs[0-9]+-[ACGT]" and \ gc.name = snp.name and snp.observed in ("C/G", "A/T") \ order by gc.name;' > ambigStrand.txt wc -l ambigStrand.txt #689 ambigStrand.txt ############################################################################# # CRG MAPABILITY (2010-01-19 - 2010-01-28, hartera, DONE) # Data was provided by Thomas Derrien (thomas.derrien.crg.es) and Paolo Ribeca # from the Guigo lab at the Center for Genomic Regulation (CRG) in Barcelona. # Data was produced using their GEM mapper aligner taking sliding k-mers # window of the human genome that were mapped back onto the genome with up # to 2mismatches. For each window, a mappability score is computed # S = 1/(nb of match_found) and the BigWig index was created according to # this score. # 2010-01-26 Loaded tables and added data to /gbdb/ # 2010-01-28 Changed the table names to have wgEncode prefix for consistency. # Added trackDb entry for the subtracks to the ENCODE Mapability track entry. # 2010-03-16 - 2010-03-18. Added metadata to trackDb for the subtracks and# # added downloads for the bigWig data files. # 2010-04-28 Received new data from Thomas Derrien. Downloaded data and # added it to /gbdb/. A bug was found in a library used by bedGraphToBigWig # so sent a new binary to data providers and they re-created the bigWig files. # 2010-05-12. Updated downloads for the new data files. mkdir -p /hive/data/genomes/hg18/bed/crgMapability cd /hive/data/genomes/hg18/bed/crgMapability cat << 'EOF' > temp #!/bin/tcsh -ef http://genome.imim.es/~tderrien/UCSC_Tracks/ALL_mappablity_hg18_H.sapiens.genome.hg18.main.mappability-36.bw.bz2 http://genome.imim.es/~tderrien/UCSC_Tracks/ALL_mappablity_hg18_H.sapiens.genome.hg18.main.mappability-40.bw.bz2 http://genome.imim.es/~tderrien/UCSC_Tracks/ALL_mappablity_hg18_H.sapiens.genome.hg18.main.mappability-50.bw.bz2 http://genome.imim.es/~tderrien/UCSC_Tracks/ALL_mappablity_hg18_H.sapiens.genome.hg18.main.mappability-75.bw.bz2 http://genome.imim.es/~tderrien/UCSC_Tracks/ALL_mappablity_hg18_H.sapiens.genome.hg18.main.mappability-100.bw.bz2 'EOF' awk '{if ($0 ~ /#/) print; else print "wget --timestamping \"" $0 "\"";}' \ temp > download.csh rm temp chmod +x download.csh ./download.csh >& download.log & # Add the data to /gbdb/ and load the file names into tables (2010-01-26) cd /hive/data/genomes/hg18/bed/crgMapability bunzip2 *.bz2 # Add data to gbdb mkdir -p /gbdb/hg18/bbi/ # Symlink files with names as crgMapabilityAlignXmer.bw to /gbdb/hg18/bbi # and load file name into a table - one per dataset. 
Each table # represents a subtrack. foreach f (`ls *.bw`) echo $f set g=`echo $f | cut -d "-" -f2` set num=`echo $g | cut -d "." -f1` set mer=`echo "${num}mer"` set nf=`echo "crgMapabilityAlign${mer}.bw"` echo $nf ln -s `pwd`/${f} /gbdb/hg18/bbi/${nf} hgsql hg18 -e "drop table if exists crgMapabilityAlign${mer}; \ create table crgMapabilityAlign${mer} (fileName varchar(255) not null); \ insert into crgMapabilityAlign${mer} values ('/gbdb/hg18/bbi/${nf}');" end # 2010-01-28. # Renamed the tables to have a wgEncode prefix for consistency. cd /hive/data/genomes/hg18/bed/crgMapability hgsql -Ne 'show tables like "crg%";' hg18 > tables.txt foreach t (`cat tables.txt`) set g=`echo $t | sed -e 's/c/C/'` hgsql -e "alter table ${t} rename enc${g};" hg19 end # Added a trackDb entry for this subtrack of the ENCODE Mapability # track in kent/src/hg/makeDb/trackDb/human/hg18/trackDb.wgEncode.ra # use bigWigInfo to check min and max values. # 2010-03-16 - 2010-03-18 # Added metadata to the trackDb entries for the subtracks and # added downloads for these data files. cd /usr/local/apache/htdocs/goldenPath/hg18/encodeDCC/wgEncodeMapability cp -p /gbdb/hg18/bbi/crg*.bw gzip crg*.bw # Edited the preamble.html in # hg18/encodeDCC/wgEncodeMapability/ to include the CRG dataset. # Run encodeDownloadsPage.pl to generate the index page for downloads. # It does not capture all the information probably because the subtrack # name is different to the downloads name so change the file names and # re-load the tables and make the downloads. cd /hive/data/genomes/hg18/bed/crgMapability foreach f (`ls *.bw`) echo $f set g=`echo $f | cut -d "-" -f2` set num=`echo $g | cut -d "." -f1` set mer=`echo "${num}mer"` set of=`echo "crgMapabilityAlign${mer}.bw"` set nf=`echo "wgEncodeCrgMapabilityAlign${mer}.bw"` echo $nf rm /gbdb/hg18/bbi/${of} ln -s `pwd`/${f} /gbdb/hg18/bbi/${nf} hgsql hg18 -e "drop table if exists wgEncodeCrgMapabilityAlign${mer}; \ create table wgEncodeCrgMapabilityAlign${mer} (fileName varchar(255) not null); \ insert into wgEncodeCrgMapabilityAlign${mer} values ('/gbdb/hg18/bbi/${nf}');" end cd /usr/local/apache/htdocs/goldenPath/hg18/encodeDCC/wgEncodeMapability rm crg* cp -p /gbdb/hg18/bbi/wgEncodeCrg*.bw . gzip wgEncodeCrg*.bw # Then run encodeDownloadsPages.pl /cluster/home/hartera/bin/encodeDownloadsPage.pl -checksum \ -preamble=preamble.html index.html . # Downloaded and added new bigWig files to /gbdb/hg18/bbi # (2010-04-28, hartera). New files were created as there was a bug # in the older version of bedGraphToBigWig. cd /hive/data/genomes/hg18/bed/crgMapability rm temp download.csh download.log cat << 'EOF' > temp #!/bin/tcsh -ef http://genome.crg.es/~tderrien/UCSC_Tracks/H.sapiens.genome.hg18.main.mappability-100.bw.bz2 http://genome.crg.es/~tderrien/UCSC_Tracks/H.sapiens.genome.hg18.main.mappability-36.bw.bz2 http://genome.crg.es/~tderrien/UCSC_Tracks/H.sapiens.genome.hg18.main.mappability-40.bw.bz2 http://genome.crg.es/~tderrien/UCSC_Tracks/H.sapiens.genome.hg18.main.mappability-50.bw.bz2 http://genome.crg.es/~tderrien/UCSC_Tracks/H.sapiens.genome.hg18.main.mappability-75.bw.bz2 'EOF' awk '{if ($0 ~ /#/) print; else print "wget --timestamping \"" $0 "\"";}' \ temp > download.csh rm temp chmod +x download.csh ./download.csh >& download.log & # Add data to /gbdb/. The file names in /gbdb/ are the same as before # so the tables do not need to be reloaded. 
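# (A quick optional spot check once the symlinks below are refreshed, e.g.:
#    bigWigInfo /gbdb/hg18/bbi/wgEncodeCrgMapabilityAlign36mer.bw
#  min should be just above 0 and max should be 1.0, since each value is
#  1/(number of genomic matches) for the k-mer starting at that base.)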
cd /hive/data/genomes/hg18/bed/crgMapability bunzip2 *.bz2 foreach f (`ls *.bw`) echo $f set g=`echo $f | cut -d "-" -f2` set num=`echo $g | cut -d "." -f1` set mer=`echo "${num}mer"` set nf=`echo "wgEncodeCrgMapabilityAlign${mer}.bw"` echo $nf rm /gbdb/hg18/bbi/${nf} ln -s `pwd`/${f} /gbdb/hg18/bbi/${nf} end # 2010-05-12 # Updated downloads for the new data files. cd /usr/local/apache/htdocs/goldenPath/hg18/encodeDCC/wgEncodeMapability rm wgEncodeCrg* cp -p /gbdb/hg18/bbi/wgEncodeCrg*.bw . gzip wgEncodeCrg*.bw rm md5sum.txt # Run encodeDownloadsPage.pl to generate the index page for downloads # and generate new md5sum.txt file for the data. encodeDownloadsPage.pl -checksum -db=hg19 index.html ##################################################################### # tRNAs track (2010-03-12, Fan RE-BUILT) # ssh hgwdev cd /hive/data/genomes/gs.19/build36/bed mkdir tRNAs cd tRNAs # Get data files from /projects/lowelab/users/lowe/Browser/vertebrates/ cp -p /projects/lowelab/users/lowe/Browser/vertebrates/hg18-tRNAs.bed . cp -p \ /projects/lowelab/users/lowe/Browser/vertebrates/hg18_tRNAs_images.tar . hgsql hg18 -e 'drop table if exists tRNAs' hgLoadBed -tab hg18 tRNAs hg18-tRNAs.bed -sqlTable=$HOME/kent/src/hg/lib/tRNAs.sql mkdir gif cd gif tar -xvf ../hg18_tRNAs_images.tar mv images/*.gif . rm -rf images mkdir /hive/data/gbdb/hg18/RNA-img rm /hive/data/gbdb/hg18/RNA-img/* cp -p * /hive/data/gbdb/hg18/RNA-img ##################################################################### # PAR track (2010-02-18, markd DONE) cd /hive/data/genomes/hg18/bed/par/ # create hg18.par using the documented coordinates hgPar hg18 hg18.par par ##################################################################### # H-INVITATIONAL GENE ANNOTATION DATABASE (Working 2010-0226, chin) #http://h-invitational.jp/hinv/ahg-db/index.jsp # Create knownGene table to reference HINV gene ID's # for link on knownGenes details page # Also, create an HINV gene track # download CDNA file H-InvDB_7.0 (Feb 26, 2010) -- got release # from downloads page). # ftp://ftp.ddbj.nig.ac.jp/mirror_database/hinv/jbirc_ff/annotation/ mkdir /cluster/data/hinv/H-InvDB_7.0 cd /cluster/data/hinv/H-InvDB_7.0 wget --timestamp \ ftp://ftp.ddbj.nig.ac.jp/mirror_database/hinv/jbirc_ff/annotation/FCDNA.gz # HH-Inv(7.0) mkdir /hive/data/genomes/hg18/bed/hinv7.0 cd /hive/data/genomes/hg18/bed/hinv7.0 cat << '_EOF_' > hinvToBed.pl #!/usr/bin/env perl use strict; use warnings; my $chr=""; my $start=""; my $end=""; my %accNoDups; my $invId = ""; my $invIdVer = ""; my $accNo = ""; my $strand = ""; my $cai = 0; open (FH, "zcat FCDNA.gz|") or die "can not zcat FCDNA.gz"; while (my $line = ) { my ($id, $tag, $rest) = split('\s+', $line, 3); if ($line =~ m/^CDNA_H-INVITATIONAL-ID:/ ) { $invId = $tag; } elsif ($line =~ m/^CDNA_H-INVITATIONAL-ID-VERSION:/ ) { $invIdVer = $tag; } elsif ($line =~ m/^CDNA_CHROMOSOME-NUMBER:/ ) { $chr = $tag; } elsif ($line =~ m/^CDNA_STRAND:/ ) { $strand = $tag; } elsif ($line =~ m/^PREDICTED-ORF_CAI:/ ) { $cai = int($tag * 1000); } elsif ($line =~ m/^CDNA_START:/ ) { $start = $tag; } elsif ($line =~ m/^CDNA_END:/ ) { $end = $tag; } elsif ($line =~ m/^CDNA_ACCESSION-NO:/ ) { $accNo = $tag; } elsif ($line =~ m/CDNA_CLUSTER-ID:/ ) { if (length($accNo) > 0) { next if ($chr eq "UM"); if (length($start) < 1 || length($end) < 1) { printf STDERR "no start,end ? chr%s\t%s\n", $chr, $invIdVer; } else { die "have accession but no ID ?" 
if (length($invId) < 1); $invIdVer =~ s/\.[0-9]+$//; printf "chr%s\t%d\t%d\t%s\t%d\t%s\n", $chr, $start, $end, $invIdVer, $cai, $strand; } } $accNo = ""; $invId = ""; $invIdVer = ""; $chr = ""; $start = ""; $end = ""; $cai = 0; $strand = ""; } } close (FH); '_EOF_' # << happy emacs ln -s /hive/data/outside/hinv/H-InvDB_7.0/FCDNA.gz . chmod +x hinvToBed.pl time ./hinvToBed.pl | grep -v chr6_hla_hap | sort -k1.1 -k2.2n > hinv7.0.bed # zcat: FCDNA.gz: decompression OK, trailing garbage ignored # real 3m1.060s # user 3m14.142s # sys 0m10.961s # verify the new table does not exist hgsql -e "show tables" hg18 | grep -i hinv hgLoadBed -verbose=2 hg18 HInvGeneMrnaBed hinv7.0.bed # Reading hinv7.0.bed # Loaded 217721 elements of size 4 hgsql -e "show tables" hg18 | grep -i hinv # HInv # HInvGeneMrna # HInvGeneMrnaBed # knownToHInv # knownXToHInv # check the coverage featureBits hg18 HInvGeneMrnaBed # 1350541623 bases of 2881515245 (46.869%) in intersection # exon only featureBits hg18 HInvGeneMrna # 82136473 bases of 2881515245 (2.850%) in intersection # measure exon and intron to compare hgsql -N -e "select tName, tStart, tEnd, qName, strand from HInvGeneMrna;" \ hg18 > hinvGeneMrna.bed # 988629029 bases of 2881515245 (34.309%) in intersection featureBits hg18 HInvGeneMrnaBed -countGaps gap # 4523138 bases of 3107677273 (0.146%) in intersection # stop here pending answer for seraching with newest version id ######################################################################### # UPDATE snpArrayIllumina HumanCytoSNP-12 SUB-TRACK (DONE 3/23/10, Fan) # Received raw data file HumanCytoSNP-12_v2_1_forUCSC.csv # from Illumina, Jennifer L. Stone Ph.D., jstone@illumina.com mkdir -p /hive/data/genomes/gs.19/build36/bed/snp/illumina/032210 cd /hive/data/genomes/gs.19/build36/bed/snp/illumina/032210 cat HumanCytoSNP-12_v2_1_forUCSC.csv |\ sed -e 's/,/\t/g' >HumanCytoSNP.tab hgsql hg18 -e 'drop table snpArrayIlluminaHumanCytoSNP_12Raw' hgsql hg18 < ~/src/hg/lib/snpArrayIlluminaHumanCytoSNP_12Raw.sql hgsql hg18 -e 'load data local infile "HumanCytoSNP.tab" into table snpArrayIlluminaHumanCytoSNP_12Raw ignore 1 lines' ~/src/hg/snp/snpLoad/illuminaLookup1M hg18 snpArrayIlluminaHumanCytoSNP_12Raw snp130 # The illuminaLookup1M generate two files: # # illuminaLookup.out contains all probes found in snp130 # illuminaLookup.err contains all probes not found in snp130 mv illuminaLookup.out illuminaLookupHumanCytoSNP_12a.out cut -f 1 illuminaLookup.err >j.1 cat j.1 |sed -e 's/chrMt/chrM/' |\ sed -e 's/XY/X/' >j.chr cut -f 2-5 illuminaLookup.err >j.2 cut -f 6 illuminaLookup.err >j.3 cat j.3 |sed -e 's/F/+/' |sed -e 's/R/-/' >j.strand cut -f 7 illuminaLookup.err |sed -e "s/\[//" |sed -e "s/\]//" >j.observed paste j.chr j.2 j.strand j.observed >illuminaLookupHumanCytoSNP_12b.out # combine two parts cat illuminaLookupHumanCytoSNP_12a.out illuminaLookupHumanCytoSNP_12b.out >snpArrayIlluminaHumanCytoSNP_12.tab # load the table hgLoadBed -allowStartEqualEnd hg18 snpArrayIlluminaHumanCytoSNP_12 snpArrayIlluminaHumanCytoSNP_12.tab -tab -sqlTable=/cluster/home/fanhsu/scratch/tip201/kent/src/hg/lib/snpArrayIlluminaHumanCytoSNP_12.sql ############################################################################# # ucscRetro track (2010-04-12, baertsch DONE) mkdir -p /hive/users/baertsch/retro/hg18 cd /hive/users/baertsch/retro/hg18 wget http://compbio.soe.ucsc.edu/retrogene/retroFinder-1.16.tar.gz tar xvfz retroFinder-1.16.tar.gz cd retroFinder-1.16/src/pslPseudo make cd ../../.. 
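# Per the note further below, ./retroFinder-1.16/scripts should be on PATH
# before the pipeline scripts are run; a minimal sketch in bash:
#   export PATH=`pwd`/retroFinder-1.16/scripts:$PATH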
cat << '_EOF_' > DEF RETRO_OPTIONS="-verbose=4 -minAli=0.98 -nearTop=0.005 " DB=hg18 SCORETHRESH=550 LOGNAME=baertsch GENOMENAME='Homo sapiens' GBDB=hg MRNABASE=/hive/data/genomes/$DB/bed/mrnaBlastz/ TMPMRNA=/hive/users/baertsch/mrnaBlastz/$DB TMPEST=/hive/users/baertsch/est/$DB EST=all_est SPLICED_EST=intronEst SPLIT_EST=0 SPLIT_SPLICED_EST=1 SCRIPT=/hive/users/baertsch/retro/hg18/retroFinder-1.16/scripts GENOME=/hive/data/genomes/ RETRODIR=$GENOME/$DB/bed/retro BASE=/hive/users/baertsch/retro OUTDIR=/hive/users/baertsch/retro/$DB/ RESULT=$OUTDIR/result LOG=$OUTDIR/log OUT=$OUTDIR/out OVERLAPDIR=$OUTDIR/run.o VERSION=5 TABLE=ucscRetroInfo$VERSION ALIGN=ucscRetroAli$VERSION LOCAL=/scratch/data/$DB TWOBIT=$LOCAL/$DB.2bit NIB=$LOCAL/nib RMSK=/hive/data/genomes/$DB/linSpecRep/ NET1=netMm8 NET2=netCanFam2 NET3=netRheMac2 GENE1=knownGene GENE2=refGene GENE3=mgcGenes CLUSTER=swarm SPECIES="hg18 mm9 rheMac2" ROOTDIR="/cluster/home/baertsch/public_html" EXPDIR=exp GENEPFAM=knownGene PFAM=knownToPfam PFAMIDFIELD=name PFAMDOMAIN=value ARRAY=gnfAtlas2 AFFYPROBE="affyU133A,affyGnf1h" ARRAYMEDIAN=hgFixed.gnfHumanAtlas2Median ARRAYRATIO=hgFixed.gnfHumanAtlas2AllRatio ARRAYABS=hgFixed.gnfHumanAtlas2All ARRAYEXP=hgFixed.gnfHumanAtlas2MedianExps ARRAYEXPALL=hgFixed.gnfHumanAtlas2AllExps ARRAYLOOKUP=knownToGnfAtlas2 ARRAYPSLS="/hive/data/genomes/hg18/bed/geneAtlas2/affyU133A.psl /hive/data/genom es/hg18/bed/geneAtlas2/affyGnf1h.psl" ALTSPLICE=altGraphX SPLITBYAGE=splitRetrosByAge PDB=proteins090821 '_EOF_' # << happy emacs #add ./retroFinder-1.16/scripts to PATH retroFinder-1.16/scripts/filterMrna.sh DEF retroFinder-1.16/scripts/filterEst.sh DEF nohup retroFinder-1.16/scripts/ucscRetroStep1.sh DEF #check cluster job nohup retroFinder-1.16/scripts/ucscRetroStep2.sh DEF nohup retroFinder-1.16/scripts/ucscRetroStep3.sh DEF #check cluster job nohup retroFinder-1.16/scripts/ucscRetroStep4.sh DEF nohup retroFinder-1.16/scripts/ucscRetroStep5.sh DEF # Load the track nohup retroFinder-1.16/scripts/ucscRetroStep6.sh DEF #add ucscRetroAli to trackDb.ra ############################################################################# # NEANDERTAL TRACKS (DONE 5/6/10 angie) # Ed Green and Hernan Burbano contributed data for several tracks # in advance of the publication in Science of the Neandertal genome # sequence and analysis. These tracks were built on a private, # access-controlled server (genome-nt) and then transferred to hgwdev # and quickly pushed to hgwbeta and RR when the embargo lifted. # Full descriptions are in a separate file, hg18.nt.txt. # Track tables, in case anybody searches for them in here: # H-C Coding Diffs: ntHumChimpCodingDiff # Sel Swp Scan (S): ntSssZScorePMVar # 5% Lowest S: ntSssTop5p # S SNPs: ntSssSnps # Cand. Gene Flow: ntOoaHaplo # Neandertal Cntgs: bamAll bamFeld1 bamMez1 bamSid1253 bamVi33dot16 bamVi33dot25 bamVi33dot26 # Neandertal Seq: bamSLFeld1 bamSLMez1 bamSLSid1253 bamSLVi33dot16 bamSLVi33dot25 bamSLVi33dot26 # Modern Human Seq: bamMMS4 bamMMS5 bamMMS6 bamMMS7 bamMMS8 # Neandertal Mito: ntMito ############################################################################# # BUILD DECIPHER TRACK (DONE, 6/1/10, Fan) ssh hgwdev # Received raw DECIPHER data file, daa_28-05-10_ucsc.txt, # from Manuel Corpas [mc10@sanger.ac.uk] and place it under # /hive/data/outside/decipher/ cd /hive/data/genomes/gs.19/build36/bed mkdir decipher cd decipher cp -p /hive/data/outside/decipher/daa_28-05-10_ucsc.txt . 
hgsql hg18 -e 'drop table decipherRaw' hgsql hg18 < ~/src/hg/lib/decipherRaw.sql # load into decipherRaw table hgsql hg18 -e \ 'load data local infile "daa_28-05-10_ucsc.txt" into table decipherRaw ignore 1 lines' # construct the bed file, decipher.bed hgsql hg18 -N -e 'select "chr", chr, start-1, end, id from decipherRaw ' |\ sed -e 's/chr\t/chr/' |sort -u>j.tmp # fix some out of range of entries cat j.tmp|sed -e 's/243000000/242951149/' |\ sed -e 's/115090019/114142980/' >decipher.bed rm j.tmp # load the bed track. hgLoadBed hg18 decipher decipher.bed # create knownToDecipher table. hgMapToGene -all hg18 decipher knownGene knownToDecipher ############################################################################# # UPDATE KEGG TABLES (DONE, Fan, 6/18/10) mkdir -p /hive/data/genomes/hg18/bed/pathways/kegg cd /hive/data/genomes/hg18/bed/pathways/kegg wget --timestamping ftp://ftp.genome.jp/pub/kegg/pathway/map_title.tab cat map_title.tab | sed -e 's/\t/\thsa\t/' > j.tmp cut -f 2 j.tmp >j.hsa cut -f 1,3 j.tmp >j.1 paste j.hsa j.1 |sed -e 's/\t//' > keggMapDesc.tab rm j.hsa j.1 rm j.tmp hgsql hg18 -e 'drop table keggMapDesc' hgsql hg18 < ~/kent/src/hg/lib/keggMapDesc.sql hgsql hg18 -e 'load data local infile "keggMapDesc.tab" into table keggMapDesc' wget --timestamping ftp://ftp.genome.jp/pub/kegg/genes/organisms/hsa/hsa_pathway.list cat hsa_pathway.list| sed -e 's/path://'|sed -e 's/:/\t/' > j.tmp hgsql hg18 -e 'drop table keggPathway' hgsql hg18 < ~/kent/src/hg/lib/keggPathway.sql hgsql hg18 -e 'load data local infile "j.tmp" into table keggPathway' hgsql hg18 -N -e \ 'select name, locusID, mapID from keggPathway p, knownToLocusLink l where p.locusID=l.value' \ >keggPathway.tab hgsql hg18 -e 'delete from keggPathway' hgsql hg18 -e 'load data local infile "keggPathway.tab" into table keggPathway' rm j.tmp ############################################################################# # Add KEGG column to hg18 Gene Sorter (Done, Fan, 6/18/2010) mkdir -p /hive/data/genomes/hg18/bed/geneSorter cd /hive/data/genomes/hg18/bed/geneSorter hgsql hg18 -N -e 'select kgId, mapID, mapID, "+", locusID from keggPathway' |sort -u|sed -e 's/\t+\t/+/' > knownToKeggEntrez.tab hgsql hg18 -e 'drop table knownToKeggEntrez' hgsql hg18 < ~/kent/src/hg/lib/knownToKeggEntrez.sql hgsql hg18 -e 'load data local infile "knownToKeggEntrez.tab" into table knownToKeggEntrez' ############################################################################# # Add Human RNA-editing track hg18 (Done, galt, 7/12/2010) # DARNED=DAtabase of RNa EDiting #http://darned.ucc.ie/ #University College Cork mkdir -p /hive/data/genomes/hg18/bed/darned cd /hive/data/genomes/hg18/bed/darned # create go.csh to download and compose allChroms.bed ./go.csh hgLoadBed hg18 darned allChroms.bed # at human, level # added darned.html # added trackDb.ra entry ############################################################################# # REFINE DECIPHER DETAILS PAGE (DONE, Fan, 7/13/10) # ssh hgwdev cd /hive/data/genomes/gs.19/build36/bed/decipher hgsql hg18 -N -e \ 'select d.* from knownToDecipher d, knownCanonical c where c.transcript=d.name' >knownCanonToDecipher.tab hgsql hg18 -e "drop table knownCanonToDecipher" hgsql hg18 < ~/src/hg/lib/knownCanonToDecipher.sql hgsql hg18 -e 'load data local infile "knownCanonToDecipher.tab" into table knownCanonToDecipher' ############################################################################# # Got UCSF Brain Methyl data from Ting already loaded. 
7/2010 ############################################################################# # LIFTOVER TO Hg19 (RE-DONE - 2010-07-26 - Hiram ) # preserving the previous 10K liftOver files mkdir /hive/data/genomes/hg18/bed/liftOver10K cd /hive/data/genomes/hg18/bed/liftOver10K ln -s ../blat.hg19.2009-03-06/hg18ToHg19.over.chain.gz . # this liftOver is a 5000 size chunk mkdir /hive/data/genomes/hg18/bed/blat.hg19.2010-07-26 cd /hive/data/genomes/hg18/bed/blat.hg19.2010-07-26 # -debug run to create run dir, preview scripts... # verifies files can be found doSameSpeciesLiftOver.pl -debug hg18 hg19 # Real run: time nice -n +19 doSameSpeciesLiftOver.pl -verbose=2 \ -bigClusterHub=pk -dbHost=hgwdev -workhorse=hgwdev \ hg18 hg19 > do.log 2>&1 & # real 67m51.597s # checking liftOver accuracy mkdir /hive/data/genomes/hg18/bed/blat.hg19.2010-07-26/refGene cd /hive/data/genomes/hg18/bed/blat.hg19.2010-07-26/refGene hgsql -N -e "select * from refGene;" hg18 | cut -f2- > refGene.hg18.gp wc -l refGene.hg18.gp # 36766 # the 5K block size lift over chain liftOver -genePred refGene.hg18.gp ../hg18ToHg19.over.chain.gz \ refGene.hg18ToHg19.5K.lift.gp refGene.hg18ToHg19.5K.unMapped.gp wc -l refGene.hg18ToHg19.5K.unMapped.gp # 440 # the 10K block size lift over chain liftOver -genePred refGene.hg18.gp \ ../../liftOver10K/hg18ToHg19.over.chain.gz \ refGene.hg18ToHg19.10K.lift.gp refGene.hg18ToHg19.10K.unMapped.gp wc -l refGene.hg18ToHg19.10K.unMapped.gp # 430 # construct custom track of chain files. # the 5K block size lift over chain chainToPsl ../hg18ToHg19.over.chain.gz \ /hive/data/genomes/hg18/chrom.sizes \ /hive/data/genomes/hg19/chrom.sizes \ /hive/data/genomes/hg18/hg18.2bit \ /hive/data/genomes/hg19/hg19.2bit stdout \ | pslToBed stdin hg18ToHg19.5K.bed # the 10K block size lift over chain chainToPsl ../../liftOver10K/hg18ToHg19.over.chain.gz \ /hive/data/genomes/hg18/chrom.sizes \ /hive/data/genomes/hg19/chrom.sizes \ /hive/data/genomes/hg18/hg18.2bit \ /hive/data/genomes/hg19/hg19.2bit stdout \ | pslToBed stdin hg18ToHg19.10K.bed grep -v "^#" refGene.hg18ToHg19.5K.unMapped.gp \ | awk '{print $1}' | sort -u > 5K.genes.unMapped grep -v "^#" refGene.hg18ToHg19.10K.unMapped.gp \ | awk '{print $1}' | sort -u > 10K.genes.unMapped # do just the exons all by themselves: featureBits hg18 refGene:exon -bed=hg18.refGene.exons.bed liftOver hg18.refGene.exons.bed ../hg18ToHg19.over.chain.gz \ hg18ToHg19.refGene.exons.lifted.5K.bed \ hg18ToHg19.refGene.exons.5K.unMapped liftOver hg18.refGene.exons.bed \ ../../liftOver10K/hg18ToHg19.over.chain.gz \ hg18ToHg19.refGene.exons.lifted.10K.bed \ hg18ToHg19.refGene.exons.10K.unMapped wc -l *.exons.*.unMapped # 284 hg18ToHg19.refGene.exons.10K.unMapped # 260 hg18ToHg19.refGene.exons.5K.unMapped # create custom track showing identical fragments in hg18 and hg19: hgsql -e "show tables;" hg18 | grep _gold | while read T do hgsql -N -e "select frag,fragStart,fragEnd from $T;" hg18 done | sort > hg18.gold.frags.tab hgsql -N -e "select frag,fragStart,fragEnd from gold;" hg19 \ | sort > hg19.gold.frags.tab # most are identical: comm -12 hg18.gold.frags.tab hg19.gold.frags.tab | wc -l # 26436 # unique to hg18: comm -23 hg18.gold.frags.tab hg19.gold.frags.tab | wc -l # 705 # unique to hg19: (includes patch1 fragments) comm -13 hg18.gold.frags.tab hg19.gold.frags.tab | wc -l # 1126 hgsql -e "show tables;" hg18 | grep _gold | while read T do hgsql -N -e "select chrom,chromStart,chromEnd,frag,0,strand from $T;" hg18 done | sort -k4,4 > hg18.gold.bed # construct custom track of fragments in 
hg18 that are not in hg19 comm -23 hg18.gold.frags.tab hg19.gold.frags.tab | sort \ > hg18.unique.frags.tab join -1 4 -2 1 hg18.gold.bed hg18.unique.frags.tab \ | awk '{print $2,$3,$4,$1,$5,$6}' | sort -k1,1 -k2,2n \ > hg18.unique.frags.bed comm -12 hg18.gold.frags.tab hg19.gold.frags.tab | sort \ > hg18.hg19.common.frags.tab join -1 4 -2 1 hg18.gold.bed hg18.hg19.common.frags.tab \ | awk '{print $2,$3,$4,$1,$5,$6}' | sort -k1,1 -k2,2n \ > hg18.hg19.common.frags.bed ############################################################################# # LIFTOVER TO Hg17 (RE-DONE - 2010-07-26 - Hiram ) # preserving the previous 10K liftOver files cd /hive/data/genomes/hg18/bed/liftOver10K ### XXX !!!! **** The liftOver directory is full of the "*Original*" files. # the blat.*.date directories are mere symlinks to ../liftOver # this is bad. Fixup the file in blat.hg17.2009-03-06 so it is the # real file, eliminate the liftOver copy, and construct this symlink: ln -s ../blat.hg17.2009-03-20/hg18ToHg17.over.chain.gz . # this liftOver is a 5000 size chunk mkdir /hive/data/genomes/hg18/bed/blat.hg17.2010-07-26 cd /hive/data/genomes/hg18/bed/blat.hg17.2010-07-26 # -debug run to create run dir, preview scripts... # verifies files can be found doSameSpeciesLiftOver.pl -debug hg18 hg17 # Real run: time nice -n +19 doSameSpeciesLiftOver.pl -verbose=2 \ -bigClusterHub=swarm -dbHost=hgwdev -workhorse=hgwdev \ hg18 hg17 > do.log 2>&1 & # real 74m50.836s ############################################################################ # NUMTS TRACK (DONE 2010-08-09 - Chin) mkdir /cluster/data/hg18/bed/NumtS cd /cluster/data/hg18/bed/NumtS # download raw data from wget http://193.204.182.50/files/all_UCSC_custom_tracks.txt wget http://193.204.182.50/files/NumtS_fragments_extlink.html # split the all_UCSC_custom_tracks.txt into 4 bed files # numtSAssembled.bed, numtS.bed, numtSMitochondrion.bed and # numtSMitochondrionChrPlacement.bed cat all_UCSC_custom_tracks.txt | awk ' /^track name/ {print $_}' / > tracks.list # load the bed files to hg18 hgLoadBed hg18 numtSAssembled numtSAssembled.bed hgLoadBed hg18 numtS numtS.bed hgLoadBed hg18 numtSMitochondrion numtSMitochondrion.bed hgLoadBed hg18 numtSMitochondrionChrPlacement numtSMitochondrionChrPlacement.bed # reload the tracks with data with updated ID (DONE 2011-01-26 Chin) mkdir /cluster/data/hg18/bed/NumtS/2011-01-26 # cp over all new data cd /cluster/data/hg18/bed/NumtS/2011-01-26 # load the bed files to hg18 hgLoadBed hg18 numtSAssembled numtSAssembled.bed hgLoadBed hg18 numtS numtS.bed hgLoadBed hg18 numtSMitochondrion numtSMitochondrion.bed hgLoadBed hg18 numtSMitochondrionChrPlacement numtSMitochondrionChrPlacement.bed # update the ~/kent/src/hg/makeDb/trackDb/human/numtSeq.html with # description_revisedMC ( numtSeq-20110126.html) cp numtSeq-20110126.html \ /cluster/home/chinhli/kent/src/hg/makeDb/trackDb/human/numtSeq.html ############################################################################## # hg18 <-> hg19 difference tracks (WORKING - 2010-09-03 - Hiram) # single instance of documentation for hg18 *and* hg19 tracks mkdir /hive/data/genomes/hg18/bed/liftOverHg19 cd /hive/data/genomes/hg18/bed/liftOverHg19 # not needed, but interesting, collect all the fragment # definitions from the gold tables: hgsql -e "show tables;" hg18 | grep _gold | while read T do hgsql -N -e "select frag,fragStart,fragEnd,strand from $T;" hg18 done | sort > hg18.gold.frags.tab hgsql -N -e "select frag,fragStart,fragEnd,strand from gold;" hg19 \ | sort > hg19.gold.frags.tab # 
construct common and difference listings comm -12 hg18.gold.frags.tab hg19.gold.frags.tab \ > identical.hg18.hg19.frags.tab comm -23 hg18.gold.frags.tab hg19.gold.frags.tab \ > unique.hg18Only.frags.tab comm -13 hg18.gold.frags.tab hg19.gold.frags.tab \ > unique.hg19Only.frags.tab # better yet, get full information about each fragment hgsql -e "show tables;" hg18 | grep _gold | while read T do hgsql -N -e "select chrom,chromStart,chromEnd,ix,type,frag,fragStart,fragEnd,strand from $T;" hg18 done | sort -k6 > hg18.gold.tab hgsql -N -e "select chrom,chromStart,chromEnd,ix,type,frag,fragStart,fragEnd,strand from gold;" hg19 \ | sort -k6 > hg19.gold.tab # construct a single key for each fragment for joining. # the key is frag,fragStart,fragEnd,strand awk '{printf "%s,%d,%d,%s\t%s\t%s\t%s\t%d\t%d\t%d\t%s\n", $6,$7,$8,$9,$6,$9,$1,$2,$3,$4,$5}' hg18.gold.tab | sort \ > hg18.fragKey.tab awk '{printf "%s,%d,%d,%s\t%s\t%s\t%s\t%d\t%d\t%d\t%s\n", $6,$7,$8,$9,$6,$9,$1,$2,$3,$4,$5}' hg19.gold.tab | sort \ > hg19.fragKey.tab # now, by joining those keys, we can get exact identicals, and # the only-in listings as bed files to load as tracks: join hg18.fragKey.tab hg19.fragKey.tab \ | awk '{printf "%s\t%d\t%d\t%s\t1000\t%s\t%d\t%d\t0,0,128\n", $4,$5,$6,$2,$3,$5,$6}' \ | sort -k1,1 -k2,2n > hg18.hg19.identical.bed join hg18.fragKey.tab hg19.fragKey.tab \ | awk '{printf "%s\t%d\t%d\t%s\t1000\t%s\t%d\t%d\t0,0,128\n", $11,$12,$13,$9,$10,$12,$13}' \ | sort -k1,1 -k2,2n > hg19.hg18.identical.bed join -v 1 hg18.fragKey.tab hg19.fragKey.tab \ | awk '{printf "%s\t%d\t%d\t%s\t0\t%s\n", $4,$5,$6,$2,$3}' \ | sort -k1,1 -k2,2n > hg18.only.bed join -v 2 hg18.fragKey.tab hg19.fragKey.tab \ | awk '{printf "%s\t%d\t%d\t%s\t0\t%s\n", $4,$5,$6,$2,$3}' \ | sort -k1,1 -k2,2n > hg19.only.bed hgLoadBed hg18 hg19ContigDiff hg18.only.bed hgLoadBed hg19 hg18ContigDiff hg19.only.bed wc -l hg1?.only.bed # 708 hg18.only.bed # 1131 hg19.only.bed # this leaves the outstanding question of "why" they might be in # the only-in listings. 
# Some contigs may be different versions,
# sometimes different sections of the same contig are used,
# and contigs are dropped from hg18 to hg19, or new contigs added
# to hg19 to fill in gaps from hg18
# Let's see if we can measure some of this:
    awk '{print $4}' hg18.only.bed | sort -u > hg18.only.ids.list
    awk '{print $4}' hg19.only.bed | sort -u > hg19.only.ids.list
    # Looks like 333 identical contigs with different parts used:
    comm -12 hg18.only.ids.list hg19.only.ids.list > differentPortions.list
    wc -l differentPortions.list
    #   333
    # and perhaps 198 = 531-333 of different versions of same contig:
    sed -e "s/\.[0-9]*$//" hg18.only.ids.list | sort -u \
        > hg18.noVersions.ids.list
    sed -e "s/\.[0-9]*$//" hg19.only.ids.list | sort -u \
        > hg19.noVersions.ids.list
    comm -12 hg18.noVersions.ids.list hg19.noVersions.ids.list | wc -l
    #   531
    sed -e "s/\.[0-9]*$//" differentPortions.list | sort -u \
        > differentPortions.noVersions.list
    comm -12 hg18.noVersions.ids.list hg19.noVersions.ids.list | sort -u \
        > noVersions.common.list
    # indeed, 198 contigs of different versions:
    comm -23 noVersions.common.list differentPortions.noVersions.list \
        | sort -u > differentVersions.list
    wc -l differentVersions.list
    #   198
    # dividing up these items:
    cat << '_EOF_' > identifyPortions.pl
#!/usr/bin/env perl

use strict;
use warnings;

my %differentVersions;
my %differentPortions;
open (FH, "<differentVersions.list") or die "can not read differentVersions.list";
while (my $line = <FH>) {
    chomp $line;
    $differentVersions{$line} = 1;
}
close (FH);
open (FH, "differentPortions.list" ) or die "can not read differentPortions.list";
while (my $line = <FH>) {
    chomp $line;
    $differentPortions{$line} = 1;
}
close (FH);
my %hg18Done;
open (DP, ">hg18.differentPortions.bed") or die "can not write to hg18.differentPortions.bed";
open (DV, ">hg18.differentVersions.bed") or die "can not write to hg18.differentVersions.bed";
open (FH, "<hg18.only.bed") or die "can not read hg18.only.bed";
while (my $line = <FH>) {
    chomp $line;
    my ($chr, $start, $end, $acc, $score, $strand) = split('\s+', $line);
    # assume done while $acc is still complete
    $hg18Done{$acc} = 1;
    if (exists($differentPortions{$acc})) {
        printf DP "%s\n", $line;
    } else {
        my $trimAcc = $acc;
        $trimAcc =~ s/\.[0-9]+$//;
        if (exists($differentVersions{$trimAcc})) {
            printf DV "%s\n", $line;
        } else {
            # this one does not match
            $hg18Done{$acc} = 0;
        }
    }
}
close (FH);
close (DV);
close (DP);
open (DR, ">hg18.dropped.bed") or die "can not write to hg18.dropped.bed";
open (FH, "<hg18.only.bed") or die "can not read hg18.only.bed";
while (my $line = <FH>) {
    chomp $line;
    my ($chr, $start, $end, $acc, $score, $strand) = split('\s+', $line);
    if (0 == $hg18Done{$acc}) {
        printf DR "%s\n", $line;
    }
}
close (FH);
close (DR);
my %hg19Done;
open (DP, ">hg19.differentPortions.bed") or die "can not write to hg19.differentPortions.bed";
open (DV, ">hg19.differentVersions.bed") or die "can not write to hg19.differentVersions.bed";
open (FH, "<hg19.only.bed") or die "can not read hg19.only.bed";
while (my $line = <FH>) {
    chomp $line;
    my ($chr, $start, $end, $acc, $score, $strand) = split('\s+', $line);
    # assume done while $acc is still complete
    $hg19Done{$acc} = 1;
    if (exists($differentPortions{$acc})) {
        printf DP "%s\n", $line;
    } else {
        my $trimAcc = $acc;
        $trimAcc =~ s/\.[0-9]+$//;
        if (exists($differentVersions{$trimAcc})) {
            printf DV "%s\n", $line;
        } else {
            # this one does not match
            $hg19Done{$acc} = 0;
        }
    }
}
close (FH);
close (DV);
close (DP);
open (DR, ">hg19.newTo19.bed") or die "can not write to hg19.newTo19.bed";
open (FH, "<hg19.only.bed") or die "can not read hg19.only.bed";
while (my $line = <FH>) {
    chomp $line;
    my ($chr, $start, $end, $acc, $score, $strand) = split('\s+', $line);
    if (0 == $hg19Done{$acc}) {
        printf DR "%s\n", $line;
    }
}
close (FH);
close (DR);
'_EOF_'
    # << happy emacs
    chmod +x identifyPortions.pl
    ./identifyPortions.pl
    # make sure nothing was lost
    sort \
hg18.differentVersions.bed hg18.differentPortions.bed \ hg18.dropped.bed | sum # 50075 28 sort hg18.only.bed | sum # 50075 28 sort hg19.differentVersions.bed hg19.differentPortions.bed \ hg19.newTo19.bed | sum # 36621 45 sort hg19.only.bed | sum # 36621 45 sort -k1,1 -k2,2n hg19.differentVersions.bed hg19.differentPortions.bed \ hg19.newTo19.bed > hg19.itemRgb.bed sort -k1,1 -k2,2n hg18.differentVersions.bed hg18.differentPortions.bed \ hg18.dropped.bed > hg18.itemRgb.bed hgLoadBed hg18 hg18ContigDiff hg18.itemRgb.bed hgLoadBed hg19 hg19ContigDiff hg19.itemRgb.bed ############################################################################## # 1000 GENOMES COVERAGE MASK (DONE 10/1/10 angie) mkdir /hive/data/genomes/hg18/bed/1000GenomesMask cd /hive/data/genomes/hg18/bed/1000GenomesMask wget --timestamping \ ftp://ftp-trace.ncbi.nih.gov/1000genomes/ftp/pilot_data/release/2010_07/low_coverage/other_data/\* cat > pseudoFastaToBed.pl <<'_EOF_' #!/usr/bin/env perl use warnings; use strict; my ($base, $inFile) = @ARGV; die "usage: $0 outputBase input.gz\n" if (! $base || ! $inFile); my ($IN, $OUTD, $OUTM, $OUTU); open($IN, "zcat $inFile |") || die; my $outD = $base . "Depth.bed.gz"; my $outM = $base . "MapQ.bed.gz"; my $outU = $base . "Uncov.bed.gz"; open($OUTD, "| gzip -c > $outD") || die; open($OUTM, "| gzip -c > $outM") || die; open($OUTU, "| gzip -c > $outU") || die; sub printItem { my ($chr, $start, $end, $mask) = @_; return unless $end > $start; if ($mask eq 'D') { print $OUTD join("\t", $chr, $start, $end) . "\n"; } elsif ($mask eq 'M') { print $OUTM join("\t", $chr, $start, $end) . "\n"; } elsif ($mask eq 'B') { print $OUTD join("\t", $chr, $start, $end) . "\n"; print $OUTM join("\t", $chr, $start, $end) . "\n"; } elsif ($mask eq "-") { print $OUTU join("\t", $chr, $start, $end) . 
"\n"; } } sub maskToBed3Subtracks { my ($chr, $seqRef) = @_; $chr =~ s/MT$/M/; $chr =~ s/^([0-9XMY])/chr$1/; my ($start, $end) = (0, 0); my $len = length($$seqRef); my $prevM; while ($end < $len) { my $m = substr $$seqRef, $end, 1; if (defined $prevM && $m ne $prevM) { &printItem($chr, $start, $end, $prevM); $start = $end; } $end++; $prevM = $m; } &printItem($chr, $start, $end, $prevM); } my ($prevChrom, $seq); while (<$IN>) { if (/^>(\S+)/) { my $chrom = $1; if (defined $prevChrom) { &maskToBed3Subtracks($prevChrom, \$seq); } $prevChrom = $chrom; $seq = ""; } elsif (/^([NMDB0-]+)$/) { $seq .= $1; } else { die "Unexpected line format:\n$_\t"; } } &maskToBed3Subtracks($prevChrom, \$seq); close($OUTD); close($OUTM); close($OUTU); '_EOF_' # << emacs chmod a+x pseudoFastaToBed.pl foreach f ({CEU,CHBJPT,YRI}.low_coverage.mask.fa.gz) set pop = `echo $f:r:r:r:r | perl -wpe '$_ = ucfirst lc; s/Chbjpt/ChbJpt/;'` set tBase = "covMask1kGPilotLowCov$pop" echo $tBase ./pseudoFastaToBed.pl $tBase $f end # Use featureBits to merge adjacent regions foreach f (covMask*.bed.gz) echo $f:r:r featureBits hg18 $f -bed=stdout \ | cut -f 1-3 \ | hgLoadBed hg18 $f:r:r stdin end # covMask1kGPilotLowCovCeuDepth # 6718955 bases of 2881515245 (0.233%) in intersection # Loaded 97777 elements of size 3 # covMask1kGPilotLowCovCeuMapQ # 408568477 bases of 2881515245 (14.179%) in intersection # Loaded 4052843 elements of size 3 # covMask1kGPilotLowCovCeuUncov # 3942 bases of 2881515245 (0.000%) in intersection # Loaded 311 elements of size 3 # covMask1kGPilotLowCovChbJptDepth # 12143572 bases of 2881515245 (0.421%) in intersection # Loaded 198277 elements of size 3 # covMask1kGPilotLowCovChbJptMapQ # 429343803 bases of 2881515245 (14.900%) in intersection # Loaded 4198464 elements of size 3 # covMask1kGPilotLowCovChbJptUncov # 50676 bases of 2881515245 (0.002%) in intersection # Loaded 2108 elements of size 3 # covMask1kGPilotLowCovYriDepth # 11875006 bases of 2881515245 (0.412%) in intersection # Loaded 193700 elements of size 3 # covMask1kGPilotLowCovYriMapQ # 454810959 bases of 2881515245 (15.784%) in intersection # Loaded 4338322 elements of size 3 # covMask1kGPilotLowCovYriUncov # 21232 bases of 2881515245 (0.001%) in intersection # Loaded 1255 elements of size 3 # Make some union tables featureBits hg18 -or covMask1kGPilotLowCovCeuDepth \ covMask1kGPilotLowCovChbJptDepth \ covMask1kGPilotLowCovYriDepth \ -bed=stdout \ | cut -f 1-3 | gzip -c > covMask1kGPilotLowCovUnionDepth.bed.gz #14033969 bases of 2881515245 (0.487%) in intersection hgLoadBed hg18 covMask1kGPilotLowCovUnionDepth covMask1kGPilotLowCovUnionDepth.bed.gz #Loaded 232479 elements of size 3 featureBits hg18 -or covMask1kGPilotLowCovCeuMapQ \ covMask1kGPilotLowCovChbJptMapQ \ covMask1kGPilotLowCovYriMapQ \ -bed=stdout \ | cut -f 1-3 | gzip -c > covMask1kGPilotLowCovUnionMapQ.bed.gz #463864561 bases of 2881515245 (16.098%) in intersection hgLoadBed hg18 covMask1kGPilotLowCovUnionMapQ covMask1kGPilotLowCovUnionMapQ.bed.gz #Loaded 4319382 elements of size 3 featureBits hg18 -or covMask1kGPilotLowCovCeuUncov \ covMask1kGPilotLowCovChbJptUncov \ covMask1kGPilotLowCovYriUncov \ -bed=stdout \ | cut -f 1-3 | gzip -c > covMask1kGPilotLowCovUnionUncov.bed.gz #66237 bases of 2881515245 (0.002%) in intersection hgLoadBed hg18 covMask1kGPilotLowCovUnionUncov covMask1kGPilotLowCovUnionUncov.bed.gz #Loaded 3129 elements of size 3 # Make intersection of uncovered bits too: featureBits hg18 covMask1kGPilotLowCovCeuUncov \ covMask1kGPilotLowCovChbJptUncov \ 
covMask1kGPilotLowCovYriUncov \ -bed=stdout \ | cut -f 1-3 | gzip -c > covMask1kGPilotLowCovIntersectionUncov.bed.gz #676 bases of 2881515245 (0.000%) in intersection hgLoadBed hg18 covMask1kGPilotLowCovIntersectionUncov covMask1kGPilotLowCovIntersectionUncov.bed.gz #Loaded 49 elements of size 3 featureBits hg18 -or covMask1kGPilotLowCovCeuDepth covMask1kGPilotLowCovCeuMapQ \ covMask1kGPilotLowCovCeuUncov \ covMask1kGPilotLowCovChbJptDepth covMask1kGPilotLowCovChbJptMapQ \ covMask1kGPilotLowCovChbJptUncov \ covMask1kGPilotLowCovYriDepth covMask1kGPilotLowCovYriMapQ \ covMask1kGPilotLowCovYriUncov \ -bed=stdout \ | cut -f 1-3 | gzip -c > covMask1kGPilotLowCovUnion.bed.gz #467348829 bases of 2881515245 (16.219%) in intersection hgLoadBed hg18 covMask1kGPilotLowCovUnion covMask1kGPilotLowCovUnion.bed.gz #Loaded 4339659 elements of size 3 ######################################################################### # SNP "BAD APPLES" TRACK (IN PROGRESS 10/4/10 angie) cd /hive/data/genomes/gs.19/build36/bed/1000GenomesMask foreach t (Depth MapQ Uncov "") echo $t bedIntersect -tab -aHitAny -allowStartEqualEnd ../snp130/snp130.bed \ covMask1kGPilotLowCovUnion$t.bed.gz snp130BadApplesUnion$t.bed end wc -l snp130BadApplesUnion*.bed # 5240097 snp130BadApplesUnion.bed # 502329 snp130BadApplesUnionDepth.bed # 5150686 snp130BadApplesUnionMapQ.bed # 1491 snp130BadApplesUnionUncov.bed cut -f 4 snp130BadApplesUnion.bed | sort -u | wc -l #4452279 sed -e 's/snp130/snp130BadApples/' /hive/data/outside/dbSNP/130/human/snp130.sql \ > snp130BadApples.sql foreach t (Depth MapQ Uncov "") echo $t hgLoadBed -tab -onServer -tmpDir=/data/tmp -allowStartEqualEnd -renameSqlTable \ hg18 snp130BadApples$t -sqlTable=snp130BadApples.sql snp130BadApplesUnion$t.bed end # Reading snp130BadApplesUnionDepth.bed # Loaded 502329 elements of size 17 # Reading snp130BadApplesUnionMapQ.bed # Loaded 5150686 elements of size 17 # Reading snp130BadApplesUnionUncov.bed # Loaded 1491 elements of size 17 # Reading snp130BadApplesUnion.bed # Loaded 5240097 elements of size 17 hgsql hg18 -e 'rename table snp130BadApples to snp130BadApplesUnion' gzip snp130BadApples*.bed # rs ID lists foreach f (snp130BadApples*.bed.gz) zcat $f | cut -f 4 | sort -u > $f:r:r.rsIDs.txt end wc -l *.txt # 4452279 snp130BadApplesUnion.rsIDs.txt # 392307 snp130BadApplesUnionDepth.rsIDs.txt # 4376753 snp130BadApplesUnionMapQ.rsIDs.txt # 1477 snp130BadApplesUnionUncov.rsIDs.txt # use list of SNPs to port to hg19? Try to port masked regions?? # asked richard and sendu to repeat for hg19/GRCh37. Richard's reply: # -------------------------- # No, we haven't done that. We would be doing it for new call sets I # expect, not for the pilot calls. # We are developing new approaches to variant calling that might well # change the accessibility criteria and # masks. # # I am copying the 1000GP data processing group to remind ourselves that # amongst all the current discussion # about calling, we need to return to how we handle accessibility. 
# -------------------------- ######################################################################### # DENISOVA (ANCIENT HUMAN) (DONE 11/16/10 angie) mkdir /hive/data/genomes/hg18/bed/denisova cd /hive/data/genomes/hg18/bed/denisova # Use username and password emailed by Ed Green 9/30/10 alias wg wget -r --user=xxx --password=xxx wg ftp://cdna.eva.mpg.de/Denisova_sequence_alignments_to_hg18_and_panTro2/\* wg ftp://cdna.eva.mpg.de/Present-day_human_sequence_alignments_to_hg18_and_panTro2/\* wg ftp://cdna.eva.mpg.de/Catalog_of_changes/\* # Ed says we don't need files with 'hcca' or 'hcscca' in the names # (only hg18 or panTro2). find . -name \*h\*cca\* -exec echo rm {} \; # Inspect & execute output of the find command to save a bit of space (11G of 83G). # Combine the two sequence-lib files for Denisova into one bam. First ensure that # the headers are identical: samtools view -H cdna.eva.mpg.de/Denisova_sequence_alignments_to_hg18_and_panTro2/SL3003/SL3003-hg18.bam > h1 samtools view -H cdna.eva.mpg.de/Denisova_sequence_alignments_to_hg18_and_panTro2/SL3004_100122/SL3004-hg18.bam > h2 cmp h1 h2 # No output, and they seem to be sorted by position, good to go: samtools merge SL3003_SL3004_100122-hg18.bam \ cdna.eva.mpg.de/Denisova_sequence_alignments_to_hg18_and_panTro2/SL3003/SL3003-hg18.bam \ cdna.eva.mpg.de/Denisova_sequence_alignments_to_hg18_and_panTro2/SL3004_100122/SL3004-hg18.bam #1023.571u 22.997s 17:34.24 99.2% 0+0k 0+0io 0pf+0w # Build BAM index (.bam.bai) files. samtools index SL3003_SL3004_100122-hg18.bam #94.811u 3.560s 1:51.16 88.4% 0+0k 0+0io 0pf+0w pushd cdna.eva.mpg.de/Present-day_human_sequence_alignments_to_hg18_and_panTro2 foreach f (*.bam) echo $f samtools index $f end popd # Make /gbdb/ links and load database tables mkdir /gbdb/hg18/denisova ln -s `pwd`/SL3003_SL3004_100122-hg18.bam{,.bai} /gbdb/hg18/denisova/ mkdir /gbdb/hg18/denisova/modernHumanSeq find `pwd` -name MMS\*hg18\*.bam\* -exec echo ln -s {} /gbdb/hg18/denisova/modernHumanSeq/ \; # Inspect & execute output of the find command. hgBbiDbLink hg18 bamSLDenisova /gbdb/hg18/denisova/SL3003_SL3004_100122-hg18.bam foreach f (/gbdb/hg18/denisova/modernHumanSeq/MMS*.bam) set track = `echo $f | perl -wpe 's/.*(MMS\d+)_HGDP\d+_(\w+)\..*/bam$1$2/ || die; s/_//;'` echo $track hgBbiDbLink hg18 $track $f end # to see the grp table: hgsql -e "select * from grp order by priority;" hg18 # add new denisova group: hgsql hg18 -e "INSERT INTO grp VALUES ('denisova', 'Denisova Assembly and Analysis', 6.6, 0);" ######################################################################### # BUILD R-DMR TRACK. DONE (Fan 7/23/2010) ssh hgwdev mkdir -p /hive/data/genomes/gs.19/build36/bed/rdmr cd /hive/data/genomes/gs.19/build36/bed/rdmr # download the supplemental data file, ng.471-S2.txt. cp -p ng.471-S2.txt rdmrRaw.tab # remove the header lines at the beginning of rdmrRaw.tab vi rdmrRaw.tab # load rdmrRaw table hgsql hg18 -e 'drop table rdmrRaw' hgsql hg18 < ~/kent/src/hg/lib/rdmrRaw.sql hgsql hg18 -e 'load data local infile "rdmrRaw.tab" into table rdmrRaw' # create rdmr table hgsql hg18 -N -e 'select chrom, chromStart-1, chromEnd, gene from rdmrRaw' >rdmr.tab hgLoadBed hg18 rdmr rdmr.tab ##################################################################### # Create liftOver files to and from: calJac3 <-> hg18 (DONE 2011-01-10 - Chin) screen # use a screen to manage this multi-day job mkdir /hive/data/genomes/hg18/bed/lastzCalJac3.2010-12-20 cd /hive/data/genomes/hg18/bed/lastzCalJac3.2010-12-20 cat << '_EOF_' > DEF # human vs. 
marmoset BLASTZ=lastz # maximum M allowed with lastz is only 254 BLASTZ_M=254 BLASTZ_Q=/scratch/data/blastz/human_chimp.v2.q # and place those items here BLASTZ_O=600 BLASTZ_E=150 # other parameters from panTro2 vs hg18 lastz on advice from Webb BLASTZ_K=4500 BLASTZ_Y=15000 BLASTZ_T=2 # TARGET: Human Hg19 SEQ1_DIR=/scratch/data/hg18/hg18.2bit SEQ1_LEN=/scratch/data/hg18/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 SEQ1_LIMIT=5 # QUERY: Marmoset (calJac3) SEQ2_DIR=/scratch/data/calJac3/calJac3.2bit SEQ2_LEN=/scratch/data/calJac3/chrom.sizes SEQ2_LIMIT=50 SEQ2_CHUNK=10000000 SEQ2_LAP=0 BASE=/hive/data/genomes/hg18/bed/lastzCalJac3.2010-12-20 TMPDIR=/scratch/tmp '_EOF_' # << this line keeps emacs coloring happy time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -syntenicNet \ -chainMinScore=5000 -chainLinearGap=medium \ -workhorse=hgwdev -smallClusterHub=swarm -bigClusterHub=swarm \ -stop net \ > do.log 2>&1 & # real 101m10.634s cd /hive/data/genomes/hg18/bed ln -s lastzCalJac3.2010-12-20 lastz.calJac cd /hive/data/genomes/hg18/bed/lastz.calJac3/axtChain cp hg18.calJac3.over.chain.gz ../../liftOver/. cd /hive/data/genomes/hg18/bed/liftOver mv hg18.calJac3.over.chain.gz hg18ToCalJac3.over.chain.gz cd /usr/local/apache/htdocs-hgdownload/goldenPath/hg18/liftOver ln -s /hive/data/genomes/hg18/bed/liftOver/hg18ToCalJac3.over.chain.gz . md5sum *.gz > md5sum.txt ######################################################################### # ISCA (FORMERLY ISCA RETROSPECTIVE) FROM DBVAR (DONE 5/21/12 angie) # Updated 3/2/12. Updated 5/21/12 to include nstd45 (Curated) and b0b's aggregate/depth subtracks. # Combined submitted+remapped, w/new subcategories likely benign, likely pathogenic, # and removed Retrospective from names 10/18/11. # Split into benign/pathogenic/uncertain subtracks 9/14/11. # Reloaded 4/19/11 to drop the boring aggregate variants (sv; keep ssv). # Originally done 1/31/11 # Redmine: Track #34 (dbVar for human) set today = `date +%Y_%m_%d` mkdir /hive/data/genomes/hg18/bed/isca/$today cd /hive/data/genomes/hg18/bed/isca/$today # Get variants submitted on this assembly, and variants remapped from other assemblies. wget ftp://ftp.ncbi.nlm.nih.gov/pub/dbVar/data/Homo_sapiens/by_study/nstd37_ISCA/gvf/nstd37_ISCA.NCBI36.submitted.all.germline.ucsc.gvf.gz wget ftp://ftp.ncbi.nlm.nih.gov/pub/dbVar/data/Homo_sapiens/by_study/nstd37_ISCA/gvf/nstd37_ISCA.NCBI36.remap.all.germline.ucsc.gvf.gz wget ftp://ftp.ncbi.nlm.nih.gov/pub/dbVar/data/Homo_sapiens/by_study/nstd45_ISCA_curated_dataset/gvf/nstd45_ISCA_curated_dataset.NCBI36.submitted.all.germline.ucsc.gvf.gz # See notes on data contents: http://redmine.soe.ucsc.edu/issues/34#note-34 # and notes on data format plan: http://redmine.soe.ucsc.edu/issues/34#note-36 zcat nstd37_ISCA*.gvf.gz \ | ~/kent/src/hg/utils/automation/gvfToBed8Attrs.pl \ > isca.bed zcat nstd45_ISCA*.gvf.gz \ | ~/kent/src/hg/utils/automation/gvfToBed8Attrs.pl \ > iscaCurated.bed wc -l isca*.bed # 12923 isca.bed # 84 iscaCurated.bed # Split into subtracks by clinical_int value. 
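    # (Hedged aside, not in the original steps.)  The per-class subtracks are
    # selected below with grep -w on the converted bed, so it is worth
    # confirming that the bed rows reproduce the class tallies taken from the
    # GVF itself; counts that disagree would mean rows are silently dropping
    # out of every subtrack:
    grep -c -w Benign isca.bed
    grep -c -w Pathogenic isca.bed
    grep -c -wi 'Uncertain significance' isca.bed
    # these should roughly match the 4304 / 4583 / 3408+464+164 breakdown
    # from the zcat tally just below.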
zcat nstd37_ISCA*.gvf.gz \ | grep ssv | sed -e 's/.*clinical_int=//; s/;.*//;' | sort | uniq -c # 4304 Benign # 4583 Pathogenic # 3408 Uncertain significance # 464 Uncertain significance: likely benign # 164 Uncertain significance: likely pathogenic zcat nstd45_ISCA*.gvf.gz \ | grep ssv | sed -e 's/.*clinical_int=//; s/;.*//;' | sort | uniq -c # 29 Benign # 55 Pathogenic foreach subtrack (Benign Pathogenic) grep -w $subtrack isca.bed > isca$subtrack.bed hgLoadBed -tab -renameSqlTable -sqlTable=$HOME/kent/src/hg/lib/bed8Attrs.sql \ -allowStartEqualEnd hg18 isca$subtrack isca$subtrack.bed grep -w $subtrack iscaCurated.bed > iscaCurated$subtrack.bed hgLoadBed -tab -renameSqlTable -sqlTable=$HOME/kent/src/hg/lib/bed8Attrs.sql \ -allowStartEqualEnd hg18 iscaCurated$subtrack iscaCurated$subtrack.bed end #Read 4304 elements of size 11 from iscaBenign.bed #Read 29 elements of size 11 from iscaCuratedBenign.bed #Read 4583 elements of size 11 from iscaPathogenic.bed #Read 55 elements of size 11 from iscaCuratedPathogenic.bed # The subcategories of Uncertain need a bit more sophisticated treatment: set subtrack = Uncertain grep -w $subtrack isca.bed \ | grep -vi 'Uncertain Significance: likely' \ > isca$subtrack.bed hgLoadBed -tab -renameSqlTable -sqlTable=$HOME/kent/src/hg/lib/bed8Attrs.sql \ -allowStartEqualEnd hg18 isca$subtrack isca$subtrack.bed #Read 3408 elements of size 11 from iscaUncertain.bed foreach unc (benign pathogenic) set subtrack = Likely`perl -we 'print ucfirst("'$unc'");'` grep -wi "Uncertain Significance: likely $unc" isca.bed \ > isca$subtrack.bed hgLoadBed -tab -renameSqlTable -sqlTable=$HOME/kent/src/hg/lib/bed8Attrs.sql \ -allowStartEqualEnd hg18 isca$subtrack isca$subtrack.bed end #Read 464 elements of size 11 from iscaLikelyBenign.bed #Read 164 elements of size 11 from iscaLikelyPathogenic.bed # Add b0b's aggregate/depth subtracks. # make bedGraphs hgsql -N -e "SELECT chrom, chromStart, chromEnd FROM iscaPathogenic \ WHERE attrVals LIKE '%number_gain%'" hg18 | sort \ | bedItemOverlapCount hg18 stdin > iscaPathGain.bedGraph hgsql -N -e "SELECT chrom, chromStart, chromEnd FROM iscaPathogenic \ WHERE attrVals LIKE '%number_loss%'" hg18 | sort \ | bedItemOverlapCount hg18 stdin > iscaPathLoss.bedGraph # load tables hgLoadBed -bedGraph=4 hg18 iscaPathGainCum iscaPathGain.bedGraph #Read 1997 elements of size 4 from iscaPathGain.bedGraph hgLoadBed -bedGraph=4 hg18 iscaPathLossCum iscaPathLoss.bedGraph #Read 3570 elements of size 4 from iscaPathLoss.bedGraph ############################################################################# # BUILD B CELL RNA-SEQ TRACKS (DONE, 3/29/11, Fan) ssh hgwdev mkdir -p /hive/data/genomes/gs.19/build36/bed/Bcell cd /hive/data/genomes/gs.19/build36/bed/Bcell # Get data from custom track site wget --timestamping http://genomicsweb1.med.upenn.edu/ucsc/bcelltranscriptometracks.txt wget --timestamping http://genomicsweb1.med.upenn.edu/ucsc/800m_junctions.bed wget --timestamping http://genomicsweb1.med.upenn.edu/ucsc/800m.bw ln -s `pwd`/800m.bw /gbdb/hg18/bbi/ceuBcellRNASeqBW.bw hgsql hg18 -e 'drop table if exists ceuBcellRNASeqBW; \ create table ceuBcellRNASeqBW (fileName varchar(255) not null); \ insert into ceuBcellRNASeqBW values ("/gbdb/hg18/bbi/ceuBcellRNASeqBW.bw");' fgrep -v track 800m_junctions.bed >ceuBcellRNASeq.bed hgLoadBed hg18 ceuBcellRNASeq ceuBcellRNASeq.bed ############################################################################# # CREATE .PNG PICTURE FILES OF EVOFOLD RNA STRUCTURES. 
(DONE, 4/29/2011, Fan) ssh hgwdev mkdir /hive/data/genomes/hg18/bed/evofold/doEvoFold cd /hive/data/genomes/hg18/bed/evofold/doEvoFold # Creaet sub-directories to store .png files (total of about 47.5 K of them) sparated by chromosomes. mkdir -p evoFold/chr1 mkdir -p evoFold/chr10 mkdir -p evoFold/chr11 mkdir -p evoFold/chr12 mkdir -p evoFold/chr13 mkdir -p evoFold/chr14 mkdir -p evoFold/chr15 mkdir -p evoFold/chr16 mkdir -p evoFold/chr17 mkdir -p evoFold/chr18 mkdir -p evoFold/chr19 mkdir -p evoFold/chr2 mkdir -p evoFold/chr20 mkdir -p evoFold/chr21 mkdir -p evoFold/chr22 mkdir -p evoFold/chr3 mkdir -p evoFold/chr4 mkdir -p evoFold/chr5 mkdir -p evoFold/chr6 mkdir -p evoFold/chr7 mkdir -p evoFold/chr8 mkdir -p evoFold/chr9 mkdir -p evoFold/chrM mkdir -p evoFold/chrX mkdir -p evoFold/chrY # get latest verion of the .jar file of VARNA wget --timestamping http://varna.lri.fr/bin/VARNAv3-7.jar # Create Java command line files echo 'doEvoFold hg18 do$1 $1' >do1Chrom chmod +x do1Chrom do1Chrom chr1 do1Chrom chr10 do1Chrom chr11 do1Chrom chr12 do1Chrom chr13 do1Chrom chr14 do1Chrom chr15 do1Chrom chr16 do1Chrom chr17 do1Chrom chr18 do1Chrom chr19 do1Chrom chr2 do1Chrom chr20 do1Chrom chr21 do1Chrom chr22 do1Chrom chr3 do1Chrom chr4 do1Chrom chr5 do1Chrom chr6 do1Chrom chr7 do1Chrom chr8 do1Chrom chr9 do1Chrom chrM do1Chrom chrX do1Chrom chrY # run the dochrXX command files in small batches with '&' to exploit multiple CPU # wait an hour for each batch to finish so that we don't suck in too much computational resources. dochr1 & dochr2 & dochr3 & dochr4 & dochr5 & sleep 3600 dochr6 & dochr7 & dochr8 & dochr9 & dochr10 & sleep 3600 dochr11 & dochr12 & dochr13 & dochr14 & dochr15 & sleep 3600 dochr16 & dochr17 & dochr18 & dochr19 & dochr20 & sleep 3600 dochr21 & dochr22 & dochrX & dochrY & dochrM & # check the resulting .png files # create a simple script file, check1, with the following 3 lines: echo $1 hgsql hg18 -N -e "select count(*) from evofold where chrom='${1}'" ls evoFold/$1/*.png|wc chmod +x check1 # create another script file, checkAll, with the following lines: check1 chr1 check1 chr10 check1 chr11 check1 chr12 check1 chr13 check1 chr14 check1 chr15 check1 chr16 check1 chr17 check1 chr18 check1 chr19 check1 chr2 check1 chr20 check1 chr21 check1 chr22 check1 chr3 check1 chr4 check1 chr5 check1 chr6 check1 chr7 check1 chr8 check1 chr9 check1 chrM check1 chrX check1 chrY chmod +x checkAll checkAll >j.check # examing the resuls in j.check to make sure things are OK. # create symbolic links ln -s /hive/data/genomes/gs.19/build36/bed/evofold/doEvoFold/evoFold /gbdb/hg18/evoFold ln -s /gbdb/hg18/evoFold /usr/local/apache/htdocs/evoFold/hg18 ############################################################################# # BUILD evoCpg TRACK (DONE, Fan, 5/23/11) ssh hgwdev cd /hive/data/genomes/gs.19/build36/bed mkdir evoCpg cd evoCpg # put data file, weizmann_evo_cgi.bed, here. cat weizmann_evo_cgi.bed|grep -v track >evoCpg.bed hgLoadBed hg18 evoCpg evoCpg.bed # create kent/src/hg/makeDb/trackDb/human/hg18/evoCpg.html. 
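    # (Optional hedged check, not part of the original steps.)  A quick
    # featureBits comparison of the new evoCpg items against the standard CpG
    # island annotation gives a feel for how much of this evolutionarily
    # defined set overlaps the existing track (assumes cpgIslandExt exists):
    featureBits hg18 evoCpg
    featureBits hg18 evoCpg cpgIslandExt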
# add pushQue record ############################################################################# # BUILD HG18 OMIM RELATED TRACKS (DONE, 6/3/11, Fan) ssh hgwdev cd /hive/data/genomes/hg18/bed mkdir -p omim/05172011 cd omim/05172011 # obtain the following files from OMIM and place them at this subdirectory genemap.txt mim2gene.txt mimAV.txt script1.pl script2.pl cat genemap.txt|sed -e 's/|/\t/g' > genemap.tab hgLoadSqlTab -warn hg18 omimGeneMap ~/kent/src/hg/lib/omimGeneMap.sql genemap.tab # Load mim2gene table hgsql hg18 -e 'drop table mim2gene' hgsql hg18 < ~/kent/src/hg/lib/mim2gene.sql hgsql hg18 -e 'load data local infile "mim2gene.txt" into table mim2gene ignore 1 lines' # build omimGeneSymbol table doOmimGeneSymbols hg18 j.out cat j.out |sort -u >omimGeneSymbol.tab hgLoadSqlTab -warn hg18 omimGeneSymbol ~/kent/src/hg/lib/omimGeneSymbol.sql omimGeneSymbol.tab perl ./script1.pl --gene-map-file=genemap.txt >omimPhenotype.tab hgLoadSqlTab -warn hg18 omimPhenotype ~/kent/src/hg/lib/omimPhenotype.sql omimPhenotype.tab hgsql hg18 -e 'update omimPhenotype set phenotypeClass = -1 where phenotypeClass=0' hgsql hg18 -e 'update omimPhenotype set phenotypeId = -1 where phenotypeId=0' doOmimGene2 hg18 j.tmp cat j.tmp |sort -u > omimGene2.tab hgLoadBed hg18 omimGene2 omimGene2.tab rm j.tmp ############################################################## # build the omimAvSnp track cd /hive/data/genomes/hg18/bed/omim/05172011 mkdir av cd av # get the mimAV.txt data file from OMIM cut -f 1 mimAV.txt >j1 cut -f 2 mimAV.txt >j2 cut -f 3 mimAV.txt >j3 cut -f 4 mimAV.txt >j4 cut -f 5 mimAV.txt >j5 cat j1 |sed -e 's/\./\t/' >j1.2 cat j4 |sed -e 's/,/\t/' >j4-2 cut -f 1 j4-2 >j4.1 cut -f 2 j4-2 >j4.2 paste j1 j1.2 j3 j4 j4.1 j4.2 j5 j2 >omimAv.tab hgsql hg18 -e 'drop table omimAv' hgsql hg18 < ~/src/hg/lib/omimAv.sql hgsql hg18 -e 'load data local infile "omimAv.tab" into table omimAv ignore 1 lines' hgsql hg18 -e 'update omimAv set repl2 = rtrim(ltrim(repl2))' doOmimAv hg18 omimAvRepl.tab 2>j.err hgsql hg18 -e "drop table omimAvRepl" hgsql hg18 < ~/kent/src/hg/lib/omimAvRepl.sql hgsql hg18 -e 'load data local infile "omimAvRepl.tab" into table omimAvRepl' rm j1.2 j1 j2 j3 j4 j4-2 j4.1 j4.2 j5 hgsql hg18 -N -e 'select chrom, chromStart, chromEnd, avId from omimAvRepl r, snp130 s where s.name = dbSnpId order by avId' >omimAvSnp.tab hgLoadBed -allowStartEqualEnd hg18 omimAvSnp omimAvSnp.tab ############################################################## # build the omimLocation track cd /hive/data/genomes/hg18/bed/omim/05172011 mkdir location cd location doOmimLocation hg18 omimLocation.bed 2>j.err hgLoadBed hg18 omimLocation omimLocation.bed # Remove all gene entries in omimGene2 from omimLocation table hgsql hg18 -N -e \ 'delete from omimLocation where name in (select name from omimGene2) ' # Per OMIM request, delete all the gray entries in omimLocation table. 
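    # ("Gray" items are presumably those whose OMIM id has no phenotype class
    # 1-4 in omimPhenotype, which is what leaves them colored gray in the
    # browser.)  The j.* bookkeeping below removes exactly those items; a
    # compact equivalent, assuming the same omimLocation/omimPhenotype schema,
    # would be the single statement sketched here (the j.* route is what was
    # actually run):
    #   hgsql hg18 -e 'delete from omimLocation where name not in \
    #       (select omimId from omimPhenotype where phenotypeClass between 1 and 4)'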
mkdir cleanUpOmimLocation cd cleanUpOmimLocation hgsql hg18 -N -e \ 'select distinct name from omimLocation' |sort -u >j.all hgsql hg18 -N -e \ 'select distinct name from omimLocation, omimPhenotype where name=omimId and phenotypeClass=1' >j.1 hgsql hg18 -N -e \ 'select distinct name from omimLocation, omimPhenotype where name=omimId and phenotypeClass=2' >j.2 hgsql hg18 -N -e \ 'select distinct name from omimLocation, omimPhenotype where name=omimId and phenotypeClass=3' >j.3 hgsql hg18 -N -e \ 'select distinct name from omimLocation, omimPhenotype where name=omimId and phenotypeClass=4' >j.4 cat j.1 j.2 j.3 j.4 |sort -u >j.1234 diff j.all j.1234 |grep "<" |sed -e 's/doall cat << '_EOF_' > do1 hgsql hg18 -e "delete from omimLocation where name='${1}'" '_EOF_' # << emacs ./doall ############################################################################# # adding new decode data (WORKING - 2011-07-26 - Hiram) mkdir /hive/data/outside/decode cd /hive/data/outside/decode wget --timestamping "http://www.decode.com/addendum/Maps.zip" unzip Maps.zip # produces a Maps directory mkdir /hive/data/outside/decode/hg18 cd /hive/data/outside/decode/hg18 # extract the data from the 10Kb bin recombination maps, # constructing bedGraph files for F in female female_carrier female_noncarrier \ male male_carrier male_noncarrier \ sex-averaged sex-averaged_carrier sex-averaged_noncarrier do ls -og ../Maps/${F}.rmap grep -v stdrate ../Maps/${F}.rmap | awk ' { printf "%s\t%d\t%d\t%s\n", $1, $2-5000, $2+5000, $4 }' > ${F}.bedGraph bedGraphToBigWig ${F}.bedGraph /hive/data/genomes/hg18/chrom.sizes \ ${F}.bw done # load the bigWig files into SQL table name friendly tables: for C in female female_carrier female_noncarrier male male_carrier \ male_noncarrier sex-averaged sex-averaged_carrier sex-averaged_noncarrier do N=${C} case ${C} in female) N="Female" ;; female_carrier) N="FemaleCarrier" ;; female_noncarrier) N="FemaleNonCarrier" ;; male) N="Male" ;; male_carrier) N="MaleCarrier" ;; male_noncarrier) N="MaleNonCarrier" ;; sex-averaged) N="SexAveraged" ;; sex-averaged_carrier) N="SexAveragedCarrier" ;; sex-averaged_noncarrier) N="SexAveragedNonCarrier" ;; esac echo $C $N rm -f /gbdb/hg18/decode/${C}.bw /gbdb/hg18/decode/${N}.bw ln -s `pwd`/${C}.bw /gbdb/hg18/decode/${N}.bw hgsql -e "drop table decode${N};" hg18 hgBbiDbLink hg18 decode${N} /gbdb/hg18/decode/${N}.bw done # compute male - female difference awk '{printf "%s_%d_%d\t%s\n", $1, $2, $3, $4}' male.bedGraph \ | sort > ordered.male.txt awk '{printf "%s_%d_%d\t%s\n", $1, $2, $3, $4}' female.bedGraph \ | sort > ordered.female.txt join ordered.male.txt ordered.female.txt > maleFemale.txt awk '{printf "%s\t%.6f\n", $1, $2-$3}' maleFemale.txt \ | sed -e "s/_/\t/g" | sort -k1,1 -k2,2n > maleFemale.bedGraph # and hot spots awk '$4 > 9.99' female.bedGraph > hotSpotFemale.bed awk '$4 > 9.99' male.bedGraph > hotSpotMale.bed hgLoadBed hg18 decodeHotSpotFemale hotSpotFemale.bed hgLoadBed hg18 decodeHotSpotMale hotSpotMale.bed bedGraphToBigWig maleFemale.bedGraph /hive/data/genomes/hg18/chrom.sizes \ MaleFemaleDifference.bw ln -s `pwd`/MaleFemaleDifference.bw /gbdb/hg18/decode/ hgsql -e "drop table decodeMaleFemaleDifference;" hg18 hgBbiDbLink hg18 decodeMaleFemaleDifference /gbdb/hg18/decode/MaleFemaleDifference.bw ############################################################################# # HapMap recombination maps added to deCODE track (DONE - 2011-08-30 - Hiram) mkdir /hive/data/genomes/hg18/bed/hapmap/release24 cd /hive/data/genomes/hg18/bed/hapmap/release24 
wget --timestamping \ https://mathgen.stats.ox.ac.uk/wtccc-software/recombination_rates/genetic_map_b36_CEU.tgz wget --timestamping \ https://mathgen.stats.ox.ac.uk/wtccc-software/recombination_rates/genetic_map_b36_YRI.tgz wget --timestamping \ https://mathgen.stats.ox.ac.uk/wtccc-software/recombination_rates/genetic_map_b36_combined.tgz mkdir CEU cd CEU tar xvzf ../genetic_map_b36_CEU.tgz cd .. mkdir YRI cd YRI tar xvzf ../genetic_map_b36_YRI.tgz cd .. mkdir combined cd combined tar xvzf ../genetic_map_b36_combined.tgz for F in CEU/genetic_map_*.txt do C=`basename $F | sed -e "s/genetic_map_chr//; s/_CEU_b36.txt//"` grep -v "^position" ${F} | awk -v chr="chr${C}" ' BEGIN {prev = 0} { printf "%s\t%d\t%d\t%s\n", chr, prev, $1, $2; prev = $1 } ' done | sort -k1,1 -k2,2n > hapMapRelease24CEURecombMap.bedGraph for F in YRI/genetic_map_*.txt do C=`basename $F | sed -e "s/genetic_map_chr//; s/_YRI_b36.txt//"` grep -v "^position" ${F} | awk -v chr="chr${C}" ' BEGIN {prev = 0} { printf "%s\t%d\t%d\t%s\n", chr, prev, $1, $2; prev = $1 } ' done | sort -k1,1 -k2,2n > hapMapRelease24YRIRecombMap.bedGraph for F in combined/genetic_map_*.txt do C=`basename $F | sed -e "s/genetic_map_chr//; s/_combined_b36.txt//"` grep -v "^position" ${F} | awk -v chr="chr${C}" ' BEGIN {prev = 0} { printf "%s\t%d\t%d\t%s\n", chr, prev, $1, $2; prev = $1 } ' done | sort -k1,1 -k2,2n > hapMapRelease24CombinedRecombMap.bedGraph for F in hapMapRelease24CEURecombMap hapMapRelease24CombinedRecombMap \ hapMapRelease24YRIRecombMap do bedGraphToBigWig -verbose=2 ${F}.bedGraph \ /hive/data/genomes/hg18/chrom.sizes ${F}.bw > ${F}.log 2>&1 done for T in hapMapRelease24CEURecombMap hapMapRelease24CombinedRecombMap \ hapMapRelease24YRIRecombMap do rm -f /gbdb/hg18/decode/${T}.bw ln -s `pwd`/${T}.bw /gbdb/hg18/decode/${T}.bw hgsql -e "drop table ${T};" hg18 hgBbiDbLink hg18 ${T} /gbdb/hg18/decode/${T}.bw done ############################################################################# # HI SEQ DEPTH (DONE 7/15/11 angie) mkdir /hive/data/genomes/hg18/bed/hiSeqDepth cd /hive/data/genomes/hg18/bed/hiSeqDepth foreach cov (001 005 01 05 1) wget --timestamp http://eqtl.uchicago.edu/Masking/seq.cov$cov.bed.gz gunzip -N seq.cov$cov.bed.gz end wc -l seq.cov* # 553 seq.cov001.bed # 1301 seq.cov005.bed # 2187 seq.cov01.bed # 18369 seq.cov05.bed # 34359 seq.cov1.bed foreach cov (001 005 01 05 1) echo seq.cov$cov.bed featureBits -countGaps hg18 seq.cov$cov.bed end #seq.cov001.bed #57409 bases of 3107677273 (0.002%) in intersection #seq.cov005.bed #183848 bases of 3107677273 (0.006%) in intersection #seq.cov01.bed #362423 bases of 3107677273 (0.012%) in intersection #seq.cov05.bed #3462959 bases of 3107677273 (0.111%) in intersection #seq.cov1.bed #Coordinate out of allowed range [0,135374737) for chr10 near line 6826 of seq.cov1.bed # I edited line 6826 of seq.cov1.bed to end with 135374737 not 135374744 featureBits -countGaps hg18 seq.cov$cov.bed #6466376 bases of 3107677273 (0.208%) in intersection # Sanity check: verify that the smaller ones are strict subsets of larger: featureBits hg18 -countGaps seq.cov001.bed \!seq.cov005.bed featureBits hg18 -countGaps seq.cov005.bed \!seq.cov01.bed featureBits hg18 -countGaps seq.cov01.bed \!seq.cov05.bed featureBits hg18 -countGaps seq.cov05.bed \!seq.cov1.bed # Yep, all got 0 bases: #0 bases of 3107677273 (0.000%) in intersection # Hmm, some overlap w/gap track: featureBits hg18 -countGaps seq.cov1.bed gap -bed=gapOverlaps.bed #477 bases of 3107677273 (0.000%) in intersection # Load tables: hgLoadBed 
hg18 hiSeqDepthTopPt1Pct seq.cov001.bed #Loaded 553 elements of size 3 hgLoadBed hg18 hiSeqDepthTopPt5Pct seq.cov005.bed #Loaded 1301 elements of size 3 hgLoadBed hg18 hiSeqDepthTop1Pct seq.cov01.bed #Loaded 2187 elements of size 3 hgLoadBed hg18 hiSeqDepthTop5Pct seq.cov05.bed #Loaded 18369 elements of size 3 hgLoadBed hg18 hiSeqDepthTop10Pct seq.cov1.bed #Loaded 34359 elements of size 3 # Compare with Terry Furey's blacklisted regions for ENCODE # http://encodewiki.ucsc.edu/EncodeDCC/index.php/Blacklist_of_problematic_genomic_regions # http://hgdownload-test.cse.ucsc.edu/goldenPath/hg18/encodeDCC/wgEncodeMapability/wgEncodeDukeRegionsExcluded.bed6.gz featureBits -countGaps hg18 \ /hive/data/genomes/hg18/bed/encodeBlacklist/hg18.wgEncodeDukeRegionsExcluded.bed #10366850 bases of 3107677273 (0.334%) in intersection foreach cov (001 005 01 05 1) featureBits -countGaps hg18 seq.cov$cov.bed -enrichment \ /hive/data/genomes/hg18/bed/encodeBlacklist/hg18.wgEncodeDukeRegionsExcluded.bed end # Watch the coverage of seq*bed by Terry's regions drop as $cov increases: #seq.cov001.bed 0.002%, /hive/data/genomes/hg18/bed/encodeBlacklist/hg18.wgEncodeDukeRegionsExcluded.bed 0.334%, both 0.002%, cover 94.75%, enrich 284.02x #seq.cov005.bed 0.006%, /hive/data/genomes/hg18/bed/encodeBlacklist/hg18.wgEncodeDukeRegionsExcluded.bed 0.334%, both 0.006%, cover 93.43%, enrich 280.06x #seq.cov01.bed 0.012%, /hive/data/genomes/hg18/bed/encodeBlacklist/hg18.wgEncodeDukeRegionsExcluded.bed 0.334%, both 0.010%, cover 87.56%, enrich 262.49x #seq.cov05.bed 0.111%, /hive/data/genomes/hg18/bed/encodeBlacklist/hg18.wgEncodeDukeRegionsExcluded.bed 0.334%, both 0.038%, cover 34.48%, enrich 103.35x #seq.cov1.bed 0.208%, /hive/data/genomes/hg18/bed/encodeBlacklist/hg18.wgEncodeDukeRegionsExcluded.bed 0.334%, both 0.050%, cover 24.02%, enrich 71.99x ############################################################################################ # CREATE TABLES AND .PNG PICTURE FILES OF evofoldV2 RNA STRUCTURES. (DONE, 7/26/2011, Fan) ssh hgwdev mkdir -p /hive/data/genomes/hg18/bed/evofoldV2/doEvoFoldV2 cd /hive/data/genomes/hg18/bed/evofoldV2/doEvoFoldV2 wget http://moma.ki.au.dk/~jsp/upload/evofoldV2.hg18.bed cat ~/kent/src/hg/lib/evofold.sql | \ sed -e "s/evofold/evofoldV2/g" > tmp.sql hgLoadBed –notItemRgb -sqlTable=tmp.sql hg18 evofoldV2 evofoldV2.hg18.bed # Creaet sub-directories to store .png files (total of about 47.5 K of them) sparated by chromosomes. 
mkdir -p evoFoldV2/chr1 mkdir -p evoFoldV2/chr10 mkdir -p evoFoldV2/chr11 mkdir -p evoFoldV2/chr12 mkdir -p evoFoldV2/chr13 mkdir -p evoFoldV2/chr14 mkdir -p evoFoldV2/chr15 mkdir -p evoFoldV2/chr16 mkdir -p evoFoldV2/chr17 mkdir -p evoFoldV2/chr18 mkdir -p evoFoldV2/chr19 mkdir -p evoFoldV2/chr2 mkdir -p evoFoldV2/chr20 mkdir -p evoFoldV2/chr21 mkdir -p evoFoldV2/chr22 mkdir -p evoFoldV2/chr3 mkdir -p evoFoldV2/chr4 mkdir -p evoFoldV2/chr5 mkdir -p evoFoldV2/chr6 mkdir -p evoFoldV2/chr7 mkdir -p evoFoldV2/chr8 mkdir -p evoFoldV2/chr9 mkdir -p evoFoldV2/chrM mkdir -p evoFoldV2/chrX mkdir -p evoFoldV2/chrY # get latest verion of the .jar file of VARNA wget --timestamping http://varna.lri.fr/bin/VARNAv3-7.jar # Create Java command line files echo 'doEvoFoldV2 hg18 do$1 $1' >do1Chrom chmod +x do1Chrom do1Chrom chr1 do1Chrom chr10 do1Chrom chr11 do1Chrom chr12 do1Chrom chr13 do1Chrom chr14 do1Chrom chr15 do1Chrom chr16 do1Chrom chr17 do1Chrom chr18 do1Chrom chr19 do1Chrom chr2 do1Chrom chr20 do1Chrom chr21 do1Chrom chr22 do1Chrom chr3 do1Chrom chr4 do1Chrom chr5 do1Chrom chr6 do1Chrom chr7 do1Chrom chr8 do1Chrom chr9 do1Chrom chrM do1Chrom chrX do1Chrom chrY chmod +x do* # run the dochrXX command files in small batches with '&' to exploit multiple CPU # wait an hour for each batch to finish so that we don't suck in too much computational resources. dochr1 & dochr2 & dochr3 & dochr4 & dochr5 & sleep 3600 dochr6 & dochr7 & dochr8 & dochr9 & dochr10 & sleep 3600 dochr11 & dochr12 & dochr13 & dochr14 & dochr15 & sleep 3600 dochr16 & dochr17 & dochr18 & dochr19 & dochr20 & sleep 3600 dochr21 & dochr22 & dochrX & dochrY & dochrM & # check the resulting .png files # create a simple script file, check1, with the following 3 lines: echo $1 hgsql hg18 -N -e "select count(*) from evofoldV2 where chrom='${1}'" ls evoFoldV2/$1/*.png|wc chmod +x check1 # create another script file, checkAll, with the following lines: check1 chr1 check1 chr10 check1 chr11 check1 chr12 check1 chr13 check1 chr14 check1 chr15 check1 chr16 check1 chr17 check1 chr18 check1 chr19 check1 chr2 check1 chr20 check1 chr21 check1 chr22 check1 chr3 check1 chr4 check1 chr5 check1 chr6 check1 chr7 check1 chr8 check1 chr9 check1 chrM check1 chrX check1 chrY chmod +x checkAll checkAll >j.check # examing the resuls in j.check to make sure things are OK. # create symbolic links mkdir -p /usr/local/apache/htdocs/evoFoldV2 ln -s /hive/data/genomes/gs.19/build36/bed/evofoldV2/doEvoFoldV2/evoFoldV2 /gbdb/hg18/evoFoldV2 ln -s /gbdb/hg18/evoFoldV2 /usr/local/apache/htdocs/evoFoldV2/hg18 ############################################################################ # GENEREVIEWS TRACK (DONE 2011-09-22 - Chin) # This track depends on some tasks completed for hg19, specifically: # # ~/kent/src/hg/lib/geneReviewsBB.sql # ~/kent/src/hg/lib/geneReviewsBed5.as # ~/kent/src/hg/lib/geneReviewsRefGene.sql # ~/kent/src/hg/makeDb/trackDb/human/geneReviews.html # ~/kent/src/utils/geneReviews/addGeneReviewToBed.pl # # and data fetched from NCBI: # /hive/data/outside/ncbi/geneReviews/current/disease_gene_GR.txt # /hive/data/genomes/hg19/bed/geneReviews/grRefGeneData.tab # /hive/data/genomes/hg19/bed/geneReviews/grRefGene.lst # # Refer to GENEREVIEWS TRACK section in hg19.txt for details mkdir /hive/data/genomes/hg18/bed/geneReviews cd /hive/data/genomes/hg18/bed/geneReviews cp /hive/data/outside/ncbi/geneReviews/current/disease_gene_GR.txt . cp /hive/data/genomes/hg19/bed/geneReviews/grRefGeneData.tab . 
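    # (Hedged sanity check, not part of the original steps.)  These inputs are
    # reused from the hg19 build; the mapping itself is assembly-independent,
    # but any gene symbol that does not resolve in hg18's kgXref will just
    # produce an empty temp.in in the per-gene loop further below.  After the
    # grRefGene.lst copy below, the unresolvable symbols can be listed with:
    hgsql hg18 -N -e 'select distinct geneSymbol from kgXref' | sort -u > kgSymbols.txt
    sort -u grRefGene.lst | comm -23 - kgSymbols.txt
    # anything printed here will be skipped (no hg18 coordinates).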
# load RefSeg Gene to geneReview mapping list to hg18 hgLoadSqlTab -warn hg18 geneReviewsRefGene \ $HOME/kent/src/hg/lib/geneReviewsRefGene.sql grRefGeneData.tab # Scanning through 1 files cp /hive/data/genomes/hg19/bed/geneReviews/grRefGene.lst . # for each refGen in grRefGene.lst, create a non-overlapping bed row. cat grRefGene.lst | while read G do echo ${G} hgsql hg18 -N -e \ "SELECT e.chrom,e.txStart,e.txEnd,j.geneSymbol \ FROM knownGene e, kgXref j WHERE e.alignID = j.kgID AND \ j.geneSymbol ='${G}' ORDER BY e.chrom,e.txStart;" > temp.in bedRemoveOverlap temp.in temp.out cat temp.out >> geneReviews.tab done rm temp.* # load the collapsed bed4 file to hg18, hgLoadBed hg18 geneReviews geneReviews.tab # Use addGeneReviewToBed.pl will add the geneReviews detail in html format to # the bed 4 file chmod +x $HOME/kent/src/utils/geneReviews/addGeneReviewToBed.pl # Add geneReview item in html format format as field 5 $HOME/kent/src/utils/geneReviews//addGeneReviewToBed.pl hg18 > hg18.geneReviews.bed5 # Convert to bigBed format /cluster/bin/x86_64/bedToBigBed -bedFields=4 -tabs \ -as=$HOME/kent/src/hg/lib/geneReviewsBed5.as hg18.geneReviews.bed5 \ /hive/data/genomes/hg18/chrom.sizes hg18.geneReviews.bb # upload the bigBed file to genomewiki /cluster/bin/scripts/gwUploadFile hg18.geneReviews.bb hg18.geneReviews.bb # # loading file: hg18.geneReviews.bb # # into Image name: Hg18.geneReviews.bb # # login name: chinhli # # siteUrl: genomewiki.ucsc.edu # # traceBackLimit: 0 # # traceBackLimit: 0 past site.Images # Image info: {u'comment': u'gwUploadFile upload', u'sha1': u'6f5009a367a6b4fdaa2739541680253bd183af12', u'url': u'http://genomewiki.ucsc.edu/images/c/cd/Hg18.geneReviews.bb', u'timestamp': u'2011-09-22T23:28:39Z', u'metadata': None, u'height': 0, u'width': 0, u'user': u'Chinhli', u'descriptionurl': u'http://genomewiki.ucsc.edu/index.php/File:Hg18.geneReviews.bb', u'size': 170249} # Image File:Hg18.geneReviews.bb usage: hgsql hg18 -e "source $HOME/kent/src/hg/lib/geneReviewsBB.sql;" hgsql hg18 -e 'insert into geneReviewsBB values ("http://genomewiki.ucsc.edu/images/c/cd/Hg18.geneReviews.bb")' ############################################################################# 2012-06-25 markd discovered that: /hive/data/genomes/gs.19/build36/bed/blastz.tupBel1/axtChain/hg18.tupBel1.net.gz was not actually compressed mv hg18.tupBel1.net.gz hg18.tupBel1.net gzip hg18.tupBel1.net #############################################################################
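    # (Follow-up sketch, not part of the original fix.)  To look for any other
    # "*.gz" files under the hg18 bed tree that are not actually
    # gzip-compressed, let file(1) report the real content type; anything not
    # showing "gzip compressed" is a candidate for the same mv/gzip treatment:
    find /hive/data/genomes/gs.19/build36/bed -name '*.gz' -type f \
        | xargs file | grep -v 'gzip compressed'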