# for emacs: -*- mode: sh; -*-

# This file describes how we made the browser database on
# NCBI build 35 (May 2004 freeze)

# HOW TO BUILD AN ASSEMBLY FROM NCBI FILES
# ---------------------------------------
# Make the gs.18, gs.18/build35, gs.18/agp and gs.18/ffa directories.
    ssh eieio
    mkdir /cluster/store5/gs.18
    mkdir /cluster/store5/gs.18/build35
    mkdir /cluster/store5/gs.18/agp
    mkdir /cluster/store5/gs.18/ffa

# Make a symbolic link from /cluster/store1 to this location
# (I assume there is some use for this later ?)
    cd /cluster/store1
    ln -s /cluster/store5/gs.18 ./gs.18
    ln -s /cluster/store5/gs.18/build35 /cluster/data/hg17

# Make a symbolic link from your home directory to the build dir:
# (Investigate what this is used for, may no longer be necessary)
    ln -s /cluster/store5/gs.18/build35 ~/oo

# NCBI download site, fetch everything into this one directory:
# with the machine and password in your $HOME/.netrc file, this
# wget command will require no login.  Your $HOME/.netrc file
# is set to 'chmod 600 .netrc' to prevent anyone from finding
# the data.  (There were some early files that later moved into
# an OLD subdirectory.  They were broken.)
    mkdir /cluster/store5/gs.18/ncbi
    cd /cluster/store5/gs.18/ncbi
    wget --timestamping ftp://ftp.ncbi.nlm.nih.gov/build_35/*

# FYI: agp file format documented at:
# http://www.ncbi.nlm.nih.gov/Genbank/WGS.agpformat.html

# fixup a couple of names for our own purposes here
    cd /cluster/store5/gs.18/agp
    ln -s ../ncbi/chr*.agp ../ncbi/chr*.fa.gz .
    sed -e "s#MT/NC_001807.4#NC_001807#" ../ncbi/chrMT.agp > chrM.agp
    sed -e "s/NG_002392.2/NG_002392/" ../ncbi/DR52.agp > chr6_hla_hap1.agp
    sed -e "s/NG_002433.1/NG_002433/" ../ncbi/DR53.agp > chr6_hla_hap2.agp
    zcat ../ncbi/DR52.fa.gz | \
        sed -e "s/gi|29124352|ref|NG_002392.2/ref|NG_002392/" | \
        gzip > chr6_hla_hap1.fa.gz
    zcat ../ncbi/DR53.fa.gz | \
        sed -e "s/gi|28212470|ref|NG_002433.1/ref|NG_002433/" | \
        gzip > chr6_hla_hap2.fa.gz
    zcat ../ncbi/chrMT.fa.gz | \
        sed -e "s/gi|17981852|ref|NC_001807.4/ref|NC_001807/" | \
        gzip > chrM.fa.gz

# Put all the agp files together into one.
    cd /cluster/store5/gs.18/build35
#   The chrM sequence now has its own agp; remove it from ref_placed.agp
    sed -e "/^NC_001807/d" ../ncbi/ref_placed.agp > ref_placed.agp
    cat ref_placed.agp ../agp/chrM.agp ../ncbi/ref_unplaced.agp \
        ../agp/chr6_hla_hap1.agp ../agp/chr6_hla_hap2.agp \
        ../ncbi/PAR.agp > ncbi_build35.agp

# and into ffa
    cd /cluster/store5/gs.18/ffa
#   There is a single bogus line at the end of ref_placed.fa.gz
#   declaring the NC_001807 MT sequence; it was later replaced by
#   chrMT.fa.gz, so remove that one line:
    zcat ../ncbi/ref_placed.fa.gz | sed -e "/^>ref|NC_001807/d" | \
        gzip > ref_placed.fa.gz
#   (That's a 40 minute job)
#   sequence.inf is usually here, symlink it
    ln -s ../ncbi/sequence.inf
#   put all the fa.gz files together in one big fa.gz
    time zcat ref_placed.fa.gz ../agp/chrM.fa.gz ../ncbi/ref_unplaced.fa.gz \
        ../agp/chr6_hla_hap?.fa.gz ../ncbi/PAR.fa.gz | gzip \
        > ncbi_build35.fa.gz
#   real    37m42.208s
#   user    37m3.490s
#   sys     0m31.430s

# Make a listing of all the fasta record headers, just FYI:
    cd /cluster/store5/gs.18/ffa
    zcat ncbi_build35.fa.gz | grep "^>" > ncbi.fa.headers
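#   A quick hedged sanity check (not part of the original procedure):
#   the number of fasta records should match the number of distinct
#   object names in the combined agp, assuming agp column 1 is the
#   object name.  The checkYbr step below does a stricter version of this.
    grep -c "^>" ncbi.fa.headers
    awk '!/^#/ {print $1}' ../build35/ncbi_build35.agp | sort -u | wc -l
#   both counts should agree (380 in this build)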
# New to this build is the sequence NC_001807, which is the
# mitochondrial sequence.  This prefix NC_ is new to the process
# and will have to be accounted for below.  The other two special
# prefixes are similar to what was seen before:
#   from DR52.agp:  NG_002392 - Homo sapiens major histocompatibility
#       complex, class II, DR52 haplotype (DR52) on chromosome 6
#   from DR53.agp:  NG_002433 - Homo sapiens major histocompatibility
#       complex, class II, DR53 haplotype (DR53) on chromosome 6

# Fixup seq_contig.md
#
# It has a bunch of stuff belonging to the Celera genome assembly.
# Filter those out.  I don't know what the NT_07959[0-7] items are,
# but there are no definitions for them in the agp files and no
# sequence in any fa.gz file.
# Fixup the names for the NG_ items, and change chrom MT to be M
    cd /cluster/store5/gs.18/build35
    egrep -v "Celera|NT_07959[0-7]" ../ncbi/seq_contig.md | \
        sed -e "s/6|NG_002392/6_hla_hap1/" \
            -e "s/6|NG_002433/6_hla_hap2/" \
            -e "s/^9606\tMT|NC_001807/9606\tM/" \
        > temp_contig.md

# Get the randoms sorted in proper order.  createNcbiLifts does not
# work correctly if the randoms are not grouped together by chromosome.
    grep -v "|" temp_contig.md > seq_contig.md
# This pulls out all the randoms and groups them within the same
# chrom, leaving them in the same order as they originally were
# (warning: this is BASH code ...)
    grep "|" temp_contig.md | awk -F"|" '{print $1}' | \
        awk '{print $2}' | sort -n -u | while read CHR
    do
        grep "[^0-9]${CHR}|" temp_contig.md
    done >> seq_contig.md

# Sanity check; checkYbr was updated to handle the NC_ identifier
    time zcat ../ffa/ncbi_build35.fa.gz | $HOME/bin/i386/checkYbr \
        ncbi_build35.agp stdin seq_contig.md > check.seq_contig
#   real    2m34.143s
#   user    2m24.970s
#   sys     0m8.900s
#   result should be clean:
    cat check.seq_contig
#   Read 380 contigs from ncbi_build35.agp
#   Verifying sequence sizes in stdin
#   0 problems detected

# Convert fa files into UCSC style fa files and place in the "contigs"
# directory inside the gs.18/build35 directory.
# (A check that can be done here: make a list of the contigs in this
# ./contigs directory now, and compare it with the list of distributed
# contigs created after they have been disbursed.)
# faNcbiToUcsc was fixed to handle the NC_ identifier
    cd /cluster/store5/gs.18/build35
#   We've been through this often
    mv contigs contigs.0
    time zcat ../ffa/ncbi_build35.fa.gz | $HOME/bin/i386/faNcbiToUcsc \
        -split -ntLast stdin contigs
#   real    5m10.938s
#   user    2m20.070s
#   sys     0m51.020s
#   If you want to compare anything to previous work, check now, then:
    rm -fr contigs.0

# Determine the chromosome sizes from the agps.
# Watch carefully how chrY gets constructed.  I'm not sure this
# chrom_sizes represents the whole length of chrY with the PAR added.
# We will see about that.
# Script updated to handle new chrom names:
#   my @chroms = (1 .. 22, 'X', 'Y', 'M', '6_hla_hap1', '6_hla_hap2');
    cd /cluster/store5/gs.18/build35
    /cluster/bin/scripts/getChromSizes ../agp

# Create chrom.lst list for use in foreach() loops
    awk '{print $1}' chrom_sizes | sed -e "s/chr//" > chrom.lst

# Create lift files and inserts file (this will create the chromosome
# directory structure)
    /cluster/bin/scripts/createNcbiLifts -s chrom_sizes seq_contig.md .

# Create contig agp files (will create contig directory structure)
    /cluster/bin/scripts/createNcbiCtgAgp seq_contig.md ncbi_build35.agp .

# Create chromosome random agp files.
    /cluster/bin/scripts/createNcbiChrAgp -randomonly .
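#   Optional hedged cross-check (not in the original run): the largest
#   end coordinate per object in the chrN.agp files should match
#   chrom_sizes, assuming chrom_sizes is two columns (chrom, size).
#   Note the M and 6_hla_hap agps still carry their NCBI object names
#   at this point, so expect those few lines to differ.
    awk '!/^#/ {if ($3+0 > max[$1]) max[$1] = $3} END {for (c in max) print c, max[c]}' ../agp/chr*.agp | sort > /tmp/agpEnds
    awk '{print $1, $2}' chrom_sizes | sort > /tmp/chromSizes
    diff /tmp/agpEnds /tmp/chromSizes
    rm -f /tmp/agpEnds /tmp/chromSizes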
# Copy the original chrN.agp files from the gs.18/agp directory into
# each of the chromosome directories since they contain better gap
# information.  Delete the comments at the top of these.
    cd /cluster/store5/gs.18/build35
    foreach c ( `cat chrom.lst` )
        sed -e "/^#.*/d" ../agp/chr${c}.agp > ./${c}/chr${c}.agp
    end
#   chrM needs a name fixup
    sed -e "s#NC_001807#chrM#" ../agp/chrM.agp > M/chrM.agp

# Create inserts file from agp and lift files (new - added by Terry,
# 2004-07-12)
    /cluster/bin/scripts/createInserts /cluster/data/hg17 > \
        /cluster/data/hg17/inserts

# create global data link for everyone.  No more home directory links
# required.
    ln -s /cluster/store5/gs.18/build35 /cluster/data/hg17

# Distribute contig .fa to the appropriate directory (assumes all
# files are in the "contigs" directory).
    cd /cluster/data/hg17
    /cluster/bin/scripts/distNcbiCtgFa contigs .
#   Verify that everything was moved properly; the contigs directory
#   should be empty:
    ls contigs
#   Nothing there, then remove it
    rmdir contigs

# Make a list of the contigs for use later
    rm contig.lst
    touch contig.lst
    foreach chrom ( `cat chrom.lst` )
        foreach c ( $chrom/N{C,G,T}_?????? )
            set contig = $c:t
            echo "${chrom}/${contig}/${contig}.fa" >> contig.lst
        end
    end
#   For later comparisons, this is how many contigs we have:
    wc -l contig.lst
#   380

# Note 2004-06-30 - there are some clone entries left in some of the
# NCBI files that are incorrect.  Due to version number changes, more
# than one version is listed, namely for accession numbers:
#   AC004491 AC004921 AC004983 AC005088 AC006014 AC099654
# The AGP files are correct; the sequence.inf file lists these twice:
#   AC004491.1 AC004491.2   AC004921.1 AC004921.2   AC004983.2 AC004983.3
#   AC005088.2 AC005088.3   AC006014.2 AC006014.3   AC099654.4 AC099654.5

# FILES ARE NOW READY FOR REPEAT MASKING - start that process as
# other steps here can proceed in parallel.

# Previous practice used to copy everything over for jkStuff from a
# previous build.  Rather than do that, pick up whatever is needed at
# the time it is needed, and verify that it is going to do what you
# expect.
    cd /cluster/data/hg17
    mkdir jkStuff

# Create the contig.gl files - XXX - NCBI doesn't deliver
# contig_overlaps.agp - 2004-06-18 - this is beginning to come
# together and there is now a contig_overlaps.agp file.
# This is properly done below with a combination of psLayout
# alignments to create the contig_overlaps.agp file:
#    /cluster/bin/i386/agpToGl contig_overlaps.agp . -md=seq_contig.md
# Create chromosome gl files:
#    jkStuff/liftGl.csh contig.gl

# CREATING DATABASE (DONE - 2004-05-20 - Hiram)
#    RE-DONE for new NIBS - 2004-06-03
    ssh hgwdev
#   Make sure there is at least 5 gig free on hgwdev:/var/lib/mysql
    df -h /var/lib/mysql
#   Filesystem            Size  Used Avail Use% Mounted on
#   /dev/sdc1             1.8T  303G  1.4T  19% /var/lib/mysql
# Create the database.
    hgsql -e 'create database hg17' mysql
#   Copy over grp table (for track grouping) from another database:
    hgsql -e "create table grp (PRIMARY KEY(NAME)) select * from hg16.grp" hg17

#   ENCODE groups (added 2005-08-16 kate)
    echo 'UPDATE grp SET priority=7 WHERE name="varRep"' | hgsql hg17
    echo 'UPDATE grp SET priority=8 WHERE name="encode"' | hgsql hg17
    echo 'INSERT INTO grp (name, label, priority) VALUES ("encodeGenes", "ENCODE Regions and Genes", 8.1)' | hgsql hg17
    echo 'INSERT INTO grp (name, label, priority) VALUES ("encodeTxLevels", "ENCODE Transcript Levels", 8.2)' | hgsql hg17
    echo 'INSERT INTO grp (name, label, priority) VALUES ("encodeChip", "ENCODE Chromatin Immunoprecipitation", 8.3)' | hgsql hg17
    echo 'INSERT INTO grp (name, label, priority) VALUES ("encodeChrom", "ENCODE Chromosome, Chromatin and DNA Structure", 8.4)' | hgsql hg17
    echo 'INSERT INTO grp (name, label, priority) VALUES ("encodeCompGeno", "ENCODE Comparative Genomics", 8.5)' | hgsql hg17
    echo 'INSERT INTO grp (name, label, priority) VALUES ("encodeVariation", "ENCODE Variation", 8.6)' | hgsql hg17
    echo 'INSERT INTO grp (name, label, priority) VALUES ("encodeAnalysis", "ENCODE Analysis", 8.9)' | hgsql hg17

# MAKE CHROMINFO TABLE WITH (TEMPORARILY UNMASKED) NIBS
#    (DONE - 2004-05-21 - Hiram)
#    RE-DONE with new NIBS - 2004-06-03
# Make nib/, unmasked until the RepeatMasker and TRF steps are done.
# Do this now so that the chromInfo table will exist and thus the
# trackDb tables can be built in the next step.  These unmasked nibs
# will be replaced by the masked nibs after repeat mask and trf are done.
    ssh eieio
    cd /cluster/data/hg17
#   Make chr*.fa from contig .fa
#   Copied chrFa.sh from hg16/jkStuff, renamed it to chrFa.csh
    time ./jkStuff/chrFa.csh
#   real    13m24.710s
#   user    9m0.360s
#   sys     1m15.820s
    mkdir nib
    foreach c (`cat chrom.lst`)
        foreach f ($c/chr${c}{,_random}.fa)
            if (-e $f) then
                echo "nibbing $f"
                /cluster/bin/i386/faToNib $f nib/$f:t:r.nib
            endif
        end
    end

# Make symbolic links from /gbdb/hg17/nib to the real nibs.
    ssh hgwdev
    mkdir -p /gbdb/hg17/nib
    ln -s /cluster/data/hg17/nib/chr*.nib /gbdb/hg17/nib
# Load /gbdb/hg17/nib paths into the database and save size info.
    cd /cluster/data/hg17
    hgsql hg17 < $HOME/kent/src/hg/lib/chromInfo.sql
    hgNibSeq -preMadeNib hg17 /gbdb/hg17/nib */chr*.fa
    hgsql -N -e "select chrom,size from chromInfo order by chrom" hg17 \
        > chrom.sizes
#   You can compare this chrom.sizes with the previously created
#   chrom_sizes.  There should be no difference:
    sort chrom_sizes > s0
    sort chrom.sizes | grep -v random > s1
    diff s0 s1
    rm s0 s1

# MAKE HGCENTRALTEST ENTRY AND TRACKDB TABLE (DONE - 2004-05-21 - Hiram)
#    dbDb orderKey updated 2004-06-08 - Hiram
    ssh hgwdev
#   reset dbDb orderKey - these have never been ordered properly
#   before, this will get them on the program.
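#   Optional: a hedged convenience query (not part of the original
#   procedure) to list the current ordering before changing it:
    hgsql -h genome-testdb -N \
        -e 'select name, orderKey from dbDb order by orderKey' hgcentraltest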
    hgsql -e 'update dbDb set orderKey=11 where name = "hg16";' \
        -h genome-testdb hgcentraltest
    hgsql -e 'update dbDb set orderKey=12 where name = "hg15";' \
        -h genome-testdb hgcentraltest
    hgsql -e 'update dbDb set orderKey=13 where name = "hg13";' \
        -h genome-testdb hgcentraltest

#   Enter hg17 into hgcentraltest.dbDb so the test browser knows about it:
    hgsql -e 'INSERT INTO dbDb (name, description, nibPath, organism, \
        defaultPos, active, orderKey, genome, scientificName, \
        htmlPath, hgNearOk, hgPbOk, sourceName) \
        VALUES("hg17", "May 2004", "/gbdb/hg17/nib", "Human", \
        "chr4:56214201-56291736", 1, 10, "Human", "Homo sapiens", \
        "/gbdb/hg17/html/description.html", 0, 0, "NCBI Build 35");' \
        -h genome-testdb hgcentraltest

#   Make trackDb table so the browser knows what tracks to expect:
    cd ~/kent/src/hg/makeDb/trackDb
    cvs up -d -P .
#   Edit the makefile to add hg17 in all the right places and do:
    make update
    make alpha
    cvs commit makefile

# MAKE LIFTALL.LFT, NCBI.LFT (DONE - 2004-05-21 - Hiram)
#    Re-DONE with new randoms - 2004-06-03 - Hiram
    cd /cluster/data/hg17
    mkdir -p jkStuff
    cat */lift/{ordered,random}.lft > jkStuff/liftAll.lft
#   Create jkStuff/ncbi.lft for lifting stuff built with the NCBI
#   assembly.  Note: this ncbi.lift will not lift floating contigs to
#   chr_random coords, but it will show the strand orientation of the
#   floating contigs (grep for '|').
#    mdToNcbiLift seq_contig.md jkStuff/ncbi.lft
#   XXXX - appears to be unused, not done - Hiram

# REPEAT MASKING (DONE - 2004-05-24 - Hiram)
#    The randoms were rearranged after this was first done;
#    they are re-made below (2004-06-02)
# Record the RM version here:
#    RepBase Update 8.12, RM database version 20040130
# as this changes over time and there is no record in the results.

#- Split contigs, run RepeatMasker, lift results.
#  This split takes about 8 minutes.
    ssh eieio
    cd /cluster/data/hg17
    foreach chrom ( `cat chrom.lst` )
        foreach c ( $chrom/N{C,G,T}_?????? )
            set contig = $c:t
            echo "splitting ${chrom}/${contig}/${contig}.fa"
            faSplit size ${chrom}/${contig}/$contig.fa 500000 \
                ${chrom}/${contig}/${contig}_ \
                -lift=${chrom}/${contig}/$contig.lft -maxN=500000
        end
    end

#- Make the run directory and job list:
    cd /cluster/data/hg17
    mkdir -p jkStuff
#   According to the RepeatMasker help file, no arguments are required
#   to specify species because its default is set for primate (human).
#   This run script saves the .tbl file to be sent to Arian.  He uses
#   those for his analysis.  Sometimes he needs the .cat and .align
#   files for checking problems.  Krish needs the .align files; they
#   are large.
    cat << '_EOF_' > jkStuff/RMHuman
#!/bin/csh -fe

cd $1
pushd .
/bin/mkdir -p /tmp/hg17/$2
/bin/cp $2 /tmp/hg17/$2/
cd /tmp/hg17/$2
/cluster/bluearc/RepeatMasker/RepeatMasker -ali -s $2
popd
/bin/cp /tmp/hg17/$2/$2.out ./
if (-e /tmp/hg17/$2/$2.align) /bin/cp /tmp/hg17/$2/$2.align ./
if (-e /tmp/hg17/$2/$2.tbl) /bin/cp /tmp/hg17/$2/$2.tbl ./
# if (-e /tmp/hg17/$2/$2.cat) /bin/cp /tmp/hg17/$2/$2.cat ./
/bin/rm -fr /tmp/hg17/$2/*
/bin/rmdir --ignore-fail-on-non-empty /tmp/hg17/$2
/bin/rmdir --ignore-fail-on-non-empty /tmp/hg17
'_EOF_'
    # << this line makes emacs coloring happy
    chmod +x jkStuff/RMHuman

    ssh eieio
    cd /cluster/data/hg17
    mkdir RMRun
    rm -f RMRun/RMJobs
    touch RMRun/RMJobs
    foreach d ( `cat chrom.lst` )
        foreach c ( ${d}/N{C,G,T}_*/N{C,G,T}_*_*.fa )
            set f = $c:t
            set cc = $c:h
            set contig = $cc:t
            echo /cluster/store5/gs.18/build35/jkStuff/RMHuman \
                /cluster/store5/gs.18/build35/${d}/${contig} $f \
                '{'check out line+ /cluster/store5/gs.18/build35/${d}/${contig}/$f.out'}' \
                >> RMRun/RMJobs
        end
    end
#   We have 5970 jobs in RMJobs:
    wc RMRun/RMJobs
#   5970   41790 1105804 RMRun/RMJobs

#- Do the run
    ssh kk
    cd /cluster/data/hg17/RMRun
    para create RMJobs
    para try, para check, para check, para push, para check, ...
#- While that is running, you can run TRF (simpleRepeat) on the small
#  cluster.  See the SIMPLE REPEAT section below.
# Completed: 5970 of 5970 jobs
# CPU time in finished jobs:  45189516s 753158.60m 12552.64h 523.03d  1.433 y
# IO & Wait Time:               141333s   2355.55m    39.26h   1.64d  0.004 y
# Average job time:               7593s    126.55m     2.11h   0.09d
# Longest job:                   10268s    171.13m     2.85h   0.12d
# Submission to last job:        81484s   1358.07m    22.63h   0.94d

#- Lift up the split-contig .out's to contig-level .out's
#
#  If a mistake is made in the following it would be possible to
#  destroy all the RM output.  So, just to be paranoid, save all
#  the RM output in bluearc for the time being:
    ssh eieio
    cd /cluster/data/hg17
    mkdir /cluster/bluearc/hg17/RMOutput
    foreach c ( `cat chrom.lst` )
        foreach d ( ${c}/N{C,G,T}_* )
            set T = /cluster/bluearc/hg17/RMOutput/${d}
            mkdir -p ${T}
            cd ${d}
            set contig = $d:t
            cp -p ${contig}_?{,?,??}.fa.out ${T}
            cd ../..
            echo "${d} done"
        end
    end
#   Make sure we got them all:
#   (this doesn't work later since there are more *.fa.out files
#   after the lifting; to find just these more explicitly:
#   find . -name "N?_*_*.fa.out" -print | wc -l)
    find . -name "*.fa.out" -print | wc -l
#   5970
    find /cluster/bluearc/hg17/RMOutput -type f | wc -l
#   5970  -  same count

#   OK, now you can try this operation.  Do it in a script like this
#   and save the output of the script for a record of what happened.
    cat << '_EOF_' > jkStuff/liftRM.csh
#!/bin/csh -fe
foreach c ( `cat chrom.lst` )
    foreach d ( ${c}/N{C,G,T}_* )
        cd $d
        set contig = $d:t
        liftUp $contig.fa.out $contig.lft warn ${contig}_?{,?,??}.fa.out
        cd ../..
    end
end
'_EOF_'
    # << this line makes emacs coloring happy
    chmod +x jkStuff/liftRM.csh
    mkdir scriptsOutput
    time jkStuff/liftRM.csh > scriptsOutput/liftRM.1 2>&1
#   real    4m37.572s
#   user    1m19.130s
#   sys     0m32.950s
#   Check that they all were done:
    grep "fa.out" scriptsOutput/liftRM.1 | wc -l
#   5959  -  same count as above
#- Lift up RepeatMask .out files to chromosome coordinates:
#  picked up jkStuff/liftOut2.sh from the hg16 build, renamed it to
#  liftOut2.csh, changed the line that does the chrom listing
    time ./jkStuff/liftOut2.csh > scriptsOutput/liftOut2 2>&1
#   real    9m46.780s
#   user    1m18.900s
#   sys     7m33.990s

#- By this point, the database should have been created (above):
    ssh hgwdev
    cd /cluster/data/hg17
    time hgLoadOut hg17 ?/*.fa.out ??/*.fa.out 6_hla_hap?/*.fa.out > \
        scriptsOutput/hgLoadOut 2>&1
#   real    5m59.137s
#   user    1m47.550s
#   sys     0m15.410s
#   errors during this load: (there are always a couple of these)
#   Strange perc. field -6.1 line 243543 of 2/chr2.fa.out
#   Strange perc. field -5.6 line 243543 of 2/chr2.fa.out
#   Strange perc. field -6.1 line 243545 of 2/chr2.fa.out
#   Strange perc. field -5.6 line 243545 of 2/chr2.fa.out
#   Strange perc. field -0.2 line 30322 of 17/chr17.fa.out
#   Strange perc. field -0.2 line 30324 of 17/chr17.fa.out
#   Strange perc. field -0.2 line 30326 of 17/chr17.fa.out
#   Strange perc. field -0.2 line 30328 of 17/chr17.fa.out
#   Strange perc. field -18.6 line 77034 of 19/chr19.fa.out

#   Verify we have similar results to the previous assembly:
#   featureBits hg17 rmsk
#   1391378842 bases of 2867328468 (48.525%) in intersection
#   featureBits hg16 rmsk
#   1388770568 bases of 2865248791 (48.469%) in intersection

# Now proceed to MASK SEQUENCE BOTH REPEATMASKER AND SIMPLE REPEAT/TRF,
# following the SIMPLE REPEAT sections below.

# Re-Running REPEAT MASKER on the new Randoms (DONE - 2004-06-02 - Hiram)
    ssh eieio
    cd /cluster/data/hg17
    grep "|" seq_contig.md | awk '{print $2}' | sed -e "s#|#/#" > randoms.lst
    mkdir /cluster/data/hg17/RMRandoms
    foreach r ( `cat randoms.lst` )
        set d = $r:h
        set contig = $r:t
        foreach c ( ${r}/N{C,G,T}_*_*.fa )
            set f = $c:t
            echo /cluster/store5/gs.18/build35/jkStuff/RMHuman \
                /cluster/store5/gs.18/build35/${d}/${contig} $f \
                '{'check out line+ /cluster/store5/gs.18/build35/${d}/${contig}/$f.out'}' \
                >> RMRandoms/RMJobs
        end
    end

    ssh kk
    cd /cluster/data/hg17/RMRandoms
    para create RMJobs
    para try, para check, para check, para push, para check, ...
# Completed: 94 of 94 jobs
# CPU time in finished jobs:    221454s   3690.91m    61.52h   2.56d  0.007 y
# IO & Wait Time:                  866s     14.43m     0.24h   0.01d  0.000 y
# Average job time:               2365s     39.42m     0.66h   0.03d
# Longest job:                    9062s    151.03m     2.52h   0.10d
# Submission to last job:         9106s    151.77m     2.53h   0.11d

#   Continuing with the paranoia theme, let's backup all the RM output:
    ssh eieio
    cd /cluster/data/hg17
    mkdir /cluster/bluearc/hg17/RMRandoms
    foreach c ( `cat chrom.lst` )
        foreach d ( ${c}/N{C,G,T}_* )
            set T = /cluster/bluearc/hg17/RMRandoms/${d}
            mkdir -p ${T}
            cd ${d}
            set contig = $d:t
            cp -p ${contig}_?{,?,??}.fa.out ${T}
            cd ../..
            echo "${d} done"
        end
    end
#   Make sure we got them all:
    find . -name "N?_*_*.fa.out" -print | wc -l
#   5959
    find /cluster/bluearc/hg17/RMRandoms -type f | wc -l
#   5959  -  same count

    time jkStuff/liftRM.csh > scriptsOutput/liftRM2.1 2>&1
#   real    4m46.302s
#   user    1m18.260s
#   sys     0m18.000s
#   Check that they all were done:
    grep "fa.out" scriptsOutput/liftRM2.1 | wc -l
#   5959  -  same count as above
#- Lift up RepeatMask .out files to chromosome coordinates, using the
#  same liftOut2.csh as above:
    time ./jkStuff/liftOut2.csh > scriptsOutput/liftOut2.1 2>&1
#   real    2m46.347s
#   user    1m18.650s
#   sys     0m15.990s

#- By this point, the database should have been created (above):
    ssh hgwdev
    cd /cluster/data/hg17
    time hgLoadOut hg17 ?/*.fa.out ??/*.fa.out 6_hla_hap?/*.fa.out > \
        scriptsOutput/hgLoadOut 2>&1
#   errors during this load: the same set of "Strange perc. field"
#   warnings on chr2, chr17 and chr19 as recorded for the first run
#   above (there are always a couple of these)

#   Verify we have similar results to the previous assembly:
#   featureBits hg17 rmsk
#   1390952984 bases of 2866216770 (48.529%) in intersection
#   featureBits hg17 rmsk  (with previous randoms)
#   1391378842 bases of 2867328468 (48.525%) in intersection
#   featureBits hg16 rmsk
#   1388770568 bases of 2865248791 (48.469%) in intersection

# Now proceed to MASK SEQUENCE BOTH REPEATMASKER AND SIMPLE REPEAT/TRF,
# following the SIMPLE REPEAT sections below.

# SIMPLE REPEAT [TRF] TRACK (DONE - 2004-05-21 - Hiram)
#    Re-done with new randoms, 2004-06-02 - Hiram
# Copy the contigs, first to the bluearc, then to /iscratch/i
    ssh eieio
    mkdir /cluster/bluearc/hg17
    mkdir /cluster/bluearc/hg17/contigs
    cd /cluster/data/hg17
    foreach ctg ( `cat contig.lst` )
        set c = $ctg:t
        echo "$ctg > /cluster/bluearc/hg17/contigs/$c"
        cp -p $ctg /cluster/bluearc/hg17/contigs/$c
    end
#   Check how much is there:
    du -hsc /cluster/bluearc/hg17/contigs
#   2.8G    /cluster/bluearc/hg17/contigs

#   Distribute contigs to /iscratch/i
    ssh kkr1u00
    mkdir /iscratch/i/gs.18/build35/unmaskedContigs
    cd /iscratch/i/gs.18/build35/unmaskedContigs
    cp -p /cluster/bluearc/hg17/contigs/* .
#   Verify the same amount made it there:
    du -hsc /iscratch/i/gs.18/build35/unmaskedContigs
#   2.8G    /iscratch/i/gs.18/build35/unmaskedContigs
#   Then send them to the other 7 Iservers:
    /cluster/bin/iSync

#   Go to the small cluster for this business:
    ssh kki
    mkdir -p /cluster/data/hg17/bed/simpleRepeat
    cd /cluster/data/hg17/bed/simpleRepeat
    mkdir trf
    cat << '_EOF_' > runTrf
#!/bin/csh -fe

set path1 = $1
set inputFN = $1:t
set outpath = $2
set outputFN = $2:t
mkdir -p /tmp/$outputFN
cp $path1 /tmp/$outputFN
pushd .
cd /tmp/$outputFN
/cluster/bin/i386/trfBig -trf=/cluster/bin/i386/trf $inputFN /dev/null -bedAt=$outputFN -tempDir=/tmp
popd
rm -f $outpath
cp -p /tmp/$outputFN/$outputFN $outpath
rm -fr /tmp/$outputFN/*
rmdir --ignore-fail-on-non-empty /tmp/$outputFN
'_EOF_'
    # << this line makes emacs coloring happy
    chmod +x runTrf

    cat << '_EOF_' > gsub
#LOOP
./runTrf {check in line+ $(path1)} {check out line trf/$(root1).bed}
#ENDLOOP
'_EOF_'
    # << this line makes emacs coloring happy

    ls -1S /iscratch/i/gs.18/build35/unmaskedContigs/*.fa > genome.lst
    gensub2 genome.lst single gsub jobList
    para create jobList
    para try
    para check
    para push
    para check
# Completed: 380 of 380 jobs
# CPU time in finished jobs:     13230s    220.49m     3.67h   0.15d  0.000 y
# IO & Wait Time:                 2078s     34.64m     0.58h   0.02d  0.000 y
# Average job time:                 40s      0.67m     0.01h   0.00d
# Longest job:                    1590s     26.50m     0.44h   0.02d
# Submission to last job:         2504s     41.73m     0.70h   0.03d

    liftUp simpleRepeat.bed /cluster/data/hg17/jkStuff/liftAll.lft \
        warn trf/*.bed > lu.out 2>&1

#   Load into the database:
    ssh hgwdev
    cd /cluster/data/hg17/bed/simpleRepeat
    /cluster/bin/i386/hgLoadBed hg17 simpleRepeat simpleRepeat.bed \
        -sqlTable=$HOME/kent/src/hg/lib/simpleRepeat.sql
#   Loaded 629076 elements of size 16

#   Compare with the previous assembly:
    featureBits hg17 simpleRepeat
#   54952425 bases of 2866216770 (1.917%) in intersection
#   with previous randoms:
#   featureBits hg17 simpleRepeat
#   54964044 bases of 3096628158 (1.775%) in intersection
    featureBits hg16 simpleRepeat
#   54320136 bases of 2865248791 (1.896%) in intersection
#   GAPS weren't in hg17 yet at this point; after gaps were added:
#   featureBits hg17 simpleRepeat
#   54964044 bases of 2867328468 (1.917%) in intersection
#   featureBits -countGaps hg17 simpleRepeat
#   54964044 bases of 3096628158 (1.775%) in intersection

###########################################################################
# CREATE MICROSAT TRACK (done 2006-07-05 JK)
    ssh hgwdev
    cd /cluster/data/hg17/bed
    mkdir microsat
    cd microsat
    awk '($5==2 || $5==3) && $6 >= 15 && $8 == 100 && $9 == 0 {printf("%s\t%s\t%s\t%dx%s\n", $1, $2, $3, $6, $16);}' \
        ../simpleRepeat/simpleRepeat.bed > microsat.bed
    /cluster/bin/i386/hgLoadBed hg17 microsat microsat.bed

# PROCESS SIMPLE REPEATS INTO MASK (DONE - 2004-05-21 - Hiram)
#    re-done with new randoms - 2004-06-03 - Hiram
# After the simpleRepeat track has been built, make a filtered version
# of the trf output: keep trf's with period <= 12:
    ssh eieio
    cd /cluster/data/hg17/bed/simpleRepeat
    mkdir -p trfMask
    foreach f (trf/*.bed)
        awk '{if ($5 <= 12) print;}' $f > trfMask/$f:t
    end
#   EXPERIMENT: at a filter of <= 12, we have coverage:
#   20904399 bases of 2867328468 (0.729%) in intersection
#   at a filter of <= 9, we have coverage:
#   19271270 bases of 2867328468 (0.672%) in intersection

#   Lift up filtered trf output to chrom coords as well:
    cd /cluster/data/hg17
    mkdir bed/simpleRepeat/trfMaskChrom
    foreach c ( `cat chrom.lst` )
        if (-e $c/lift/ordered.lst) then
            perl -wpe 's@(\S+)@bed/simpleRepeat/trfMask/$1.bed@' \
                $c/lift/ordered.lst > $c/lift/oTrf.lst
            liftUp bed/simpleRepeat/trfMaskChrom/chr$c.bed \
                jkStuff/liftAll.lft warn `cat $c/lift/oTrf.lst`
        endif
        if (-e $c/lift/random.lst) then
            perl -wpe 's@(\S+)@bed/simpleRepeat/trfMask/$1.bed@' \
                $c/lift/random.lst > $c/lift/rTrf.lst
            liftUp bed/simpleRepeat/trfMaskChrom/chr${c}_random.bed \
                jkStuff/liftAll.lft warn `cat $c/lift/rTrf.lst`
        endif
    end
# MASK SEQUENCE BOTH REPEATMASKER AND SIMPLE REPEAT/TRF
#    (DONE - 2004-05-25 - Hiram)
#    re-done with new randoms - 2004-06-03 - Hiram
# This used to be done right after RepeatMasking.  Now we mask with
# TRF as well, so do this after the "PROCESS SIMPLE REPEATS" step
# above, and after RepeatMasker is complete.
    ssh eieio
    cd /cluster/data/hg17
#   copied these scripts from hg16 - reset the lines that make the
#   chrom list to work on, reset the wild cards that find all the
#   contig .fa's
#   Make chr*.fa from contig .fa
#   Copied chrFa.sh from hg16/jkStuff, renamed it to chrFa.csh
    time ./jkStuff/chrFa.csh > scriptsOutput/chrFa.out 2>&1 &
#   real    13m18.512s
#   user    9m1.670s
#   sys     1m7.290s

#- Soft-mask (lower-case) the contig and chr .fa's
    time ./jkStuff/makeFaMasked.csh > scriptsOutput/maFaMasked.out 2>&1
#   real    29m31.623s
#   user    13m49.700s
#   sys     5m58.750s
#- Make hard-masked .fa.masked files as well:
    time ./jkStuff/makeHardMasked.csh > scriptsOutput/maHardMasked.out 2>&1
#- Create the bothMasksNib/ directory
    time ./jkStuff/makeNib.csh > scriptsOutput/maNib.out 2>&1
#   real    14m41.694s
#   user    6m28.000s
#   sys     1m42.500s

# Make symbolic links from /gbdb/hg17/nib to the real nibs.
    ssh hgwdev
    cd /cluster/data/hg17
    mv nib nib.raw
    mv bothMasksNib nib
    rm /gbdb/hg17/nib/*.nib
    ln -s `pwd`/nib/* /gbdb/hg17/nib
# Load /gbdb/hg17/nib paths into the database and save size info.
    hgsql hg17 < ~/kent/src/hg/lib/chromInfo.sql
    hgNibSeq -preMadeNib hg17 /gbdb/hg17/nib */chr*.fa
#   3096628158 total bases
#   Should be the same size as before:
    hgsql -N -e "select chrom,size from chromInfo order by chrom" hg17 \
        > chrom.sizes.masked
    diff chrom.sizes chrom.sizes.masked
#   should be no output at all, thus:
    rm chrom.sizes.masked

# Copy the masked contig fa to /scratch and /iscratch,
# and everything else we will need for blastz runs, etc ...
# Best to do this sequence first to /cluster/bluearc/scratch,
# which is going to be the source for the /scratch copy,
# and then from there to the /iscratch.
# Make sure you are on the fileserver for the original source:
    ssh eieio
    mkdir -p /cluster/bluearc/scratch/hg/gs.18/build35
    cd /cluster/bluearc/scratch/hg/gs.18/build35
#   these copies take less than 2 minutes each
    mkdir bothMaskedNibs
    cp -p /cluster/data/hg17/nib/*.nib ./bothMaskedNibs
    mkdir maskedContigs
    foreach chrom ( `cat /cluster/data/hg17/chrom.lst` )
        cp -p /cluster/data/hg17/${chrom}/N{C,G,T}_*/N{C,G,T}_??????.fa \
            ./maskedContigs
        echo "done ${chrom}"
    end
#   make sure you have them all:
    ls maskedContigs | wc -l
#   380
    wc -l /cluster/data/hg17/contig.lst
#   380
    mkdir rmsk
    foreach chrom ( `cat /cluster/data/hg17/chrom.lst` )
        cp -p /cluster/data/hg17/${chrom}/*.out ./rmsk
        echo "done ${chrom}"
    end

#   Now, go to the destination for /iscratch and copy from the bluearc:
    ssh kkr1u00
    mkdir -p /iscratch/i/gs.18/build35
    cd /iscratch/i/gs.18/build35
#   This takes about 5 minutes
    rsync -arlv /cluster/bluearc/scratch/hg/gs.18/build35/ .
    time /cluster/bin/iSync
#   real    7m27.649s
#   request rsync of /cluster/bluearc/scratch to the KiloKluster /scratch

# LOAD ctgPos table - Contig position track (DONE - 2004-06-08 - Hiram)
#   After fixing up hgCtgPos to accept the -chromLst argument, simply:
    cd /cluster/data/hg17
    hgCtgPos -chromLst=chrom.lst hg17 .

# GOLD AND GAP TRACKS (DONE - 2004-05-21 - Hiram)
#    RE-DONE with new randoms - 2004-06-03 - Hiram
    ssh hgwdev
    cd /cluster/data/hg17
    hgGoldGapGl -noGl -chromLst=chrom.lst hg17 /cluster/data/hg17 .
#   Disappointing to see this create so many tables:
#   a _gap and a _gold table for each chrom.
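#   A hedged way to count just how many tables that was; the backslash
#   keeps LIKE's underscore wildcard literal:
    hgsql -N -e 'show tables like "%\_gold"' hg17 | wc -l
    hgsql -N -e 'show tables like "%\_gap"' hg17 | wc -l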
# Create the contig.gl files - XXX - NCBI doesn't deliver
# contig_overlaps.agp - 2004-06-18 - this is beginning to come
# together and there is now a contig_overlaps.agp file.
    cd /cluster/store5/gs.18/build35
#   combine the various psLayout attempts on different sections of clones
    ./combineContigOverlaps.sh
#   Turn contig_overlaps.agp into gl files
    ~hiram/bin/i386/agpToGl contig_overlaps.agp . -md=seq_contig.md
#   Create chromosome gl files (had to fix liftUp to do the NC_ properly)
    jkStuff/liftGl.csh contig.gl

#   Need to remove these PAR clone names from chrY.gl:
    rm -f /tmp/chrY.gl
    sed -e "/^AL954722.18/d; /^BX537334.4/d; /^BX000483.7/d; \
        /^BX908402.3/d; /^BX649635.3/d; /^BX119919.5/d; \
        /^AC079176.15/d; /^AC097314.27/d; /^AC006209.25/d; \
        /^AJ271735.1/d; /^AJ271736.1/d" Y/chrY.gl > /tmp/chrY.gl
    rm -f Y/chrY.gl
    mv /tmp/chrY.gl Y/chrY.gl

# After the contig.gl files have been made from contig_overlaps.agp:
# The sed fixes the Celera clones that are marked phase W - call that
# phase 3 instead.  Delete the Celera AACC clones; they are not in
# this assembly.  Fix the line of AC018743 to add it to the assembly -
# it was a last minute addition by Terry that didn't get carried into
# the NCBI sequence.inf file.  And remove the older versions of six
# clones that got left in by mistake at NCBI:
#   AC004491.1=AC004491.2  AC004921.1=AC004921.2  AC004983.2=AC004983.3
#   AC005088.2=AC005088.3  AC006014.2=AC006014.3  AC099654.4=AC099654.5
# And finally the grep selects only those things marked for_assembly.
    cd /cluster/data/hg17
    egrep "for_assembly|AC018743" /cluster/store5/gs.18/ncbi/sequence.inf | \
        sed -e "s/\tW\t/\t3\t/; /^AACC010000.*/d; /^AC004491.1.*/d; \
            /^AC004921.1.*/d; /^AC004983.2.*/d; /^AC005088.2.*/d; \
            /^AC006014.2.*/d; /^AC099654.4.*/d; \
            s/AC018743.27\t31791062\t466818\t1\tD\tUn\t-\tBCM\tRP11-289M22\tSIZE:2big/AC018743.27\t31791062\t466818\t1\t-\t(12)\t-\tBCM\tRP11-289M22\tfor_assembly/" \
        > sequence.inf

    cd /cluster/data/hg17
    hgGoldGapGl -chromLst=chrom.lst hg17 /cluster/store5/gs.18 build35
    $HOME/bin/i386/hgClonePos -chromLst=chrom.lst hg17 \
        /cluster/data/hg17 ./sequence.inf /cluster/store5/gs.18 -maxErr=3 \
        -maxWarn=2000 2> clone.pos.errors

# Extract all the PAR clones for chrX from clonePos, change the X to Y,
# fix up the coordinates on the last three, and load this data into the
# clonePos table in addition to what is there already.
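#   The chrY.par.clonePos list below was assembled by hand.  A hedged
#   sketch of how the bulk of it could be pulled from the chrX entries
#   (parClones.lst is a hypothetical file listing the PAR clone
#   accessions; the last three entries still need manual coordinate
#   fixups before loading):
#    hgsql -N -e 'select * from clonePos where chrom = "chrX"' hg17 | \
#        grep -f parClones.lst | sed -e "s/chrX/chrY/" > chrY.par.draft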
    cat << '_EOF_' > chrY.par.clonePos
BX640545.2	34821	3	chrY	0	34250	F
AL954722.18	37771	3	chrY	84821	122592	F
BX537334.4	36872	3	chrY	120592	157464	F
BX000483.7	15918	3	chrY	155466	171384	F
AL954664.17	39200	3	chrY	251384	290307	F
BX000476.5	33340	3	chrY	282188	315528	F
AL732314.18	218723	3	chrY	313528	532251	F
BX004827.18	119555	3	chrY	479050	600112	F
AL683871.15	175765	3	chrY	598112	773877	F
AL672311.26	115998	3	chrY	771877	887875	F
AL672277.20	131682	3	chrY	885875	1017557	F
BX908402.3	36556	3	chrY	1067557	1104113	F
BX649635.3	43709	3	chrY	1154113	1197822	F
BX649553.5	90286	3	chrY	1347822	1438108	F
BX296563.3	21008	3	chrY	1488108	1509117	F
BX119906.16	35666	3	chrY	1507116	1542782	F
AL683870.15	162377	3	chrY	1541782	1704175	F
AL691415.17	45085	3	chrY	1702175	1747265	F
AL683807.22	189825	3	chrY	1745260	1935086	F
AL672040.10	117297	3	chrY	1933086	2050383	F
BX004859.8	63432	3	chrY	2048380	2111815	F
BX119919.5	55442	3	chrY	2261815	2317257	F
AC079176.15	186278	3	chrY	2311674	2497952	F
AC097314.27	80501	3	chrY	2495948	2576449	F
AC006209.25	141759	3	chrY	2551122	2692881	F
AJ271735.1	240000	3	chrY	57302979	57543030	F
AJ271736.1	158661	3	chrY	57543030	57701691	F
'_EOF_'
    # << this line makes emacs coloring happy
    hgsql -e 'load data local infile "chrY.par.clonePos" into table clonePos;' hg17

#   We have the following errors:
#   Processing /cluster/data/hg17/Y/chrY.gl
#   Clone BX640545 is on chromosomes chrX and chrY.  Ignoring chrY
#   Clone AL954722 is on chromosomes chrX and chrY.  Ignoring chrY
#   ... etc for all the PAR clones
#   ... And there are an unknown number of these:
#   AB000359 is in ./sequence.inf but not in ooDir/*/*.gl
#   AB000360 is in ./sequence.inf but not in ooDir/*/*.gl

# gc5Base wiggle TRACK (DONE - 2004-05-22 - Hiram)
#    This previously was a script that ran through each nib.
#    Recently transformed into a mini cluster run.
#    Re-DONE with the new randoms - 2004-06-04
    ssh kki
    mkdir /cluster/data/hg17/bed/gc5Base
    cd /cluster/data/hg17/bed/gc5Base
    mkdir wigData5 dataLimits5 wigData5_1K dataLimits5_1K
    cat << '_EOF_' > kkRun.sh
#!/bin/sh

NIB=$1
chr=${NIB/.nib/}
chrom=${chr#chr}

hgGcPercent -chr=${chr} -doGaps -file=stdout -win=5 hg17 \
    /iscratch/i/gs.18/build35/bothMaskedNibs | \
    grep -w GC | \
    awk '{if (($3-$2) >= 5) {printf "%d\t%.1f\n", $2+1, $5/10.0} }' | \
    wigAsciiToBinary -dataSpan=5 -chrom=${chr} \
        -wibFile=wigData5/gc5Base_${chrom} \
        -name=${chrom} stdin 2> dataLimits5/${chr}
'_EOF_'
    # << this line makes emacs coloring happy
    chmod +x kkRun.sh
    ls /iscratch/i/gs.18/build35/bothMaskedNibs > nibList
    cat << '_EOF_' > gsub
#LOOP
./kkRun.sh $(path1)
#ENDLOOP
'_EOF_'
    # << this line makes emacs coloring happy
    gensub2 nibList single gsub jobList
    para create jobList
    para try, check, ... etc.
# Completed: 46 of 46 jobs
# CPU time in finished jobs:      5251s     87.51m     1.46h   0.06d  0.000 y
# IO & Wait Time:                  130s      2.17m     0.04h   0.00d  0.000 y
# Average job time:                117s      1.95m     0.03h   0.00d
# Longest job:                     413s      6.88m     0.11h   0.00d
# Submission to last job:          475s      7.92m     0.13h   0.01d

#   load the .wig files back on hgwdev:
    ssh hgwdev
    cd /cluster/data/hg17/bed/gc5Base
    hgLoadWiggle -pathPrefix=/gbdb/hg17/wib/gc5Base hg17 gc5Base wigData5/*.wig
#   and symlink the .wib files into /gbdb
    mkdir /gbdb/hg17/wib/gc5Base
    ln -s `pwd`/wigData5/*.wib /gbdb/hg17/wib/gc5Base

#   And then the zoomed data view:
    ssh kki
    cd /cluster/data/hg17/bed/gc5Base
    mkdir wigData5_1K dataLimits5_1K
    cat << '_EOF_' > kkRunZoom.sh
#!/bin/sh

NIB=$1
chr=${NIB/.nib/}
chrom=${chr#chr}

hgGcPercent -chr=${chr} -doGaps -file=stdout -win=5 hg17 \
    /iscratch/i/gs.18/build35/bothMaskedNibs | \
    grep -w GC | \
    awk '{if (($3-$2) >= 5) {printf "%d\t%.1f\n", $2+1, $5/10.0} }' | \
    wigZoom -dataSpan=1000 stdin | wigAsciiToBinary -dataSpan=1000 \
        -chrom=${chr} -wibFile=wigData5_1K/gc5Base_${chrom}_1K \
        -name=${chrom} stdin 2> dataLimits5_1K/${chr}
'_EOF_'
    # << this line makes emacs coloring happy
    chmod +x kkRunZoom.sh
    cat << '_EOF_' > gsubZoom
#LOOP
./kkRunZoom.sh $(path1)
#ENDLOOP
'_EOF_'
    # << this line makes emacs coloring happy
    gensub2 nibList single gsubZoom jobListZoom
    para create jobListZoom
    para try ... check ... etc ...
# Completed: 46 of 46 jobs
# CPU time in finished jobs:      5216s     86.93m     1.45h   0.06d  0.000 y
# IO & Wait Time:                   34s      0.57m     0.01h   0.00d  0.000 y
# Average job time:                114s      1.90m     0.03h   0.00d
# Longest job:                     415s      6.92m     0.12h   0.00d
# Submission to last job:          469s      7.82m     0.13h   0.01d

#   Then load these .wig files into the same database as above:
    ssh hgwdev
    hgLoadWiggle -pathPrefix=/gbdb/hg17/wib/gc5Base \
        -oldTable hg17 gc5Base wigData5_1K/*.wig
#   and symlink these .wib files into /gbdb
    mkdir -p /gbdb/hg17/wib/gc5Base
    ln -s `pwd`/wigData5_1K/*.wib /gbdb/hg17/wib/gc5Base

# AUTO UPDATE GENBANK MRNA RUN (DONE - 2004-06-08 - Hiram)
    ssh eieio
    cd /cluster/data/genbank
#   This is a new organism; edit the etc/genbank.conf file and add:
# hg17
hg17.genome = /scratch/hg/gs.18/build35/bothMaskedNibs/chr*.nib
hg17.lift = /cluster/store5/gs.18/build35/jkStuff/liftAll.lft
hg17.genbank.est.xeno.load = yes
hg17.mgcTables.default = full
hg17.mgcTables.mgc = all
hg17.downloadDir = hg17

#   Do the refseq's first; they are the quick ones:
    ssh eieio
    cd /cluster/data/genbank
    nice bin/gbAlignStep -srcDb=refseq -type=mrna -verbose=1 -initial hg17
#   logFile: var/build/logs/2004.05.25-13:41:07.hg17.initalign.log
#   Checking that log, or watching the batch on kk, you can find
#   where the batch is running, and after it is done, get the time:
    cd /cluster/store6/genbank/work/initial.hg17/align
    para time > time
    cat time
# Completed: 9500 of 9500 jobs
# CPU time in finished jobs:     62241s   1037.35m    17.29h   0.72d  0.002 y
# IO & Wait Time:                33719s    561.98m     9.37h   0.39d  0.001 y
# Average job time:                 10s      0.17m     0.00h   0.00d
# Longest job:                    1062s     17.70m     0.29h   0.01d
# Submission to last job:         1063s     17.72m     0.30h   0.01d

#   Load the results from the above:
    ssh hgwdev
    cd /cluster/data/genbank
    nice bin/gbDbLoadStep -verbose=1 -drop -initialLoad hg17
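#   A quick hedged sanity check that the refSeq load populated its
#   tables (table names assumed from the standard genbank process):
    hgsql -N -e 'select count(*) from refGene' hg17
    hgsql -N -e 'select count(*) from refSeqAli' hg17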
#   To get the genbank run started, the above results need to be moved
#   out of the way.  These things can be removed if there are no
#   problems to debug:
    ssh eieio
    cd /cluster/data/genbank/work
    mv initial.hg17 initial.hg17.refseq.mrna

    cd /cluster/data/genbank
    nice bin/gbAlignStep -srcDb=genbank -type=mrna -verbose=1 -initial hg17
#   logFile: var/build/logs/2004.06.04-10:47:21.hg17.initalign.log
#   One job was hung up; after killing it on its node, the batch
#   finished in a few minutes.
# Completed: 35720 of 35720 jobs
# CPU time in finished jobs:   5161424s  86023.74m  1433.73h  59.74d  0.164 y
# IO & Wait Time:               144149s   2402.48m    40.04h   1.67d  0.005 y
# Average job time:                149s      2.48m     0.04h   0.00d
# Longest job:                   18306s    305.10m     5.08h   0.21d
# Submission to last job:        35061s    584.35m     9.74h   0.41d

    ssh hgwdev
    cd /cluster/data/genbank
#   Some kind of error happened here; had to remove a lock file to
#   get this to proceed.  (The same thing happened again the second
#   time around.)
    nice bin/gbDbLoadStep -verbose=1 -drop -initialLoad hg17

    ssh eieio
    cd /cluster/data/genbank/work
    mv initial.hg17 initial.hg17.genbank.mrna

    cd /cluster/data/genbank
    nice bin/gbAlignStep -srcDb=genbank -type=est -verbose=1 -initial hg17
# Completed: 189240 of 189240 jobs
# CPU time in finished jobs:  97172120s 1619535.33m 26992.26h 1124.68d  3.081 y
# IO & Wait Time:              1507789s   25129.82m   418.83h   17.45d  0.048 y
# Average job time:                521s       8.69m     0.14h    0.01d
# Longest job:                   33165s     552.75m     9.21h    0.38d
# Submission to last job:       126988s    2116.47m    35.27h    1.47d

    ssh hgwdev
    cd /cluster/data/genbank
    time nice bin/gbDbLoadStep -verbose=1 -drop -initialLoad hg17
#   real    440m42.750s
#   user    69m7.810s
#   sys     23m18.640s
#   This is ~7.5 hours

#   If the above is all OK, ask Mark to put this assembly on the
#   daily updates.

# CPGISLANDS (DONE - 2004-05-25 - Hiram)
#    Re-DONE with new randoms - 2004-06-04 - Hiram
    ssh hgwdev
    mkdir -p /cluster/data/hg17/bed/cpgIsland
    cd /cluster/data/hg17/bed/cpgIsland
#   Build software from Asif Chinwalla (achinwal@watson.wustl.edu)
    cvs co hg3rdParty/cpgIslands
    cd hg3rdParty/cpgIslands
    make
#   gcc readseq.c cpg_lh.c -o cpglh.exe
    mv cpglh.exe /cluster/data/hg17/bed/cpgIsland/
#   cpglh.exe requires hard-masked (N) .fa's.
#   There may be warnings about "bad character" for IUPAC ambiguous
#   characters like R, S, etc.  Ignore the warnings.
    ssh eieio
    cd /cluster/data/hg17/bed/cpgIsland
    foreach f (../../*/chr*.fa.masked)
        set fout=$f:t:r:r.cpg
        echo running cpglh on $f to $fout
        ./cpglh.exe $f > $fout
    end
#   the warnings:
#   Bad char 0x52 = 'R' at line 2046, base 102229, sequence chr16_random
#   Bad char 0x4d = 'M' at line 1216113, base 60805573, sequence chr3
#   Bad char 0x52 = 'R' at line 1216118, base 60805801, sequence chr3
#   Bad char 0x52 = 'R' at line 1216118, base 60805801, sequence chr3
#   real    21m47.823s
#   user    18m30.810s
#   sys     1m13.420s

#   Transform cpglh output to bed +
    cat << '_EOF_' > filter.awk
# Input columns:
#   chrom, start, end, len, CpG: cpgNum, perGc, cpg:gpc, observed:expected
#   chr1\t 41776\t 42129\t 259\t CpG: 34\t 65.8\t 0.92\t 0.94
# Output columns:
#   chrom, start, end, name, length, cpgNum, gcNum, perCpg, perGc, obsExp
#   chr1\t41775\t42129\tCpG: 34\t354\t34\t233\t19.2\t65.8\t0.94
{
$2 = $2 - 1;
width = $3 - $2;
printf("%s\t%d\t%s\t%s %s\t%s\t%s\t%0.0f\t%0.1f\t%s\t%s\n",
    $1, $2, $3, $5, $6, width, $6, width*$7*0.01, 100.0*2*$6/width, $7, $9);
}
'_EOF_'
    # << this line makes emacs coloring happy
    awk -f filter.awk chr*.cpg > cpgIsland.bed

    ssh hgwdev
    cd /cluster/data/hg17/bed/cpgIsland
    hgLoadBed hg17 cpgIslandExt -tab -noBin \
        -sqlTable=$HOME/kent/src/hg/lib/cpgIslandExt.sql cpgIsland.bed
#   Reading cpgIsland.bed
#   Loaded 27801 elements of size 10
#   Sorted
#   Saving bed.tab
#   Loading hg17

# MAKE HGCENTRALTEST BLATSERVERS ENTRY (DONE - 2004-05-25 - Heather)
    ssh hgwdev
    hgsql -e 'INSERT INTO blatServers (db, host, port, isTrans) \
        VALUES("hg17", "blat12", "17778", "1"); \
        INSERT INTO blatServers (db, host, port, isTrans) \
        VALUES("hg17", "blat12", "17779", "0");' \
        -h genome-testdb hgcentraltest

# PREPARE CLUSTER FOR BLASTZ RUNS (DONE - 2004-05-26 - Hiram)
#    Re-DONE with new randoms - 2004-06-03 - Hiram
    ssh eieio
    mkdir /cluster/bluearc/scratch/hg/gs.18/build35/rmsk.spec
    cd /cluster/bluearc/scratch/hg/gs.18/build35/rmsk.spec
    ln -s ../rmsk/*.out .
#   This takes 40 minutes run as a script; to hurry it up, it has
#   been converted to a mini cluster run:
    cat << '_EOF_' > runArian.sh
#!/bin/sh

for FN in *.out
do
    echo /cluster/bluearc/RepeatMasker030619/DateRepsinRMoutput.pl \
        ${FN} -query human -comp rat -comp mouse
done
'_EOF_'
    # << emacs
    chmod +x runArian.sh

    ssh kki
    cd /cluster/bluearc/scratch/hg/gs.18/build35/rmsk.spec
    ./runArian.sh > jobList
    para create jobList
    para try, ... check ... push ... etc ...
# Completed: 46 of 46 jobs
# CPU time in finished jobs:       668s     11.14m     0.19h   0.01d  0.000 y
# IO & Wait Time:                  514s      8.56m     0.14h   0.01d  0.000 y
# Average job time:                 26s      0.43m     0.01h   0.00d
# Longest job:                      86s      1.43m     0.02h   0.00d
# Submission to last job:          108s      1.80m     0.03h   0.00d

#   Now extract each one; 1 = Rat, 2 = Mouse
    ssh eieio
    cd /cluster/bluearc/scratch/hg/gs.18/build35
    mkdir linSpecRep.notInRat linSpecRep.notInMouse
    foreach f (rmsk.spec/*.out_rat_mus)
        set base = $f:t:r:r
        echo "$f -> $base.out.spec"
        /cluster/bin/scripts/extractLinSpecReps 1 $f > \
            linSpecRep.notInRat/$base.out.spec
        /cluster/bin/scripts/extractLinSpecReps 2 $f > \
            linSpecRep.notInMouse/$base.out.spec
    end
#   There is actually no difference at all between these two results.
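#   Since the file names match in both directories, that claim can be
#   verified directly (a hedged check, not part of the original run):
    diff -r linSpecRep.notInRat linSpecRep.notInMouse && echo identical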
#   copy to iscratch
    ssh kkr1u00
    cd /iscratch/i/gs.18/build35
    rsync -arlv /cluster/bluearc/scratch/hg/gs.18/build35/ .
    /cluster/bin/iSync
#   request rsync of /cluster/bluearc/scratch to the KiloKluster /scratch

# COPY DATA TO GOLDEN PATH LOCATIONS (DONE - 2004-06-04 - Hiram)
    ssh hgwdev
    mkdir -p /usr/local/apache/htdocs/goldenPath/hg17/chromosomes
    cd /cluster/data/hg17
#   Beware, this backgrounding of the gzips can be hard on hgwdev.
#   You could wait until after the copy, then run one gzip to do them all.
    foreach chrom ( `cat chrom.lst` )
        cp -p ${chrom}/*.fa /usr/local/apache/htdocs/goldenPath/hg17/chromosomes
        gzip \
            /usr/local/apache/htdocs/goldenPath/hg17/chromosomes/chr${chrom}*.fa &
        echo "done ${chrom}"
    end
    cd /usr/local/apache/htdocs/goldenPath/hg17/chromosomes
    gzip *.fa

# FOSMID END PAIRS TRACK (2004-06-09 kate)
#    Corrected upper size limit to 50Kbp, reran pslPairs,
#    and reloaded (2004-07-15 kate)
# Use the latest fosmid ends data prepared by Terry Furey.
# He says there is no on-going work on fosmid ends, so this should
# suffice indefinitely ?  Move/link this stuff into the central data area.
    ssh eieio
    cd /cluster/data/ncbi
    mkdir -p fosends/human
    ln -s /cluster/store1/fosends.3 fosends/human
    cd fosends/human/fosends.3
    faSize fosEnds.fa
#   579735181 bases (369769 N's 579365412 real) in 1087670 sequences
#   580M bases in 1M sequences
#   create link in /gbdb/ncbi/fosends/human ?

#   use pre-split fosend files, and the associated list, for the
#   cluster run.  Sequences are in /cluster/bluearc/hg/fosEnds
    cp /cluster/bluearc/booch/fosends/fosEnds.lst /cluster/bluearc/hg/fosEnds

#   run on rack9 since the kilokluster is busy
    ssh kk9
    cd /cluster/data/hg17
    mkdir -p bed/fosends
    cd bed/fosends
    mkdir -p run
    cd run
    ls -1S /scratch/hg/gs.18/build35/maskedContigs/*.fa > contigs.lst
    cp /cluster/bluearc/hg/fosEnds/fosEnds.lst fosEnds.lst
#   380 contigs vs. 97 fosEnd files -> 40K jobs
#   send output to kksilo, as it can better handle the NFS load
    mkdir -p /cluster/store7/kate/hg17/fosends/out
    ln -s /cluster/store7/kate/hg17/fosends/out ../out
    cat > gsub << 'EOF'
#LOOP
/cluster/bin/i386/blat $(path1) $(path2) -ooc=/scratch/hg/h/11.ooc {check out line+ /cluster/data/hg17/bed/fosends/out/$(root2)/$(root1).$(root2).psl}
#ENDLOOP
'EOF'
    # << emacs
    gensub2 contigs.lst fosEnds.lst gsub jobList
    foreach f (`cat fosEnds.lst`)
        set d = $f:r:t
        echo $d
        mkdir -p /cluster/data/hg17/bed/fosends/out/$d
    end
    para create jobList
#   36860 jobs
    para try
    para check
    para push
# CPU time in finished jobs:   1655943s  27599.05m   459.98h  19.17d  0.053 y
# IO & Wait Time:               101145s   1685.75m    28.10h   1.17d  0.003 y
# Average job time:                 48s      0.79m     0.01h   0.00d
# Longest job:                    1294s     21.57m     0.36h   0.01d
# Submission to last job:        19269s    321.15m     5.35h   0.22d

#   sort, filter, and lift alignments
    ssh eieio
    cd /cluster/data/hg17/bed/fosends
    pslSort dirs raw.psl temp out/fosEnds*
    pslReps -nearTop=0.01 -minCover=0.70 -minAli=0.85 -noIntrons raw.psl \
        fosEnds.psl /dev/null
#   Processed 84096767 alignments
#   cleanup
    rm -r temp
    rm raw.psl
    rm -fr out /cluster/store7/kate/hg17/fosends
    mkdir lifted
    liftUp lifted/fosEnds.lifted.psl \
        /cluster/data/hg17/jkStuff/liftAll.lft warn fosEnds.psl
    pslSort dirs fosEnds.sorted.psl temp lifted
    rmdir temp
    wc -l *.sorted.psl
#   1693693 fosEnds.sorted.psl
    set ncbiDir = /cluster/data/ncbi/fosends/human/fosends.3
    ~/bin/i386/pslPairs -tInsert=5000 -minId=0.94 -noBin -min=30000 \
        -max=50000 -slop -short -long -orphan -mismatch -verbose \
        fosEnds.sorted.psl $ncbiDir/fosEnds.pairs all_fosends fosEnds

#   create header required by "rdb" tools
#   TODO: replace w/ awk & sort
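#   A hedged sketch of the awk/sort replacement the TODO suggests,
#   assuming fosEnds.pairs is headerless and tab-separated with the
#   score in column 5; it skips the rdb header entirely:
#    awk -F'\t' '$5 >= 300' fosEnds.pairs | sort -k1,1 -k2,2n > fosEndPairs.bed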
    echo 'chr\tstart\tend\tclone\tscore\tstrand\tall\tfeatures\tstarts\tsizes' > header
    echo '10\t10N\t10N\t10\t10N\t10\t10\t10N\t10\t10' >> header
    cat header fosEnds.pairs | row score ge 300 | sorttbl chr start | \
        headchg -del > fosEndPairs.bed
    cat header fosEnds.slop fosEnds.short fosEnds.long fosEnds.mismatch \
        fosEnds.orphan | \
        row score ge 300 | sorttbl chr start | headchg -del > fosEndPairsBad.bed
    extractPslLoad -noBin fosEnds.sorted.psl fosEndPairs.bed \
        fosEndPairsBad.bed | \
        sorttbl tname tstart | headchg -del > fosEnds.load.psl

#   load into database
    ssh hgwdev
    cd /cluster/data/hg17/bed/fosends
    hgLoadBed hg17 fosEndPairs fosEndPairs.bed \
        -sqlTable=/cluster/home/kate/kent/src/hg/lib/fosEndPairs.sql
#   Loaded 384558 elements
#   note - this track isn't pushed to RR, just used for assembly QA
    hgLoadBed hg17 fosEndPairsBad fosEndPairsBad.bed \
        -sqlTable=/cluster/home/kate/kent/src/hg/lib/fosEndPairsBad.sql
#   Loaded 30830 elements
#    hgLoadPsl hg17 -nobin -table=all_fosends fosEnds.load.psl
#   NOTE: truncates file to 0 if -nobin is used
    hgLoadPsl hg17 -table=all_fosends fosEnds.load.psl
#   load of all_fosends did not go as planned: 1526991 record(s),
#   0 row(s) skipped, 156 warning(s) loading psl.tab

#   load sequences
    mkdir -p /gbdb/hg17/fosends
    ln -s /cluster/data/ncbi/fosends/human/fosends.3/fosEnds.fa \
        /gbdb/hg17/fosends/fosEnds.fa
    hgLoadSeq hg17 /gbdb/hg17/fosends/fosEnds.fa
#   1087670 sequences
#   NOTE: extFile ID is 832625 (shouldn't be so large ??)
#   may want to reset this.

# BAC END PAIRS TRACK (DONE - 2004-06-09 kate)
#    Re-ran pslPairs with updated pairs file (2004-10-04 booch)
# Use the latest BAC ends data from NCBI.
# Checked ftp.ncbi.nih.gov/genomes/BACENDS/homo_sapiens,
# and files were unchanged from Terry's last download
# (to /cluster/store1/bacends.4).
# Link this stuff into the central data area.
    ssh eieio
    cd /cluster/data/ncbi
    mkdir -p bacends/human
    ln -s /cluster/store1/bacends.4 bacends/human
    cd bacends/human/bacends.4
    faSize BACends.fa
#   400230494 bases (2743171 N's 397487323 real) in 832614 sequences
#   400M bases in 800K sequences

#   use pre-split bacends files, and the associated list, for the
#   cluster run
    ssh kk
    cd /cluster/data/hg17
    mkdir -p bed/bacends
    cd bed/bacends
    mkdir run
    cd run
    ls -1S /scratch/hg/gs.18/build35/maskedContigs/*.fa > contigs.lst
    ls -1S /cluster/bluearc/hg/bacEnds/hs/*.fa > bacends.lst
#   380 contigs vs. 98 bacends files -> 40K jobs
#   send output to kksilo, as it can better handle the NFS load
#   (these are quick jobs)
    mkdir -p /cluster/store7/kate/hg17/bacends/out
    ln -s /cluster/store7/kate/hg17/bacends/out ../out
    cat > gsub << 'EOF'
#LOOP
/cluster/bin/i386/blat $(path1) $(path2) -ooc=/scratch/hg/h/11.ooc {check out line+ /cluster/data/hg17/bed/bacends/out/$(root2)/$(root1).$(root2).psl}
#ENDLOOP
'EOF'
    # << emacs
    gensub2 contigs.lst bacends.lst gsub jobList
    foreach f (`cat bacends.lst`)
        set d = $f:r:t
        echo $d
        mkdir -p /cluster/data/hg17/bed/bacends/out/$d
    end
    para create jobList
#   37240 jobs written to batch
    para try
    para check
    para push
# CPU time in finished jobs:   1573932s  26232.19m   437.20h  18.22d  0.050 y
# IO & Wait Time:               122751s   2045.86m    34.10h   1.42d  0.004 y
# Average job time:                 46s      0.76m     0.01h   0.00d
# Longest job:                    3312s     55.20m     0.92h   0.04d
# Submission to last job:         7148s    119.13m     1.99h   0.08d

    cd ../out/BACends000
    pslCheck *.psl
#   Error: invalid PSL: AZ519021:1-575 NT_004559:1306426-1608347 - NT_004559.BACends000.psl:1101
#   AZ519021 query block 3 start 283 < previous block end 575
#   NOTE: inquired with JK regarding these results

#   lift alignments
    ssh eieio
    cd /cluster/data/hg17/bed/bacends
    pslSort dirs raw.psl temp out/BACends*
#   takes hours ?
#   37240 files in 98 dirs
#   Got 37240 files 193 files per mid file
    pslReps -nearTop=0.02 -minCover=0.60 -minAli=0.85 -noIntrons \
        raw.psl bacEnds.psl /dev/null
#   Processed 52291246 alignments
    mkdir lifted
    liftUp lifted/bacEnds.lifted.psl \
        /cluster/data/hg17/jkStuff/liftAll.lft warn bacEnds.psl
    pslSort dirs bacEnds.sorted.psl temp lifted
#   cleanup
    rmdir temp
    rm -fr out /cluster/store7/kate/hg17/bacends
    wc -l *.sorted.psl
#   2497227 bacEnds.sorted.psl
    set ncbiDir = /cluster/data/ncbi/bacends/human/bacends.4
    ~/bin/i386/pslPairs -tInsert=10000 -minId=0.91 -noBin -min=25000 \
        -max=350000 -slopval=10000 -hardMax=500000 -slop -short -long \
        -orphan -mismatch -verbose bacEnds.sorted.psl \
        $ncbiDir/bacEndPairs.txt all_bacends bacEnds

#   create header required by "rdb" tools
#   TODO: replace w/ awk & sort (see the fosmid sketch above)
    echo 'chr\tstart\tend\tclone\tscore\tstrand\tall\tfeatures\tstarts\tsizes' > header
    echo '10\t10N\t10N\t10\t10N\t10\t10\t10N\t10\t10' >> header
    cat header bacEnds.pairs | row score ge 300 | sorttbl chr start | \
        headchg -del > bacEndPairs.bed
    cat header bacEnds.slop bacEnds.short bacEnds.long bacEnds.mismatch \
        bacEnds.orphan | \
        row score ge 300 | sorttbl chr start | headchg -del > bacEndPairsBad.bed
    extractPslLoad -noBin bacEnds.sorted.psl bacEndPairs.bed \
        bacEndPairsBad.bed | \
        sorttbl tname tstart | headchg -del > bacEnds.load.psl

#   load into database
    ssh hgwdev
    cd /cluster/data/hg17/bed/bacends
    hgLoadBed hg17 bacEndPairs bacEndPairs.bed \
        -sqlTable=/cluster/home/kate/kent/src/hg/lib/bacEndPairs.sql
#   Loaded 201380
#   note - this track isn't pushed to RR, just used for assembly QA
    hgLoadBed hg17 bacEndPairsBad bacEndPairsBad.bed \
        -sqlTable=/cluster/home/kate/kent/src/hg/lib/bacEndPairsBad.sql
#   Loaded 81773
#    hgLoadPsl hg17 -nobin -table=all_bacends bacEnds.load.psl
#   NOTE: truncates file to 0 if -nobin is used
    hgLoadPsl hg17 -table=all_bacends bacEnds.load.psl
#   load of all_bacends did not go as planned: 441072 record(s),
#   0 row(s) skipped, 30 warning(s) loading psl.tab
#   Reloaded table, 2004-07-21, and got more rows:
#   load of all_bacends did not go as planned: 1698790 record(s),
#   0 row(s) skipped, 63 warning(s) loading psl.tab

#   load BAC end sequences
    mkdir -p /gbdb/hg17/bacends
    ln -s /cluster/data/ncbi/bacends/human/bacends.4/BACends.fa \
        /gbdb/hg17/bacends/BACends.fa
    hgLoadSeq hg17 /gbdb/hg17/bacends/BACends.fa
#   158588 sequences

#   Re-ran pslPairs with an updated pairs file to take advantage of a
#   new feature allowing comma separated lists of end accessions for
#   each end of a clone.
#   First, create the new pairs files (bacEndPairs.txt, bacEndSingles.txt):
    mkdir /cluster/data/ncbi/bacends/human/bacends.5
    cd /cluster/data/ncbi/bacends/human/bacends.5
    cp ../bacends.4/cl_ac_gi_len .
    /cluster/bin/scripts/convertEndPairInfo cl_ac_gi_len

#   Next, re-create the bed file
    mkdir /cluster/data/hg17/bed/bacends.update
    cd /cluster/data/hg17/bed/bacends.update
    ln -s /cluster/data/hg17/bed/bacends/bacEnds.sorted.psl ./bacEnds.sorted.psl
    set ncbiDir = /cluster/data/ncbi/bacends/human/bacends.5
    ~/bin/i386/pslPairs -tInsert=10000 -minId=0.91 -noBin -min=25000 \
        -max=350000 -slopval=10000 -hardMax=500000 -slop -short -long \
        -orphan -mismatch -verbose bacEnds.sorted.psl \
        $ncbiDir/bacEndPairs.txt all_bacends bacEnds

#   create header required by "rdb" tools
#   TODO: replace w/ awk & sort
    echo 'chr\tstart\tend\tclone\tscore\tstrand\tall\tfeatures\tstarts\tsizes' > header
    echo '10\t10N\t10N\t10\t10N\t10\t10\t10N\t10\t10' >> header
    cat header bacEnds.pairs | row score ge 300 | sorttbl chr start | \
        headchg -del > bacEndPairs.bed
    cat header bacEnds.slop bacEnds.short bacEnds.long bacEnds.mismatch \
        bacEnds.orphan | \
        row score ge 300 | sorttbl chr start | headchg -del > bacEndPairsBad.bed
    wc *.bed
#   204884 2253724 20612402 bacEndPairs.bed
#    79401  873411  6527559 bacEndPairsBad.bed
#   previous:
#   wc ../bacends/*.bed
#   201380 2215180 20280578 ../bacends/bacEndPairs.bed
#    81773  899503  6712402 ../bacends/bacEndPairsBad.bed
    extractPslLoad -noBin bacEnds.sorted.psl bacEndPairs.bed \
        bacEndPairsBad.bed | \
        sorttbl tname tstart | headchg -del > bacEnds.load.psl

#   load into database
    ssh hgwdev
    cd /cluster/data/hg17/bed/bacends.update
    hgLoadBed hg17 bacEndPairs bacEndPairs.bed \
        -sqlTable=/cluster/home/kate/kent/src/hg/lib/bacEndPairs.sql
#   Loaded 204884
#   note - this track isn't pushed to RR, just used for assembly QA
    hgLoadBed hg17 bacEndPairsBad bacEndPairsBad.bed \
        -sqlTable=/cluster/home/kate/kent/src/hg/lib/bacEndPairsBad.sql
#   Loaded 79401
#    hgLoadPsl hg17 -nobin -table=all_bacends bacEnds.load.psl
#   NOTE: truncates file to 0 if -nobin is used
    hgLoadPsl hg17 -table=all_bacends bacEnds.load.psl
#   load of all_bacends did not go as planned: 1729146 record(s),
#   0 row(s) skipped, 70 warning(s) loading psl.tab

# PLACE ASSEMBLY CLONES - misc instructions, only somewhat relevant.
#    See PLACE ASSEMBLY CLONES ON CONTIGS AND SEQUENCE below.

##### A second attempt at clone alignment #####
# Split the clones into 3K pieces into about 1000 fa files.  Example:
#   zcat Z99916.1.fa.gz Z99774.1.fa.gz Z99756.7.fa.gz | \
#       faSplit size stdin 3000 /tmp/name.fa -lift=/tmp/name.lft -oneFile
# Trying this idea in unPlacedBatch
    ssh kk
    mkdir /cluster/data/hg17/bed/contig_overlaps/unPlacedBatch
    cd /cluster/data/hg17/bed/contig_overlaps/unPlacedBatch
    ls -1S /scratch/hg/gs.18/build35/bothMaskedNibs > nibList
    ls -1S /cluster/data/hg17/bed/contig_overlaps/blatClones > cloneList
    cat << '_EOF_' > gsub
#LOOP
/cluster/bin/i386/blat -fastMap -ooc=/scratch/hg/h/11.ooc -q=dna -t=dna {check in exists /scratch/hg/gs.18/build35/bothMaskedNibs/$(path1)} {check in exists+ /cluster/data/hg17/bed/contig_overlaps/blatClones/$(path2)} {check out line+ psl/$(root1)/$(root2).psl}
#ENDLOOP
'_EOF_'
    # << this line makes emacs coloring happy
    mkdir psl
    cat nibList | sed -e "s/.nib//" | while read D
    do
        mkdir psl/$D
    done
    gensub2 nibList cloneList gsub jobList
    para create jobList

# PLACE ASSEMBLY CLONES ON CONTIGS AND SEQUENCE (DONE - 2004-07-12 - Hiram)
    ssh eieio
    mkdir /cluster/data/hg17/bed/contig_overlaps
    cd /cluster/data/hg17/bed/contig_overlaps
#   find all the clones that were used in the assembly
    sed -e "/^#.*/d" /cluster/data/hg17/ncbi_build35.agp | \
        awk '{if (!match($5,"N")) {print $6}}' | \
        sort -u > placed_in_assembly.list
    wc -l placed_in_assembly.list
# These may be available from the phases files at:
#	ftp://ftp.ncbi.nih.gov/genbank/genomes/H_sapiens
# Which are easily fetched with wget. However I took a look
# at those and could not find all the clones in them. There may
# be a versioning problem because these phases files are often
# updated.
# Fetch them from Genbank with the following three Perl scripts:
# [hiram@hgwdev /cluster/data/hg17/bed/contig_overlaps] ls -og *.pl
# -rwxrwxr-x 1 3047 May 24 18:43 bioPerlFetch.pl
# -rwxrwxr-x 1 2370 Jun 4 15:21 fetchGenbank.pl
# -rwxrwxr-x 1 700 May 24 21:47 foldEm.pl
# Which takes about 4 days ...
# Example,
cat << '_EOF_' > terrys.list
AC011841.7
AC018692.9
AC018743.27
AC037482.14
AL163540.11
'_EOF_'
# << this line makes emacs coloring happy
# only works on hgwdev
ssh hgwdev
cd /cluster/data/hg17/bed/contig_overlaps
mkdir fasta
time ./fetchGenbank.pl terrys.list > fetchResult.out 2>&1
# There is a bit of behind-the-scenes hocus pocus going on here.
# This is a tedious task of comparing various lists with each
# other and making sure everything matches. Manual fixups are
# done for the newly named 6_hla_hap* items, and copies of the PAR
# business were duplicated so that X and Y both have the same set
# of clones for that. The end result should be a directory hierarchy
# here with a directory for each chrom, each random, and the 6_hla_hap?
# items, where each directory contains the clones that belong to that
# chromosome. The leftovers are the unplaced clones, which end up
# in the directory called: unPlaced. The instructions here are
# merely a guideline of possibilities. Care should be taken to
# make sure all listings are correct and everything gets in the
# right place.
ssh eieio
# And then make a list of all clones considered for assembly:
sed -e "/^#.*/d" /cluster/store5/gs.18/ncbi/sequence.inf | \
    grep for_assembly | awk '{print $1}' | sort -u > sequence.list
wc -l sequence.list
# 46733 sequence.list
# Verify overlaps are correct:
comm -12 placed_in_assembly.list sequence.list > inBoth
comm -23 placed_in_assembly.list sequence.list > inAssemblyNotSequence
comm -13 placed_in_assembly.list sequence.list > inSequenceNotAssembly
wc in*
# 1 1 12 inAssemblyNotSequence
# 26871 26871 301709 inBoth
# 19862 19862 219050 inSequenceNotAssembly
# 46734 46734 520771 total
# This stray one is from Terry's five additions in the final fixup
# phase with Greg:
cat inAssemblyNotSequence
# AC018743.27
# Terry added: AC011841.7 AC018692.9 AC018743.27 AC037482.14 AL163540.11
#
# Generate a listing that relates clones to their contigs
sed -e "/^#.*/d" /cluster/store5/gs.18/build35/ncbi_build35.agp | \
    ./contigAcc.pl > disburseEm.list
#
# Using that list, sort the downloaded clones into their
# respective chrom directories (see the sketch below):
./disburse.sh
# Check the number of sequences obtained:
find ./? ./?? ./*_random ./6_hla* -type f | wc -l
# 26872
# So, why is this number one more than the inBoth list ?
# Because the official NCBI sequence.inf file is missing one of
# the clones that Terry added: AC018743.27
# And it shows up in our check list above as inAssemblyNotSequence
# It isn't exactly missing, it just isn't marked "for_assembly"
# OK, with everything in place, we are ready to try and find
# all these items in the assembly, running a Kluster job per chrom,
# matching the items that are supposed to be included in that chrom.
# We need to get things set up on the Iservers; psLayout is heavy
# into disk I/O and it brings everything down if allowed to work on
# any NFS filesystems for input.
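# NOTE (sketch): disburse.sh itself is not reproduced in this doc. Given
# the disburseEm.list columns used below (chrom, clone.fa.gz, contig) and
# the fasta/ download directory above, its core is presumably no more
# than the following -- a hypothetical reconstruction, not the real script:
grep -v "^#" disburseEm.list | while read CHROM CLONE CONTIG
do
    mkdir -p ${CHROM}
    mv fasta/${CLONE} ${CHROM}/
done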
# It appears that psLayout wants an ooc file of tile size 10
# I tried making one for the whole assembly but it seemed to
# include too much for some contigs and it caused a lot of
# alignments to be missed. Thus, create an ooc file for each
# contig
ssh eieio
mkdir /cluster/bluearc/scratch/hg/gs.18/build35/contigOoc10
cd /cluster/bluearc/scratch/hg/gs.18/build35/contigOoc10
ls ../maskedContigs | sed -e "s/.fa//" | while read CONTIG
do
    blat -repMatch=256 -makeOoc=${CONTIG}.10.ooc -tileSize=10 \
        ../maskedContigs/${CONTIG}.fa \
        ../maskedContigs/${CONTIG}.fa /dev/null
    echo "done: ${CONTIG}"
done
# Copy that result to the Iservers:
ssh kkr1u00
mkdir /iscratch/i/gs.18/build35/contigOoc10
cd /iscratch/i/gs.18/build35/contigOoc10
rsync -arlv /cluster/bluearc/scratch/hg/gs.18/build35/contigOoc10/ .
# And, copy the clone sequences:
mkdir /iscratch/i/gs.18/build35/clones
cd /cluster/store5/gs.18/build35/bed/contig_overlaps
for D in ? ?? *_random 6_hla_hap?
do
    rsync -arlv `pwd`/${D} /iscratch/i/gs.18/build35/clones
done
/cluster/bin/iSync
ssh kk
cd /cluster/data/hg17/bed/contig_overlaps
mkdir psl
cat << '_EOF_' > runPsLayout.sh
#!/bin/sh
# kkiPsLayout.sh <chrom> <clone> <contig>
# where <chrom> is the chrom this contig is on
# <clone> is one of the .fa.gz files in
#	/cluster/data/hg17/bed/contig_overlaps/*/<clone>.fa.gz
#	without the .fa.gz extension
#	This stuff has been mirrored to:
#	/iscratch/i/gs.18/clones/*/<clone>.fa.gz
# <contig> is one of the contigs found in:
#	/cluster/store5/gs.18/build35/<chrom>/<contig>/<contig>.fa
#
CHROM=$1
CLONE=$2
CONTIG=$3
TARGET=/iscratch/i/gs.18/build35/maskedContigs/${CONTIG}.fa
FAZ=/iscratch/i/gs.18/build35/clones/${CHROM}/${CLONE}.fa.gz
OOC=/iscratch/i/gs.18/build35/contigOoc10/${CONTIG}.10.ooc
mkdir -p psl/${CONTIG}
if [ ! -s ${FAZ} ]; then
    echo "Can not find: ${FAZ}"
    exit 255
fi
if [ ! -s ${TARGET} ]; then
    echo "Can not find: ${TARGET}"
    exit 255
fi
if [ ! -s ${OOC} ]; then
    echo "Can not find: ${OOC}"
    exit 255
fi
zcat ${FAZ} > /tmp/${CLONE}.fa
$HOME/bin/i386/psLayout ${TARGET} \
    /tmp/${CLONE}.fa genomic ${OOC} psl/${CONTIG}/${CLONE}.psl
RET=$?
rm -f /tmp/${CLONE}.fa
exit ${RET}
'_EOF_'
# << this line makes emacs coloring happy
chmod +x runPsLayout.sh
# make up a listing of chrom, clone, contig from:
grep -v "^#" disburseEm.list | sed -e "s/.fa.gz//" > chr.clone.contig.list
wc -l chr.clone.contig.list
# 26872 chr.clone.contig.list
awk '{ printf "./runPsLayout.sh %s %s %s {check out line+ psl/%s/%s.psl}\n", $1, $2, $3, $3, $2 }' chr.clone.contig.list > jobList
# << this line makes emacs coloring happy
# To do a quick test, run just chr22:
grep "^22" chr.clone.contig.list | awk '{ printf "./runPsLayout.sh %s %s %s {check out line+ psl/%s/%s.psl}\n", $1, $2, $3, $3, $2 }' > jobList
para create jobList
para try ... check ... etc ...
# One run on chr22 took:
# Completed: 561 of 561 jobs
# CPU time in finished jobs: 927068s 15451.14m 257.52h 10.73d 0.029 y
# IO & Wait Time: 6295s 104.91m 1.75h 0.07d 0.000 y
# Average job time: 1664s 27.73m 0.46h 0.02d
# Longest job: 69745s 1162.42m 19.37h 0.81d
# Submission to last job: 69780s 1163.00m 19.38h 0.81d
# put the results together, filter, lift and load:
cd /cluster/data/hg17/bed/contig_overlaps/psl
pslSort dirs raw.psl tmp N*
pslReps -singleHit raw.psl repsSingle.psl /dev/null
liftUp chr22.psl /cluster/data/hg17/jkStuff/liftAll.lft \
    warn repsSingle.psl
hgLoadPsl -table=cloneTest hg17 chr22.psl
# There are a number of clones listed in the sequence.inf file
# as status W with names beginning AACC AADB AADC AADD
# These are the whole-genome shotgun assemblies for the Celera genome.
# A few of them were used in the assembly of the NCBI genome, namely:
# ./11/AADB01066164.1.fa.gz ./11/AADC01095577.1.fa.gz
# ./11/AADD01116830.1.fa.gz ./11/AADD01118406.1.fa.gz
# ./11/AADD01116787.1.fa.gz ./11/AADD01112371.1.fa.gz
# ./11/AADD01116788.1.fa.gz ./11/AADD01115518.1.fa.gz
# ./11/AADD01118410.1.fa.gz ./11/AADD01117999.1.fa.gz
# ./21/AADD01172789.1.fa.gz ./21/AADD01172788.1.fa.gz
# ./21/AADD01209098.1.fa.gz ./21/AADD01172902.1.fa.gz
# And these have been distributed properly in their corresponding
# chromosome. The rest of them, 26, all with names starting AACC, are in
# the directory here: celeraOnly
# To run the unPlaced alignments.
# Prepare scratch and iscratch
ssh eieio
mkdir /cluster/bluearc/scratch/hg/gs.18/build35/clones/unPlaced
rsync -arlv /cluster/data/hg17/bed/contig_overlaps/unPlaced/ \
    /cluster/bluearc/scratch/hg/gs.18/build35/clones/unPlaced
# request scratch sync to cluster admins
ssh kkr1u00
mkdir /iscratch/i/gs.18/build35/clones/unPlaced
rsync -arlv /cluster/data/hg17/bed/contig_overlaps/unPlaced/ \
    /iscratch/i/gs.18/build35/clones/unPlaced
/cluster/bin/iSync
ssh hgwdev
cd /cluster/data/hg17/bed/contig_overlaps/unPlacedBatch
# There are too many to try them all, so obtain guidelines from hg16
# of clone to contig mapping:
hgsql -N -e "select name,chrom from clonePos;" hg16 > hg16.clone.chrom
hgsql -N -e "select contig,chrom from ctgPos;" hg16 > hg16.contig.chrom
ssh kk
mkdir /cluster/data/hg17/bed/contig_overlaps/unPlacedBatch
cd /cluster/data/hg17/bed/contig_overlaps/unPlacedBatch
ls ../unPlaced | sed -e "s/.fa.gz//" > unPlaced.clone.list
wc -l unPlaced.clone.list
# 19836 unPlaced.clone.list
ls -1S /scratch/hg/gs.18/build35/maskedContigs > contig.list
wc -l contig.list
# 380 contig.list
cat << '_EOF_' > runPsLayout.sh
#!/bin/sh
# kkiPsLayout.sh <clone> <contig>
# <clone> is one of the .fa.gz files in
#	/scratch/hg/gs.18/build35/clones/unPlaced
#	without the .fa.gz extension
# <contig> is one of the contigs found in:
#	/iscratch/i/gs.18/build35/maskedContigs
#
CLONE=$1
CONTIG=$2
TARGET=/iscratch/i/gs.18/build35/maskedContigs/${CONTIG}.fa
FAZ=/scratch/hg/gs.18/build35/clones/unPlaced/${CLONE}.fa.gz
OOC=/iscratch/i/gs.18/build35/contigOoc10/${CONTIG}.10.ooc
mkdir -p psl/${CONTIG}
if [ ! -s ${FAZ} ]; then
    echo "Can not find: ${FAZ}"
    exit 255
fi
if [ ! -s ${TARGET} ]; then
    echo "Can not find: ${TARGET}"
    exit 255
fi
if [ ! -s ${OOC} ]; then
    echo "Can not find: ${OOC}"
    exit 255
fi
zcat ${FAZ} > /tmp/${CLONE}.fa
$HOME/bin/i386/psLayout ${TARGET} \
    /tmp/${CLONE}.fa genomic ${OOC} psl/${CONTIG}/${CLONE}.psl
RET=$?
rm -f /tmp/${CLONE}.fa
exit ${RET}
'_EOF_'
# << this line makes emacs coloring happy
chmod +x runPsLayout.sh
cat << '_EOF_' > gsub
#LOOP
./runPsLayout.sh $(path1) $(path2) {check out line+ psl/$(path2)/$(path1).psl}
#ENDLOOP
'_EOF_'
# << this line makes emacs coloring happy
gensub2 unPlaced.clone.list contig.list gsub jobList
# XXXX - some time later ... 2004-07-12
# Bringing this sequence to a close.
# Difficulties encountered:
# Placed clones that did not survive the psLayout filter:
#	AC006040.3 AC006328.5 AC007039.6 AC007241.3 AC007965.3
#	AC009947.2 AC010682.2 AC012005.4 AC016707.2 AC016728.4
#	AC016752.2 AC017005.7 AC025226.4 AC025246.6 AC055713.29
#	AC068541.7 AC068601.8 AC068704.4 AC073649.3 AC073962.5
#	AC091175.11 AC095381.1 AC104597.3 AC130223.2 AC130814.3
#	AC133883.6 AC139103.3 AF003627.3 AF135405.3 AL021878.2
#	AL137064.6 AL356803.2 AL390801.4 AL591480.8 AL901608.1
#	AP005814.2 BX322790.2 Z84489.1 Z84814.1
# And placed clones that were broken into two pieces during their
# psLayout alignment:
#	AC006982.3 AC007742.4 AC023342.3 AC024183.4 AC025735.4
#	AC095380.1 AL646104.4 BX293536.4
# For the above clones, their assignments in ref_placed.agp were
# used instead of trying to adjust the psLayout process.
# The PAR clones are a problem. They were placed properly, but
# during their load with hgClonePos there was a warning issued
# about their dual existence. hgClonePos said they were only
# going to be placed on chrX and not on chrY. However, in the
# browser, when chrY is viewed it issues errors about these not
# having proper coordinates in the clonePos table. These were
# removed from the coverage track to eliminate that error.
#	AL954722.18 BX537334.4 BX000483.7 BX908402.3 BX649635.3 BX119919.5
#	AC079176.15 AC097314.27 AC006209.25 AJ271735.1 AJ271736.1
#
# And finally, after many different types of alignment attempts,
# there remain 1489 un-placed clones that could not be located.
# While trying to figure out which contigs many clones belonged
# to, the following cluster run script was used to take a survey
# using blat:
#!/bin/sh
# runBlat.sh <clone> <contig>
# <clone> is one of the .fa.gz files in
#	/scratch/hg/gs.18/build35/clones/<dir>
#	without the .fa.gz extension
# <contig> is one of the contigs found in:
#	/iscratch/i/gs.18/build35/maskedContigs
#
# ./runBlat.sh unPlaced/AB000876.1.fa.gz NT_005612.fa {check out line+
#	psl/NT_005612.fa/unPlaced/AB000876.1.fa.gz.psl}
#
HERE=`pwd`
CLONE=$1
CLONEDIR=`dirname ${CLONE}`
CLONENAME=`basename ${CLONE}`
CLONESRC=/iscratch/i/gs.18/build35/clones/${CLONE}.fa.gz
CONTIG=$2
CONTIGBASE=${CONTIG/.fa/}
TARGET=/iscratch/i/gs.18/build35/maskedContigs/${CONTIG}
if [ ! -s ${CLONESRC} ]; then
    echo "Can not find: ${CLONESRC}" 1>/dev/stderr
    exit 255
fi
if [ ! -s ${TARGET} ]; then
    echo "Can not find: ${TARGET}" 1>/dev/stderr
    exit 255
fi
mkdir -p /tmp/${CLONEDIR}/${CLONENAME}
zcat ${CLONESRC} > /tmp/${CLONEDIR}/${CLONENAME}/${CLONENAME}.fa
cd /tmp/${CLONEDIR}
/cluster/data/hg17/bed/contig_overlaps/FfaSplit/faToFfa ${CLONENAME}
ECOUNT=`cat error.convert | wc -l`
if [ "${ECOUNT}" -ne 0 ]; then
    echo "Error during faToFfa, error.convert not empty" 1>/dev/stderr
    exit 255
fi
rm -f error.convert
B=${CLONENAME/\.*/}
cd /tmp/${CLONEDIR}/${CLONENAME}
faSplit byname ${CLONENAME}.fa .
RET=0
export RET
for F in ${CLONENAME}_*.fa
do
    FA=${F/_*.fa/}
    A=${FA/.[0-9]*/}
    P=${F/.fa/}
    N=${P##*_}
    rm -f t.fa
    mv ${F} t.fa
    cat t.fa | faSplit -oneFile size stdin 3000 ${A}_${N}
    rm -f t.fa
    blat ${TARGET} ${A}_${N}.fa -ooc=/scratch/hg/h/11.ooc ${A}_${N}.psl \
        -t=dna -q=dna -fastMap -noHead
    RET=$?
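    # blat's exit status is captured in RET so that a failure on any
    # 3 kb piece breaks out of the loop below and fails the whole job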
if [ "$RET" -ne 0 ]; then echo "Error during blat ${TARGET} ${A}_${N}.fa" 1>/dev/stderr break fi done rm -f ${CLONENAME}.fa rm -f ${B}_*.fa cd ${HERE} mkdir -p psl/${CONTIGBASE} sed -e "s/${A}/${CLONENAME}/" /tmp/${CLONEDIR}/${CLONENAME}/*.psl > \ psl/${CONTIGBASE}/${CLONENAME}.psl rm -f /tmp/${CLONEDIR}/${CLONENAME}/*.psl rmdir --ignore-fail-on-non-empty /tmp/${CLONEDIR}/${CLONENAME} rmdir --ignore-fail-on-non-empty /tmp/${CLONEDIR} exit ${RET} # The alignment with psLayout were done with the following cluster # run script: #!/bin/sh # kkiPsLayout.sh # is one of the .fa.gz files in # /scratch/hg/gs.18/build35/clones/unPlaced # without the .fa.gz extension # is one of the contigs found in: # /iscratch/i/gs.18/build35/maskedContigs # # ./runPsLayout.sh unPlaced/AP001966.2 NT_016354 {check out exists # psl/NT_016354/AP001966.2.psl} # HERE=`pwd` CLONE=$1 CONTIG=$2 CLONEDIR=`dirname ${CLONE}` CLONENAME=`basename ${CLONE}` RESULT=psl/${CONTIG}/${CLONENAME}.psl CLONESRC=/iscratch/i/gs.18/build35/clones/${CLONE}.fa.gz TARGET=/iscratch/i/gs.18/build35/maskedContigs/${CONTIG}.fa OOC=/iscratch/i/gs.18/build35/contigOoc10/${CONTIG}.10.ooc if [ ! -s ${CLONESRC} ]; then echo "Can not find: ${CLONESRC}" 1>/dev/stderr exit 255 fi if [ ! -s ${TARGET} ]; then echo "Can not find: ${TARGET}" 1>/dev/stderr exit 255 fi if [ ! -s ${OOC} ]; then echo "Can not find: ${OOC}" 1>/dev/stderr exit 255 fi mkdir -p /tmp/${CONTIG}/${CLONEDIR}/${CLONENAME} zcat ${CLONESRC} > /tmp/${CONTIG}/${CLONEDIR}/${CLONENAME}.fa cd /tmp/${CONTIG} /cluster/data/hg17/bed/contig_overlaps/FfaSplit/faToFfa ${CLONEDIR} cd ${HERE} mkdir -p psl/${CONTIG} $HOME/bin/i386/psLayout ${TARGET} /tmp/${CONTIG}/${CLONEDIR}/${CLONENAME}.fa genomic ${OOC} ${RESULT} RET=$? rm -f /tmp/${CONTIG}/${CLONEDIR}/${CLONENAME}.fa /tmp/${CONTIG}/error.convert rmdir --ignore-fail-on-non-empty /tmp/${CONTIG}/${CLONEDIR}/${CLONENAME} rmdir --ignore-fail-on-non-empty /tmp/${CONTIG}/${CLONEDIR}/ rmdir --ignore-fail-on-non-empty /tmp/${CONTIG} exit ${RET} # BUILD KNOWN GENES TABLES (DONE 6/8/04 Fan) Build sp040515 and proteins040515 DBs first. hgsql hg17 -e "create database kgHg17" cd /cluster/store6/kgDB/bed mkdir kgHg17 cd /cluster/store6/kgDB/bed/kgHg17 ~/src/hg/protein/KGprocess.sh kgHg17 hg17 040515 The script was run successfully with the last message: Tue Jun 8 15:36:52 PDT 2004 DONE After initial inspection of tables in kgHg17, do the following from mySql prompt: alter table kgHg17.cgapAlias rename as hg17.cgapAlias; alter table kgHg17.cgapBiocDesc rename as hg17.cgapBiocDesc; alter table kgHg17.cgapBiocPathway rename as hg17.cgapBiocPathway; alter table kgHg17.dupSpMrna rename as hg17.dupSpMrna; alter table kgHg17.keggMapDesc rename as hg17.keggMapDesc; alter table kgHg17.keggPathway rename as hg17.keggPathway; alter table kgHg17.kgAlias rename as hg17.kgAlias; alter table kgHg17.kgProtAlias rename as hg17.kgProtAlias; alter table kgHg17.kgXref rename as hg17.kgXref; alter table kgHg17.knownGene rename as hg17.knownGene; alter table kgHg17.knownGeneLink rename as hg17.knownGeneLink; alter table kgHg17.knownGeneMrna rename as hg17.knownGeneMrna; alter table kgHg17.knownGenePep rename as hg17.knownGenePep; alter table kgHg17.mrnaRefseq rename as hg17.mrnaRefseq; alter table kgHg17.spMrna rename as hg17.spMrna; hg17.knownGene has 43,401 entries and hg16.knownGene has 43,232 entries. 
And running featureBits shows:
featureBits hg17 knownGene
63983072 bases of 2866216770 (2.232%) in intersection
featureBits hg16 knownGene
63781799 bases of 2865248791 (2.226%) in intersection
Connect to genome-testdb and use the hgcentraltest DB. Add a new entry
in the gdbPdb table:
insert into gdbPdb values('hg17', 'proteins040515');

# CREATE LINEAGE-SPECIFIC REPEATS FOR BLASTZ WITH ZEBRAFISH
# (DONE, 2004-06-08, hartera)
# Treat all repeats as lineage-specific
mkdir /iscratch/i/gs.18/build35/linSpecRep.notInZebrafish
foreach f (/iscratch/i/gs.18/build35/rmsk/chr*.fa.out)
    cp -p $f /iscratch/i/gs.18/build35/linSpecRep.notInZebrafish/$f:t:r:r.out.spec
end
iSync

# PREP FOR LIFTOVER CHAINS TO THIS ASSEMBLY (2004-06-10 kate)
# split into 3K chunks
ssh eieio
set liftDir = /cluster/data/hg17/bed/liftOver/liftSplit
mkdir -p $liftDir
cd $liftDir
cat > split.csh << 'EOF'
set splitDir = /iscratch/i/hg17/liftOver/split
mkdir -p $splitDir
set liftDir = /cluster/data/hg17/bed/liftOver/liftSplit
foreach n (`ls /cluster/data/hg17/nib`)
    set c = $n:r
    # chrom subdirectory, e.g. 22 for chr22 and chr22_random
    set d = `echo $c | sed -e "s/chr//" -e "s/_random//"`
    echo $c
    faSplit -lift=$liftDir/$c.lft size \
        /cluster/data/hg17/$d/$c.fa -oneFile 3000 $splitDir/$c
end
'EOF'
# << for emacs
csh split.csh >&! split.log &
tail -100f split.log
ssh kkr1u00
iSync

# STS MARKERS (DONE 2004-07-21 kate)
# MANUAL UPDATE OF D21S168 and D21S167 (DONE, 2005-02-11, hartera)
# FILTERED OUT noOoc ALIGNMENTS WITH tBaseInsert >= 1000
# (DONE, 2005-02-17, hartera) AND RELOADED stsMap, stsInfo2 and all_sts_seq
# DATABASE TABLES AFTER ADDING FILTERED ALIGNMENTS TO all_sts_seq AND
# REMOVING DATA FROM stsMap and stsInfo2 FOR THE MARKERS REMOVED FROM THE
# FILTERED SET (DONE, 2005-02-18, hartera)
# UPDATE PSL ALIGNMENTS FOR D21S167 and D21S168 AND RELOAD INTO all_sts_seq
# (DONE, 2005-02-23, hartera)
# UPDATED stsAlias TABLE, REMOVING IDs OF FILTERED ALIGNMENTS
# (2005-02-24, hartera)
# Terry's sts.9 dir is in /cluster/store5/sts.2004-07.old
# remove this after verifying the newer version
# update from NCBI (booch)
ssh eieio
# use store5 for space
mkdir -p /cluster/store5/sts.2004-07
ln -s /cluster/store5/sts.2004-07 /cluster/data/ncbi
ln -s /cluster/data/ncbi/sts.2004-07 sts.9
cd /cluster/data/ncbi/sts.2004-07
wget ftp://ftp.ncbi.nih.gov/repository/dbSTS/dbSTS.sts
wget ftp://ftp.ncbi.nih.gov/repository/dbSTS/dbSTS.aliases
wget ftp://ftp.ncbi.nih.gov/blast/db/FASTA/sts.gz
gunzip sts.gz
mv sts dbSTS.fa
# incremental update from previous build
# NOTE: could mysql dump this, unless hand-updated (like hg16)
# First - copy from Terry's dir
ssh eieio
ln -s /cluster/store1/sts.8 /cluster/data/ncbi
cd /cluster/data/ncbi/sts.9
# this time, snag from Terry's dir
cd /cluster/data/ncbi/sts.9
cp -p ~booch/tracks/update/all.STS.fa.prev .
cp -p ~booch/tracks/update/stsInfo2.bed stsInfo2.bed.prev
# Convert dbSTS.fa file to easier reading format, and get accessions
/cluster/bin/scripts/convertGbFaFile dbSTS.fa > dbSTS.convert.fa
grep ">" dbSTS.convert.fa | cut -f 2 -d ">" > dbSTS.acc
# NOTE: updateStsInfo creates new stsInfo2.bed, all.primers,
# all.STS.fa, stsAlias.bed files
updateStsInfo -verbose=1 -gb=dbSTS.acc stsInfo2.bed.prev all.STS.fa.prev \
    dbSTS.sts dbSTS.aliases dbSTS.convert.fa new
# 129991 SWXD2599 99622 (0) not in dbSTS anymore
# 166473 D3S3812 154523 (0) not in dbSTS anymore
# 185776 RH83562 209614 (0) not in dbSTS anymore
mv new.info stsInfo2.bed
mv new.primers all.primers
mv new.alias stsAlias.bed
mv new.fa all.STS.fa
# get list of all STS id's in the fasta file
sed -n 's/^>\([0-9][0-9]*\) .*/\1/p' all.STS.fa | sort -n > all.STS.id
wc -l all.STS.id
# 92674 total sequences
/cluster/bin/scripts/convertPrimerToFA all.primers > all.primers.fa
# Copy stsInfo2.bed and stsAlias.bed to data directory because
# these will be loaded into the database later
mkdir -p /cluster/data/hg17/bed/sts
cp stsInfo2.bed /cluster/data/hg17/bed/sts/
cp stsAlias.bed /cluster/data/hg17/bed/sts/
# Create sts sequence alignments
mkdir -p /cluster/bluearc/sts.9/sts.split
faSplit sequence all.STS.fa 50 /cluster/bluearc/sts.9/sts.split/sts
cp /cluster/data/ncbi/sts.9/all.STS.fa /cluster/bluearc/sts.9
# create small ooc file to use with alignments (if not existing)
# NOTE: these were just used for experimenting; weren't used in
# final runs
ssh kolossus
cd /cluster/data/hg17/bed/sts
ls /cluster/bluearc/hg17/bothMaskedNibs/chr*.nib > nib.lst
blat nib.lst /dev/null /dev/null \
    -tileSize=11 -makeOoc=/cluster/bluearc/hg/h/11.4096.ooc -repMatch=4096
blat nib.lst /dev/null /dev/null \
    -tileSize=11 -makeOoc=/cluster/bluearc/hg/h/11.16384.ooc -repMatch=16384
ssh kk
cd /cluster/data/hg17/bed/sts
mkdir run
cd run
ls -1S /scratch/hg/hg17/maskedContigs/*.fa > contigs.lst
ls -1S /cluster/bluearc/sts.9/sts.split/sts*.fa > sts.lst
mkdir -p /cluster/bluearc/hg17/sts/sts/out
foreach f (`cat sts.lst`)
    set d = $f:t:r
    mkdir /cluster/bluearc/hg17/sts/sts/out/$d
end
# create alignments
cat > template << 'EOF'
#LOOP
/cluster/bin/i386/blat $(path1) $(path2) -ooc=/cluster/bluearc/hg/h/11.ooc -stepSize=5 {check out line+ /cluster/bluearc/hg17/sts/sts/out/$(root2)/$(root1).$(root2).psl}
#ENDLOOP
'EOF'
# << for emacs
gensub2 contigs.lst sts.lst template jobList
para create jobList
# 17860 jobs
para try
para check
para push
# CPU time in finished jobs: 216985s 3616.41m 60.27h 2.51d 0.007 y
# IO & Wait Time: 48790s 813.17m 13.55h 0.56d 0.002 y
# Average job time: 15s 0.25m 0.00h 0.00d
# Longest job: 267s 4.45m 0.07h 0.00d
# Submission to last job: 2228s 37.13m 0.62h 0.03d
# Compile sts sequence results
ssh kolossus
cd /cluster/bluearc/hg17/sts/sts
pslSort dirs raw.psl temp out/*
rm -rf temp
pslReps -nearTop=0.0001 -minCover=0.6 -minAli=0.8 -noIntrons raw.psl \
    stsMarkers.psl /dev/null
# Processed 7121016 alignments
#cp stsMarkers.psl /cluster/data/hg17/bed/sts/run
# Lift them and get them ready to combine with primer alignments
#cd /cluster/data/hg17/bed/sts/run
#liftUp -nohead /cluster/data/hg17/bed/sts/run/stsMarkers.lifted.psl \
liftUp -nohead stsMarkers.lifted.psl \
    /cluster/data/hg17/jkStuff/liftAll.lft warn stsMarkers.psl
# missing some utilities for kolossus, so switch to fileserver
# NOTE: probably no longer true -- try on kolossus next time
ssh kksilo
cd /cluster/bluearc/hg17/sts/sts
/cluster/bin/scripts/extractPslInfo stsMarkers.lifted.psl
# creates .initial
/cluster/bin/scripts/findAccession -agp stsMarkers.lifted.psl.initial \
    /cluster/data/hg17
# "Could not open /cluster/data/hg17/Y/chrY_random.agp" etc.
# Looks like it tries all _randoms (even ones that don't
# exist/aren't needed)
# creates .acc
#rm stsMarkers.lifted.psl.initial
sort -k 4n stsMarkers.lifted.psl.initial.acc > stsMarkers.final
#rm stsMarkers.lifted.psl.initial.acc
#cp stsMarkers.final stsMarkers.lifted.psl.initial /cluster/data/hg17/bed/sts
# determine found markers (4th field in file)
cut -f 4 stsMarkers.final | sort -n -u > stsMarkers.found
wc -l stsMarkers.found
# 89532 stsMarkers.found
# out of 92674 total sequences
# extract sequences for markers not yet found, and
# blat w/o ooc to try to place more
comm -1 -3 stsMarkers.found /cluster/data/ncbi/sts.9/all.STS.id \
    > stsMarkers.notFound
wc -l stsMarkers.notFound
# 3142 stsMarkers.notFound
faSomeRecords /cluster/data/ncbi/sts.9/all.STS.fa stsMarkers.notFound \
    notFound.STS.fa
mkdir /cluster/bluearc/sts.9/sts.splitNotFound
faSplit sequence notFound.STS.fa 20 \
    /cluster/bluearc/sts.9/sts.splitNotFound/sts
# blat with 11.ooc misses alignments, so re-blat the not-yet-found
# sequences without an ooc file
# NOTE: filtering produces a yield of only 149 markers placed (out of
# 3142) -- not enough to justify this step next time
ssh kk
cd /cluster/data/hg17/bed/sts
mkdir run.noOoc
cd run.noOoc
ls -1S /scratch/hg/hg17/maskedContigs/*.fa > contigs.lst
ls -1S /cluster/bluearc/sts.9/sts.splitNotFound/sts*.fa > sts.lst
mkdir -p /cluster/bluearc/hg17/sts/sts/out.noOoc
foreach f (`cat sts.lst`)
    set d = $f:t:r
    mkdir /cluster/bluearc/hg17/sts/sts/out.noOoc/$d
end
cat > template << 'EOF'
#LOOP
/cluster/bin/i386/blat $(path1) $(path2) -stepSize=5 {check out line+ /cluster/bluearc/hg17/sts/sts/out.noOoc/$(root2)/$(root1).$(root2).psl}
#ENDLOOP
'EOF'
# << for emacs
gensub2 contigs.lst sts.lst template jobList
para create jobList
# 7220 jobs written to batch
para try
para check
# process this set of alignments
ssh kolossus
cd /cluster/bluearc/hg17/sts/sts
pslSort dirs raw.noOoc.psl temp out.noOoc/*
rm -rf temp
pslReps -nearTop=0.0001 -minCover=0.6 -minAli=0.8 -noIntrons \
    raw.noOoc.psl stsMarkers.noOoc.psl /dev/null
# Processed 4254094 alignments
#cp stsMarkers.psl /cluster/data/hg17/bed/sts/run
# Lift them and get them ready to combine with primer alignments
liftUp -nohead stsMarkers.noOoc.lifted.psl \
    /cluster/data/hg17/jkStuff/liftAll.lft warn stsMarkers.noOoc.psl
/cluster/bin/scripts/extractPslInfo stsMarkers.noOoc.lifted.psl
# creates .initial
/cluster/bin/scripts/findAccession -agp \
    stsMarkers.noOoc.lifted.psl.initial /cluster/data/hg17
# "Could not open /cluster/data/hg17/Y/chrY_random.agp" etc.
# Looks like it tries all _randoms (even ones that don't
# exist/aren't needed)
# creates .acc
#rm stsMarkers.lifted.psl.initial
mv stsMarkers.final stsMarkers.ooc.final
sort -k 4n stsMarkers.noOoc.lifted.psl.initial.acc > stsMarkers.extra
sort -k 4n stsMarkers.lifted.psl.initial.acc \
    stsMarkers.noOoc.lifted.psl.initial.acc > stsMarkers.final
# determine found markers (4th field in file)
cut -f 4 stsMarkers.final | sort -n -u > stsMarkers.found
wc -l stsMarkers.found
# 89681 stsMarkers.found
cut -f 4 stsMarkers.extra | sort -n -u > stsMarkers.extra.found
wc -l stsMarkers.extra.found
# 149 out of 3142 attempted
# out of 92674 total sequences
cp stsMarkers.final stsMarkers.lifted.psl \
    stsMarkers.*lifted.psl.initial* stsMarkers.found \
    /cluster/data/hg17/bed/sts
# Alignments from the noOoc set were not added to all_sts_seq, but info
# for the markers is in stsMap and stsInfo2. Some of the alignments are
# bad, so filter by removing all alignments from the noOoc psl file where
# tBaseInsert >= 1000. Add the remaining alignments to the set of final
# alignments for stsMarkers. The information for the removed markers
# from the filtered set was also removed from stsMap and stsInfo2.
# (DONE, 2005-02-17, hartera)
ssh eieio
cd /cluster/data/hg17/bed/sts/fix
cp /cluster/bluearc/hg17/sts/sts/stsMarkers.noOoc.lifted.psl .
awk '{if ($8 < 1000) print;}' stsMarkers.noOoc.lifted.psl \
    > stsMarkers.noOoc.lifted.filt1000.psl
wc *.filt*.psl
# 254 5334 26384 stsMarkers.noOoc.lifted.filt1000.psl
sort -k 4n /cluster/bluearc/hg17/sts/sts/stsMarkers.noOoc.lifted.psl.initial.acc \
    > stsMarkers.extra
awk '{print $4;}' stsMarkers.extra | sort -n | uniq > extra.ids
# in psl file, the ids are the 10th field
awk '{print $10;}' stsMarkers.noOoc.lifted.psl | sort -n | uniq \
    > noOoc.ids
diff extra.ids noOoc.ids
# there is no difference, as expected
# get list of IDs from filtered file, filter < 1000
awk '{print $10;}' stsMarkers.noOoc.lifted.filt1000.psl \
    | sort -n | uniq > filt1000.ids
foreach i (`cat filt1000.ids`)
    awk 'BEGIN {OFS="\t"} \
        {if ($4 == "'$i'") print $1, $2, $3, $4, $5, $6, $7}' \
        stsMarkers.extra >> stsMarkers.extra.filt1000
end
cp ../stsMarkers.final stsMarkers.final
# cat stsMarkers.extra.filt1000 >> stsMarkers.final2
# need to filter stsMarkers.final, not just cat this on the end
# get list of alignments with tBaseInsert >= 1000 and remove these
cd /cluster/data/hg17/bed/sts/fix
awk '{if ($8 >= 1000) print;}' stsMarkers.noOoc.lifted.psl \
    > stsMarkers.noOoc.lifted.filtToRemove.psl
wc -l *.filt*.psl
# 254 stsMarkers.noOoc.lifted.filt1000.psl
# 249 stsMarkers.noOoc.lifted.filt500.psl
# 448 stsMarkers.noOoc.lifted.filtToRemove.psl
# get list of IDs that need to be removed
awk '{print $10;}' stsMarkers.noOoc.lifted.filtToRemove.psl | sort -n \
    | uniq > noOoc.IdsToRemove.txt
# get chrom and co-ordinates for IDs to be removed
awk 'BEGIN {OFS = "\t"} {print $14,$16,$17,$10}' \
    stsMarkers.noOoc.lifted.filtToRemove.psl | sort | uniq \
    > sts.noOoc.filtToRemove.coords
# checked that the stsMarkers.final contain the noOoc alignments
# wrote perl script to remove lines with these IDs from stsMarkers.final
cat << '_EOF_' > removeIds.pl
#!/usr/bin/perl -w
use strict;
my $ids = $ARGV[0];
my $file = $ARGV[1];
# list of IDs with chrom and coords to remove
open(IDS, $ids) || die "Can not open $ids: $!\n";
# file for removal of IDs
open(FILE, $file) || die "Can not open $file: $!\n";
open(OUT, ">removed.txt") || die "Can not create removed.txt: $!\n";
my %idsHash;
while (<IDS>) {
    chomp;
    my @a = split(/\t/);
    my $chr = $a[0];
    my $st = $a[1];
    my $end = $a[2];
    my $id = $a[3];
    my $key = $id . "_" . $chr . "_" . $st . "_" . $end;
    $idsHash{$key}->{chrom} = $chr;
    $idsHash{$key}->{start} = $st;
    $idsHash{$key}->{end} = $end;
}
close IDS;
while (<FILE>) {
    chomp;
    my $l = $_;
    my $found = "FALSE";
    my @f = split(/\t/, $l);
    foreach my $k (keys(%idsHash)) {
        # if the id is contained in the key
        if ($k =~ /^$f[3]/) {
            my $c = $idsHash{$k}->{chrom};
            my $s = $idsHash{$k}->{start};
            my $e = $idsHash{$k}->{end};
            if ($f[0] eq $c && $f[1] == $s && $f[2] == $e) {
                print OUT "$c\t$s\t$e\t$f[3]\n";
                $found = "TRUE";
            }
        }
    }
    if ($found eq "FALSE") {
        print "$l\n";
    }
}
'_EOF_'
chmod +x removeIds.pl
perl removeIds.pl sts.noOoc.filtToRemove.coords stsMarkers.final \
    > stsMarkers.final.new
wc -l stsMarkers.final*
# 92338 stsMarkers.final
# 91890 stsMarkers.final.new
# There are 448 ids and sets of co-ordinates in the list of Ids to remove
# check that stsMarkers.final.new contains all the alignments that
# are in the filtered set: stsMarkers.noOoc.lifted.filt1000.psl
awk 'BEGIN {OFS = "\t"} {print $14,$16,$17,$10}' \
    stsMarkers.noOoc.lifted.filt1000.psl | sort | uniq \
    > sts.noOoc.filt1000.coords
awk 'BEGIN {OFS = "\t"} {print $1,$2,$3,$4}' \
    stsMarkers.final.new | sort | uniq \
    > sts.finalnew.coords
diff sts.finalnew.coords sts.noOoc.filt1000.coords > finalnewvsfilt1000
grep '>' finalnewvsfilt1000
# there is nothing in sts.noOoc.filt1000.coords not found in the
# sts.finalnew.coords file, therefore this contains all the alignments
# from the filtered noOoc file.
cp ../primers/primers.final .
awk '{print $4}' primers.final | sort | uniq > primers.ids
awk '{print $4}' stsMarkers.final.new | sort | uniq > stsfinal.new.ids

# primers
ssh eieio
cd /cluster/data/ncbi/sts.9
# strip out N's and wobbles (KS) from primers, as isPcr
# can't currently handle them
# strip out primers of 10 bp or less, as isPcr can't handle them either
awk '$0 !~ /[^ACGT0-9\-\t]/ && (length($2) > 10) && (length($3) > 10) {printf "dbSTS_%s\t%s\t%s\n", $1,$2,$3}' \
    all.primers > all.primers.ispcr
mkdir -p /cluster/bluearc/sts.9/primers
cd /cluster/bluearc/sts.9/primers
split -l 2000 /cluster/data/ncbi/sts.9/all.primers.ispcr primers_
ssh kk
cd /cluster/data/hg17/bed/sts
mkdir primers
cd primers
mkdir run
cd run
ls -1S /scratch/hg/gs.18/build35/maskedContigs/*.fa > contigs.lst
ls -1S /cluster/bluearc/sts.9/primers/primers_* > primers.lst
mkdir -p /cluster/bluearc/hg17/sts/primers/out
cat > template << 'EOF'
#LOOP
/cluster/home/kate/bin/i386/isPcr -out=psl -minPerfect=2 -maxSize=5000 -tileSize=10 -ooc=/scratch/hg/h/10.ooc -stepSize=5 $(path1) $(path2) {check out line /cluster/bluearc/hg17/sts/primers/out/$(root1)_$(root2).psl}
#ENDLOOP
'EOF'
# << for emacs
gensub2 contigs.lst primers.lst template jobList
para create jobList
# 26980 jobs
para try
para check
para push
#Completed: 26953 of 26980 jobs
#Crashed: 27 jobs
#CPU time in finished jobs: 1130353s 18839.22m 313.99h 13.08d 0.036 y
#IO & Wait Time: 86067s 1434.44m 23.91h 1.00d 0.003 y
#Average job time: 45s 0.75m 0.01h 0.00d
#Longest job: 1255s 20.92m 0.35h 0.01d
#Submission to last job: 2762s 46.03m 0.77h 0.03d
# 27 jobs seg faulted due to -minPerfect=2.
# Looks like a bug in isPcr -- till it's fixed, we'll rerun with
# -minPerfect=5. Terry determined they all complete with this; he used
# 3, 4, or 5, tuned individually for each job, but just using 5 should
# be adequate and less labor-intensive.
# NOTE: isPcr bug is fixed -- this shouldn't be necessary for
# next run
para crashed | grep isPcr | sed 's/minPerfect=2/minPerfect=5/' \
    > jobList.minPerfect5
para create jobList.minPerfect5
# 28 jobs
# repeat with increasing minPerfect, till all complete successfully
# Filter output file quickly based on simple parameters
ssh kolossus
cd /cluster/bluearc/hg17/sts/primers/
mkdir -p filter
pslQuickFilter -minMatch=26 -maxMismatch=5 -maxTinsert=5000 -verbose out/ filter/
# Note: there will be many messages saying files are empty - this is OK
pslSort dirs primers.psl.unlifted temp filter
# filter primer alignments and create not-found primer file for ePCR run (booch)
pslFilterPrimers /cluster/bluearc/hg17/sts/primers/primers.psl.unlifted \
    /cluster/data/ncbi/sts.9/all.primers primers.filter.unlifted.psl
# creates $3.notfound.primers
wc -l primers.filter.unlifted.psl.notfound.primers
# 21919 primers.filter.unlifted.psl.notfound.primers
# use Greg Schuler's ePCR to attempt alignment of primers missed
# by isPcr
mkdir -p /cluster/data/hg17/bed/sts/primers/run.epcr
mkdir -p /cluster/bluearc/hg17/sts/primers/epcr
cd /cluster/bluearc/hg17/sts/primers/epcr
split -l 2500 /cluster/data/hg17/bed/sts/primers/primers.filter.unlifted.psl.notfound.primers primers_
cd /cluster/data/hg17/bed/sts/primers/run.epcr
ls -1S /cluster/bluearc/hg17/sts/primers/epcr/primers_* > primers.lst
# create contig.lst based on split in build dir
# NOTE: should probably replace this with something more standard
# and faster. Also, this appears to cause load spikes on fileservers.
# Should get contigs from bluearc, iservers, or cluster local disk
# At least it's over pretty quick!
ssh eieio
cd /cluster/data/hg17/bed/sts/primers/run.epcr
/cluster/bin/scripts/splitContigList -ncbi /cluster/data/hg17 1
# next time... ls -1S /cluster/bluearc/hg17/contigs/* > contig.lst (?)
mkdir -p /cluster/bluearc/hg17/sts/primers/epcr/out
ssh kk
cd /cluster/data/hg17/bed/sts/primers/run.epcr
cat > template << 'EOF'
#LOOP
/cluster/bin/scripts/runEpcr $(path1) $(path2) {check out line /cluster/bluearc/hg17/sts/primers/epcr/out/$(root1).$(root2).epcr}
#ENDLOOP
'EOF'
# << for emacs
gensub2 primers.lst contig.lst template jobList
para create jobList
# 3420 jobs
para try
para check
para push
# CPU time in finished jobs: 78897s 1314.95m 21.92h 0.91d 0.003 y
# IO & Wait Time: 254582s 4243.03m 70.72h 2.95d 0.008 y
# Average job time: 98s 1.63m 0.03h 0.00d
# Longest job: 647s 10.78m 0.18h 0.01d
# Submission to last job: 1112s 18.53m 0.31h 0.01d
# merge output
ssh eieio
cd /cluster/bluearc/hg17/sts/primers/epcr
cat out/*.epcr > all.epcr
wc -l all.epcr
# 3573
# use all.epcr file to re-filter alignments and determine which
# ePCR records to keep
cp all.epcr /cluster/data/hg17/bed/sts/primers
cd /cluster/data/hg17/bed/sts/primers
pslFilterPrimers -epcr=all.epcr -verbose=1 \
    /cluster/bluearc/hg17/sts/primers/primers.psl.unlifted \
    /cluster/data/ncbi/sts.9/all.primers primers.unlifted.epcr.psl
# convert to PSL and combine with other psl file (this takes a couple hours)
/cluster/bin/scripts/epcrToHgPsl epcr.not.found \
    /cluster/data/ncbi/sts.9/all.primers /cluster/data/hg17
cat primers.unlifted.epcr.psl epcr.not.found.psl \
    | sort -k 10n > primers.final.unlifted.psl
# Fix the query gap lengths so that they match the all.primers.fa
# file lengths
/cluster/bin/scripts/fixPrimersQueryGaps \
    /cluster/data/ncbi/sts.9/all.primers primers.final.unlifted.psl \
    > primers.final.unlifted.fix.psl
# lift results from contigs to chrom coordinates, and create final file
liftUp -nohead /cluster/data/hg17/bed/sts/primers/primers.psl \
    /cluster/data/hg17/jkStuff/liftAll.lft warn \
    primers.final.unlifted.fix.psl
# Extract relevant info, make alignments unique, and create final file
# to be merged with full sequence alignments
/cluster/bin/scripts/extractPslInfo primers.psl
/cluster/bin/scripts/findAccession -agp primers.psl.initial \
    /cluster/data/hg17
#rm primers.psl.initial
/cluster/bin/scripts/getStsId /cluster/data/ncbi/sts.9/stsInfo2.bed \
    primers.psl.initial.acc \
    | sort -k 4n > primers.final
#rm primers.psl.initial.acc
wc -l primers.final
# 314713 primers.final
# Merge primer and sequence files to create final bed file
# Merge (combineSeqPrimerPos) takes about an hour to run
ssh kolossus
cd /cluster/data/hg17/bed/sts
/cluster/bin/scripts/combineSeqPrimerPos stsMarkers.final primers/primers.final
# creates *_pos.rdb
/cluster/bin/scripts/createSTSbed /cluster/data/ncbi/sts.9/stsInfo2.bed \
    stsMarkers_pos.rdb > stsMap.bed
# Set up sequence files
ssh hgwdev
mkdir -p /gbdb/hg17/sts.9/
ln -s /cluster/data/ncbi/sts.9/all.STS.fa /gbdb/hg17/sts.9/all.STS.fa
ln -s /cluster/data/ncbi/sts.9/all.primers.fa \
    /gbdb/hg17/sts.9/all.primers.fa
# Load all files
cd /cluster/data/hg17/bed/sts
hgLoadSeq hg17 /gbdb/hg17/sts.9/all.STS.fa /gbdb/hg17/sts.9/all.primers.fa
hgsql hg17 < ~kent/src/hg/lib/stsInfo2.sql
hgsql hg17 < ~kent/src/hg/lib/stsAlias.sql
cp /cluster/data/ncbi/sts.9/{stsInfo2.bed,stsAlias.bed} .
hgsql hg17 -e 'load data local infile "stsInfo2.bed" into table stsInfo2'
hgsql hg17 -e 'load data local infile "stsAlias.bed" into table stsAlias'
hgLoadBed -noBin -tab -sqlTable=/cluster/home/kent/src/hg/lib/stsMap.sql \
    hg17 stsMap stsMap.bed
hgLoadPsl -nobin -table=all_sts_primer hg17 primers/primers.psl
hgLoadPsl -nobin -table=all_sts_seq hg17 stsMarkers.lifted.psl
# update of information for D21S167 and D21S168 (2005-02-11, hartera)
# currently X52289 is associated with D21S168
# and X53367 is associated with D21S167 - these need to be switched as
# they are causing incorrect positioning
# On Terry's advice,
# first manually update the accession field in stsInfo2.bed so that the
# corrected version is carried through to the next version
cd /cluster/data/hg17/bed/sts
# manually change accessions in this file so that X52289 is now associated
# with D21S167 and X53367 is now associated with D21S168
# manually update the chromStart and chromEnd fields for these
# records in stsMap.bed
# this change was not carried through after filtering, so change
# stsMap.bed again and reload this table (DONE, 2005-02-18, hartera)
chr21 39867340 39867513 D21S167 1000 7888 AF064860
# becomes
chr21 37117635 37117858 D21S167 1000 7888 AF064860
chr21 37117635 37117858 D21S168 1000 103256 AP000699
# becomes
chr21 39867340 39867513 D21S168 1000 103256 AP000699
# then reload the stsMap.bed and stsInfo2.bed files
# copy this updated bed file back to ncbi directory
cp stsInfo2.bed /cluster/data/ncbi/sts.9/
# delete previous data before reloading tables
hgsql hg17 -e 'delete from stsInfo2'
hgsql hg17 -e 'drop table stsMap'
hgsql hg17 -e 'load data local infile "stsInfo2.bed" into table stsInfo2'
hgLoadBed -noBin -tab -sqlTable=/cluster/home/kent/src/hg/lib/stsMap.sql \
    hg17 stsMap stsMap.bed
# (2005-02-19, hartera)
# also need to update the psl alignment file and reload into all_sts_seq
# for D21S168, the id is 103256; this is qName in the psl file
# for D21S167, the id is 7888
cd /cluster/data/hg17/bed/sts
# manually update the stsMarkers.lifted.psl file with the new
# co-ordinates as above.
# (2005-02-23) Correct alignments.
# need to swap the names for the alignments, not just the start and end
# coords as before, since now the rest of the alignment data fields in
# the table are incorrect. Change the start and end co-ordinates and
# just swap the names for D21S167 and D21S168 in the psl file, then
# reload the table.
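# a minimal sketch of that name swap (untested; qName is psl column 10,
# and per the ids noted above 7888 = D21S167, 103256 = D21S168; the
# .swap.psl file name is just a temporary name for illustration):
awk 'BEGIN {FS=OFS="\t"} \
    { if ($10 == "7888") { $10 = "103256" } \
      else if ($10 == "103256") { $10 = "7888" } \
      print }' stsMarkers.lifted.psl > stsMarkers.lifted.swap.psl
mv stsMarkers.lifted.swap.psl stsMarkers.lifted.psl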
# sort on the ID field (qName)
sort -k 10n stsMarkers.lifted.psl > sts.lifted.sort
mv sts.lifted.sort stsMarkers.lifted.psl
hgsql hg17 -e 'drop table all_sts_seq'
hgLoadPsl -nobin -table=all_sts_seq hg17 stsMarkers.lifted.psl
# Add new information after filtering the noOoc files
# (DONE, 2005-02-17, hartera)
# latest psl file: stsMarkers.lifted.new.psl is in fix dir
# Merge primer and sequence files to create final bed file
ssh kolossus
cd /cluster/data/hg17/bed/sts/fix
nice /cluster/bin/scripts/combineSeqPrimerPos stsMarkers.final.new \
    ../primers/primers.final
# creates *_pos.rdb
/cluster/bin/scripts/createSTSbed /cluster/data/ncbi/sts.9/stsInfo2.bed \
    stsMarkers_pos.rdb > stsMap.bed
awk '{print $6;}' stsMap.bed | sort -n | uniq > stsMap.ids
diff stsMap.ids filt1000.ids
# There is only 1 id that does not make it into this set (109375)
# There are 38 of the IDs to remove that do not appear in stsMap.ids;
# therefore there are 65 that do appear in stsMap.bed: noOoctoremoveinStsMap
foreach i (`cat noOoctoremoveinStsMap`)
    awk 'BEGIN {OFS = "\t"} {if ($10 == "'$i'" && $8 >= 1000) \
        print $14, $16, $17, $10;}' stsMarkers.noOoc.lifted.filtToRemove.psl \
        >> stsMap.noOoc.toRemove.coords
end
sort stsMap.noOoc.toRemove.coords > stsMap.noOoc.toRemove.coords.sort
wc -l stsMap.noOoc.toRemove.coords.sort
# 122
# get the equivalent co-ordinates from stsMap.bed
foreach i (`cat noOoctoremoveinStsMap`)
    awk 'BEGIN {OFS = "\t"} {if ($6 == "'$i'") print $1,$2,$3,$6;}' \
        stsMap.bed >> stsMap.toRemove.coords
end
sort stsMap.toRemove.coords > stsMap.toRemove.coords.sort
wc -l stsMap.toRemove.coords.sort
# 68
diff stsMap.noOoc.toRemove.coords stsMap.toRemove.coords.sort
# They are different co-ordinates in each set although the same ID
# is represented.
# none of the noOoc alignments are in stsMarkers.lifted.psl so add
cp ../stsMarkers.lifted.psl stsMarkers.lifted.psl
awk '{print $10}' stsMarkers.lifted.psl | sort -n | uniq > sts.liftedpsl.ids
# none of the noOoc alignments are in stsMarkers.lifted.psl so add
# the filtered version
cp stsMarkers.lifted.psl stsMarkers.lifted.new.psl
cat stsMarkers.noOoc.lifted.filt1000.psl >> stsMarkers.lifted.new.psl
wc -l stsMarkers.lifted.new.psl
# 91890
awk '{print $1;}' ../stsInfo2.bed | sort -n | uniq > stsInfo2.ids
# diff with filt1000.ids and noOoc.IdsToRemove.txt
# all of these are in stsInfo2.bed
# need to remove info for the filtered-out set, but only for the 38 that
# were removed from stsMap.bed - noOocnotinstsMap
cat << '_EOF_' > removeById.pl
#!/usr/bin/perl -w
use strict;
my $ids = $ARGV[0];
my $file = $ARGV[1];
# list of IDs to remove
open(IDS, $ids) || die "Can not open $ids: $!\n";
# file of stsMarkers.final
open(FILE, $file) || die "Can not open $file: $!\n";
open(OUT, ">removedIds.txt") || die "Can not create removedIds.txt: $!\n";
my %idsHash;
while (<IDS>) {
    chomp;
    my @a = split(/\t/);
    my $id = $a[0];
    $idsHash{$id} = 1;
}
close IDS;
while (<FILE>) {
    my $l = $_;
    my $found = "FALSE";
    my @f = split(/\t/, $l);
    foreach my $k (keys(%idsHash)) {
        # if the id is contained in the key
        if ($k eq $f[0]) {
            $found = "TRUE";
            print OUT "$f[0]\n";
        }
    }
    if ($found eq "FALSE") {
        print $l;
    }
}
'_EOF_'
# << emacs
chmod +x removeById.pl
perl removeById.pl noOocnotinstsMap stsInfo2.bed > stsInfo2.new.bed
# this removed data for all 38 of these Ids from stsInfo2.bed
# need to reload database tables (2005-02-18, hartera)
ssh hgwdev
cd /cluster/data/hg17/bed/sts/fix
hgsql hg17 -e 'drop table stsMap'
hgsql hg17 -e 'drop table all_sts_seq'
hgsql hg17 -e 'drop table stsInfo2'
mv stsInfo2.new.bed stsInfo2.bed
cp stsInfo2.bed /cluster/data/ncbi/sts.9/stsInfo2.bed
mv stsMap.new.bed stsMap.bed
mv stsMarkers.lifted.new.psl stsMarkers.lifted.psl
hgLoadBed -noBin -tab -sqlTable=/cluster/home/kent/src/hg/lib/stsMap.sql \
    hg17 stsMap stsMap.bed
hgLoadPsl -nobin -table=all_sts_seq hg17 stsMarkers.lifted.psl
hgsql hg17 < ~kent/src/hg/lib/stsInfo2.sql
hgsql hg17 -e 'load data local infile "stsInfo2.bed" into table stsInfo2'
cd ..
mkdir old
mv stsMap.bed stsInfo2.bed stsMarkers.lifted.psl ./old
mv ./fix/stsMap.bed ./fix/stsInfo2.bed ./fix/stsMarkers.lifted.psl .
# Update of stsAlias table (DONE, 2005-02-24, hartera)
# stsAlias filtered IDs removed
# should have same IDs as in stsInfo2
ssh eieio
cd /cluster/data/hg17/bed/sts/fix
awk '{print $2;}' ../stsAlias.bed | sort -n | uniq > alias.ids
# 145985 alias.ids
awk '{print $6;}' ../stsMap.bed | sort -n | uniq > stsMap.new.ids.sort
awk '{print $1;}' ../stsInfo2.bed | sort -n | uniq > stsInfo.new.ids.sort
# 16678 ids in stsInfo2 that are not in stsMap
# 16717 ids in stsAlias that are not in stsMap
# 38 ids in stsAlias that are not in stsInfo2
cat stsMap.new.ids.sort stsInfo.new.ids.sort | sort -n | uniq \
    > stsMapandInfo.ids.sort
diff stsMapandInfo.ids.sort alias.ids | grep '>' > idstoremoveAlias
# there are 38 of these IDs to remove
perl -pi.bak -e 's/> //' idstoremoveAlias
cp ../stsAlias.bed .
foreach i (`cat idstoremoveAlias`)
    awk '{if ($2 != "'$i'") print;}' stsAlias.bed > stsAlias.tmp
    mv stsAlias.tmp stsAlias.bed
end
# check that ids are removed from file and that they are the correct ones
# all looks good
cd /cluster/data/hg17/bed/sts
# save old stsAlias file and copy new one to sts dir and to ncbi sts dir
mv stsAlias.bed ./old
cp ./fix/stsAlias.bed .
cp stsAlias.bed /cluster/data/ncbi/sts.9/stsAlias.bed
ssh hgwdev
# remove old table data and reload
hgsql hg17 -e 'delete from stsAlias'
hgsql hg17 -e 'load data local infile "stsAlias.bed" into table stsAlias'

# PRUNE stsMap RECORDS (DONE 3/3/06)
hgsql hg17 -e 'delete from stsMap where chromEnd-chromStart > 5000'

# RECOMBINATION RATES (2004-07-13 Terry)
# (2004-07-21 kate)
# The STS Markers track must be completed prior to creating this track
ssh eieio
cd /cluster/data/hg17/bed
mv recombRate recombRate.terry
mkdir -p recombRate
cd recombRate
# Copy other necessary files here (in future, can take from previous version)
# NOTE: these are stable, and could be saved in a permanent spot
cp /projects/hg2/booch/psl/info/decode_all .
cp /projects/hg2/booch/psl/info/marshfield_all .
cp /projects/hg2/booch/psl/info/genethon_all .
# Determine maximum concordant set of markers for each of the maps
/cluster/bin/scripts/assignGPsts -full -maxcon \
    /cluster/data/ncbi/sts.9/stsAlias.bed \
    /cluster/data/hg17/bed/sts/stsMarkers_pos.rdb \
    decode_all > decode.marker.rdb
/cluster/bin/scripts/assignGPsts -full -maxcon \
    /cluster/data/ncbi/sts.9/stsAlias.bed \
    /cluster/data/hg17/bed/sts/stsMarkers_pos.rdb \
    marshfield_all > marshfield.marker.rdb
/cluster/bin/scripts/assignGPsts -full -maxcon \
    /cluster/data/ncbi/sts.9/stsAlias.bed \
    /cluster/data/hg17/bed/sts/stsMarkers_pos.rdb \
    genethon_all > genethon.marker.rdb
# Determine the rates for each of the maps
/cluster/bin/scripts/markers_to_recomb_rate.terry.pl decode.marker.rdb \
    /cluster/data/hg17/chrom.sizes 1000000 1000000 \
    > decode_1mb_slide_1mb
/cluster/bin/scripts/markers_to_recomb_rate.terry.pl genethon.marker.rdb \
    /cluster/data/hg17/chrom.sizes 1000000 1000000 \
    > genethon_1mb_slide_1mb
# Marker number 2 at position 120005974 on chr9 is out of genetic
# distance order. DISCARDING
/cluster/bin/scripts/markers_to_recomb_rate.terry.pl marshfield.marker.rdb \
    /cluster/data/hg17/chrom.sizes 1000000 1000000 \
    > marshfield_1mb_slide_1mb
# Marker number 1 at position 124276104 on chr9 is out of genetic
# distance order. DISCARDING
# Convert files to proper format
/cluster/bin/scripts/convertRecombRate decode_1mb_slide_1mb \
    /cluster/data/hg17/inserts \
    /cluster/data/hg17 1000 > decode_1mb_slide_1mb_conv
/cluster/bin/scripts/convertRecombRate marshfield_1mb_slide_1mb \
    /cluster/data/hg17/inserts \
    /cluster/data/hg17 1000 > marshfield_1mb_slide_1mb_conv
/cluster/bin/scripts/convertRecombRate genethon_1mb_slide_1mb \
    /cluster/data/hg17/inserts \
    /cluster/data/hg17 1000 > genethon_1mb_slide_1mb_conv
# Create bed file and load
/cluster/bin/scripts/createRRbed decode_1mb_slide_1mb_conv \
    marshfield_1mb_slide_1mb_conv genethon_1mb_slide_1mb_conv \
    > recombRate.bed
hgLoadBed -noBin -tab \
    -sqlTable=/cluster/home/kent/src/hg/lib/recombRate.sql \
    hg17 recombRate recombRate.bed

# FISH CLONES (DONE 2004-07-22 Kate)
# Reloaded 2004-09-36 after Terry Furey reworked fishClones.c
# to improve scoring
# The STS Marker, Coverage, and BAC End Pairs tracks must be completed
# prior to creating this track
ssh eieio
mkdir -p /cluster/data/ncbi/fishClones/fishClones.2004-07/
cd /cluster/data/ncbi/fishClones/fishClones.2004-07/
# Download information from NCBI
# point browser at:
#   http://www.ncbi.nlm.nih.gov/genome/cyto/cytobac.cgi?CHR=all&VERBOSE=ctg
# change "Show details on sequence-tag" to "yes"
# change "Download or Display" to "Download table for UNIX"
# press Submit - save as:
#   /cluster/data/ncbi/fishClones/fishClones.2004-07/hbrc.txt
chmod 664 /cluster/data/ncbi/fishClones/fishClones.2004-07/hbrc.txt
# Get current clone/accession information
wget http://www.ncbi.nlm.nih.gov/genome/clone/DATA/clac.out
# Create initial Fish Clones bed file
mkdir -p /cluster/data/hg17/bed/fishClones
cd /cluster/data/hg17/bed/fishClones
# Copy previous sts info from fhcrc (take from previous build in future)
cp ~booch/tracks/fish/fhcrc.sts .
fishClones -verbose=1 -fhcrc=fhcrc.sts -noBin hg17 \
    /cluster/data/ncbi/fishClones/fishClones.2004-07/hbrc.txt \
    /cluster/data/ncbi/fishClones/fishClones.2004-07/clac.out \
    /cluster/data/ncbi/bacends/human/bacends.4/cl_acc_gi_len \
    /cluster/data/hg17/bed/bacends/lifted/bacEnds.lifted.psl \
    fishClones_initial
# Get sequences for accessions not in genome
ssh eieio
mkdir -p /cluster/bluearc/hg17/fishClones/
cd /cluster/bluearc/hg17/fishClones/
# goto http://www.ncbi.nlm.nih.gov/entrez/batchentrez.cgi?db=Nucleotide
# select file "/cluster/data/hg17/bed/fishClones/fishClones_initial.acc"
# change output to FASTA format
# download results to "/cluster/bluearc/hg17/fishClones/notFound.fa"
# Align these using blat
cp ~booch/tracks/gs.17/build34/fish/convert.pl .
cp ~booch/tracks/gs.17/build34/fish/blatAll.pl .
# edited to use ooc file on bluearc, so can run on kolossus
convert.pl < notFound.fa > notFound.convert.fa
mkdir out
blatAll.pl /cluster/data/hg17 notFound.convert.fa out
# creates raw.psl, not.found.psl
# Make final fishClones file with this new clone placement info
cd /cluster/data/hg17/bed/fishClones
fishClones -verbose=1 -fhcrc=fhcrc.sts -noBin \
    -psl=/cluster/bluearc/hg17/fishClones/not.found.psl hg17 \
    /cluster/data/ncbi/fishClones/fishClones.2004-07/hbrc.txt \
    /cluster/data/ncbi/fishClones/fishClones.2004-07/clac.out \
    /cluster/data/ncbi/bacends/human/bacends.4/cl_acc_gi_len \
    /cluster/data/hg17/bed/bacends/lifted/bacEnds.lifted.psl fishClones
# Load the track
ssh hgwdev
cd /cluster/data/hg17/bed/fishClones
hgLoadBed -noBin -tab \
    -sqlTable=/cluster/home/kent/src/hg/lib/fishClones.sql \
    hg17 fishClones fishClones.bed
# Loaded 10601 elements of size 16
# fixed bad table entry (2004-08-12 kate)
# NOTE: this won't be necessary in the future, as the fishClones program
# will now accommodate more bad input data.
hgsql hg17 -e "update fishClones set bandEnds='1q43,Yp' where name='RP11-188A4' and placeCount=2"

# CHROMOSOME BANDS TRACK (2004-07-13 Terry)
# This must wait until the Fish Clones track is done
mkdir -p /cluster/data/hg17/bed/cytoband
cd /cluster/data/hg17/bed/cytoband
# Copy in some necessary files (usually from previous version)
cp /projects/hg2/booch/psl/cytobands/pctSetBands.txt .
cp /projects/hg2/booch/psl/cytobands/ISCN800.txt .
# Create some preliminary information files
/cluster/bin/scripts/createSetBands pctSetBands.txt \
    /cluster/data/hg17/inserts /cluster/data/hg17 100 > setBands.txt
/cluster/bin/scripts/makeBands ISCN800.txt /cluster/data/hg17 > cytobands.pct.bed
/cluster/bin/scripts/makeBandRanges cytobands.pct.bed > cytobands.pct.ranges
# Reformat fishClones file
/cluster/bin/scripts/createBanderMarkers \
    /cluster/data/hg17/bed/fishClones/fishClones.bed > fishClones.txt
# Create bed file
/cluster/bin/scripts/runBander fishClones.txt \
    ISCN800.txt setBands.txt /cluster/data/hg17
# Should be 862 bands
wc cytobands.bed
# 862 4310 30748 cytobands.bed
# Load track
hgLoadBed -noBin -tab -sqlTable=/cluster/home/kent/src/hg/lib/cytoBand.sql \
    hg17 cytoBand cytobands.bed
# Load ideogram table
hgLoadBed -noBin -tab -sqlTable=/cluster/home/booch/src/hg/lib/cytoBandIdeo.sql \
    hg17 cytoBandIdeo cytobands.bed

# CHROMOSOME BANDS TRACK REDO (2004-07-22 Kate)
# Just to make sure we know the proper steps.
# The tables were not reloaded, as Terry has already
# sent the data to NCBI
# This must wait until the Fish Clones track is done
ssh kolossus
mkdir -p /cluster/data/hg17/bed/cytoband.kate
cd /cluster/data/hg17/bed/cytoband.kate
# Copy in some necessary files (usually from previous version)
cp /projects/hg2/booch/psl/cytobands/pctSetBands.txt .
cp /projects/hg2/booch/psl/cytobands/ISCN800.txt .
# Create some preliminary information files
/cluster/bin/scripts/createSetBands pctSetBands.txt \
    /cluster/data/hg17/inserts /cluster/data/hg17 100 > setBands.txt
/cluster/bin/scripts/makeBands ISCN800.txt \
    /cluster/data/hg17 > cytobands.pct.bed
/cluster/bin/scripts/makeBandRanges cytobands.pct.bed \
    > cytobands.pct.ranges
# Reformat fishClones file
/cluster/bin/scripts/createBanderMarkers \
    /cluster/data/hg17/bed/fishClones/fishClones.bed > fishClones.txt
# Create bed file
ssh eieio
cd /cluster/data/hg17/bed/cytoband.kate
/cluster/bin/scripts/runBander fishClones.txt \
    ISCN800.txt setBands.txt /cluster/data/hg17
# NOTE: fails on kolossus (C++ compiler different ??)
# Should be 862 bands
wc -l cytobands.bed
# 862 cytobands.bed
# NOTE - don't load tracks, as Terry has already sent his
# versions to NCBI
# Load track
#hgLoadBed -noBin -tab \
#    -sqlTable=/cluster/home/kent/src/hg/lib/cytoBand.sql \
#    hg17 cytoBand cytobands.bed
# Load ideogram table
#hgLoadBed -noBin -tab \
#    -sqlTable=/cluster/home/booch/src/hg/lib/cytoBandIdeo.sql \
#    hg17 cytoBandIdeo cytobands.bed

# LOAD AFFYRATIO (DONE - 2004-07-14 - Hiram)
# Copied from Hg16 doc
# Set up cluster job to align consensus/exemplars to hg17
ssh eieio
mkdir /cluster/bluearc/hg17/affyGnf
cp -p /projects/compbio/data/microarray/affyGnf/sequences/HG-U95/HG-U95Av2_all.fa /cluster/bluearc/hg17/affyGnf
ssh kkr1u00
mkdir -p /iscratch/i/affyGnf
cp -p /cluster/bluearc/hg17/affyGnf/* /iscratch/i/affyGnf
/cluster/bin/iSync
ssh kki
mkdir /cluster/data/hg17/bed/affyGnf.2004-06-09
cd /cluster/data/hg17/bed/affyGnf.2004-06-09
ls -1 /iscratch/i/affyGnf/* > affy.lst
ls -1 /iscratch/i/gs.18/build35/maskedContigs/* > allctg.lst
cat << '_EOF_' > template.sub
#LOOP
/cluster/bin/i386/blat -fine -mask=lower -minIdentity=95 -ooc=/iscratch/i/gs.18/build35/hg17.11.ooc $(path1) $(path2) {check out line+ psl/$(root1)_$(root2).psl}
#ENDLOOP
'_EOF_'
# << this line makes emacs coloring happy
gensub2 allctg.lst affy.lst template.sub jobList
mkdir psl
para create jobList
# Completed: 380 of 380 jobs
# CPU time in finished jobs: 2922s 48.70m 0.81h 0.03d 0.000 y
# IO & Wait Time: 1146s 19.10m 0.32h 0.01d 0.000 y
# Average job time: 11s 0.18m 0.00h 0.00d
# Longest job: 80s 1.33m 0.02h 0.00d
# Submission to last job: 333s 5.55m 0.09h 0.00d
# Do sort, best-in-genome filter, and convert to chromosome coordinates
# to create affyU95.psl
ssh eieio
cd /cluster/data/hg17/bed/affyGnf.2004-06-09
pslSort dirs raw.psl tmp psl
# change filter parameters for these sequences: only use alignments that
# cover 30% of sequence and have at least 95% identity in the aligned
# region. minAli = 0.97 is too high; low minCover since there are a lot
# of n's in these sequences
pslReps -minCover=0.3 -sizeMatters -minAli=0.95 -nearTop=0.005 \
    raw.psl contig.psl /dev/null
liftUp affyU95.psl ../../jkStuff/liftAll.lft warn contig.psl
# Eliminate the long names
sed -e "s/U95Av2://" affyU95.psl | sed -e "s/;//" > affyU95shortQname.psl
# Merge with spot data and load into database. added -chip flag to
# affyPslAndAtlasToBed to allow correct parsing
ssh hgwdev
cd /cluster/data/hg17/bed/affyGnf.2004-06-09
/cluster/home/sugnet/bin/i386/affyPslAndAtlasToBed -chip=U95Av2 \
    affyU95shortQname.psl \
    /projects/compbiodata/microarray/affyGnf/human_atlas_U95_gnf.noquotes.txt \
    affyRatio.bed affyRatio.exr > affyPslAndAtlasToBed.log 2>&1
hgLoadBed -sqlTable=$HOME/src/hg/lib/affyRatio.sql hg17 \
    affyRatio affyRatio.bed
# Loaded 12740 elements of size 15
mkdir affyU95
hgLoadPsl hg17 -table=affyU95 affyU95shortQname.psl
# sequences loaded 2004-08-06
hgLoadSeq -abbr=U95Av2: hg17 /gbdb/hgFixed/affyProbes/HG-U95Av2_all.fa
# Advisory lock created
# Creating .tab file
# Adding /gbdb/hgFixed/affyProbes/HG-U95Av2_all.fa
# 12386 sequences
# Updating seq table
# Advisory lock has been released
# All done

# Load AFFYUCLANORM, extended version of affyUcla track. Hopefully
# final freeze of data set. (DONE - 2004-07-14 - Hiram)
ssh kk
mkdir /cluster/data/hg17/bed/affyUclaNorm
cd /cluster/data/hg17/bed/affyUclaNorm
cp /projects/compbio/data/microarray/affyUcla/sequences/HG-U133AB_all.fa .
ls -1 /scratch/hg/gs.18/build35/maskedContigs/* > contig.lst
cat << '_EOF_' > gsub
#LOOP
/cluster/bin/i386/blat -fine -mask=lower -minIdentity=95 -ooc=/scratch/hg/h/11.ooc $(path1) $(path2) {check out line+ psl/$(root1)_$(root2).psl}
#ENDLOOP
'_EOF_'
# << keep emacs happy
mkdir psl
ls HG-U133AB_all.fa > affy.lst
gensub2 contig.lst affy.lst gsub jobList
para create jobList
para try
para check
para push ... etc
# Completed: 380 of 380 jobs
# CPU time in finished jobs: 20070s 334.51m 5.58h 0.23d 0.001 y
# IO & Wait Time: 162784s 2713.06m 45.22h 1.88d 0.005 y
# Average job time: 481s 8.02m 0.13h 0.01d
# Longest job: 735s 12.25m 0.20h 0.01d
# Submission to last job: 771s 12.85m 0.21h 0.01d
ssh eieio
cd /cluster/data/hg17/bed/affyUclaNorm
pslSort dirs hg17.affyU133AB_all.psl tmp psl
wc hg17.affyU133AB_all.psl
# 61022 1281401 12934919 hg17.affyU133AB_all.psl
liftUp hg17.affyU133AB_all.lifted.psl \
    /cluster/data/hg17/jkStuff/liftAll.lft warn hg17.affyU133AB_all.psl
pslReps -minCover=0.5 -sizeMatters -minAli=0.97 \
    -nearTop=0.005 hg17.affyU133AB_all.lifted.psl \
    hg17.affyU133AB_all.lifted.pslReps.psl out.psr
# Processed 61017 alignments
affyUclaMergePslData -pslFile=hg17.affyU133AB_all.lifted.pslReps.psl \
    -affyFile=/projects/compbio/data/microarray/affyUcla/data/feature_biomaterial_chip_logratios_formatForTrack.txt \
    -bedOut=hg17.affyUcla.bed \
    -expRecordOut=hg17.affyUcla.expRecords \
    -expFile=/projects/compbio/data/microarray/affyUcla/data/expNames.sorted.txt
~/kent/src/hg/affyGnf/addUclaAnnotations.pl hg17.affyUcla.expRecords \
    /projects/compbio/data/microarray/affyUcla/data/normal_tissue_database_annotations2.txt \
    > hg17.affyUcla.annotations.expRecords
# Load the databases
ssh hgwdev
cd /cluster/data/hg17/bed/affyUclaNorm
sed -e 's/affyRatio/affyUclaNorm/' ~/kent/src/hg/lib/affyRatio.sql \
    > affyUclaNorm.sql
hgLoadBed hg17 affyUclaNorm hg17.affyUcla.bed -sqlTable=affyUclaNorm.sql

# MAKE AFFY U133 - made after above affyUclaNorm (DONE - 2004-07-15 - Hiram)
# Someday the names can be fixed.
ssh hgwdev
mkdir /cluster/data/hg17/bed/affyU133
cd /cluster/data/hg17/bed/affyU133
ln -s ../affyUclaNorm/hg17.affyU133AB_all.lifted.pslReps.psl affyU133.psl
hgLoadPsl hg17 affyU133.psl
# hgsql -e "select count(*) from affyU133;" hg17
# row count in hg16: 45693, in hg17: 44620
hgLoadSeq hg17 /gbdb/hgFixed/affyProbes/HG-U133AB_all.fa
# 44792 sequences

# MAKE LINEAGE-SPECIFIC REPEATS FOR CHICKEN & FUGU (DONE 2004-06-10 kate)
# In an email 2/13/04 to Angie, Arian said we could treat all human
# repeats as lineage-specific for human-chicken blastz, and Angie did
# the same for fugu.
# Lacking input from Arian, and using blastzSelf as a model,
# I'm also using all human repeats for the human/chimp blastz.
# Scripts expect *.out.spec filenames.
ssh kkr1u00
cd /cluster/data/hg17
mkdir /iscratch/i/hg17/linSpecRep.chicken
foreach f (/iscratch/i/hg17/rmsk/chr*.fa.out)
    cp -p $f /iscratch/i/hg17/linSpecRep.chicken/$f:t:r:r.out.spec
end
ln -s /iscratch/i/hg17/linSpecRep.chicken \
    /iscratch/i/hg17/linSpecRep.fugu
ln -s /iscratch/i/hg17/linSpecRep.chicken \
    /iscratch/i/hg17/linSpecRep.chimp
iSync

# BLASTZ FUGU (FR1) (DONE 2004-06-24 kate)
ssh kk
mkdir -p /cluster/data/hg17/bed/blastz.fr1.2004-06-10
ln -s /cluster/data/hg17/bed/blastz.fr1.2004-06-10 \
    /cluster/data/hg17/bed/blastz.fr1
cd /cluster/data/hg17/bed/blastz.fr1
# Set L=6000 (more relaxed than chicken) and abridge repeats.
# Treat all repeats as lineage-specific (reuse linSpecRep.Chicken).
cat << '_EOF_' > DEF
# human vs. fugu
fugu export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/i386:/cluster/home/angie/schwartzbin ALIGN=blastz-run BLASTZ=blastz # Reuse parameters from human-chicken. BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=6000 BLASTZ_K=2200 BLASTZ_Q=/cluster/data/blastz/HoxD55.q BLASTZ_ABRIDGE_REPEATS=1 # TARGET: Human SEQ1_DIR=/iscratch/i/hg17/bothMaskedNibs SEQ1_RMSK= SEQ1_FLAG= SEQ1_SMSK=/iscratch/i/hg17/linSpecRep.fugu SEQ1_IN_CONTIGS=0 SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY: Fugu SEQ2_DIR=/iscratch/i/fr1/nib SEQ2_RMSK= SEQ2_FLAG= SEQ2_SMSK=/iscratch/i/fr1/linSpecRep SEQ2_IN_CONTIGS=0 SEQ2_CHUNK=10000000 SEQ2_LAP=0 BASE=/cluster/data/hg17/bed/blastz.fr1 DEF=$BASE/DEF RAW=$BASE/raw CDBDIR=$BASE SEQ1_LEN=$BASE/S1.len SEQ2_LEN=$BASE/S2.len '_EOF_' # << this line keeps emacs coloring happy bash # if a csh/tcsh user source DEF mkdir $RAW run.0 /cluster/home/angie/hummus/make-joblist $DEF > $BASE/run.0/j # GOT HERE sh ./xdir.sh cd run.0 sed -e 's@^blastz-run@/cluster/bin/penn/blastz-run@' j > jobList para create jobList # 11935 jobs para try para check para push # Completed: 11935 of 11935 jobs # CPU time in finished jobs: 4673316s 77888.60m 1298.14h 54.09d 0.148 y # IO & Wait Time: 329249s 5487.48m 91.46h 3.81d 0.010 y # Average job time: 419s 6.99m 0.12h 0.00d # Longest job: 714s 11.90m 0.20h 0.01d # Submission to last job: 5575s 92.92m 1.55h 0.06d # second cluster run: lift raw alignments -> lav dir ssh kki cd /cluster/data/hg17/bed/blastz.fr1 bash # if a csh/tcsh user source DEF mkdir run.1 lav /cluster/bin/scripts/blastz-make-out2lav $DEF $BASE > $BASE/run.1/jobList cd run.1 wc -l jobList para create jobList # 341 jobs para try para check para push # CPU time in finished jobs: 315s 5.26m 0.09h 0.00d 0.000 y # IO & Wait Time: 4451s 74.18m 1.24h 0.05d 0.000 y # Average job time: 14s 0.23m 0.00h 0.00d # Longest job: 107s 1.78m 0.03h 0.00d # Submission to last job: 368s 6.13m 0.10h 0.00d # third run: lav -> axt ssh kki cd /cluster/data/hg17/bed/blastz.fr1 mkdir axtChrom pslChrom run.2 cd run.2 cat << 'EOF' > do.csh #!/bin/csh -ef cd $1 set chr = $1:t cat `ls -1 *.lav | sort -g` \ | lavToAxt stdin \ /iscratch/i/hg17/bothMaskedNibs /iscratch/i/fr1/nib stdout \ | axtSort stdin ../../axtChrom/$chr.axt axtToPsl ../../axtChrom/$chr.axt ../../S1.len ../../S2.len \ ../../pslChrom/$chr.psl 'EOF' # << this line keeps emacs coloring happy chmod a+x do.csh cp /dev/null jobList foreach d (../lav/chr*) echo "do.csh $d" >> jobList end para create jobList # 41 jobs para try para check para push # CHAIN FUGU BLASTZ (2004-06-11 kate) # Run axtChain on little cluster ssh kki cd /cluster/data/hg17/bed/blastz.fr1 mkdir -p axtChain/run1 cd axtChain/run1 mkdir out chain ls -1S /cluster/data/hg17/bed/blastz.fr1/axtChrom/*.axt \ > input.lst cat << '_EOF_' > gsub #LOOP doChain {check in exists $(path1)} {check out line+ chain/$(root1).chain} {check out line+ out/$(root1).out} #ENDLOOP '_EOF_' # << this line makes emacs coloring happy # Reuse gap penalties from chicken run. 
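# (Background on the table below, hedged from general axtChain usage:
# the -linearGap file is a piecewise-linear gap cost table.  "position"
# lists gap-size breakpoints, and qGap/tGap/bothGap give the costs at
# those sizes for gaps on the query side, the target side, or both at
# once; costs for sizes in between are interpolated.)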
cat << '_EOF_' > temp.gap
tablesize 11
smallSize 111
position 1 2 3 11 111 2111 12111 32111 72111 152111 252111
qGap 325 360 400 450 600 1100 3600 7600 15600 31600 56600
tGap 325 360 400 450 600 1100 3600 7600 15600 31600 56600
bothGap 625 660 700 750 900 1400 4000 8000 16000 32000 57000
'_EOF_'
# << this line makes emacs coloring happy
sed 's/  */\t/g' temp.gap > ../../fuguHumanTuned.gap
rm -f temp.gap
cat << '_EOF_' > doChain
#!/bin/csh
axtChain -scoreScheme=/cluster/data/blastz/HoxD55.q \
	-linearGap=../../fuguHumanTuned.gap \
	-minScore=5000 $1 \
	/iscratch/i/hg17/bothMaskedNibs \
	/iscratch/i/fr1/nib $2 > $3
'_EOF_'
# << this line makes emacs coloring happy
chmod a+x doChain
gensub2 input.lst single gsub jobList
para create jobList
# 46 jobs
para try
para check
para push
# 1 crashed job -- chr6_hla_hap1.chain is empty
# CPU time in finished jobs: 610s 10.16m 0.17h 0.01d 0.000 y
# IO & Wait Time: 1644s 27.40m 0.46h 0.02d 0.000 y
# Average job time: 50s 0.83m 0.01h 0.00d
# Longest job: 233s 3.88m 0.06h 0.00d
# Submission to last job: 339s 5.65m 0.09h 0.00d
# now on the cluster server, sort chains
ssh eieio
cd /cluster/data/hg17/bed/blastz.fr1/axtChain
chainMergeSort run1/chain/*.chain > all.chain
chainSplit chain all.chain
rm run1/chain/*.chain
# Load chains into database
ssh hgwdev
cd /cluster/data/hg17/bed/blastz.fr1/axtChain/chain
foreach i (*.chain)
    set c = $i:r
    echo loading $c
    hgLoadChain hg17 ${c}_chainFr1 $i
end
featureBits hg16 chainFr1Link
# 50709290 bases of 2865248791 (1.770%) in intersection

# ANCIENT REPEAT TABLE (2004-06-11 kate)
# The netClass operation requires an "ancientRepeat" table in one
# of the databases.
# This is a hand curated table obtained from Arian.
ssh hgwdev
mkdir -p /cluster/data/hg17/bed/ancientRepeat
cd /cluster/data/hg17/bed/ancientRepeat
# mysqldump needs write permission to this directory
chmod 777 .
hgsqldump --all --tab=. hg15 ancientRepeat
chmod 775 .
hgsql hg17 < ancientRepeat.sql
echo "LOAD DATA LOCAL INFILE 'ancientRepeat.txt' into table ancientRepeat"\
	| hgsql hg17

# NET FUGU BLASTZ (2004-06-11 kate)
ssh eieio
cd /cluster/data/hg17/bed/blastz.fr1/axtChain
chainPreNet all.chain ../S1.len ../S2.len stdout \
	| chainNet stdin -minSpace=1 ../S1.len ../S2.len stdout /dev/null \
	| netSyntenic stdin noClass.net
# Add classification info using db tables:
ssh hgwdev
cd /cluster/data/hg17/bed/blastz.fr1/axtChain
netClass noClass.net hg17 fr1 human.net
# Make a 'syntenic' subset:
ssh eieio
cd /cluster/data/hg17/bed/blastz.fr1/axtChain
rm noClass.net
# Make a 'syntenic' subset of these with
netFilter -syn human.net > humanSyn.net
# Load the nets into database
ssh hgwdev
cd /cluster/data/hg17/bed/blastz.fr1/axtChain
netFilter -minGap=10 human.net | hgLoadNet hg17 netFr1 stdin
#netFilter -minGap=10 humanSyn.net | hgLoadNet hg17 netSyntenyFr1 stdin

# EXTRACT AXT'S AND MAF'S FROM THE NET (kate)
# NOTE: Redo 2005-08-16 to fix overlap problem (use 8/05 netToAxt)
# Replace bad chr5 axtNet and mafNet (2006-01-05 kate)
ssh kkstore2
cd /cluster/data/hg17/bed/blastz.fr1/axtChain
netSplit human.net humanNet
mkdir -p ../axtNet ../mafNet
cat > makeMaf.csh << 'EOF'
foreach f (humanNet/chr*.net)
    set c = $f:t:r
    echo "axtNet on $c"
    netToAxt humanNet/$c.net chain/$c.chain /cluster/data/hg17/nib /cluster/data/fr1/nib stdout | axtSort stdin ../axtNet/$c.axt
    axtToMaf ../axtNet/$c.axt \
	/cluster/data/hg17/chrom.sizes /cluster/data/fr1/chrom.sizes \
	../mafNet/$c.maf -tPrefix=hg17. -qPrefix=fr1.
end
'EOF'
csh makeMaf.csh >&! makeMaf.log &
tail -100f makeMaf.log
mkdir -p /cluster/bluearc/hg17/mafNet
cp -rp ../mafNet /cluster/bluearc/hg17/mafNet/fr1

# FUGU FR1 DOWNLOADS (DONE 2004-09-17 kate)
# REDO axtNet downloads for fix, above (2005-09-12 kate)
# Replace bad chr5 axtNet and mafNet (2006-01-05 kate)
ssh kksilo
cd /cluster/data/hg17/bed/blastz.fr1/axtChain
ln -s all.chain fugu.chain
mkdir gz
gzip -c fugu.chain > gz/fugu.chain.gz
gzip -c human.net > gz/fugu.net.gz
cd ../axtNet
nice gzip *.axt
ssh kkstore02
cd /cluster/data/hg17/bed/blastz.fr1/axtNet
gzip *.axt
md5sum *.gz > md5sum.txt
ssh hgwdev
cd /usr/local/apache/htdocs/goldenPath/hg17
mkdir -p vsFr1
cd vsFr1
# Copy and edit README
cp /cluster/data/hg17/bed/blastz.fr1/axtChain/gz/*.gz .
md5sum *.gz > md5sum.txt
mv axtNet axtNet.old
ln -s /cluster/data/hg17/bed/blastz.fr1/axtNet .

# PRODUCE FUGU BLAT ALIGNMENT (DONE - 2004-07-07 - Hiram)
# Use masked scaffolds from fr1 assembly (same sequence as
# previous BlatFugu, however it's repeat and TRF-masked).
ssh kk
mkdir /cluster/data/hg17/bed/blatFr1
cd /cluster/data/hg17/bed/blatFr1
mkdir psl
# next time, use N?_?????? (to pick up NG_ contigs)
foreach f ( `cat /cluster/data/hg17/contig.lst` )
    set c=$f:t:r
    echo $c
    mkdir psl/$c
end
# create cluster job
mkdir run
cd run
ls -1S /iscratch/i/fugu/trfFa/*.fa > fugu.lst
ls -1S /scratch/hg/gs.18/build35/maskedContigs/*.fa > human.lst
cat << 'EOF' > gsub
#LOOP
/cluster/bin/i386/blat -mask=lower -qMask=lower -q=dnax -t=dnax {check in line+ $(path1)} {check in line+ $(path2)} {check out line+ /cluster/data/hg17/bed/blatFr1/psl/$(root1)/$(root1)_$(root2).psl}
#ENDLOOP
'EOF'
# << keep emacs happy
gensub2 human.lst fugu.lst gsub jobList
para create jobList
# 219640 jobs
para try
para check
para push -maxQueue=300000 -maxPush=220000
para check
# Completed: 219640 of 219640 jobs
# CPU time in finished jobs: 5206945s 86782.41m 1446.37h 60.27d 0.165 y
# IO & Wait Time: 797791s 13296.52m 221.61h 9.23d 0.025 y
# Average job time: 27s 0.46m 0.01h 0.00d
# Longest job: 951s 15.85m 0.26h 0.01d
# Submission to last job: 7553s 125.88m 2.10h 0.09d
# cd psl
# count files with alignments
# find . -not -size 427c | wc -l
# 44558
# count files with no alignments
# find . -size 427c | wc -l
# 175463
# When cluster run is done, sort alignments
# into chrom directory
ssh eieio
cd /cluster/data/hg17/bed/blatFr1
pslCat -dir psl/N?_?????? | \
	liftUp -type=.psl stdout \
	/cluster/data/hg17/jkStuff/liftAll.lft warn stdin | \
	pslSortAcc nohead chrom temp stdin
# 65 minutes ?
# Processed 216595 lines into 1 temp files
# Rename to correspond with tables and load into database:
ssh hgwdev
cd /cluster/data/hg17/bed/blatFr1/chrom
foreach i (chr*.psl)
    set r = $i:r
    echo mv $i ${r}_blatFr1.psl
    mv $i ${r}_blatFr1.psl
end
# lift fugu scaffolds to Fugu browser chrUn,
# so you can link to the other browser.
And don't need to load sequence cd /cluster/data/hg17/bed/blatFr1 liftUp -pslQ all.psl /cluster/data/fr1/fugu_v3.masked.lft warn chrom/*.psl hgLoadPsl -table=blatFr1 hg17 all.psl # load of blatFr1 did not go as planned: 216595 record(s), # 0 row(s) skipped, 3 warning(s) loading psl.tab # featureBits hg17 blatFr1 refGene:CDS # 13563544 bases of 2866216770 (0.473%) in intersection # featureBits hg16 blatFr1 refGene:CDS # 13547219 bases of 2865248791 (0.473%) in intersection # featureBits hg15 blatFugu refGene:CDS # 12427544 bases of 2866466359 (0.434%) in intersection # BLASTZ RAT RN3 (DONE - 2004-06-14 - Hiram) ssh kk mkdir -p /cluster/data/hg17/bed/blastz.rn3.2004-06-11 cd /cluster/data/hg17/bed ln -s blastz.rn3.2004-06-11 blastz.rn3 cd blastz.rn3 cat << '_EOF_' > DEF # rat vs. human export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/home/angie/schwartzbin:/cluster/home/kent/bin/i386 ALIGN=blastz-run BLASTZ=blastz BLASTZ_H=2000 BLASTZ_ABRIDGE_REPEATS=1 # TARGET # Human SEQ1_DIR=/iscratch/i/gs.18/build35/bothMaskedNibs # not used SEQ1_RMSK= # not used SEQ1_FLAG= SEQ1_SMSK=/iscratch/i/gs.18/build35/linSpecRep.notInRat SEQ1_IN_CONTIGS=0 SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY # Rat SEQ2_DIR=/iscratch/i/rn3/bothMaskedNibs # not currently used SEQ2_RMSK= # not currently used SEQ2_FLAG= SEQ2_SMSK=/cluster/bluearc/rat/rn3/linSpecRep.notInHuman SEQ2_IN_CONTIGS=0 SEQ2_CHUNK=30000000 SEQ2_LAP=0 BASE=/cluster/store5/gs.18/build35/bed/blastz.rn3 DEF=$BASE/DEF RAW=$BASE/raw CDBDIR=$BASE SEQ1_LEN=$BASE/S1.len SEQ2_LEN=$BASE/S2.len '_EOF_' # << this line makes emacs coloring happy # prepare first cluster run ssh kk cd /cluster/data/hg17/bed/blastz.rn3 source DEF # script copied over from /cluster/data/mm4/jkStuff/BlastZ_run0.sh # it is a generic script and works for any assembly /cluster/data/hg17/jkStuff/BlastZ_run0.sh cd run.0 para try, check, push, check, .... Completed: 41943 of 41943 jobs CPU time in finished jobs: 15330421s 255507.02m 4258.45h 177.44d 0.486 y IO & Wait Time: 673809s 11230.15m 187.17h 7.80d 0.021 y Average job time: 382s 6.36m 0.11h 0.00d Longest job: 4651s 77.52m 1.29h 0.05d Submission to last job: 169197s 2819.95m 47.00h 1.96d # Second cluster run to convert the .out's to .lav's # You do NOT want to run this on the big cluster. It brings # the file server to its knees. Run this on the small cluster. ssh kki cd /cluster/data/hg17/bed/blastz.rn3 # script copied over from /cluster/data/mm4/jkStuff/BlastZ_run1.sh # fixup machine check, should be kki, not kk /cluster/data/hg17/jkStuff/BlastZ_run1.sh cd run.1 para try, check, push, etc ... # Completed: 341 of 341 jobs # CPU time in finished jobs: 1894s 31.56m 0.53h 0.02d 0.000 y # IO & Wait Time: 6271s 104.52m 1.74h 0.07d 0.000 y # Average job time: 24s 0.40m 0.01h 0.00d # Longest job: 131s 2.18m 0.04h 0.00d # Submission to last job: 590s 9.83m 0.16h 0.01d # Third cluster run to convert lav's to axt's cd /cluster/data/hg17/bed/blastz.rn3 # The copy of this in mm4 was broken, fixed here /cluster/data/hg17/jkStuff/BlastZ_run2.sh cd run.2 para try, check, push, etc ... 
# Completed: 46 of 46 jobs # CPU time in finished jobs: 426s 7.09m 0.12h 0.00d 0.000 y # IO & Wait Time: 7283s 121.39m 2.02h 0.08d 0.000 y # Average job time: 168s 2.79m 0.05h 0.00d # Longest job: 642s 10.70m 0.18h 0.01d # Submission to last job: 642s 10.70m 0.18h 0.01d # translate sorted axt files into psl ssh eieio cd /cluster/data/hg17/bed/blastz.rn3 mkdir pslChrom set tbl = "blastzRn3" foreach f (axtChrom/chr*.axt) set c=$f:t:r echo "Processing chr $c" /cluster/bin/i386/axtToPsl $f S1.len S2.len pslChrom/${c}_${tbl}.psl end # That takes about 30 minutes # Load database tables ssh hgwdev cd /cluster/data/hg17/bed/blastz.rn3/pslChrom for I in *.psl do /cluster/bin/i386/hgLoadPsl -noTNameIx hg17 ${I} echo "done: ${I}" done # this is a 55 minute job # Check results # featureBits hg16 blastzRn3 # 1013603401 bases of 2865248791 (35.376%) in intersection # featureBits hg17 blastzRn3 # 1013003285 bases of 2866216770 (35.343%) in intersection # CHAIN RN3 BLASTZ (DONE - 2004-06-14 - Hiram) # re-worked with no 'axtFilter -notQ_random' on the axtChain step - 2004-06-23 # used to be: axtFilter -notQ_random $1 | axtChain stdin # The axtChain is best run on the small kluster, or the kk9 kluster ssh kki mkdir -p /cluster/data/hg17/bed/blastz.rn3/axtChain/run1 cd /cluster/data/hg17/bed/blastz.rn3/axtChain/run1 mkdir out chain ls -1S /cluster/data/hg17/bed/blastz.rn3/axtChrom/*.axt > input.lst cat << '_EOF_' > gsub #LOOP doChain {check in exists $(path1)} {check out line+ chain/$(root1).chain} out/$(root1).out #ENDLOOP '_EOF_' # << this line makes emacs coloring happy cat << '_EOF_' > doChain #!/bin/csh axtChain $1 \ /iscratch/i/gs.18/build35/bothMaskedNibs \ /iscratch/i/rn3/bothMaskedNibs $2 > $3 '_EOF_' # << this line makes emacs coloring happy chmod a+x doChain # 46 jobs gensub2 input.lst single gsub jobList para create jobList para try para push # ... etc ... # Completed: 46 of 46 jobs # CPU time in finished jobs: 4645s 77.41m 1.29h 0.05d 0.000 y # IO & Wait Time: 6840s 114.00m 1.90h 0.08d 0.000 y # Average job time: 250s 4.16m 0.07h 0.00d # Longest job: 1539s 25.65m 0.43h 0.02d # Submission to last job: 3761s 62.68m 1.04h 0.04d # now on the file server, sort chains ssh eieio cd /cluster/data/hg17/bed/blastz.rn3/axtChain time chainMergeSort run1/chain/*.chain > all.chain # real 36m42.170s # user 4m55.970s # sys 1m49.840s time chainSplit chain all.chain # real 13m54.860s # user 4m50.370s # sys 1m3.260s # optionally: rm run1/chain/*.chain # Load chains into database # next machine ssh hgwdev cd /cluster/data/hg17/bed/blastz.rn3/axtChain/chain foreach i (*.chain) set c = $i:r hgLoadChain hg17 ${c}_chainRn3 $i echo done $c end # featureBits hg17 chainRn3 # 2827052992 bases of 2866216770 (98.634%) in intersection # (with filter:) 2826192649 bases of 2866216770 (98.604%) in intersection # featureBits hg16 chainRn3 # 2830563493 bases of 2865248791 (98.789%) in intersection # NET RN3 (DONE - 2004-06-15 - Hiram) # Re-done due to Chain being re-done 2004-06-23 # NOTE: Redo net axt's and net maf's to fix overlaps, # (using 8/05 netToAxt). (2005-08-16 kate) ssh eieio cd /cluster/data/hg17/bed/blastz.rn3/axtChain mkdir preNet cd chain foreach i (*.chain) echo preNetting $i /cluster/bin/i386/chainPreNet $i /cluster/data/hg17/chrom.sizes \ /cluster/data/rn3/chrom.sizes ../preNet/$i end cd .. mkdir n1 cd preNet foreach i (*.chain) set n = $i:r.net echo primary netting $i /cluster/bin/i386/chainNet $i -minSpace=1 /cluster/data/hg17/chrom.sizes \ /cluster/data/rn3/chrom.sizes ../n1/$n /dev/null end cd .. 
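# (For the record, a hedged summary of this pipeline: chainPreNet drops
# chains that have no chance of contributing to the net, chainNet builds
# the target-side net from the survivors (-minSpace=1 records fills and
# gaps down to a single base; the query-side net goes to /dev/null), and
# netSyntenic, run next, annotates the net with synteny measurements.)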
cat n1/*.net | /cluster/bin/i386/netSyntenic stdin hNoClass.net # memory usage 2510467072, utime 19307 s/100, stime 3181 ssh hgwdev cd /cluster/data/hg17/bed/blastz.rn3/axtChain time netClass hNoClass.net hg17 rn3 rat.net \ -tNewR=/cluster/bluearc/scratch/hg/gs.18/build35/linSpecRep.notInRat \ -qNewR=/cluster/bluearc/rat/rn3/linSpecRep.notInHuman # real 34m29.829s # user 11m30.440s # sys 1m52.730s # If things look good do ssh eieio cd /cluster/data/hg17/bed/blastz.rn3/axtChain rm -r n1 hNoClass.net # Make a 'syntenic' subset of these with time netFilter -syn rat.net > ratSyn.net # real 16m25.640s # user 7m41.330s # sys 1m1.150s # Load the nets into database ssh hgwdev cd /cluster/data/hg17/bed/blastz.rn3/axtChain netFilter -minGap=10 rat.net | hgLoadNet hg17 netRn3 stdin netFilter -minGap=10 ratSyn.net | hgLoadNet hg17 syntenyNetRn3 stdin # real 37m0.199s # user 15m13.770s # sys 1m41.540s # check results # featureBits hg17 netRn3 # 2817656275 bases of 2866216770 (98.306%) in intersection # (with axtFilter) 2816623107 bases of 2866216770 (98.270%) in intersection # featureBits hg16 netRn3 # 2820958389 bases of 2865248791 (98.454%) in intersection # featureBits hg17 syntenyNetRn3 # 2781748096 bases of 2866216770 (97.053%) in intersection # (with axtFilter) 2780883450 bases of 2866216770 (97.023%) in intersection # featureBits hg16 syntenyNetRn3 # 2784011730 bases of 2865248791 (97.165%) in intersection # Add entries for net and chain to rat/hg17 trackDb # make net ssh eieio cd /cluster/data/hg17/bed/blastz.rn3/axtChain mkdir ratNet time netSplit rat.net ratNet # real 12m1.478s # user 8m35.050s # sys 1m7.230s # extract axts from net mkdir ../axtNet ../mafNet cat << 'EOF' > makeMaf.csh foreach n (ratNet/chr*.net) set c=$n:t:r echo $c netToAxt ratNet/$c.net chain/$c.chain \ /cluster/data/hg17/nib /cluster/data/rn3/nib stdout | \ axtSort stdin ../axtNet/$c.axt axtToMaf ../axtNet/$c.axt \ /cluster/data/hg17/chrom.sizes /cluster/data/rn3/chrom.sizes \ ../mafNet/$c.maf -tPrefix=hg17. -qPrefix=rn3. end 'EOF' csh makeMaf.csh >&! makeMaf.log & tail -100f makeMaf.log mkdir -p /cluster/bluearc/hg17/mafNet cp -rp ../mafNet /cluster/bluearc/hg17/mafNet/rn3 ssh hgwdev mkdir -p /cluster/data/hg17/bed/blastz.rn3/axtBest cd /cluster/data/hg17/bed/blastz.rn3/axtBest ln -s ../axtNet/chr*.axt . 
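# (The axtBest symlinks above work because net-derived axt's are already
# single-coverage, best-in-genome on the target side, so axtNet can
# stand in for axtBest here.)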
# copy net axt's to download area ssh hgwdev cd /cluster/data/hg17/bed/blastz.rn3/axtNet mkdir -p /usr/local/apache/htdocs/goldenPath/hg17/vsRn3/axtNet cp -p *.axt /usr/local/apache/htdocs/goldenPath/hg17/vsRn3/axtNet cd /usr/local/apache/htdocs/goldenPath/hg17/vsRn3/axtNet gzip *.axt # add README.txt file to dir (use previous assembly's copy as template) # Convert those axt files to psl ssh eieio cd /cluster/data/hg17/bed/blastz.rn3 mkdir pslBest foreach a (axtBest/chr*.axt) set c=$a:t:r echo "processing $c.axt -> ${c}_blastzBestRn3.psl" /cluster/bin/i386/axtToPsl axtBest/${c}.axt \ S1.len S2.len pslBest/${c}_blastzBestRn3.psl echo "Done: ${c}_blastzBestRn3.psl" end # Load tables ssh hgwdev cd /cluster/data/hg17/bed/blastz.rn3/pslBest for I in chr*BestRn3.psl do /cluster/bin/i386/hgLoadPsl -noTNameIx hg17 ${I} echo "done ${I}" done # check results # featureBits hg17 blastzBestRn3 # 975533772 bases of 2866216770 (34.036%) in intersection # (with axtFilter) 970005525 bases of 2866216770 (33.843%) in intersection # featureBits hg16 blastzBestRn3 # 976121391 bases of 2865248791 (34.068%) in intersection # Make /gbdb links and add them to the axtInfo table: mkdir -p /gbdb/hg17/axtBest/Rn3 cd /gbdb/hg17/axtBest/Rn3 ln -s /cluster/data/hg17/bed/blastz.rn3/axtNet/chr*.axt . cd /cluster/data/hg17/bed/blastz.rn3/axtNet rm -f axtInfoInserts.sql foreach f (/gbdb/hg17/axtBest/Rn3/chr*.axt) set chr=$f:t:r echo "INSERT INTO axtInfo (species, alignment, chrom, fileName) \ VALUES ('rn3','Blastz Best in Genome','$chr','$f');" \ >> axtInfoInserts.sql end hgsql hg17 < ~/kent/src/hg/lib/axtInfo.sql # table axtInfo may already exist, ignore create error. hgsql hg17 < axtInfoInserts.sql # MAKING RAT SYNTENY (DONE - 2004-06-30 - Hiram) # Re-Done after above done without the axtFilter ssh hgwdev mkdir /cluster/data/hg17/bed/syntenyRn3 cd /cluster/data/hg17/bed/syntenyRn3 # Copy all the needed scripts from /cluster/data/hg16/bed/syntenyMm3 cp -p /cluster/data/hg16/bed/syntenyMm3/*.pl . cp -p /cluster/data/hg16/bed/syntenyMm3/*.sh . ./syntenicBest.pl -db=hg17 -table=blastzBestRn3 ./smooth.pl ./joinsmallgaps.pl ./fillgap.pl -db=hg17 -table=blastzBestRn3 ./synteny2bed.pl # The five commands above # real 196m2.565s # user 0m21.170s # sys 0m4.690s # Used to load this in syntenyRn3, but that type is misleading to # the table browser and fails the checkTableCoords check. 
# Better to use this ensRn3MusHom type:
sed -e 's/ensPhusionBlast/ensRn3MusHom/g' \
	$HOME/kent/src/hg/lib/ensPhusionBlast.sql \
	> ensRn3MusHom.sql
hgLoadBed hg17 ensRn3MusHom ucsc100k.bed -sqlTable=ensRn3MusHom.sql
# featureBits hg17 ensRn3MusHom
# 2592164486 bases of 2866216770 (90.439%) in intersection
# featureBits hg16 syntenyRn3
# 2595919851 bases of 2865248791 (90.600%) in intersection

# MAKING RAT AXTTIGHT FROM AXTBEST (DONE - 2004-06-15 - Hiram)
# After creating axtBest alignments above, use subsetAxt to get axtTight:
ssh eieio
cd /cluster/data/hg17/bed/blastz.rn3/axtNet
mkdir -p ../axtTight
foreach i (*.axt)
    echo $i
    subsetAxt $i ../axtTight/$i \
	~kent/src/hg/mouseStuff/subsetAxt/coding.mat 3400
end
# translate to psl
cd ../axtTight
mkdir ../pslTight
foreach i (*.axt)
    set c = $i:r
    axtToPsl $i ../S1.len ../S2.len ../pslTight/${c}_blastzTightRn3.psl
    echo "Done: $i"
end
# Load tables into database
ssh hgwdev
cd /cluster/data/hg17/bed/blastz.rn3/pslTight
for I in chr*TightRn3.psl
do
    /cluster/bin/i386/hgLoadPsl -noTNameIx hg17 ${I}
    echo "done ${I}"
done
# Compare results with previous assembly
# featureBits hg17 blastzTightRn3
# 153936720 bases of 2866216770 (5.371%) in intersection
# featureBits hg16 blastzTightRn3
# 153151903 bases of 2865248791 (5.345%) in intersection
# copy axt's to download area
ssh hgwdev
cd /cluster/data/hg17/bed/blastz.rn3/axtTight
mkdir -p /usr/local/apache/htdocs/goldenPath/hg17/vsRn3/axtTight
cp -p *.axt /usr/local/apache/htdocs/goldenPath/hg17/vsRn3/axtTight
cd /usr/local/apache/htdocs/goldenPath/hg17/vsRn3/axtTight
gzip *.axt
# add README.txt file to dir (use previous assembly's copy as template)

# REDO downloads with fixed axtNet's (2005-09-13 kate)
ssh hgwdev
cd /usr/local/apache/htdocs/goldenPath/hg17/vsRn3
mv axtNet axtNet.old
nice cp -rp /cluster/data/hg17/bed/blastz.rn3/axtNet .
cd axtNet
nice gzip *.axt
md5sum *.axt.gz > md5sum.txt

# BLASTZ RN3 CLEAN UP (DONE - 2004-07-02 - Hiram)
ssh eieio
cd /cluster/data/hg17/bed/blastz.rn3
nice rm -rf raw &
nice rm axtChain/run1/chain/* &
nice rm -fr axtChain/n1 axtChain/hNoClass.net &
nice gzip axtChrom/* pslChrom/* lav/*/* axtChain/all.chain axtChain/*.net &

# BLASTZ CHICKEN (GALGAL2) (DONE - 2004-06-14 - Fan)
ssh kk
mkdir /cluster/data/hg17/bed/blastz.galGal2.2004-06-14
cd /cluster/data/hg17/bed
ln -s /cluster/data/hg17/bed/blastz.galGal2.2004-06-14 blastz.galGal2
cd blastz.galGal2
# Set L=10000 (higher threshold on blastz's outer loop) and abridge
# repeats.
cat << '_EOF_' > DEF
# human vs. chicken
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/i386:/cluster/home/angie/schwartzbin
ALIGN=blastz-run
BLASTZ=blastz
# Specific settings for chicken (per Webb email to Brian Raney)
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=10000
BLASTZ_K=2200
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
BLASTZ_ABRIDGE_REPEATS=1
# TARGET: Human
SEQ1_DIR=/iscratch/i/hg17/bothMaskedNibs
SEQ1_RMSK=
SEQ1_FLAG=
SEQ1_SMSK=/iscratch/i/gs.18/build35/linSpecRep.chicken
SEQ1_IN_CONTIGS=0
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Chicken
SEQ2_DIR=/iscratch/i/galGal2/nib
SEQ2_RMSK=
SEQ2_FLAG=
SEQ2_SMSK=/iscratch/i/galGal2/linSpecRep
SEQ2_IN_CONTIGS=0
SEQ2_CHUNK=10000000
SEQ2_LAP=0
BASE=/cluster/store5/gs.18/build35/bed/blastz.galGal2
DEF=$BASE/DEF
RAW=$BASE/raw
CDBDIR=$BASE
SEQ1_LEN=$BASE/S1.len
SEQ2_LEN=$BASE/S2.len
'_EOF_'
# << this line keeps emacs coloring happy
# prepare first cluster run
ssh kk
cd /cluster/data/hg17/bed/blastz.galGal2
bash
# script copied over from /cluster/data/mm4/jkStuff/BlastZ_run0.sh
# it is a generic script and works for any assembly
/cluster/data/hg17/jkStuff/BlastZ_run0.sh
cd run.0
para try, check, push, check, ....
Completed: 41943 of 41943 jobs
CPU time in finished jobs: 15330421s 255507.02m 4258.45h 177.44d 0.486 y
IO & Wait Time: 673809s 11230.15m 187.17h 7.80d 0.021 y
Average job time: 382s 6.36m 0.11h 0.00d
Longest job: 4651s 77.52m 1.29h 0.05d
Submission to last job: 169197s 2819.95m 47.00h 1.96d
# Second cluster run to convert the .out's to .lav's
# You do NOT want to run this on the big cluster.  It brings
# the file server to its knees.  Run this on the small cluster.
ssh kki
cd /cluster/data/hg17/bed/blastz.galGal2
bash
# script copied over from /cluster/data/mm4/jkStuff/BlastZ_run1.sh
# fixup machine check, should be kki, not kk
/cluster/data/hg17/jkStuff/BlastZ_run1.sh
cd run.1
para try, check, push, etc ...
# Completed: 341 of 341 jobs
# CPU time in finished jobs: 1894s 31.56m 0.53h 0.02d 0.000 y
# IO & Wait Time: 6271s 104.52m 1.74h 0.07d 0.000 y
# Average job time: 24s 0.40m 0.01h 0.00d
# Longest job: 131s 2.18m 0.04h 0.00d
# Submission to last job: 590s 9.83m 0.16h 0.01d
# Third cluster run to convert lav's to axt's
cd /cluster/data/hg17/bed/blastz.galGal2
# The copy of this in mm4 was broken, fixed here
/cluster/data/hg17/jkStuff/BlastZ_run2.sh
cd run.2
para try, check, push, etc ...
# Completed: 46 of 46 jobs
# CPU time in finished jobs: 426s 7.09m 0.12h 0.00d 0.000 y
# IO & Wait Time: 7283s 121.39m 2.02h 0.08d 0.000 y
# Average job time: 168s 2.79m 0.05h 0.00d
# Longest job: 642s 10.70m 0.18h 0.01d
# Submission to last job: 642s 10.70m 0.18h 0.01d
# translate sorted axt files into psl
ssh eieio
cd /cluster/data/hg17/bed/blastz.galGal2
mkdir pslChrom
set tbl = "blastzGalGal2"
foreach f (axtChrom/chr*.axt)
    set c=$f:t:r
    echo "Processing chr $c"
    /cluster/bin/i386/axtToPsl $f S1.len S2.len pslChrom/${c}_${tbl}.psl
end
# That takes about 30 minutes
# Load database tables
ssh hgwdev
cd /cluster/data/hg17/bed/blastz.galGal2/pslChrom
bash
for I in *.psl
do
    /cluster/bin/i386/hgLoadPsl -noTNameIx hg17 ${I}
    echo "done: ${I}"
done

# GNF ATLAS 2 (DONE - 2004-07-14 - Hiram)
# Align probes from GNF1H chip.
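# (GNF1H is the custom-design half of the GNF Atlas 2 chip set; the
# standard U133A half is not realigned here -- its alignments are
# pulled out of the affyUclaNorm psl further down.)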
ssh kk
cd /cluster/data/hg17/bed
mkdir -p geneAtlas2/run/psl
cd geneAtlas2/run
# This bluearc/geneAtlas2 directory already exists
# mkdir -p /cluster/bluearc/geneAtlas2
# cp /projects/compbio/data/microarray/geneAtlas2/human/gnf1h.fa /cluster/bluearc/geneAtlas2
ls -1 /scratch/hg/gs.18/build35/maskedContigs > genome.lst
ls -1 /cluster/bluearc/geneAtlas2/gnf1h.fa > mrna.lst
cat << '_EOF_' > gsub
#LOOP
blat -fine -ooc=/scratch/hg/h/11.ooc /scratch/hg/gs.18/build35/maskedContigs/$(path1) $(path2) {check out line+ psl/$(root1)_$(root2).psl}
#ENDLOOP
'_EOF_'
# << this line makes emacs coloring happy
gensub2 genome.lst mrna.lst gsub jobList
para create jobList
para try
para check
para push
para time
# Completed: 380 of 380 jobs
# CPU time in finished jobs: 10599s 176.65m 2.94h 0.12d 0.000 y
# IO & Wait Time: 3893s 64.88m 1.08h 0.05d 0.000 y
# Average job time: 38s 0.64m 0.01h 0.00d
# Longest job: 649s 10.82m 0.18h 0.01d
# Submission to last job: 663s 11.05m 0.18h 0.01d
# Do sort, best in genome filter, and convert to chromosome coordinates
# to create gnf1h.psl.
pslSort dirs raw.psl tmp psl
pslReps -minCover=0.3 -minAli=0.95 -nearTop=0.005 raw.psl \
	contig.psl /dev/null
# Processed 80818 alignments
liftUp ../affyGnf1h.psl ../../../jkStuff/liftAll.lft warn contig.psl
rm -r contig.psl raw.psl psl
# Load probes and alignments from GNF1H into database.
ssh hgwdev
cd /cluster/data/hg17/bed/geneAtlas2
# Already symlinked
# ln -s /projects/compbio/data/microarray/geneAtlas2/human/gnf1h.fa \
#	/gbdb/hgFixed/affyProbes
hgLoadPsl hg17 affyGnf1h.psl
hgLoadSeq hg17 /gbdb/hgFixed/affyProbes/gnf1h.fa
grep -v U133B ../affyUclaNorm/hg17.affyU133AB_all.lifted.pslReps.psl \
	| sed -e "s/exemplar://; s/consensus://; s/U133A://" \
	| sed -e "s/;//" > affyU133A.psl
hgMapMicroarray gnfAtlas2.bed hgFixed.gnfHumanAtlas2MedianRatio \
	affyU133A.psl /cluster/data/hg17/bed/geneAtlas2/affyGnf1h.psl
# Loaded 44696 rows of expression data from hgFixed.gnfHumanAtlas2MedianRatio
# Mapped 32857, multiply-mapped 1462, missed 49, unmapped 11839
hgLoadBed hg17 gnfAtlas2 gnfAtlas2.bed
# Loaded 34319 elements of size 15

# LOAD SNPS (Daryl Thomas; November 7, 2004; snpExceptions added
# January 8, 2005; updated to build 124 on January 13, 2005; added
# affy snps March 5, 2005)
set db = hg17
set org = human
set build = 124
set dir = /cluster/bluearc/snp/$db/build$build
# ssh to some quiet machine with fast access to the bluearc
# it takes ~4.5 hours to download the data
# (build 124 directly to /cluster/bluearc/... from eieio)
# Check to make sure the chrMT file is included
mkdir -p $dir $dir/ds_ch.xml $dir/det $dir/str $dir/loc $dir/seq
cd $dir
ln -s /cluster/data/$db/jkStuff/liftAll.lft .
screen
ftp ftp.ncbi.nih.gov
cd snp/$org/XML
prompt
mget ds_ch*.xml.gz
exit # screen
exit # machine
# TODO: check chromStart for each locType
cp -f {$HOME}/kent/src/hg/snp/parseDbSnpXML /cluster/bin/scripts
chmod 775 /cluster/bin/scripts/parseDbSnpXML
#ssh kk
touch jobList
foreach file ( /cluster/bluearc/snp/$db/build$build/ds_ch*.xml.gz )
    set out = $file:t:r
    echo /cluster/bin/scripts/parseDbSnpXML $file /cluster/bluearc/snp/$db/build$build $out.contig >> jobList
end
# I removed ds_chMulti.xml.gz and ds_chNotOn.xml.gz from the job list
# para create jobList; para push; para check ...
#Completed: 25 of 25 jobs
#CPU time in finished jobs: 30120s 502.01m 8.37h 0.35d 0.001 y
#IO & Wait Time: 2533s 42.21m 0.70h 0.03d 0.000 y
#Average job time: 1306s 21.77m 0.36h 0.02d
#Longest job: 2611s 43.52m 0.73h 0.03d
#Submission to last job: 2611s 43.52m 0.73h 0.03d
exit # kk
mv $dir /cluster/data/$db/bed/snp/build$build
set dir = /cluster/data/$db/bed/snp/build$build
cd $dir
# concatenate the details files to make it easier to lift (and load)
time zcat det/ds_ch*.xml.contig.det.gz > $db.build$build.contig.bed
# 33.380u 24.470s 1:54.79 50.3% 0+0k 0+0io 86pf+0w (hgwdev)
time gzip $db.build$build.contig.bed
# 251.160u 16.770s 12:40.77 35.2% 0+0k 0+0io 83pf+0w (hgwdev/bluearc - should have done it on eieio/store5)
# some of the NT contigs are not in the liftSpec - this is expected, as
# snps that map to alternate assemblies (Celera) are in the original
# files, but we disregard their mappings.
time liftUp $db.build$build.bed liftAll.lft warn $db.build$build.contig.bed.gz
# 232.260u 30.050s 5:09.04 84.8% 0+0k 0+0io 379pf+0w (hgwdev/store5)
time gzip hg17.build124.bed
# 141.980u 8.180s 2:34.43 97.2% 0+0k 0+0io 83pf+0w
# hgLoadBed is the important step - check to make sure there are no warnings
time hgLoadBed $db snp $db.build$build.bed.gz -sqlTable=${HOME}/kent/src/hg/lib/snp.sql
# Loaded 9131054 elements of size 16
# 225.040u 37.030s 35:20.45 12.3% 0+0k 0+0io 308pf+0w
# basic snp table is now loaded, but exception column needs to be updated
# ~ 3 hours wall clock time from here to end
# run queries from snpException.query against snp table
mkdir -p /usr/local/apache/htdocs/qa/test-results/snpException/build$build
cd /usr/local/apache/htdocs/qa/test-results/snpException/build$build
time snpException hg17 0 ${db}snpException > ${db}snpException.log
chmod o+rx .
chmod o+r *
# 10.610u 19.200s 53:59.98 0.9% 0+0k 0+0io 264pf+0w
# check alignment of flanking sequences
time snpValid $db /cluster/data/$db/bed/snp/build$build/seq > ${db}snpValid.log
# 5205.860u 216.570s 1:55:10.27 78.4% 0+0k 0+0io 72408pf+0w (hgwdev)
### NOTE: the pseudoautosomal snps are reported in the chrX files
### only, which causes problems for snpValid when checking the
### chrY snp mappings.  I got around this by confirming that all
### of the 'missing flank' errors (#23) were in pseudoautosomal
### regions and ignoring them.  I manually truncated the
### hg17snpException.23.bed file before continuing with the next
### step.  This could/should be fixed in the next iteration.
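# (A hedged sketch of that manual truncation -- the filter below is
# illustrative, not the command that was actually run:
#	mv hg17snpException.23.bed hg17snpException.23.bed.orig
#	awk '$1 != "chrY"' hg17snpException.23.bed.orig > hg17snpException.23.bed
# i.e. drop the chrY 'missing flank' rows, which were confirmed above
# to be pseudoautosomal; a sturdier fix would filter on the PAR
# coordinate ranges instead of the whole chromosome.)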
# create list of statements to update the snp table and run them time tail +3 ${db}snpException.* | awk '/rs/ {printf "%s\t%d\t%d\n",$4,$2,$5}' | sort -k1,2n > exceptionList.txt # ~10 seconds time updateExceptionList.pl < exceptionList.txt > updateExceptionList.sql # 7.250u 0.390s 0:07.87 97.0% 0+0k 0+0io 337pf+0w time hgsql hg17 < updateExceptionList.sql # 8.420u 10.370s 11:58.44 2.6% 0+0k 0+0io 413pf+0w build123 (this is mostly a mysql process) # 6.550u 9.370s 14:34.17 1.8% 0+0k 0+0io 413pf+0w build124 # > wc -l build12*/updateExceptionList.sql # 387166 build123/updateExceptionList.sql # 383759 build124/updateExceptionList.sql # Add Affy SNPs from new submission #!/bin/csh -fe # rm -f log ; date ; ./loadAffySnps.csh > & log ; date ; cat log set db = hg17 cd /cluster/data/$db/bed/snp/affy/latest touch affy.txt affy.bed Affy.bed bed.tab rm -f affy*.txt affy*.bed Affy.bed* bed.tab # datafile was provided by Valmeekam, Venu [Venu_Valmeekam@affymetrix.com] tar xfz affyhg17maps_withstrand_alleles.tgz wc -l affy*txt awk '$1 !~ /^chrom/ {printf("chr%s\t%d\t%d\t%s\t0\t%s\t%s\tunknown\tsnp\tunknown\t0\t0\tunknown\texact\tAffy10K\t0\n", $1,$2,$3,$4,$6,$7);}' < affy10K.txt > affy10K.bed awk '$1 !~ /^chrom/ {printf("chr%s\t%d\t%d\t%s\t0\t%s\t%s\tunknown\tsnp\tunknown\t0\t0\tunknown\texact\tAffy10Kv2\t0\n", $1,$2,$3,$4,$6,$7);}' < affy10Kv2.txt > affy10Kv2.bed awk '$1 !~ /^chrom/ {printf("chr%s\t%d\t%d\t%s\t0\t%s\t%s\tunknown\tsnp\tunknown\t0\t0\tunknown\texact\tAffy50K_HindIII\t0\n",$1,$2,$3,$4,$6,$7);}' < affy50K_HindIII.txt > affy50K_HindIII.bed awk '$1 !~ /^chrom/ {printf("chr%s\t%d\t%d\t%s\t0\t%s\t%s\tunknown\tsnp\tunknown\t0\t0\tunknown\texact\tAffy50K_XbaI\t0\n", $1,$2,$3,$4,$6,$7);}' < affy50K_XbaI.txt > affy50K_XbaI.bed # this is a temporary kluge to fix some bad input data. cat affy*.bed | sed 's/_par//' > Affy.bed # the source enum for 'dbSnp' is 2; all of the affy* values are higher. 
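# (To double-check that enum assumption before deleting -- hedged
# sketch; the numeric code of each source value is its 1-based position
# in the enum declaration of the column:
#	hgsql $db -e "show columns from snp like 'source'"
# )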
hgsql $db -e "delete from snp where source > 2 " hgLoadBed $db snp Affy.bed -oldTable -tab rm -f affy*.txt affy*.bed bed.tab gzip Affy.bed #mysql> select source, count(*) from hg17.snp group by source; #+-----------------+----------+ #| source | count(*) | #+-----------------+----------+ #| dbSnp | 9131054 | #| Affy10K | 11344 | #| Affy10Kv2 | 10032 | #| Affy50K_HindIII | 56859 | #| Affy50K_XbaI | 58494 | #+-----------------+----------+ #March 7, 2005: fix pseudoautosomal snps: #SNP_A-1606360 #SNP_A-1606329 #SNP_A-1666553 #SNP_A-1715750 #SNP_A-1726331 #SNP_A-1685712 #SNP_A-1735899 #SNP_A-1726272 #SNP_A-1660936 #SNP_A-1662285 #SNP_A-1680848 #SNP_A-1671440 #SNP_A-1719355 #SNP_A-1716499 #SNP_A-1643847 #SNP_A-1646007 #SNP_A-1715285 #SNP_A-1657714 #SNP_A-1725038 #SNP_A-1713938 #SNP_A-1708565 #SNP_A-1510243 #SNP_A-1510197 #SNP_A-1606356 delete from snp where chrom = 'chrY' and name in ('SNP_A-1606360','SNP_A-1606329','SNP_A-1666553','SNP_A-1715750','SNP_A-1726331','SNP_A-1685712','SNP_A-1735899','SNP_A-1726272','SNP_A-1660936','SNP_A-1662285','SNP_A-1680848','SNP_A-1671440','SNP_A-1719355','SNP_A-1716499','SNP_A-1643847','SNP_A-1646007','SNP_A-1715285','SNP_A-1657714','SNP_A-1725038','SNP_A-1713938','SNP_A-1708565','SNP_A-1510243','SNP_A-1510197','SNP_A-1606356'); update snp set chrom = 'chrX' where name in ('SNP_A-1606360','SNP_A-1606329','SNP_A-1666553','SNP_A-1715750','SNP_A-1726331','SNP_A-1685712','SNP_A-1735899','SNP_A-1726272','SNP_A-1660936','SNP_A-1662285','SNP_A-1680848','SNP_A-1671440','SNP_A-1719355','SNP_A-1716499','SNP_A-1643847','SNP_A-1646007','SNP_A-1715285','SNP_A-1657714','SNP_A-1725038','SNP_A-1713938','SNP_A-1708565','SNP_A-1510243','SNP_A-1510197','SNP_A-1606356'); insert into snp select bin, 'chrY' as chrom, chromStart, chromEnd, name, score, strand, observed, molType, class, valid, avHet, avHetSE, func, locType, source, exception from snp where name in ('SNP_A-1606360','SNP_A-1606329','SNP_A-1666553','SNP_A-1715750','SNP_A-1726331','SNP_A-1685712','SNP_A-1735899','SNP_A-1726272','SNP_A-1660936','SNP_A-1662285','SNP_A-1680848','SNP_A-1671440','SNP_A-1719355','SNP_A-1716499','SNP_A-1643847','SNP_A-1646007','SNP_A-1715285','SNP_A-1657714','SNP_A-1725038','SNP_A-1713938','SNP_A-1708565','SNP_A-1510243','SNP_A-1510197','SNP_A-1606356'); select chrom, count(*) from snp where name in ('SNP_A-1606360','SNP_A-1606329','SNP_A-1666553','SNP_A-1715750','SNP_A-1726331','SNP_A-1685712','SNP_A-1735899','SNP_A-1726272','SNP_A-1660936','SNP_A-1662285','SNP_A-1680848','SNP_A-1671440','SNP_A-1719355','SNP_A-1716499','SNP_A-1643847','SNP_A-1646007','SNP_A-1715285','SNP_A-1657714','SNP_A-1725038','SNP_A-1713938','SNP_A-1708565','SNP_A-1510243','SNP_A-1510197','SNP_A-1606356') group by chrom;; ## LS-SNP links [load data only] (Daryl Thomas; November 3, 2005) # Data from Rachel Karchin in the Andrej Sali lab at UCSF # /cluster/data/hg17/bed/lssnp hgsql hg17 < ${HOME}/kent/src/hg/lib/lsSnpFunction.sql hgsql hg17 < ${HOME}/kent/src/hg/lib/lsSnpStructure.sql mysql> load data local infile "snp-human3-function-predictions.txt" into table lsSnpFunction; Query OK, 24337 rows affected (1.27 sec) mysql> load data local infile "snp-human3-structure-predictions.txt" into table lsSnpStructure; Query OK, 34764 rows affected (2.36 sec) # Tajima's D (DONE -- 2005-09-20 -- Daryl) # Data from Chris Carlson in Debbie Nickerson's lab # Chris Carlson [csc47uwashingtonedu] # get data from ftp site, unpack in $dir: # tar tvfz *gz | more # -rw-r--r-- chris/admin 34405061 2005-06-03 13:22:15 AD.SNP.track # -rw-r--r-- 
chris/admin 29869512 2005-06-03 13:22:30 ED.SNP.track # -rw-r--r-- chris/admin 27154049 2005-06-03 13:22:41 XD.SNP.track # -rw-r--r-- chris/admin 10948753 2005-06-02 21:12:27 AD.tajd.track # -rw-r--r-- chris/admin 10928630 2005-06-02 21:12:39 ED.tajd.track # -rw-r--r-- chris/admin 10926122 2005-06-02 21:12:51 XD.tajd.track set db=hg17 set dir=/cluster/data/$db/bed/tajdpoly/latest cd $dir tar xvfz TajDtracks.tar.gz mac2unix < AD.SNP.track | grep -v track | sed 's/1\.02e+08/102000000/;s/8\.8e+07/88000000/;s/1\.5e+07/15000000/' > hg17.tajdSnpAd.bed mac2unix < ED.SNP.track | grep -v track | sed 's/1\.02e+08/102000000/;s/8\.8e+07/88000000/;s/1\.5e+07/15000000/' > hg17.tajdSnpEd.bed mac2unix < XD.SNP.track | grep -v track | sed 's/1\.02e+08/102000000/;s/8\.8e+07/88000000/;s/1\.5e+07/15000000/' > hg17.tajdSnpXd.bed mac2unix < AD.tajd.track | grep -v track | awk '{printf"%s\t%s\t%d\t%.3f\n",$1,$2,$3,$4}' > hg17.tajdAd.bedGraph mac2unix < ED.tajd.track | grep -v track | awk '{printf"%s\t%s\t%d\t%.3f\n",$1,$2,$3,$4}' > hg17.tajdEd.bedGraph mac2unix < XD.tajd.track | grep -v track | awk '{printf"%s\t%s\t%d\t%.3f\n",$1,$2,$3,$4}' > hg17.tajdXd.bedGraph set chain = /cluster/data/hg17/bed/bedOver/hg17ToHg16.over.chain foreach pop (Ad Ed Xd) liftOver hg17.tajdSnp$pop.bed $chain hg16.tajdSnp$pop.bed hg17ToHg16.tajdSnp$pop.unmapped liftOver hg17.tajd$pop.bedGraph $chain hg16.tajd$pop.bedGraph hg17ToHg16.tajd$pop.unmapped foreach db (hg16 hg17) hgLoadBed -bedGraph=4 $db tajd$pop $db.tajd$pop.bedGraph hgLoadBed $db tajdSnp$pop $db.tajdSnp$pop.bed end end set where1 = "where t.bin=g.bin and t.chrom=g.chrom and (t.chromStart between g.chromStart and g.chromEnd or t.chromEnd between g.chromStart and g.chromEnd)" set where2 = "t, chromInfo c where t.chromStart < 0 or (t.chrom=c.chrom and t.chromEnd > c.size)" set list = "as pop, t.chrom, t.chromStart from" foreach db (hg16 hg17) rm -f $db.delete.sql touch $db.delete.sql foreach p (Ad Ed Xd SnpAd SnpEd SnpXd) foreach c (1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 X Y) echo "select 'tajd$p' $list tajd${p} t,chr${c}_gap g $where1" | \ hgsql $db | \ grep -v pop | \ awk '{printf "delete from %s where chrom=\"%s\" and chromStart=%d;\n",$1,$2,$3}' \ >> $db.delete.sql end echo "select 'tajd$p' $list tajd${p} $where2" | \ hgsql $db | \ grep -v pop | \ awk '{printf "delete from %s where chrom=\"%s\" and chromStart=%d;\n",$1,$2,$3}'\ >> $db.delete.sql end hgsql $db < $db.delete.sql end # GENE SORTER (AKA: FAMILY BROWSER) (DONE - 2004-06-16 - Hiram) # Added knownToU133Plus2 track (2004-10-14) # to be done after knownGene tables are complete from known gene # process. # # Cluster together various alt-splicing isoforms. # Creates the knownIsoforms and knownCanonical tables ssh hgwdev mkdir /cluster/data/hg17/bed/geneSorter.2004-06-15 ln -s /cluster/data/hg17/bed/geneSorter.2004-06-15 \ /cluster/data/hg17/bed/geneSorter cd /cluster/data/hg17/bed/geneSorter hgClusterGenes hg17 knownGene knownIsoforms knownCanonical # Extract peptides from knownGenes into fasta file # and create a blast database out of them. 
mkdir /cluster/data/hg17/bed/geneSorter/blastp cd /cluster/data/hg17/bed/geneSorter/blastp pepPredToFa hg17 knownGenePep known.faa # You may need to build this binary in src/hg/near/pepPredToFa /scratch/blast/formatdb -i known.faa -t known -n known # This command is in /projects/compbio/bin/$MACH/formatdb # Copy over database to bluearc rm -fr /cluster/bluearc/hg17/blastp mkdir -p /cluster/bluearc/hg17/blastp cp -p /cluster/data/hg17/bed/geneSorter/blastp/known.* \ /cluster/bluearc/hg17/blastp # Had to pick up a new blastall binary (2004-06-15) # Our old one would no longer run on our systems that have # updated Linux versions mkdir /cluster/bluearc/blast229 cd /cluster/bluearc/blast229 wget --timestamping \ ftp://ftp.ncbi.nlm.nih.gov/blast/executables/release/2.2.9/blast-2.2.9-ia32-linux.tar.gz wget --timestamping \ ftp://ftp.ncbi.nlm.nih.gov/blast/executables/release/2.2.9/ChangeLog.txt wget --timestamping \ ftp://ftp.ncbi.nlm.nih.gov/blast/executables/release/2.2.9/ReleaseNotes.txt tar xvzf blast-2.2.9-ia32-linux.tar.gz # Split up fasta file into bite sized chunks for cluster cd /cluster/data/hg17/bed/geneSorter/blastp mkdir split faSplit sequence known.faa 8000 split/kg # Make parasol run directory ssh kk mkdir /cluster/data/hg17/bed/geneSorter/blastp/self cd /cluster/data/hg17/bed/geneSorter/blastp/self mkdir run cd run mkdir out # Make blast script cat << '_EOF_' > blastSome #!/bin/sh BLASTMAT=/cluster/bluearc/blast229/data /cluster/bluearc/blast229/blastall \ -p blastp -d /cluster/bluearc/hg17/blastp/known -i $1 -o $2 \ -e 0.01 -m 8 -b 1000 '_EOF_' # << keep emacs happy chmod +x blastSome # Make gensub2 file cat << '_EOF_' > gsub #LOOP blastSome {check in line+ $(path1)} {check out line out/$(root1).tab} #ENDLOOP '_EOF_' # << keep emacs happy # Create parasol batch # 'ls ../../split/*.fa' is too much, hence the echo echo ../../split/*.fa | wordLine stdin > split.lst gensub2 split.lst single gsub jobList para create jobList para try # Wait a couple of minutes, and do a para check, if all is good # then do a para push # This should finish in ~15 minutes if the cluster is free. Completed: 7749 of 7749 jobs CPU time in finished jobs: 182148s 3035.81m 50.60h 2.11d 0.006 y IO & Wait Time: 22954s 382.56m 6.38h 0.27d 0.001 y Average job time: 26s 0.44m 0.01h 0.00d Longest job: 372s 6.20m 0.10h 0.00d Submission to last job: 871s 14.52m 0.24h 0.01d # Load into database. 
# This takes about 30 minutes
ssh hgwdev
cd /cluster/data/hg17/bed/geneSorter/blastp/self/run/out
time hgLoadBlastTab hg17 knownBlastTab *.tab
# Scanning through 7749 files
# Loading database with 11799667 rows
# Hg16 was: 11376875 rows
# real 30m10.761s
# user 5m25.490s
# sys 1m0.630s
cd /cluster/data/hg17/bed/geneSorter
# Create table that maps between known genes and RefSeq
hgMapToGene hg17 refGene knownGene knownToRefSeq
# may need to build this command in src/hg/near/hgMapToGene
# hgsql -e "select count(*) from knownToRefSeq;" hg17
# row count changed from 36078 in Hg16 to 36082
# Create table that maps between known genes and LocusLink
hgsql --skip-column-names -e "select mrnaAcc,locusLinkId from refLink" hg17 \
	> refToLl.txt
hgMapToGene hg17 refGene knownGene knownToLocusLink -lookup=refToLl.txt
# hgsql -e "select count(*) from knownToLocusLink;" hg17
# row count went from 36078 in Hg16 to 36082
# Create table that maps between known genes and Pfam domains
hgMapViaSwissProt hg17 knownGene name proteinID Pfam knownToPfam
# hgsql -e "select count(*) from knownToPfam;" hg17
# row count dropped from 30467 in Hg16 to 29725
# Create table to map between known genes and GNF Atlas2
# expression data.
hgMapToGene hg17 gnfAtlas2 knownGene knownToGnfAtlas2 '-type=bed 12'
# hgsql -e "select count(*) from knownToGnfAtlas2;" hg17
# row count dropped from 35817 in Hg16 to 35739
# Create expression distance table - takes about an hour
hgExpDistance hg17 hgFixed.gnfHumanAtlas2MedianRatio \
	hgFixed.gnfHumanAtlas2MedianExps gnfAtlas2Distance \
	-lookup=knownToGnfAtlas2
# Got 35739 unique elements in hgFixed.gnfHumanAtlas2MedianRatio
# hgsql -e "select count(*) from gnfAtlas2Distance;" hg17
# row count went from 35,817,000 in Hg16 to 35,739,000
# real 108m1.671s
# user 89m30.680s
# sys 3m6.800s
# Create a table that maps between known genes and
# the nice affy expression data.
hgMapToGene "-type=bed 12" hg17 affyUclaNorm knownGene knownToU133
# hgsql -e "select count(*) from knownToU133;" hg17
# row count went from 37,634 in Hg16 to 36,795
# Create expression distance table.  This will take about 2.5 hours
cd /tmp
cp -p ~/kent/src/hg/near/hgExpDistance/affyUcla.weight .
time hgExpDistance hg17 affyUclaNorm affyUclaExp knownExpDistance \
	-weights=affyUcla.weight -lookup=knownToU133
# 211 genes, 42 weights, 26.500000 total wieght
# Got 36795 unique elements in affyUclaNorm
# real 154m1.058s
# user 134m45.000s
# sys 3m1.990s
# Create table that maps between known genes and
# the GNF data.
cd /tmp
hgMapToGene hg17 affyU95 knownGene knownToU95
# row count went from 18780 in Hg16 to 18796
# the hgFixed.gnfHumanU95Exps argument is unused; it does not need to exist
hgExpDistance hg17 hgFixed.gnfHumanU95MedianRatio \
	hgFixed.gnfHumanU95Exps gnfU95Distance -lookup=knownToU95
# row count went from 17711000 in Hg16 to 17710000
# real 21m37.703s
# user 13m35.110s
# sys 0m28.470s
# Create known gene mapping table and expression distance tables
# for GNF Atlas 2.  (The hgExpDistance takes only 10 minutes.)
hgMapToGene hg17 affyGnf1h knownGene knownToGnf1h
hgExpDistance hg17 hgFixed.gnfHumanAtlas2MedianRatio \
	hgFixed.gnfHumanAtlas2MedianExps gnfAtlas2Distance \
	-lookup=knownToGnf1h
# Have 44696 elements in hgFixed.gnfHumanAtlas2MedianRatio
# Got 9756 unique elements in hgFixed.gnfHumanAtlas2MedianRatio
# create table mapping knownGenes to affyU133Plus2 table (2004-10-14, hartera)
cd /cluster/data/hg17/bed/geneSorter
hgMapToGene hg17 affyU133Plus2 knownGene knownToU133Plus2
# Make sure that GO database is up to date.
See README in /cluster/store1/geneOntology. # I update this GO database very carefully, checking that all # structures in it remain the same from release to release and # backing up the current go DB in a backup database. In this case # the backup is go040107 - when it was loaded for Mm4, and the new # go database is based on data from Dec 17th 2003 and Feb 2004 according # to the time stamp on the fetched data. This build was done in # /cluster/store1/geneOntology/20040217 cd /cluster/data/hg17/bed/geneSorter XXX - DO NOT YET HAVE ensGene table - must wait on Ensembl to release that XXX - have not created the knownToEnsembl table yet - 2004-07-15 - Hiram # Create knownToEnsembl column hgMapToGene hg17 ensGene knownGene knownToEnsembl # table row count went from previous version: 36068 to 38251 # Make knownToCdsSnp table (DONE Nov 11, 2004, Heather) ssh hgwdev nice hgMapToGene hg17 snp knownGene knownToCdsSnp -all -cds # row count 165728 # unique 34013 # approx. 5 minutes running time # Make C. elegans ortholog column using blastp on wormpep. # First make C. elegans protein database and copy it to cluster/bluearc # if it doesn't exist already # This is already done, see makeMm3.doc for procedure # the directory: /cluster/bluearc/ce1/blastp should have data # The blast jobs below can be run on the kk or kk9 clusters # Create the ceBlastTab ssh kk9 mkdir /cluster/data/hg17/bed/geneSorter/blastp/ce1 cd /cluster/data/hg17/bed/geneSorter/blastp/ce1 mkdir run cd run mkdir out # Make blast script cat << '_EOF_' > blastSome #!/bin/sh BLASTMAT=/cluster/bluearc/blast229/data /cluster/bluearc/blast229/blastall \ -p blastp -d /cluster/bluearc/ce1/blastp/wormPep \ -i $1 -o $2 -e 0.01 -m 8 -b 1 '_EOF_' # << keep emacs happy chmod a+x blastSome # Make gensub2 file cat << '_EOF_' > gsub #LOOP blastSome {check in line+ $(path1)} {check out line out/$(root1).tab} #ENDLOOP '_EOF_' # << keep emacs happy # Create parasol batch echo ../../split/*.fa | wordLine stdin > split.lst gensub2 split.lst single gsub jobList para create jobList para try para check para push ... etc ... # Only takes 10 minutes on an idle cluster # Completed: 7749 of 7749 jobs # CPU time in finished jobs: 32023s 533.72m 8.90h 0.37d 0.001 y # IO & Wait Time: 20643s 344.05m 5.73h 0.24d 0.001 y # Average job time: 7s 0.11m 0.00h 0.00d # Longest job: 110s 1.83m 0.03h 0.00d # Submission to last job: 1911s 31.85m 0.53h 0.02d # Load into database. ssh hgwdev cd /cluster/data/hg17/bed/geneSorter/blastp/ce1/run/out hgLoadBlastTab hg17 ceBlastTab -maxPer=1 *.tab # row count went from 27620 to 27616 # Make mouse ortholog column using blastp on mouse known genes. # First make mouse protein database and copy it to cluster/bluearc # if it doesn't exist already # This already exists. 
See makeMm5.doc for procedure # the directory: /cluster/bluearc/scratch/mus/mm5/blastp should have data # Make parasol run directory ssh kk mkdir /cluster/data/hg17/bed/geneSorter/blastp/mm5 cd /cluster/data/hg17/bed/geneSorter/blastp/mm5 mkdir run cd run mkdir out # Make blast script cat << '_EOF_' > blastSome #!/bin/sh BLASTMAT=/cluster/bluearc/blast229/data /cluster/bluearc/blast229/blastall \ -p blastp -d /cluster/bluearc/scratch/mus/mm5/blastp/known \ -i $1 -o $2 -e 0.001 -m 8 -b 1 '_EOF_' # << keep emacs happy chmod a+x blastSome # Make gensub2 file cat << '_EOF_' > gsub #LOOP blastSome {check in line+ $(path1)} {check out line out/$(root1).tab} #ENDLOOP '_EOF_' # << keep emacs happy # Create parasol batch # this echo trick is used because otherwise the command line is # too long and you can not do a simple ls echo ../../split/*.fa | wordLine stdin > split.lst gensub2 split.lst single gsub jobList para create jobList para try para check para push ... etc ... # Completed: 7749 of 7749 jobs # CPU time in finished jobs: 139041s 2317.34m 38.62h 1.61d 0.004 y # IO & Wait Time: 21227s 353.79m 5.90h 0.25d 0.001 y # Average job time: 21s 0.34m 0.01h 0.00d # Longest job: 260s 4.33m 0.07h 0.00d # Submission to last job: 1137s 18.95m 0.32h 0.01d # Load into database. ssh hgwdev cd /cluster/data/hg17/bed/geneSorter/blastp/mm5/run/out hgLoadBlastTab hg17 mmBlastTab -maxPer=1 *.tab # Scanning through 7748 files # row count went from 36471 to 36638 # Make rat ortholog column using blastp on rat known genes. # First make rat protein database and copy it to cluster/bluearc # if it doesn't exist already # This already exists. See makeRn3.doc for procedure. # Files were put in this directory: /cluster/bluearc/rn3/blastp/ # Make parasol run directory ssh kk mkdir /cluster/data/hg17/bed/geneSorter/blastp/rn3 cd /cluster/data/hg17/bed/geneSorter/blastp/rn3 mkdir run cd run mkdir out # Make blast script cat << '_EOF_' > blastSome #!/bin/sh BLASTMAT=/cluster/bluearc/blast229/data /cluster/bluearc/blast229/blastall \ -p blastp -d /cluster/bluearc/rn3/blastp/known \ -i $1 -o $2 -e 0.001 -m 8 -b 1 '_EOF_' # << keep emacs happy chmod a+x blastSome # Make gensub2 file cat << '_EOF_' > gsub #LOOP blastSome {check in line+ $(path1)} {check out line out/$(root1).tab} #ENDLOOP '_EOF_' # << keep emacs happy # Create parasol batch # this echo trick is used because otherwise the command line is # too long and you can not do a simple ls echo ../../split/*.fa | wordLine stdin > split.lst gensub2 split.lst single gsub jobList para create jobList para try para check para push ... etc ... #Completed: 7749 of 7749 jobs #CPU time in finished jobs: 31035s 517.25m 8.62h 0.36d 0.001 y #IO & Wait Time: 38472s 641.20m 10.69h 0.45d 0.001 y #Average job time: 9s 0.15m 0.00h 0.00d #Longest job: 75s 1.25m 0.02h 0.00d #Submission to last job: 169s 2.82m 0.05h 0.00d # Load into database. ssh hgwdev cd /cluster/data/hg17/bed/geneSorter/blastp/rn3/run/out hgLoadBlastTab hg17 rnBlastTab -maxPer=1 *.tab # Scanning through 7749 files #Loading database with 25574 rows # Make Danio rerio (zebrafish) ortholog column using blastp on Ensembl. 
# First make protein database and copy it to cluster/bluearc
# if it doesn't exist already
# This is already done, see makeMm3.doc for procedure
# the directory: /cluster/bluearc/dr1/blastp should have data
# Make parasol run directory
ssh kk
mkdir /cluster/data/hg17/bed/geneSorter/blastp/dr1
cd /cluster/data/hg17/bed/geneSorter/blastp/dr1
mkdir run
cd run
mkdir out
# Make blast script
cat << '_EOF_' > blastSome
#!/bin/sh
BLASTMAT=/cluster/bluearc/blast229/data /cluster/bluearc/blast229/blastall \
	-p blastp -d /cluster/bluearc/dr1/blastp/ensembl \
	-i $1 -o $2 -e 0.005 -m 8 -b 1
'_EOF_'
# << keep emacs happy
chmod a+x blastSome
# Make gensub2 file
cat << '_EOF_' > gsub
#LOOP
blastSome {check in line+ $(path1)} {check out line out/$(root1).tab}
#ENDLOOP
'_EOF_'
# << keep emacs happy
# Create parasol batch
echo ../../split/*.fa | wordLine stdin > split.lst
gensub2 split.lst single gsub jobList
para create jobList
para try
para check
para push
... etc ...
# Completed: 7749 of 7749 jobs
# CPU time in finished jobs: 100217s 1670.28m 27.84h 1.16d 0.003 y
# IO & Wait Time: 23697s 394.95m 6.58h 0.27d 0.001 y
# Average job time: 16s 0.27m 0.00h 0.00d
# Longest job: 233s 3.88m 0.06h 0.00d
# Submission to last job: 1667s 27.78m 0.46h 0.02d
# Load into database.
ssh hgwdev
cd /cluster/data/hg17/bed/geneSorter/blastp/dr1/run/out
hgLoadBlastTab hg17 drBlastTab -maxPer=1 *.tab
# row count went from 32971 to 33023

# Make Saccharomyces cerevisiae (yeast) ortholog column using blastp on RefSeq.
# First make protein database and copy it to cluster/bluearc
# if it doesn't exist already
# This is already done, see makeMm3.doc for procedure
# the directory: /cluster/bluearc/sc1/blastp should have data
# Make parasol run directory
ssh kk
mkdir /cluster/data/hg17/bed/geneSorter/blastp/sc1
cd /cluster/data/hg17/bed/geneSorter/blastp/sc1
mkdir run
cd run
mkdir out
# Make blast script
cat << '_EOF_' > blastSome
#!/bin/sh
BLASTMAT=/cluster/bluearc/blast229/data /cluster/bluearc/blast229/blastall \
	-p blastp -d /cluster/bluearc/sc1/blastp/sgd \
	-i $1 -o $2 -e 0.01 -m 8 -b 1
'_EOF_'
# << keep emacs happy
chmod a+x blastSome
# Make gensub2 file
cat << '_EOF_' > gsub
#LOOP
blastSome {check in line+ $(path1)} {check out line out/$(root1).tab}
#ENDLOOP
'_EOF_'
# << keep emacs happy
# Create parasol batch
echo ../../split/*.fa | wordLine stdin > split.lst
gensub2 split.lst single gsub jobList
para create jobList
para try
para check
para push
... etc ...
# Completed: 7749 of 7749 jobs
# CPU time in finished jobs: 20738s 345.64m 5.76h 0.24d 0.001 y
# IO & Wait Time: 22018s 366.96m 6.12h 0.25d 0.001 y
# Average job time: 6s 0.09m 0.00h 0.00d
# Longest job: 39s 0.65m 0.01h 0.00d
# Submission to last job: 572s 9.53m 0.16h 0.01d
# Load into database.
ssh hgwdev
cd /cluster/data/hg17/bed/geneSorter/blastp/sc1/run/out
hgLoadBlastTab hg17 scBlastTab -maxPer=1 *.tab
# row count went from 18286 to 18265

# Make Drosophila melanogaster ortholog column using blastp on FlyBase.
# First make SwissProt protein database and copy it to cluster/bluearc # if it doesn't exist already # This is already done, see makeMm3.doc for procedure # the directory: /cluster/bluearc/dm1/blastp should have data # Make parasol run directory ssh kk mkdir /cluster/data/hg17/bed/geneSorter/blastp/dm1 cd /cluster/data/hg17/bed/geneSorter/blastp/dm1 mkdir run cd run mkdir out # Make blast script cat << '_EOF_' > blastSome #!/bin/sh BLASTMAT=/cluster/bluearc/blast229/data /cluster/bluearc/blast229/blastall \ -p blastp -d /cluster/bluearc/dm1/blastp/flyBase \ -i $1 -o $2 -e 0.01 -m 8 -b 1 '_EOF_' # << keep emacs happy chmod a+x blastSome # Make gensub2 file cat << '_EOF_' > gsub #LOOP blastSome {check in line+ $(path1)} {check out line out/$(root1).tab} #ENDLOOP '_EOF_' # << keep emacs happy # Create parasol batch echo ../../split/*.fa | wordLine stdin > split.lst gensub2 split.lst single gsub jobList para create jobList para try para check para push ... etc ... # Completed: 7749 of 7749 jobs # CPU time in finished jobs: 82022s 1367.03m 22.78h 0.95d 0.003 y # IO & Wait Time: 21982s 366.37m 6.11h 0.25d 0.001 y # Average job time: 13s 0.22m 0.00h 0.00d # Longest job: 174s 2.90m 0.05h 0.00d # Submission to last job: 1439s 23.98m 0.40h 0.02d # Load into database. ssh hgwdev cd /cluster/data/hg17/bed/geneSorter/blastp/dm1/run/out hgLoadBlastTab hg17 dmBlastTab -maxPer=1 *.tab # row count went from 29322 to 29341 #### Blat knownGene proteins to determine exons (braney 2004-06-20 DONE) ssh hgwdev cd /cluster/data/hg17/bed mkdir blat.hg17KG.2004-06-20 rm blat.hg17KG ln -s blat.hg17KG.2004-06-20 blat.hg17KG cd blat.hg17KG pepPredToFa hg17 knownGenePep known.fa hgPepPred hg17 generic blastKGPep00 known.fa grep ">" known.fa | sed "s/>//" > kgName.lst kgName hg17 kgName.lst blastKGRef00 hgsql hg17 < ~/kent/src/lib/hg/blastRef.sql echo "rename table blastRef to blastKGRef00" | hgsql hg17 echo "load data local infile 'blastKGRef00' into table blastKGRef00" | hgsql hg17 ssh kk cd /cluster/data/hg17/bed/blat.hg17KG cat << '_EOF_' > blatSome #!/bin/csh -fe /cluster/bin/i386/blat -t=dnax -q=prot -out=pslx $1 $2 $3 '_EOF_' # << keep emacs happy chmod +x blatSome ls -1S /scratch/hg/gs.18/build35/bothMaskedNibs/*.nib > human.lst mkdir kgfa cd kgfa faSplit sequence ../known.fa 3000 kg cd .. ls -1S kgfa/*.fa > kg.lst cat << '_EOF_' > blatGsub #LOOP blatSome $(path1) {check in line $(path2)} {check out line psl/$(root1)/$(root2).psl} #ENDLOOP '_EOF_' # << keep emacs happy gensub2 human.lst kg.lst blatGsub blatSpec mkdir psl cd psl foreach i (`cat ../human.lst`) mkdir `basename $i .nib` end cd .. para create blatSpec para push # Completed: 133676 of 133676 jobs # CPU time in finished jobs: 29661130s 494352.16m 8239.20h 343.30d 0.941 y # IO & Wait Time: 2181179s 36352.99m 605.88h 25.25d 0.069 y # Average job time: 238s 3.97m 0.07h 0.00d # Longest job: 105972s 1766.20m 29.44h 1.23d ssh eieio cd /cluster/data/hg17/bed/blat.hg17KG pslSort dirs raw.psl /tmp psl/* pslReps -nohead -minCover=0.9 -minAli=0.9 raw.psl cooked.psl /dev/null pslUniq cooked.psl hg17KG.psl pslxToFa hg17KG.psl hg17KG_ex.fa -liftTarget=genome.lft -liftQuery=protein.lft # BLASTZ MM4 (DONE - 2004-06-22 - Hiram) ssh kk mkdir -p /cluster/data/hg17/bed/blastz.mm4.2004-06-21 cd /cluster/data/hg17/bed ln -s blastz.mm4.2004-06-21 blastz.mm4 cd blastz.mm4 cat << '_EOF_' > DEF # human vs. 
# human vs. mouse
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/home/angie/schwartzbin:/cluster/home/kent/bin/i386
ALIGN=blastz-run
BLASTZ=blastz
BLASTZ_H=2000
BLASTZ_ABRIDGE_REPEATS=1
# TARGET
# Human
SEQ1_DIR=/iscratch/i/gs.18/build35/bothMaskedNibs
# not used
SEQ1_RMSK=
# not used
SEQ1_FLAG=
SEQ1_SMSK=/iscratch/i/gs.18/build35/linSpecRep.notInRat
SEQ1_IN_CONTIGS=0
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY
# Mouse
SEQ2_DIR=/scratch/mus/mm4/softNib
# RMSK not currently used
SEQ2_RMSK=/scratch/mus/mm4/rmsk
# FLAG not currently used
SEQ2_FLAG=-rodent
SEQ2_SMSK=/scratch/mus/mm4/linSpecRep.notInHuman
SEQ2_IN_CONTIGS=0
SEQ2_CHUNK=30000000
SEQ2_LAP=0
BASE=/cluster/data/hg17/bed/blastz.mm4
DEF=$BASE/DEF
RAW=$BASE/raw
CDBDIR=$BASE
SEQ1_LEN=$BASE/S1.len
SEQ2_LEN=$BASE/S2.len
'_EOF_'
# << this line keeps emacs coloring happy
# prepare first cluster run
ssh kk
cd /cluster/data/hg17/bed/blastz.mm4
/cluster/data/hg17/jkStuff/BlastZ_run0.sh
cd run.0
para try, check, push, check, ....
# Completed: 43648 of 43648 jobs
# CPU time in finished jobs: 16448001s 274133.36m 4568.89h 190.37d 0.522 y
# IO & Wait Time: 751666s 12527.76m 208.80h 8.70d 0.024 y
# Average job time: 394s 6.57m 0.11h 0.00d
# Longest job: 8323s 138.72m 2.31h 0.10d
# Submission to last job: 44244s 737.40m 12.29h 0.51d
# Second cluster run lifts the raw alignments into lav files; running
# it on the big cluster would bring the file server to its knees.
# Run this on the small cluster.
ssh kki
cd /cluster/data/hg17/bed/blastz.mm4
/cluster/data/hg17/jkStuff/BlastZ_run1.sh
cd run.1
para try, check, push, etc ...
# Completed: 341 of 341 jobs
# CPU time in finished jobs: 3925s 65.42m 1.09h 0.05d 0.000 y
# IO & Wait Time: 6208s 103.46m 1.72h 0.07d 0.000 y
# Average job time: 30s 0.50m 0.01h 0.00d
# Longest job: 289s 4.82m 0.08h 0.00d
# Submission to last job: 2800s 46.67m 0.78h 0.03d
# Third cluster run to convert lav's to axt's
# Does not work on kki since /scratch on the iservers is not the
# same as /scratch on the other clusters.
ssh kk
cd /cluster/data/hg17/bed/blastz.mm4
/cluster/data/hg17/jkStuff/BlastZ_run2.sh
cd run.2
para try, check, push, etc ...
# Completed: 45 of 46 jobs
# Crashed: 1 jobs
# CPU time in finished jobs: 2389s 39.82m 0.66h 0.03d 0.000 y
# IO & Wait Time: 13374s 222.90m 3.71h 0.15d 0.000 y
# Average job time: 350s 5.84m 0.10h 0.00d
# Longest job: 1426s 23.77m 0.40h 0.02d
# Submission to last job: 1440s 24.00m 0.40h 0.02d
# chr19 failed by running out of memory. Run this job individually
# on kolossus, adjusting the location of the nib directories:
ssh kolossus
cd /cluster/data/hg17/bed/blastz.mm4
sed -e "s/i386/x86_64/g" /cluster/bin/scripts/blastz-chromlav2axt > \
    x86_64-chromlav2axt
chmod +x x86_64-chromlav2axt
time ./x86_64-chromlav2axt \
    /cluster/data/hg17/bed/blastz.mm4/lav/chr19 \
    /cluster/data/hg17/bed/blastz.mm4/axtChrom/chr19.axt \
    /cluster/bluearc/scratch/hg/gs.18/build35/bothMaskedNibs \
    /cluster/bluearc/scratch/mus/mm4/softNib
# real 24m28.955s
# user 6m40.990s
# sys 1m16.500s
# translate sorted axt files into psl
ssh eieio
cd /cluster/data/hg17/bed/blastz.mm4
mkdir -p pslChrom
set tbl = "blastzMm4"
foreach f (axtChrom/chr*.axt)
  set c=$f:t:r
  echo "Processing chr $c"
  /cluster/bin/i386/axtToPsl $f S1.len S2.len pslChrom/${c}_${tbl}.psl
end
# This takes more than an hour. You can shorten it by changing that
# command to a simple echo, putting the results into a file, splitting
# the file into four parts, and running the four files as shell scripts
# on eieio to have four processes running at the same time (see the
# sketch just below). Load on eieio gets up to about 20, which is
# reasonable.
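# (A sketch of that four-way split -- illustrative only, it was not run
# for this build; the psl.jobs/pslJob.* names are made up:)
rm -f psl.jobs
foreach f (axtChrom/chr*.axt)
  set c=$f:t:r
  echo "/cluster/bin/i386/axtToPsl $f S1.len S2.len pslChrom/${c}_blastzMm4.psl" >> psl.jobs
end
split -l 12 psl.jobs pslJob.
# 46 commands -> pslJob.aa .. pslJob.ad; run the chunks in parallel:
foreach j (pslJob.a?)
  csh $j >& $j.log &
end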
# Load database tables
ssh hgwdev
cd /cluster/data/hg17/bed/blastz.mm4/pslChrom
bash
for F in chr*_blastzMm4.psl
do
  /cluster/bin/i386/hgLoadPsl -noTNameIx hg17 ${F}
  echo "${F} done"
done
# this is a 55 minute job
# exit bash if you are a [t]csh user
# featureBits on blastzMm3 or 4 will not work on hgwdev; it runs out of
# memory. But if you reset your ~/.hg.conf to use the read-only user
# and to contact the hgwdev database host, you can then use the x86_64
# featureBits:
# featureBits hg16 blastzMm4
# 1056761609 bases of 2865248791 (36.882%) in intersection
# featureBits hg17 blastzMm4
# 1056201417 bases of 2866216770 (36.850%) in intersection

# CHAIN MM4 BLASTZ (DONE - 2004-06-29 - Hiram)
# redone with the 'axtFilter -notQ_random' removed - 2004-06-23
# The axtChain is best run on the small kluster, or the kk9 kluster
ssh kk9
mkdir -p /cluster/data/hg17/bed/blastz.mm4/axtChain/run1
cd /cluster/data/hg17/bed/blastz.mm4/axtChain/run1
mkdir out chain
ls -1S /cluster/data/hg17/bed/blastz.mm4/axtChrom/*.axt > input.lst
cat << '_EOF_' > gsub
#LOOP
doChain {check in exists $(path1)} {check out line+ chain/$(root1).chain} out/$(root1).out
#ENDLOOP
'_EOF_'
# << this line makes emacs coloring happy
# May need -minScore=5000 for all chroms if chr19 won't finish on kolossus
cat << '_EOF_' > doChain
#!/bin/csh
axtChain $1 /iscratch/i/gs.18/build35/bothMaskedNibs \
    /iscratch/i/mm4/softNib $2 > $3
'_EOF_'
# << this line makes emacs coloring happy
chmod a+x doChain
# 46 jobs
gensub2 input.lst single gsub jobList
para create jobList
para try
para push
# ... etc ...
# Completed: 45 of 46 jobs
# CPU time in finished jobs: 6575s 109.58m 1.83h 0.08d 0.000 y
# IO & Wait Time: 9274s 154.57m 2.58h 0.11d 0.000 y
# Average job time: 352s 5.87m 0.10h 0.00d
# Longest job: 3121s 52.02m 0.87h 0.04d
# Submission to last job: 3121s 52.02m 0.87h 0.04d
# one job wouldn't finish due to memory usage
# run the chr19 job on kolossus, takes an hour, gets up to 4 Gb
# memory usage
# now on the file server, sort chains
ssh eieio
cd /cluster/data/hg17/bed/blastz.mm4/axtChain
time chainMergeSort run1/chain/*.chain > all.chain
# real 17m17.639s
# user 9m54.240s
# sys 1m31.210s
# (1.9 Gb result file !)
time chainSplit chain all.chain
# real 27m32.278s
# user 9m46.970s
# sys 2m45.960s
# optionally: rm run1/chain/*.chain
# Load chains into database
# next machine
ssh hgwdev
cd /cluster/data/hg17/bed/blastz.mm4/axtChain/chain
foreach i (*.chain)
  set c = $i:r
  hgLoadChain hg17 ${c}_chainMm4 $i
  echo done $c
end
# featureBits hg17 chainMm4
# 2829135227 bases of 2866216770 (98.706%) in intersection
# featureBits hg16 chainMm4
# 2828363353 bases of 2865248791 (98.713%) in intersection

# NET MM4 (DONE - 2004-06-29 - Hiram)
ssh eieio
cd /cluster/data/hg17/bed/blastz.mm4/axtChain
mkdir preNet
cd chain
foreach i (*.chain)
  echo preNetting $i
  /cluster/bin/i386/chainPreNet $i /cluster/data/hg17/chrom.sizes \
      /cluster/data/mm4/chrom.sizes ../preNet/$i
end
cd ..
mkdir n1
cd preNet
foreach i (*.chain)
  set n = $i:r.net
  echo primary netting $i
  /cluster/bin/i386/chainNet $i -minSpace=1 /cluster/data/hg17/chrom.sizes \
      /cluster/data/mm4/chrom.sizes ../n1/$n /dev/null
end
cd ..
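# (Aside: the per-chrom chainPreNet/chainNet loops above, together with
# the netSyntenic step just below, can be collapsed into one pipeline
# over all.chain -- the form used for the chicken net later in this doc.
# An equivalent sketch, not run here:)
# chainPreNet all.chain /cluster/data/hg17/chrom.sizes \
#     /cluster/data/mm4/chrom.sizes stdout \
#   | chainNet stdin -minSpace=1 /cluster/data/hg17/chrom.sizes \
#       /cluster/data/mm4/chrom.sizes stdout /dev/null \
#   | netSyntenic stdin hNoClass.net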
cat n1/*.net | /cluster/bin/i386/netSyntenic stdin hNoClass.net # memory usage 2504171520, utime 19373 s/100, stime 5906 ssh hgwdev cd /cluster/data/hg17/bed/blastz.mm4/axtChain time netClass hNoClass.net hg17 mm4 mouse.net \ -tNewR=/cluster/bluearc/scratch/hg/gs.18/build35/linSpecRep.notInMouse \ -qNewR=/cluster/bluearc/scratch/mus/mm4/linSpecRep.notInHuman # real 19m33.421s # user 10m37.130s # sys 1m45.630s # If things look good do ssh eieio cd /cluster/data/hg17/bed/blastz.mm4/axtChain rm -r n1 hNoClass.net # Make a 'syntenic' subset of these with time netFilter -syn mouse.net > mouseSyn.net # real 13m24.885s # user 7m37.100s # sys 1m5.760s # Load the nets into database ssh hgwdev cd /cluster/data/hg17/bed/blastz.mm4/axtChain netFilter -minGap=10 mouse.net | hgLoadNet hg17 netMm4 stdin netFilter -minGap=10 mouseSyn.net | hgLoadNet hg17 syntenyNetMm4 stdin # real 44m20.735s # user 15m58.620s # sys 1m58.720s # check results # featureBits hg17 netMm4 # 2824272033 bases of 2866216770 (98.537%) in intersection # featureBits hg16 netMm4 # 2823565051 bases of 2865248791 (98.545%) in intersection # featureBits hg17 syntenyNetMm4 # 2785830955 bases of 2866216770 (97.195%) in intersection # featureBits hg16 syntenyNetMm4 # 2786960572 bases of 2865248791 (97.268%) in intersection # Add entries for net and chain to mouse/hg17 trackDb # make net ssh eieio cd /cluster/data/hg17/bed/blastz.mm4/axtChain mkdir mouseNet time netSplit mouse.net mouseNet # real 12m1.478s # user 8m35.050s # sys 1m7.230s # extract axt's from net, and convert to maf's (DONE - Kate - 2004-06-24) ssh eieio cd /cluster/data/hg17/bed/blastz.mm4/axtChain mkdir ../axtNet ../mafNet cat > makeMaf.csh << '_EOF_' foreach f (mouseNet/chr*.net) set c = $f:t:r echo "netToAxt: $c.net -> $c.axt" rm -f ../axtNet/$c.axt netToAxt mouseNet/$c.net chain/$c.chain \ /cluster/data/hg17/nib /cluster/data/mm4/nib stdout | \ axtSort stdin ../axtNet/$c.axt axtToMaf ../axtNet/$c.axt \ /cluster/data/hg17/chrom.sizes /cluster/data/mm4/chrom.sizes \ ../mafNet/$c.maf -tPrefix=hg17. -qPrefix=mm4. echo "Complete: $c.net -> axtNet/$c.axt -> mafNet/$c.maf" end '_EOF_' # << for emacs csh makeMaf.csh >&! makeMaf.log & tail -100f makeMaf.log ssh hgwdev mkdir -p /cluster/data/hg17/bed/blastz.mm4/axtBest cd /cluster/data/hg17/bed/blastz.mm4/axtBest ln -s ../axtNet/chr*.axt . # copy net axt's to download area ssh hgwdev cd /cluster/data/hg17/bed/blastz.mm4/axtNet mkdir -p /usr/local/apache/htdocs/goldenPath/hg17/vsMm4/axtNet cp -p *.axt /usr/local/apache/htdocs/goldenPath/hg17/vsMm4/axtNet cd /usr/local/apache/htdocs/goldenPath/hg17/vsMm4/axtNet gzip *.axt # add README.txt file to dir (use previous assembly's copy as template) # Convert those axt files to psl ssh eieio cd /cluster/data/hg17/bed/blastz.mm4 mkdir pslBest foreach a (axtBest/chr*.axt) set c=$a:t:r echo "processing $c.axt -> ${c}_blastzBestMm4.psl" /cluster/bin/i386/axtToPsl axtBest/${c}.axt \ S1.len S2.len pslBest/${c}_blastzBestMm4.psl echo "Done: ${c}_blastzBestMm4.psl" end # Load tables ssh hgwdev cd /cluster/data/hg17/bed/blastz.mm4/pslBest for I in chr*BestMm4.psl do /cluster/bin/i386/hgLoadPsl -noTNameIx hg17 ${I} echo "done ${I}" done # check results # featureBits hg17 blastzBestMm4 # 1017319919 bases of 2866216770 (35.493%) in intersection # featureBits hg16 blastzBestMm4 # 996722004 bases of 2865248791 (34.787%) in intersection # Make /gbdb links and add them to the axtInfo table: mkdir -p /gbdb/hg17/axtBest/Mm4 cd /gbdb/hg17/axtBest/Mm4 ln -s /cluster/data/hg17/bed/blastz.mm4/axtNet/chr*.axt . 
cd /cluster/data/hg17/bed/blastz.mm4/axtNet
rm -f axtInfoInserts.sql
foreach f (/gbdb/hg17/axtBest/Mm4/chr*.axt)
  set chr=$f:t:r
  echo "INSERT INTO axtInfo (species, alignment, chrom, fileName) \
      VALUES ('mm4','Blastz Best in Genome','$chr','$f');" \
    >> axtInfoInserts.sql
end
hgsql hg17 < ~/kent/src/hg/lib/axtInfo.sql
# table axtInfo may already exist, ignore create error.
hgsql hg17 < axtInfoInserts.sql

# MAKING MOUSE SYNTENY (DONE - 2004-06-29 - Hiram)
ssh hgwdev
mkdir /cluster/data/hg17/bed/syntenyMm4
cd /cluster/data/hg17/bed/syntenyMm4
# Copy all the needed scripts from /cluster/data/hg17/bed/syntenyRn3
# (they originally came from the hg16 syntenyMm3 work)
cp -p /cluster/data/hg17/bed/syntenyRn3/*.pl .
./syntenicBest.pl -db=hg17 -table=blastzBestMm4
./smooth.pl
./joinsmallgaps.pl
./fillgap.pl -db=hg17 -table=blastzBestMm4
./synteny2bed.pl
# The five commands above
# real 220m16.227s
# user 0m22.940s
# sys 0m3.960s
# Used to load this in syntenyMm4, but that type is misleading to
# the table browser and fails the checkTableCoords check.
# Better to use this ensRatMusHom type:
# Need a new name here for the Mm4 to not conflict with Rn3
sed -e 's/ensPhusionBlast/ensRatMm4Hom/g' \
    $HOME/kent/src/hg/lib/ensPhusionBlast.sql \
    > ensRatMm4Hom.sql
hgLoadBed hg17 ensRatMm4Hom ucsc100k.bed -sqlTable=ensRatMm4Hom.sql
# featureBits hg17 ensRatMm4Hom
# 2549307611 bases of 2866216770 (88.943%) in intersection
# featureBits hg16 syntenyMm4
# 2560252977 bases of 2865248791 (89.355%) in intersection

# MAKING MOUSE AXTTIGHT FROM AXTBEST (DONE - 2004-06-29 - Hiram)
# After creating axtBest alignments above, use subsetAxt to get axtTight:
ssh eieio
cd /cluster/data/hg17/bed/blastz.mm4/axtNet
mkdir -p ../axtTight
foreach i (*.axt)
  echo $i
  subsetAxt $i ../axtTight/$i \
      ~kent/src/hg/mouseStuff/subsetAxt/coding.mat 3400
end
# translate to psl
cd ../axtTight
mkdir ../pslTight
foreach i (*.axt)
  set c = $i:r
  axtToPsl $i ../S1.len ../S2.len ../pslTight/${c}_blastzTightMm4.psl
  echo "Done: $i"
end
# Load tables into database
ssh hgwdev
cd /cluster/data/hg17/bed/blastz.mm4/pslTight
for I in chr*TightMm4.psl
do
  /cluster/bin/i386/hgLoadPsl -noTNameIx hg17 ${I}
  echo "done ${I}"
done
# Compare results with previous assembly:
# featureBits hg17 blastzTightMm4
# 166569246 bases of 2866216770 (5.811%) in intersection
# featureBits hg16 blastzTightMm4
# 162641577 bases of 2865248791 (5.676%) in intersection
# copy axt's to download area
ssh hgwdev
cd /cluster/data/hg17/bed/blastz.mm4/axtTight
mkdir -p /usr/local/apache/htdocs/goldenPath/hg17/vsMm4/axtTight
cp -p *.axt /usr/local/apache/htdocs/goldenPath/hg17/vsMm4/axtTight
cd /usr/local/apache/htdocs/goldenPath/hg17/vsMm4/axtTight
gzip *.axt
# add README.txt file to dir (use previous assembly's copy as template)

# BLASTZ MM4 CLEAN UP (DONE - 2004-07-02 - Hiram)
ssh eieio
cd /cluster/data/hg17/bed/blastz.mm4
nice rm -rf raw &
nice rm axtChain/run1/chain/* &
nice rm -fr axtChain/n1 axtChain/hNoClass.net &
nice gzip axtChrom/* pslChrom/* lav/*/* axtChain/all.chain axtChain/*.net &

# BLASTZ CHIMP panTro1 (DONE 2004-06-22 kate)
# NOTE: Ran with abridge repeats=0, although SMSK was set
# Looked better than running with abridge=1, which had very
# chopped-up alignments
ssh kk
cd /cluster/data/hg17/bed
mkdir -p blastz.panTro1.2004-06-22
rm -f blastz.panTro1
ln -s blastz.panTro1.2004-06-22 blastz.panTro1
cd blastz.panTro1.2004-06-22
cat << 'EOF' > DEF
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/i386:/cluster/home/angie/schwartzbin
ALIGN=blastz-run
BLASTZ=blastz
BLASTZ_H=2000
BLASTZ_ABRIDGE_REPEATS=0
# Specific settings for chimp
BLASTZ_Y=3400
BLASTZ_T=2
BLASTZ_K=4500
BLASTZ_Q=/cluster/data/penn/human_chimp.q
# TARGET: Human
SEQ1_DIR=/scratch/hg/gs.18/build35/bothMaskedNibs
# not used
SEQ1_RMSK=
# not used
SEQ1_FLAG=
SEQ1_SMSK=/iscratch/i/hg17/linSpecRep.chimp
SEQ1_IN_CONTIGS=0
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Chimp
SEQ2_DIR=/scratch/chimp/panTro1/nib
# not currently used
SEQ2_RMSK=/iscratch/i/chimp/panTro1/linSpecRep.human
# not currently used
SEQ2_FLAG=
SEQ2_SMSK=
SEQ2_IN_CONTIGS=0
SEQ2_CHUNK=10000000
SEQ2_LAP=0
BASE=/cluster/data/hg17/bed/blastz.panTro1.2004-06-22
DEF=$BASE/DEF
RAW=$BASE/raw
CDBDIR=$BASE
SEQ1_LEN=$BASE/S1.len
SEQ2_LEN=$BASE/S2.len
'EOF'
# << this line keeps emacs coloring happy
# first cluster run: raw blastz alignments
ssh kk
bash
# if a csh/tcsh user
cd /cluster/data/hg17/bed/blastz.panTro1.2004-06-22
source DEF
mkdir $RAW run.0
/cluster/home/angie/hummus/make-joblist $DEF > $BASE/run.0/j
sh ./xdir.sh
cd run.0
sed -e 's@^blastz-run@/cluster/bin/penn/blastz-run@' j > jobList
para create jobList
# 160270 jobs written to batch
para try, check, push, check, ....
#CPU time in finished jobs: 2399227s 39987.11m 666.45h 27.77d 0.076 y
#IO & Wait Time: 503100s 8385.00m 139.75h 5.82d 0.016 y
#Average job time: 18s 0.30m 0.01h 0.00d
#Longest running job: 0s 0.00m 0.00h 0.00d
#Longest finished job: 2073s 34.55m 0.58h 0.02d
#Submission to last job: 10843s 180.72m 3.01h 0.13d
# second cluster run: lift raw alignments -> lav dir
ssh kki
bash
# if a csh/tcsh user
cd /cluster/data/hg17/bed/blastz.panTro1.2004-06-22
source DEF
mkdir run.1 lav
/cluster/bin/scripts/blastz-make-out2lav $DEF $BASE > $BASE/run.1/jobList
cd run.1
wc -l jobList
para create jobList
# 341 jobs
para try, check, push, etc ...
# CPU time in finished jobs: 3458s 57.63m 0.96h 0.04d 0.000 y
# IO & Wait Time: 57996s 966.60m 16.11h 0.67d 0.002 y
# Average job time: 180s 3.00m 0.05h 0.00d
# Longest job: 483s 8.05m 0.13h 0.01d
# Submission to last job: 1498s 24.97m 0.42h 0.02d
# third run: lav -> axt -> psl
ssh kki
cd /cluster/data/hg17/bed/blastz.panTro1.2004-06-22
mkdir axtChrom pslChrom run.2
cd run.2
cat << '_EOF_' > do.csh
#!/bin/csh -ef
cd $1
set chr = $1:t
cat `ls -1 *.lav | sort -g` \
  | /cluster/bin/x86_64/lavToAxt stdin \
      /iscratch/i/hg17/bothMaskedNibs /iscratch/i/chimp/panTro1/nib stdout \
  | /cluster/bin/x86_64/axtSort stdin ../../axtChrom/$chr.axt
/cluster/bin/x86_64/axtToPsl ../../axtChrom/$chr.axt ../../S1.len ../../S2.len \
    ../../pslChrom/$chr.psl
'_EOF_'
# << this line keeps emacs coloring happy
chmod a+x do.csh
cp /dev/null jobList
for d in ../lav/chr*; do
  echo "do.csh $d" >> jobList
done
para create jobList
# 46 jobs
para try, check, push, check
#Completed: 42 of 42 jobs
#Average job time: 38s 0.64m 0.01h 0.00d
#Longest job: 147s 2.45m 0.04h 0.00d
#Submission to last job: 147s 2.45m 0.04h 0.00d
# Load database tables (takes an hour or so)
ssh hgwdev
cd /cluster/data/hg17/bed/blastz.panTro1.2004-06-22/pslChrom
cat > load.csh << 'EOF'
foreach f (chr*.psl)
  set table = $f:r_blastzPanTro1
  echo "loading ${table}"
  /cluster/bin/i386/hgLoadPsl -noTNameIx hg17 -table=${table} $f
end
'EOF'
# << for emacs
csh load.csh >&! load.log &
tail -100f load.log

# CHAIN CHIMP BLASTZ (6/23/04 kate)
# Run axtChain on little cluster
# first copy input to bluearc, as eieio bogs down if even the
# mini-cluster gets input from it !?
ssh eieio
cd /cluster/data/hg17/bed/blastz.panTro1.2004-06-22
cp -rp axtChrom /cluster/bluearc/hg17/blastz.panTro1.2004-06-22/axtChrom
ssh kki
cd /cluster/data/hg17/bed/blastz.panTro1.2004-06-22
mkdir -p axtChain/run1
cd axtChain/run1
mkdir out chain
ls -1S /cluster/bluearc/hg17/blastz.panTro1.2004-06-22/axtChrom/*.axt \
    > input.lst
cat << '_EOF_' > gsub
#LOOP
doChain {check in exists $(path1)} {check out line+ chain/$(root1).chain} {check out line+ out/$(root1).out}
#ENDLOOP
'_EOF_'
# << this line makes emacs coloring happy
cat << '_EOF_' > doChain
#!/bin/csh -fe
set c = $1:r:t
axtChain $1 -scoreScheme=/cluster/data/blastz/human_chimp.q \
    /iscratch/i/hg17/bothMaskedNibs \
    /iscratch/i/chimp/panTro1/nib /tmp/$c.chain.$$ > /tmp/$c.out.$$
set ret = $status
mv -f /tmp/$c.chain.$$ $2
mv -f /tmp/$c.out.$$ $3
exit $ret
'_EOF_'
# << this line makes emacs coloring happy
chmod a+x doChain
gensub2 input.lst single gsub jobList
para create jobList
para try, check, push, check...
# TODO rm -fr /cluster/bluearc/hg17/blastz.panTro1.2004-06-22/axtChrom
echo "remove after 7/1/04" > /cluster/bluearc/hg17/blastz.panTro1.2004-06-22/axtChrom/README
# now on the cluster server, sort chains
ssh eieio
cd /cluster/data/hg17/bed/blastz.panTro1.2004-06-22/axtChain
chainMergeSort run1/chain/*.chain > all.chain
chainSplit chain all.chain
# TODO rm run1/chain/*.chain
echo "remove after 7/1/04" > run1/chain/README
# Load chains into database
ssh hgwdev
cd /cluster/data/hg17/bed/blastz.panTro1.2004-06-22/axtChain/chain
foreach i (*.chain)
  set c = $i:r
  echo loading $c
  hgLoadChain hg17 ${c}_chainPanTro1 $i
end
# TODO featureBits hg16 chainPanTro1Link
#2627280557 bases of 2865248791 (91.695%) in intersection
featureBits hg17 chainPanTro1Link
# 2633869032 bases of 2866216770 (91.894%) in intersection

# NET CHIMP (DONE 2004-6-24 kate)
# Redone to make chimp.net on 2004-10-11 kate (other files have
# new times, but are the same as 6-24 versions)
ssh kolossus
cd /cluster/data/hg17/bed/blastz.panTro1.2004-06-22/axtChain
#chainPreNet all.chain ../S1.len ../S2.len stdout \
#| chainNet stdin -minSpace=1 ../S1.len ../S2.len stdout /dev/null \
#| netSyntenic stdin noClass.net
time chainPreNet all.chain ../S1.len ../S2.len stdout \
    | chainNet stdin -minSpace=10 ../S1.len ../S2.len human.net chimp.net
# 42.860u 2.080s 2:11.11 34.2%
netSyntenic human.net noClass.net
# Add classification info using db tables:
ssh hgwdev
cd /cluster/data/hg17/bed/blastz.panTro1.2004-06-22/axtChain
netClass noClass.net hg17 panTro1 human.net
rm noClass.net
# Make a 'syntenic' subset:
ssh eieio
cd /cluster/data/hg17/bed/blastz.panTro1.2004-06-22/axtChain
# TODO
#rm noClass.net
# Make a 'syntenic' subset of these with
# NOTE: we used -chimpSyn filtering for the reciprocal best nets
# on hg16 -- perhaps should use for nets here as well
netFilter -chimpSyn human.net > humanSyn.net
# Load the nets into database
ssh hgwdev
cd /cluster/data/hg17/bed/blastz.panTro1.2004-06-22/axtChain
netFilter -minGap=10 human.net | hgLoadNet hg17 netPanTro1 stdin
netFilter -minGap=10 humanSyn.net | hgLoadNet hg17 netSyntenyPanTro1 stdin
# Add entries for chainPanTro1, netPanTro1 to
# human/hg17 trackDb
# save chimp net to downloads area
ssh eieio
cd /cluster/data/hg17/bed/blastz.panTro1/axtChain
nice gzip chimp.net
cp chimp.net.gz /usr/local/apache/htdocs/goldenPath/panTro1/vsHg17
cd /usr/local/apache/htdocs/goldenPath/panTro1/vsHg17
md5sum *.gz > md5sum.txt

# RECIPROCAL BEST CHAINS FOR ENSEMBL GENE BUILD (DONE 2004-10-11 kate)
# Starting with the chimp-reference net, which contains the best human
# alignments to chimp, extract the subset of chains in the net.
# (these are the "best" chains of human alignments to chimp).
# Net these chains and use the resulting human-reference net (the
# "reciprocal best" net). Extract the chains from this net to
# obtain "reciprocal best" chains of chimp alignments to human.
ssh kolossus
cd /cluster/data/hg17/bed/blastz.panTro1.2004-06-22/axtChain
mkdir rBest
grep chain all.chain | wc -l
# extract "best" chains from the chimp-reference net
time chainSwap all.chain stdout | \
    netChainSubset chimp.net stdin stdout | \
    chainSort stdin rBest/chimp.best.chain
grep chain rBest/chimp.best.chain | wc -l
# 64396
# for comparison later, extract "best" chains from human-reference net
netChainSubset human.net all.chain stdout | \
    chainSort stdin rBest/human.best.chain
cd rBest
# net the best chains from the chimp net and pull the human-ref net
# (Daryl accidentally deleted human.rbest.net and rebuilt it with the
# same command on 8/14/2005, resulting in a file of the same size)
time chainPreNet chimp.best.chain ../../S2.len ../../S1.len stdout | \
    chainNet stdin -minSpace=10 ../../S2.len ../../S1.len \
    /dev/null human.rbest.net
# extract "reciprocal best" chains from the "best" human-reference net
netChainSubset human.rbest.net ../all.chain stdout | \
    chainSort stdin human.rbest.chain
# take a look
ssh hgwdev
cd /cluster/data/hg17/bed/blastz.panTro1.2004-06-22/axtChain
cd rBest
mkdir rBestChain
chainSplit rBestChain human.rbest.chain
hgLoadChain hg17 chr7_rBestChainPanTro1 rBestChain/chr7.chain
# Loading 1639 chains into hg17.chr7_rBestChainPanTro1
mkdir bestChain
chainSplit bestChain human.best.chain
hgLoadChain hg17 chr7_bestChainPanTro1 bestChain/chr7.chain
# Loading 6516 chains into hg17.chr7_bestChainPanTro1
# compare
hgsql hg16 -s -e "select count(*) from chr7_rBestChainPanTro1"
# 2416
# spot-checked by comparing chr7 best and rbest:
# 1. for a chain appearing in rBest, click thru to human browser,
#    then via chimp net back to human browser at same region
# 2. for a chain in "best", but not rBest, do the same, verify
#    that it produces a different region in the human browser
# post pre-Q/A file for ensembl download
cd /cluster/data/hg17/bed/blastz.panTro1.2004-06-22/axtChain/rBest
gzip human.rbest.chain
cp human.rbest.chain.gz \
    /usr/local/apache/htdocs/kate/ensembl/hg17-panTro1.rbest.chain.gz
cd /usr/local/apache/htdocs/kate/ensembl
md5sum *.gz > md5sum.txt
mv hg17-panTro1.rbest.chain.gz \
    /usr/local/apache/htdocs/goldenPath/hg17/vsPanTro1/hg17.panTro1.rbest.chain.gz
# save as reciprocal best liftover chain (2005-02-22 kate)
gunzip -c human.rbest.chain.gz > \
    /cluster/data/hg17/bed/liftOver/hg17ToPanTro1.rbest.chain
# cleanup (TODO -- after QA)
ssh hgwdev
hgsql hg17 -e "drop table chr7_rBestChainPanTro1"
hgsql hg17 -e "drop table chr7_bestChainPanTro1"
cd /cluster/data/hg17/bed/blastz.panTro1.2004-06-22/axtChain
mv rBest/human.rbest.chain.gz ..
rm -fr rBest

# RECIPROCAL BEST AXT'S FROM RECIPROCAL BEST CHAIN (2005-08-16 kate)
# (requested by Daryl)
cd /cluster/data/hg17/bed/blastz.panTro1
mkdir -p axtRBestNet
cat > makeRbestAxt.csh << 'EOF'
foreach f (axtChain/rBest/rBestChain/*.chain)
  set c = $f:t:r
  echo $c
  chainToAxt $f /cluster/data/hg17/nib /cluster/data/panTro1/nib stdout \
    | axtSort stdin axtRBestNet/$c.axt
end
'EOF'
# << for emacs
csh makeRbestAxt.csh >&!
makeRbestAxt.log & # GENERATE CHIMP MAF FOR MULTIZ FROM NET (DONE 2004-06-24 kate) # Redo to fix overlap problem using 8/05 netToAxt (2005-08-16 kate) # Replace bad chr5 axtNet and mafNet (2006-01-05 kate) # There was apparently a bad chr5 nib for a while... ssh kkstore02 cd /cluster/data/hg17/bed/blastz.panTro1.2004-06-22/axtChain netSplit human.net net mkdir axtNet mafNet cat > makeMaf.csh << 'EOF' foreach f (axtChain/net/*.net) set c = $f:t:r netToAxt $f axtChain/chain/$c.chain /cluster/data/hg17/nib \ /cluster/data/panTro1/nib stdout | axtSort stdin axtNet/$c.axt axtToMaf axtNet/$c.axt \ /cluster/data/hg17/chrom.sizes /cluster/data/panTro1/chrom.sizes \ mafNet/$c.maf -tPrefix=hg17. -qPrefix=panTro1. end 'EOF' # << for emacs csh makeMaf.csh >&! makeMaf.log & tail -100f makeMaf.log mkdir -p /cluster/bluearc/hg17/mafNet cp -rp mafNet /cluster/bluearc/hg17/mafNet/panTro1 # MAKE PANTRO1 DOWNLOADABLES (DONE 2004-09-14 kate) # Redo panTro1.net.gz (it was truncated) 2004-10-07 kate # Redo axtNets with non-overlapped versions (2005-08-29 kate) # Replace bad chr5 axtNet and mafNet (2006-01-05 kate) ssh eieio cd /cluster/data/hg17/bed/blastz.panTro1 # gzip chains and nets mkdir gz cd gz nice gzip -c ../axtChain/all.chain > panTro1.chain.gz nice gzip -c ../axtChain/human.net > panTro1.net.gz wc -l *.gz cd ../axtNet time nice gzip *.axt # 46 mins. ssh hgwdev # copy chains and nets to downloads area cd /usr/local/apache/htdocs/goldenPath/hg17 mkdir -p vsPanTro1 cd vsPanTro1 mv /cluster/data/hg17/bed/blastz.panTro1/gz/*.gz . md5sum *.gz > md5sum.txt # copy in README and edit rmdir /cluster/data/hg17/bed/blastz.panTro1/gz mkdir -p axtNet cd axtNet cp /cluster/data/hg17/bed/blastz.panTro1/axtNet/*.axt.gz . md5sum *.gz > md5sum.txt # RESCORE CHICKEN BLASTZ (DONE 6/23/04 angie) # Webb noticed low scores when using non-default BLASTZ_Q scoring matrix # and repeats abridged -- # PSU's restore_rpts program rescored alignments with default matrix # instead of BLASTZ_Q matrix. Rescore them here so the chainer sees # the higher scores: ssh kolossus cd /cluster/data/hg17/bed/blastz.galGal2.2004-06-14 mkdir axtChrom.rescore foreach f (axtChrom/chr*.axt) axtRescore -scoreScheme=/cluster/data/blastz/HoxD55.q \ $f axtChrom.rescore/$f:t end mv axtChrom axtChrom.preRescore mv axtChrom.rescore axtChrom # CHAIN CHICKEN BLASTZ (DONE 6/23/04 angie) # Run axtChain on little cluster ssh kki cd /cluster/data/hg17/bed/blastz.galGal2.2004-06-14 mkdir -p axtChain/run1 cd axtChain/run1 mkdir out chain ls -1S /cluster/data/hg17/bed/blastz.galGal2.2004-06-14/axtChrom/*.axt \ > input.lst cat << '_EOF_' > gsub #LOOP doChain {check in exists $(path1)} {check out line+ chain/$(root1).chain} {check out line+ out/$(root1).out} #ENDLOOP '_EOF_' # << this line makes emacs coloring happy cat << '_EOF_' > doChain #!/bin/csh axtChain -scoreScheme=/cluster/data/blastz/HoxD55.q \ -linearGap=/cluster/data/blastz/chickenHumanTuned.gap \ -minScore=5000 $1 \ /iscratch/i/hg17/bothMaskedNibs \ /iscratch/i/galGal2/nib $2 > $3 '_EOF_' # << this line makes emacs coloring happy chmod a+x doChain gensub2 input.lst single gsub jobList para create jobList para try, check, push, check... 
# axtChrom/chr18_random.axt is empty, so the {out line +} check failed: #Completed: 45 of 46 jobs #Crashed: 1 jobs #Average job time: 46s 0.76m 0.01h 0.00d #Longest job: 273s 4.55m 0.08h 0.00d #Submission to last job: 519s 8.65m 0.14h 0.01d # now on the cluster server, sort chains ssh eieio cd /cluster/data/hg17/bed/blastz.galGal2.2004-06-14/axtChain chainMergeSort run1/chain/*.chain > all.chain chainSplit chain all.chain rm run1/chain/*.chain # Load chains into database ssh hgwdev cd /cluster/data/hg17/bed/blastz.galGal2.2004-06-14/axtChain/chain foreach i (*.chain) set c = $i:r echo loading $c hgLoadChain hg17 ${c}_chainGalGal2 $i end # NET CHICKEN BLASTZ (DONE 6/23/04 angie) ssh eieio cd /cluster/data/hg17/bed/blastz.galGal2.2004-06-14/axtChain chainPreNet all.chain ../S1.len ../S2.len stdout \ | chainNet stdin -minSpace=1 ../S1.len ../S2.len stdout /dev/null \ | netSyntenic stdin noClass.net # Add classification info using db tables: ssh hgwdev cd /cluster/data/hg17/bed/blastz.galGal2.2004-06-14/axtChain netClass noClass.net hg17 galGal2 human.net # Make a 'syntenic' subset: ssh eieio cd /cluster/data/hg17/bed/blastz.galGal2.2004-06-14/axtChain rm noClass.net # Make a 'syntenic' subset of these with netFilter -syn human.net > humanSyn.net # Load the nets into database ssh hgwdev cd /cluster/data/hg17/bed/blastz.galGal2.2004-06-14/axtChain netFilter -minGap=10 human.net | hgLoadNet hg17 netGalGal2 stdin netFilter -minGap=10 humanSyn.net | hgLoadNet hg17 netSyntenyGalGal2 stdin # Add entries for chainGalGal2, netGalGal2, syntenyGalGal2 to # human/hg17 trackDb # XENOPUS BLASTZ/CHAIN/NET (DONE 9/24/04 jk) # see makeXenTro1.doc and search for zb.hg17 # The results of this are also symlinked under hg17/bed # GENERATE GALGAL2 MAF FOR MULTIZ FROM NET (DONE 6/23/04 angie) # Redo net axt's and maf's to fix overlap problem (use 8/05 netToAxt) # (2005-08-16 kate) ssh kkstore02 cd /cluster/data/hg17/bed/blastz.galGal2.2004-06-14/axtChain netSplit human.net net cd .. mkdir axtNet mafNet cat > makeMaf.csh << 'EOF' foreach f (axtChain/net/*) set chr = $f:t:r netToAxt $f axtChain/chain/$chr.chain /cluster/data/hg17/nib \ /cluster/data/galGal2/nib stdout \ | axtSort stdin axtNet/$chr.axt axtToMaf axtNet/$chr.axt \ /cluster/data/hg17/chrom.sizes /cluster/data/galGal2/chrom.sizes \ mafNet/$chr.maf -tPrefix=hg17. -qPrefix=galGal2. end 'EOF' # << for emacs csh makeMaf.csh >&! makeMaf.log & mkdir -p /cluster/bluearc/hg17/mafNet cp -rp mafNet /cluster/bluearc/hg17/mafNet/galGal2 # MAKE VSGALGAL2 DOWNLOADABLES (REDONE 9/13/04 angie) # REDO axtNet's to fix overlaps (2005-09-12 kate) ssh eieio cd /cluster/data/hg17/bed/blastz.galGal2.2004-06-14/axtChain gzip -c all.chain > /cluster/data/hg17/zip/chicken.chain.gz gzip -c human.net > /cluster/data/hg17/zip/chicken.net.gz mkdir /cluster/data/hg17/zip/axtNet foreach f (axtNet/chr*axt) gzip -c $f > /cluster/data/hg17/zip/$f.gz end # Doh! above for loop didn't work because all axt's have been removed # from this dir! :| Just this once, regenerate compressed axtNet on # the fly: ssh kolossus cd /cluster/data/hg17/bed/blastz.galGal2.2004-06-14/axtChain/net foreach f (*.net) set chr = $f:t:r echo $chr netToAxt $f ../chain/$chr.chain /cluster/data/hg17/nib \ /cluster/data/galGal2/nib stdout \ | axtSort stdin stdout \ | gzip -c > /cluster/data/hg17/zip/axtNet/$chr.axt.gz end ssh hgwdev mkdir /usr/local/apache/htdocs/goldenPath/hg17/vsGalGal2 cd /usr/local/apache/htdocs/goldenPath/hg17/vsGalGal2 mv /cluster/data/hg17/zip/chicken*.gz . mv /cluster/data/hg17/zip/axtNet . 
md5sum *.gz */*.gz > md5sum.txt # Copy over & edit README.txt w/pointers to chain, net formats. # REDO axtNet downloads to fix overlaps (2005-09-13 kate) ssh kkstore02 cd /cluster/data/hg17/bed/blastz.galGal2/axtNet nice gzip *.axt md5sum *.axt.gz > md5sum.txt ssh hgwdev cd /usr/local/apache/htdocs/goldenPath/hg17/vsGalGal2 mv axtNet axtNet.old ln -s /cluster/data/hg17/bed/blastz.galGal2/axtNet . # 8-WAY MULTIZ MULTIPLE ALIGNMENT WITH MM5 (DONE 2004-07-13 kate) # Redo, below to fix overlapping alignments (2005-08-16 kate) ssh eieio set multizDir = multiz.2004-07-13 set workingDir = /cluster/bluearc/hg17/$multizDir ln -s $workingDir /cluster/bluearc/hg17/multiz8way mkdir -p $workingDir mkdir -p /cluster/data/hg17/bed/$multizDir cd /cluster/data/hg17/bed/$multizDir # wrapper script for multiz # NOTE: first arg is pairwise, 2nd arg is multiple (to add to) # NOTE: next time, modify script so it only needs one arg -- saves the # multiple dirname in a file for use by the next run cat << 'EOF' > doMultiz.csh #!/bin/csh -fe mkdir -p $3:h /cluster/bin/penn/multiz $1 $2 - > $3 'EOF' # << for emacs cat << 'EOF' > gsub #LOOP ../doMultiz.csh {check in line /cluster/bluearc/hg17/multiz.2004-07-13/$(dir1)/$(root2).maf} {check in line /cluster/bluearc/hg17/multiz.2004-07-13/$(root1)/$(root2).maf} {check out line+ /cluster/bluearc/hg17/multiz.2004-07-13/$(root1)$(dir1)/$(root2).maf} #ENDLOOP 'EOF' # << for emacs chmod +x doMultiz.csh ssh eieio set workingDir = /cluster/bluearc/hg17/multiz.2004-07-13 # copy mafs to bluearc -- chimp mkdir $workingDir/panTro1 cp /cluster/data/hg17/bed/blastz.panTro1/mafNet/*.maf \ $workingDir/panTro1 ls $workingDir/panTro1/*.maf > chrom.lst # mouse mkdir $workingDir/mm5 cp /cluster/data/hg17/bed/blastz.mm5/mafNet/chr*.maf $workingDir/mm5 # rat mkdir $workingDir/rn3 cp /cluster/data/hg17/bed/blastz.rn3/mafNet/chr*.maf $workingDir/rn3 # dog mkdir $workingDir/canFam1 foreach f (/cluster/data/hg17/bed/blastz.canFam1.2004-07-08/mafNet/chr*.maf) set c = $f:r:r:t echo $c cp $f $workingDir/canFam1/$c.maf end # chicken mkdir $workingDir/galGal2 foreach f (/cluster/data/hg17/bed/blastz.galGal2/mafNet/chr*.maf) set c = $f:r:r:t cp $f $workingDir/galGal2/$c.maf end # fugu mkdir $workingDir/fr1 cp /cluster/data/hg17/bed/blastz.fr1/mafNet/chr*.maf $workingDir/fr1 # zebrafish mkdir $workingDir/danRer1 cp /cluster/data/hg17/bed/blastz.danRer1.swap/mafNet/chr*.maf \ $workingDir/danRer1 # first multiz - add in mm5 mouse to human/chimp # ssh kki set multizDir = multiz.2004-07-13 set workingDir = /cluster/bluearc/hg17/$multizDir cd /cluster/data/hg17/bed/$multizDir mkdir run.mm5 cd run.mm5 echo "mm5/panTro1" > species.lst gensub2 species.lst ../chrom.lst ../gsub jobList para create jobList # 46 jobs para try, check, push, check # CPU time in finished jobs: 6620s 110.33m 1.84h 0.08d 0.000 y # IO & Wait Time: 3685s 61.42m 1.02h 0.04d 0.000 y # Average job time: 224s 3.73m 0.06h 0.00d # Longest job: 819s 13.65m 0.23h 0.01d # Submission to last job: 1474s 24.57m 0.41h 0.02d cd .. # rat mkdir run.rn3 cd run.rn3 echo "rn3/panTro1mm5" > species.lst gensub2 species.lst ../chrom.lst ../gsub jobList para create jobList # 46 jobs para try, check, push, check cd .. 
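# (Aside, a sketch -- not run: each of these per-species multiz runs uses
# a one-line species.lst of the form "newDb/mafDirSoFar", and the output
# directory name concatenates the two, so the whole series of species.lst
# contents could be generated like this:)
# set sofar = panTro1
# foreach db (mm5 rn3 canFam1 galGal2 fr1 danRer1)
#   echo "$db/$sofar"
#   set sofar = ${sofar}$db
# end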
# dog
mkdir run.canFam1
cd run.canFam1
echo "canFam1/panTro1mm5rn3" > species.lst
gensub2 species.lst ../chrom.lst ../gsub jobList
para create jobList
# 46 jobs
para try, check, push, check
cd ../
# chicken
mkdir run.galGal2
cd run.galGal2
echo "galGal2/panTro1mm5rn3canFam1" > species.lst
gensub2 species.lst ../chrom.lst ../gsub jobList
# no alignment file for chr18_random -- create one so we can create jobList
touch $workingDir/galGal2/chr18_random.maf
para create jobList
# 46 jobs
para try, check, push, check
# 1 crashed job for empty file chr18_random
cd ..
# fugu
mkdir run.fr1
cd run.fr1
echo "fr1/panTro1mm5rn3canFam1galGal2" > species.lst
gensub2 species.lst ../chrom.lst ../gsub jobList
# create empty alignment file for missing one (no alignments)
touch /cluster/bluearc/hg17/multiz.2004-07-13/fr1/chr6_hla_hap1.maf
para create jobList
# 46 jobs
para try, check, push, check
# 1 crashed job for empty file chr6_hla_hap1
cd ..
# zebrafish
mkdir run.danRer1
cd run.danRer1
echo "danRer1/panTro1mm5rn3canFam1galGal2fr1" > species.lst
gensub2 species.lst ../chrom.lst ../gsub jobList
para create jobList
# 46 jobs
para try, check, push, check
cd ..
# copy 8-way mafs to build directory
ssh eieio
set multizDir = multiz.2004-07-13
set workingDir = /cluster/bluearc/hg17/$multizDir
ln -s $workingDir/panTro1mm5rn3canFam1galGal2fr1danRer1 $workingDir/maf
cd /cluster/data/hg17/bed/multiz.2004-07-13
mkdir maf
cp $workingDir/maf/*.maf maf
# copy to download area (2004-07-27 angie)
# moved gzipped files to mafDownload dir and recreated symlinks
# (2006-04-23 kate)
cd /usr/local/apache/htdocs/goldenPath/hg17
mkdir -p mzPt1Mm5Rn3Cf1Gg2Fr1Dr1
# gzipped & copied maf files from /cluster/data/hg17/bed/multiz8way/maf
# dumped table and gzipped for download (user request) after the files
# were removed when the track was replaced by 18way.
cd /cluster/data/hg17/bed/multiz8way/mafDownloads
hgsqldump --all -c --tab=. hg17 multiz8way
ssh kkstore02 \
    'gzip /cluster/data/hg17/bed/multiz8way/mafDownloads/multiz8way.{sql,txt}'
ln -s /cluster/data/hg17/bed/multiz8way/mafDownloads/multiz8way.{sql,txt}.gz \
    /usr/local/apache/htdocs/goldenPath/hg17/multiz8way
# load summary table (2005-09-27)
cd /cluster/data/hg17/bed/multiz.2004-07-13/maf
time cat chr*.maf | hgLoadMafSummary hg17 multiz8waySummary stdin
# 30 minutes ?
# NOTE: this didn't improve track display time at 5MB, so
# I'm leaving it out of trackDb (sticking with pairwise maf's) for now.
# It may be that this helps performance only with larger numbers
# of species.
# Create upstream files for download (2004-09-13 kate)
ssh hgwdev
cd /cluster/data/hg17/bed/multiz8way
echo hg17 panTro1 mm5 rn3 canFam1 galGal2 fr1 danRer1 > org.txt
# mafFrags takes a while
foreach i (1000 2000 5000)
  echo "making upstream$i.maf"
  featureBits hg17 refGene:upstream:$i -fa=/dev/null -bed=up.bad
  awk -F '\t' '{printf("%s\t%s\t%s\t%s\t%s\t%s\n", $1, $2, $3, substr($4, 0, 9), 0, $5)}' up.bad > up.bed
  rm up.bad
  mafFrags hg17 multiz8way up.bed upstream$i.maf -orgs=org.txt
  rm up.bed
end
ssh eieio
cd /cluster/data/hg17/bed/multiz8way
nice gzip upstream{1000,2000,5000}.maf
# 6 mins.
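# (Optional sanity check, a sketch: each upstream maf should have roughly
# one alignment block per refGene upstream region; MAF blocks start with
# "a ", so counting those lines gives the block count:)
foreach i (1000 2000 5000)
  echo "upstream$i:" `zcat upstream$i.maf.gz | grep -c "^a "`
end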
ssh hgwdev cd /usr/local/apache/htdocs/goldenPath/hg17 ln -s mzPt1Mm5Rn3Cf1Gg2Fr1Dr1 multiz8way mv /cluster/data/hg17/bed/multiz8way/upstream*.maf.gz multiz8way # PHYLO-HMM (PHASTCONS) CONSERVATION FOR 8-WAY WITH MM5 (DONE 2004-07-20 kate) # (this was partially redone by acs using the new phastCons, 08-28; # I've tried to merge the two sets of docs into one cohesive # description) # More revisions, acs, 09-13 ssh eieio set path = ($path /cluster/bin/phast) cd /cluster/data/hg17/bed/multiz.2004-07-13 mkdir cons cd cons #break up the genome-wide MAFs into pieces mkdir /cluster/bluearc/hg17/chrom cd /cluster/data/hg17 foreach f (`cat chrom.lst`) echo $f cp $f/*.fa /cluster/bluearc/hg17/chrom end ssh kki cd /cluster/data/hg17/bed/multiz.2004-07-13/cons mkdir run.split cd run.split set WINDOWS = /cluster/bluearc/hg17/multiz.2004-07-13/cons/WINDOWS rm -fr $WINDOWS mkdir -p $WINDOWS cat << 'EOF' > doSplit.sh #!/bin/sh PHAST=/cluster/bin/phast FA_SRC=/cluster/bluearc/hg17/chrom WINDOWS=/cluster/bluearc/hg17/multiz.2004-07-13/cons/WINDOWS maf=$1 c=`basename $maf .maf` echo $c mkdir -p /scratch/msa_split ${PHAST}/msa_split $maf -i MAF -M ${FA_SRC}/$c.fa -O hg17,panTro1,mm5,rn3,canFam1,galGal2,fr1,danRer1 -w 1000000,0 -r /scratch/msa_split/$c -o SS -I 1000 -B 5000 [ $? -eq 0 ] || exit 1 echo "Copying..." cd /scratch/msa_split for file in $c.*.ss ; do gzip -c $file > ${WINDOWS}/$file.gz ; done [ $? -eq 0 ] || exit 1 rm -f /scratch/msa_split/$c.*.ss echo "Done copying" echo "Done" >> ${WINDOWS}/$c.done 'EOF' # << for emacs chmod +x doSplit.sh rm -f jobList foreach file (/cluster/bluearc/hg17/multiz.2004-07-13/maf/*.maf) set c = $file:t:r echo "doSplit.sh $file {check out line+ $WINDOWS/$c.done}" >> jobList end para create jobList # 46 jobs para try para check para push # 2 crashed jobs -- due to no alignments in input maf # chr18_random, chr6_hla_hap1 cd .. # now generate conservation scores and predicted elements ssh hgwdev cd /cluster/data/hg17/bed/multiz.2004-07-13/cons mkdir run.elements # despite the name, I've put the elements and the new conservation # scores here # first produce a rough starting model; in this case, we can just # use the model previously estimated (see the entry below on PHYLOFIT/PHASTCONS) cp /cluster/bluearc/hg17/multiz.2004-07-13/panTro1mm5rn3canFam1/hg17panTro1rn3mm5canFam1galGal2fr1danRer1.mod starting-tree.mod # In other cases, it would be sufficient to choose an arbitrary # input file from the WINDOWS directory (choose one with plenty of # data, i.e., large NTUPLES) and run phyloFit on it with the # correct tree topology, e.g., # phyloFit -i SS datafile.ss --tree \ # "(((((hg17,panTro1),(mm5,rn3)),canFam1),galGal2),(fr1,danRer1))" \ # --out-root starting-tree # Get genome-wide average GC content (for all species together, # not just the reference genome). If you have a globally # estimated tree model, as above, you can get this from the # BACKGROUND line in the .mod file. E.g., # ALPHABET: A C G T # ... # BACKGROUND: 0.294633 0.205082 0.205189 0.295097 # This implies a GC content of 0.205 + 0.205 = 0.410 # If you do *not* have a global tree model and you do not know # your GC content, you can get it directly from the MAFs with # a command like: # msa_view --aggregate hg17,panTro1,rn3,mm5,canFam1,galGal2,danRer1,fr1 \ # -i MAF --summary-only /cluster/data/hg17/bed/multiz.2004-07-13/maf/chr*.maf\ # > maf_summary.txt # This will take a little while (30-60 min). Run on eieio. # now set up cluster job to estimate model parameters. 
Parameters # will be estimated separately for each alignment fragment then # will be combined across fragments cat << 'EOF' > doEstimate.sh #!/bin/sh zcat $1 | /cluster/bin/phast/phastCons - starting-tree.mod --gc 0.410 --nrates 1,1 --no-post-probs --ignore-missing --expected-lengths 12 --target-coverage 0.17 --quiet --log $2 --estimate-trees $3 EOF # Be sure to substitute in the right G+C content. Also, notice the # target coverage of 0.17. We actually want 5% coverage here but # the final (posterior) coverage is only indirectly related to the # expected (prior) coverage. One thing to consider is that we # only have about 40% alignment coverage (excluding chimp, which # doesn't help us much in identifying conserved regions). As far # as phastCons is concerned, we want to aim for about 0.05 / 0.4 = # 0.125 coverage. In this case, though, --target-coverage # 0.125 resulted in only about 4.1% coverage. I had to iterate # a couple of times (using only chromosome 1) to find a value that # got me close to the target of 5% chmod u+x doEstimate.sh rm -fr LOG TREES mkdir -p LOG TREES rm -f jobs.lst # watch out: bash assumed below in a few places for f in /cluster/bluearc/hg17/multiz.2004-07-13/cons/WINDOWS/*.ss.gz ; do \ root=`basename $f .ss.gz` ;\ echo doEstimate.sh $f LOG/$root.log TREES/$root >> jobs.lst ;\ done # run cluster job ssh kk, cd /cluster/data/hg17/bed/multiz.2004-07-13/cons/run.elements, para create, ... # takes about an hour # Now combine parameter estimates. We can average the .mod files # using phyloBoot. This must be done separately for the conserved # and nonconserved models ls TREES/*.cons.mod > cons.txt phyloBoot --read-mods '*cons.txt' --output-average ave.cons.mod > cons_summary.txt ls TREES/*.noncons.mod > noncons.txt phyloBoot --read-mods '*noncons.txt' --output-average ave.noncons.mod > noncons_summary.txt # look over the files cons_summary.txt and noncons_summary.txt. # The means and medians should be roughly equal and the stdevs # should be reasonably small compared to the means, particularly # for rate matrix parameters (at bottom) and for branches to the # leaves of the tree. The stdevs may be fairly high for branches # near the root of the tree; that's okay. Some min values may be # 0 for some parameters. That's okay, but watch out for very large # values in the max column, which might skew the mean. If you see # any signs of bad outliers, you may have to track down the # responsible .mod files and throw them out. I've never had to do # this; the estimates generally seem pretty well behaved. # NOTE: Actually, a random sample of several hundred to a thousand # alignment fragments (say, a number equal to the number of # available cluster nodes) should be more than adequate for # parameter estimation. If pressed for time, use this strategy. # Now we are ready to set up the cluster job for computing the # conservation scores and predicted elements. It's all downhill # from here. cat << 'EOF' > doPhastCons.sh #!/bin/sh mkdir -p /cluster/bluearc/hg17/phastCons/POSTPROBS /cluster/bluearc/hg17/phastCons/ELEMENTS pref=`basename $1 .ss.gz` chr=`echo $pref | awk -F\. 
'{print $1}'` tmpfile=/scratch/phastCons.$$ zcat $1 | /cluster/bin/phast/phastCons - ave.cons.mod,ave.noncons.mod --expected-lengths 12 --target-coverage 0.17 --quiet --seqname $chr --idpref $pref --viterbi /cluster/bluearc/hg17/phastCons/ELEMENTS/$pref.bed --score --require-informative 0 > $tmpfile gzip -c $tmpfile > /cluster/bluearc/hg17/phastCons/POSTPROBS/$pref.pp.gz rm $tmpfile EOF chmod u+x doPhastCons.sh rm -fr /cluster/bluearc/hg17/phastCons/POSTPROBS /cluster/bluearc/hg17/phastCons/ELEMENTS rm -f jobs2.lst for f in /cluster/bluearc/hg17/multiz.2004-07-13/cons/WINDOWS/*.ss.gz ; do echo doPhastCons.sh $f >> jobs2.lst ; done # run cluster job ssh kk, cd /cluster/data/hg17/bed/multiz.2004-07-13/cons/run.elements, para create, ... logout # takes about 20 minutes # combine predictions and transform scores to be in 0-1000 interval # do in a way that avoids limits on numbers of args find /cluster/bluearc/hg17/phastCons/ELEMENTS -name "*.bed" > files rm -f splitfiles* all.raw.bed split files splitfiles for s in splitfiles* ; do awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", $1, $2, $3, $5, $5;}' `cat $s` >> all.raw.bed ; done /cluster/bin/scripts/lodToBedScore all.raw.bed > all.bed rm files splitfiles* hgLoadBed hg17 phastConsElements all.bed hgLoadBed -chrom=chr1 hg17 phastConsElements all.bed # check coverage featureBits hg17 phastConsElements #137850739 bases of 2866216770 (4.810%) in intersection # This should be close enough. If necessary, you can rerun the # steps above with a different target coverage. When hitting the # target is important, you may want to perform several iterations # using a representative subset of the entire dataset (human chr1 # seems to work pretty well) # set up wiggle mkdir -p /cluster/bluearc/hg17/phastCons/wib cat << 'EOF' > doWigAsciiToBinary.sh #!/bin/sh chr=$1 zcat `ls /cluster/bluearc/hg17/phastCons/POSTPROBS/$chr.*.pp.gz | sort -t\. -k2,2n` | wigAsciiToBinary -chrom=$chr -wibFile=/cluster/bluearc/hg17/phastCons/wib/${chr}_phastCons stdin EOF chmod u+x doWigAsciiToBinary.sh rm -f jobs3.lst for chr in `ls /cluster/bluearc/hg17/phastCons/POSTPROBS | awk -F\. '{print $1}' | sort -u` ; do echo doWigAsciiToBinary.sh $chr >> jobs3.lst ; done # run a little wigAsciiToBinary cluster job ssh kk, etc. # copy wibs and wigs from bluearc rsync -av /cluster/bluearc/hg17/phastCons/wib . # load track hgLoadWiggle hg17 phastCons -pathPrefix=/gbdb/hg17/phastCons/wib \ wib/chr*_phastCons.wig mkdir -p /gbdb/hg17/phastCons/wib rm -f /gbdb/hg17/phastCons/wib/chr*phastCons.wib ln -s /cluster/data/hg17/bed/multiz.2004-07-13/cons/run.elements/wib/*.wib /gbdb/hg17/phastCons/wib chmod 775 . wib /gbdb/hg17/phastCons /gbdb/hg17/phastCons/wib chmod 664 wib/*.wib # move postprobs over and clean up bluearc rsync -av /cluster/bluearc/hg17/phastCons/POSTPROBS . 
# (people sometimes want the raw scores)
rm -r /cluster/bluearc/hg17/phastCons/ELEMENTS /cluster/bluearc/hg17/phastCons/POSTPROBS /cluster/bluearc/hg17/phastCons/wib

# set up full alignment/conservation track ("multiz8way")
# load multiz maf tables
ssh hgwdev
cd /cluster/data/hg17/bed/multiz.2004-07-13
set mafDir = /gbdb/hg17/multiz8way/maf
set table = multiz8way
mkdir -p $mafDir/$table
ln -s `pwd`/maf/*.maf $mafDir/$table
cd maf
hgLoadMaf hg17 -warn multiz8way -pathPrefix=$mafDir/$table
# someone dropped the table from hgwdev
# reload (2007-03-19 kate)
nice hgLoadMaf hg17 -warn multiz8way -pathPrefix=/gbdb/hg17/multiz8wayFixed
cat /gbdb/hg17/multiz8wayFixed/*.maf | \
    nice hgLoadMafSummary hg17 -minSize=30000 -mergeGap=1500 -maxSize=200000 \
        multiz8waySummary stdin
# load blastz maf tables
# TODO: change mafWiggle to use db names instead of species names
# in speciesOrder
# link files into /gbdb table dir
ln -s /cluster/data/hg17/bed/blastz.panTro1/mafNet $mafDir/chimp_netBlastz
ln -s /cluster/data/hg17/bed/blastz.mm5/mafNet $mafDir/mouse_netBlastz
ln -s /cluster/data/hg17/bed/blastz.rn3/mafNet $mafDir/rat_netBlastz
ln -s /cluster/data/hg17/bed/blastz.canFam1.2004-07-08/mafNet $mafDir/dog_netBlastz
ln -s /cluster/data/hg17/bed/blastz.galGal2/mafNet $mafDir/chicken_netBlastz
ln -s /cluster/data/hg17/bed/blastz.fr1/mafNet $mafDir/fugu_netBlastz
ln -s /cluster/data/hg17/bed/blastz.danRer1.swap/mafNet $mafDir/zebrafish_netBlastz
# remove empty file, disliked by hgLoadMaf
# NOTE: these shouldn't be empty -- next time, make sure previous
# alignments are copied over to output maf (multiz won't if there's
# an empty input file).
rm chicken/chr18_random.maf
rm fugu/chr6_hla_hap1.maf
# load tables
foreach s (chimp mouse rat dog chicken fugu zebrafish)
  set table = ${s}_netBlastz
  echo "$s $mafDir/$table"
  ~kate/bin/i386/hgLoadMaf hg17 -warn ${s}_netBlastz -pathPrefix=$mafDir/$table
end
# trackDb entry:
# track multiz8way
# shortLabel Conservation
# longLabel Chimp/Mouse/Rat/Dog/Chicken/Fugu/Zebrafish Multiz Alignments & Conservation
# group compGeno
# priority 149
# visibility pack
# color 0, 10, 100
# type wigMaf 0.0 1.0
# maxHeightPixels 100:40:11
# wiggle phastCons
# yLineOnOff Off
# autoScale Off
# pairwise netBlastz
# speciesOrder chimp mouse rat dog chicken fugu zebrafish

# PHASTCONS SCORES DOWNLOADABLES (REDONE 6/15/05 angie)
# Initially done 10/11/04, but using scores from run.cons -- which
# had been replaced by scores in run.elements, where I did not think
# to look for scores. :( !
ssh eieio
mkdir /cluster/data/hg17/zip/mzPt1Mm5Rn3Cf1Gg2Fr1Dr1
cd /cluster/data/hg17/bed/multiz8way/cons/run.elements/POSTPROBS
foreach chr (`awk '{print $1;}' /cluster/data/hg17/chrom.sizes`)
  echo $chr
  nice zcat `ls -1 $chr.*.pp.gz | sort -t\. -k2,2n` \
    | nice gzip -c \
    > /cluster/data/hg17/zip/mzPt1Mm5Rn3Cf1Gg2Fr1Dr1/$chr.gz
end
ssh hgwdev
mkdir /usr/local/apache/htdocs/goldenPath/hg17/phastCons
# Doh! /cluster/data/hg17/zip/mzPt1Mm5Rn3Cf1Gg2Fr1Dr1 is 11G now --
# too much to dump on hgwdev's / which is at 94%. So don't do this:
#mv /cluster/data/hg17/zip/mzPt1Mm5Rn3Cf1Gg2Fr1Dr1 .
# make symbolic links instead:
mkdir /usr/local/apache/htdocs/goldenPath/hg17/phastCons/mzPt1Mm5Rn3Cf1Gg2Fr1Dr1
cd /usr/local/apache/htdocs/goldenPath/hg17/phastCons/mzPt1Mm5Rn3Cf1Gg2Fr1Dr1
ln -s /cluster/data/hg17/zip/mzPt1Mm5Rn3Cf1Gg2Fr1Dr1/* .
md5sum *.gz > md5sum.txt
# make a README.txt.
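# (Aside on the `sort -t\. -k2,2n` used above: the second dot-separated
# field of each per-window file name begins with the window start
# coordinate, so a numeric sort on that field puts the windows back in
# coordinate order before the zcat. Quick way to eyeball it, a sketch:)
# ls -1 /cluster/data/hg17/bed/multiz8way/cons/run.elements/POSTPROBS/chr1.*.pp.gz \
#     | sort -t\. -k2,2n | head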
# PHYLOFIT AND TREE-DOCTOR FOR 8-WAY: ESTIMATE PHYLOGENETIC TREE (acs)
# (This was originally done for phastCons but is not necessary with
# the new version. However, it may be useful for other purposes, so
# I'm leaving it in as a separate entry.)
# first estimate a model for the mammals
ssh eieio
cd /cluster/bluearc/hg17/multiz.2004-07-13/panTro1mm5rn3canFam1
# collect sufficient stats (takes maybe an hour)
for file in *.maf ; do echo $file ; msa_view -i MAF $file -o SS --order hg17,panTro1,rn3,mm5,canFam1 > `basename $file .maf`.ss ; done
ls *.ss | grep -v chr6_hla_hap2 > files
msa_view '*files' --aggregate hg17,panTro1,rn3,mm5,canFam1 -i SS -o SS > all.ss
# BTW, this can now be done in one step using something like:
# msa_view --aggregate hg17,panTro1,rn3,mm5,canFam1 -i MAF -o SS *.maf > all.ss
# (modify to exclude certain files if necessary)
# estimate model, with rate variation (takes about a minute)
phyloFit all.ss --nrates 10 --tree "(((hg17,panTro1),(rn3,mm5)),canFam1)" --alpha 4.4 --EM --log log -i SS --out-root hprmc-rev-dg
# (Actually, --nrates 4 should be more than adequate for most purposes)
cat hprmc-rev-dg.mod
#ALPHABET: A C G T
#ORDER: 0
#SUBST_MOD: REV
#NRATECATS: 10
#ALPHA: 4.658942
#TRAINING_LNL: -6889216721.159384
#BACKGROUND: 0.294633 0.205082 0.205189 0.295097
#RATE_MAT:
# -0.865237 0.159990 0.554805 0.150442
# 0.229851 -1.194646 0.168269 0.796526
# 0.796651 0.168182 -1.194919 0.230086
# 0.150205 0.553556 0.159985 -0.863747
#TREE: (((1:0.006523,2:0.007997):0.103779,(3:0.104867,4:0.078911):0.265676):0.112364,5:0.112364);
# now extrapolate to fish and chicken using tree_doctor and the CFTR 25 tree
# (replace numbers with names in hprmc-rev-dg.mod; this won't be necessary in the future)
tree_doctor --rename "1->hg17;2->panTro1;3->rn3;4->mm5;5->canFam1" hprmc-rev-dg.mod > hprmc-rev-dg.names.mod
# (obtain 8-way subtree from cftr25_hybrid.nh; also map names as necessary to match above)
tree_doctor /cluster/data/nisc/targets/cftr/phyloHMMcons25/cftr25_hybrid.nh --prune-all-but hg16,chimp,mm3,rn3,dog,chicken,fr1,zfish --rename "hg16->hg17;mm3->mm5;chimp->panTro1;dog->canFam1;chicken->galGal2;zfish->danRer1" > cftr8way.nh
# now merge (see tree_doctor help page for explanation)
tree_doctor hprmc-rev-dg.names.mod --merge cftr8way.nh > hg17panTro1rn3mm5canFam1galGal2fr1danRer1.mod
cat hg17panTro1rn3mm5canFam1galGal2fr1danRer1.mod
#ALPHABET: A C G T
#ORDER: 0
#SUBST_MOD: REV
#NRATECATS: 10
#ALPHA: 4.658942
#BACKGROUND: 0.294633 0.205082 0.205189 0.295097
#RATE_MAT:
# -0.865237 0.159990 0.554805 0.150442
# 0.229851 -1.194646 0.168269 0.796526
# 0.796651 0.168182 -1.194919 0.230086
# 0.150205 0.553556 0.159985 -0.863747
#TREE: (((((hg17:0.006523,panTro1:0.007997):0.103779,(rn3:0.104867,mm5:0.078911):0.265676):0.019461,canFam1:0.205267):0.377150,galGal2:0.511134):0.536627,(danRer1:0.905323,fr1:0.922995):0.536627);

# CONSERVED NON-CODING (CNS) TRACK (acs 08/29/04)
# (depends on phastConsElements)
cd /cluster/data/hg17/bed/multiz.2004-07-13/cons/run.elements
featureBits hg17 -bed=possibleCoding.bed -or twinscan:exon xenoMrna mrna intronEst
# (add SGP, exoniphy, possibly others if available)
# now filter out all phastCons elements that overlap possible coding regions
overlapSelect -nonOverlapping possibleCoding.bed all.bed cns.bed
hgLoadBed hg17 cns cns.bed
# track cns
# shortLabel CNS
# longLabel Conserved Non-Coding (Cons Elements Minus Predicted Coding)
# priority 109.11
# group compGeno
# visibility hide
# type bed 5 .
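# (Optional sanity check, a sketch: after the overlapSelect above,
# nothing in cns.bed should still overlap the possibleCoding set, so
# this should print nothing:)
# cd /cluster/data/hg17/bed/multiz.2004-07-13/cons/run.elements
# overlapSelect possibleCoding.bed cns.bed stdout | head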
# PRODUCING GENSCAN PREDICTIONS (DONE - 2004-07-08 - Hiram)
# Needed to download a new binary for this run. Our Linux systems
# have been updated since the last time; the old binary would not run.
# XXX - I thought a new binary was needed. Turned out it was already
# here in our hg3rdParty CVS project. All of this discussed here can
# be simply fetched from cvs: cvs co hg3rdParty/genscanlinux
# Go to: http://genes.mit.edu/GENSCAN.html
# and then to: http://genes.mit.edu/license.html
# Fill in the license agreement and you can then pick up the
# README and the Linux version: genscanlinux.tar.uue.tgz
# To uudecode that file, go to one of the Solaris home machines
# and use the uudecode command:
# uudecode genscanlinux.tar.uue.tgz
# That produces the file: genscanlinux.tar
# Which contains the files:
# drwxr-xr-x chris/burgelab 0 2003-02-17 11:48:44 ./
# -rw-r--r-- chris/burgelab 219056 2000-09-07 12:39:26 ./Arabidopsis.smat
# -rw-r--r-- chris/burgelab 6622 2000-09-07 12:39:26 ./HUMRASH
# -rw-r--r-- chris/burgelab 849 2000-09-07 12:39:26 ./HUMRASH.sample
# -rw-r--r-- chris/burgelab 219050 2000-09-07 12:39:26 ./HumanIso.smat
# -rw-r--r-- chris/burgelab 155735 2000-09-07 12:39:26 ./Maize.smat
# -rw-r--r-- chris/burgelab 24465 2000-09-07 12:39:26 ./README
# -rw-r--r-- chris/burgelab 6344 2000-09-07 12:39:27 ./HUMRASH.ps
# -rwxr-xr-x chris/burgelab 126365 2003-02-17 11:48:44 ./genscan
#
# I placed these currently in: /cluster/home/hiram/GENSCAN/
# I'll check with Angie where it should properly live ...
# XXX - it already lives in 'cvs co hg3rdParty/genscanlinux'
# These instructions should simply check it out right here in
# bed/genscan and make the gsub command refer to these copies.
ssh hgwdev
mkdir /cluster/data/hg17/bed/genscan
cd /cluster/data/hg17/bed/genscan
cvs co hg3rdParty/genscanlinux
ssh eieio
cd /cluster/data/hg17/bed/genscan
# Make 3 subdirectories for genscan to put its output files in
mkdir gtf pep subopt
# Generate a list file, genome.list, of all the contigs
# *that are not pure Ns* (due to heterochromatin, unsequencable
# stuff) which would cause genscan to run forever.
rm -f genome.list
bash
for f in `cat /cluster/data/hg17/contig.lst`
do
  egrep '[ACGT]' /cluster/data/hg17/$f.masked > /dev/null
  if [ $? = 0 ]; then
    echo /cluster/data/hg17/$f.masked >> genome.list
  fi
done
# exit your bash shell if you are [t]csh ...
# This egrep matched all the contigs in hg17. I guess none of
# them are completely Ns at this point.
# Log into kki (not kk!). kki is the driver node for the small
# cluster (kkr2u00 - kkr8u00). Genscan has problems running on the
# big cluster, due to limitation of memory and swap space on each
# processing node.
ssh kki
cd /cluster/data/hg17/bed/genscan
# Create template file, gsub, for gensub2. For example (3-line file):
cat << '_EOF_' > gsub
#LOOP
/cluster/bin/x86_64/gsBig {check in line+ $(path1)} {check out line gtf/$(root1).gtf} -trans={check out line pep/$(root1).pep} -subopt={check out line subopt/$(root1).bed} -exe=hg3rdParty/genscanlinux/genscan -par=hg3rdParty/genscanlinux/HumanIso.smat -tmp=/tmp -window=2400000
#ENDLOOP
'_EOF_'
# << this line makes emacs coloring happy
gensub2 genome.list single gsub jobList
para create jobList
para try
para check
para push
... etc ...
# Completed: 379 of 380 jobs
# Crashed: 1 jobs
# CPU time in finished jobs: 79998s 1333.30m 22.22h 0.93d 0.003 y
# IO & Wait Time: 2989s 49.82m 0.83h 0.03d 0.000 y
# Average job time: 219s 3.65m 0.06h 0.00d
# Longest job: 2999s 49.98m 0.83h 0.03d
# Submission to last job: 8324s 138.73m 2.31h 0.10d
# Running the single failed job on kolossus with a smaller window:
/cluster/bin/x86_64/gsBig /cluster/data/hg17/5/NT_006576/NT_006576.fa.masked \
    gtf/NT_006576.fa.gtf -trans=pep/NT_006576.fa.pep \
    -subopt=subopt/NT_006576.fa.bed -exe=hg3rdParty/genscanlinux/genscan \
    -par=hg3rdParty/genscanlinux/HumanIso.smat -tmp=/tmp -window=2000000
# If there were out-of-memory problems (run "para problems"), then
# re-run those jobs by hand but change the -window arg from 2400000 to
# something lower. In build33, this was 22/NT_011519
# In build34 there were NO failures !
# Convert these to chromosome level files like so:
ssh eieio
cd /cluster/data/hg17/bed/genscan
$HOME/bin/i386/liftUp genscan.gtf ../../jkStuff/liftAll.lft warn gtf/N*.gtf
$HOME/bin/i386/liftUp genscanSubopt.bed ../../jkStuff/liftAll.lft \
    warn subopt/N*.bed
cat pep/*.pep > genscan.pep
# Load into the database like so:
ssh hgwdev
cd /cluster/data/hg17/bed/genscan
ldHgGene hg17 genscan genscan.gtf
# 35 minute job
# Read 42807 transcripts in 325994 lines in 1 files
# 42807 groups 46 seqs 1 sources 1 feature types
hgPepPred hg17 generic genscanPep genscan.pep
# Processing genscan.pep
hgLoadBed hg17 genscanSubopt genscanSubopt.bed
# Reading genscanSubopt.bed
# Loaded 517157 elements of size 6
# Sorted
# Creating table definition for genscanSubopt
# Saving bed.tab
# Loading hg17
# featureBits hg17 genscan
# 55323340 bases of 2866216770 (1.930%) in intersection
# featureBits hg16 genscan
# 55333689 bases of 2865248791 (1.931%) in intersection
# featureBits hg17 genscanSubopt
# 55986178 bases of 2866216770 (1.953%) in intersection
# featureBits hg16 genscanSubopt
# 56082952 bases of 2865248791 (1.957%) in intersection
# Should be zero intersection with rmsk
# featureBits -chrom=chr1 hg17 genscan rmsk
# 794 bases of 222827847 (0.000%) in intersection

# EXTRACT LINEAGE-SPECIFIC REPEATS FOR DOG (DONE 8/1/05 angie)
# Originally done 7/1/04 for canFam1 -- redone 8/1/05 for canFam2.
ssh kolossus
cd /san/sanvol1/scratch/hg17/rmsk
# Run Arian's DateRepsinRMoutput.pl to add extra columns telling
# whether repeats in -query are also expected in -comp species.
# Even though we already have the human-mouse linSpecReps,
# extractLinSpecReps requires two columns of DateRepsinRMoutput.pl
# additions. So add mouse, then ignore it.
# Dog in extra column 1, Mouse in extra column 2
foreach outfl ( *.out )
  echo "$outfl"
  /cluster/bluearc/RepeatMasker/DateRepeats \
      ${outfl} -query human -comp dog -comp mouse
end
# Now extract dog (extra column 1), ignore mouse.
cd ..
mkdir linSpecRep.notInDog
foreach f (rmsk/*.out_canis-familiaris_mus-musculus)
  set base = $f:t:r:r
  echo $base.out.spec
  /cluster/bin/scripts/extractLinSpecReps 1 $f > \
      linSpecRep.notInDog/$base.out.spec
end
# Clean up.
rm rmsk/*.out_canis*
rsync -av /san/sanvol1/scratch/hg17/linSpecRep.notInDog \
    /cluster/bluearc/scratch/hg/gs.18/build35/
# Ask cluster-admin for an rsync.

# BLASTZ DOG (CANFAM1) (DONE 7/8/04 angie)
ssh kk
# space is awful tight on store4 -- use store7.
mkdir -p /cluster/store7/hg17/bed/blastz.canFam1.2004-07-08
ln -s /cluster/store7/hg17/bed/blastz.canFam1.2004-07-08 \
    /cluster/data/hg17/bed/
cd /cluster/data/hg17/bed/blastz.canFam1.2004-07-08
# Use default (Human-Mouse) settings for starters.
    cat << '_EOF_' > DEF
# human vs. dog
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/i386:/cluster/home/angie/schwartzbin

ALIGN=blastz-run
BLASTZ=blastz
# Default
BLASTZ_H=2000
BLASTZ_ABRIDGE_REPEATS=1

# TARGET: Human
SEQ1_DIR=/scratch/hg/gs.18/build35/bothMaskedNibs
SEQ1_RMSK=
SEQ1_FLAG=
SEQ1_SMSK=/scratch/hg/gs.18/build35/linSpecRep.notInDog
SEQ1_IN_CONTIGS=0
SEQ1_CHUNK=10000000
SEQ1_LAP=10000

# QUERY: Dog
SEQ2_DIR=/scratch/hg/canFam1/nib
SEQ2_RMSK=
SEQ2_FLAG=
SEQ2_SMSK=/scratch/hg/canFam1/linSpecRep.notInHuman
SEQ2_IN_CONTIGS=0
SEQ2_CHUNK=10000000
SEQ2_LAP=0

BASE=/cluster/data/hg17/bed/blastz.canFam1.2004-07-08

DEF=$BASE/DEF
RAW=$BASE/raw
CDBDIR=$BASE
SEQ1_LEN=$BASE/S1.len
SEQ2_LEN=$BASE/S2.len
'_EOF_'
    # << this line keeps emacs coloring happy

    # first cluster run: raw blastz alignments
    ssh kk
    bash	# if a csh/tcsh user
    cd /cluster/data/hg17/bed/blastz.canFam1.2004-07-08
    source DEF
    mkdir $RAW run.0
    /cluster/home/angie/hummus/make-joblist $DEF > $BASE/run.0/j
    sh ./xdir.sh
    cd run.0
    sed -e 's@^blastz-run@/cluster/bin/penn/blastz-run@' j > jobList
    para create jobList
    para try, check, push, check, ....
    # Moving the human chr19 jobs up to the top of the jobList probably
    # would have shaved 4 hours off the total time!  It was almost done
    # after 6 hours, except for a few chr19 stragglers.
#Completed: 93775 of 93775 jobs
#Average job time: 202s 3.37m 0.06h 0.00d
#Longest job: 17806s 296.77m 4.95h 0.21d
#Submission to last job: 35523s 592.05m 9.87h 0.41d

    # second cluster run: lift raw alignments -> lav dir
    ssh kki
    bash	# if a csh/tcsh user
    cd /cluster/data/hg17/bed/blastz.canFam1.2004-07-08
    source DEF
    mkdir run.1 lav
    /cluster/bin/scripts/blastz-make-out2lav $DEF $BASE > $BASE/run.1/jobList
    cd run.1
    wc -l jobList
    para create jobList
    para try, check, push, etc ...
#Completed: 341 of 341 jobs
#Average job time: 36s 0.61m 0.01h 0.00d
#Longest job: 302s 5.03m 0.08h 0.00d
#Submission to last job: 1143s 19.05m 0.32h 0.01d

    # third run: lav -> axt
    # (if non-default BLASTZ_Q is used in the future, put axtRescore in
    # the pipe after lavToAxt)
    ssh kki
    cd /cluster/data/hg17/bed/blastz.canFam1.2004-07-08
    mkdir axtChrom pslChrom run.2
    cd run.2
    cat << '_EOF_' > do.csh
#!/bin/csh -ef
cd $1
set chr = $1:t
cat `ls -1 *.lav | sort -g` \
| $HOME/bin/x86_64/lavToAxt stdin \
    /iscratch/i/gs.18/build35/bothMaskedNibs /iscratch/i/canFam1/nib stdout \
| $HOME/bin/x86_64/axtSort stdin ../../axtChrom/$chr.axt
$HOME/bin/x86_64/axtToPsl ../../axtChrom/$chr.axt ../../S1.len ../../S2.len \
    ../../pslChrom/$chr.psl
'_EOF_'
    # << this line keeps emacs coloring happy
    chmod a+x do.csh
    cp /dev/null jobList
    foreach d (../lav/chr*)
	echo "do.csh $d" >> jobList
    end
    para create jobList
    para try, check, push, check
#Completed: 46 of 46 jobs
#Average job time: 300s 5.00m 0.08h 0.00d
#Longest job: 1669s 27.82m 0.46h 0.02d
#Submission to last job: 1689s 28.15m 0.47h 0.02d

# CHAIN DOG BLASTZ (DONE 7/9/04 angie)
    # Run axtChain on little cluster
    ssh kki
    cd /cluster/data/hg17/bed/blastz.canFam1.2004-07-08
    mkdir -p axtChain/run1
    cd axtChain/run1
    mkdir out chain
    ls -1S /cluster/data/hg17/bed/blastz.canFam1.2004-07-08/axtChrom/*.axt \
	> input.lst
    cat << '_EOF_' > gsub
#LOOP
doChain {check in line+ $(path1)} {check out line+ chain/$(root1).chain} {check out exists out/$(root1).out}
#ENDLOOP
'_EOF_'
    # << this line makes emacs coloring happy
    cat << '_EOF_' > doChain
#!/bin/csh
axtChain $1 \
    /iscratch/i/gs.18/build35/bothMaskedNibs \
    /iscratch/i/canFam1/nib $2 > $3
'_EOF_'
    # << this line makes emacs coloring happy
    chmod a+x doChain
    gensub2 input.lst single gsub jobList
    para create jobList
    para try, check, push, check...
#Completed: 46 of 46 jobs
#Average job time: 266s 4.43m 0.07h 0.00d
#Longest job: 3578s 59.63m 0.99h 0.04d
#Submission to last job: 3578s 59.63m 0.99h 0.04d

    # now on the cluster server, sort chains
    ssh kksilo
    cd /cluster/data/hg17/bed/blastz.canFam1.2004-07-08/axtChain
    chainMergeSort run1/chain/*.chain > all.chain
    chainSplit chain all.chain
    rm run1/chain/*.chain
    # take a look at score distr's
    foreach f (chain/*.chain)
	grep chain $f | awk '{print $2;}' | sort -nr > /tmp/score.$f:t:r
	echo $f:t:r
	textHistogram -binSize=10000 /tmp/score.$f:t:r
	echo ""
    end
    # Lots of chaff with scores in the 3000's.  Many very-high-scoring
    # chains.  So filter the chain down somewhat...
    mv all.chain all.chain.unfiltered
    chainFilter -minScore=5000 all.chain.unfiltered > all.chain
    rm chain/*
    chainSplit chain all.chain
    gzip all.chain.unfiltered

    # Load chains into database
    ssh hgwdev
    cd /cluster/data/hg17/bed/blastz.canFam1.2004-07-08/axtChain/chain
    foreach i (*.chain)
	set c = $i:r
	hgLoadChain hg17 ${c}_chainCanFam1 $i
    end
    # Coverage is significantly higher than mouse:
    featureBits hg17 -chrom=chr1 chainCanFam1Link
    #123999291 bases of 222827847 (55.648%) in intersection
    # before filtering: 124750124 bases of 222827847 (55.985%) in intersection
    featureBits hg17 -chrom=chr1 chainMm5Link
    #83773012 bases of 222827847 (37.595%) in intersection

# NET DOG BLASTZ (DONE 7/9/04 angie)
    ssh kolossus
    cd /cluster/data/hg17/bed/blastz.canFam1.2004-07-08/axtChain
    chainPreNet all.chain ../S1.len ../S2.len stdout \
    | chainNet stdin -minSpace=1 ../S1.len ../S2.len stdout /dev/null \
    | netSyntenic stdin noClass.net
    # Add classification info using db tables:
    ssh hgwdev
    cd /cluster/data/hg17/bed/blastz.canFam1.2004-07-08/axtChain
    netClass noClass.net hg17 canFam1 dog.net
    # Make a 'syntenic' subset:
    ssh kksilo
    cd /cluster/data/hg17/bed/blastz.canFam1.2004-07-08/axtChain
    rm noClass.net
    # Make a 'syntenic' subset of these with
    netFilter -syn dog.net > dogSyn.net
    # Load the nets into database
    ssh hgwdev
    cd /cluster/data/hg17/bed/blastz.canFam1.2004-07-08/axtChain
    netFilter -minGap=10 dog.net | hgLoadNet hg17 netCanFam1 stdin
    netFilter -minGap=10 dogSyn.net | hgLoadNet hg17 syntenyNetCanFam1 stdin
    # Add entries for chainCanFam1, netCanFam1 to human/hg17 trackDb

# MAKE VSCANFAM1 DOWNLOADABLES (DONE 9/17/04 kate)
    ssh kksilo
    cd /cluster/data/hg17/bed/blastz.canFam1.2004-07-08/axtChain
    ln -s all.chain dog.chain
    mkdir gz
    cd gz
    gzip -c ../dog.chain > dog.chain.gz
    gzip -c ../dog.net > dog.net.gz
    # (-c added to the next command; without it gzip compresses in place
    # and the redirect target ends up empty)
    gzip -c ../dogSyn.net > dogSyn.net.gz
    # Angie's notes...
    # Mike Zody asked for raw blastz in chain format, so figure out some
    # way to translate axt or psl to chain and put it out there.
    # Actually, it's probably just hg16-canFam1 that he wants for now -- ?
    # Ask when we get to this point.
    cd ../axtNet
    time gzip *.axt
    ssh hgwdev
    cd /usr/local/apache/htdocs/goldenPath/hg17
    mkdir -p vsCanFam1
    cd vsCanFam1
    mv /cluster/data/hg17/bed/blastz.canFam1/axtChain/gz/*.gz .
    md5sum *.gz > md5sum.txt
    mkdir -p axtNet
    cd axtNet
    cp /cluster/data/hg17/bed/blastz.canFam1/axtNet/*.axt.gz .
    md5sum *.gz > md5sum.txt
    # Copy over & edit README.txt w/pointers to chain, net formats.

    # REDO downloads of axtNet's to fix overlaps (2005-09-13 kate)
    # Finally, replace bad chr5 files (2006-01-05 kate)
    ssh kkstore02
    cd /cluster/data/hg17/bed/blastz.canFam1/axtNet
    nice gzip *.axt
    md5sum *.axt.gz > md5sum.txt
    ssh hgwdev
    cd /usr/local/apache/htdocs/goldenPath/hg17/vsCanFam1
    mv axtNet axtNet.old
    ln -s /cluster/data/hg17/bed/blastz.canFam1/axtNet .

# GENERATE CANFAM1 MAF FOR MULTIZ FROM NET (DONE 7/9/04 angie)
    # Redo net axt's and maf's to fix overlaps (use 8/5 netToAxt)
    # (2005-08-16 kate)
    # and replace bad chr5 files (2006-01-05 kate)
    ssh kkstore02
    cd /cluster/data/hg17/bed/blastz.canFam1.2004-07-08/axtChain
    netSplit dog.net net
    cd /cluster/data/hg17/bed/blastz.canFam1.2004-07-08
    mkdir axtNet mafNet
    cat > makeMaf.csh << 'EOF'
foreach f (axtChain/net/*)
    set chr = $f:t:r
    echo $chr
    netToAxt $f axtChain/chain/$chr.chain /cluster/data/hg17/nib \
        /cluster/data/canFam1/nib stdout \
    | axtSort stdin axtNet/$chr.axt
    axtToMaf axtNet/$chr.axt \
        /cluster/data/hg17/chrom.sizes /cluster/data/canFam1/chrom.sizes \
        mafNet/$chr.maf -tPrefix=hg17. -qPrefix=canFam1.
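    # (added comments, not in the original log: netToAxt pulls out the
    # alignment blocks the net chose for this chrom, axtSort orders them
    # by target position, and axtToMaf adds the "hg17."/"canFam1."
    # sequence-name prefixes that the downstream multiz runs expect)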
end
'EOF'
    csh makeMaf.csh >&! makeMaf.log &
    mkdir -p /cluster/bluearc/hg17/mafNet
    cp -rp mafNet /cluster/bluearc/hg17/mafNet/canFam1

# BLASTZ MM5 (DONE - 2004-06-22 - Hiram)
    ssh kk
    mkdir -p /cluster/data/hg17/bed/blastz.mm5.2004-07-01
    cd /cluster/data/hg17/bed
    ln -s blastz.mm5.2004-07-01 blastz.mm5
    cd blastz.mm5
    cat << '_EOF_' > DEF
# human vs. mouse
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/home/angie/schwartzbin:/cluster/home/kent/bin/i386

ALIGN=blastz-run
BLASTZ=blastz
BLASTZ_H=2000
BLASTZ_ABRIDGE_REPEATS=1

# TARGET
# Human
SEQ1_DIR=/iscratch/i/gs.18/build35/bothMaskedNibs
# not used
SEQ1_RMSK=
# not used
SEQ1_FLAG=
SEQ1_SMSK=/iscratch/i/gs.18/build35/linSpecRep.notInRat
# notInRat OK as it is identical to notInMouse
SEQ1_IN_CONTIGS=0
SEQ1_CHUNK=10000000
SEQ1_LAP=10000

# QUERY
# Mouse
SEQ2_DIR=/scratch/mus/mm5/softNib
# RMSK not currently used
SEQ2_RMSK=/scratch/mus/mm5/rmsk
# FLAG not currently used
SEQ2_FLAG=-rodent
SEQ2_SMSK=/scratch/mus/mm5/linSpecRep.notInHuman
SEQ2_IN_CONTIGS=0
SEQ2_CHUNK=30000000
SEQ2_LAP=0

BASE=/cluster/data/hg17/bed/blastz.mm5

DEF=$BASE/DEF
RAW=$BASE/raw
CDBDIR=$BASE
SEQ1_LEN=$BASE/S1.len
SEQ2_LEN=$BASE/S2.len
'_EOF_'
    # << this line keeps emacs coloring happy

    # prepare first cluster run
    ssh kk
    cd /cluster/data/hg17/bed/blastz.mm5
    /cluster/data/hg17/jkStuff/BlastZ_run0.sh
    cd run.0
    para try, check, push, check, ....
# Completed: 44330 of 44330 jobs
# CPU time in finished jobs: 16250628s 270843.80m 4514.06h 188.09d 0.515 y
# IO & Wait Time: 387936s 6465.60m 107.76h 4.49d 0.012 y
# Average job time: 375s 6.26m 0.10h 0.00d
# Longest job: 4417s 73.62m 1.23h 0.05d
# Submission to last job: 43754s 729.23m 12.15h 0.51d

    # Second cluster run to convert the .out's to .lav's.  You do NOT
    # want to run this on the big cluster -- it brings the file server
    # to its knees.  Run this on the small cluster.
    ssh kki
    cd /cluster/data/hg17/bed/blastz.mm5
    /cluster/data/hg17/jkStuff/BlastZ_run1.sh
    cd run.1
    para try, check, push, etc ...
# Completed: 341 of 341 jobs
# CPU time in finished jobs: 2189s 36.48m 0.61h 0.03d 0.000 y
# IO & Wait Time: 7714s 128.57m 2.14h 0.09d 0.000 y
# Average job time: 29s 0.48m 0.01h 0.00d
# Longest job: 165s 2.75m 0.05h 0.00d
# Submission to last job: 830s 13.83m 0.23h 0.01d

    # Third cluster run to convert lav's to axt's.
    # Does not work on kki since /scratch on the iservers is not the
    # same as /scratch on the other clusters.
    ssh kk
    cd /cluster/data/hg17/bed/blastz.mm5
    /cluster/data/hg17/jkStuff/BlastZ_run2.sh
    cd run.2
    para try, check, push, etc ...
# Completed: 45 of 46 jobs
# Crashed: 1 jobs
# CPU time in finished jobs: 1638s 27.30m 0.46h 0.02d 0.000 y
# IO & Wait Time: 12068s 201.13m 3.35h 0.14d 0.000 y
# Average job time: 305s 5.08m 0.08h 0.00d
# Longest job: 1124s 18.73m 0.31h 0.01d
# Submission to last job: 2519s 41.98m 0.70h 0.03d
    # chr19 takes too long; the axtSort becomes too large and the poor
    # node ends up swapping forever.  When you are down to that last
    # job running, stop it and go to kolossus.
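    # (Added sketch, an assumption mirroring the chr19 recovery used for
    # blastzSelf later in this doc -- stop the batch and recover the one
    # unfinished job into its own list before running it by hand on
    # kolossus as shown next:)
    para stop
    para recover jobList chr19JobList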
    # Adjusting the location of the nib directories, and fixing the
    # MACHTYPE on the commands in the blastz script:
    ssh kolossus
    cd /cluster/data/hg17/bed/blastz.mm5
    sed -e "s/i386/x86_64/g" /cluster/bin/scripts/blastz-chromlav2axt > \
	x86_64-chromlav2axt
    chmod +x x86_64-chromlav2axt
    time ./x86_64-chromlav2axt \
	/cluster/data/hg17/bed/blastz.mm5/lav/chr19 \
	/cluster/data/hg17/bed/blastz.mm5/axtChrom/chr19.axt \
	/cluster/bluearc/scratch/hg/gs.18/build35/bothMaskedNibs \
	/cluster/bluearc/scratch/mus/mm5/softNib
    # real 7m41.719s
    # user 2m2.850s
    # sys 0m23.070s

    # translate sorted axt files into psl
    ssh eieio
    cd /cluster/data/hg17/bed/blastz.mm5
    mkdir -p pslChrom
    set tbl = "blastzMm5"
    foreach f (axtChrom/chr*.axt)
	set c=$f:t:r
	echo "Processing chr $c"
	/cluster/bin/i386/axtToPsl $f S1.len S2.len pslChrom/${c}_${tbl}.psl
    end
    # This takes more than an hour.  You can shorten it by changing the
    # axtToPsl command above to a simple echo, putting the results into
    # a file, splitting that file into four parts, and running the four
    # parts as shell scripts on eieio so that four processes run at the
    # same time.  Load on eieio gets up to about 20, which is reasonable.

    # Load database tables
    ssh hgwdev
    cd /cluster/data/hg17/bed/blastz.mm5/pslChrom
    bash	# for tcsh users
    for F in chr*_blastzMm5.psl
    do
	/cluster/bin/i386/hgLoadPsl -noTNameIx hg17 ${F}
	echo "${F} done"
    done
    # this is a 40 minute job
    # exit bash if you are tcsh
    # featureBits on blastzMm3 or 4 will not work on hgwdev -- it runs
    # out of memory.  But if you reset your ~/.hg.conf to use the
    # read-only user and contact the hgwdev db host, you can use the
    # x86_64 featureBits:
    # featureBits hg16 blastzMm5
    # 1056761609 bases of 2865248791 (36.882%) in intersection
    # featureBits hg17 blastzMm5
    # 1052077141 bases of 2866216770 (36.706%) in intersection
    # featureBits hg17 blastzMm4
    # 1056201417 bases of 2866216770 (36.850%) in intersection

# CHAIN MM5 BLASTZ (DONE - 2004-07-02 - Hiram)
    # The axtChain is best run on the small kluster, or the kk9 kluster
    ssh kki
    mkdir -p /cluster/data/hg17/bed/blastz.mm5/axtChain/run1
    cd /cluster/data/hg17/bed/blastz.mm5/axtChain/run1
    mkdir out chain
    ls -1S /cluster/data/hg17/bed/blastz.mm5/axtChrom/*.axt > input.lst
    cat << '_EOF_' > gsub
#LOOP
doChain {check in exists $(path1)} {check out line+ chain/$(root1).chain} out/$(root1).out
#ENDLOOP
'_EOF_'
    # << this line makes emacs coloring happy
    # May need -minScore=5000 for all chroms if chr19 won't finish on kolossus
    cat << '_EOF_' > doChain
#!/bin/csh
axtChain $1 /iscratch/i/gs.18/build35/bothMaskedNibs \
    /iscratch/i/mus/mm5/softNib $2 > $3
'_EOF_'
    # << this line makes emacs coloring happy
    chmod a+x doChain
    # 46 jobs
    gensub2 input.lst single gsub jobList
    para create jobList
    para try
    para push
    # ... etc ...
# Completed: 46 of 46 jobs
# CPU time in finished jobs: 4856s 80.94m 1.35h 0.06d 0.000 y
# IO & Wait Time: 20083s 334.71m 5.58h 0.23d 0.001 y
# Average job time: 542s 9.04m 0.15h 0.01d
# Longest job: 2929s 48.82m 0.81h 0.03d
# Submission to last job: 2929s 48.82m 0.81h 0.03d

    # now on the file server, sort chains
    ssh eieio
    cd /cluster/data/hg17/bed/blastz.mm5/axtChain
    time chainMergeSort run1/chain/*.chain > all.chain
    # real 8m42.853s
    # user 5m59.100s
    # sys 0m40.320s
    time chainSplit chain all.chain
    # real 10m52.224s
    # user 5m52.360s
    # sys 0m34.870s
    # optionally: rm run1/chain/*.chain

    # Load chains into database
    # next machine
    ssh hgwdev
    cd /cluster/data/hg17/bed/blastz.mm5/axtChain/chain
    bash	# for tcsh users
    for i in *.chain
    do
	c=${i/.chain/}
	hgLoadChain hg17 ${c}_chainMm5 $i
	echo done $c
    done
    # exit bash if you are tcsh
    # This is a 50 minute job
    # featureBits hg17 chainMm5
    # 2834490112 bases of 2866216770 (98.893%) in intersection
    # featureBits hg17 chainMm4
    # 2829135227 bases of 2866216770 (98.706%) in intersection
    # featureBits hg16 chainMm4
    # 2828363353 bases of 2865248791 (98.713%) in intersection

# NET MM5 (DONE - 2004-07-02 - Hiram)
    ssh eieio
    cd /cluster/data/hg17/bed/blastz.mm5/axtChain
    mkdir preNet
    cd chain
    bash	# for tcsh users
    for i in *.chain
    do
	echo preNetting $i
	/cluster/bin/i386/chainPreNet $i /cluster/data/hg17/chrom.sizes \
	    /cluster/data/mm5/chrom.sizes ../preNet/$i
    done
    # exit bash if you are tcsh
    # 15 minute job
    cd ..
    mkdir n1
    cd preNet
    bash	# for tcsh users
    for i in *.chain
    do
	n=${i/.chain/}.net
	echo primary netting $i $n
	/cluster/bin/i386/chainNet $i -minSpace=1 /cluster/data/hg17/chrom.sizes \
	    /cluster/data/mm5/chrom.sizes ../n1/$n /dev/null
    done
    # exit bash if you are tcsh
    # 9 minute job
    cd ..
    cat n1/*.net | /cluster/bin/i386/netSyntenic stdin hNoClass.net
    # memory usage 2546110464, utime 16327 s/100, stime 3546
    ssh hgwdev
    cd /cluster/data/hg17/bed/blastz.mm5/axtChain
    time netClass hNoClass.net hg17 mm5 mouse.net \
	-tNewR=/cluster/bluearc/scratch/hg/gs.18/build35/linSpecRep.notInMouse \
	-qNewR=/cluster/bluearc/scratch/mus/mm5/linSpecRep.notInHuman
    # real 16m38.098s
    # user 11m38.490s
    # sys 1m48.470s
    # If things look good do
    ssh eieio
    cd /cluster/data/hg17/bed/blastz.mm5/axtChain
    rm -r n1 hNoClass.net
    # Make a 'syntenic' subset of these with
    time netFilter -syn mouse.net > mouseSyn.net
    # real 12m3.701s
    # user 8m44.180s
    # sys 1m1.610s
    # Load the nets into database
    ssh hgwdev
    cd /cluster/data/hg17/bed/blastz.mm5/axtChain
    netFilter -minGap=10 mouse.net | hgLoadNet hg17 netMm5 stdin
    netFilter -minGap=10 mouseSyn.net | hgLoadNet hg17 syntenyNetMm5 stdin
    # check results
    # featureBits hg17 netMm5
    # 2830625630 bases of 2866216770 (98.758%) in intersection
    # featureBits hg17 netMm4
    # 2824272033 bases of 2866216770 (98.537%) in intersection
    # featureBits hg16 netMm5
    # 2823565051 bases of 2865248791 (98.545%) in intersection
    # featureBits hg17 syntenyNetMm5
    # 2799194300 bases of 2866216770 (97.662%) in intersection
    # featureBits hg17 syntenyNetMm4
    # 2785830955 bases of 2866216770 (97.195%) in intersection
    # featureBits hg16 syntenyNetMm5
    # 2786960572 bases of 2865248791 (97.268%) in intersection
    # Add entries for net and chain to mouse/hg17 trackDb

    # make net
    ssh eieio
    cd /cluster/data/hg17/bed/blastz.mm5/axtChain
    mkdir mouseNet
    time netSplit mouse.net mouseNet
    # real 11m45.243s
    # user 8m48.490s
    # sys 1m13.490s

    # extract axt's from net, and convert to maf's
    # NOTE: Redo the net axt's and maf's using 8/05 netToAxt
    # in order to remove overlaps (2005-08-16 kate)
    ssh kkstore02
    cd /cluster/data/hg17/bed/blastz.mm5/axtChain
    mkdir ../axtNet ../mafNet
    cat > makeMaf.csh << '_EOF_'
#!/bin/csh -ef
foreach f (mouseNet/chr*.net)
    set c = $f:t:r
    echo "netToAxt: $c.net -> $c.axt"
    rm -f ../axtNet/$c.axt
    netToAxt mouseNet/$c.net chain/$c.chain \
        /cluster/data/hg17/nib /cluster/data/mm5/nib stdout | \
        axtSort stdin ../axtNet/$c.axt
    axtToMaf ../axtNet/$c.axt \
        /cluster/data/hg17/chrom.sizes /cluster/data/mm5/chrom.sizes \
        ../mafNet/$c.maf -tPrefix=hg17. -qPrefix=mm5.
    echo "Complete: $c.net -> axtNet/$c.axt -> mafNet/$c.maf"
end
'_EOF_'
    # << for emacs
    csh makeMaf.csh >&! makeMaf.log &
    tail -100f makeMaf.log
    mkdir -p /cluster/bluearc/hg17/mafNet
    cp -rp ../mafNet /cluster/bluearc/hg17/mafNet/mm5

    ssh hgwdev
    mkdir /cluster/data/hg17/bed/blastz.mm5/axtBest
    cd /cluster/data/hg17/bed/blastz.mm5/axtBest
    ln -s ../axtNet/chr*.axt .

    # copy net axt's to download area
    ssh hgwdev
    cd /cluster/data/hg17/bed/blastz.mm5/axtNet
    mkdir -p /usr/local/apache/htdocs/goldenPath/hg17/vsMm5/axtNet
    cp -p *.axt /usr/local/apache/htdocs/goldenPath/hg17/vsMm5/axtNet
    cd /usr/local/apache/htdocs/goldenPath/hg17/vsMm5/axtNet
    gzip *.axt
    # add README.txt file to dir (use previous assembly's copy as template)
    # 32 minute gzip

    # Convert those axt files to psl
    ssh eieio
    cd /cluster/data/hg17/bed/blastz.mm5
    mkdir pslBest
    foreach a (axtBest/chr*.axt)
	set c=$a:t:r
	echo -n "processing $c.axt -> ${c}_blastzBestMm5.psl ..."
	/cluster/bin/i386/axtToPsl axtBest/${c}.axt \
	    S1.len S2.len pslBest/${c}_blastzBestMm5.psl
	echo "Done"
    end

    # Load tables
    ssh hgwdev
    cd /cluster/data/hg17/bed/blastz.mm5/pslBest
    for I in chr*BestMm5.psl
    do
	/cluster/bin/i386/hgLoadPsl -noTNameIx hg17 ${I}
	echo "done ${I}"
    done
    # check results
    # featureBits hg17 blastzBestMm5
    # 1013348528 bases of 2866216770 (35.355%) in intersection
    # featureBits hg17 blastzBestMm4
    # 1017319919 bases of 2866216770 (35.493%) in intersection
    # featureBits hg16 blastzBestMm5
    # 996722004 bases of 2865248791 (34.787%) in intersection

    # Make /gbdb links and add them to the axtInfo table:
    mkdir -p /gbdb/hg17/axtBest/Mm5
    cd /gbdb/hg17/axtBest/Mm5
    ln -s /cluster/data/hg17/bed/blastz.mm5/axtNet/chr*.axt .
    cd /cluster/data/hg17/bed/blastz.mm5/axtNet
    rm -f axtInfoInserts.sql
    foreach f (/gbdb/hg17/axtBest/Mm5/chr*.axt)
	set chr=$f:t:r
	echo "INSERT INTO axtInfo (species, alignment, chrom, fileName) \
	    VALUES ('mm5','Blastz Best in Genome','$chr','$f');" \
	    >>! axtInfoInserts.sql
    end
    hgsql hg17 < ~/kent/src/hg/lib/axtInfo.sql
    # table axtInfo may already exist; ignore the create error.
    hgsql hg17 < axtInfoInserts.sql

# REDO: replace downloadable axtNet's to remove overlaps (2005-09-12 kate)
    ssh hgwdev
    cd /usr/local/apache/htdocs/goldenPath/hg17/vsMm5
    mv axtNet axtNet.old
    mkdir axtNet
    cd axtNet
    cp /cluster/data/hg17/bed/blastz.mm5/axtNet/*.axt .
    nice gzip *.axt
    md5sum *.axt.gz > md5sum.txt

# HG17 TO MM5 LIFTOVER CHAIN (DONE 1/6/05 Andy)
    ssh kolossus
    cd /cluster/data/hg17/bed/blastz.mm5/axtChain
    mkdir over
    for file in chain/*.chain; do
	chrom=`basename $file .chain`
	netChainSubset mouseNet/$chrom.net chain/$chrom.chain over/$chrom.over
	cat over/$chrom.over >> /cluster/data/hg17/bed/liftOver/hg17ToMm5.chain
    done
    rm -rf over/
    ssh hgwdev
    mkdir /usr/local/apache/htdocs/goldenPath/hg17/liftOver
    cd /usr/local/apache/htdocs/goldenPath/hg17/liftOver
    cp /cluster/data/hg17/bed/liftOver/hg17ToMm5.chain .
    gzip hg17ToMm5.chain
    ln -s /cluster/data/hg17/bed/liftOver/hg17ToMm5.chain \
	/gbdb/hg17/liftOver/hg17ToMm5.over.chain
    hgAddLiftOverChain -multiple hg17 mm5

# HG17 TO CANFAM1 LIFTOVER CHAIN (DONE 1/7/05 Andy)
    ssh kolossus
    cd /cluster/data/hg17/bed/blastz.canFam1/axtChain
    mkdir over
    for file in chain/*.chain; do
	chrom=`basename $file .chain`
	netChainSubset net/$chrom.net chain/$chrom.chain over/$chrom.over
	cat over/$chrom.over >> /cluster/data/hg17/bed/liftOver/hg17ToCanFam1.chain
    done
    rm -rf over/
    ssh hgwdev
    cd /usr/local/apache/htdocs/goldenPath/hg17/liftOver
    cp /cluster/data/hg17/bed/liftOver/hg17ToCanFam1.chain .
    gzip hg17ToCanFam1.chain
    ln -s /cluster/data/hg17/bed/liftOver/hg17ToCanFam1.chain \
	/gbdb/hg17/liftOver/hg17ToCanFam1.over.chain
    hgAddLiftOverChain -multiple hg17 canFam1

# HG17 TO PANTRO1 LIFTOVER CHAIN (DONE 1/20/05 Andy)
    ssh kolossus
    cd /cluster/data/hg17/bed/blastz.panTro1/axtChain
    mkdir over
    for file in chain/*.chain; do
	chrom=`basename $file .chain`
	netChainSubset net/$chrom.net chain/$chrom.chain over/$chrom.over
	cat over/$chrom.over >> /cluster/data/hg17/bed/liftOver/hg17ToPanTro1.chain
    done
    rm -rf over/
    ssh hgwdev
    cd /usr/local/apache/htdocs/goldenPath/hg17/liftOver
    cp /cluster/data/hg17/bed/liftOver/hg17ToPanTro1.chain .
    gzip hg17ToPanTro1.chain
    ln -s /cluster/data/hg17/bed/liftOver/hg17ToPanTro1.chain \
	/gbdb/hg17/liftOver/hg17ToPanTro1.over.chain
    hgAddLiftOverChain -multiple hg17 panTro1

# HG17 TO RN3 LIFTOVER CHAIN (DONE 3/1/05 Andy)
    #ssh kolossus
    #cd /cluster/data/hg17/bed/blastz.rn3/axtChain
    #mkdir over
    #for file in chain/*.chain; do
    #    chrom=`basename $file .chain`
    #    netChainSubset ratNet/$chrom.net chain/$chrom.chain over/$chrom.over
    #    cat over/$chrom.over >> /cluster/data/hg17/bed/liftOver/hg17ToRn3.chain
    #done
    #rm -rf over/
    # Oh fancy that, there's already a hg17ToRn3.over.chain in the
    # /cluster/data/hg17/bed/liftOver directory, generated by Angie.
    ssh hgwdev
    cd /usr/local/apache/htdocs/goldenPath/hg17/liftOver
    cp /cluster/data/hg17/bed/liftOver/hg17ToRn3.over.chain .
    gzip hg17ToRn3.over.chain
    ln -s /cluster/data/hg17/bed/liftOver/hg17ToRn3.over.chain \
	/gbdb/hg17/liftOver/hg17ToRn3.over.chain
    hgAddLiftOverChain -multiple hg17 rn3

# HG17 TO GALGAL2 LIFTOVER CHAIN (DONE 3/1/05 Andy)
    # OK, there's already a /cluster/data/hg17/bed/liftOver/hg17ToGalGal2.over.chain
    # file, generated by Angie.
    ssh hgwdev
    cd /usr/local/apache/htdocs/goldenPath/hg17/liftOver
    cp /cluster/data/hg17/bed/liftOver/hg17ToGalGal2.over.chain .
    gzip hg17ToGalGal2.over.chain
    ln -s /cluster/data/hg17/bed/liftOver/hg17ToGalGal2.over.chain \
	/gbdb/hg17/liftOver/hg17ToGalGal2.over.chain
    hgAddLiftOverChain -multiple hg17 galGal2

# HG17 TO MONDOM1 LIFTOVER CHAIN (DONE 3/1/05 Andy)
    ssh kksilo
    cd /cluster/data/monDom1/bed/zb.hg17/axtChain
    netSplit human.net.gz net
    ssh kolossus
    cd /cluster/data/monDom1/bed/zb.hg17/axtChain
    mkdir over
    for file in chain/*.chain.gz; do
	chrom=`basename $file .chain.gz`
	netChainSubset net/$chrom.net chain/$chrom.chain.gz over/$chrom.over
	cat over/$chrom.over >> /cluster/data/hg17/bed/liftOver/hg17ToMonDom1.chain
    done
    rm -rf over/ net/
    ssh hgwdev
    cd /usr/local/apache/htdocs/goldenPath/hg17/liftOver
    cp /cluster/data/hg17/bed/liftOver/hg17ToMonDom1.chain .
    gzip hg17ToMonDom1.chain
    ln -s /cluster/data/hg17/bed/liftOver/hg17ToMonDom1.chain \
	/gbdb/hg17/liftOver/hg17ToMonDom1.over.chain
    hgAddLiftOverChain -multiple hg17 monDom1

# HG17 TO DANRER2 LIFTOVER CHAIN (DONE 3/2/05 Andy)
    ssh kolossus
    cd /cluster/data/hg17/bed/blastz.danRer2/axtChain
    chainSplit chain all.chain.gz
    netSplit zfishdanRer2.net.gz net
    mkdir over
    # FAILED STEPS:
    #for file in chain/*.chain; do
    #    chrom=`basename $file .chain`
    #    netChainSubset net/$chrom.net chain/$chrom.chain over/$chrom.over
    #    cat over/$chrom.over >> /cluster/data/hg17/bed/liftOver/hg17ToDanRer2.chain
    #done
    # Error:
    #read 28019 of 28019 chains in chain/chr1.chain
    #Processing chr1
    #netChainSubset: netChainSubset.c:55: writeChainPart: Assertion `subChain != ((void *)0)' failed.
    # OK, instead of using the chains in the chain/ subdir, I'm using
    # the ones in the chainAR/ subdir.  Those chain files had an
    # additional step in the process of making them: Rachel ran the
    # chainAntiRepeat program on them.
    for file in chain/*.chain; do
	chrom=`basename $file .chain`
	if [ $chrom = "chr1" ]; then
	    netChainSubset net/$chrom.net chainAR/$chrom.chain over/$chrom.over
	else
	    netChainSubset net/$chrom.net chainAR/$chrom.chain.gz over/$chrom.over
	fi
	cat over/$chrom.over >> /cluster/data/hg17/bed/liftOver/hg17ToDanRer2.chain
    done
    rm -rf over/
    ssh hgwdev
    cd /usr/local/apache/htdocs/goldenPath/hg17/liftOver
    cp /cluster/data/hg17/bed/liftOver/hg17ToDanRer2.chain .
    gzip hg17ToDanRer2.chain
    ln -s /cluster/data/hg17/bed/liftOver/hg17ToDanRer2.chain \
	/gbdb/hg17/liftOver/hg17ToDanRer2.over.chain
    hgAddLiftOverChain -multiple hg17 danRer2

# HG17 TO TETNIG1 LIFTOVER CHAIN (DONE 3/1/05 Andy)
    ssh kolossus
    cd /cluster/data/hg17/bed/blastz.tetNig1/axtChain
    mkdir over
    for file in chain/*.chain; do
	chrom=`basename $file .chain`
	netChainSubset tetraodonNet/$chrom.net chain/$chrom.chain over/$chrom.over
	cat over/$chrom.over >> /cluster/data/hg17/bed/liftOver/hg17ToTetNig1.chain
    done
    ssh hgwdev
    cd /usr/local/apache/htdocs/goldenPath/hg17/liftOver
    cp /cluster/data/hg17/bed/liftOver/hg17ToTetNig1.chain .
    gzip hg17ToTetNig1.chain
    ln -s /cluster/data/hg17/bed/liftOver/hg17ToTetNig1.chain \
	/gbdb/hg17/liftOver/hg17ToTetNig1.over.chain
    hgAddLiftOverChain -multiple hg17 tetNig1

# HG17 TO BOSTAU1 LIFTOVER CHAIN (DONE Mar. 18, 2005, Heather)
    ssh kolossus
    cd /cluster/data/bosTau1/bed/zb.hg17/axtChain
    mkdir over
    for file in chain/*.chain; do
	chrom=`basename $file .chain`
	netChainSubset net/$chrom.net chain/$chrom.chain over/$chrom.over
	cat over/$chrom.over >> /cluster/data/hg17/bed/liftOver/hg17ToBosTau1.chain
    done
    ssh hgwdev
    cd /usr/local/apache/htdocs/goldenPath/hg17/liftOver
    cp /cluster/data/hg17/bed/liftOver/hg17ToBosTau1.chain .
    gzip hg17ToBosTau1.chain
    ln -s /cluster/data/hg17/bed/liftOver/hg17ToBosTau1.chain \
	/gbdb/hg17/liftOver/hg17ToBosTau1.over.chain
    hgAddLiftOverChain -multiple hg17 bosTau1

# HG17 TO XENTRO1 LIFTOVER CHAIN (DONE 7/5/05 Andy)
    ssh kolossus
    cd /cluster/data/xenTro1/bed/zb.hg17/axtChain
    mkdir chain net over
    chainSplit chain all.chain
    netSplit human.net net
    for file in chain/*.chain; do
	chrom=`basename $file .chain`
	netChainSubset net/$chrom.net chain/$chrom.chain over/$chrom.over
	cat over/$chrom.over >> /cluster/data/hg17/bed/liftOver/hg17ToXenTro1.chain
    done
    rm -rf over/ chain/ net/
    ssh hgwdev
    cd /usr/local/apache/htdocs/goldenPath/hg17/liftOver
    cp /cluster/data/hg17/bed/liftOver/hg17ToXenTro1.chain .
    gzip hg17ToXenTro1.chain
    ln -s /cluster/data/hg17/bed/liftOver/hg17ToXenTro1.chain \
	/gbdb/hg17/liftOver/hg17ToXenTro1.over.chain
    hgAddLiftOverChain -multiple hg17 xenTro1

# ADD CHAIN AND NET TO VSMM5 AND VSRN3 DOWNLOAD AREAS (DONE 8/5/04 angie)
    ssh hgwdev
    cp -p /cluster/data/hg17/bed/blastz.mm5/axtChain/all.chain.gz \
	/usr/local/apache/htdocs/goldenPath/hg17/vsMm5/mouse.chain.gz
    cp -p /cluster/data/hg17/bed/blastz.mm5/axtChain/mouse.net.gz \
	/usr/local/apache/htdocs/goldenPath/hg17/vsMm5/mouse.net.gz
    cd /usr/local/apache/htdocs/goldenPath/hg17/vsMm5
    md5sum *.gz */*.gz > md5sum.txt
    # Update the README.txt
    cp -p /cluster/data/hg17/bed/blastz.rn3/axtChain/all.chain.gz \
	/usr/local/apache/htdocs/goldenPath/hg17/vsRn3/rat.chain.gz
    cp -p /cluster/data/hg17/bed/blastz.rn3/axtChain/rat.net.gz \
	/usr/local/apache/htdocs/goldenPath/hg17/vsRn3/rat.net.gz
    cd /usr/local/apache/htdocs/goldenPath/hg17/vsRn3
    md5sum *.gz */*.gz > md5sum.txt
    # Update the README.txt

# ADD CHAIN AND NET TO VSHG17 DOWNLOAD AREAS (DONE Sept. 8th, 2004, heather)
    ssh hgwdev
    cp -p /cluster/data/mm5/bed/blastz.hg17/axtChain/all.chain.gz \
	/usr/local/apache/htdocs/goldenPath/mm5/vsHg17/human.chain.gz
    cp -p /cluster/data/mm5/bed/blastz.hg17/axtChain/human.net.gz \
	/usr/local/apache/htdocs/goldenPath/mm5/vsHg17/human.net.gz
    # (directory corrected here; the log said vsRn3, surely a copy-paste slip)
    cd /usr/local/apache/htdocs/goldenPath/mm5/vsHg17
    md5sum *.gz */*.gz > md5sum.txt
    # Update the README.txt

# SWAP BLASTZ ZEBRAFISH-HUMAN (danRer1-hg17) to HUMAN-ZEBRAFISH (hg17-danRer1)
# USE RESCORED ALIGNMENTS (see makeDanRer1.doc)
# (DONE, 2004-06-22, hartera)
# CONVERT AXTs TO PSL AND LOAD INTO DATABASE (DONE, 2004-07-08, hartera)
    ssh kolossus
    mkdir /cluster/data/hg17/bed/blastz.danRer1.swap
    cd /cluster/data/hg17/bed/blastz.danRer1.swap
    # use rescored axtChrom from blastzHg17 on danRer1
    set aliDir = /cluster/data/danRer1/bed/blastz.hg17
    cp $aliDir/S1.len S2.len
    cp $aliDir/S2.len S1.len
    mkdir unsorted axtChrom
    cat $aliDir/axtChrom/chr*.axt \
    | axtSwap stdin $aliDir/S1.len $aliDir/S2.len stdout \
    | axtSplitByTarget stdin unsorted
    # Sort the shuffled .axt files.
    foreach f (unsorted/*.axt)
	echo sorting $f:t:r
	axtSort $f axtChrom/$f:t
    end
    du -sh $aliDir/axtChrom unsorted axtChrom
    # 19G	/cluster/data/danRer1/bed/blastz.hg17/axtChrom
    # 19G	unsorted
    rm -r unsorted

    # translate sorted axt files into psl
    ssh kolossus
    cd /cluster/data/hg17/bed/blastz.danRer1.swap
    mkdir -p pslChrom
    set tbl = "blastzDanRer1"
    foreach f (axtChrom/chr*.axt)
	set c=$f:t:r
	echo "Processing chr $c"
	/cluster/bin/i386/axtToPsl $f S1.len S2.len pslChrom/${c}_${tbl}.psl
    end

    # Load database tables
    ssh hgwdev
    cd /cluster/data/hg17/bed/blastz.danRer1.swap/pslChrom
    foreach f (./*.psl)
	/cluster/bin/i386/hgLoadPsl hg17 $f
	echo "$f Done"
    end

# CHAIN ZEBRAFISH (danRer1) BLASTZ (DONE, 2004-06-23, hartera)
    # Run axtChain on little cluster
    ssh kki
    cd /cluster/data/hg17/bed/blastz.danRer1.swap
    mkdir -p axtChain/run1
    cd axtChain/run1
    mkdir out chain
    ls -1S /cluster/data/hg17/bed/blastz.danRer1.swap/axtChrom/*.axt \
	> input.lst
    cat << '_EOF_' > gsub
#LOOP
doChain {check in exists $(path1)} {check out line+ chain/$(root1).chain} {check out line+ out/$(root1).out}
#ENDLOOP
'_EOF_'
    # << this line makes emacs coloring happy
    # Reuse gap penalties from the hg16 vs chicken run.
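    # (Added note, an assumption not spelled out in the original log:
    # the file below is an axtChain -linearGap table.  The "position"
    # row lists gap lengths, and the qGap/tGap/bothGap rows give the
    # penalty charged for a gap of that length in the query, the target,
    # or both, with values interpolated between columns.)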
    cat << '_EOF_' > ../../chickenHumanTuned.gap
tablesize	11
smallSize	111
position	1	2	3	11	111	2111	12111	32111	72111	152111	252111
qGap	325	360	400	450	600	1100	3600	7600	15600	31600	56600
tGap	325	360	400	450	600	1100	3600	7600	15600	31600	56600
bothGap	625	660	700	750	900	1400	4000	8000	16000	32000	57000
'_EOF_'
    # << this line makes emacs coloring happy
    # (fields in the gap file are tab-separated)
    cat << '_EOF_' > doChain
#!/bin/csh
axtFilter $1 \
| axtChain -scoreScheme=/cluster/data/blastz/HoxD55.q \
    -linearGap=../../chickenHumanTuned.gap \
    -minScore=5000 stdin \
    /iscratch/i/gs.18/build35/bothMaskedNibs \
    /iscratch/i/danRer1/nib $2 > $3
'_EOF_'
    chmod a+x doChain
    gensub2 input.lst single gsub jobList
    para create jobList
    para try, check, push, check...
    # para time
    # Completed: 45 of 46 jobs
    # Crashed: 1 jobs
    # CPU time in finished jobs: 3559s 59.32m 0.99h 0.04d 0.000 y
    # IO & Wait Time: 934s 15.56m 0.26h 0.01d 0.000 y
    # Average job time: 100s 1.66m 0.03h 0.00d
    # Longest job: 502s 8.37m 0.14h 0.01d
    # Submission to last job: 2969s 49.48m 0.82h 0.03d
    # chr19.axt crashed -- out of memory, so try again on kolossus
    ssh kolossus
    cd /cluster/data/hg17/bed/blastz.danRer1.swap/axtChain/run1
    # need to use nibs on bluearc as iscratch is not accessible from kolossus
    cat << '_EOF_' > doChain2
#!/bin/csh
axtFilter $1 \
| axtChain -scoreScheme=/cluster/data/blastz/HoxD55.q \
    -linearGap=../../chickenHumanTuned.gap \
    -minScore=5000 stdin \
    /cluster/bluearc/hg17/bothMaskedNibs \
    /cluster/bluearc/danRer1/nib $2 >& $3
'_EOF_'
    chmod +x doChain2
    doChain2 \
	/cluster/data/hg17/bed/blastz.danRer1.swap/axtChrom/chr19.axt \
	chain/chr19.chain out/chr19.out

    # now on the cluster server, sort chains
    ssh kksilo
    cd /cluster/data/hg17/bed/blastz.danRer1.swap/axtChain
    chainMergeSort run1/chain/*.chain > all.chain
    chainSplit chain all.chain

    # Load chains into database
    # next machine
    ssh hgwdev
    cd /cluster/data/hg17/bed/blastz.danRer1.swap/axtChain/chain
    foreach i (*.chain)
	set c = $i:r
	hgLoadChain hg17 ${c}_chainDanRer1 $i
	echo done $c
    end
    # Tried minScore=1000 and minScore=10000 for axtChain;
    # minScore=5000 was best for reducing low-scoring chains without
    # reducing overlap with refGene CDS too much.

# NET ZEBRAFISH (danRer1) BLASTZ (DONE, 2004-06-24, hartera)
# REMAKE NET WITHOUT ANCIENT REPEATS (DONE, 2004-07-07, hartera)
    ssh kksilo
    cd /cluster/data/hg17/bed/blastz.danRer1.swap/axtChain
    mkdir preNet
    cd chain
    foreach i (*.chain)
	echo preNetting $i
	/cluster/bin/i386/chainPreNet $i ../../S1.len ../../S2.len \
	    ../preNet/$i
    end
    cd ..
    mkdir n1
    cd preNet
    foreach i (*.chain)
	set n = $i:r.net
	echo primary netting $i
	/cluster/bin/i386/chainNet $i -minSpace=1 ../../S1.len ../../S2.len \
	    ../n1/$n /dev/null
    end
    cd ..
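    # (Added sketch by the editor, an assumption: the per-chrom
    # preNet/n1 loops above, plus the netSyntenic step just below,
    # amount to the same recipe the dog net used as a single pipeline
    # on the merged chain, run from the axtChain directory:
    # chainPreNet all.chain ../S1.len ../S2.len stdout \
    #     | chainNet stdin -minSpace=1 ../S1.len ../S2.len stdout /dev/null \
    #     | netSyntenic stdin noClass.net )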
    cat n1/*.net | /cluster/bin/i386/netSyntenic stdin noClass.net
    # memory usage 149086208, utime 868 s/100, stime 173
    # Add classification info using db tables:
    cd /cluster/data/hg17/bed/blastz.danRer1.swap/axtChain
    # netClass looks for ancient repeats in one of the databases.
    # hg17 has this table (hand-curated by Arian), but it applies only
    # to human-rodent comparisons, so use the -noAr option here.
    mkdir -p /cluster/bluearc/danRer1/linSpecRep.notInHuman
    mkdir -p /cluster/bluearc/hg17/linSpecRep.notInZebrafish
    cp /iscratch/i/gs.18/build35/linSpecRep.notInZebrafish/* \
	/cluster/bluearc/hg17/linSpecRep.notInZebrafish
    cp /iscratch/i/danRer1/linSpecRep.notInHuman/* \
	/cluster/bluearc/danRer1/linSpecRep.notInHuman
    ssh hgwdev
    cd /cluster/data/hg17/bed/blastz.danRer1.swap/axtChain
    # add -noAr option
    # mkdir old
    # mv zebrafish.net ./old/zebrafish.net.old
    time netClass noClass.net hg17 danRer1 zebrafish.net \
	-tNewR=/cluster/bluearc/hg17/linSpecRep.notInZebrafish \
	-qNewR=/cluster/bluearc/danRer1/linSpecRep.notInHuman -noAr
    # 83.410u 43.650s 3:09.94 66.8% 0+0k 0+0io 198pf+0w
    ssh hgwdev
    cd /cluster/data/hg17/bed/blastz.danRer1.swap/axtChain
    netFilter -minGap=10 zebrafish.net | hgLoadNet hg17 netDanRer1 stdin

# EXTRACT AXT'S AND MAF'S FROM ZEBRAFISH (danRer1) NET
# (DONE, 2004-06-24, hartera) used net where hg17 ancient repeat table was used
    # sorted axts and remade mafs as multiz needs axts to be sorted
    # (DONE, 2004-06-25, kate)
    # Redone to fix overlaps using 8/05 netToAxt (2005-08-16 kate)
    # Replace bad chr5 axtNet and mafNet (2006-01-05 kate)
    ssh eieio
    # create axts
    cd /cluster/data/hg17/bed/blastz.danRer1.swap/axtChain
    netSplit zebrafish.net zebrafishNet
    mkdir -p ../axtNet ../mafNet
    cat > makeMaf.csh << 'EOF'
foreach f (zebrafishNet/chr*.net)
    set c = $f:t:r
    echo $c
    netToAxt zebrafishNet/$c.net chain/$c.chain \
        /cluster/data/hg17/nib /cluster/data/danRer1/nib stdout | \
        axtSort stdin ../axtNet/$c.axt
    axtToMaf ../axtNet/$c.axt \
        /cluster/data/hg17/chrom.sizes /cluster/data/danRer1/chrom.sizes \
        ../mafNet/$c.maf -tPrefix=hg17. -qPrefix=danRer1.
end
'EOF'
    csh makeMaf.csh >&! makeMaf.log &
    tail -100f makeMaf.log
    mkdir -p /cluster/bluearc/hg17/mafNet
    cp -rp ../mafNet /cluster/bluearc/hg17/mafNet/danRer1

# BLASTZ ZEBRAFISH (danRer1) CLEAN UP (DONE, 2004-07-19, hartera)
# FURTHER CLEANUP (DONE, 2006-09-01, hartera)
    ssh eieio
    cd /cluster/data/hg17/bed/blastz.danRer1.swap
    nice rm axtChain/run1/chain/* &
    nice rm -fr axtChain/n1 axtChain/hNoClass.net &
    nice gzip axtChrom/* pslChrom/* axtChain/all.chain axtChain/*.net &
    # further cleanup (2006-09-01, hartera)
    ssh kkstore02
    cd /cluster/data/hg17/bed/blastz.danRer1.swap
    rm -r axtNet.old axtNet.unsorted mafNet
    cd axtChain
    rm hist*
    # remove chains and nets directories.  These can be reconstructed
    # from all.chain.gz and zebrafish.net.gz
    rm -r old chain zebrafishNet preNet
    rm noClass.net.gz
    cd ..
    rm pslChrom/psl.tab.gz

# ZEBRAFISH DANRER1 DOWNLOADS (WORKING 2004-09-17 kate)
    ssh eieio
    cd /cluster/data/hg17/bed/blastz.danRer1.swap/axtNet
    gzip *.axt
    ssh hgwdev
    cd /usr/local/apache/htdocs/goldenPath/hg17
    mkdir -p vsDanRer1
    cd vsDanRer1
    cp /cluster/data/hg17/bed/blastz.danRer1.swap/axtChain/all.chain.gz \
	zebrafish.chain.gz
    cp /cluster/data/hg17/bed/blastz.danRer1.swap/axtChain/zebrafish.net.gz .
    md5sum *.gz > md5sum.txt
    mkdir -p axtNet
    cd axtNet
    cp /cluster/data/hg17/bed/blastz.danRer1.swap/axtNet/*.axt.gz .
    md5sum *.gz > md5sum.txt
    # Copy and edit README.txt

# MAKING MOUSE SYNTENY (DONE - 2004-07-03 - Hiram)
    ssh hgwdev
    mkdir /cluster/data/hg17/bed/syntenyMm5
    cd /cluster/data/hg17/bed/syntenyMm5
    # Copy the needed scripts from /cluster/data/hg17/bed/syntenyRn3
    # (they came originally from /cluster/data/hg16/bed/syntenyMm3)
    cp -p /cluster/data/hg17/bed/syntenyRn3/*.pl .
    ./syntenicBest.pl -db=hg17 -table=blastzBestMm5
    ./smooth.pl
    ./joinsmallgaps.pl
    ./fillgap.pl -db=hg17 -table=blastzBestMm5
    ./synteny2bed.pl
    # The five commands above:
    # real 209m28.161s
    # user 0m21.040s
    # sys 0m4.100s
    # Used to load this in syntenyMm5, but that type is misleading to
    # the table browser and fails the checkTableCoords check.
    # Better to use this ensRatMusHom type.
    # Need a new name here for the Mm5 to not conflict with Rn3:
    sed -e 's/ensPhusionBlast/ensRatMm5Hom/g' \
	$HOME/kent/src/hg/lib/ensPhusionBlast.sql \
	> ensRatMm5Hom.sql
    hgLoadBed hg17 ensRatMm5Hom ucsc100k.bed -sqlTable=ensRatMm5Hom.sql
    # featureBits hg17 ensRatMm5Hom
    # 2649530748 bases of 2866216770 (92.440%) in intersection
    # featureBits hg17 ensRatMm4Hom
    # 2549307611 bases of 2866216770 (88.943%) in intersection
    # featureBits hg16 syntenyMm5
    # 2560252977 bases of 2865248791 (89.355%) in intersection

# MAKING MOUSE AXTTIGHT FROM AXTBEST (DONE - 2004-07-02 - Hiram)
    # After creating axtBest alignments above, use subsetAxt to get axtTight:
    ssh eieio
    cd /cluster/data/hg17/bed/blastz.mm5/axtNet
    mkdir -p ../axtTight
    bash	# for tcsh users
    for I in *.axt
    do
	echo $I
	subsetAxt $I ../axtTight/$I \
	    ~kent/src/hg/mouseStuff/subsetAxt/coding.mat 3400
    done
    # exit bash if you are tcsh
    # An 8 minute job

    # translate to psl
    cd ../axtTight
    mkdir ../pslTight
    bash	# for tcsh users
    for I in *.axt
    do
	C=${I/.axt/}
	axtToPsl $I ../S1.len ../S2.len ../pslTight/${C}_blastzTightMm5.psl
	echo "Done: $I -> ${C}_blastzTightMm5.psl"
    done
    # exit bash if you are tcsh

    # Load tables into database
    ssh hgwdev
    cd /cluster/data/hg17/bed/blastz.mm5/pslTight
    for I in chr*TightMm5.psl
    do
	/cluster/bin/i386/hgLoadPsl -noTNameIx hg17 ${I}
	echo "done ${I}"
    done
    # Compare results with previous assembly:
    # featureBits hg17 blastzTightMm5
    # 165862935 bases of 2866216770 (5.787%) in intersection
    # featureBits hg17 blastzTightMm4
    # 166569246 bases of 2866216770 (5.811%) in intersection
    # featureBits hg16 blastzTightMm5
    # 162641577 bases of 2865248791 (5.676%) in intersection

    # copy axt's to download area
    ssh hgwdev
    cd /cluster/data/hg17/bed/blastz.mm5/axtTight
    mkdir -p /usr/local/apache/htdocs/goldenPath/hg17/vsMm5/axtTight
    cp -p *.axt /usr/local/apache/htdocs/goldenPath/hg17/vsMm5/axtTight
    cd /usr/local/apache/htdocs/goldenPath/hg17/vsMm5/axtTight
    gzip *.axt
    # add README.txt file to dir (use previous assembly's copy as template)
    # 4 minute gzip

# BLASTZ MM5 CLEAN UP (DONE 2004-07-02 - Hiram)
    ssh eieio
    cd /cluster/data/hg17/bed/blastz.mm5
    nice rm -rf raw &
    nice rm -fr axtChain/n1 axtChain/hNoClass.net &
    nice rm axtChain/run1/chain/* &
    nice gzip axtChrom/* pslChrom/* lav/*/* axtChain/all.chain axtChain/*.net &

##############################################################################
# MAKING BLASTZ SELF (DONE - 2004-07-14 - Hiram)
    # The procedure for the lineage-specific business with self is to
    # simply use the actual RepeatMasker output for this human assembly
    # as the lineage-specific repeats for itself.  Thus, merely make
    # symlinks to the RepeatMasker .out files and name them as expected
    # for blastz.  In this case they are called notInHuman but they
    # really mean InHuman.  Yes, it is confusing, but that's just the
    # nature of the game in this case.
    ssh eieio
    mkdir /cluster/bluearc/scratch/hg/gs.18/build35/linSpecRep.notInHuman
    cd /cluster/bluearc/scratch/hg/gs.18/build35/linSpecRep.notInHuman
    foreach f (../rmsk/*.fa.out)
	set base = $f:t:r:r
	echo $base.out.spec
	ln -s $f $base.out.spec
    end
    # Same thing done on iscratch.
    # Not worried about pushing this scratch yet, it will get done
    # sometime later.  Using the actual /cluster/bluearc/scratch/
    # location below.
    ssh kk
    mkdir /cluster/data/hg17/bed/blastzSelf.2004-07-01
    cd /cluster/data/hg17/bed
    ln -s blastzSelf.2004-07-01 blastzSelf
    cd blastzSelf
    cat << '_EOF_' > DEF
# human vs. human
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/home/angie/schwartzbin:/cluster/home/kent/bin/i386

ALIGN=blastz-run
BLASTZ=blastz
BLASTZ_H=2000
BLASTZ_ABRIDGE_REPEATS=1

# TARGET
# Human
SEQ1_DIR=/iscratch/i/gs.18/build35/bothMaskedNibs
# not used
SEQ1_RMSK=
# not used
SEQ1_FLAG=
SEQ1_SMSK=/cluster/bluearc/scratch/hg/gs.18/build35/linSpecRep.notInHuman
SEQ1_IN_CONTIGS=0
SEQ1_CHUNK=10000000
SEQ1_LAP=10000

# QUERY
# Human
SEQ2_DIR=/iscratch/i/gs.18/build35/bothMaskedNibs
# not currently used
SEQ2_RMSK=
# not currently used
SEQ2_FLAG=
SEQ2_SMSK=/cluster/bluearc/scratch/hg/gs.18/build35/linSpecRep.notInHuman
SEQ2_IN_CONTIGS=0
SEQ2_CHUNK=10000000
SEQ2_LAP=10000

BASE=/cluster/data/hg17/bed/blastzSelf

DEF=$BASE/DEF
RAW=$BASE/raw
CDBDIR=$BASE
SEQ1_LEN=$BASE/S1.len
SEQ2_LEN=$BASE/S2.len
'_EOF_'
    # << this line makes emacs coloring happy

    # prepare first cluster run
    ssh kk
    cd /cluster/data/hg17/bed/blastzSelf
    /cluster/data/hg17/jkStuff/BlastZ_run0.sh
    cd run.0
    para try, check, push, check, ....
    # you need -maxPush=200000 on this one, since it is more than the
    # default push limit of 100000 jobs.  Also be aware of maxQueue
    # limits on the KK; you may need something more than the default
    # of 200000 if the KK is busy.
    # XXX - running 2004-07-01 11:26

##############################################################################
# LIFTOVER (DROP) CHAINS TO HG16 (IN PROGRESS 2005-01-03 kate)
    # swap hg16->hg17 chains
# LIFTOVER (DROP) CHAINS TO HG16 (IN PROGRESS 2004-07-07 kate)
    # run alignment
    # NOTE: split hg16 to /iscratch/i is doc'ed in makeHg16.doc
    ssh kk
    cd /cluster/data/hg17
    makeLoChain-align hg17 /scratch/hg/gs.18/build35/bothMaskedNibs \
	hg16 /iscratch/i/gs.17/build34/liftOver/split
    # Created parasol job in bed/blat.hg16.2004-07-07/run
    # 1150 jobs
    cd bed/blat.hg16.2004-07-07/run
    para try
    para check
    para push
    # GOT HERE

    # lift results (use bash)
    cd /cluster/data/hg17/bed/blat.hg16
    for file in /cluster/data/hg16/nib/*.nib; do
	chrom=`basename $file .nib`
	liftUp -pslQ psl/$chrom.psl \
	    /cluster/bluearc/hg/gs.17/build34/liftOver/lift/$chrom.lft warn \
	    raw/chr*_${chrom}.psl
    done
    # There were some errors from not finding .lft files for the
    # chr_random ones.
    ssh kk9
    cd ../liftOver
    ln -s blat.hg16 blat.hg16.2005-01-22
    makeLoChain-chain hg17 /cluster/data/hg17/nib hg16 /cluster/data/hg16/nib \
	2>chain.error.log >chain.log
    ssh eieio
    makeLoChain-net hg17 hg16
    ssh hgwdev
    makeLoChain-load hg17 hg16

# DROPUNDER CHAIN TO HG15 (DONE 2005-07-21 Andy)
    # Split things up
    ssh eieio
    cd /cluster/bluearc
    mkdir -p hg15/liftOver/split
    cd hg15/liftOver/split/
    mkdir ../lift
    for c in `cut -f1 /cluster/data/hg15/chrom.sizes`; do
	echo $c
	num=${c%_random}
	num=${num#chr}
	faSplit -lift=../lift/${c}.lft size /cluster/data/hg15/${num}/${c}.fa \
	    -oneFile 3000 ${c}
    done
    # Move files to santest
    ssh hgwdev
    cd /santest/scratch
    mkdir hg15
    cd hg15/
    cp -r /cluster/bluearc/hg15/liftOver .
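    # (Added sketch, not from the original log: a quick sanity check
    # that the faSplit loop produced one lift file and one split fa per
    # chrom -- all three counts below should match.)
    ls /cluster/bluearc/hg15/liftOver/lift/*.lft | wc -l
    ls /cluster/bluearc/hg15/liftOver/split/*.fa | wc -l
    wc -l < /cluster/data/hg15/chrom.sizes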
    # run alignment
    ssh kk
    cd /cluster/data/hg17
    makeLoChain-align hg17 /scratch/hg/gs.18/build35/bothMaskedNibs \
	hg15 /santest/scratch/hg15/liftOver/split
    # Created parasol job in bed/blat.hg16.2004-07-07/run
    # 2024 jobs written to batch
    # *** IGNORE the batch created by the script.
    ln -s bed/blat.hg15.2005-07-21 bed/blat.hg15
    cd bed/blat.hg15/
    mv run run.kk
    mkdir run.kk9 run.kki
    cd run.kk/
    sed 's/\.fa\./\./g' spec > tmp; mv tmp spec
    grep Un_random spec > ../run.kki/spec
    grep -v Un_random spec > newspec
    mv newspec spec
    egrep "chr(1|19|X)(\.|_)" spec | grep -v random > ../run.kk9/spec
    grep -Fv -f ../run.kk9/spec spec > newspec
    mv newspec spec
    wc -l spec ../run.kk9/spec ../run.kki/spec
    # 1831 spec
    # 147 ../run.kk9/spec
    # 46 ../run.kki/spec
    # 2024 total
    # Checks out.
    # Run the thing on all 3 clusters.
    para create spec
    para push
#Completed: 1831 of 1831 jobs
#CPU time in finished jobs: 8556066s 142601.10m 2376.69h 99.03d 0.271 y
#IO & Wait Time: 60428s 1007.13m 16.79h 0.70d 0.002 y
#Average job time: 4706s 78.43m 1.31h 0.05d
#Longest running job: 0s 0.00m 0.00h 0.00d
#Longest finished job: 46724s 778.73m 12.98h 0.54d
#Submission to last job: 46725s 778.75m 12.98h 0.54d
    ssh kk9
    cd /cluster/data/hg17/bed/blat.hg15/run.kk9
    para create spec
    para push
#Completed: 147 of 147 jobs
#CPU time in finished jobs: 1698424s 28307.07m 471.78h 19.66d 0.054 y
#IO & Wait Time: 874s 14.56m 0.24h 0.01d 0.000 y
#Average job time: 11560s 192.66m 3.21h 0.13d
#Longest running job: 0s 0.00m 0.00h 0.00d
#Longest finished job: 31413s 523.55m 8.73h 0.36d
#Submission to last job: 31413s 523.55m 8.73h 0.36d
    ssh kki
    cd /cluster/data/hg17/bed/blat.hg15/run.kki
    para create spec
    para push
    # OK, I don't have para time stuff for this one, but it was the
    # shortest by far.

    # lift results
    cd /cluster/data/hg17/bed/blat.hg15/lift.run
    for chrom in `cut -f1 /cluster/data/hg15/chrom.sizes`; do
	liftUp -pslQ /cluster/bluearc/hg15/liftOver/psl/${chrom}.psl \
	    /cluster/bluearc/hg15/liftOver/lift/${chrom}.lft warn \
	    raw/chr*_${chrom}.psl
    done

    # Chain
    # There's been some problems with store5.
    ssh kk9
    cd /cluster/store12/store5/gs.18/build35/bed/blat.hg15.2005-07-21
    mkdir chainRun
    mkdir -p /panasas/store/hg15/chainRaw
    ln -s /panasas/store/hg15/chainRaw chainRaw
    cd chainRun/
    ls -1S ../psl/*.psl > in.lst
    cat > chain.sh << "_EOF_"
#!/bin/bash
tmp=/scratch/`basename $4`
axtChain -psl $1 $2 $3 $tmp
cp $tmp $4
rm $tmp
_EOF_
    chmod +x chain.sh
    cat > gsub << "_EOF_"
#LOOP
./chain.sh $(path1) /scratch/hg/gs.18/build35/bothMaskedNibs /scratch/hg/gs.16/build33/chromTrfMixedNib {check out line+ ../chainRaw/$(root1).chain}
#ENDLOOP
_EOF_
    # <<
    gensub2 in.lst single gsub spec
    para create spec
    para push
#Completed: 44 of 44 jobs
#CPU time in finished jobs: 7448s 124.13m 2.07h 0.09d 0.000 y
#IO & Wait Time: 9591s 159.85m 2.66h 0.11d 0.000 y
#Average job time: 387s 6.45m 0.11h 0.00d
#Longest running job: 0s 0.00m 0.00h 0.00d
#Longest finished job: 1906s 31.77m 0.53h 0.02d
#Submission to last job: 1906s 31.77m 0.53h 0.02d
    ssh kolossus
    cd /panasas/store/hg15/chainRaw
    chainMergeSort *.chain | chainSplit /scratch/andy/chain stdin
    cd /scratch/andy
    mkdir net over
    cd chain/
    for chain in *; do
	c=${chain%.chain}
	echo $c
	chainNet $chain /cluster/store12/store5/gs.18/build35/chrom.sizes \
	    /cluster/store12/store5/gs.16/build33/chrom.sizes \
	    ../net/${c}.net /dev/null
	netChainSubset ../net/${c}.net $chain ../over/${c}.over
    done
    cd ../over/
    cat * >> ../hg17ToHg15.over.chain
    cd ../
    cp -r hg17* over/ \
	/cluster/store12/store5/gs.18/build35/bed/blat.hg15.2005-07-21/
    cd ../
    rm -rf andy/
    rm -rf /panasas/store/hg15
    cd /cluster/bluearc/hg15/liftOver/psl
    for psl in *; do
	gzip $psl
    done
    cd ../

    # Back to MAKING BLASTZ SELF (above): para time results for the
    # blastzSelf run.0 batch:
# Completed: 116281 of 116281 jobs
# CPU time in finished jobs: 21807388s 363456.46m 6057.61h 252.40d 0.692 y
# IO & Wait Time: 2319383s 38656.39m 644.27h 26.84d 0.074 y
# Average job time: 207s 3.46m 0.06h 0.00d
# Longest job: 22063s 367.72m 6.13h 0.26d
# Submission to last job: 83402s 1390.03m 23.17h 0.97d

    # Second cluster run to convert the .out's to .lav's.  You do NOT
    # want to run this on the big cluster; it brings the file server
    # to its knees.  Run this on the small cluster.
    ssh kki
    cd /cluster/data/hg17/bed/blastzSelf
    /cluster/data/hg17/jkStuff/BlastZ_run1.sh
    cd run.1
    para try, check, push, etc ...
# Completed: 341 of 341 jobs
# CPU time in finished jobs: 6344s 105.73m 1.76h 0.07d 0.000 y
# IO & Wait Time: 5413s 90.22m 1.50h 0.06d 0.000 y
# Average job time: 34s 0.57m 0.01h 0.00d
# Longest job: 505s 8.42m 0.14h 0.01d
# Submission to last job: 4521s 75.35m 1.26h 0.05d

    # Third cluster run to convert lav's to axt's.
    # These self alignments do not work well as the usual third cluster job.
    # Instead, a specialized job here that includes an axtDropSelf
    # operation, and that works on individual lav pieces to avoid
    # out-of-memory problems during axtSort.
    ssh kki
    cd /cluster/data/hg17/bed/blastzSelf
    mkdir axtChrom run.2
    cd run.2
    cat << '_EOF_' > runLavToAxt.sh
#!/bin/sh
BASE=/cluster/data/hg17/bed/blastzSelf
SEQ1_DIR=/cluster/bluearc/scratch/hg/gs.18/build35/bothMaskedNibs
SEQ2_DIR=/cluster/bluearc/scratch/hg/gs.18/build35/bothMaskedNibs
CHR=$1
OUT=axtChrom/$CHR.axt
cd ${BASE}/lav/${CHR}
for D in *.lav
do
    smallout=$D.axt
    lavToAxt $D $SEQ1_DIR $SEQ2_DIR stdout \
    | axtDropSelf stdin stdout \
    | axtSort stdin $smallout
done
cat `ls -1 *.lav.axt | sort -g` > $BASE/$OUT
'_EOF_'
    # << keep emacs coloring happy
    chmod +x runLavToAxt.sh
    cat << '_EOF_' > gsub
#LOOP
./runLavToAxt.sh $(path1) {check out line ../axtChrom/$(path1).axt}
#ENDLOOP
'_EOF_'
    # << keep emacs coloring happy
    ls ../lav > chrList
    gensub2 chrList single gsub jobList
    para create jobList
    para try
    para push
    # This is a tough load on eieio.  Manageable, but the load should
    # be monitored to make sure it isn't severe.  I saw about 100 to 150.
    # The chr19 job will not finish; even in parts it takes up too much
    # memory and the node it runs on ends up swapping endlessly.
    # Need to go to kolossus to do chr19:
    para stop
    para recover jobList chr19JobList
    ssh kolossus
    cd /cluster/data/hg17/bed/blastzSelf/run.2
    time ./runLavToAxt.sh chr19
    # real 43m14.797s
    # user 12m56.670s
    # sys 3m13.590s

    # translate sorted axt files into psl
    ssh eieio
    cd /cluster/data/hg17/bed/blastzSelf
    mkdir pslChrom
    set tbl = "blastzSelf"
    foreach f (axtChrom/chr*.axt)
	set c=$f:t:r
	echo "Processing chr $c"
	/cluster/bin/i386/axtToPsl $f S1.len S2.len pslChrom/${c}_${tbl}.psl
    end
    # That takes about 70 minutes

    # Load database tables
    ssh hgwdev
    cd /cluster/data/hg17/bed/blastzSelf/pslChrom
    bash	# if a csh/tcsh user
    for I in *.psl
    do
	/cluster/bin/i386/hgLoadPsl -noTNameIx hg17 ${I}
	echo "done: ${I}"
    done
    # exit bash if you are tcsh
    # This is an 80 minute job
    # Check results
    # featureBits hg17 blastzSelf
    # 252256266 bases of 2866216770 (8.801%) in intersection
    # real 40m49.573s
    # user 21m14.200s
    # sys 2m10.420s
    # featureBits hg16 blastzSelf
    # 254410837 bases of 2865248791 (8.879%) in intersection

# CHAIN SELF BLASTZ (DONE - 2004-07-07 - Hiram)
    # The axtChain is best run on the small kluster, or the kk9 kluster
    ssh kki
    mkdir -p /cluster/data/hg17/bed/blastzSelf/axtChain/run1
    cd /cluster/data/hg17/bed/blastzSelf/axtChain/run1
    mkdir out chain
    ls -1S /cluster/data/hg17/bed/blastzSelf/axtChrom/*.axt > input.lst
    cat << '_EOF_' > gsub
#LOOP
doChain {check in exists $(path1)} {check out line+ chain/$(root1).chain} out/$(root1).out
#ENDLOOP
'_EOF_'
    # << this line makes emacs coloring happy
    # May need -minScore=5000 for all chroms if chr19 won't finish on kolossus
    cat << '_EOF_' > doChain
#!/bin/csh
axtChain $1 /iscratch/i/gs.18/build35/bothMaskedNibs \
    /iscratch/i/gs.18/build35/bothMaskedNibs $2 > $3
'_EOF_'
    # << this line makes emacs coloring happy
    chmod a+x doChain
    # 46 jobs
    gensub2 input.lst single gsub jobList
    para create jobList
    para try
    para push
    # ... etc ...
# Completed: 45 of 46 jobs
# Crashed: 1 jobs
# CPU time in finished jobs: 8519s 141.98m 2.37h 0.10d 0.000 y
# IO & Wait Time: 4795s 79.92m 1.33h 0.06d 0.000 y
# Average job time: 296s 4.93m 0.08h 0.00d
# Longest job: 2407s 40.12m 0.67h 0.03d
# Submission to last job: 3540s 59.00m 0.98h 0.04d
    # chr19 did fail; on kolossus, try:
    ssh kolossus
    cd /cluster/data/hg17/bed/blastzSelf/axtChain/run1
    time axtChain /cluster/data/hg17/bed/blastzSelf/axtChrom/chr19.axt \
	/cluster/data/hg17/nib \
	/cluster/data/hg17/nib \
	chain/chr19.chain > out/chr19.out
    # 80 minute job, 1.5 Gb result:
    # -rw-rw-r--  1 1588795432 Jul  7 21:54 chr19.chain

    # now on the file server, sort chains
    ssh eieio
    cd /cluster/data/hg17/bed/blastzSelf/axtChain
    time chainMergeSort run1/chain/*.chain > all.chain
    # real 27m38.935s
    # user 23m18.540s
    # sys 2m39.300s
    # A 5 Gb file:
    # -rw-rw-r--  1 5267202936 Jul  7 22:23 all.chain
    time chainSplit chain all.chain
    # real 29m27.062s
    # user 22m48.250s
    # sys 1m57.910s
    # optionally: rm run1/chain/*.chain

    # Load chains into database
    # next machine
    ssh hgwdev
    cd /cluster/data/hg17/bed/blastzSelf/axtChain/chain
    bash	# for tcsh users
    for I in *.chain
    do
	c=${I/.chain/}
	$HOME/bin/i386/hgLoadChain -normScore hg17 ${c}_chainSelf $I
	echo done $c
    done
    # exit bash if you are tcsh
    # This is almost 3 hours to load
    ssh kolossus
    cd /cluster/data/hg17/bed/blastzSelf.2004-07-01
    time HGDB_CONF=~/.hg.conf.read-only featureBits \
	-noRandom -noHap hg17 chainSelfLink > fb.chainSelfLink 2>&1 &
    # real 56m34.802s
    # 240976607 bases of 2851352871 (8.451%) in intersection
    # featureBits hg17 chainSelf
    # 682833453 bases of 2866216770 (23.824%) in intersection
    # featureBits hg16 chainSelf
    # 626345319 bases of 2865248791 (21.860%) in intersection

    # DELIVER these chain files to hgdownload (2005-01-27 - Hiram)
    ssh eieio
    cd /cluster/data/hg17/bed/blastzSelf/axtChain/chain
    gzip chr*.chain
    ssh hgwdev
    mkdir /usr/local/apache/htdocs/goldenPath/hg17/vsSelf
    cd /cluster/data/hg17/bed/blastzSelf/axtChain/chain
    cp -p *.chain.gz /usr/local/apache/htdocs/goldenPath/hg17/vsSelf
    # fixup README file, request push

# NET SELF (DONE - 2004-07-13 - Hiram)
    ssh eieio
    cd /cluster/data/hg17/bed/blastzSelf/axtChain
    mkdir preNet
    cd chain
    bash	# for tcsh users
    for I in *.chain
    do
	echo preNetting $I
	/cluster/bin/i386/chainPreNet $I /cluster/data/hg17/chrom.sizes \
	    /cluster/data/hg17/chrom.sizes ../preNet/$I
    done
    # 23 minutes
    cd ..
    mkdir n1
    cd preNet
    for I in *.chain
    do
	N=${I/.chain/}.net
	echo primary netting $I
	/cluster/bin/i386/chainNet $I -minSpace=10 \
	    /cluster/data/hg17/chrom.sizes /cluster/data/hg17/chrom.sizes \
	    ../n1/$N /dev/null
    done
    # exit bash if you are tcsh
    # 5 minute job
    cd ..
    cat n1/*.net | /cluster/bin/i386/netSyntenic stdin hNoClass.net
    # memory usage 206442496, utime 3009 s/100, stime 252
    # memory usage 2510467072, utime 19307 s/100, stime 3181
    ssh hgwdev
    cd /cluster/data/hg17/bed/blastzSelf/axtChain
    time netClass hNoClass.net hg17 hg17 human.net \
	-tNewR=/cluster/bluearc/scratch/hg/gs.18/build35/linSpecRep.notInHuman \
	-qNewR=/cluster/bluearc/scratch/hg/gs.18/build35/linSpecRep.notInHuman
    # real 9m32.951s
    # user 2m42.840s
    # sys 1m23.460s
    # If things look good do
    ssh eieio
    cd /cluster/data/hg17/bed/blastzSelf/axtChain
    rm -r n1 hNoClass.net
    # Make a 'syntenic' subset of these with
    time netFilter -syn human.net > humanSyn.net
    # real 0m29.851s
    # user 0m27.200s
    # sys 0m2.120s
    # Load the nets into database
    ssh hgwdev
    cd /cluster/data/hg17/bed/blastzSelf/axtChain
    netFilter -minGap=10 human.net | hgLoadNet hg17 netSelf stdin
    netFilter -minGap=10 humanSyn.net | hgLoadNet hg17 syntenyNetSelf stdin
    # check results
    # featureBits hg17 netSelf
    # 620827374 bases of 2866216770 (21.660%) in intersection
    # featureBits hg16 netSelf
    # 563788850 bases of 2865248791 (19.677%) in intersection
    # featureBits hg15 selfNet
    # 749177799 bases of 2866466359 (26.136%) in intersection
    # featureBits hg17 syntenyNetSelf
    # 404535376 bases of 2866216770 (14.114%) in intersection
    # featureBits hg16 syntenyNetSelf
    # 340871322 bases of 2865248791 (11.897%) in intersection
    # Add entries for net and chain to human/hg17 trackDb

    # make net
    ssh eieio
    cd /cluster/data/hg17/bed/blastzSelf/axtChain
    mkdir humanNet
    time netSplit human.net humanNet
    # real 0m52.106s
    # user 0m43.350s
    # sys 0m5.170s

    # extract axts from net - this should be combined with the sort and
    # maf conversion below
    mkdir ../axtNet
    foreach n (humanNet/chr*.net)
	set c=$n:t:r
	echo "netToAxt: $c.net -> $c.axt"
	rm -f ../axtNet/$c.axt
	netToAxt humanNet/$c.net chain/$c.chain \
	    /cluster/data/hg17/nib \
	    /cluster/data/hg17/nib stdout > ../axtNet/$c.axt
	echo "Complete: $c.net -> axtNet/$c.axt"
    end

    # sort axt's and convert to maf format
    mkdir ../mafNet
    foreach f (../axtNet/chr*.axt)
	set c=$f:t:r
	echo $c.axt
	mv ../axtNet/$c.axt ../axtNet/$c.unsorted.axt
	axtSort ../axtNet/$c.unsorted.axt ../axtNet/$c.axt
	rm ../axtNet/$c.unsorted.axt
	axtToMaf ../axtNet/$c.axt \
	    /cluster/data/hg17/chrom.sizes /cluster/data/hg17/chrom.sizes \
	    ../mafNet/$c.maf -tPrefix=hg17. -qPrefix=hg17.
    end
    # a 3 minute job

    # XXXX - ! ! ! WE DO NOT NEED the Best and Tight tracks for Self ! ! !
    ssh hgwdev
    mkdir -p /cluster/data/hg17/bed/blastzSelf/axtBest
    cd /cluster/data/hg17/bed/blastzSelf/axtBest
    ln -s ../axtNet/chr*.axt .

    # copy net axt's to download area - XXX Do we need this for Self ?
    ssh hgwdev
    cd /cluster/data/hg17/bed/blastzSelf/axtNet
    mkdir /usr/local/apache/htdocs/goldenPath/hg17/vsSelf
    cp -p *.axt /usr/local/apache/htdocs/goldenPath/hg17/vsSelf
    cd /usr/local/apache/htdocs/goldenPath/hg17/vsSelf
    nice gzip *.axt
    nice md5sum *.gz > md5sum.txt
    # add README.txt file to dir (use previous assembly's copy as template)

    # Convert those axt files to psl
    ssh eieio
    cd /cluster/data/hg17/bed/blastzSelf
    mkdir pslBest
    foreach a (axtBest/chr*.axt)
	set c=$a:t:r
	echo "processing $c.axt -> ${c}_blastzBestSelf.psl"
	/cluster/bin/i386/axtToPsl axtBest/${c}.axt \
	    S1.len S2.len pslBest/${c}_blastzBestSelf.psl
	echo "Done: ${c}_blastzBestSelf.psl"
    end

    # Load tables
    ssh hgwdev
    cd /cluster/data/hg17/bed/blastzSelf/pslBest
    bash	# if a csh/tcsh user
    for I in chr*BestSelf.psl
    do
	/cluster/bin/i386/hgLoadPsl -noTNameIx hg17 ${I}
	echo "done ${I}"
    done
    # exit bash if you are tcsh
    # check results
    # featureBits hg17 blastzBestSelf
    # 233978156 bases of 2866216770 (8.163%) in intersection
    # featureBits hg16 blastzBestSelf
    # 225819219 bases of 2865248791 (7.881%) in intersection

# MAKING HUMAN AXTTIGHT FROM AXTBEST (NOT TO BE DONE - 2004-07-13 - Hiram)
    # XXXX - ! ! ! DO NOT NEED axtBest for Self alignments
    # Been done anyway; Robert and Gill like to see it.

# BLASTZ SELF CLEAN UP (DONE - 2004-07-15 - Hiram)
    ssh eieio
    cd /cluster/data/hg17/bed/blastzSelf
    nice rm -rf raw &
    nice rm axtChain/run1/chain/* &
    nice rm -fr axtChain/n1 axtChain/hNoClass.net &
    nice gzip axtChrom/* pslChrom/* lav/*/* axtChain/all.chain axtChain/*.net &

# CREATING BIG ZIPS (DONE - 2004-07-23 - Hiram)
    ssh eieio
    cd /cluster/data/hg17/jkStuff
    time ./zipAll.sh > zipAll.out 2>&1
    ssh hgwdev
    # This part has to be handled in a different way because it
    # updates on a daily basis.
    cd /usr/local/apache/htdocs/goldenPath/hg17/bigZips
    featureBits hg17 refGene:upstream:1000 -fa=upstream1000.fa
    zip upstream1000.zip upstream1000.fa
    rm upstream1000.fa
    featureBits hg17 refGene:upstream:2000 -fa=upstream2000.fa
    zip upstream2000.zip upstream2000.fa
    rm upstream2000.fa
    featureBits hg17 refGene:upstream:5000 -fa=upstream5000.fa
    zip upstream5000.zip upstream5000.fa
    rm upstream5000.fa

# ENCODE REGIONS (DONE 2004-07-28 kate)
    ssh eieio
    cd /cluster/data/hg17/bed
    mkdir encodeRegions
    cd encodeRegions
    liftOver /cluster/data/hg16/bed/encodeRegions/encodeRegions.bed \
	/cluster/data/hg16/bed/liftOver/hg16ToHg17.over.chain \
	encodeRegions.bed encodeRegions.unmapped
    wc -l encodeRegions.*
    # 44 encodeRegions.bed
    # 0 encodeRegions.unmapped
    ssh hgwdev
    cd /cluster/data/hg17/bed/encodeRegions
    hgLoadBed hg17 encodeRegions encodeRegions.bed -noBin

# H-INVITATIONAL GENE ANNOTATION DATABASE (WORKING 2004-07-28 kate)
    # http://www.jbirc.aist.go.jp/hinv/top.html
    # Create knownGene table to reference HINV gene ID's
    # for link on knownGenes details page.
    # Also, create an HINV gene track.
    # Download CDNA file release 1.5 (got the release info from the
    # downloads page).
ssh kksilo
mkdir -p /cluster/data/hinv
cd /cluster/data/hinv
wget http://www.jbirc.aist.go.jp/hinv/download/alldata/flatfile/FCDNA.gz
gunzip FCDNA.gz
mv FCDNA FCDNA.1.5
# set up assembly work area
ssh eieio
cd /cluster/data/hg17
mkdir -p bed/hinv
cd bed/hinv
# extract H-INV ID's and Genbank accessions of mRNAs
awk '/CDNA_ACCESSION-NO:/ {print $2}' < /cluster/data/hinv/FCDNA.1.5 \
    > accessions.txt
awk '/CDNA_H-INVITATIONAL-ID:/ {print $2}' < /cluster/data/hinv/FCDNA.1.5 \
    > ids.txt
paste accessions.txt ids.txt > queries.txt
wc -l ids.txt
# 41118 ids.txt
# create PSL file from alignments for these mRNA's, extracted from the
# table of all aligned mRNA's
ssh hgwdev
cd /cluster/data/hg17/bed/hinv
hgsql hg17 -s -e "SELECT * FROM all_mrna" | cut -f 2- > all_mrna.tab
ssh eieio
cd /cluster/data/hg17/bed/hinv
pslReps /dev/null stdout /dev/null | cat - all_mrna.tab > all_mrna.psl
# using pslReps to generate the PSL file header
~kate/bin/i386/pslSelect -queryPairs=queries.txt all_mrna.psl hinv_mrna.psl
# NOTE: generated with pslSelect.c v1.3 (1.4 is broken -- test is
# setup in hg/pslSelect/tests & I requested Robert take a look)
# load track of mrna alignments
ssh hgwdev
cd /cluster/data/hg17/bed/hinv
hgLoadPsl hg17 -table=HInvGeneMrna hinv_mrna.psl
hgsql hg17 -s -e \
    "select distinct(qName) from HInvGeneMrna order by qName" > hg17.mrna
hgsql hg16 -s -e \
    "select distinct(qName) from HInvGeneMrna order by qName" > hg16.mrna
wc -l hg*.mrna
# 40998 hg16.mrna
# 41023 hg17.mrna
comm -1 -3 *.mrna > hg17.aligned
wc -l hg17.aligned
# 29 (transcripts newly aligned in hg17)
comm -2 -3 *.mrna > hg16.aligned
wc -l hg16.aligned
# 4 (transcripts no longer aligned in hg17)
comm -2 -3 ids.txt hg17.mrna > hg17.notaligned
wc -l hg17.notaligned
# 95 (transcripts not aligned in hg17 -- checking on why...)
# also make a table with various useful items for each transcript
ssh hgwdev
hgsql hg17 < ~/kent/src/hg/lib/HInv.sql
cd /cluster/data/hg17/bed/hinv
/cluster/data/hinv/hinvToTable.pl < /cluster/data/hinv/FCDNA.1.5 > HInv.tab
echo 'load data local infile "HInv.tab" into table HInv' | hgsql hg17
hgsql hg16 -s -e "select count(*) from HInv"
# 41118
hgsql hg17 -s -e "select count(*) from HInv"
# 41118
# create table for knownGenes detail page
ssh hgwdev
cd /cluster/data/hg17/bed/hinv
hgMapToGene hg17 HInvGeneMrna knownGene knownToHInv

# GENEID GENE PREDICTIONS (DONE 7/30/04 angie)
ssh hgwdev
mkdir /cluster/data/hg17/bed/geneid
cd /cluster/data/hg17/bed/geneid
foreach chr (`awk '{print $1;}' ../../chrom.sizes`)
    wget http://genome.imim.es/genepredictions/H.sapiens/golden_path_200405/geneid_v1.2/$chr.gtf
    wget http://genome.imim.es/genepredictions/H.sapiens/golden_path_200405/geneid_v1.2/$chr.prot
end
# Add ".1" suffix to each item in .prot's, to match transcript_id's in gtf
cp /dev/null geneid.fa
foreach f (chr*.prot)
    perl -wpe 's/^(>chr\S+)/$1.1/' $f >> geneid.fa
end
ldHgGene -gtf -genePredExt hg17 geneid *.gtf
hgPepPred hg17 generic geneidPep geneid.fa

# MITOPRED DATA FOR HGGENE (DONE 7/30/04 angie)
ssh hgwdev
mkdir /cluster/data/hg17/bed/mitopred
cd /cluster/data/hg17/bed/mitopred
wget http://mitopred.sdsc.edu/data/hum_30.out
perl -wpe 's/^(\S+)\s+\S+\s+(.*)/$1\t$2/' hum_30.out > mitopred.tab
cat > mitopred.sql << '_EOF_'
# Prediction of nuclear-encoded mito. proteins from http://mitopred.sdsc.edu/
CREATE TABLE mitopred (
    name       varchar(10) not null,   # SwissProt ID
    confidence varchar(8)  not null,   # Confidence level
    #Indices
    PRIMARY KEY(name(6))
);
'_EOF_'
# << this line makes emacs coloring happy
hgsql hg17 < mitopred.sql
hgsql hg17 -e 'load data local infile "mitopred.tab" into table mitopred'

# NUCLEAR PROTEIN DATABASE (IN PROGRESS 7/30/04 angie)
ssh eieio
mkdir /cluster/data/hg17/bed/npd
cd /cluster/data/hg17/bed/npd
wget ftp://ftp.hgu.mrc.ac.uk/pub/npd/database.zip
unzip database.zip
# OK, it's one big .mdb (Microsoft Access DB) file.
# Googling... can buy a converter for $40... free trial .exe...

# CREATING REFFULL - DBTSS MRNA (DONE - 2004-08-02 - Hiram)
ssh eieio
mkdir /cluster/data/hg17/bed/refFull
cd /cluster/data/hg17/bed/refFull
wget --timestamping "ftp://ftp.hgc.jp/pub/hgc/db/dbtss/ref-full.fa.gz"
wget --timestamping "ftp://ftp.hgc.jp/pub/hgc/db/dbtss/readme"
# See also: http://dbtss.hgc.jp/index.html
# gunzip it and split the ref-full.fa file into about 50 pieces
# (faSplit won't do this job from a pipe, i.e. this does not work:
#  zcat ref-full.fa.gz | faSplit sequence stdin 50 splitRefFull)
gunzip ref-full.fa.gz
faSplit sequence ref-full.fa 50 splitRefFull
gzip ref-full.fa
# copy to Iservers
ssh kkr1u00
cd /cluster/data/hg17/bed/refFull
mkdir /iscratch/i/gs.18/build35/refFull
cp -p split*.fa /iscratch/i/gs.18/build35/refFull
/cluster/bin/iSync
# no longer need these split files here
rm -f split*.fa
# run alignments on kluster
ssh kk
cd /cluster/data/hg17/bed/refFull
ls -1S /scratch/hg/gs.18/build35/maskedContigs > genome.lst
ls -1S /iscratch/i/gs.18/build35/refFull > refFull.lst
# Use BLAT to generate refFull alignments as so:
cat << '_EOF_' > gsub
#LOOP
/cluster/bin/i386/blat -ooc=/scratch/hg/h/11.ooc -q=dna -t=dna {check in exists /scratch/hg/gs.18/build35/maskedContigs/$(path1)} {check in exists+ /iscratch/i/gs.18/build35/refFull/$(path2)} {check out line+ psl/$(root1)/$(root2).psl}
#ENDLOOP
'_EOF_'
# << keep emacs coloring happy
bash # if a csh/tcsh user
mkdir psl
cat genome.lst | sed -e "s/.fa//" | while read C
do
    mkdir psl/${C}
done
# exit bash if you are tcsh
gensub2 genome.lst refFull.lst gsub jobList
para create jobList
# 18240 jobs written to batch
para try
para check
para push ... etc ...
# Completed: 18240 of 18240 jobs
# CPU time in finished jobs: 37011s 616.85m 10.28h 0.43d 0.001 y
# IO & Wait Time: 62630s 1043.84m 17.40h 0.72d 0.002 y
# Average job time: 5s 0.09m 0.00h 0.00d
# Longest job: 51s 0.85m 0.01h 0.00d
# Submission to last job: 850s 14.17m 0.24h 0.01d
# Process refFull alignments into near best in genome.
ssh eieio
cd /cluster/data/hg17/bed/refFull
pslSort dirs raw.psl tmp psl/*
pslReps -minCover=0.2 -sizeMatters -minAli=0.965 \
    -nearTop=0.001 raw.psl contig.psl /dev/null
liftUp -nohead all_refFull.psl ../../jkStuff/liftAll.lft warn contig.psl
pslSortAcc nohead chrom tmp all_refFull.psl
pslCat -dir chrom > refFullAli.psl
# Load refFull alignments into database
ssh hgwdev
cd /cluster/data/hg17/bed/refFull
hgLoadPsl hg17 -tNameIx refFullAli.psl

# VAR_MULTIZ HG17/MM5/RN3/GALGAL2/FR1 (acs 2004-08-12)
# This is a new, experimental version of multiz written by Minmei at
# PSU and sent by e-mail from Webb.  This version allows for a
# progressive alignment strategy (i.e., alignment construction in a
# post-order traversal of the tree) using only pairwise alignments of
# each sequence with the reference sequence and without any need for
# "staging".  Here's a little blurb about it from the header of
# var_multiz.v3.c.
# var_multiz.v3.c
#
# Variant of the multiz program.  It aligns two files of
# alignment blocks where the top row is always the reference,
# assuming blocks are ordered by increasing start position
# on the reference sequence.  Single-coverage on the
# reference is required at this stage.
#
# Four arguments are required: char* arg1, char* arg2,
# int arg3, int arg4.  arg1 and arg2 are the two files to
# be aligned together.  Whether the alignment of the
# reference in each file is fixed is determined by
# arguments arg3 and arg4, each either 1 or 0; 1 means the
# reference is fixed, and arg3 and arg4 cannot both be 1.
# ...
mkdir /cluster/data/hg17/bed/var_multiz.2004-08-12
# unpack source and compile
cp /cluster/home/acs/var_multiz.tar.gz /cluster/data/hg17/bed/var_multiz.2004-08-12
cd /cluster/data/hg17/bed/var_multiz.2004-08-12
tar xfz var_multiz.tar.gz
cd var_multiz_source
make
# NOTE (8/14): this version of the source is already out of date!
# Source is now checked in under hg3rdParty and updated binaries
# are being kept under /cluster/bin/penn/var_multiz
# script for creating the 5-way alignments for a given chromosome
# (acs, 8/20/04) below revised after e-mail exchange with Minmei
cat << '_EOF_' > doVarMultiz.csh
#!/bin/csh -fe
set chr = $1    # may include _random or _hla_hap[12]
set REF = hg17.$chr
set RAT = /cluster/bluearc/hg17/multiz8way/rn3/$chr.maf
set MOUSE = /cluster/bluearc/hg17/multiz8way/mm5/$chr.maf
set CHICKEN = /cluster/bluearc/hg17/multiz8way/galGal2/$chr.maf
set FISH = /cluster/bluearc/hg17/multiz8way/fr1/$chr.maf
set DEST = /cluster/data/hg17/bed/var_multiz.2004-08-12/maf/$chr.maf
set VMZ = /cluster/bin/penn/var_multiz
set PROJECT = /cluster/bin/penn/var_multiz.2004.08.12/maf_project
mkdir -p $DEST:h
if ( -s $RAT && -s $MOUSE ) then
    echo "Aligning $RAT $MOUSE..."
    $VMZ $RAT $MOUSE 0 0 > /scratch/$chr.tmp1.maf
    echo "Projecting on $REF..."
    $PROJECT /scratch/$chr.tmp1.maf $REF > /scratch/$chr.hrm.maf
else if ( -s $RAT ) then
    cp $RAT /scratch/$chr.hrm.maf
else if ( -s $MOUSE ) then
    cp $MOUSE /scratch/$chr.hrm.maf
endif
if ( -s $CHICKEN && -s /scratch/$chr.hrm.maf ) then
    echo "Adding $CHICKEN..."
    $VMZ /scratch/$chr.hrm.maf $CHICKEN 1 0 > /scratch/$chr.tmp2.maf
    echo "Projecting on $REF..."
    $PROJECT /scratch/$chr.tmp2.maf $REF > /scratch/$chr.hrmc.maf
else if ( -s $CHICKEN ) then
    cp $CHICKEN /scratch/$chr.hrmc.maf
else if ( -s /scratch/$chr.hrm.maf ) then
    cp /scratch/$chr.hrm.maf /scratch/$chr.hrmc.maf
endif
if ( -s $FISH && -s /scratch/$chr.hrmc.maf ) then
    echo "Adding $FISH..."
    $VMZ /scratch/$chr.hrmc.maf $FISH 1 0 > /scratch/$chr.tmp3.maf
    echo "Projecting on $REF..."
    $PROJECT /scratch/$chr.tmp3.maf $REF > $DEST
else if ( -s $FISH ) then
    cp $FISH $DEST
else if ( -s /scratch/$chr.hrmc.maf ) then
    cp /scratch/$chr.hrmc.maf $DEST
endif
echo "Done."
rm /scratch/$chr.tmp[123].maf /scratch/$chr.hrm.maf /scratch/$chr.hrmc.maf
'_EOF_'
# << keep emacs coloring happy
chmod 755 doVarMultiz.csh
for file in `find /cluster/bluearc/hg17/multiz8way/rn3 /cluster/bluearc/hg17/multiz8way/mm5 /cluster/bluearc/hg17/multiz8way/galGal2 /cluster/bluearc/hg17/multiz8way/fr1 -name "chr*.maf"` ; do echo `basename $file .maf` ; done | sort -u > chrlist
rm -f jobs.lst
for chr in `cat chrlist` ; do echo "doVarMultiz.csh $chr" >> jobs.lst ; done
# run cluster job
ssh kk ; cd /cluster/data/hg17/bed/var_multiz.2004-08-12; para create jobs.lst ; para try ; para push
# (etc.)
Completed: 46 of 46 jobs CPU time in finished jobs: 71302s 1188.36m 19.81h 0.83d 0.002 y IO & Wait Time: 1162s 19.37m 0.32h 0.01d 0.000 y Average job time: 1575s 26.26m 0.44h 0.02d Longest job: 6353s 105.88m 1.76h 0.07d Submission to last job: 6362s 106.03m 1.77h 0.07d # for now just create an ordinary maf track (conservation later) rm -rf /gbdb/hg17/varMultiz/maf/varMultizMm5Rn3GalGal2Fr1 mkdir -p /gbdb/hg17/varMultiz/maf/varMultizMm5Rn3GalGal2Fr1 ln -s /cluster/data/hg17/bed/var_multiz.2004-08-12/maf/*.maf /gbdb/hg17/varMultiz/maf/varMultizMm5Rn3GalGal2Fr1 /cluster/bin/i386/hgLoadMaf hg17 -warn varMultizMm5Rn3GalGal2Fr1 -pathPrefix=/gbdb/hg17/varMultiz/maf/varMultizMm5Rn3GalGal2Fr1 chmod 775 /gbdb/hg17/varMultiz /gbdb/hg17/varMultiz/maf /gbdb/hg17/varMultiz/maf/varMultizMm5Rn3GalGal2Fr1 /cluster/data/hg17/bed/var_multiz.2004-08-12 /cluster/data/hg17/bed/var_multiz.2004-08-12/maf chmod 664 /cluster/data/hg17/bed/var_multiz.2004-08-12/maf/*.maf # trackDb entry # track varMultizMm5Rn3GalGal2Fr1 # shortLabel varMultiz5Way # longLabel Human/Mouse/Rat/Chicken/Fugu Var-Multiz # group compGeno # priority 190 # visibility hide # type maf # elephant human blastz alignment by Robert Aug 11 2004 mkdir /cluster/bluearc/elephant cd /cluster/bluearc/elephant #get reads and qual scores from trace repository for i in `cat trace.lst`; do echo $i ; wget ftp://ftp.ncbi.nih.gov/pub/TraceDB/loxodonta_africana/fasta.loxodonta_africana.$i.gz ; done for i in `cat trace.lst`; do echo $i ; wget ftp://ftp.ncbi.nih.gov/pub/TraceDB/loxodonta_africana/qual.loxodonta_africana.$i.gz ; done for i in `cat trace.lst` ; do zcat fasta.loxodonta_africana.$i.gz > loxodonta_africana.$i.fa ; done #trim reads for i in `cat trace.lst` ; do nice gunzip -c qual.loxodonta_africana.$i.gz > qual.loxodonta_africana.$i; faTrimRead loxodonta_africana.$i.fa qual.loxodonta_africana.$i tmp.$i.fa lift.$i.lft; mv -f tmp.$i.fa loxodonta_africana.$i.fa ; rm -f qual.loxodonta_africana.$i ; done for i in `cat trace.lst`; do faSize -detailed=on loxodonta_africana.$i.fa > mac.$i.len ; done cat mac.0*.len > S2.len for i in `cat trace.lst`; do sed -e s/S2.len/mac.$i.len/ < DEF > DEF.$i ; done #split fa reads into 10mb chunks for blastz run and distribute to i-servers. ssh kkr1u00 for i in `cat trace.lst`; do nice faSplit about loxodonta_africana.$i.fa 10000000 /iscratch/i/elephant/${i}.mac. ; done cd /iscratch/i/elephant find split -name \*.fa > /cluster/bluearc/elephant/mac.lst cd /cluster/bluearc/elephant hgsql hg17 -N < chromLen.sql > S1.len cd /iscratch/i/elephant iSync #setup cluster run to blastz reads to human genome ssh kk cd /cluster/bluearc/elephant BlastZ_run0.sh cd run.0 para create jobList para push #94798 jobs in batch #149 jobs (including everybody's) in Parasol queue. #Checking finished jobs #Completed: 94797 of 94798 jobs #Crashed: 1 jobs #CPU time in finished jobs: 14183153s 236385.89m 3939.76h 164.16d 0.450 y #IO & Wait Time: 310938s 5182.30m 86.37h 3.60d 0.010 y #Average job time: 153s 2.55m 0.04h 0.00d #Longest job: 1770s 29.50m 0.49h 0.02d #Submission to last job: 52186s 869.77m 14.50h 0.60d ssh kkr9 BlastZ_run1.sh cd run.1 para create jobList para push #341 jobs in batch #151 jobs (including everybody's) in Parasol queue. 
#Checking finished jobs
#Completed: 341 of 341 jobs
#CPU time in finished jobs: 142914s 2381.91m 39.70h 1.65d 0.005 y
#IO & Wait Time: 14078s 234.63m 3.91h 0.16d 0.000 y
#Average job time: 460s 7.67m 0.13h 0.01d
#Longest job: 782s 13.03m 0.22h 0.01d
#Submission to last job: 1954s 32.57m 0.54h 0.02d
#generate lst and fa files for each chromosome for faster lavToAxt
cd /cluster/bluearc/elephant
echo "select chrom from chromInfo;" > chrom.sql
hgsql hg17 -B -N < chrom.sql > chrom.lst
for i in `cat chrom.lst` ; do grep -h '>' lav/$i/* | awk '{print $1}' | sed -e 's/"//g' | sed -e 's/>//g' > mac.$i.lst ; echo $i ; done
ssh kki
cd /cluster/bluearc/elephant
mkdir -p splitChrom
/bin/rm splitChrom/*
gensub2 trace.lst chrom.lst gsub.split spec.split
para create spec.split
para push
#322 jobs in batch
#Checking finished jobs
#Completed: 322 of 322 jobs
#CPU time in finished jobs: 819s 13.65m 0.23h 0.01d 0.000 y
#IO & Wait Time: 3278s 54.63m 0.91h 0.04d 0.000 y
#Average job time: 13s 0.21m 0.00h 0.00d
#Longest job: 51s 0.85m 0.01h 0.00d
#Submission to last job: 462s 7.70m 0.13h 0.01d
cd /cluster/bluearc/elephant/splitChrom
for i in `cat /cluster/bluearc/elephant/chrom.lst` ; do cat mac.*.$i.fa > mac.$i.fa ; echo $i ; done
#lav to axt run
ssh kk
cd /cluster/bluearc/elephant
mkdir -p run.2
#NOTE: chr19 must be run on kolossus with 64bit executables
#change SEQ1_DIR from /scratch to /iscratch for mini cluster
. DEF
echo "#LOOP" > run.2/gsub
echo '/cluster/bin/scripts/blastz-contiglav2axt '${BASE}'/lav/$(root1) {check out line+ '${BASE}'/axtChrom/$(root1).axt} '${SEQ1_DIR}' /cluster/bluearc/elephant/splitChrom/mac.$(root1).fa' >> run.2/gsub
echo "#ENDLOOP" >> run.2/gsub
cd run.2
gensub2 ../chrom.lst single gsub jobList
para create jobList
para push
#chrM has no data and crashed
#46 jobs in batch
#Checking finished jobs
#Completed: 45 of 46 jobs
#Crashed: 1 jobs
#CPU time in finished jobs: 249970s 4166.17m 69.44h 2.89d 0.008 y
#IO & Wait Time: 5407s 90.11m 1.50h 0.06d 0.000 y
#Average job time: 5675s 94.58m 1.58h 0.07d
#Longest job: 27065s 451.08m 7.52h 0.31d
#Submission to last job: 47744s 795.73m 13.26h 0.55d
#split reads by prefix so axtBest will fit in memory
mkdir axtByQ
cat mac*.lst | awk -F\| '{print substr($3,1,3)}' | sort -nu > prefix.lst
for i in `cat prefix.lst` ; do cat axtChrom/*.axt | axtFilter -qStartsWith=gnl\|ti\|$i stdin | axtSwap stdin S1.len S2.len stdout | axtSort stdin axtByQ/q$i.axt ; done
mkdir axtByQBest
#lots of memory needed for reciprocal best
ssh kolossus
cd /cluster/bluearc/elephant/axtByQ
for i in `ls *.axt` ; do axtBest -quiet $i all stdout | axtSwap stdin ../S2.len ../S1.len ../axtByQBest/$i ; echo done $i ; done
cd /cluster/bluearc/elephant/axtByQBest
cat q*.axt | axtSplitByTarget stdin .
mkdir axtRecipBest
for i in `cat chrom.lst` ; do axtSort axtByQBest/$i.axt stdout | axtBest stdin $i axtRecipBest/$i.axt ; echo $i ; done
for i in `cat chrom.lst` ; do axtToMaf axtRecipBest/$i.axt S1.len S2.len maf/$i.maf -tPrefix=hg17. -qPrefix=rm1. -scoreZero ; done
for i in `cat chrom.lst` ; do mafFilter -minScore=1000 maf/$i.maf > mafFilter/$i.maf ; done
# record coverage
cd mafFilter
for i in `cat ../chrom.lst` ; do nice mafCoverage hg17 $i.maf -count=2 > $i.cov ; echo done $i ; done

# CHIMP DELS FROM HG16 (DONE 2004-08-17 kate)
# NOTE: this track just for development -- it should be regenerated from the latest
# alignments instead of lifted.
ssh hgwdev
cd /cluster/data/hg17/bed
mkdir -p chimpDels
cd chimpDels
hgsql -s hg16 -e "SELECT * FROM chimpDels" | cut -f 2- > chimpDels.hg16.bed
liftOver chimpDels.hg16.bed \
    /cluster/data/hg16/bed/liftOver/hg16ToHg17.over.chain \
    chimpDels.bed chimpDels.unmapped
wc -l chimpDels.bed chimpDels.unmapped
# 27662 chimpDels.bed
# 132 chimpDels.unmapped
# 27794 total
hgLoadBed hg17 chimpDels chimpDels.bed -noBin

### CREATE chimpFixedDiff -- panTro1 (Daryl, August 18, 2005)
# Convert chimp quality scores from uncompressed to compressed
# chromosome format.  This took 22 minutes on crow.
## previously done for hg16
# cd /cluster/data/panTro1
# cat */chr*.qa | qaToQac stdin chrom.qac
# Make single base pair high quality differences into a bed file
# and load into database
cd /cluster/data/hg17/bed
mkdir chimpFixedDiff
cd chimpFixedDiff
sed 's/simpleNucDiff/chimpFixedDiff/' ~/kent/src/hg/lib/simpleNucDiff.sql > chimpFixedDiffs.sql
# chimpHiQualDiffs was changed to allow different
# quality parameters as command line options
set axtDir = /cluster/data/hg17/bed/blastz.panTro1/axtRBestNet
# This crashed twice at the same place, but ran successfully when
# each chromosome was run separately.
## time chimpFixedDiffs $axtDir /cluster/data/panTro1/chrom.qac chimpFixedDiffs.bed >& chimpFixedDiffs.log
mkdir chroms; cd chroms
ls -1 $axtDir | grep chr | grep axt | sed 's/.axt//' | xargs mkdir
# rmdir chr*random
touch cfd.log
foreach f (chr*)
    echo -n $f " "
    ln -s $axtDir/$f.axt $f/$f.axt
    time nice chimpFixedDiffs $f /cluster/data/panTro1/chrom.qac $f.chimpFixedDiffs.bed >>& cfd.log
end
rm ../chimpFixedDiffs.bed
cat chr*bed > ../chimpFixedDiffs.bed
## The load (sort) ran out of memory on hgwdev, so I sorted the
## file first on kolossus (3 minutes) and then loaded it on hgwdev
ssh kolossus
hgLoadBed -strict -sqlTable=chimpFixedDiffs.sql -noLoad hg17 chimpFixedDiff chimpFixedDiffs.bed
exit
## hgwdev (37 minutes)
hgLoadBed -hasBin -noSort -sqlTable=chimpFixedDiffs.sql hg17 chimpFixedDiff bed.tab
# TODO: need to filter out polymorphic sites (SNPs)

# Load firstEF track (DONE 2004-08-18 braney)
ssh hgwdev
mkdir -p /cluster/data/hg17/bed/firstEF
cd /cluster/data/hg17/bed/firstEF
wget "http://bioinformatics.med.ohio-state.edu/downloads/firstEFMay04.bed.gz"
cat << '_EOF_' > sedScript
s/chr23/chrX/g
s/chr24/chrY/g
/^>/d
/^$/d
/^No/d
'_EOF_'
zcat firstEFMay04.bed.gz | sed -f sedScript | awk "{OFS=\"\t\"} {\$3 +=1; print \$0}" > firstEF.bed
hgLoadBed hg17 firstEF firstEF.bed
rm firstEF.tab
gzip *.bed
#done firstEF

# GENE BOUNDS (RNACLUSTER) (DONE 08-18-2004 Chuck)
# Create rnaCluster table (depends on {est,mrna}OrientInfo)
cd ~sugnet/store1/altSplice/hg17/
mkdir rnaCluster
cd rnaCluster/
mkdir chrom
# Create a list of accessions that come from RAGE libraries and need to be excluded.
~/kent/src/hg/geneBounds/clusterRna/generateRageAccList.csh hg17 rage.libs
foreach f (/cluster/data/hg17/nib/chr*.nib)
    set c = $f:t:r
    set out = chrom/$c.bed
    # Exclude accessions in the RAGE file
    echo clusterRna -mrnaExclude=hg17.rage.libs hg17 /dev/null $out -chrom=$c
    clusterRna -mrnaExclude=hg17.rage.libs hg17 /dev/null $out -chrom=$c
end
hgLoadBed hg17 rnaCluster chrom/*.bed
mkdir /cluster/data/hg17/bed/rnaCluster
cp -r chrom /cluster/data/hg17/bed/rnaCluster

# miRNA track (CORRECTION 2004-12-09 - Hiram)
# Received the following correction from Michel Weber:
# Could you please replace the two lines:
# chr6 72169974 72170045 hsa-mir-30a 480 - 72169977 72169999
# chr6 72170017 72170040 hsa-mir-30a-5p 480 - 72170017 72170040
# by:
# chr6 72169974 72170045 hsa-mir-30a 480 - 72169977 72169999
# chr6 72169974 72170045 hsa-mir-30a 480 - 72170017 72170040
# (The first line remains identical, only the second is changed.  The
# repetition of the hsa-mir-30a entry means that both strands of its
# hairpin structure are matured into microRNAs, named hsa-miR-30a-3p and
# hsa-miR-30a-5p in Rfam database).
ssh hgwdev
cd /cluster/data/hg17/bed/miRNA
mv miRNA_hg17_1.bed miRNA_hg17_1.bed.0
cp miRNA_hg17_1.bed.0 miRNA_hg17_1.bed
# edit miRNA_hg17_1.bed to change the single line.  Then:
mv hg17.bed hg17.bed.0
egrep -v "^track |^browser " miRNA_hg17_1.bed | \
    sed -e "s/miR/mir/g; s/ sa-mir/ hsa-mir/g; s/ /\t/g;" > hg17.bed
# Check that the edit is in place properly:
diff hg17.bed.0 hg17.bed
# and load it
hgLoadBed hg17 miRNA hg17.bed
# Loaded 221 elements of size 8
# featureBits remains the same:
featureBits hg17 miRNA
# 18052 bases of 2866216770 (0.001%) in intersection

# miRNA track (DONE 2004-09-03 - Hiram)(CORRECTED, see above 2004-12-09)
# The source data for this was received via email from Sam
# Griffiths-Jones to Donna 16 August 2004.  In other email Michel
# Weber asked to add one more data line to that file.
# data from: Sam Griffiths-Jones
# and Michel.Weber@ibcg.biotoul.fr
# notify them if this assembly updates to renew this track
cd /cluster/data/hg17/bed
mkdir miRNA
cd miRNA
# one name was missing the h in hsa-mir and one was miR instead of
# mir
egrep -v "^track |^browser " miRNA_hg17_1.bed | \
    sed -e "s/miR/mir/g; s/ sa-mir/ hsa-mir/g; s/ /\t/g;" > hg17.bed
hgLoadBed hg17 miRNA hg17.bed
# compare with previous results, should be relatively similar
# featureBits hg16 miRNA
# 16923 bases of 2865248791 (0.001%) in intersection
# featureBits hg17 miRNA
# 18052 bases of 2866216770 (0.001%) in intersection
# entry is already in trackDb/trackDb.ra

## blastz mRNA track for internal use - Robert 8/12/04
mkdir /cluster/bluearc/hg17/mrnaBlastz
cd /cluster/bluearc/hg17/mrnaBlastz
/cluster/data/genbank/bin/i386/gbGetSeqs -gbRoot=/cluster/data/genbank genbank mrna mrna.fa -db=hg -native
mkdir -p split
faTrimPolyA mrna.fa trim.fa
faSplit about trim.fa 1000000 split/mrna
cp -ip trim.fa /panfs/ucsc.edu/home/scratch/hg17/mrnaBlastz
faSize trim.fa -detailed=on > S2.len
hgsql hg17 < chromInfo.sql > S1.len
BlastZ_run0.sh
cd run.0
para push
para time
#113894 jobs in batch
#207911 jobs (including everybody's) in Parasol queue.
#Checking finished jobs #Completed: 113894 of 113894 jobs #CPU time in finished jobs: 14423845s 240397.41m 4006.62h 166.94d 0.457 y #IO & Wait Time: 334352s 5572.54m 92.88h 3.87d 0.011 y #Average job time: 130s 2.16m 0.04h 0.00d #Longest job: 38301s 638.35m 10.64h 0.44d #Submission to last job: 59841s 997.35m 16.62h 0.69d mkdir run.1 ~angie/hummus/do.out2lav DEF > run.1/j cd run.1 para create j para push para time #341 jobs in batch #208550 jobs (including everybody's) in Parasol queue. #Checking finished jobs #Completed: 341 of 341 jobs #CPU time in finished jobs: 28990s 483.17m 8.05h 0.34d 0.001 y #IO & Wait Time: 43139s 718.98m 11.98h 0.50d 0.001 y #Average job time: 212s 3.53m 0.06h 0.00d #Longest job: 2015s 33.58m 0.56h 0.02d #Submission to last job: 2187s 36.45m 0.61h 0.03d #!/bin/tcsh set base="/cluster/bluearc/hg17/mrnaBlastz" cd $base mkdir -p pslRaw foreach c (lav/*) pushd $c set chr=$c:t set out=$base/pslRaw/$chr.psl echo "Translating $chr lav to $out" cat `ls -1 *.lav | sort -g` \ | lavToPsl stdin stdout \ | sed -e 's@scratch/hg/gs.18/build35/bothMaskedNibs//@@' | sed -e 's/\.nib:[0-9]*-[0-9]*//' > $out popd end mkdir run.2 for i in `awk '{print $1}' S1.len` ; do echo doSortFilter.sh ../pslRaw/$i.psl ../pslFilter/$i.psl >> run.2/spec.dup ; done cd run.2 para create spec.dup para push #46 jobs in batch #3 jobs (including everybody's) in Parasol queue. #Checking finished jobs #Completed: 46 of 46 jobs #CPU time in finished jobs: 4409s 73.48m 1.22h 0.05d 0.000 y #IO & Wait Time: 1082s 18.04m 0.30h 0.01d 0.000 y #Average job time: 119s 1.99m 0.03h 0.00d #Longest job: 3842s 64.03m 1.07h 0.04d #Submission to last job: 3842s 64.03m 1.07h 0.04d cd .. for i in `awk '{print $1}' S1.len` ; do echo axtChain -linearGap=linearGap.txt -psl pslFilter/$i.psl /scratch/hg/gs.18/build35/bothMaskedNibs/ -faQ /panfs/ucsc.edu/home/scratch/hg17/mrnaBlastz/trim.fa chain/$i.chain >> spec.chain ; done para create spec.chain para push cd run.3 para create spec.filter para push cd .. ls /cluster/data/hg17/nib/*.nib > S1.lst #Skip chainPreNet it is not good for mrna #mkdir -p preNet # #cd chainFilter #foreach i ( *.chain) #chainPreNet $i ../S1.len ../S2.len ../preNet/$i #end mkdir run.4 cd run.4 for i in `awk '{print $1}' ../S1.len`; do echo "chainToPsl ../chainFilter/$i.chain ../S1.len ../S2.len ../S1.lst /panfs/ucsc.edu/home/scratch/hg17/mrnaBlastz/trim.fa ../psl/$i.psl" >> spec.chain2psl.new ; done pslCat psl/*psl > mrnaBlastz.psl hgLoadPsl hg17 mrnaBlastz.psl cp trim.fa /cluster/data/hg17/bed/mrnaBlastz/hg17Mrna.fa ln /cluster/data/hg17/bed/mrnaBlastz/hg17Mrna.fa /gbdb/hg17/mrnaBlastz/ -s hgLoadSeq -prefix=bz hg17 /gbdb/hg17/mrnaBlastz/hg17Mrna.fa ## end of blastz Mrna track #### BUILD RETROGENE TRACK (done Robert 8/26/2004) #### REBUILD RETROGENE TRACK (done Robert 12/24/2004 - but no notes - kuhn) # diffs before push to beta: # 1640 hg17.pseudoGeneLink.devOnly # 9639 hg17.pseudoGeneLink.betaOnly # 15091 hg17.pseudoGeneLink.common # RETROGENE TRACK data update - Robert - 2005-04-08 (added by Jen 2006-01-31) - pushQ entry did not include psuedoMrna table. Old table is still present on RR. New data has since been lost on dev. User impact: ~1000 sequence missing links in browser - new all.joiner rule needed to link psuedoMrna to pseudoGeneLink table - current all.joiner rule between knownGene to pseudoGeneLink gives errors. the data types appear to be mismatched. 
pseudoGeneLink.kgName is a gene symbol, not the same identifier as in knownGene.name
- data is to be regenerated soon and errors corrected at that time
mkdir /cluster/data/hg17/bed/pseudo
cd /cluster/data/hg17/bed/pseudo
ls /cluster/data/hg17/nib/*.nib > S1.lst
hgsql hg17 -N -B < allMrna.sql > allMrna.lst
cp /cluster/data/genbank/data/aligned/genbank.142.0/hg17/full/mrna.native.psl.gz .
gunzip mrna.native.psl.gz
awk '{OFS="\t";print $1,$2,$3,$4,$5,$6,$7,$8,$9,substr($10,1,index($10,".")-1),$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23}' mrna.native.psl > mrnaBlat.psl
hgsql hg17 -N -B < refGene.sql > refGene.tab
hgsql hg17 -B -N < mgcGene.sql > mgcGene.tab
cat ../../*/*.fa.out | awk '$5~/chr*/{OFS="\t";print $5,$6,$7}' > rmsk.bed
cd /cluster/bluearc/hg17/mrnaBlastz/
zcat /cluster/data/hg17/bed/blastz.mm5/axtChain/mouseSyn.net.gz | netToBed stdin mouseSyn.bed
hgsql hg17 < mrna.sql | grep -v matches | awk '{OFS="\t"; print $2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22}' > all_mrna.psl
ssh eieio
pslCat -nohead -check all_mrna.psl /cluster/bluearc/hg17/mrnaBlastz/psl/*.psl | awk '{print $0, $1*3-$2}' | sort -k 10,10 -k 22nr -T /tmp | awk '{OFS="\t"; print $1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21}' > blatBlastzHg17.psl
ssh hgwdev
cp blatBlastzHg17.psl /scratch/
tawk '$12 > 1 && $12 < 9999999{x=$11;$11=$12;$12=x;print $0}' /cluster/data/kgDB/bed/hg17/kgBestMrna/sortedKnownGene.tab > sortedKnownGene.tab
#copy files to iServers for cluster run
ssh kkr1u00
/cluster/home/baertsch/bin/i386/pslSplit nohead -chunkSize=121 /iscratch/i/gs.18/build35/pseudo blatBlastzHg17.psl
cd /cluster/data/hg17/bed/pseudo
cp refGene.tab /iscratch/i/gs.18/build35/pseudo
cp /cluster/data/hg17/bed/simpleRepeat/simpleRepeat.bed /iscratch/i/gs.18/build35/pseudo
cp mrnaHg17.fa /iscratch/i/gs.18/build35/pseudo
cp sortedKnownGene.tab /iscratch/i/gs.18/build35/pseudo
cp rmsk.bed /iscratch/i/gs.18/build35/pseudo
cp all_mrna.psl /iscratch/i/gs.18/build35/pseudo
cp mouseSyn.bed /iscratch/i/gs.18/build35/pseudo
for i in `ls tmp*` ; do echo "doBuildkk.sh ${i%%.psl}" ; done | sed -e 's/tmp//g' > ~/hg17/pseudo/spec.kk
cd /iscratch/i/gs.18/build35/pseudo
iSync
para create spec.kk
para push
#post process
# run from eieio
BLUE=/cluster/bluearc/hg17/pseudo
echo catting output
cat $BLUE/pseudoGeneLink[0-9]*.bed | sort -k1,1 -k2,3n > pseudoGeneLinkSort.bed ; /bin/rm $BLUE/pseudoGeneLink[0-9]*.bed
cat $BLUE/pseudo[0-9]*.psl > pseudo.psl ; /bin/rm $BLUE/pseudo[0-9]*.psl &
echo Filtering pseudoGeneLinkSort.bed
tawk '$5 > 10 && $15 > 10000 && $35 > 650 {OFS="\t";print $0}' pseudoGeneLinkSort.bed > pseudoGeneLinkSortFilter.bed
echo Removing Overlaps
doSplit cd /cluster/bluearc/hg17/pseudo/run.o spec.overlap
cd ~/hg17/pseudo
cat /cluster/bluearc/hg17/pseudo/chr*pseudoNoOverlap.bed > pseudoGeneLinkNoOverlap.bed
echo Making psl
awk '{printf("%s\t%s\t%s\n", $5,$2,$3)}' pseudoGeneLinkNoOverlap.bed > pseudoGeneLinkSelect.tab
## 350 is the sacred magic number and will probably change
tawk '$6>=350{print $0}' pseudoGeneLinkNoOverlap.bed > pseudoGeneLinkNoOverlapFilter.bed
pslSelect -qtStart=pseudoGeneLinkSelect.tab pseudo.psl pseudoMrna.psl
echo Loading Bed
hgLoadBed hg17 pseudoGeneLink pseudoGeneLinkNoOverlapFilter.bed -hasBin -sqlTable=/cluster/home/baertsch/kent/src/hg/lib/pseudoGeneLink.sql
echo Loading Psl
hgLoadPsl hg17 pseudoMrna.psl
## end of retroGene track

# 3-WAY MULTIZ MULTIPLE ALIGNMENT (MM5, RN3) (DONE 2004-08-27 kate)
# HMR Maf's needed for regulatory potential track
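# Overview -- per chromosome, the cluster run below reduces to a single
# multiz call (chrN here is illustrative, not from the run log), merging
# the pairwise human/rat maf into the human/mouse maf, with human as the
# reference row in both inputs:
#
#   /cluster/bin/penn/multiz rn3/chrN.maf mm5/chrN.maf - > mm5rn3/chrN.maf
#
# The "rn3/mm5" entry written to species.lst below supplies $(dir1)=rn3
# (the pairwise maf) and $(root1)=mm5 (the alignment being added to), so
# the gensub2 template expands to exactly this shape via doMultiz.csh.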
ssh eieio
set multizDir = multiz.2004-08-27
cd /cluster/data/hg17/bed/$multizDir
set workingDir = /cluster/bluearc/hg17/$multizDir
ln -s $workingDir /cluster/bluearc/hg17/multiz3way
ln -s $multizDir multiz3way
mkdir -p $workingDir
mkdir -p /cluster/data/hg17/bed/$multizDir
# wrapper script for multiz
# NOTE: first arg is pairwise, 2nd arg is multiple (to add to)
# NOTE: next time, modify script so it only needs one arg -- saves the
# multiple dirname in a file for use by the next run
cat << 'EOF' > doMultiz.csh
#!/bin/csh -fe
mkdir -p $3:h
/cluster/bin/penn/multiz $1 $2 - > $3
'EOF'
# << for emacs
cat << 'EOF' > gsub
#LOOP
../doMultiz.csh {check in line /cluster/bluearc/hg17/multiz.2004-08-27/$(dir1)/$(root2).maf} {check in line /cluster/bluearc/hg17/multiz.2004-08-27/$(root1)/$(root2).maf} {check out line+ /cluster/bluearc/hg17/multiz.2004-08-27/$(root1)$(dir1)/$(root2).maf}
#ENDLOOP
'EOF'
# << for emacs
chmod +x doMultiz.csh
ssh eieio
set workingDir = /cluster/bluearc/hg17/multiz.2004-08-27
# copy mafs to bluearc -- mouse
mkdir $workingDir/mm5
cp /cluster/data/hg17/bed/blastz.mm5/mafNet/*.maf \
    $workingDir/mm5
ls $workingDir/mm5/*.maf > chrom.lst
# rat
mkdir $workingDir/rn3
cp /cluster/data/hg17/bed/blastz.rn3/mafNet/chr*.maf $workingDir/rn3
# multiz - add in rn3 rat to human/mouse
# ssh kki
set multizDir = multiz.2004-08-27
set workingDir = /cluster/bluearc/hg17/$multizDir
cd /cluster/data/hg17/bed/$multizDir
mkdir run.rn3
cd run.rn3
echo "rn3/mm5" > species.lst
gensub2 species.lst ../chrom.lst ../gsub jobList
para create jobList
# 47 jobs
para try, check, push, check
# copy 3-way mafs to build directory
ssh eieio
set multizDir = multiz.2004-08-27
set workingDir = /cluster/bluearc/hg17/$multizDir
ln -s $workingDir/mm5rn3 $workingDir/maf
cd /cluster/data/hg17/bed/multiz.2004-08-27
mkdir maf
cp $workingDir/maf/*.maf maf

# BLASTZ TETRAODON (tetNig1) (DONE, 2004-08-26, hartera)
# blastz requires lineage-specific repeats
# Treat all repeats as lineage-specific.
ssh kkr1u00
mkdir /iscratch/i/gs.18/build35/linSpecRep.notInTetraodon
foreach f (/iscratch/i/gs.18/build35/rmsk/chr*.fa.out)
    cp -p $f /iscratch/i/gs.18/build35/linSpecRep.notInTetraodon/$f:t:r:r.out.spec
end
mkdir /iscratch/i/tetNig1/linSpecRep.notInHuman
foreach f (/iscratch/i/tetNig1/rmsk/chr*.fa.out)
    cp -p $f /iscratch/i/tetNig1/linSpecRep.notInHuman/$f:t:r:r.out.spec
end
iSync
ssh kk
mkdir -p /cluster/data/hg17/bed/blastz.tetNig1.2004-08-20
ln -s /cluster/data/hg17/bed/blastz.tetNig1.2004-08-20 \
    /cluster/data/hg17/bed/blastz.tetNig1
cd /cluster/data/hg17/bed/blastz.tetNig1
# abridge repeats.
# Treat all repeats as lineage-specific.
cat << '_EOF_' > DEF
# human vs. Tetraodon
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/i386:/cluster/home/angie/schwartzbin
# use same parameters as for danRer1-fr1
ALIGN=blastz-run
BLASTZ=blastz
BLASTZ_H=2000
BLASTZ_ABRIDGE_REPEATS=1
# TARGET: Human
SEQ1_DIR=/iscratch/i/hg17/bothMaskedNibs
SEQ1_RMSK=
SEQ1_FLAG=
SEQ1_SMSK=/iscratch/i/hg17/linSpecRep.notInTetraodon
SEQ1_IN_CONTIGS=0
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Tetraodon
SEQ2_DIR=/iscratch/i/tetNig1/nib
SEQ2_RMSK=
SEQ2_FLAG=
SEQ2_SMSK=/iscratch/i/tetNig1/linSpecRep.notInHuman
SEQ2_IN_CONTIGS=0
SEQ2_CHUNK=10000000
SEQ2_LAP=0
BASE=/cluster/data/hg17/bed/blastz.tetNig1
DEF=$BASE/DEF
RAW=$BASE/raw
CDBDIR=$BASE
SEQ1_LEN=$BASE/S1.len
SEQ2_LEN=$BASE/S2.len
#DEBUG=1
'_EOF_'
# << this line keeps emacs coloring happy
# save the DEF file in the current standard place
chmod +x DEF
cp DEF ~angie/hummus/DEF.hg17-tetNig1.2004-08-20
# make sure BlastZ_run0.sh, BlastZ_run1.sh and BlastZ_run2.sh scripts
# are in /cluster/data/hg17/jkStuff
# edit BlastZ_run0.sh so directory for blastz is /cluster/bin/penn
bash # if a csh/tcsh user
. ./DEF
/cluster/data/hg17/jkStuff/BlastZ_run0.sh
cd run.0
# check batch looks ok then
para try, check, push, check, ....
# para time
# Completed: 19437 of 19437 jobs
# CPU time in finished jobs: 3225816s 53763.60m 896.06h 37.34d 0.102 y
# IO & Wait Time: 174096s 2901.60m 48.36h 2.01d 0.006 y
# Average job time: 175s 2.92m 0.05h 0.00d
# Longest job: 709s 11.82m 0.20h 0.01d
# Submission to last job: 5324s 88.73m 1.48h 0.06d
# second cluster run: lift raw alignments -> lav dir
ssh kki
cd /cluster/data/hg17/bed/blastz.tetNig1
bash # if a csh/tcsh user
. ./DEF
/cluster/data/hg17/jkStuff/BlastZ_run1.sh
cd run.1
para try, check, push, check etc.
# para time
# Completed: 341 of 341 jobs
# CPU time in finished jobs: 280s 4.66m 0.08h 0.00d 0.000 y
# IO & Wait Time: 2183s 36.39m 0.61h 0.03d 0.000 y
# Average job time: 7s 0.12m 0.00h 0.00d
# Longest job: 41s 0.68m 0.01h 0.00d
# Submission to last job: 469s 7.82m 0.13h 0.01d
# third run: lav -> axt
ssh kki
cd /cluster/data/hg17/bed/blastz.tetNig1
mkdir axtChrom run.2
cd run.2
cat << '_EOF_' > do.csh
#!/bin/csh -ef
cd $1
cat `ls -1 *.lav | sort -g` \
    | lavToAxt stdin /iscratch/i/gs.18/build35/bothMaskedNibs \
        /iscratch/i/tetNig1/nib stdout \
    | axtSort stdin $2
'_EOF_'
# << this line makes emacs coloring happy
chmod a+x do.csh
cat << '_EOF_' > gsub
#LOOP
./do.csh {check in exists $(path1)} {check out line+ /cluster/data/hg17/bed/blastz.tetNig1/axtChrom/$(root1).axt}
#ENDLOOP
'_EOF_'
# << this line makes emacs coloring happy
\ls -1Sd ../lav/chr* > chrom.list
gensub2 chrom.list single gsub jobList
wc -l jobList
head jobList
para create jobList
para try, check, push, check,...
# para time
# Completed: 45 of 46 jobs
# Crashed: 1 jobs
# CPU time in finished jobs: 52s 0.87m 0.01h 0.00d 0.000 y
# IO & Wait Time: 256s 4.27m 0.07h 0.00d 0.000 y
# Average job time: 7s 0.11m 0.00h 0.00d
# Longest job: 36s 0.60m 0.01h 0.00d
# Submission to last job: 275s 4.58m 0.08h 0.00d
# one job crashed because chr6_hla_hap1.axt is empty.  Checked by running it
# again and then looking at the lav file, which has no alignments in it.
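# Before converting to psl, it is worth confirming that chr6_hla_hap1 is
# the only empty or missing output of the lav -> axt run.  A minimal
# sketch (not part of the original run log), using the layout above:
cd /cluster/data/hg17/bed/blastz.tetNig1
foreach d (lav/chr*)
    set c = $d:t
    if (! -s axtChrom/$c.axt) echo "empty or missing: $c.axt"
end
# Only chr6_hla_hap1.axt should be reported; any other name here means a
# job failed silently and should be re-run.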
# translate sorted axt files into psl ssh kolossus cd /cluster/data/hg17/bed/blastz.tetNig1 mkdir -p pslChrom set tbl = "blastzTetNig1" foreach f (axtChrom/chr*.axt) set c=$f:t:r echo "Processing chr $c" /cluster/bin/i386/axtToPsl $f S1.len S2.len pslChrom/${c}_${tbl}.psl end # Load database tables ssh hgwdev cd /cluster/data/hg17/bed/blastz.tetNig1/pslChrom foreach f (./*.psl) /cluster/bin/i386/hgLoadPsl -noTNameIx hg17 $f echo "$f Done" end # original blastzTetNig1: # BLASTZ_H=2000 # BLASTZ_Y=3400 # BLASTZ_L=6000 # BLASTZ_K=2200 # BLASTZ_Q=/cluster/data/blastz/HoxD55.q # BLASTZ_ABRIDGE_REPEATS=1 # featureBits -chrom=chr1 hg17 blastzTetNig1 # 6378680 bases of 222827847 (2.863%) in intersection # featureBits -chrom=chr1 hg17 refGene:cds blastzTetNig1 -enrichment # refGene:cds 1.246%, blastzTetNig1 2.863%, both 0.856%, cover 68.70%, # enrich 24.00x # featureBits -chrom=chr1 hg17 refGene:cds blastzDanRer1 -enrichment # refGene:cds 1.246%, blastzDanRer1 3.934%, both 0.831%, cover 66.72%, # enrich 16.96x # comparable to zebrafish so good # try same parameters with L=8000 # featureBits -chrom=chr1 hg17 refGene:cds blastzTetNig1L8k -enrichment # refGene:cds 1.246%, blastzTetNig1L8k 2.095%, both 0.753%, cover 60.47%, # enrich 28.87x # load chr1 with blastz using just H=2000 and default parameters # featureBits -chrom=chr1 hg17 refGene:cds blastzTetNig1Default -enrichment # refGene:cds 1.246%, blastzTetNig1Default 1.630%, both 0.808%, cover 64.87%, # enrich 39.80x # rows in chr1_blastzTetNig1 tables # blastzTetNig1 95156 # blastzTetNig1L8k 58015 # blastzTetNig1Default 71342 # The default values also used for danRer1 vs fugu give good coverage and # higher enrichment than blastzTetNig1 with less alignments so this will be # used for the blastz track - now called blastzTetNig1. # CHAIN TETRAODON (tetNig1) BLASTZ (DONE, 2004-08-26, hartera) # Make chains with rescored blastz # Run axtChain on little cluster ssh kki cd /cluster/data/hg17/bed/blastz.tetNig1 mkdir -p axtChain/run1 cd axtChain/run1 mkdir out chain ls -1S /cluster/data/hg17/bed/blastz.tetNig1/axtChrom/*.axt \ > input.lst cat << '_EOF_' > gsub #LOOP doChain {check in exists $(path1)} {check out line+ chain/$(root1).chain} {check out line+ out/$(root1).out} #ENDLOOP '_EOF_' # << this line makes emacs coloring happy # Make our own linear gap file with reduced gap penalties, # in hopes of getting longer chains - works well for species at # chicken-human distance or greater cat << '_EOF_' > ../../chickenHumanTuned.gap tablesize 11 smallSize 111 position 1 2 3 11 111 2111 12111 32111 72111 152111 252111 qGap 325 360 400 450 600 1100 3600 7600 15600 31600 56600 tGap 325 360 400 450 600 1100 3600 7600 15600 31600 56600 bothGap 625 660 700 750 900 1400 4000 8000 16000 32000 57000 '_EOF_' # << this line makes emacs coloring happy cat << '_EOF_' > doChain #!/bin/csh axtChain -linearGap=../../chickenHumanTuned.gap $1 \ /iscratch/i/gs.18/build35/bothMaskedNibs \ /iscratch/i/tetNig1/nib $2 >& $3 '_EOF_' # << this line makes emacs coloring happy chmod a+x doChain gensub2 input.lst single gsub jobList para create jobList para try, check, push, check... 
# para time # Completed: 45 of 46 jobs # Crashed: 1 jobs # CPU time in finished jobs: 553s 9.22m 0.15h 0.01d 0.000 y # IO & Wait Time: 102s 1.69m 0.03h 0.00d 0.000 y # Average job time: 15s 0.24m 0.00h 0.00d # Longest job: 56s 0.93m 0.02h 0.00d # Submission to last job: 985s 16.42m 0.27h 0.01d # one job crashed since chr6_hla_hap1.axt is empty - no alignments # now on the cluster server, sort chains ssh kksilo cd /cluster/data/hg17/bed/blastz.tetNig1/axtChain chainMergeSort run1/chain/*.chain > all.chain chainSplit chain all.chain # take a look at score distr's foreach f (chain/*.chain) grep chain $f | awk '{print $2;}' | sort -nr > /tmp/score.$f:t:r echo $f:t:r >> hist5000.out textHistogram -binSize=5000 /tmp/score.$f:t:r >> hist5000.out echo "" end # only chr19 has a very large number of chains with score < 5000 # load chr 1 into table ssh hgwdev cd /cluster/data/hg17/bed/blastz.tetNig1/axtChain/chain hgLoadChain hg17 chr1_chainTetnig1 chr1.chain # featureBits -chrom=chr1 hg17 refGene:cds chainTetnig1Link -enrichment # refGene:cds 1.246%, chainTetnig1Link 1.582%, both 0.805%, cover 64.59%, # enrich 40.83x # try filtering with minScore of 5000 ssh kksilo cd /cluster/data/hg17/bed/blastz.tetNig1/axtChain mv all.chain all.chain.unfiltered chainFilter -minScore=5000 all.chain.unfiltered > all.chain rm -r chain chainSplit chain all.chain ssh hgwdev cd /cluster/data/hg17/bed/blastz.tetNig1/axtChain/chain hgLoadChain hg17 chr1_chainTetNig1Filt5k chr1.chain # featureBits -chrom=chr1 hg17 refGene:cds chainTetNig1Filt5kLink -enrichment # refGene:cds 1.246%, chainTetNig1Filt5kLink 1.487%, both 0.789%, cover 63.33%, # enrich 42.58x # this cleans it up a lot with little reduction in coverage. # check in browser - filtered version looks good. # add all chains for minScore=5000 filtered chains # remove test chain tables for chr1 ssh hgwdev hgsql -e "drop table chr1_chainTetnig1;" hg17 hgsql -e "drop table chr1_chainTetnig1Link;" hg17 hgsql -e "drop table chr1_chainTetNig1Filt5k;" hg17 hgsql -e "drop table chr1_chainTetNig1Filt5kLink;" hg17 cd /cluster/data/hg17/bed/blastz.tetNig1/axtChain/chain foreach i (*.chain) set c = $i:r hgLoadChain hg17 ${c}_chainTetNig1 $i echo done $c end # NET TETRAODON (tetNig1) BLASTZ (DONE, 2004-08-26, hartera) ssh kksilo cd /cluster/data/hg17/bed/blastz.tetNig1/axtChain mkdir preNet cd chain foreach i (*.chain) echo preNetting $i /cluster/bin/i386/chainPreNet $i ../../S1.len ../../S2.len \ ../preNet/$i end cd .. mkdir n1 cd preNet foreach i (*.chain) set n = $i:r.net echo primary netting $i /cluster/bin/i386/chainNet $i -minSpace=1 ../../S1.len ../../S2.len \ ../n1/$n /dev/null end cd .. 
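# (Optional sanity check before netting -- a sketch, not from the
# original log.  chainPreNet only drops chains that cannot contribute
# to the net, so the per-chromosome counts should shrink only modestly:)
foreach i (chain/*.chain)
    echo $i:t `grep -c '^chain' $i` `grep -c '^chain' preNet/$i:t`
end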
cat n1/*.net | /cluster/bin/i386/netSyntenic stdin noClass.net # memory usage 55373824, utime 415 s/100, stime 45 # Add classification info using db tables: cd /cluster/data/hg17/bed/blastz.tetNig1/axtChain # netClass looks for ancient repeats in one of the databases # hg17 has this table - hand-curated by Arian but this is for # human-rodent comparisons so do not use here, use -noAr option mkdir -p /cluster/bluearc/hg17/linSpecRep.notInTetraodon mkdir -p /cluster/bluearc/tetNig1/linSpecRep.notInHuman cp /iscratch/i/hg17/linSpecRep.notInTetraodon/* \ /cluster/bluearc/hg17/linSpecRep.notInTetraodon cp /iscratch/i/tetNig1/linSpecRep.notInHuman/* \ /cluster/bluearc/tetNig1/linSpecRep.notInHuman ssh hgwdev cd /cluster/data/hg17/bed/blastz.tetNig1/axtChain time netClass noClass.net hg17 tetNig1 tetNig1.net \ -tNewR=/cluster/bluearc/hg17/linSpecRep.notInTetraodon \ -qNewR=/cluster/bluearc/tetNig1/linSpecRep.notInHuman -noAr # 54.100u 31.890s 2:20.01 61.4% 0+0k 0+0io 197pf+0w netFilter -minGap=10 tetNig1.net | hgLoadNet hg17 netTetNig1 stdin # featureBits hg17 refGene:cds netTetNig1 -enrichment # refGene:cds 0.978%, netTetNig1 25.095%, both 0.778%, cover 79.53%, # enrich 3.17x # TWINSCAN 1.3 GENE PREDICTIONS (Done, 2004-Aug-26, heather) cd /cluster/data/hg17/bed mkdir twinscan tarFile=hg17_TS13_pseudomasked.tar.gz wget http://genes.cs.wustl.edu/predictions/human/NCBI35/hg17_TS13_pseudomasked.tar.gz wget http://genes.cs.wustl.edu/predictions/human/NCBI35/md5sum.txt # check file transferred correctly grep gz md5sum.txt > gz.sum md5sum $tarFile | diff - gz.sum # extract tar xvfz $tarFile unset tarFile # check that files unzipped and untarred correctly # expect no differences cd chr_gtf grep gtf ../md5sum.txt > md5sum.txt cd ../chr_ptx grep ptx ../md5sum.txt > md5sum.txt cd ../chr_tx grep tx ../md5sum.txt > md5sum.txt cd .. md5sum chr_gtf/* > gtf.sum diff gtf.sum chr_gtf/md5sum.txt md5sum chr_ptx/* > ptx.sum diff ptx.sum chr_ptx/md5sum.txt md5sum chr_tx/* > tx.sum diff tx.sum chr_tx/md5sum.txt # pare down protein FASTA header to id and add missing .a: foreach c (1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 X Y) echo chr$c perl -wpe 's/^(\>\S+)\s.*$/$1.a/' < chr_ptx/chr$c.ptx > chr_ptx/chr$c-fixed.fa end ldHgGene hg17 twinscan chr_gtf/chr*.gtf -gtf -genePredExt hgPepPred hg17 generic twinscanPep chr_ptx/chr*-fixed.fa # MAKE VSTETNIG1 DOWNLOADABLES (DONE, 2004-09-08, hartera) # Replace with gzipped versions (DONE 2004-09-14 kate) ssh kksilo # zip chains and nets cd /cluster/data/hg17/bed/blastz.tetNig1/axtChain cp all.chain tetNig1.chain zip -j /cluster/data/hg17/zip/tetNig1.chain.zip tetNig1.chain rm tetNig1.chain zip -j /cluster/data/hg17/zip/tetNig1.net.zip tetNig1.net ssh hgwdev # copy chains and nets to downloads area set gp = /usr/local/apache/htdocs/goldenPath/hg17 mkdir -p $gp/vsTetNig1 cd $gp/vsTetNig1 mv /cluster/data/hg17/zip/tetNig1*.zip . md5sum *.zip > md5sum.txt # move axt files to downloads area and zip cd /cluster/data/hg17/bed/blastz.tetNig1/axtChrom mkdir -p $gp/vsTetNig1/axtChrom cp -p *.axt $gp/vsTetNig1/axtChrom cd $gp/vsTetNig1/axtChrom gzip *.axt md5sum *.gz > md5sum.txt # Copy over & edit README.txt w/pointers to chain, net formats. 
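# (A quick integrity check on the pushed downloads -- a sketch, assuming
# the vsTetNig1 layout created above; md5sum -c re-verifies each file
# against the checksums just written:)
cd /usr/local/apache/htdocs/goldenPath/hg17/vsTetNig1
md5sum -c md5sum.txt
cd axtChrom
md5sum -c md5sum.txt
# Both runs should report OK for every file.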
# BLASTZ TETRAODON (tetNig1) CLEANUP (DONE, 2004-09-10, hartera)
ssh kksilo
cd /cluster/data/hg17/bed/blastz.tetNig1
nice rm -rf raw &
nice rm -rf lav &
nice rm axtChain/run1/chain/* &
nice gzip {axt,psl}Chrom/* axtChain/{all.chain,*.net} &

# regulatory potential 2X track (WORKING - 2004-09-14 - Hiram)
ssh eieio
mkdir /cluster/store3/gs.18/build35/bed/regPotential2X
mkdir /cluster/store3/gs.18/build35/bed/regPotential3X
cd /cluster/data/hg17/bed
ln -s /cluster/store3/gs.18/build35/bed/regPotential2X .
ln -s /cluster/store3/gs.18/build35/bed/regPotential3X .
cd regPotential2X
wget --timestamping 'http://www.bx.psu.edu/~james/stuff/rp_kit.tgz'
tar xvzf rp_kit.tgz
# fixup the hmr_rp_score.sh and hm_rp_score.sh to set
# RP_DIR=. to read: RP_DIR=/cluster/data/hg17/bed/regPotential2X/rp_kit
# And fix the usage of SHIFT and WINDOW, the following diff shows
# the changes:
# 5c5
# < RP_DIR=/cluster/data/hg17/bed/regPotential2X/rp_kit
# ---
# > RP_DIR=.
# 8,9c8,9
# < MAPPING=rp_kit/hm_5a_mapping.txt
# < MATRIX=rp_kit/hm_5a+3_scoreMatrix.dat
# ---
# > MAPPING=hm_5a_mapping.txt
# > MATRIX=hm_5a+3_scoreMatrix.dat
# 12c12
# < SHIFT=1
# ---
# > SHIFT=5
# 24,25c24,25
# < --shiftAmount $SHIFT \
# < --windowSize $WINDOW \
# ---
# > --shiftAmount 5 \
# > --windowSize 100 \
mkdir maf
for A in `(cd /cluster/data/hg17/bed/blastz.mm5/axtNet; ls chr*.axt)`
do
    C=${A/.axt}
    echo "/cluster/data/hg17/bed/blastz.mm5/axtNet/${A} -> maf/${C}.maf.gz"
    axtToMaf /cluster/data/hg17/bed/blastz.mm5/axtNet/${A} \
        /cluster/data/hg17/chrom.sizes /cluster/data/mm5/chrom.sizes \
        stdout | gzip > maf/${C}.maf.gz
done
# Replace bad chr5 axtNet and mafNet (2006-01-05 kate)
# a valid java runtime is only on hgwdev.  This is a java procedure
ssh hgwdev
cd /cluster/data/hg17/bed/regPotential2X
mkdir rp_scores
# WARNING - the following loop takes almost 12 hours !
for M in maf/chr*.maf.gz
do
    C=${M/.maf.gz}
    C=${C#maf/}
    echo "$M -> rp_scores/$C.score.gz"
    (zcat ${M} | ./rp_kit/hm_rp_score.sh /dev/stdin /dev/stderr 2>&1 >/dev/null) | sort -n | \
        gzip > rp_scores/${C}.score.gz
done
# real 709m55.805s
# user 754m51.030s
# sys 20m11.000s
# Back to the file server to create the wiggle data
ssh eieio
cd /cluster/data/hg17/bed/regPotential2X
mkdir wigData dataLimits
for S in rp_scores/chr*.score.gz
do
    C=${S/.score.gz}
    C=${C#rp_scores/}
    echo "$S -> wigData/$C.wig"
    zcat $S | sort -n | \
        wigAsciiToBinary -chrom=$C -dataSpan=1 \
            -wibFile=wigData/$C stdin 2> dataLimits/$C.limits
done
# real 313m0.567s
# user 285m37.319s
# sys 23m8.301s
# Loading the table on hgwdev
ssh hgwdev
cd /cluster/data/hg17/bed/regPotential2X/wigData
mkdir /gbdb/hg17/wib/regPotential2X
ln -s `pwd`/*.wib /gbdb/hg17/wib/regPotential2X
time hgLoadWiggle -pathPrefix=/gbdb/hg17/wib/regPotential2X \
    hg17 regPotential2X chr*.wig
# real 2m29.668s
# user 0m33.380s
# sys 0m8.200s

# regulatory potential 3X track (WORKING - 2004-09-14 - Hiram)
# Expects groundwork done above in the 2X track
# a valid java runtime is only on hgwdev.  This is a java procedure
ssh hgwdev
cd /cluster/data/hg17/bed/regPotential3X
ln -s ../regPotential2X/rp_kit/hmr_rp_score.sh .
mkdir rp_scores
# WARNING - the following loop takes almost 12 hours !
for M in maf/chr*.maf.gz
do
    C=${M/.maf.gz}
    C=${C#maf/}
    echo "$M -> rp_scores/$C.score.gz"
    (zcat ${M} | ./rp_kit/hm_rp_score.sh /dev/stdin /dev/stderr 2>&1 >/dev/null) | sort -n | \
        gzip > rp_scores/${C}.score.gz
done
# real 613m8.230s
# user 623m7.110s
# sys 20m24.550s
# Back to the file server to create the wiggle data
ssh eieio
cd /cluster/data/hg17/bed/regPotential3X
mkdir wigData dataLimits
for S in rp_scores/chr*.score.gz
do
    C=${S/.score.gz}
    C=${C#rp_scores/}
    echo "$S -> wigData/$C.wig"
    zcat $S | sort -n | \
        wigAsciiToBinary -chrom=$C -dataSpan=1 \
            -wibFile=wigData/$C stdin 2> dataLimits/$C.limits
done
# Loading the table on hgwdev
ssh hgwdev
cd /cluster/data/hg17/bed/regPotential3X/wigData
mkdir /gbdb/hg17/wib/regPotential3X
ln -s `pwd`/*.wib /gbdb/hg17/wib/regPotential3X
time hgLoadWiggle -pathPrefix=/gbdb/hg17/wib/regPotential3X \
    hg17 regPotential3X chr*.wig
# real 1m45.568s
# user 0m32.740s
# sys 0m6.140s

# regulatory potential 5X track (DONE - 2005-09-19 - Daryl)
ssh kkstore02
mkdir -p /cluster/data/hg17/bed/regPotential5X/rp_scores
cd /cluster/data/hg17/bed/regPotential5X/rp_scores
wget -r -l 1 -nH http://www.bx.psu.edu/~james/rp/hg17panTro1mm5rn3canFam1/all_truncate.tar
tar xvf all_truncate.tar
cd /cluster/data/hg17/bed/regPotential5X
mkdir -p wigData dataLimits
cd wigData
## 8 minutes
for S in ../rp_scores/chr*.scores.truncated.gz
do
    C=${S/.scores.truncated.gz}
    C=${C#../rp_scores/}
    echo "$S -> wigData/$C.wig"
    zcat $S | wigEncode stdin $C.wig $C.wib 2> ../dataLimits/$C.limits
done
# Loading the table on hgwdev
ssh hgwdev
cd /cluster/data/hg17/bed/regPotential5X/wigData
mkdir -p /gbdb/hg17/wib/regPotential5X
chmod o+rx /gbdb/hg17/wib/regPotential5X
ln -s /cluster/data/hg17/bed/regPotential5X/wigData/*.wib /gbdb/hg17/wib/regPotential5X
chmod o+r /gbdb/hg17/wib/regPotential5X/ch*wib
time hgLoadWiggle -pathPrefix=/gbdb/hg17/wib/regPotential5X hg17 regPotential5X chr*.wig
# 57.720u 9.960s 2:26.05 46.3% 0+0k 0+0io 213pf+0w

# SGP GENES (DONE 9/17/04 angie)
ssh eieio
mkdir /cluster/data/hg17/bed/sgp
cd /cluster/data/hg17/bed/sgp
foreach chr (`awk '{print $1;}' ../../chrom.sizes`)
    wget http://genome.imim.es/genepredictions/H.sapiens/golden_path_200405/SGP/$chr.gtf
    wget http://genome.imim.es/genepredictions/H.sapiens/golden_path_200405/SGP/$chr.prot
end
# Add ".1" suffix to each item in .prot's, to match transcript_id's in gtf
cp /dev/null sgpPep.fa
foreach f (chr*.prot)
    perl -wpe 's/^(>chr\S+)/$1.1/' $f >> sgpPep.fa
end
ssh hgwdev
cd /cluster/data/hg17/bed/sgp
ldHgGene -gtf -genePredExt hg17 sgpGene chr*.gtf
hgPepPred hg17 generic sgpPep sgpPep.fa
# SGP GENES (UPDATE 1/18/2006) sgpPep table dropped, replaced by hgc generated protein seq in browser

# LIFTOVER RNAGENE FROM HG16 (09/29/04, acs)
cd /cluster/data/hg17/bed
mkdir rnaGene
cd rnaGene
liftOver -gff /cluster/data/hg16/bed/rnaGene/all.gff \
    /cluster/data/hg16/bed/liftOver/hg16ToHg17.over.chain \
    rnaGeneLift.gff rnaGeneMiss.gff
# 7204 records passed, 16 failed
hgsql hg17 < ~/kent/src/hg/lib/rnaGene.sql
hgRnaGenes hg17 rnaGeneLift.gff

# BUILD BioCyc TABLES (DONE 10/1/04 Fan)
- Create bioCycMapDesc table.
CREATE TABLE bioCycMapDesc (
    mapID varchar(40) NOT NULL default '',
    description varchar(255) NOT NULL default '',
    KEY mapID (mapID)
) TYPE=MyISAM;
- Create bioCycPathway table.
CREATE TABLE bioCycPathway (
    kgID varchar(40) NOT NULL default '',
    geneID varchar(40) NOT NULL default '',
    mapID varchar(40) NOT NULL default '',
    KEY kgID (kgID),
    KEY geneID (geneID),
    KEY mapID (mapID)
) TYPE=MyISAM;
Using data files sent by Peter Carp from SRI.  Per Peter's email of
10/1/04, they don't have a more recent update, so the data files
received last year are used.
Save the BioCyc Pathway name and description table as names.txt.
Save the pathway data file as gene-pathway.dat.
Make sure there is no extra ^M at the end of the lines.
hgsql hg17 -e 'LOAD DATA local INFILE "names.txt" into table bioCycMapDesc'
Run the hgBioCyc program to generate the file bioCycPathway.tab.
hgBioCyc gene-pathway.dat hg17
Load into hg17.
hgsql hg17 -e 'LOAD DATA local INFILE "bioCycPathway.tab" into table bioCycPathway'

# MAKING FOLDUTR TABLES (DONE - 2004-10-4 Fan)
# First set up directory structure and extract UTR sequence on hgwdev
ssh hgwdev
mkdir -p /cluster/data/hg17/bed/rnaStruct
cd /cluster/data/hg17/bed/rnaStruct
mkdir -p utr3/split utr5/split utr3/fold utr5/fold
utrFa hg17 knownGene utr3 utr3/utr.fa
utrFa hg17 knownGene utr5 utr5/utr.fa
# Split up files and make files that define job.
ssh kk
cd /cluster/data/hg17/bed/rnaStruct
faSplit sequence utr3/utr.fa 50000 utr3/split/s
faSplit sequence utr5/utr.fa 50000 utr5/split/s
ls -1 utr3/split > utr3/in.lst
ls -1 utr5/split > utr5/in.lst
cd utr3
cat > gsub < rgdQtl.gff
# create rgdQtl.tab
awk '{print $1"\t"$4"\t"$5"\t"$10}' rgdQtl.gff | sed -e 's/Chr/chr/g' | \
    sed -e 's/"//g' | sed -e 's/RGD://g' | sed -e 's/;//g' > rgdQtl.tab
# create rgdQtlLink.tab
awk '{printf "%s\t%s\t", $12, $10; for (i = 14;i <= NF; ++i ) {printf "%s ", $i} printf "\n"} ' rgdQtl.gff | \
    sed -e 's/"//g' | sed -e 's/RGD://g' | sed -e 's/;//g' | sed -e 's/Note//g' > rgdQtlLink.tab
# load rgdQtl table
hgLoadBed hg17 rgdQtl rgdQtl.tab
# check rgdQtl table
checkTableCoords hg17 rgdQtl
# load rgdQtlLink table
hgsql hg17 -e "drop table hg17.rgdQtlLink;"
hgsql hg17 < ~/kent/src/hg/lib/rgdQtlLink.sql
hgsql hg17 -e 'load data local infile "rgdQtlLink.tab" into table hg17.rgdQtlLink;'
# updated trackDb.ra under /kent/src/hg/makeDb/trackDb/human/hg17 and
# added rgdQtl.html.
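# (A small consistency check -- a sketch, not part of the original
# procedure: every QTL loaded into rgdQtl should have a corresponding
# description row in rgdQtlLink, so the two counts should be close:)
hgsql hg17 -N -e "select count(*) from rgdQtl"
hgsql hg17 -N -e "select count(*) from rgdQtlLink"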
#### AFFYMETRIX HG-U133 PLUS TRACK (DONE, 2004-10-11, hartera)
ssh hgwdev
mkdir -p /projects/compbio/data/microarray/affyHuman/HG-U133Plus2
# Go to
# http://www.affymetrix.com/support/technical/byproduct.affx?product=hg-u133-plus
# and download the consensus and exemplar sequences to this directory
cd /projects/compbio/data/microarray/affyHuman/HG-U133Plus2
unzip HG-U133_Plus_2_consensus.zip
unzip HG-U133_Plus_2_exemplar.zip
cat HG-U133_Plus_2_consensus HG-U133_Plus_2_exemplar >> U133Plus2_all.fa
perl -pi.bak -e "s/(consensus|exemplar):HG-U133_Plus_2:/U133+2:/" \
    U133Plus2_all.fa
# remove ";" from probe set names
perl -pi.bak -e "s/;//" U133Plus2_all.fa
# clean up
rm *.zip *.bak
mkdir -p /cluster/data/hg17/bed/affyU133Plus2.2004-10-11
cp U133Plus2_all.fa /cluster/data/hg17/bed/affyU133Plus2.2004-10-11
# Set up cluster job to align consensus/exemplars to hg17
ssh kkr1u00
mkdir -p /iscratch/i/affy
mv /cluster/data/hg17/bed/affyU133Plus2.2004-10-11/U133Plus2_all.fa \
    /iscratch/i/affy
iSync
ssh kk
cd /cluster/data/hg17/bed/affyU133Plus2.2004-10-11
ls -1 /iscratch/i/affy/U133Plus2_all.fa > affy.lst
ls -1 /iscratch/i/gs.18/build35/maskedContigs/* > allctg.lst
cat << '_EOF_' > template.sub
#LOOP
/cluster/bin/i386/blat -fine -mask=lower -minIdentity=95 -ooc=/iscratch/i/gs.18/build35/hg17.11.ooc $(path1) $(path2) {check out line+ psl/$(root1)_$(root2).psl}
#ENDLOOP
'_EOF_'
# << for emacs
gensub2 allctg.lst affy.lst template.sub para.spec
mkdir psl
para create para.spec
para try, para check, para push .....
# para time
# Completed: 380 of 380 jobs
# CPU time in finished jobs: 24533s 408.88m 6.81h 0.28d 0.001 y
# IO & Wait Time: 2180s 36.34m 0.61h 0.03d 0.000 y
# Average job time: 70s 1.17m 0.02h 0.00d
# Longest job: 751s 12.52m 0.21h 0.01d
# Submission to last job: 2425s 40.42m 0.67h 0.03d
# Do sort, best in genome filter, and convert to chromosome coordinates
# to create affyU133Plus2.psl
pslSort dirs raw.psl tmp psl
# use filter parameters for these sequences.  only use alignments that
# cover 30% of sequence and have at least 95% identity in aligned region.
# minAli = 0.97 too high.  low minCover as a lot of n's in these sequences
pslReps -minCover=0.3 -minAli=0.95 -nearTop=0.005 raw.psl contig.psl /dev/null
liftUp affyU133Plus2.psl ../../jkStuff/liftAll.lft warn contig.psl
perl -pi.bak -e "s/U133\+2://" affyU133Plus2.psl
# load into the database
ssh hgwdev
cd /cluster/data/hg17/bed/affyU133Plus2.2004-10-11
hgLoadPsl hg17 affyU133Plus2.psl
# Add sequence data to database
# Copy probe sequence to /gbdb if it isn't already
mkdir -p /gbdb/hgFixed/affyProbes
cd /gbdb/hgFixed/affyProbes
ln -s /projects/compbio/data/microarray/affyHuman/HG-U133Plus2/U133Plus2_all.fa .
cd /cluster/data/hg17/bed/affyU133Plus2.2004-10-11
hgLoadSeq -abbr=U133+2: hg17 /gbdb/hgFixed/affyProbes/U133Plus2_all.fa
# clean up
rm -r psl tmp err contig.psl raw.psl *.bak psl.tab seq.tab
# Added knownToU133Plus2 track (2004-10-14) - see GeneSorter section

#### MAF COVERAGE FIGURES FOR ADAM (DONE 10/18/04 angie)
# First, get ranges of target coverage:
ssh eieio
mkdir /cluster/data/hg17/bed/var_multiz.2004-08-12/coverage
cd /cluster/data/hg17/bed/var_multiz.2004-08-12/coverage/
cat /cluster/data/hg17/bed/var_multiz.2004-08-12/maf.09-12-04/*.maf \
    | nice mafRanges -notAllOGap stdin hg17 hg17.mafRanges.bed
# Get pairwise coverage as well.
ssh kolossus cd /cluster/data/hg17/bed/var_multiz.2004-08-12/coverage/ cat /cluster/bluearc/hg17/multiz8way/rn3/*.maf \ | nice mafRanges -notAllOGap stdin hg17 hg17.rn3.mafRanges.bed cat /cluster/bluearc/hg17/multiz8way/mm5/*.maf \ | nice mafRanges -notAllOGap stdin hg17 hg17.mm5.mafRanges.bed cat /cluster/bluearc/hg17/multiz8way/galGal2/*.maf \ | nice mafRanges -notAllOGap stdin hg17 hg17.galGal2.mafRanges.bed cat /cluster/bluearc/hg17/multiz8way/fr1/*.maf \ | nice mafRanges -notAllOGap stdin hg17 hg17.fr1.mafRanges.bed ssh hgwdev cd /cluster/data/hg17/bed/var_multiz.2004-08-12/coverage # To make subsequent intersections a bit quicker, output a bed with # duplicate/overlapping ranges collapsed: nice featureBits hg17 hg17.mafRanges.bed \ -bed=hg17.mafRangesCollapsed.bed #1147548420 bases of 2866216770 (40.037%) in intersection foreach other (mm5 rn3 galGal2 fr1) nice featureBits hg17 hg17.$other.mafRanges.bed \ -bed=hg17.${other}.mafRangesCollapsed.bed end #1013348528 bases of 2866216770 (35.355%) in intersection #975533772 bases of 2866216770 (34.036%) in intersection #101623034 bases of 2866216770 (3.546%) in intersection #46737824 bases of 2866216770 (1.631%) in intersection # mafCoverage barfs currently, so pass on this for now: #cat ../maf.09-12-04/*.maf \ #| nice mafCoverage -count=2 hg17 stdin > hg17.mafCoverage # Intersect maf target coverage with gene regions -- # use Adam's knownGene region files: nice featureBits hg17 -enrichment \ ../phastCons/stats2/knownGenesCds.bed \ hg17.mafRangesCollapsed.bed \ -bed=hg17.mafCds.bed #knownGenesCds.bed 1.166%, hg17.mafRangesCollapsed.bed 40.037%, both 1.111%, cover 95.36%, enrich 2.38x nice featureBits hg17 -enrichment \ ../phastCons/stats2/knownGenesUtr3.bed \ hg17.mafRangesCollapsed.bed \ -bed=hg17.mafUtr3.bed #knownGenesUtr3.bed 0.918%, hg17.mafRangesCollapsed.bed 40.037%, both 0.662%, cover 72.18%, enrich 1.80x nice featureBits hg17 -enrichment \ ../phastCons/stats2/knownGenesUtr5.bed \ hg17.mafRangesCollapsed.bed \ -bed=hg17.mafUtr5.bed #knownGenesUtr5.bed 0.266%, hg17.mafRangesCollapsed.bed 40.037%, both 0.198%, cover 74.42%, enrich 1.86x # Intersect pairwise target coverages with gene regions: foreach other (mm5 rn3 galGal2 fr1) nice featureBits hg17 -enrichment \ ../phastCons/stats2/knownGenesCds.bed \ hg17.$other.mafRangesCollapsed.bed -bed=hg17.${other}Cds.bed nice featureBits hg17 -enrichment \ ../phastCons/stats2/knownGenesUtr3.bed \ hg17.$other.mafRangesCollapsed.bed -bed=hg17.${other}Utr3.bed nice featureBits hg17 -enrichment \ ../phastCons/stats2/knownGenesUtr5.bed \ hg17.$other.mafRangesCollapsed.bed -bed=hg17.${other}Utr5.bed end #knownGenesCds.bed 1.166%, hg17.mm5.mafRangesCollapsed.bed 35.355%, both 1.093%, cover 93.74%, enrich 2.65x #knownGenesUtr3.bed 0.918%, hg17.mm5.mafRangesCollapsed.bed 35.355%, both 0.618%, cover 67.37%, enrich 1.91x #knownGenesUtr5.bed 0.266%, hg17.mm5.mafRangesCollapsed.bed 35.355%, both 0.186%, cover 69.81%, enrich 1.97x #knownGenesCds.bed 1.166%, hg17.rn3.mafRangesCollapsed.bed 34.036%, both 1.071%, cover 91.85%, enrich 2.70x #knownGenesUtr3.bed 0.918%, hg17.rn3.mafRangesCollapsed.bed 34.036%, both 0.597%, cover 65.09%, enrich 1.91x #knownGenesUtr5.bed 0.266%, hg17.rn3.mafRangesCollapsed.bed 34.036%, both 0.179%, cover 67.33%, enrich 1.98x #knownGenesCds.bed 1.166%, hg17.galGal2.mafRangesCollapsed.bed 3.546%, both 0.779%, cover 66.84%, enrich 18.85x #knownGenesUtr3.bed 0.918%, hg17.galGal2.mafRangesCollapsed.bed 3.546%, both 0.194%, cover 21.12%, enrich 5.96x #knownGenesUtr5.bed 0.266%, 
hg17.galGal2.mafRangesCollapsed.bed 3.546%, both 0.056%, cover 21.03%, enrich 5.93x #knownGenesCds.bed 1.166%, hg17.fr1.mafRangesCollapsed.bed 1.631%, both 0.714%, cover 61.26%, enrich 37.57x #knownGenesUtr3.bed 0.918%, hg17.fr1.mafRangesCollapsed.bed 1.631%, both 0.073%, cover 7.92%, enrich 4.86x #knownGenesUtr5.bed 0.266%, hg17.fr1.mafRangesCollapsed.bed 1.631%, both 0.039%, cover 14.82%, enrich 9.09x # ALTERNATIVE CPG ISLANDS (DONE 10/14/04 angie) ssh eieio nice tcsh mkdir /cluster/data/hg17/bed/cpgIslandAlt cd /cluster/data/hg17/bed/cpgIslandAlt # Try cpg_ash (WUSTL program modified to not chop islands in half before # scoring) with default params: cp /dev/null cpg_ash.default.cpg foreach f (../../?{,?}/chr*.fa.masked) echo running on $f:t:r:r ~angie/cb/hg3rdParty/cpgIslands/cpg_ash.exe $f >> cpg_ash.default.cpg end awk -f ../cpgIsland/filter.awk cpg_ash.default.cpg > cpgIslandAlt.bed # Run Andy Law's script on masked seq: cp /dev/null cpgIslandGgfAndyMasked.bed foreach f (../../?{,?}/chr*.fa.masked) set chr = $f:t:r:r echo running on $chr /cluster/home/angie/bin/$MACHTYPE/preProcGgfAndy $f \ | /cluster/home/angie/ggf-andy-cpg-island.pl \ | perl -wpe 'chomp; ($s,$e,$cpg,$n,$c,$g,$oE) = split("\t"); $s--; \ $gc = $c + $g; $pCpG = (100.0 * 2 * $cpg / $n); \ $pGc = (100.0 * $gc / $n); \ $_ = "'$chr'\t$s\t$e\tCpG: $cpg\t$n\t$cpg\t$gc\t" . \ "$pCpG\t$pGc\t$oE\n";' \ >> cpgIslandGgfAndyMasked.bed end # Compare enrichment for knownGene upstream -- an uphill battle for # programs closer to meeting the stated length, GC, O/E params! ssh hgwdev nice featureBits hg17 -enrichment knownGene:upstream:1000 \ /cluster/data/hg17/bed/cpgIsland/cpgIsland.bed #knownGene:upstream:1000 0.857%, cpgIsland.bed 0.741%, both 0.166%, cover 19.37%, enrich 26.13x nice featureBits hg17 -enrichment knownGene:upstream:1000 \ /cluster/data/hg17/bed/cpgIslandAlt/cpgIslandAlt.bed #knownGene:upstream:1000 0.857%, cpgIslandAlt.bed 1.075%, both 0.200%, cover 23.38%, enrich 21.76x nice featureBits hg17 -enrichment knownGene:upstream:1000 \ /cluster/data/hg17/bed/cpgIslandAlt/cpgIslandGgfAndyMasked.bed #knownGene:upstream:1000 0.857%, cpgIslandGgfAndyMasked.bed 1.964%, both 0.292%, cover 34.06%, enrich 17.34x cd /cluster/data/hg17/bed/cpgIslandAlt sed -e 's/cpgIslandExt/cpgIslandAlt/g' \ ~/kent/src/hg/lib/cpgIslandExt.sql > cpgIslandAlt.sql hgLoadBed -noBin -tab -sqlTable=cpgIslandAlt.sql \ hg17 cpgIslandAlt cpgIslandAlt.bed #Loaded 29998 elements of size 10 sed -e 's/cpgIslandExt/cpgIslandGgfAndyMasked/g' \ ~/kent/src/hg/lib/cpgIslandExt.sql > cpgIslandGgfAndyMasked.sql hgLoadBed -noBin -tab -sqlTable=cpgIslandGgfAndyMasked.sql \ hg17 cpgIslandGgfAndyMasked cpgIslandGgfAndyMasked.bed #Loaded 80555 elements of size 10 # Quick length stats: hgsql hg17 -e 'select min(length), avg(length), max(length) from cpgIslandExt' #| 201 | 764.1913 | 40058 | hgsql hg17 -e 'select min(length), avg(length), max(length) from cpgIslandAlt' #| 200 | 1026.9194 | 32440 | hgsql hg17 -e 'select min(length), avg(length), max(length) from cpgIslandGgfAndyMasked' #| 200 | 698.8257 | 100308 | # 1/26/05: Make better island names in cpgIslandGgfAndyMasked, # for Dave Burt's cross-species island comparisons. ssh eieio cd /cluster/data/hg17/bed/cpgIslandAlt mv cpgIslandGgfAndyMasked.bed cpgIslandGgfAndyMasked.bed.orig perl -wpe '@w=split("\t"); $w[3] = "hg17.$w[0]." . ($w[1]+1) . 
".$w[2]"; \ $_ = join("\t", @w);' \ cpgIslandGgfAndyMasked.bed.orig \ > cpgIslandGgfAndyMasked.bed # Now liftOver islands from mm5, rn3, galGal2: ssh kolossus cd /cluster/data/hg17/bed/cpgIslandAlt foreach match (50 95) liftOver /cluster/data/mm5/bed/cpgIslandGgfAndy/cpgIslandGgfAndyMasked.bed \ /cluster/data/mm5/bed/bedOver/mm5Tohg17.chain -minMatch=0.$match \ cpgIslandGAMFromMm5_$match.bed cpgIslandGAMFromMm5_$match.unmapped liftOver /cluster/data/rn3/bed/cpgIslandGgfAndy/cpgIslandGgfAndyMasked.bed \ /cluster/data/rn3/bed/bedOver/rn3ToHg17.over.chain -minMatch=0.$match \ cpgIslandGAMFromRn3_$match.bed cpgIslandGAMFromRn3_$match.unmapped liftOver /cluster/data/galGal2/bed/cpgIslandGgfAndy/cpgIslandGgfAndyMasked.bed \ /cluster/data/galGal2/bed/bedOver/galGal2ToHg17.over.chain -minMatch=0.$match \ cpgIslandGAMFromGalGal2_$match.bed cpgIslandGAMFromGalGal2_$match.unmapped end # Load up the renamed islands as well as ssh hgwdev cd /cluster/data/hg17/bed/cpgIslandAlt hgLoadBed -noBin -tab -sqlTable=cpgIslandGgfAndyMasked.sql \ hg17 cpgIslandGgfAndyMasked cpgIslandGgfAndyMasked.bed # MAKE UNIGENE/SAGE TRACK (DONE - 2004-10-15 Fan) # First get SAGE data and determine which version of UniGene to use first ssh hgwdev cd ~/kent/src/hg/sage make # XXX = uniGene build for which SAGE was built -- not necessarily current! # Figure out the build number by peeking at this file: wget -O - ftp://ftp.ncbi.nih.gov/pub/sage/map/info.txt 2> /dev/null # UniGene Build #44 Arabidopsis thaliana # UniGene Build #61 Bos taurus # UniGene Build #16 Caenorhabditis elegans # UniGene Build #171 Homo sapiens # UniGene Build #19 Medicago truncatula # UniGene Build #138 Mus musculus # UniGene Build #52 Oryza sativa # UniGene Build #14 Pinus taeda # UniGene Build #132 Rattus norvegicus # UniGene Build #27 Sus scrofa # UniGene Build #38 Triticum aestivum # UniGene Build #11 Vitis vinifera # UniGene Build #41 Zea mays # From above info, set Version 171 for hg17 ls /projects/cc/hg/sugnet/uniGene # set Version = XXX set Version=171 mkdir /projects/cc/hg/sugnet/sage/sage.$Version cd /projects/cc/hg/sugnet/sage/sage.$Version wget -r -nH --cut-dirs=2 --timestamp ftp://ftp.ncbi.nih.gov/pub/sage/map/Hs wget -r -nH --cut-dirs=2 --timestamp ftp://ftp.ncbi.nih.gov/pub/sage/map/readme.txt wget -r -nH --cut-dirs=2 --timestamp ftp://ftp.ncbi.nih.gov/pub/sage/map/info.txt wget -r -nH --cut-dirs=2 --timestamp ftp://ftp.ncbi.nih.gov/pub/sage/extr wget -r -nH --cut-dirs=2 --timestamp ftp://ftp.ncbi.nih.gov/pub/sage/info # That downloaded about 1 GB of data cd map/Hs/NlaIII unzip -j SAGEmap_tag_ug-rel.zip cd ../../../extr/ ../../scripts/summarizeCounts.pl expCounts.tab ./SAGE_* ../../scripts/countGenesPerTag.pl expCounts.tab allTags.count.tab ../../scripts/createArraysForTags.pl allTags.count.tab tagExpArrays.tab \ ./SAGE_* ../../scripts/countsPerExp.pl expCounts.tab expList.tab cd ../map/Hs/NlaIII/ cat << '_EOF_' > /tmp/t.pl #!/usr/local/bin/perl while (<>) { chomp($_); @p = split(/\t/, $_); print "$p[2]\t$p[3]\t$p[0]\n"; } '_EOF_' chmod +x /tmp/t.pl cat SAGEmap_tag_ug-rel | /tmp/t.pl | sort | sed -e 's/ /_/g' \ > SAGEmap_ug_tag-rel_Hs cd ../../../extr createSageSummary ../map/Hs/NlaIII/SAGEmap_ug_tag-rel_Hs \ tagExpArrays.tab sageSummary.sage # Create the uniGene alignments # /cluster/data/hg17/uniGene/hg17.uniGene.lifted.pslReps.psl # Download of the latest UniGene version is now automated by a # cron job -- see /cluster/home/angie/crontab , # /cluster/home/angie/unigeneVers/unigene.csh . 
# If hgwdev gets rebooted, that needs to be restarted... maybe there's # a more stable place to set up that cron job. # substitute XXX -> the uniGene version used by SAGE. # set Version = XXX set Version = 171 (bash: export Version=171) cd /projects/cc/hg/sugnet/uniGene/uniGene.$Version gunzip Hs.seq.uniq.gz Hs.data.gz ../countSeqsInCluster.pl Hs.data counts.tab ../parseUnigene.pl Hs.seq.uniq Hs.seq.uniq.simpleHeader.fa leftoverData.tab # Distribute UniGene sequence to /iscratch/i/ (kkstore can see /projects) ssh kkstore set Version = 171 # same as above mkdir -p /iscratch/i/uniGene.$Version cp -p \ /projects/cc/hg/sugnet/uniGene/uniGene.$Version/Hs.seq.uniq.simpleHeader.fa \ /iscratch/i/uniGene.$Version ssh kkr1u00 iSync ssh kk set Version = 171 # same as above mkdir -p /cluster/data/hg17/bed/uniGene.$Version cd /cluster/data/hg17/bed/uniGene.$Version ls -1S /scratch/hg/gs.18/build35/maskedContigs/*.fa > allctg.lst ls -1S /iscratch/i/uniGene.$Version/Hs.seq.uniq.simpleHeader.fa \ > uniGene.lst cat << '_EOF_' > template.sub #LOOP /cluster/bin/i386/blat -mask=lower -minIdentity=95 -ooc=/scratch/hg/h/11.ooc $(path1) $(path2) {check out line+ psl/$(root1)_$(root2).psl} #ENDLOOP '_EOF_' gensub2 allctg.lst uniGene.lst template.sub para.spec para create para.spec mkdir psl para try para check para push # Completed: 380 of 380 jobs # CPU time in finished jobs: 35994s 599.91m 10.00h 0.42d 0.001 y # IO & Wait Time: 1812s 30.19m 0.50h 0.02d 0.000 y # Average job time: 99s 1.66m 0.03h 0.00d # Longest job: 1497s 24.95m 0.42h 0.02d # Submission to last job: 1551s 25.85m 0.43h 0.02d ssh eieio set Version = 171 # same as above cd /cluster/data/hg17/bed/uniGene.$Version pslSort dirs raw.psl tmp psl >& pslSort.log liftUp -type=.psl stdout ../../jkStuff/liftAll.lft warn raw.psl \ | pslReps -minCover=0.2 -sizeMatters -minAli=0.965 -nearTop=0.002 \ stdin hg17.uniGene.lifted.pslReps.psl /dev/null # Processed 141416 alignments # use hg17.uniGene.lifted.pslReps.psl for building UNIGENE/SAGE track. ssh hgwdev set Version = 171 cd /projects/cc/hg/sugnet/sage/sage.$Version/extr addAveMedScoreToPsls \ /cluster/data/hg17/bed/uniGene.$Version/hg17.uniGene.lifted.pslReps.psl \ sageSummary.sage uniGene.wscores.bed hgLoadBed hg17 uniGene_2 uniGene.wscores.bed hgsql hg17 < ~kent/src/hg/lib/sage.sql echo "load data local infile 'sageSummary.sage' into table sage" \ | hgsql hg17 cd ../info ../../scripts/parseRecords.pl ../extr/expList.tab > sageExp.tab hgsql hg17 < ~/kent/src/hg/lib/sageExp.sql echo "load data local infile 'sageExp.tab' into table sageExp" | hgsql hg17 # update ~/kent/src/hg/makeDb/trackDb/human/hg17/uniGene_2.html # with current uniGene date. # CREATE kgSpAlias TABLE FOR PB (Done 10/20/04) hgsql hg17 -e \ 'select kgXref.kgID, spID, alias from kgXref, kgAlias where kgXref.kgID=kgAlias.kgID' >j.tmp hgsql hg17 -e \ 'select kgXref.kgID, spID, alias from kgXref, kgProtAlias where kgXref.kgID=kgProtAlias.kgID'\ >>j.tmp cat j.tmp|sort -u |grep -v 'kgID' >hg17.kgSpAlias.tab rm j.tmp hgsql hg17 -e 'drop table kgSpAlias'; hgsql hg17 < ~/src/hg/lib/kgSpAlias.sql hgsql hg17 -e 'load data local infile "hg17.kgSpAlias.tab" into table kgSpAlias' # SEGMENTAL DUPLICATIONS (DONE 10/21/04 angie) ssh hgwdev mkdir /cluster/data/hg17/bed/genomicSuperDups cd /cluster/data/hg17/bed/genomicSuperDups # A tar file containing files for both hg16 and hg17 was downloaded into # /cluster/data/hg16/bed/genomicSuperDups; move over the hg17 part. mv /cluster/data/hg16/bed/genomicSuperDups/bd35 . 
cd bd35
# A note from Xinwei She about the contents:
# Build35 contains only 2 tables: genomicSuperDups and celeraDupPositive.
# use tail +2 to skip past the header line:
# actually, celeraDupPositive.tab.gz has one extra bogus line so +3 for it:
zcat celeraDupPositive.tab.gz | tail +3 \
| hgLoadBed -tab -sqlTable=$HOME/kent/src/hg/lib/celeraDupPositive.sql \
  hg17 celeraDupPositive stdin
zcat genomicSuperDups.tab.gz | tail +2 \
| hgLoadBed -tab -sqlTable=$HOME/kent/src/hg/lib/genomicSuperDups.sql \
  hg17 genomicSuperDups stdin
# clean up
rm bed.tab

# ECGENE TRACK (DONE, 2004-10-29, hartera)
ssh eieio
mkdir -p /cluster/data/hg17/bed/ECgene.2004-10-27
ln -s /cluster/data/hg17/bed/ECgene.2004-10-27 \
  /cluster/data/hg17/bed/ECgene
cd /cluster/data/hg17/bed/ECgene
wget \
"http://genome.ewha.ac.kr/ECgene/download/v1.2_ECgene/v1.2_hg17_low_gene.txt.gz"
wget \
"http://genome.ewha.ac.kr/ECgene/download/v1.2_ECgene/v1.2_hg17_low_pep.txt.gz"
gunzip *.gz
# load database
ssh hgwdev
cd /cluster/data/hg17/bed/ECgene
ldHgGene -predTab hg17 ECgene v1.2_hg17_low_gene.txt
# 646778 gene predictions
hgPepPred hg17 tab ECgenePep v1.2_hg17_low_pep.txt
rm *.tab
nice gzip *.txt

# LOAD ENSEMBL GENES (DONE, 2004-11-19, hartera)
mkdir /cluster/data/hg17/bed/ensembl
cd /cluster/data/hg17/bed/ensembl
# Get the ensembl protein data from
# http://www.ensembl.org/Homo_sapiens/martview
# Follow this sequence through the pages:
# Page 1) Make sure that the Homo_sapiens choice is selected. Hit next.
# Page 2) Uncheck the "Limit to" box in the region choice. Then hit next.
# Page 3) Choose the "Structures" box.
# Page 4) Choose GTF as the output. Choose gzip compression. Hit export.
# Save as ensemblGene.gtf.gz
# Ensembl handles random chromosomes differently than we do, so we
# strip this data. Fortunately it just loses a couple of genes.
# Add "chr" to front of each line in the gene data gtf file to make
# it compatible with our software.
# Finally, get rid of the ".1" or ".2" after the name
gunzip -c ensemblGene.gtf.gz \
| grep -v ^6_DR51 \
| grep -v ^DR51 \
| grep -v ^DR52 \
| grep -v ^DR53 \
| grep -v _NT_ \
| perl -wpe 's/^([0-9]|X|Y|Un|MT)/chr$1/ \
  || die "Line $. doesnt start with human chrom:\n$_"' \
| sed -e 's/\..\"/\"/g' \
| sed -e 's/chrMT_NC_001807/chrM/' \
> ensGene.gtf
ssh hgwdev
cd /cluster/data/hg17/bed/ensembl
/cluster/bin/i386/ldHgGene hg17 ensGene ensGene.gtf
# Read 33666 transcripts in 696579 lines in 1 files
# 33666 groups 25 seqs 1 sources 4 feature types
# 33666 gene predictions
# ensGtp associates geneId/transcriptId/proteinId for hgPepPred and
# hgKnownToSuper. Use ensMart to create it as above, except:
# Page 3) Choose the "Features" box. In "Ensembl Attributes", check
# Ensembl Gene ID, Ensembl Transcript ID, Ensembl Peptide ID.
# Choose Text, tab-separated as the output format. Result name ensGtp.
# Save file as ensGtp.txt.gz
gunzip ensGtp.txt.gz
hgsql hg17 < ~/kent/src/hg/lib/ensGtp.sql
# remove header line from ensGtp.txt
echo "load data local infile 'ensGtp.txt' into table ensGtp" | hgsql -N hg17
# Load Ensembl peptides:
# Get them from ensembl as above in the gene section except for
# Page 3) Choose the "Sequences" box.
# Page 4) Transcripts/Proteins. Peptide. Format = FASTA.
# Save file as ensemblPep.fa.gz
gunzip ensemblPep.fa.gz
hgPepPred hg17 ensembl ensemblPep.fa

# UPDATE KNOWN GENES TABLES (DONE 11/22/04 Fan)
# Make sure the protein databases (sp041115 and proteins041115) were
# built first.
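# (A quick pre-flight check, not from the original log; hgsql wraps the
# local mysql, so the two databases can be listed directly:)
hgsql mysql -N -e 'show databases like "sp041115"'
hgsql mysql -N -e 'show databases like "proteins041115"'
# each should print one row before proceeding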
hgsql hg17 -e "create database kgHg17B"
mkdir -p /cluster/store8/kg/kgHg17B
cd /cluster/store6/kgDB/bed
ln -s /cluster/store8/kg/kgHg17B kgHg17B
cd kgHg17B
~/src/hg/protein/KGprocess.sh kgHg17B hg17 041115
# Found gbGetSeqs changed the format of the mrna.fa output file
# (extra version number). Updated KGprocess.sh and manually re-ran
# the following:
grep "^>" mrna.fa |awk '{print $1}' > mrna.lis
kgGetPep 041115 > mrnaPep.fa
hgKgMrna kgHg17BTemp mrna.fa mrna.ra tight_mrna.psl ll/loc2ref \
    mrnaPep.fa ll/mim2loc ${PDB} > kgHg17BKgMrna.out 2> kgHg17BKgMrna.err
# then run KGprocess.sh again to continue processing.
~/src/hg/protein/KGprocess.sh kgHg17B hg17 041115
hgsql hg17 -e "select * from chromInfo" > chromInfo.tab
getDbTabDef hg17 chromInfo >chromInfo.sql
hgsql kgHg17B < chromInfo.sql
hgsql kgHg17B -e 'load data local infile "chromInfo.tab" into table chromInfo ignore 1 lines'
# Rebuild the kgSpAlias table:
hgsql hg17 -e \
'select kgXref.kgID, spID, alias from kgXref, kgAlias where kgXref.kgID=kgAlias.kgID' > j.tmp
hgsql hg17 -e \
'select kgXref.kgID, spID, alias from kgXref, kgProtAlias where kgXref.kgID=kgProtAlias.kgID'\
>>j.tmp
cat j.tmp|sort -u |grep -v 'kgID' >hg17.kgSpAlias.tab
rm j.tmp
hgsql hg17 -e 'drop table kgSpAlias';
hgsql hg17 < ~/src/hg/lib/kgSpAlias.sql
hgsql hg17 -e 'load data local infile "hg17.kgSpAlias.tab" into table kgSpAlias'
gzip kgSpAlias.tab

# Create hg17GeneList.html (to be used by Google).
# This step was done 12/08/04.
cd /cluster/data/hg17/bed
mkdir geneList
cd geneList
wget -O hg17GeneList.html "http://hgwdev-fanhsu.cse.ucsc.edu/cgi-bin/hgGeneList?db=hg17"
cp -p hg17GeneList.html /usr/local/apache/htdocs/goldenPath
# Check this html file into CVS.

# CREATE FULL TEXT INDEX FOR KNOWN GENES (DONE 1/19/2006 JK)
# This depends on the go and uniProt databases as well as
# the kgAlias and kgProAlias tables. The hgKgGetText takes
# about 5 minutes when the database is not too busy. The rest
# is real quick.
ssh hgwdev
cd /cluster/data/hg17/bed/kgHg17F
mkdir index
cd index
hgKgGetText hg17 knownGene.text
ixIxx knownGene.text knownGene.ix knownGene.ixx
ln -s /cluster/data/hg17/bed/kgHg17F/index/knownGene.ix /gbdb/hg17/knownGene.ix
ln -s /cluster/data/hg17/bed/kgHg17F/index/knownGene.ixx /gbdb/hg17/knownGene.ixx

# UPDATE TABLES NEEDED BY hgGene (DONE 11/30/04 Fan)
# UPDATE BioCyc TABLES
hgsql hg17 -e 'delete from bioCycPathway'
hgsql hg17 -e 'delete from bioCycMapDesc'
# Using data files sent by Peter Karp from SRI;
# per Peter's email of 10/1/04, they don't have a recent update,
# so data files received last year are used.
# Save the BioCyc pathway name and description table as pathway-names.dat.
# Save the pathway data file as gene-pathway.dat.
# Make sure there is no extra ^M at end of the lines.
# Run hgBioCyc program to generate the file bioCycPathway.tab.
hgBioCyc gene-pathway.dat hg17
# Load results into hg17.
LOAD DATA local INFILE 'pathway-names.dat' into table bioCycMapDesc;
LOAD DATA local INFILE 'bioCycPathway.tab' into table bioCycPathway;

# REBUILD FOLDUTR TABLES (DONE - 2004-11-30 Fan)
# First set up directory structure and extract UTR sequence on hgwdev
ssh hgwdev
cd /cluster/data/hg17/bed
mv rnaStruct rnaStruct.2004-10-04
mkdir -p /cluster/data/hg17/bed/rnaStruct.2004-11-30
ln -s rnaStruct.2004-11-30 rnaStruct
cd /cluster/data/hg17/bed/rnaStruct
mkdir -p utr3/split utr5/split utr3/fold utr5/fold
utrFa hg17 knownGene utr3 utr3/utr.fa
utrFa hg17 knownGene utr5 utr5/utr.fa
# Split up files and make files that define job.
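# (Context note, not from the original log: the foldUtr3/foldUtr5 tables
# hold RNA secondary-structure predictions for the UTRs, so the cluster
# job below just runs the RNAfold wrapper (rnaFoldBig) on each split
# fasta piece.)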
ssh kk
cd /cluster/data/hg17/bed/rnaStruct
faSplit sequence utr3/utr.fa 50000 utr3/split/s
faSplit sequence utr5/utr.fa 50000 utr5/split/s
ls -1 utr3/split > utr3/in.lst
ls -1 utr5/split > utr5/in.lst
cd utr3
cat > gsub <<end
#LOOP
rnaFoldBig split/\$(path1) fold
#ENDLOOP
end
gensub2 in.lst single gsub spec
para create spec
para try, para check, para push ...
cd ../utr5
gensub2 in.lst single ../utr3/gsub spec
para create spec
para try, para check, para push ...
# When the cluster jobs are done, load the fold output:
ssh hgwdev
cd /cluster/data/hg17/bed/rnaStruct/utr3
hgLoadRnaFold hg17 foldUtr3 fold
cd ../utr5
hgLoadRnaFold hg17 foldUtr5 fold

# Make the protein self-similarity table (knownBlastTab) for the Gene
# Sorter: blastp the known-gene peptides against themselves.  (known.fa
# was split earlier into /cluster/data/hg17/bed/geneSorter/blastp/split;
# the jobs below read ../../split/*.fa.)
ssh kk
mkdir -p /cluster/data/hg17/bed/geneSorter/blastp/self/run/out
cd /cluster/data/hg17/bed/geneSorter/blastp/self/run
# Make blast script
cat << '_EOF_' > blastSome
#!/bin/sh
BLASTMAT=/cluster/bluearc/blast229/data /cluster/bluearc/blast229/blastall \
 -p blastp -d /cluster/bluearc/hg17/blastp/known -i $1 -o $2 \
 -e 0.01 -m 8 -b 1000
'_EOF_'
# << keep emacs happy
chmod +x blastSome
# Make gensub2 file
cat << '_EOF_' > gsub
#LOOP
blastSome {check in line+ $(path1)} {check out line out/$(root1).tab}
#ENDLOOP
'_EOF_'
# << keep emacs happy
# Create parasol batch
# 'ls ../../split/*.fa' is too much, hence the echo
echo ../../split/*.fa | wordLine stdin > split.lst
gensub2 split.lst single gsub jobList
para create jobList
para try
# Wait a couple of minutes, and do a para check, if all is good
# then do a para push
# This should finish in ~15 minutes if the cluster is free.
# Completed: 7748 of 7748 jobs
# CPU time in finished jobs: 191136s 3185.59m 53.09h 2.21d 0.006 y
# IO & Wait Time: 66703s 1111.72m 18.53h 0.77d 0.002 y
# Average job time: 33s 0.55m 0.01h 0.00d
# Longest job: 370s 6.17m 0.10h 0.00d
# Submission to last job: 747s 12.45m 0.21h 0.01d
# Load into database. This takes about 30 minutes
ssh hgwdev
cd /cluster/data/hg17/bed/geneSorter/blastp/self/run/out
time hgLoadBlastTab hg17 knownBlastTab *.tab
# Scanning through 7748 files
# Loading database with 12810133 rows
# 306.480u 54.190s 26:35.50 22.6% 0+0k 0+0io 206pf+0w
cd /cluster/data/hg17/bed/geneSorter
# Create table that maps between known genes and RefSeq
hgMapToGene hg17 refGene knownGene knownToRefSeq
# may need to build this command in src/hg/near/hgMapToGene
# hgsql -e "select count(*) from knownToRefSeq;" hg17
# row count changed 37611
# Create table that maps between known genes and LocusLink
hgsql --skip-column-names -e "select mrnaAcc,locusLinkId from refLink" hg17 \
    > refToLl.txt
hgMapToGene hg17 refGene knownGene knownToLocusLink -lookup=refToLl.txt
# hgsql -e "select count(*) from knownToLocusLink;" hg17
# row count changed to 37611
# Create table that maps between known genes and Pfam domains
hgMapViaSwissProt hg17 knownGene name proteinID Pfam knownToPfam
# hgsql -e "select count(*) from knownToPfam;" hg17
# row count changed to 36302
# Create table to map between known genes and GNF Atlas2
# expression data.
hgMapToGene hg17 gnfAtlas2 knownGene knownToGnfAtlas2 '-type=bed 12'
# hgsql -e "select count(*) from knownToGnfAtlas2;" hg17
# row count changed to 36373
# Create expression distance table - takes about an hour
hgExpDistance hg17 hgFixed.gnfHumanAtlas2MedianRatio \
    hgFixed.gnfHumanAtlas2MedianExps gnfAtlas2Distance \
    -lookup=knownToGnfAtlas2 &
# Have 44696 elements in hgFixed.gnfHumanAtlas2MedianRatio
# Got 36373 unique elements in hgFixed.gnfHumanAtlas2MedianRatio
# hgsql -e "select count(*) from gnfAtlas2Distance;" hg17
# row count changed to 36373000
# Create a table that maps between known genes and
# the nice affy expression data.
hgMapToGene "-type=bed 12" hg17 affyUclaNorm knownGene knownToU133
# hgsql -e "select count(*) from knownToU133;" hg17
# row count changed to 37299
# Create expression distance table. This will take about 2.5 hours
cd /tmp
cp -p ~/kent/src/hg/near/hgExpDistance/affyUcla.weight .
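# (Note, not from the original log: hgExpDistance stores the 1000 nearest
# expression neighbors per gene -- hence 36373 genes -> 36373000 rows
# above -- and affyUcla.weight supplies the per-experiment weights for
# the run below.)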
time hgExpDistance hg17 affyUclaNorm affyUclaExp knownExpDistance \
    -weights=affyUcla.weight -lookup=knownToU133 &
# Have 43039 elements in affyUclaNorm
# 211 genes, 42 weights, 26.500000 total wieght
# Got 37299 unique elements in affyUclaNorm
# 8212.320u 217.310s 2:38:07.84 88.8% 0+0k 0+0io 267pf+0w
# Create table that maps between known genes and
# the GNF data.
cd /tmp
hgMapToGene hg17 affyU95 knownGene knownToU95
# row count changed to 18791
# hgFixed.gnfHumanU95Exps argument is unused, no need to exist
hgExpDistance hg17 hgFixed.gnfHumanU95MedianRatio \
    hgFixed.gnfHumanU95Exps gnfU95Distance -lookup=knownToU95 &
# Have 11545 elements in hgFixed.gnfHumanU95MedianRatio
# Got 17682 unique elements in hgFixed.gnfHumanU95MedianRatio
# row count changed to 17682000
# Create known gene mapping table and expression distance tables
# for GNF Atlas 2. (The hgExpDistance takes only 10 minutes.)
hgMapToGene hg17 affyGnf1h knownGene knownToGnf1h
hgExpDistance hg17 hgFixed.gnfHumanAtlas2MedianRatio \
    hgFixed.gnfHumanAtlas2MedianExps gnfAtlas2Distance \
    -lookup=knownToGnf1h &
# Have 44696 elements in hgFixed.gnfHumanAtlas2MedianRatio
# Got 10273 unique elements in hgFixed.gnfHumanAtlas2MedianRatio
cd /cluster/data/hg17/bed/geneSorter
hgMapToGene hg17 affyU133Plus2 knownGene knownToU133Plus2
# row count changed to 40015
# Make sure that GO database is up to date.

# UPDATE GO DATABASE (DONE 11/24/04 Fan)
# Download the terms and make the database.
ssh hgwdev
mkdir /cluster/store1/geneOntology/20041124
cd /cluster/store1/geneOntology/20041124
wget --timestamping http://www.godatabase.org/dev/database/archive/latest/go_200411-assocdb-data.gz
hgsql mysql <<end
create database go041124;
end
zcat go_200411-assocdb-data.gz | hgsql go041124

# Make C. elegans ortholog column using blastp on wormPep.
# Make parasol run directory
ssh kk
mkdir /cluster/data/hg17/bed/geneSorter/blastp/ce1
cd /cluster/data/hg17/bed/geneSorter/blastp/ce1
mkdir run
cd run
mkdir out
# Make blast script
cat << '_EOF_' > blastSome
#!/bin/sh
BLASTMAT=/cluster/bluearc/blast229/data /cluster/bluearc/blast229/blastall \
 -p blastp -d /cluster/bluearc/ce1/blastp/wormPep \
 -i $1 -o $2 -e 0.01 -m 8 -b 1
'_EOF_'
# << keep emacs happy
chmod a+x blastSome
# Make gensub2 file
cat << '_EOF_' > gsub
#LOOP
blastSome {check in line+ $(path1)} {check out line out/$(root1).tab}
#ENDLOOP
'_EOF_'
# << keep emacs happy
# Create parasol batch
echo ../../split/*.fa | wordLine stdin > split.lst
gensub2 split.lst single gsub jobList
para create jobList
para try
para check
para push
... etc ...
# Only takes 10 minutes on an idle cluster
# Completed: 7748 of 7748 jobs
# CPU time in finished jobs: 33235s 553.91m 9.23h 0.38d 0.001 y
# IO & Wait Time: 19891s 331.52m 5.53h 0.23d 0.001 y
# Average job time: 7s 0.11m 0.00h 0.00d
# Longest job: 68s 1.13m 0.02h 0.00d
# Submission to last job: 653s 10.88m 0.18h 0.01d
# Load into database.
ssh hgwdev
cd /cluster/data/hg17/bed/geneSorter/blastp/ce1/run/out
hgLoadBlastTab hg17 ceBlastTab -maxPer=1 *.tab
# row count changed to 28252
# Make mouse ortholog column using blastp on mouse known genes.
# First make mouse protein database and copy it to cluster/bluearc
# if it doesn't exist already
# This already exists.
See makeMm5.doc for procedure # the directory: /cluster/bluearc/scratch/mus/mm5/blastp should have data # Make parasol run directory ssh kk mkdir /cluster/data/hg17/bed/geneSorter/blastp/mm5 cd /cluster/data/hg17/bed/geneSorter/blastp/mm5 mkdir run cd run mkdir out # Make blast script cat << '_EOF_' > blastSome #!/bin/sh BLASTMAT=/cluster/bluearc/blast229/data /cluster/bluearc/blast229/blastall \ -p blastp -d /cluster/bluearc/scratch/mus/mm5/blastp/known \ -i $1 -o $2 -e 0.001 -m 8 -b 1 '_EOF_' # << keep emacs happy chmod a+x blastSome # Make gensub2 file cat << '_EOF_' > gsub #LOOP blastSome {check in line+ $(path1)} {check out line out/$(root1).tab} #ENDLOOP '_EOF_' # << keep emacs happy # Create parasol batch # this echo trick is used because otherwise the command line is # too long and you can not do a simple ls echo ../../split/*.fa | wordLine stdin > split.lst gensub2 split.lst single gsub jobList para create jobList para try para check para push ... etc ... # Completed: 7748 of 7748 jobs # CPU time in finished jobs: 141842s 2364.04m 39.40h 1.64d 0.004 y # IO & Wait Time: 52251s 870.85m 14.51h 0.60d 0.002 y # Average job time: 25s 0.42m 0.01h 0.00d # Longest job: 254s 4.23m 0.07h 0.00d # Submission to last job: 540s 9.00m 0.15h 0.01d # Load into database. ssh hgwdev cd /cluster/data/hg17/bed/geneSorter/blastp/mm5/run/out hgLoadBlastTab hg17 mmBlastTab -maxPer=1 *.tab # Scanning through 7748 files # row count changed to 37549 # Make rat ortholog column using blastp on rat known genes. # First make rat protein database and copy it to cluster/bluearc # if it doesn't exist already # This already exists. See makeRn3.doc for procedure. # Files were put in this directory: /cluster/bluearc/rn3/blastp/ # Make parasol run directory ssh kk mkdir /cluster/data/hg17/bed/geneSorter/blastp/rn3 cd /cluster/data/hg17/bed/geneSorter/blastp/rn3 mkdir run cd run mkdir out # Make blast script cat << '_EOF_' > blastSome #!/bin/sh BLASTMAT=/cluster/bluearc/blast229/data /cluster/bluearc/blast229/blastall \ -p blastp -d /cluster/bluearc/rn3/blastp/known \ -i $1 -o $2 -e 0.001 -m 8 -b 1 '_EOF_' # << keep emacs happy chmod a+x blastSome # Make gensub2 file cat << '_EOF_' > gsub #LOOP blastSome {check in line+ $(path1)} {check out line out/$(root1).tab} #ENDLOOP '_EOF_' # << keep emacs happy # Create parasol batch # this echo trick is used because otherwise the command line is # too long and you can not do a simple ls echo ../../split/*.fa | wordLine stdin > split.lst gensub2 split.lst single gsub jobList para create jobList para try para check para push ... etc ... # Completed: 7748 of 7748 jobs # CPU time in finished jobs: 31786s 529.77m 8.83h 0.37d 0.001 y # IO & Wait Time: 25795s 429.91m 7.17h 0.30d 0.001 y # Average job time: 7s 0.12m 0.00h 0.00d # Longest job: 75s 1.25m 0.02h 0.00d # Submission to last job: 157s 2.62m 0.04h 0.00d # Load into database. ssh hgwdev cd /cluster/data/hg17/bed/geneSorter/blastp/rn3/run/out hgLoadBlastTab hg17 rnBlastTab -maxPer=1 *.tab # Scanning through 7748 files #Loading database with 26133 rows # Make Danio rerio (zebrafish) ortholog column using blastp on Ensembl. 
# First make protein database and copy it to cluster/bluearc
# if it doesn't exist already
# NOTE: data used to reside in /cluster/bluearc/dr1/blastp
mv /cluster/bluearc/dr1/blastp /cluster/bluearc/danRer1/blastp
# the directory: /cluster/bluearc/danRer1/blastp should have data
# Make parasol run directory
ssh kk
mkdir /cluster/data/hg17/bed/geneSorter/blastp/danRer1
cd /cluster/data/hg17/bed/geneSorter/blastp/danRer1
mkdir run
cd run
mkdir out
# Make blast script
cat << '_EOF_' > blastSome
#!/bin/sh
BLASTMAT=/cluster/bluearc/blast229/data /cluster/bluearc/blast229/blastall \
 -p blastp -d /cluster/bluearc/danRer1/blastp/ensembl \
 -i $1 -o $2 -e 0.005 -m 8 -b 1
'_EOF_'
# << keep emacs happy
chmod a+x blastSome
# Make gensub2 file
cat << '_EOF_' > gsub
#LOOP
blastSome {check in line+ $(path1)} {check out line out/$(root1).tab}
#ENDLOOP
'_EOF_'
# << keep emacs happy
# Create parasol batch
echo ../../split/*.fa | wordLine stdin > split.lst
gensub2 split.lst single gsub jobList
para create jobList
para try
para check
para push
... etc ...
# Completed: 7748 of 7748 jobs
# CPU time in finished jobs: 102324s 1705.39m 28.42h 1.18d 0.003 y
# IO & Wait Time: 47203s 786.72m 13.11h 0.55d 0.001 y
# Average job time: 19s 0.32m 0.01h 0.00d
# Longest job: 230s 3.83m 0.06h 0.00d
# Submission to last job: 427s 7.12m 0.12h 0.00d
# Load into database.
ssh hgwdev
cd /cluster/data/hg17/bed/geneSorter/blastp/danRer1/run/out
hgLoadBlastTab hg17 drBlastTab -maxPer=1 *.tab
# Loading database with 33852 rows

# Make Saccharomyces cerevisiae (yeast) ortholog column using blastp on RefSeq.
# First make protein database and copy it to cluster/bluearc
# if it doesn't exist already
# This is already done, see makeMm3.doc for procedure
# the directory: /cluster/bluearc/sc1/blastp should have data
# Make parasol run directory
ssh kk
mkdir /cluster/data/hg17/bed/geneSorter/blastp/sc1
cd /cluster/data/hg17/bed/geneSorter/blastp/sc1
mkdir run
cd run
mkdir out
# Make blast script
cat << '_EOF_' > blastSome
#!/bin/sh
BLASTMAT=/cluster/bluearc/blast229/data /cluster/bluearc/blast229/blastall \
 -p blastp -d /cluster/bluearc/sc1/blastp/sgd \
 -i $1 -o $2 -e 0.01 -m 8 -b 1
'_EOF_'
# << keep emacs happy
chmod a+x blastSome
# Make gensub2 file
cat << '_EOF_' > gsub
#LOOP
blastSome {check in line+ $(path1)} {check out line out/$(root1).tab}
#ENDLOOP
'_EOF_'
# << keep emacs happy
# Create parasol batch
echo ../../split/*.fa | wordLine stdin > split.lst
gensub2 split.lst single gsub jobList
para create jobList
para try
para check
para push
... etc ...
# Completed: 7748 of 7748 jobs
# CPU time in finished jobs: 20983s 349.72m 5.83h 0.24d 0.001 y
# IO & Wait Time: 25513s 425.21m 7.09h 0.30d 0.001 y
# Average job time: 6s 0.10m 0.00h 0.00d
# Longest job: 37s 0.62m 0.01h 0.00d
# Submission to last job: 106s 1.77m 0.03h 0.00d
# Load into database.
ssh hgwdev
cd /cluster/data/hg17/bed/geneSorter/blastp/sc1/run/out
hgLoadBlastTab hg17 scBlastTab -maxPer=1 *.tab
# Loading database with 18489 rows

# Make Drosophila melanogaster ortholog column using blastp on FlyBase.
# First make SwissProt protein database and copy it to cluster/bluearc # if it doesn't exist already # This is already done, see makeMm3.doc for procedure # the directory: /cluster/bluearc/dm1/blastp should have data # Make parasol run directory ssh kk mkdir /cluster/data/hg17/bed/geneSorter/blastp/dm1 cd /cluster/data/hg17/bed/geneSorter/blastp/dm1 mkdir run cd run mkdir out # Make blast script cat << '_EOF_' > blastSome #!/bin/sh BLASTMAT=/cluster/bluearc/blast229/data /cluster/bluearc/blast229/blastall \ -p blastp -d /cluster/bluearc/dm1/blastp/flyBase \ -i $1 -o $2 -e 0.01 -m 8 -b 1 '_EOF_' # << keep emacs happy chmod a+x blastSome # Make gensub2 file cat << '_EOF_' > gsub #LOOP blastSome {check in line+ $(path1)} {check out line out/$(root1).tab} #ENDLOOP '_EOF_' # << keep emacs happy # Create parasol batch echo ../../split/*.fa | wordLine stdin > split.lst gensub2 split.lst single gsub jobList para create jobList para try para check para push ... etc ... # Completed: 7748 of 7748 jobs # CPU time in finished jobs: 83377s 1389.62m 23.16h 0.97d 0.003 y # IO & Wait Time: 39913s 665.21m 11.09h 0.46d 0.001 y # Average job time: 16s 0.27m 0.00h 0.00d # Longest job: 167s 2.78m 0.05h 0.00d # Submission to last job: 365s 6.08m 0.10h 0.00d # Load into database. ssh hgwdev cd /cluster/data/hg17/bed/geneSorter/blastp/dm1/run/out hgLoadBlastTab hg17 dmBlastTab -maxPer=1 *.tab # Loading database with 30067 rows # update knownToHInv table # Verified that there is now new release of HInv data. hgMapToGene hg17 HInvGeneMrna knownGene knownToHInv # count changed to 33236 #### BUILD Ensembl cross-reference table, ensemblXref3 (DONE - 2004-11-30 - Fan) # Get the ensembl gene/protein cross-reference data from # http://www.ensembl.org/Homo_sapiens/martview # Follow this sequence through the pages: # Page 1) Make sure that the Homo_sapiens choice is selected. Hit next. # Page 2) Uncheck the "Limit to" box in the region choice. Then hit next. # Page 3) Choose the "Feature" box, select gene, transcript, protein, SPTrEMBL ID, SWISSPROT ID, and SWISSPROT AC # Page 4) Choose "Text, tab separated". choose gzip compression. hit export. # Save as ensXref.txt sed ensXref.txt -e 's/\./\t/g' > ensemblXref3.tab hgsql hg17 -e "drop table ensemblXref3" hgsql hg17 < ~/src/hg/lib/ensemblXref3.sql hgsql hg17 -e 'load data local infile "ensemblXref3.tab" into table ensemblXref3 ignore 1 lines' #### BUILD SUPERFAMILY RELATED TABLES (DONE - 2004-11-30 - Fan) # Download Superfamily data files and build the Superfamily DB # from supfam.mrc-lmb.cam.ac.uk mkdir /cluster/store8/superfamily/041128 ln -s /cluster/store8/superfamily/041128 /cluster/data/superfamily/041128 cd /cluster/data/superfamily/041128 # ftp over the following two files: ass_28-Nov-2004.tab.gz supfam_28-Nov-2004.sql.gz gzip -d *.gz # Load the Superfamily database hgsql hg17 -e "create database superfam041128" hgsql superfam041128 < supfam_28-Nov-2004.sql # This may take about an hour. # Make sure to add an index on id of the des table of superfam041128. hgsql superfam041128 -e "create index id on des(id);" hgsql superfam041128 < ~/src/hg/lib/sfAssign.sql hgsql superfam041128 -e 'load data local infile "ass_28-Nov-2004.tab" into table superfam041128.sfAssign;' # Build or rebuild Superfamily track and create sf tables needed for PB hgsql hg17 < ~/src/hg/lib/sfAssign.sql cd /cluster/data/superfamily/041128 hgsql hg17 -e 'load data local infile "ass_28-Nov-2004.tab" into table hg17.sfAssign;' # If hg17.sfDes already exists, drop it. 
hgsql superfam041128 -e "select * from des" >sfDes.tab
hgsql hg17 < ~/src/hg/lib/sfDes.sql
hgsql hg17 -e 'load data local infile "sfDes.tab" into table hg17.sfDes ignore 1 lines;'
# If hg17.superfamily already exists, drop it.
cd /cluster/data/hg17/bed
mkdir /cluster/data/hg17/sf.2004-1128
ln -s sf.2004-1128 sf
hgSuperfam hg17 > sf.log
# It is normal that many proteins do not have corresponding Superfamily
# entries.
# If hg17.sfDescription exists, drop it.
hgsql hg17 < ~/src/hg/lib/sfDescription.sql
hgsql hg17 -e 'LOAD DATA local INFILE "sfDescription.tab" into table hg17.sfDescription;'
# Finally, load the superfamily table.
hgLoadBed hg17 superfamily superfamily.tab -tab
# Create knownToSuperfamily table
# Note hs is changed into ht for this Superfamily release.
cat /cluster/data/superfamily/041128/ass_28-Nov-2004.tab \
| hgKnownToSuper hg17 hs stdin
# created 25287 rows in knownToSuper

### HG17 PROTEOME BROWSER TABLES RE-BUILD #### (DONE - 2004-12-01 - Fan)
# These are instructions for rebuilding tables
# needed for the Proteome Browser.
# DON'T START THESE UNTIL TABLES FOR KNOWN GENES AND kgProtMap table
# ARE REBUILT.
# This update is based on proteins DBs dated 041115.
# Create the working directory
ssh hgwdev
mv /cluster/data/hg17/bed/pb /cluster/data/hg17/bed/pb.2004-06-11
mkdir /cluster/data/hg17/bed/pb.2004-12-01
cd /cluster/data/hg17/bed
ln -s /cluster/data/hg17/bed/pb.2004-12-01 pb
# Move the existing PB tables by:
hgsql hg17
create database hg17Sav2;
alter table hg17.pepCCntDist rename as hg17Sav2.pepCCntDist;
alter table hg17.pepExonCntDist rename as hg17Sav2.pepExonCntDist;
alter table hg17.pepHydroDist rename as hg17Sav2.pepHydroDist;
alter table hg17.pepIPCntDist rename as hg17Sav2.pepIPCntDist;
alter table hg17.pepMolWtDist rename as hg17Sav2.pepMolWtDist;
alter table hg17.pepMwAa rename as hg17Sav2.pepMwAa;
alter table hg17.pepPi rename as hg17Sav2.pepPi;
alter table hg17.pepPiDist rename as hg17Sav2.pepPiDist;
alter table hg17.pepResDist rename as hg17Sav2.pepResDist;
alter table hg17.pbAaDistA rename as hg17Sav2.pbAaDistA;
alter table hg17.pbAaDistC rename as hg17Sav2.pbAaDistC;
alter table hg17.pbAaDistD rename as hg17Sav2.pbAaDistD;
alter table hg17.pbAaDistE rename as hg17Sav2.pbAaDistE;
alter table hg17.pbAaDistF rename as hg17Sav2.pbAaDistF;
alter table hg17.pbAaDistG rename as hg17Sav2.pbAaDistG;
alter table hg17.pbAaDistH rename as hg17Sav2.pbAaDistH;
alter table hg17.pbAaDistI rename as hg17Sav2.pbAaDistI;
alter table hg17.pbAaDistK rename as hg17Sav2.pbAaDistK;
alter table hg17.pbAaDistL rename as hg17Sav2.pbAaDistL;
alter table hg17.pbAaDistM rename as hg17Sav2.pbAaDistM;
alter table hg17.pbAaDistN rename as hg17Sav2.pbAaDistN;
alter table hg17.pbAaDistP rename as hg17Sav2.pbAaDistP;
alter table hg17.pbAaDistQ rename as hg17Sav2.pbAaDistQ;
alter table hg17.pbAaDistR rename as hg17Sav2.pbAaDistR;
alter table hg17.pbAaDistS rename as hg17Sav2.pbAaDistS;
alter table hg17.pbAaDistT rename as hg17Sav2.pbAaDistT;
alter table hg17.pbAaDistV rename as hg17Sav2.pbAaDistV;
alter table hg17.pbAaDistW rename as hg17Sav2.pbAaDistW;
alter table hg17.pbAaDistY rename as hg17Sav2.pbAaDistY;
alter table hg17.pbAnomLimit rename as hg17Sav2.pbAnomLimit;
alter table hg17.pbResAvgStd rename as hg17Sav2.pbResAvgStd;
alter table hg17.pbStamp rename as hg17Sav2.pbStamp;
quit
# Define pep* tables in hg17 DB
cat ~/kent/src/hg/lib/pep*.sql > pepAll.sql
# First edit out pepPred table definition, then
hgsql hg17 < pepAll.sql
# Build the pepMwAa table
hgsql proteins041115 -e "select info.acc, molWeight, aaSize from sp041115.info, sp041115.accToTaxon where accToTaxon.taxon=9606 and accToTaxon.acc = info.acc" > pepMwAa.tab
hgsql hg17 -e 'load data local infile "pepMwAa.tab" into table hg17.pepMwAa ignore 1 lines;'
# Build the pepPi table: make the list of hg17 protein accessions,
# then run pbCalPi on it
hgsql proteins041115 -e "select info.acc from sp041115.info, sp041115.accToTaxon where accToTaxon.taxon=9606 and accToTaxon.acc = info.acc" > protAcc.lis
pbCalPi protAcc.lis sp041115 pepPi.tab
hgsql hg17 -e 'load data local infile "pepPi.tab" into table hg17.pepPi;'
# Calculate the distributions
pbCalDist sp041115 proteins041115 9606 hg17 >pbCalDist.out
cat pbCalDist.out
wc pbCalDist.out
hgsql hg17
load data local infile "pepExonCntDist.tab" into table hg17.pepExonCntDist;
load data local infile "pepCCntDist.tab" into table hg17.pepCCntDist;
load data local infile "pepHydroDist.tab" into table hg17.pepHydroDist;
load data local infile "pepMolWtDist.tab" into table hg17.pepMolWtDist;
load data local infile "pepResDist.tab" into table hg17.pepResDist;
load data local infile "pepIPCntDist.tab" into table hg17.pepIPCntDist;
load data local infile "pepPiDist.tab" into table hg17.pepPiDist;
quit
# Calculate frequency distributions
pbCalResStd 041115 9606 hg17
# Create pbAnomLimit and pbResAvgStd tables
hgsql hg17 < ~/src/hg/lib/pbAnomLimit.sql
hgsql hg17 < ~/src/hg/lib/pbResAvgStd.sql
hgsql hg17 -e 'load data local infile "pbResAvgStd.tab" into table hg17.pbResAvgStd;'
hgsql hg17 -e 'load data local infile "pbAnomLimit.tab" into table hg17.pbAnomLimit;'
# Create pbStamp table for PB
hgsql hg17 < ~/src/hg/lib/pbStamp.sql
hgsql hg17Sav2 -e 'select * from pbStamp' > pbStamp.tab
hgsql hg17 -e 'load data local infile "pbStamp.tab" into table hg17.pbStamp ignore 1 lines;'
# Adjust drawing parameters for Proteome Browser stamps
# Now invoke Proteome Browser and adjust various drawing parameters
# (mostly the ymax of each stamp) if necessary, by updating the
# pbStamp.tab file and then deleting and reloading the pbStamp table.
# Perform preliminary review of Proteome Browser for hg17, then notify
# QA for formal review.

#### Blat knownGene proteins to determine exons (braney DONE 12/11/04)
ssh hgwdev
cd /cluster/data/hg17/bed
mkdir blat.hg17KG.2004-12-08
rm blat.hg17KG
ln -s blat.hg17KG.2004-12-08 blat.hg17KG
cd blat.hg17KG
pepPredToFa hg17 knownGenePep known.fa
ssh kk
cd /cluster/data/hg17/bed/blat.hg17KG
cat << '_EOF_' > blatSome
#!/bin/csh -fe
/cluster/bin/i386/blat -t=dnax -q=prot -out=pslx $1 $2 $3
'_EOF_'
cat << '_EOF_' > blatGsub
#LOOP
blatSome $(path1) {check in line $(path2)} {check out line psl/$(root1)/$(root2).psl}
#ENDLOOP
'_EOF_'
# << keep emacs happy
chmod +x blatSome
ls -1S /scratch/hg/gs.18/build35/bothMaskedNibs/*.nib > human.lst
mkdir kgfa
cd kgfa
faSplit sequence ../known.fa 3010 kg
cd ..
ls -1S kgfa/*.fa > kg.lst
gensub2 human.lst kg.lst blatGsub blatSpec
mkdir psl
cd psl
foreach i (`cat ../human.lst`)
    mkdir `basename $i .nib`
end
cd ..
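# (Note, not from the original log: -out=pslx in blatSome keeps the
# aligned sequence in each psl, which is what lets pslxToFa below write
# the per-exon fasta along with the liftTarget/liftQuery files.)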
para create blatSpec
para push
# Completed: 134130 of 134136 jobs
# Crashed: 6 jobs
# CPU time in finished jobs: 29801114s 496685.23m 8278.09h 344.92d 0.945 y
# IO & Wait Time: 1983513s 33058.55m 550.98h 22.96d 0.063 y
# Average job time: 237s 3.95m 0.07h 0.00d
# Longest job: 63306s 1055.10m 17.59h 0.73d
# Submission to last job: 169384s 2823.07m 47.05h 1.96d
# did 6 crashed jobs on small cluster
ssh eieio
cd /cluster/data/hg17/bed/blat.hg17KG
pslSort dirs raw.psl /tmp psl/*
pslReps -nohead -minCover=0.9 -minAli=0.9 raw.psl cooked.psl /dev/null
pslUniq cooked.psl hg17KG.psl
pslxToFa hg17KG.psl hg17KG_ex.fa -liftTarget=genome.lft -liftQuery=protein.lft
kgName hg17 hg17KG.psl blastKGRef01
cut -f 10 hg17KG.psl > kgName.lst
faSomeRecords known.fa kgName.lst hg17KG.fa
hgPepPred hg17 generic blastKGPep01 hg17KG.fa
ssh hgwdev
cd /cluster/data/hg17/bed/blat.hg17KG
hgsql hg17 < ~/kent/src/hg/lib/blastRef.sql
echo "rename table blastRef to blastKGRef01" | hgsql hg17
echo "load data local infile 'blastKGRef01' into table blastKGRef01" | hgsql hg17

#### TIGR GENE INDEX (DONE 2004-12-04 Fan)
mkdir -p /cluster/data/hg17/bed/tigr
cd /cluster/data/hg17/bed/tigr
wget --timestamping ftp://ftp.tigr.org/pub/data/tgi/Homo_sapiens/TGI_track_HumanGenome_build35.tgz
tar xvzf TGI*.tgz
foreach f (*cattle*)
    set f1 = `echo $f | sed -e 's/cattle/cow/g'`
    mv $f $f1
end
foreach o (mouse cow human pig rat)
    echo $o
    setenv O $o
    foreach f (chr*_$o*s)
        tail +2 $f | perl -wpe 's/THC/TC/; s/(TH?C\d+)/$ENV{O}_$1/;' > $f.gff
    end
end
ssh hgwdev
cd /cluster/data/hg17/bed/tigr
hgsql hg17 -e "drop table tigrGeneIndex"
hgsql hg17 < ~/kent/src/hg/lib/tigrGeneIndex.sql
foreach f (*.gff)
    echo Processing $f ...
    /cluster/home/fanhsu/bin/i386/ldHgGene -oldTable -exon=TC hg17 tigrGeneIndex $f
    hgsql hg17 -e "select count(*) from tigrGeneIndex"
end
# Total of 401322 entries created in tigrGeneIndex table.
hgsql hg17 -e "update tigrGeneIndex set cdsStart = txStart;"
hgsql hg17 -e "update tigrGeneIndex set cdsEnd = txEnd;"
checkTableCoords hg17 tigrGeneIndex
gzip *.gff *TCs

# BLASTZ FOR ZEBRAFISH (danRer2) (DONE, 2004-12-09, hartera)
ssh kkr1u00
# blastz requires lineage-specific repeats
# Treat all repeats as lineage-specific.
# /iscratch/i/gs.18/build35/linSpecRep.notInZebrafish exists (makeDanRer1.doc)
mkdir -p /iscratch/i/danRer2/linSpecRep.notInHuman
foreach f (/iscratch/i/danRer2/rmsk/chr*.fa.out)
    cp -p $f /iscratch/i/danRer2/linSpecRep.notInHuman/$f:t:r:r.out.spec
end
iSync
ssh kk
mkdir -p /cluster/data/hg17/bed/blastz.danRer2.2004-12-08
ln -s /cluster/data/hg17/bed/blastz.danRer2.2004-12-08 \
    /cluster/data/hg17/bed/blastz.danRer2
cd /cluster/data/hg17/bed/blastz.danRer2
# Set L=6000 and abridge repeats - these are the same parameters used
# for hg16 and Fugu and similar to those for hg16-galGal2
cat << '_EOF_' > DEF
# human (hg17) vs zebrafish (danRer2)
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/i386:/cluster/home/angie/schwartzbin

ALIGN=blastz-run
BLASTZ=blastz
# Reuse parameters from hg16-fr1.
BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=6000 BLASTZ_K=2200 BLASTZ_Q=/cluster/data/blastz/HoxD55.q BLASTZ_ABRIDGE_REPEATS=1 # TARGET: Human (hg17) SEQ1_DIR=/iscratch/i/gs.18/build35/bothMaskedNibs SEQ1_RMSK= SEQ1_FLAG= SEQ1_SMSK=/iscratch/i/gs.18/build35/linSpecRep.notInZebrafish SEQ1_IN_CONTIGS=0 SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY: zebrafish (danRer2) SEQ2_DIR=/iscratch/i/danRer2/nib/ SEQ2_RMSK= SEQ2_FLAG= SEQ2_SMSK=/iscratch/i/danRer2/linSpecRep.notInHuman SEQ2_IN_CONTIGS=0 SEQ2_CHUNK=10000000 SEQ2_LAP=0 BASE=/cluster/data/hg17/bed/blastz.danRer2 DEF=$BASE/DEF RAW=$BASE/raw CDBDIR=$BASE SEQ1_LEN=$BASE/S1.len SEQ2_LEN=$BASE/S2.len #DEBUG=1 '_EOF_' # << this line keeps emacs coloring happy chmod +x DEF # Save the DEF file in the current standard place cp DEF ~angie/hummus/DEF.hg17-danRer2.2004-12-08 # prepare first cluster run ssh kk cd /cluster/data/hg17/bed/blastz.danRer2 bash # if a csh/tcsh user . ./DEF /cluster/data/hg17/jkStuff/BlastZ_run0.sh cd run.0 para try, check, push, check ...etc. # para time # Completed: 58993 of 58993 jobs # CPU time in finished jobs: 19583036s 326383.93m 5439.73h 226.66d 0.621 y # IO & Wait Time: 471090s 7851.50m 130.86h 5.45d 0.015 y # Average job time: 340s 5.67m 0.09h 0.00d # Longest job: 885s 14.75m 0.25h 0.01d # Submission to last job: 78245s 1304.08m 21.73h 0.91d ssh kki cd /cluster/data/hg17/bed/blastz.danRer2 bash # if a csh/tcsh user . ./DEF /cluster/data/hg17/jkStuff/BlastZ_run1.sh cd run.1 para try, check, push, etc ... # para time # Completed: 341 of 341 jobs # CPU time in finished jobs: 789s 13.14m 0.22h 0.01d 0.000 y # IO & Wait Time: 2992s 49.87m 0.83h 0.03d 0.000 y # Average job time: 11s 0.18m 0.00h 0.00d # Longest job: 34s 0.57m 0.01h 0.00d # Submission to last job: 391s 6.52m 0.11h 0.00d # Third cluster run to convert lav's to axt's ssh kki cd /cluster/data/hg17/bed/blastz.danRer2 mkdir axtChrom # a new run directory mkdir run.2 cd run.2 cat << '_EOF_' > do.csh #!/bin/csh cd $1 cat `ls -1 *.lav | sort -g` \ | lavToAxt stdin /iscratch/i/gs.18/build35/bothMaskedNibs \ /iscratch/i/danRer2/nib stdout \ | axtSort stdin $2 '_EOF_' # << this line makes emacs coloring happy chmod a+x do.csh cat << '_EOF_' > gsub #LOOP ./do.csh {check in exists $(path1)} {check out line+ /cluster/data/hg17/bed/blastz.danRer2/axtChrom/$(root1).axt} #ENDLOOP '_EOF_' # << this line makes emacs coloring happy \ls -1Sd ../lav/chr* > chrom.list gensub2 chrom.list single gsub jobList wc -l jobList head jobList para create jobList para try, check, push, check,... 
# para time # Completed: 45 of 46 jobs # Crashed: 1 jobs # CPU time in finished jobs: 99s 1.64m 0.03h 0.00d 0.000 y # IO & Wait Time: 862s 14.37m 0.24h 0.01d 0.000 y # Average job time: 21s 0.36m 0.01h 0.00d # Longest job: 92s 1.53m 0.03h 0.00d # Submission to last job: 456s 7.60m 0.13h 0.01d # crashed job: chr6_hla_hap1.axt is empty - has no alignments # translate sorted axt files into psl ssh kolossus cd /cluster/data/hg17/bed/blastz.danRer2 mkdir -p pslChrom set tbl = "blastzDanRer2" foreach f (axtChrom/chr*.axt) set c=$f:t:r echo "Processing chr $c" /cluster/bin/i386/axtToPsl $f S1.len S2.len pslChrom/${c}_${tbl}.psl end # Load database tables ssh hgwdev cd /cluster/data/hg17/bed/blastz.danRer2/pslChrom foreach f (./*.psl) /cluster/bin/i386/hgLoadPsl -noTNameIx hg17 $f echo "$f Done" end # try different parameters for blastz with chr1 of hg17 # featureBits -chrom=chr1 hg17 refGene:cds blastzDanRer1 -enrichment # refGene:cds 1.301%, blastzDanRer1 3.934%, both 0.874%, cover 67.23%, # enrich 17.09x # H=2000, Y=3400, L=6000, K=2200 and HoxD55.q scoring matrix # featureBits -chrom=chr1 hg17 refGene:cds blastzDanRer2 -enrichment # refGene:cds 1.301%, blastzDanRer2 3.845%, both 0.879%, cover 67.55%, # enrich 17.57x # same parameters as above but L=8000 # featureBits -chrom=chr1 hg17 refGene:cds blastzDanRer2L8k -enrichment # refGene:cds 1.301%, blastzDanRer2L8k 2.309%, both 0.778%, cover 59.81%, # enrich 25.91x # enrichment went up but coverage dropped quite a bit. # Default parameters with H=2000 # featureBits -chrom=chr1 hg17 refGene:cds blastzDanRer2Default -enrichment # refGene:cds 1.301%, blastzDanRer2Default 1.701%, both 0.846%, cover 65.04%, # enrich 38.24x # same as first run but with no Y option set (default Y) # featureBits -chrom=chr1 hg17 refGene:cds blastzDanRer2NoY -enrichment # refGene:cds 1.301%, blastzDanRer2NoY 3.980%, both 0.877%, cover 67.47%, # enrich 16.95x # row count: # danRer2 122160 # danRer2L8k 62815 # danRer2Default 75818 # danRer2NoY 124129 # can be pruned at the chaining step. # trackDb - change Zebrafish Blastz to danRer1 Blastz and display this track # for danRer2 as Zebrafish Blastz # RESCORE DANRER2 BLASTZ (DONE, 2004-12-09, hartera) # Low scores can occur with repeats abridged and using the # HoxD55.q matrix. PSU's restore_rpts program rescored alignments # with the default matrix instead of the BLASTZ_Q matrix. # Rescore them here so the chainer sees the higher scores: ssh kolossus cd /cluster/data/hg17/bed/blastz.danRer2 mkdir axtChrom.rescore foreach f (axtChrom/chr*.axt) axtRescore -scoreScheme=/cluster/data/blastz/HoxD55.q \ $f axtChrom.rescore/$f:t end mv axtChrom axtChrom.orig mv axtChrom.rescore axtChrom # psl files and blastz tables will be the same regardless of score so # no need to reload # CHAIN DANRER2 BLASTZ (DONE, 2004-12-09, hartera) # RELOAD CHAINS WIH FILTERING (DONE, 2004-12-10, hartera) # APPLY chainAntiRepeat TO REMOVE CHAINS THAT ARE THE RESULTS OF REPEATS # AND DEGENERATE DNA (DONE, 2004-12-22, hartera) # Run axtChain on little cluster ssh kki cd /cluster/data/hg17/bed/blastz.danRer2 mkdir -p axtChain/run1 cd axtChain/run1 mkdir out chain # create input list ls -1S /cluster/data/hg17/bed/blastz.danRer2/axtChrom/*.axt \ > input.lst cat << '_EOF_' > gsub #LOOP doChain {check in exists $(path1)} {check out line+ chain/$(root1).chain} {check out line+ out/$(root1).out} #ENDLOOP '_EOF_' # << this line makes emacs coloring happy # Reuse gap penalties from hg16 vs chicken run. 
cat << '_EOF_' > ../../chickenHumanTuned.gap
tablesize 11
smallSize 111
position 1 2 3 11 111 2111 12111 32111 72111 152111 252111
qGap 325 360 400 450 600 1100 3600 7600 15600 31600 56600
tGap 400 400 400 400 400 400 400 400 400 400 400
bothGap 625 660 700 750 900 1400 4000 8000 16000 32000 57000
'_EOF_'
# << this line makes emacs coloring happy
cat << '_EOF_' > doChain
#!/bin/csh
axtChain -scoreScheme=/cluster/data/blastz/HoxD55.q \
    -linearGap=../../chickenHumanTuned.gap $1 \
    /iscratch/i/gs.18/build35/bothMaskedNibs \
    /iscratch/i/danRer2/nib $2 >& $3
'_EOF_'
# << this line makes emacs coloring happy
chmod a+x doChain
gensub2 input.lst single gsub jobList
para create jobList
para try, check, push, check...
# para time
# Completed: 45 of 46 jobs
# Crashed: 1 jobs
# CPU time in finished jobs: 1837s 30.62m 0.51h 0.02d 0.000 y
# IO & Wait Time: 441s 7.35m 0.12h 0.01d 0.000 y
# Average job time: 51s 0.84m 0.01h 0.00d
# Longest job: 106s 1.77m 0.03h 0.00d
# Submission to last job: 419s 6.98m 0.12h 0.00d
# crashed job is chr6_hla_hap1 which has no alignments
# now on the cluster server, sort chains
ssh kksilo
cd /cluster/data/hg17/bed/blastz.danRer2/axtChain
chainMergeSort run1/chain/*.chain > all.chain
chainSplit chain all.chain
# take a look at score distr's
foreach f (chain/*.chain)
    grep chain $f | awk '{print $2;}' | sort -nr > /tmp/score.$f:t:r
    echo $f:t:r >> hist5000.out
    textHistogram -binSize=5000 /tmp/score.$f:t:r >> hist5000.out
    echo ""
end
# apart from chr19, not too many chains with scores < 5000
# load chr1 chain into table and check
ssh hgwdev
cd /cluster/data/hg17/bed/blastz.danRer2/axtChain/chain
hgLoadChain hg17 chr1_chainDanRer2 chr1.chain
# featureBits -chrom=chr1 hg17 refGene:cds chainDanRer2Link -enrichment
# refGene:cds 1.301%, chainDanRer2Link 3.676%, both 0.877%, cover 67.42%,
# enrich 18.34x
# featureBits -chrom=chr1 hg17 refGene:cds chainDanRer2 -enrichment
# refGene:cds 1.301%, chainDanRer2 32.611%, both 1.034%, cover 79.52%,
# enrich 2.44x
ssh kksilo
cd /cluster/data/hg17/bed/blastz.danRer2/axtChain
mv all.chain all.chain.unfiltered
chainFilter -minScore=5000 all.chain.unfiltered > all.chain
chainSplit chainFilt5k all.chain
# load chr1 filtered chains and check
ssh hgwdev
cd /cluster/data/hg17/bed/blastz.danRer2/axtChain/chainFilt5k
hgLoadChain hg17 chr1_chainDanRer2Filt5k chr1.chain
# featureBits -chrom=chr1 hg17 refGene:cds chainDanRer2Filt5kLink -enrichment
# refGene:cds 1.301%, chainDanRer2Filt5kLink 2.907%, both 0.870%, cover 66.86%,
# enrich 23.00x
# featureBits -chrom=chr1 hg17 refGene:cds chainDanRer2Filt5k -enrichment
# refGene:cds 1.301%, chainDanRer2Filt5k 31.343%, both 1.028%, cover 79.02%,
# enrich 2.52x
# checked browser - when filtered on minScore=5000, the low scoring
# alignments removed are small and/or poor alignments so use this version.
# remove repeats from filtered chains and reload into database
# (2004-12-22, hartera)
ssh kksilo
cd /cluster/data/hg17/bed/blastz.danRer2/axtChain
mv chainFilt5k chainRaw
mkdir chain
cd chainRaw
foreach f (*.chain)
    set c = $f:r
    echo $c
    nice chainAntiRepeat /cluster/bluearc/hg17/bothMaskedNibs \
        /cluster/bluearc/danRer2/nib $f \
        ../chain/$c.chain
end
cd ..
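# (chainAntiRepeat needs the actual sequence -- hence the two nib
# directories passed above -- to spot chains that score well only
# because of repeats or degenerate DNA, per the section header note.)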
chainMergeSort ./chain/*.chain > all.chain.antirepeat chainSplit chainAR all.chain.antirepeat # load filtered chains and check ssh hgwdev echo 'drop table chr1_chainDanRer2Filt5k;' | hgsql hg17 echo 'drop table chr1_chainDanRer2Filt5kLink;' | hgsql hg17 # reload filtered chains instead of unfiltered (2004-12-10, hartera) # reload filtered chains with repeats removed (2004-12-22, hartera) cd /cluster/data/hg17/bed/blastz.danRer2/axtChain/ cd chainAR foreach i (*.chain) set c = $i:r hgLoadChain hg17 ${c}_chainDanRer2 $i echo done $c end # trackDb - change Zebrafish Chain to danRer1 Chain and display this track # for danRer2 as Zebrafish Chain. # after chainAntiRepeat # featureBits -chrom=chr1 hg17 refGene:cds chainDanRer2Link -enrichment # refGene:cds 1.304%, chainDanRer2Link 2.742%, both 0.872%, cover 66.81%, # enrich 24.36x # NET DANRER2 BLASTZ (DONE, 2004-12-09, hartera) # RE-CREATE NET WITH FILTERED CHAINS (DONE, 2004-12-10, hartera) # RE-DO NET WITH CHAINS FILTERED BY chainAntiRepeat (DONE, 2004-12-22, hartera) ssh kksilo cd /cluster/data/hg17/bed/blastz.danRer2/axtChain rm -r preNet mkdir preNet cd chainAR foreach i (*.chain) echo preNetting $i /cluster/bin/i386/chainPreNet $i ../../S1.len ../../S2.len \ ../preNet/$i end cd .. mkdir n1 cd preNet foreach i (*.chain) set n = $i:r.net echo primary netting $i /cluster/bin/i386/chainNet $i -minSpace=1 ../../S1.len ../../S2.len \ ../n1/$n /dev/null end cd .. cat n1/*.net | /cluster/bin/i386/netSyntenic stdin noClass.net # memory usage 133443584, utime 905 s/100, stime 139 # Add classification info using db tables: # netClass looks for ancient repeats in one of the databases # hg17 has this table - hand-curated by Arian but this is for # human-rodent comparisons so do not use here, use -noAr option mkdir -p /cluster/bluearc/danRer2/linSpecRep.notInHuman # linSpecRep.notInZebrafish exists for hg17 cp /iscratch/i/danRer2/linSpecRep.notInHuman/* \ /cluster/bluearc/danRer2/linSpecRep.notInHuman ssh hgwdev cd /cluster/data/hg17/bed/blastz.danRer2/axtChain time netClass noClass.net hg17 danRer2 zfishdanRer2.net \ -tNewR=/cluster/bluearc/hg17/linSpecRep.notInZebrafish \ -qNewR=/cluster/bluearc/danRer2/linSpecRep.notInHuman -noAr # 97.230u 54.290s 5:37.50 44.8% 0+0k 0+0io 217pf+0w # load net into database cd /cluster/data/hg17/bed/blastz.danRer2/axtChain netFilter -minGap=10 zfishdanRer2.net | hgLoadNet hg17 netDanRer2 stdin # trackDb - change Zebrafish Net to danRer1 Net and display this track # for danRer2 as Zebrafish Net. 
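# (For reference, not from the original log: netFilter -minGap=10 above
# trims sub-10-base pieces from the net before loading; a quick look at
# what survives, assuming the standard net text format where fill lines
# begin with the word "fill":)
netFilter -minGap=10 zfishdanRer2.net | awk '$1 == "fill"' | wc -l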
# after chainAntiRepeat: # featureBits hg17 refGene:cds netDanRer2 -enrichment # refGene:cds 1.015%, netDanRer2 22.898%, both 0.783%, cover 77.15%, # enrich 3.37x # index had NULL cardinality, analyze table to fix (2005-1-18, Heather) hgsql hg17 analyze table netDanRer2 # LOAD ACEMBLY TRACK (DONE, 2005-01-24, hartera) # ACEMBLY TABLE RELOADED AND FINISHED COLOR CODING CODE IN # hgTracks (2005-01-28, hartera) # FINISHED CODE FOR FILTERING BY GENE CLASS (2005-02-03, hartera) mkdir -p /cluster/data/hg17/bed/acembly cd /cluster/data/hg17/bed/acembly # Data is obtained from # Danielle et Jean Thierry-Mieg mieg@ncbi.nlm.nih.gov wget --timestamping ftp://ftp.ncbi.nih.gov/repository/acedb/ncbi_35.human.genes/acembly.ncbi_35.genes.proteins.fasta.tar.gz wget --timestamping ftp://ftp.ncbi.nih.gov/repository/acedb/ncbi_35.human.genes/acembly.ncbi_35.genes.gff.tar.gz wget --timestamping ftp://ftp.ncbi.nih.gov/repository/acedb/ncbi_35.human.genes/acembly.ncbi_35.mrnas.fasta.tar.gz wget --timestamping ftp://ftp.ncbi.nih.gov/repository/acedb/ncbi_35.human.genes/acembly.ncbi_35.genes.pfamhits.tar.gz tar xvzf acembly.ncbi_35.genes.gff.tar.gz tar xvzf acembly.ncbi_35.genes.proteins.fasta.tar.gz cd acembly.ncbi_35.genes.gff # the acembly dataset for hg16 had problems with reverse blocks so # check for these cat << '_EOF_' > checkReversedBlocks for i in x1*.gff do echo -n "$i working ..." awk -F"\t" ' { if ($4 > $5) { printf "reverse blocks problem for $1" printf "\n" } } ' $i > $i.fixed echo " done" done '_EOF_' # << this line makes emacs coloring happy chmod +x checkReversedBlocks ./checkReversedBlocks ls -l *.fixed # all *.fixed files are empty so remove - there is no reversing of blocks rm *.fixed foreach f (x1.acemblygenes.*.gff) set c=$f:r:e egrep '^[a-zA-Z0-9]+\|NT_[0-9][0-9][0-9][0-9][0-9][0-9]' $f | \ perl -wpe 's/^(\w+)\|(\w+)/$1\/$2/' > ctg-chr${c}_random.gff if (-e ../../../$c/lift/random.lft) then liftUp chr${c}_random.gff ../../../$c/lift/random.lft warn \ ctg-chr${c}_random.gff endif grep -v ^$c\| $f | grep -v ^Hs | perl -wpe 's/^/chr/;' | \ grep -v "^chr//" > chr$c.gff echo "done $c" end #- Load into database - use extended genePred ssh hgwdev cd /cluster/data/hg17/bed/acembly # Reloaded without -genePredExt 1/6/05: ldHgGene -gtf hg17 acembly acembly.ncbi_35.genes.gff/chr*.gff # for entry with 28212470 from chr6.gff, change to chr6 # and for 29124352 in chr6.gff, change to chr6 (1/13/05) echo 'update acembly set chrom = "chr6" where chrom = "chr28212470";' \ | hgsql hg17 echo 'update acembly set chrom = "chr6" where chrom = "chr29124352";' \ | hgsql hg17 # checkTableCoords and runGeneCheck to check data # a number of errors so leave on hgwdev for the moment # checkTableCoords: # rah.acembly has 16 records with chrom not described in chromInfo. # rah.acembly item RPL10A.sNov04 chr6:35544172-35546520: end of last block (35546519) is not the same as chromEnd (35546520). # rah.acembly has 1 records with blockEnd[n-1] != end. # rah.acembly has 1 records with end > chromSize. # chr6 acembly exon 35545934 35546101 . + 0 # gene_id RPL10A; transcript_id RPL10A.sNov04; exon_number 5 # chr6 acembly intron 35546102 35546520 . + 0 # gene_id RPL10A; transcript_id RPL10A.sNov04; intron_type fuzzy # chr6 acembly CDS 35546335 35546384 . + 0 # gene_id RPL10A; transcript_id RPL10A.sNov04; exon_number 6 # chr6 acembly exon 35546335 35546519 . + 0 # gene_id RPL10A; transcript_id RPL10A.sNov04; exon_number 6 # chr6 acembly stop_codon 35546382 35546384 . 
# here the intron overlaps exon 6 so take 35546519 to be txEnd
echo 'update acembly set txEnd = 35546519 where name = "RPL10A.sNov04";' \
    | hgsql hg17
# for record where end > chromSize
echo 'select * from acembly as a, chromInfo as c where c.chrom = a.chrom \
    and c.size < a.txEnd;' | hgsql hg17
# KIR2DL5.bNov04 on chr19_random, chr19_random size is 301858,
# txEnd is 305266, delete this record
echo 'delete from acembly where name = "KIR2DL5.bNov04";' | hgsql hg17
# from runGeneCheck:
# 5780 inFrameStop
# 110664 noStart
# 23085 badCdsSplice
# 23848 noStop
# 14957 badUtrSplice
# 3661 gap
# 4726 badFrame
# 261066 lines in genePred.tab
# e-mailed authors of data (2004-12-21, hartera)
# notiri.aNov04 - has ctg instead of atg at start. others have no start specified: sirora.nNov04
# sirora.zmNov04 - chr1:19389-19392 is AAC (gtt) (-)
# sirora.sNov04 - chr1:8925-8928 CAA (ttg) (-)
# sirora.rNov04 - chr1:8925-8928 CAA (ttg) (-)
# for entries with 28212470 and 29124352 instead of chr6 change to chr6
# Re-process this x1 file to chr6.gff (2005-01-24)
mv x1.acemblygenes.6.gff x1.acemblygenes.6.gff.broken
sed -e "s/^28212470/6/" x1.acemblygenes.6.gff.broken | sed -e \
    "s/^29124352/6/" > x1.acemblygenes.6.gff
grep -v ^6\| x1.acemblygenes.6.gff | grep -v ^Hs | perl -wpe 's/^/chr/;' | \
    grep -v "^chr//" > chr6.gff
# Received a list of genes from Jean and Danielle Mieg
# showing genes that are "main", "putative" or "cloud" - there should be
# no "cloud" genes in our data set (2005-01-11)
# download acembly_gene_lists.tar.gz from e-mail
cd /cluster/data/hg17/bed/acembly
tar xvzf acembly_gene_lists.tar.gz
cd acembly_gene_lists
cat << '_EOF_' > getIDs.pl
#!/usr/bin/perl -w
use strict;
while (<STDIN>) {
    my @f = split(/\s+/);
    for (my $i = 0; $i <= $#f; $i++) {
        if ($f[$i] =~ /gene_id$/) {
            # if field is ID type then next value is the ID
            my $id = $f[$i+1];
            # remove ";" at end and print ID
            chop $id;
            print "$id\n";
        }
    }
}
'_EOF_'
chmod +x getIDs.pl
# get gene IDs from gff files
foreach f (../acembly.ncbi_35.genes.gff/chr*.gff)
    echo "Processing $f"
    perl getIDs.pl < $f >> genesGffs.ids
end
# sort and dedupe the gene IDs from the gffs
sort genesGffs.ids | uniq > genesGffs.ids.uniq
# reformat gene list to get just the genes and remove first 2 lines and sort
foreach g (*.list)
    sed -e 's/"//g;' $g | sed -e 's/Gene : //;' | sed -e '1,2d' \
        | sort | uniq > $g.IDsort
end
# remove back slash from some names
perl -pi.bak -e 's/\\//' *.IDsort
# check if cloud genes appear in gff files list of genes
# list of genes in cloud but not in gff
comm -13 genesGffs.ids.uniq cloud_gene.list.IDsort > gffvscloud.out
diff gffvscloud.out cloud_gene.list.IDsort
# there is no difference so none of the cloud genes are in the gff files
# check if all the other genes in the main and putative lists are in gffs
comm -13 genesGffs.ids.uniq main_gene.list.IDsort > gffvsmain.out
comm -13 genesGffs.ids.uniq putative_gene.list.IDsort > gffvsputative.out
wc -l *.out
# 14 gffvsmain.out
# 0 gffvsputative.out
# there are 14 genes in the main set not in the gff files
# actually there are 12, as FCA/MR and SLA/LP are in the gff files
# all putative genes are in the gff set
wc -l main_gene.list.IDsort putative_gene.list.IDsort
# 52467 main_gene.list.IDsort
# 43978 putative_gene.list.IDsort
# 96445 total
wc -l genesGffs.ids.uniq
# 97042 genesGffs.ids.uniq
# check discrepancy
cat main_gene.list.IDsort putative_gene.list.IDsort > mp.ids
sort mp.ids > mp.sort
comm -23 genesGffs.ids.uniq mp.sort > gffNotMP.out
wc -l gffNotMP.out
# 609 gffNotMP.out
# create table of Acembly gene classifications
# see http://www.ncbi.nlm.nih.gov/IEB/Research/Acembly/index.html?human
# in FAQ, describes main, putative and cloud genes. The cloud genes are not
# well confirmed and so they are not in this data set.
# NEED TO FILTER GENES AND RELOAD TABLES:
# authors Jean and Danielle Mieg e-mailed back. The 12 genes in the
# putative list that are not in the gff files were not exported
# as they did not find a single putative protein to describe so they
# were not added to the gffs. They will be added at a later date.
# Remove these from the acemblyClass table (2005-01-21, hartera)
# Reload acemblyClass table as problems with the gene names
# the class table has gene IDs and the acembly table has transcript IDs
# it is hard to look up class in the class table since just removing the
# transcript ID suffixes (e.g. "aNov04" after a ".") does not work as
# some gene IDs have a "." in them anyway.
ssh kksilo
cd /cluster/data/hg17/bed/acembly/acembly_gene_lists
comm -13 gffvsmain.out main_gene.list.IDsort > main_gene.list.filt
wc -l main_gene.list.filt
# 52455 main_gene.list.filt
ssh hgwdev
cd /cluster/data/hg17/bed/acembly/acembly_gene_lists
# drop acemblyClass table and recreate (2005-01-27, hartera)
echo 'drop table acemblyClass;' | hgsql hg17
# prepare a file of genes and classification
# use transcript IDs - get these and the corresponding gene IDs from the
# gff files. If gene IDs were used instead, the transcript ID (the name
# column of the acembly genePred table) would have to be parsed down to a
# gene ID, which is difficult: e.g. notiri.aNov04 is a transcript ID, so
# the suffix after "." could be removed to obtain the gene ID, but some
# gene names have a "." in them anyway and not all have the suffix.
# 260446 transcript IDs (use allFiltered.gff - see below)
# getClass.pl (script not reproduced in this doc) presumably pairs each
# transcript ID in allFiltered.gff with its gene ID and class, writing
# classes.txt
perl getClass.pl main_gene.list.filt putative_gene.list.IDsort \
    ../acembly.ncbi_35.genes.gff/allFiltered.gff
foreach f (main_gene.list.filt putative_gene.list.IDsort)
    if ($f == "main_gene.list.filt") then
        set t = "main"
    endif
    if ($f == "putative_gene.list.IDsort") then
        set t = "putative"
    endif
    awk 'BEGIN {OFS="\t"} {print $1, "'$t'"}' $f >> class.txt
end
sort classes.txt | uniq > geneIDtxID.class
# get transcript ID and class fields for acemblyClass table
awk 'BEGIN {OFS="\t"} {print $2,$3}' geneIDtxID.class > acemblyClass.tab
wc -l acemblyClass.tab
# 260446 acemblyClass.tab
# make change to acemblyClass.as and check in:
# change name to be transcript ID instead of gene ID
cat << '_EOF_' > $HOME/kent/src/hg/lib/acemblyClass.as
table acemblyClass
"Class for Acembly genes"
    (
    string name;	"Transcript ID for Acembly gene"
    string class;	"Class of gene"
    )
'_EOF_'
cd $HOME/kent/src/hg/lib/
autoSql acemblyClass.as acemblyClass
mv acemblyClass.h $HOME/kent/src/hg/inc
# do make to check it works and commit the .as, .sql, .c and .h files to CVS
cd /cluster/data/hg17/bed/acembly/acembly_gene_lists
echo "drop table acemblyClass" | hgsql hg17
hgsql hg17 < ~/kent/src/hg/lib/acemblyClass.sql
# reload table with transcript IDs
echo "load data local infile 'acemblyClass.tab' into table acemblyClass" \
    | hgsql hg17
# There were also 609 genes in the gff files that are not in the
# main, putative or cloud gene lists. Jean and Danielle Mieg say that
# these were filtered out from their data set but not from the gff files.
# Remove these from the gff files. (gffNotMP.out) (2005-01-24)
cd /cluster/data/hg17/bed/acembly/acembly.ncbi_35.genes.gff
cat chr*.gff > all.gff
cat << '_EOF_' > removeGenes.pl
#!/usr/bin/perl -w
use strict;

my $genes = $ARGV[0];
my $gff = $ARGV[1];

open(GENES, $genes) || die "Can not open $genes:$!\n";
open(GFF, $gff) || die "Can not open $gff:$!\n";
open(OUT, ">removed.out") || die "Can not open removed.out:$!\n";

my %genes;
while (<GENES>) {
    chomp;
    my $g = $_;
    $genes{$g} = 1;
}
close GENES;

while (<GFF>) {
    my $l = $_;
    my $id;
    my @line = split(/\s+/);
    for (my $i = 0; $i <= $#line; $i++) {
        if ($line[$i] eq "gene_id") {
            $id = $line[$i+1];
        }
    }
    $id =~ s/;//;
    if (!exists($genes{$id})) {
        print $l;
    } else {
        print OUT $l;
    }
}
'_EOF_'
perl removeGenes.pl ../acembly_gene_lists/gffNotMP.out all.gff \
    > allFiltered.gff
# checked that gene IDs in the removed.out file are the same
# as those in gffNotMP.out
# reload into the acembly table
ssh hgwdev
cd /cluster/data/hg17/bed/acembly
echo 'drop table acembly;' | hgsql hg17
# Reloaded with filtered set 2005-01-23, reload again 2005-01-28 with
# the genePredExt option to get gene ID in name 2 field
ldHgGene -gtf -genePredExt hg17 acembly \
    acembly.ncbi_35.genes.gff/allFiltered.gff
# Read 260446 transcripts in 3656676 lines in 1 files
# 260446 groups 41 seqs 1 sources 5 feature types
# 260446 gene predictions
# remove cdsStartStat, cdsEndStat and exonFrames fields
echo 'alter table acembly drop column cdsStartStat;' | hgsql hg17
echo 'alter table acembly drop column cdsEndStat;' | hgsql hg17
echo 'alter table acembly drop column exonFrames;' | hgsql hg17
# fix problem data found by checkTableCoords
# here the intron overlaps exon 6 so take 35546519 to be txEnd
echo 'update acembly set txEnd = 35546519 where name = "RPL10A.sNov04";' \
    | hgsql hg17
# for record where end > chromSize
echo 'select * from acembly as a, chromInfo as c where c.chrom = a.chrom \
    and c.size < a.txEnd;' | hgsql hg17
# KIR2DL5.bNov04 on chr19_random, size is 301858, txEnd is 305266
# delete this record
echo 'delete from acembly where name = "KIR2DL5.bNov04";' | hgsql hg17
# acembly peptide table
# need to just grab same sequences that are in acembly
cd ./acembly.ncbi_35.genes.proteins.fasta
echo 'select name from acembly;' | hgsql -N hg17 > acembly.name
cat *.fasta > allPep.fa
faSomeRecords allPep.fa acembly.name acemblyPep.fa
# PEPTIDE SEQUENCES NOT LOADED
# There are 236,554 peptide names that do not match transcript IDs in
# the acembly table and 110,278 transcript IDs in acembly that do not
# have a corresponding peptide. Waiting for response about this from
# Jean and Danielle (2005-01-31)
# hgPepPred hg17 generic acemblyPep \
#     acembly.ncbi_35.genes.proteins.fasta/*.fasta
# Edit hgTracks.c to get colour coded tracks based on the gene class
# for each gene as read from the acemblyClass table.
# Edits to hui.c, hgTrackUi.c and hgTracks.c to allow filtering of
# genes based on class.
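# Quick consistency check (a sketch, not part of the original build): every
# transcript loaded into acembly should now have a class in acemblyClass;
# a non-zero count here would mean the two tables are out of sync.
hgsql -N hg17 -e 'select count(*) from acembly a left join acemblyClass c on a.name = c.name where c.name is null'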
# acembly trackDb entry: # track acembly # shortLabel Acembly Genes # longLabel AceView Gene Models With Alt-Splicing # group genes # priority 41 # visibility dense # color 155,0,125 # type genePred acemblyPep acemblyMrna # url http://www.ncbi.nih.gov/IEB/Research/Acembly/av.cgi?db=hg17&l=$$ # itemClassTbl acemblyClass # geneClasses main putative # gClass_main 128,0,125 # gClass_putative 200,0,125 # urlLabel Transcript ID: # search added: # searchTable acembly # searchType genePred # searchMethod prefix # termRegex [^[:space:]]+ # searchPriority 50 # Received data with gene product relationship from Jean Thierry-Mieg # (2005-02-17) ssh eieio cd /cluster/data/hg17/bed/acembly/acembly.ncbi_35.genes.proteins.fasta wget --timestamp ftp://ftp.ncbi.nlm.nih.gov/repository/acedb/human/acembly.ncbi_35.gene2product.txt.gz gunzip acembly.ncbi_35.gene2product.txt.gz # these are gene ID and product mappings, need transcript ID to product # mappings. E-mailed Jean Thierry-Mieg to ask for this information # BUILD WGRNA TRACK (DONE, 2004-12-13, Fan) # Grab data from original miRNA track and convert them into wgRna .tab format. hgsql hg17 --skip-column-names -e 'select * from miRNA' >miRNA.out cat miRNA.out | awk {'print $2"\t"$3"\t"$4"\t"$5"\t"$6"\t"$7"\t"$8"\t"$9"\t""miRna"'} >wgRna.tab # Break the original custom track data file, hsa-snoRNA_track.txt, into two files j1 and j2, # then remove header and blank lines. cat j1 | awk {'print $1"\t"$2"\t"$3"\t"$4"\t"$5"\t"$6"\t"0"\t"0"\t""CDBox"'} >>wgRna.tab cat j2 | awk {'print $1"\t"$2"\t"$3"\t"$4"\t"$5"\t"$6"\t"0"\t"0"\t""HAcaBox"'} >>wgRna.tab # load into wgRna table hgLoadBed -sqlTable=/cluster/home/fanhsu/hg/lib/wgRna.sql hg17 wgRna wgRna.tab # create and edit wgRna.html under src/hg/makeDb/trackDb/human/hg17. # RELOADED wgRna DATA USING wgRNA_corrected.txt SENT BY MICHEL WEBER # Manually removed the first header line and the first column of the bin field and removed # the last empty line. cut -f 2- wgRNA_corrected.txt >wgRna.tab vi wgRna.tab hgLoadBed -sqlTable=/cluster/home/fanhsu/hg/lib/wgRna.sql hg17 wgRna wgRna.tab # UPDATED WGRNA DATA PER EMAIL FROM WEBER (2004-12-14, Fan). # Added the following 3 lines to j1 chr3 161715396 161715726 U90 480 - chr11 93104316 93104387 Z40 480 - chr11 93106041 93106114 mgh28S-2410 480 - # Regenerated wgRna table cat miRNA.out | awk {'print $2"\t"$3"\t"$4"\t"$5"\t"$6"\t"$7"\t"$8"\t"$9"\t""miRna"'} >wgRna.tab cat j1 | awk {'print $1"\t"$2"\t"$3"\t"$4"\t"$5"\t"$6"\t"0"\t"0"\t""CDBox"'} >>wgRna.tab cat j2 | awk {'print $1"\t"$2"\t"$3"\t"$4"\t"$5"\t"$6"\t"0"\t"0"\t""HAcaBox"'} >>wgRna.tab hgLoadBed -sqlTable=/cluster/home/fanhsu/hg/lib/wgRna.sql hg17 wgRna wgRna.tab # Changed the following records to RNA type scaRna. 
hgsql hg17 -e 'update wgRna set type = "scaRna" where name ="U85"'
hgsql hg17 -e 'update wgRna set type = "scaRna" where name ="U87"'
hgsql hg17 -e 'update wgRna set type = "scaRna" where name ="U88"'
hgsql hg17 -e 'update wgRna set type = "scaRna" where name ="U89"'
hgsql hg17 -e 'update wgRna set type = "scaRna" where name ="U90"'
hgsql hg17 -e 'update wgRna set type = "scaRna" where name ="U91"'
hgsql hg17 -e 'update wgRna set type = "scaRna" where name ="U92"'
hgsql hg17 -e 'update wgRna set type = "scaRna" where name ="U93"'
hgsql hg17 -e 'update wgRna set type = "scaRna" where name ="U100"'
hgsql hg17 -e 'update wgRna set type = "scaRna" where name ="ACA26"'
hgsql hg17 -e 'update wgRna set type = "scaRna" where name ="ACA35"'
hgsql hg17 -e 'update wgRna set type = "scaRna" where name ="ACA45"'
hgsql hg17 -e 'update wgRna set type = "scaRna" where name ="ACA47"'
hgsql hg17 -e 'update wgRna set type = "scaRna" where name ="ACA57"'
hgsql hg17 -e 'update wgRna set type = "scaRna" where name ="HBII-382"'
hgsql hg17 -e 'update wgRna set type = "scaRna" where name ="mgU2-19/30"'
hgsql hg17 -e 'update wgRna set type = "scaRna" where name ="mgU2-25/61"'
hgsql hg17 -e 'update wgRna set type = "scaRna" where name ="mgU2-22/U4-8"'
# Updated .../trackDb/human/hg17/wgRna.html.

# MAKE VSDANRER2 DOWNLOADABLES (DONE, 2004-12-14, hartera)
# REMAKE FOR CHAINS AND NET AFTER USING chainAntiRepeat
# (DONE, 2004-12-22, hartera)
ssh hgwdev
cd /cluster/data/hg17/bed/blastz.danRer2/axtChrom
set gp = /usr/local/apache/htdocs/goldenPath/hg17
mkdir -p $gp/vsDanRer2/axtChrom
cp -p *.axt $gp/vsDanRer2/axtChrom
cd $gp/vsDanRer2/axtChrom
gzip *.axt
md5sum *.gz > md5sum.txt
# copy chains and nets to downloads area
# re-make chains and net downloadables (2004-12-22, hartera)
rm $gp/vsDanRer2/zebrafish*.gz $gp/vsDanRer2/md5sum.txt
cd /cluster/data/hg17/bed/blastz.danRer2/axtChain
gzip -c all.chain.antirepeat > \
    /cluster/data/hg17/zip/zebrafishDanRer2.chain.gz
gzip -c zfishdanRer2.net > /cluster/data/hg17/zip/zebrafishDanRer2.net.gz
cd $gp/vsDanRer2
mv /cluster/data/hg17/zip/zebrafish*.gz .
md5sum *.gz > md5sum.txt
# Copy over & edit README.txt w/pointers to chain, net formats.
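# Optional integrity check on the downloads area (a sketch, not in the
# original log): md5sum -c re-verifies the gzipped files against the
# checksums generated above.
cd $gp/vsDanRer2
md5sum -c md5sum.txt
(cd axtChrom && md5sum -c md5sum.txt)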
# CLEANUP DANRER2 BLASTZ (DONE, 2004-12-14, hartera) # RE-DONE (DONE, 2004-12-22, hartera) # REMOVED RAW AND LAV DIRS (DONE, 2005-02-24, hartera) ssh eieio cd /cluster/data/hg17/bed/blastz.danRer2 nice rm axtChain/run1/chain/* & nice rm -fr axtChain/n1 axtChain/noClass.net & nice gzip axtChrom/* pslChrom/* axtChain/all.chain axtChain/all.chain.unfiltered axtChain/*.net & nice gzip axtChain/all.chain.antirepeat axtChain/chainAR/*.chain & nice rm -fr axtChain/chain axtChain/chainRaw axtChain/preNet & nice rm -rf raw & nice rm -rf lav & # EXTRACT AXT'S AND MAF'S FROM TETRAODON (tetNig1) NET # (DONE, 2004-12-15, hartera) # Redo to remove overlaps (2006-04-07 kate) ssh eieio # create axts cd /cluster/data/hg17/bed/blastz.tetNig1/axtChain netSplit tetNig1.net tetraodonNet mkdir -p ../axtNet cat > axtNet.csh << 'EOF' foreach f (tetraodonNet/chr*.net) set c = $f:t:r echo "axtNet on $c" netToAxt tetraodonNet/$c.net chain/$c.chain \ /cluster/data/hg17/nib /cluster/data/tetNig1/nib ../axtNet/$c.axt echo "Complete: $c.net -> $c.axt" end 'EOF' chmod +x axtNet.csh csh axtNet.csh >&! axtNet.log & tail -100f axtNet.log # sort axts before making mafs - must be sorted for multiz cd /cluster/data/hg17/bed/blastz.tetNig1 mv axtNet axtNet.unsorted mkdir axtNet foreach f (axtNet.unsorted/*.axt) set c = $f:t:r echo "Sorting $c" axtSort $f axtNet/$c.axt end # create maf ssh eieio cd /cluster/data/hg17/bed/blastz.tetNig1 cd axtNet mkdir ../mafNet cat > makeMaf.csh << 'EOF' foreach f (chr*.axt) set maf = $f:t:r.tetNig1.maf echo translating $f to $maf axtToMaf $f \ /cluster/data/hg17/chrom.sizes /cluster/data/tetNig1/chrom.sizes \ ../mafNet/$maf -tPrefix=hg17. -qPrefix=tetNig1. end 'EOF' csh makeMaf.csh >&! makeMaf.log & tail -100f makeMaf.log nice gzip axtChrom/* pslChrom/* axtChain/all.chain axtChain/*.net & # redo axt's and maf's to remove overlaps (2006-04-07 kate) ssh kkstore02 cd /cluster/data/hg17/bed/blastz.tetNig1 mv axtNet axtNet.old mv mafNet mafNet.old mkdir -p axtNet mafNet cd axtChain cat > fix.csh << 'EOF' date foreach f (tetraodonNet/chr*.net) set c = $f:t:r echo $c netToAxt tetraodonNet/$c.net chain/$c.chain \ /cluster/data/hg17/nib /cluster/data/tetNig1/nib stdout | \ axtSort stdin ../axtNet/$c.axt echo "Complete: $c.net -> $c.axt" axtToMaf ../axtNet/$c.axt \ /cluster/data/hg17/chrom.sizes /cluster/data/tetNig1/chrom.sizes \ ../mafNet/$c.maf -tPrefix=hg17. -qPrefix=tetNig1. end date 'EOF' csh fix.csh >&! 
fix.log & cd /san/sanvol1/scratch/hg17/mafNet rm -fr tetNig1 cp -rp /cluster/data/hg17/bed/blastz.tetNig1/mafNet tetNig1 # 10-WAY MULTIZ -- 8-WAY PLUS FROG AND TETRA (DONE 2004-12-22 kate) # Use older multiz (not v10) till bugs fixed ssh eieio cd /cluster/data/hg17/bed rm multiz10way mkdir multiz.2004-12-22 ln -s multiz.2004-12-22 multiz10way cd multiz10way cat > tree.nh << 'EOF' ((((((hg17,panTro1),(rn3,mm5)),canFam1),galGal2),xenTro1),((fr1,tetNig1),danRer1)) 'EOF' mkdir /cluster/bluearc/hg17/multiz.2004-12-22 cd /cluster/bluearc/hg17 mkdir 2004-12-22 rm multiz10way ln -s multiz.2004-12-17 multiz10way.v10 ln -s multiz.2004-12-22 multiz10way # reuse pairwise MAF's on bluearc mv multiz10way.v10/{canFam1,danRer1,fr1,galGal2,mm5,panTro1,rn3,tetNig1,xenTro1} multiz10way # NOTE: pairwise mafs were moved to /cluster/bluearc/hg17/mafNet # make output dir and run dir ssh kk9 cd /cluster/data/hg17/bed cd multiz10way mkdir -p maf mkdir -p run cd run # create scripts to run multiz on cluster cat > oneMultiz.csh << 'EOF' #!/bin/csh -fe set c = $1 set multi = /scratch/$user/multiz10way.$c set pairs = /cluster/bluearc/hg17/multiz10way # special mode -- # with 1 arg, cleanup if ($#argv == 1) then rm -fr $multi exit endif set s1 = $2 set s2 = $3 # locate input files -- in pairwise dir, or multiple dir set d1 = $multi set d2 = $multi if (-d $pairs/$s1) then set d1 = $pairs endif if (-d $pairs/$s2) then set d2 = $pairs endif set f1 = $d1/$s1/$c.maf set f2 = $d2/$s2/$c.maf # write to output dir set out = $multi/${s2}${s1} mkdir -p $out # check for empty input file if (-s $f1 && -s $f2) then echo "Aligning $f1 $f2" /cluster/bin/penn/multiz $f1 $f2 - > $out/$c.maf else if (-s $f1) then cp $f1 $out else if (-s $f2) then cp $f2 $out endif 'EOF' # << for emacs chmod +x oneMultiz.csh cat > allMultiz.csh << 'EOF' #!/bin/csh -fe set c = $1 oneMultiz.csh $c mm5 panTro1 oneMultiz.csh $c rn3 panTro1mm5 oneMultiz.csh $c canFam1 panTro1mm5rn3 oneMultiz.csh $c galGal2 panTro1mm5rn3canFam1 oneMultiz.csh $c xenTro1 panTro1mm5rn3canFam1galGal2 oneMultiz.csh $c fr1 panTro1mm5rn3canFam1galGal2xenTro1 oneMultiz.csh $c tetNig1 panTro1mm5rn3canFam1galGal2xenTro1fr1 oneMultiz.csh $c danRer1 panTro1mm5rn3canFam1galGal2xenTro1fr1tetNig1 # get final alignment file cp /scratch/$user/multiz10way.$c/panTro1mm5rn3canFam1galGal2xenTro1fr1tetNig1danRer1/$c.maf /cluster/data/hg17/bed/multiz10way/maf/$c.maf #cleanup oneMultiz.csh $c 'EOF' # << for emacs chmod +x allMultiz.csh cat > gsub << 'EOF' #LOOP allMultiz.csh $(root1) {check out line+ /cluster/data/hg17/bed/multiz10way/maf/$(root1).maf} #ENDLOOP 'EOF' # << for emacs cut -f 1 /cluster/data/hg17/chrom.sizes > chrom.lst gensub2 chrom.lst single gsub jobList para create jobList para try; para check para push # post-process multiz maf with maf_project to "glue" short # alignment blocks together ssh eieio cd /cluster/data/hg17/bed/multiz10way.v8 mkdir -p mafGlued cd maf foreach f (*.maf) set c = $f:r echo "gluing $f" /cluster/bin/penn/maf_project $f hg17.$c > ../mafGlued/$c.maf end # filter out alignment blocks with no alignments in non-reference species, # and low-scoring alignments based on Webb Miller's latest # recommendations (score < -5 * ncol^2 * nrow) # NOTE: Webb hasn't approved the filtered alignments yet, # so leaving them in for now. #mkdir -p mafFiltered #cd ../mafGlued #foreach f (*.maf) #set c = $f:r #echo "filtering $f" #~kate/bin/i386/mafFilter -factor $f > ../mafFiltered/$c.maf #end #cd .. 
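# Completeness check on the multiz cluster run above (a sketch, not in the
# original log): every chromosome in the run's chrom.lst should have a
# final, non-empty maf before the gluing and stats below.
cd /cluster/data/hg17/bed/multiz10way
foreach c (`cat run/chrom.lst`)
    if (! -s maf/$c.maf) echo "missing or empty: $c.maf"
end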
grep score mafGlued/chr1.maf | wc -l grep score mafFiltered/chr1.maf | wc -l grep score mafGlued/bad | wc -l # 43692 grep score=0.0 bad | wc -l # 10206 # load alignments into tables ssh hgwdev cd /cluster/data/hg17/bed/multiz10way.v8 set mafDir = /gbdb/hg17/mafNet mkdir -p $mafDir # multiple alignment set mafDir = /gbdb/hg17/multiz10way/maf mkdir -p $mafDir/multiz10way cd /cluster/data/hg17/bed/multiz10way.v8/mafGlued ln -s `pwd`/*.maf $mafDir/multiz10way hgLoadMaf hg17 -warn multiz10way -pathPrefix=$mafDir/multiz10way # load summary table to replace pairwise cd /cluster/data/hg17/bed/multiz10way.v8/mafGlued/ time cat chr*.maf | hgLoadMafSummary -minSize=10000 -mergeGap=500 -maxSize=50000 hg17 multiz10waySummary stdin # Processed 27314693 components in 9081437 mafs from stdin # 30 minutes # CONSERVATION SCORING WITH PHASTCONS (DONE 2005-01-14 kate) # 1. Partition multiple alignment into windows, using "msa_split" # 2. Create starting tree model, with branch lengths # use "phyloFit" on alignments # 3. Estimate GC avg. over all species, use "msa_view" on maf # 4. Estimate other model params, using phastCons (via doEstimate script) # NOTE: no alignment filtering done -- the scores don't look # particularly meaningful w/ this version of multiz. # Next time, run on "glued" (maf_projected) ssh eieio cd /cluster/data/hg17/bed/multiz10way.v8 set mafDir = /cluster/bluearc/hg17/multiz10way.v8/maf mkdir -p $mafDir cp -r maf/*.maf $mafDir ssh kk9 cd /cluster/data/hg17/bed/multiz10way.v8 mkdir cons cd cons # break up the genome-wide MAFs into pieces # NOTE: chrom fasta files are already on the bluearc # from previous run mkdir /cluster/bluearc/hg17/chrom cd /cluster/data/hg17 foreach f (`cat chrom.lst`) echo $f cp -r $f/*.fa /cluster/bluearc/hg17/chrom end cd /cluster/data/hg17/bed/multiz10way.v8/cons mkdir run.split cd run.split set WINDOWS = /cluster/bluearc/hg17/multiz10way.v8/cons/WINDOWS rm -fr $WINDOWS mkdir -p $WINDOWS cat << 'EOF' > doSplit.sh #!/bin/sh PHAST=/cluster/bin/phast FA_SRC=/cluster/bluearc/hg17/chrom WINDOWS=/cluster/bluearc/hg17/multiz10way.v8/cons/WINDOWS maf=$1 c=`basename $maf .maf` echo $c mkdir -p /scratch/msa_split ${PHAST}/msa_split $maf -i MAF -M ${FA_SRC}/$c.fa -O hg17,panTro1,mm5,rn3,canFam1,galGal2,xenTro1,fr1,tetNig1,danRer1 -w 1000000,0 -r /scratch/msa_split/$c -o SS -I 1000 -B 5000 [ $? -eq 0 ] || exit 1 echo "Copying..." cd /scratch/msa_split for file in $c.*.ss ; do gzip -c $file > ${WINDOWS}/$file.gz ; done [ $? -eq 0 ] || exit 1 rm -f /scratch/msa_split/$c.*.ss echo "Done copying" echo "Done" >> ${WINDOWS}/$c.done 'EOF' # << for emacs set WINDOWS = /cluster/bluearc/hg17/multiz10way.v8/cons/WINDOWS chmod +x doSplit.sh rm -f jobList foreach file (/cluster/bluearc/hg17/multiz10way.v8/maf/*.maf) set c = $file:t:r echo "doSplit.sh $file {check out line+ $WINDOWS/$c.done}" >> jobList end para create jobList # 46 jobs para try para check # TODO: cleanup # rm -fr $mafDir # now generate conservation scores and predicted elements set path = ($path /cluster/bin/phast); rehash cd /cluster/data/hg17/bed/multiz10way.v8/cons mkdir run.elements cd run.elements # create a starting tree model from a chr1 ss files in WINDOWS dir. 
ssh kolossus
cd /cluster/data/hg17/bed/multiz10way.v8/cons/run.elements
gunzip -c /cluster/bluearc/hg17/multiz10way.v8/cons/WINDOWS/chr1.14996059-15998256.ss.gz \
    > /tmp/phastCons.$$
phyloFit -i SS /tmp/phastCons.$$ --out-root starting-tree --tree \
    "((((((hg17,panTro1),(mm5,rn3)),canFam1),galGal2),xenTro1),((fr1,tetNig1),danRer1))"
rm /tmp/phastCons.$$
cat starting-tree.mod
#ALPHABET: A C G T
#ORDER: 0
#SUBST_MOD: REV
#TRAINING_LNL: -2635749.517410
#BACKGROUND: 0.247225 0.248374 0.250827 0.253574
#RATE_MAT:
#-0.997890 0.201447 0.648573 0.147870
#0.200515 -1.020796 0.190184 0.630096
#0.639258 0.188324 -1.025170 0.197587
#0.144168 0.617176 0.195447 -0.956791
#TREE: ((((((hg17:0.006401,panTro1:0.008342):0.099376,(mm5:0.083404,rn3:0.105411):0.242694):0.020883,canFam1:0.221922):0.099131,galGal2:0.275759):0.041997,xenTro1:0.280306):0.064815,((fr1:0.137674,tetNig1:0.091463):0.118573,danRer1:0.250847):0.064815);

# estimate model parameters
# estimate avg. cross-species avg. GC content from chr1 maf's
ssh kolossus
set path = ($path /cluster/bin/phast); rehash
cd /cluster/data/hg17/bed/multiz10way.v8/cons/run.elements
msa_view --aggregate hg17,panTro1,rn3,mm5,canFam1,galGal2,xenTro1,danRer1,tetNig1,fr1 \
    -i MAF \
    --summary-only /cluster/data/hg17/bed/multiz10way.v8/maf/chr1.maf \
    > maf_summary.txt
awk '$1 == "[aggregate]" {printf "%0.3f\n", $3 + $4}' maf_summary.txt
# 0.424
# generate models from random sample of genome (use 90 1Mb windows,
# to conveniently run on rack 9 100-node cluster)
# On first pass, used parameters from 8way alignment:
# --expected-lengths 12 --target-coverage .17
# NOTE: there may be a cleverer way to select the first length param
# On second pass, used parameters below, based on consEntropy
# and featureBits coverage of elements, below
cat << 'EOF' > doEstimate.sh
#!/bin/sh
zcat $1 | /cluster/bin/phast/phastCons - starting-tree.mod --gc 0.424 --nrates 1,1 --no-post-probs --ignore-missing --expected-lengths 11 --target-coverage 0.20 --quiet --log $2 --estimate-trees $3
'EOF'
chmod u+x doEstimate.sh
rm -fr LOG TREES
mkdir -p LOG TREES
ls /cluster/bluearc/hg17/multiz10way.v8/cons/WINDOWS/*.gz > all.windows
/cluster/bin/phast/chooseLines -k 90 all.windows > subset.windows
rm -f jobs.lst
foreach f (`cat subset.windows`)
    set root = $f:t:r:r
    echo doEstimate.sh /cluster/bluearc/hg17/multiz10way.v8/cons/WINDOWS/$f LOG/$root.log TREES/$root >> jobs.lst
end
# run cluster job (about an hour)
ssh kk9
cd /cluster/data/hg17/bed/multiz10way.v8/cons/run.elements
para create jobs.lst
# 90 jobs written to batch
para try; para check
para push
# 2 jobs crashed with out-of-mem; as we are just taking a sample
# this is probably OK, but I've notified Adam
# Average job time: 1055s 17.58m 0.29h 0.01d
# Longest job: 3647s 60.78m 1.01h 0.04d
# NOTE: should have used ave.noncons.mod to improve parameter estimation
# cp ave.noncons.mod starting-tree.mod
ls TREES/*.cons.mod > cons.txt
/cluster/bin/phast/phyloBoot --read-mods '*cons.txt' --output-average ave.cons.mod > cons_summary.txt
grep TREE ave.cons.mod
# TREE: ((((((hg17:0.002313,panTro1:0.002931):0.036375,(mm5:0.029849,rn3:0.039008):0.095334):0.003258,canFam1:0.078205):0.047189,galGal2:0.158045):0.020103,xenTro1:0.169387):0.028857,((fr1:0.071610,tetNig1:0.057766):0.091165,danRer1:0.138905):0.028857);
ls TREES/*.noncons.mod > noncons.txt
/cluster/bin/phast/phyloBoot --read-mods '*noncons.txt' --output-average ave.noncons.mod > noncons_summary.txt
grep TREE ave.noncons.mod
# TREE:
((((((hg17:0.007342,panTro1:0.009340):0.116009,(mm5:0.095037,rn3:0.124288):0.304355):0.010633,canFam1:0.249367):0.151476,galGal2:0.507037):0.064317,xenTro1:0.549121):0.094733,((fr1:0.231246,tetNig1:0.185161):0.296288,danRer1:0.446734):0.094733); # analyze conservation genome-wide cat << 'EOF' > doPhastCons.sh #!/bin/sh mkdir -p /cluster/bluearc/hg17/multiz10way.v8/phastCons/POSTPROBS /cluster/bluearc/hg17/multiz10way.v8/phastCons/ELEMENTS pref=`basename $1 .ss.gz` chr=`echo $pref | awk -F\. '{print $1}'` tmpfile=/scratch/phastCons.$$ zcat $1 | /cluster/bin/phast/phastCons - ave.cons.mod,ave.noncons.mod --expected-lengths 11 --target-coverage 0.20 --quiet --seqname $chr --idpref $pref --viterbi /cluster/bluearc/hg17/multiz10way.v8/phastCons/ELEMENTS/$pref.bed --score --require-informative 0 > $tmpfile gzip -c $tmpfile > /cluster/bluearc/hg17/multiz10way.v8/phastCons/POSTPROBS/$pref.pp.gz rm $tmpfile 'EOF' chmod u+x doPhastCons.sh rm -fr /cluster/bluearc/hg17/multiz10way.v8/phastCons/POSTPROBS /cluster/bluearc/hg17/multiz10way.v8/phastCons/ELEMENTS rm -f jobs2.lst foreach f (/cluster/bluearc/hg17/multiz10way.v8/cons/WINDOWS/*.ss.gz) echo doPhastCons.sh $f >> jobs2.lst end # run cluster job (it's quick -- 10 minutes or so) ssh kk cd /cluster/data/hg17/bed/multiz10way.v8/cons/run.elements para create jobs2.lst # 2932 jobs written to batch para try; para check para push # Average job time: 80s 1.33m 0.02h 0.00d # Longest job: 157s 2.62m 0.04h 0.00d # Submission to last job: 583s 9.72m 0.16h 0.01d # combine predictions and transform scores to be in 0-1000 interval # do in a way that avoids limits on numbers of args rm -f splitfiles* all.raw.bed find /cluster/bluearc/hg17/multiz10way.v8/phastCons/ELEMENTS -name "*.bed" > files split files splitfiles foreach s (splitfiles*) awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", $1, $2, $3, $5, $5;}' `cat $s` >> all.raw.bed end /cluster/bin/scripts/lodToBedScore all.raw.bed > all.bed rm files splitfiles* ssh hgwdev cd /cluster/data/hg17/bed/multiz10way.v8/cons/run.elements hgLoadBed hg17 phastConsElements10way all.bed sort -rn -k 5 all.bed | sed -n '1,100000p' > top100K.bed hgLoadBed hg17 phastConsElements10wayTop100K top100K.bed # check coverage -- reran estimation and conservation steps with new parameters till # coverage close to 5% and expected-length parameter is close to consEntropy recommended length featureBits hg17 phastConsElements10way # first pass # .17 12 # 132657993 bases of 2866216770 (4.628%) in intersection # second pass -- used this # .20 11 # 143386170 bases of 2866216770 (5.003%) in intersection featureBits hg17 phastConsElements # 137850739 bases of 2866216770 (4.810%) in intersection # check expected-length parameter # first pass /cluster/bin/phast/consEntropy .17 12 \ ave.cons.mod ave.noncons.mod --NH 9.78 # recommended length 10.4 # second pass -- good enough according to Adam /cluster/bin/phast/consEntropy .20 11 \ ave.cons.mod ave.noncons.mod --NH 9.78 #( Solving for new omega: 11.000000 12.243251 12.155776 12.155369 ) #Transition parameters: gamma=0.200000, omega=11.000000, mu=0.090909, nu=0.022727 #Relative entropy: H=1.263205 bits/site #Required length: N=7.548911 sites #Total entropy: NH=9.535821 bits #Recommended expected length: omega=12.155369 sites (for NH=9.780000) # create wiggle data files ssh eieio cd /cluster/data/hg17/bed/multiz10way.v8/cons # sort post-prob files by chrom position using filename, then # use wigEncode to create binary files for wiggle find /cluster/bluearc/hg17/multiz10way.v8/phastCons/POSTPROBS \ -name 
"*.pp.gz" | sort -t\. -k2,2n | xargs zcat | \ wigEncode stdin phastCons10way.wig phastCons10way.wib hgWiggle -db=hg17 -doHistogram -hBinSize=0.001 \ -hBinCount=1000 -hMinVal=0.0 -db=hg17 phastCons >histo.8way.data hgWiggle -db=hg17 -doHistogram -hBinSize=0.001 \ -hBinCount=1000 -hMinVal=0.0 -db=hg17 phastCons10way >histo.10way.data hgWiggle -db=hg17 -doStats \ phastCons > stats.8way.data hgWiggle -db=hg17 -doStats \ phastCons10way > stats.10way.data # Load gbdb and database with wiggle. ssh hgwdev cd /cluster/data/hg17/bed/multiz10way.v8/cons set wibDir = /gbdb/hg17/multiz10way/wib/phastCons10way mkdir -p $wibDir ln -s `pwd`/phastCons10way.wib $wibDir hgLoadWiggle hg17 phastCons10way phastCons10way.wig \ -pathPrefix=$wibDir # create tree image: # edit tree.nh to create species.nh with common names /cluster/bin/phast/draw_tree -b -s species.nh > species10.ps # photoshop to enhance, then save as gif/jpg cp /cluster/data/hg17/bed/multiz10way.v8/species10.jpg \ /usr/local/apache/htdocs/images/phylo/10way.jpg # get stats on the track ssh hgwdev featureBits hg17 -enrichment refGene:cds phastConsElements10way # refGene:cds 1.020%, phastConsElements10way 5.003%, both 0.711%, cover 69.73%, enrich 13.94x # compare to previous elements (generated from 8way) featureBits hg17 -enrichment refGene:cds phastConsElements # refGene:cds 1.020%, phastConsElements 4.810%, both 0.747%, cover 73.22%, enrich 15.22x # see how gluing reduces number of alignments ssh eieio cd /cluster/data/hg17/bed/multiz10way.v8 mkdir stats grep score maf/chr22.maf | grep -v 0.0 | wc -l #179576 grep score mafGlued/chr22.maf | grep -v 0.0 | wc -l #110550 # look at distribution of alignment sizes after gluing ssh hgwdev cd /cluster/data/hg17/bed/multiz10way.v8 mkdir mafTemp ln -s `pwd`/maf/chr1.maf mafTemp # load temp table hgLoadMaf hg17 -pathPrefix=mafTemp multiz10wayChr1 #Loaded 1246727 mafs # again, compare to glued: echo "SELECT COUNT(*) FROM multiz10way" # 738030 # again, ~40% fewer cd stats echo "SELECT chromEnd - chromStart FROM multiz10way WHERE chrom='chr1'" | \ hgsql -N hg17 | sort -n > chr1.maf.glued.sizes echo "SELECT chromEnd - chromStart FROM multiz10wayChr1"| \ hgsql -N hg17 | sort -n > chr1.maf.sizes # cleanup hgsql hg17 -e "DROP TABLE multiz10wayChr1" rm -fr ../mafTemp # coverage of multiple alignment, and pairs ssh kolossus cd /cluster/data/hg17/bed/multiz10way.v8 cd stats nice mafRanges -notAllOGap ../mafGlued/chr1.maf hg17 \ hg17.chr1.mafRanges.bed nice mafRanges -notAllOGap /cluster/data/hg17/bed/multiz8way/maf/chr1.maf \ hg17 hg17.8way.chr1.mafRanges.bed foreach db (panTro1 canFam1 mm5 rn3 galGal2 xenTro1 fr1 tetNig1 danRer1) echo $db nice mafRanges /cluster/data/hg17/bed/blastz.$db/mafNet/chr1.*maf \ -notAllOGap hg17 $db.chr1.mafRanges.bed ls /cluster/data/hg17/bed/blastz.$db/mafNet/chr1.*maf end ssh hgwdev cd /cluster/data/hg17/bed/multiz10way.v8/stats nice featureBits -chrom=chr1 hg17 refGene:cds hg17.chr1.mafRanges.bed -enrichment # refGene:cds 1.308%, hg17.chr1.mafRanges.bed 95.725%, both 1.307%, cover 99.94%, enrich 1.04x nice featureBits -chrom=chr1 hg17 refGene:cds hg17.8way.chr1.mafRanges.bed -enrichment # refGene:cds 1.308%, hg17.8way.chr1.mafRanges.bed 95.742%, both 1.307%, cover 99.97%, enrich 1.04x foreach db (panTro1 canFam1 mm5 rn3 galGal2 xenTro1 fr1 tetNig1 danRer1) nice featureBits -chrom=chr1 -enrichment hg17 refGene:cds $db.chr1.mafRanges.bed end #refGene:cds 1.308%, panTro1.chr1.mafRanges.bed 93.472%, both 1.264%, cover 96.65%, enrich 1.03x #refGene:cds 1.308%, canFam1.chr1.mafRanges.bed 
55.377%, both 1.277%, cover 97.64%, enrich 1.76x #refGene:cds 1.308%, mm5.chr1.mafRanges.bed 37.342%, both 1.280%, cover 97.92%, enrich 2.62x #refGene:cds 1.308%, rn3.chr1.mafRanges.bed 35.429%, both 1.257%, cover 96.14%, enrich 2.71x #refGene:cds 1.308%, galGal2.chr1.mafRanges.bed 3.840%, both 0.936%, cover 71.61%, enrich 18.65x #refGene:cds 1.308%, xenTro1.chr1.mafRanges.bed 3.059%, both 0.881%, cover 67.36%, enrich 22.02x #refGene:cds 1.308%, fr1.chr1.mafRanges.bed 1.892%, both 0.854%, cover 65.29%, enrich 34.50x #refGene:cds 1.308%, tetNig1.chr1.mafRanges.bed 1.384%, both 0.805%, cover 61.57%, enrich 44.50x #refGene:cds 1.308%, danRer1.chr1.mafRanges.bed 2.716%, both 0.847%, cover 64.81%, enrich 23.86x # MAKE HG17-RN3 OVER.CHAIN FOR LIFTOVER (DONE 1/25/05 angie) ssh kolossus set chainDir = /cluster/data/hg17/bed/blastz.rn3/axtChain netChainSubset $chainDir/rat.net.gz $chainDir/all.chain.gz \ /cluster/data/hg17/bed/bedOver/hg17ToRn3.over.chain # MAKE HG17-GALGAL2 OVER.CHAIN FOR LIFTOVER (DONE 1/25/05 angie) ssh kolossus set chainDir = /cluster/data/hg17/bed/blastz.galGal2/axtChain netChainSubset $chainDir/human.net $chainDir/all.chain \ /cluster/data/hg17/bed/bedOver/hg17ToGalGal2.over.chain # DOWNLOADS FOR 10-WAY MULTIZ (2005-01-24 kate) # Use "glued" mafs ssh hgwdev cd /usr/local/apache/htdocs/goldenPath/hg17 mkdir -p multiz10way cd multiz10way foreach f (/cluster/data/hg17/bed/multiz10way.v8/mafGlued/*.maf) set c = $f:r:t echo $c nice gzip -c $f > $c.maf.gz end # copy README and edit # Create upstream files for download ssh hgwdev cd /cluster/data/hg17/bed/multiz10way.v8 echo hg17 panTro1 mm5 rn3 canFam1 galGal2 xenTro1 fr1 tetNig1 danRer1 > org.txt # mafFrags takes a while foreach i (1000 2000 5000) echo "making upstream$i.maf" nice featureBits hg17 refGene:upstream:$i -fa=/dev/null -bed=up.bad awk -F '\t' '{printf("%s\t%s\t%s\t%s\t%s\t%s\n", $1, $2, $3, substr($4, 0, 9), 0, $5)}' up.bad > up.bed rm up.bad nice mafFrags hg17 multiz10way up.bed upstream$i.maf -orgs=org.txt rm up.bed end ssh eieio cd /cluster/data/hg17/bed/multiz10way.v8 nice gzip upstream{1000,2000,5000}.maf # 6 mins. ssh hgwdev cd /usr/local/apache/htdocs/goldenPath/hg17 mv /cluster/data/hg17/bed/multiz10way.v8/upstream*.maf.gz multiz10way cd multiz10way md5sum *.gz > md5sum.txt # Create histogram of this phastCons data (Hiram - 2005-02-07) ssh hgwdev cd /cluster/data/hg17/bed/multiz.2004-12-22/cons time hgWiggle -doHistogram \ -hBinSize=0.001 -hBinCount=1000 -hMinVal=0.0 -verbose=2 \ -db=hg17 phastCons > histogram.data 2>&1 # 34 minutes cat << '_EOF_' > histo.gp set terminal png small color \ x000000 xffffff xc000ff x66ff66 xffff00 xff0000 xff0000 set size 1.4, 0.8 set key left box set grid noxtics set grid ytics set title " Hg17 Histogram phastCons track" set xlabel "Hg17 phastCons score" set ylabel "p-Value" set y2label "Cumulative Probability Distribution" set y2range [0:1] set y2tics plot "histogram.data" using 2:5 title " pValue" with impulses, \ "histogram.data" using 2:7 axes x1y2 title " CPD" with lines '_EOF_' gnuplot histo.gp > histo.png display histo.png & # BLASTZ BOREOEUTHERIAN (BOREUT1) (DONE 1/29/05 braney) ssh kk mkdir /cluster/data/borEut1/bed/zb.hg17 ln -s /cluster/data/borEut1/bed/zb.hg17 /cluster/data/hg17/bed/blastz.borEut1 cd /cluster/data/hg17/bed/blastz.borEut1 # Use default (Human-Mouse) settings for starters. cat << '_EOF_' > DEF # human vs. 
boreoeutherian (borEut1; this DEF was copied from the human/dog run)
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/i386:/cluster/home/angie/schwartzbin
ALIGN=blastz-run
BLASTZ=blastz
# Default
BLASTZ_H=2000
BLASTZ_ABRIDGE_REPEATS=1
# TARGET: Human
SEQ1_DIR=/scratch/hg/gs.18/build35/bothMaskedNibs
SEQ1_RMSK=
SEQ1_FLAG=
SEQ1_SMSK=/scratch/hg/gs.18/build35/linSpecRep.notInDog
SEQ1_IN_CONTIGS=0
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Boreoeutherian (borEut1)
SEQ2_DIR=/iscratch/i/borEut1/nib
SEQ2_RMSK=
SEQ2_FLAG=
SEQ2_SMSK=/scratch/hg/canFam1/linSpecRep.notInHuman
SEQ2_IN_CONTIGS=0
SEQ2_CHUNK=10000000
SEQ2_LAP=0
BASE=/cluster/data/hg17/bed/blastz.borEut1
DEF=$BASE/DEF
RAW=$BASE/raw
CDBDIR=$BASE
SEQ1_LEN=$BASE/S1.len
SEQ2_LEN=$BASE/S2.len
'_EOF_'
# << this line keeps emacs coloring happy
/cluster/data/hg17/jkStuff/BlastZ_run0.sh
cd run.0
para push
# Completed: 2728 of 2728 jobs
# CPU time in finished jobs: 621440s 10357.34m 172.62h 7.19d 0.020 y
# IO & Wait Time: 19079s 317.98m 5.30h 0.22d 0.001 y
# Average job time: 235s 3.91m 0.07h 0.00d
# Longest job: 2340s 39.00m 0.65h 0.03d
# Submission to last job: 2837s 47.28m 0.79h 0.03d
ssh kki
cd /cluster/data/hg17/bed/blastz.borEut1
/cluster/data/hg17/jkStuff/BlastZ_run1.sh
cd run.1
para push
# Completed: 341 of 341 jobs
# CPU time in finished jobs: 95s 1.58m 0.03h 0.00d 0.000 y
# IO & Wait Time: 825s 13.75m 0.23h 0.01d 0.000 y
# Average job time: 3s 0.04m 0.00h 0.00d
# Longest job: 10s 0.17m 0.00h 0.00d
# Submission to last job: 73s 1.22m 0.02h 0.00d
ssh kk
cd /cluster/data/hg17/bed/blastz.borEut1
/cluster/data/hg17/jkStuff/BlastZ_run2.sh
cd run.2
para push
# /cluster/data/hg17/bed/blastz.borEut1/axtChrom/chr18_random.axt is empty
# /cluster/data/hg17/bed/blastz.borEut1/axtChrom/chr19_random.axt is empty
# ..
# Completed: 44 of 46 jobs
# Crashed: 2 jobs
# CPU time in finished jobs: 104s 1.73m 0.03h 0.00d 0.000 y
# IO & Wait Time: 482s 8.04m 0.13h 0.01d 0.000 y
# Average job time: 13s 0.22m 0.00h 0.00d
# Longest job: 134s 2.23m 0.04h 0.00d
# Submission to last job: 142s 2.37m 0.04h 0.00d
# END BLASTZ BOREOEUTHERIAN

##########################################################################
# MAKE tfbsConsSites and tfbsConsFactors for TFBS conserved track (DONE braney 1/15/05)
# Questions? weirauch@soe.ucsc.edu or braney@soe.ucsc.edu
# tfbsConsSites table reloaded 2006-11-03 - Hiram - see below:
## reload tfbsCons table - it was based on a newer version of tfbs names that
ssh hgwdev
mkdir /cluster/data/hg17/bed/tfbsCons
cd /cluster/data/hg17/bed/tfbsCons
# Define all parameters in 'PARAMS.txt'
# Define all chromosomes in 'CHROMS.txt'
# Get tfbsConsUtils.tar.gz from Matt Weirauch with Perl scripts weirauch@soe.ucsc.edu
set tarfile=/cluster/data/hg17/bed/tfbsCons/tfbsConsUtils.tar.gz
tar zxf $tarfile
nice ./getRefseqStats.pl &
nice ./getBatchQueries.pl &
ssh kk
mkdir /cluster/bluearc/braney/tfloc
# Copy ./tmp/ctfbs_batch_list.txt to this dir
# Copy ./scripts/doit to this dir
para create ctfbs_batch_list.txt
para try
para push
# When the run is done (within a day or so), the results will be in
# individual dirs, one for each chromosome.
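# A quick completeness check on the cluster run (a sketch; this assumes the
# per-chromosome result dirs are named after the chromosomes listed in
# CHROMS.txt, per the comment above):
cd /cluster/bluearc/braney/tfloc
foreach c (`cat /cluster/data/hg17/bed/tfbsCons/CHROMS.txt`)
    if (! -e $c) echo "no results for $c"
end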
ssh kksilo (or hgwdev, or whatever)
nice ./getBedFile.pl &
hgLoadBed -noSort hg17 tfbsConsSites \
    -sqlTable=$HOME/kent/src/hg/lib/tfbsConsSites.sql \
    tfbsConsSites.bed -tab
hgLoadBed -noSort hg17 tfbsConsFactors \
    -sqlTable=$HOME/kent/src/hg/lib/tfbsConsFactors.sql \
    tfbsConsFactors.bed -tab
# Feel free to delete or gzip anything in ./tmp
# (particularly the huge .maf and .bed files)
# after the final two bed files are successfully loaded

##########################################################################
# CHICKEN RECIPROCAL-BEST NET FOR STRINGENT LIFTOVER (DONE 2/3/05 angie)
ssh kolossus
cd /cluster/data/hg17/bed/blastz.galGal2/axtChain
# Run chainNet again, this time keeping both of its outputs:
chainPreNet all.chain ../S1.len ../S2.len stdout \
    | chainNet stdin ../S1.len ../S2.len h_g.net g_h.net
# Get the chicken chains from the chicken-referenced (but human-centric)
# net:
chainSwap all.chain g_h.chain
netChainSubset g_h.net g_h.chain stdout \
    | chainSort stdin g_h.subset.chain
# Net those (sorted) chicken chains, and keep both outputs, to get
# reciprocal best nets referenced to both species:
chainPreNet g_h.subset.chain ../S2.len ../S1.len stdout \
    | chainNet stdin ../S2.len ../S1.len g_h.rbest.net h_g.rbest.net
# Get the chains from the recip-best nets for stringent liftOver:
netChainSubset g_h.rbest.net g_h.chain galGal2ToHg17.rbest.over.chain
netChainSubset h_g.rbest.net all.chain hg17ToGalGal2.rbest.over.chain

####### RE-BUILD RGD HUMAN QTL TRACKS (DONE 2/5/05 Fan) ##############
mkdir -p /cluster/store8/rgd/human050205
rm /cluster/data/hg17/bed/rgdQtl
ln -s /cluster/store8/rgd/human050205 /cluster/data/hg17/bed/rgdQtl
cd /cluster/data/hg17/bed/rgdQtl
# download data files from RGD
wget --timestamp ftp://rgd.mcw.edu/pub/RGD_genome_annotations/human/human_QTL.gff
# remove extra line feed character at the end of lines
# !!! manually corrected the line of AASTH7_H because chromStart is greater than chromEnd
rmLf human_QTL.gff > rgdQtl.gff
# create rgdQtl.tab
awk '{print $1"\t"$4"\t"$5"\t"$10}' rgdQtl.gff |sed -e 's/Chr/chr/g'| \
    sed -e 's/"//g' |sed -e 's/RGD://g' | sed -e 's/;//g' > rgdQtl.tab
# create rgdQtlLink.tab
awk '{printf "%s\t%s\t", $12, $10; for (i = 14;i <= NF; ++i ) {printf "%s ", $i} printf "\n"} ' rgdQtl.gff | \
    sed -e 's/"//g'| sed -e 's/RGD://g' | sed -e 's/;//g'| sed -e 's/Note//g' > rgdQtlLink.tab
# load rgdQtl table
hgLoadBed hg17 rgdQtl rgdQtl.tab
# check rgdQtl table
checkTableCoords hg17 rgdQtl
# load rgdQtlLink table
hgsql hg17 -e "drop table hg17.rgdQtlLink;"
hgsql hg17 <~/kent/src/hg/lib/rgdQtlLink.sql
hgsql hg17 -e 'load data local infile "rgdQtlLink.tab" into table hg17.rgdQtlLink;'
# updated trackDb.ra under /kent/src/hg/makeDb/trackDb/human/hg17 and
# added rgdQtl.html.

# GENOSCOPE TETRAODON (tetNig1) ECORES (DONE, 2005-02-08, hartera)
ssh eieio
mkdir -p /cluster/data/hg17/bed/ecoresTetNig1
cd /cluster/data/hg17/bed/ecoresTetNig1
wget --timestamp \
    http://www.genoscope.cns.fr/externe//4ucsc/ExofishHs35Tnig1
# this is in gff format
# remove "Ecotig" from name field
sed -e 's/Ecotig EG/EG/g' ExofishHs35Tnig1 > ExofishHs35Tnig1.gff
# need to have tabs between fields not a space to load file into table
sed -e 's/ /\t/g' ExofishHs35Tnig1.gff > Hs35Tnig1format.gff
# if "ecore" is changed to "CDS" and "ecotig" to "transcript" this loads
# correctly into the table.
sed -e 's/ecore/CDS/' Hs35Tnig1format.gff | sed -e 's/ecotig/transcript/' \
    > Hg17vstetNig1.gff
# add "chr" in front of the chromosome name in first field (2005-02-08)
perl -pi.bak -e 's/^([0-9XYM]{1,2})/chr$1/' Hg17vstetNig1.gff
rm *.bak
# need to reload table
ssh hgwdev
cd /cluster/data/hg17/bed/ecoresTetNig1
echo 'drop table ecoresTetNig1;' | hgsql hg17
nice ldHgGene hg17 ecoresTetNig1 Hg17vstetNig1.gff
# Read 40172 transcripts in 186032 lines in 1 files
# 40172 groups 42 seqs 1 sources 2 feature types
# 40172 gene predictions
# added ecoresTetNig1 entry to trackDb.ra in trackDb/human
# and created ecoresTetNig1.html. Genoscope will not be maintaining this
# newest data in their Exofish comparative browser display.

# UPDATE kgSpAlias TABLE WITH NEW UNIPROT DISPLAY ID ENTRIES (done 2/11/05 Fan)
# Add new human protein display IDs to the alias table to support user search
ssh hgwdev
cd /cluster/data/hg17/bed/pb
mkdir newDisplayId
cd newDisplayId
hgsql proteome -e 'select hg17.kgSpAlias.kgID, hg17.kgSpAlias.SpID, spOldNew.newDisplayId from spOldNew, hg17.kgSpAlias where spOldNew.acc=hg17.kgSpAlias.spID and oldDisplayId != newDisplayId' |sort -u >hg17.tab
hgsql hg17 -e 'load data local infile "hg17.tab" into table hg17.kgSpAlias'

# UPDATE kgProtAlias TABLE WITH NEW UNIPROT DISPLAY ID ENTRIES (done 2/11/05 Fan)
# Add new hg17 protein display IDs to the alias table to support user search
ssh hgwdev
cd /cluster/data/hg17/bed/pb/newDisplayId
hgsql proteome -e 'select hg17.kgSpAlias.kgID,spOldNew.oldDisplayId,spOldNew.newDisplayId from spOldNew, hg17.kgSpAlias where spOldNew.acc=hg17.kgSpAlias.spID and oldDisplayId != newDisplayId' |sort -u >hg17.kgProtAlias.tab
# get rid of the header line at the end of the file
vi hg17.kgProtAlias.tab
hgsql hg17 -e 'load data local infile "hg17.kgProtAlias.tab" into table hg17.kgProtAlias'

# BLASTZ HUMAN TARGET, COW QUERY (DONE, Nov. 2004 - Jan. 2005, Heather)
ssh kk
# use /cluster/data/bosTau1 because more disk space there
cd /cluster/data/bosTau1/bed
mkdir zb.hg17
# create DEF file
# for now, not doing ABRIDGE_REPEATS
# this means I don't need to create lineage specific repeats
# This is because blastz-run wouldn't take advantage of these
# because my query is in scaffolds
cat << '_EOF_' > DEF
# human vs. cow
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/i386:/cluster/home/angie/schwartzbin
ALIGN=blastz-run
BLASTZ=blastz
# Default
BLASTZ_H=2000
BLASTZ_ABRIDGE_REPEATS=0
# TARGET: Human
SEQ1_DIR=/scratch/hg/gs.18/build35/bothMaskedNibs
#SEQ1_DIR=/iscratch/i/hg17/bothMaskedNibs
SEQ1_RMSK=
SEQ1_FLAG=
SEQ1_SMSK=
SEQ1_IN_CONTIGS=0
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Cow
SEQ2_DIR=/iscratch/i/bosTau1/splitDir
SEQ2_RMSK=
SEQ2_FLAG=
SEQ2_SMSK=
SEQ2_IN_CONTIGS=1
SEQ2_CHUNK=10000000
SEQ2_LAP=0
BASE=/cluster/data/hg17/bed/zb.hg17
DEF=$BASE/DEF
RAW=$BASE/raw
CDBDIR=$BASE
SEQ1_LEN=$BASE/S1.len
SEQ2_LEN=$BASE/S2.len
'_EOF_'
# << this line keeps emacs coloring happy
bash
cd /cluster/data/bosTau1/bed/zb.hg17
source DEF
mkdir $RAW run.0
# create S2.len so make-joblist doesn't have to
/cluster/bin/scripts/blastz-make-joblist $DEF > $BASE/run.0/j
sh ./xdir.sh
cd run.0
# check how many lines in j
sed -e 's@^blastz-run@/cluster/bin/penn/blastz-run@' j > jobList
para create jobList
para try, para check, para push, para check....
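# Before converting to lav, a quick look for empty raw output can catch
# failed jobs early (a sketch; $RAW comes from sourcing DEF above, and an
# empty file is not necessarily an error, so treat this as informational):
find $RAW -type f -size 0 | head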
# convert out to lav
ssh kki
cd /cluster/data/bosTau1/bed/zb.hg17
# run bash shell if not running it already
source DEF
mkdir -p $BASE/run.1
mkdir -p $BASE/lav
# create a new job list to convert out files to lav
/cluster/bin/scripts/blastz-make-out2lav $DEF $BASE > run.1/jobList
cd run.1
# make sure the job list is OK
wc -l jobList
head jobList
para create jobList
para try
para check
para push
# lavToAxt
ssh kksilo
cd /cluster/data/bosTau1/bed/zb.hg17
mkdir axtTemp
cd lav
foreach i (*)
    catDir $i | lavToAxt stdin /cluster/data/hg17/nib \
        /cluster/data/bosTau1/bosTau1.2bit ../axtTemp/$i.axt
    echo done $i
end
# axtChain
ssh kki
cd /cluster/data/bosTau1/bed/zb.hg17
mkdir -p axtChain/run1
cd axtChain/run1
mkdir out chainRaw
ls -1S /cluster/data/bosTau1/bed/zb.hg17/axtTemp/*.axt > input.lst
cat << '_EOF_' > gsub
#LOOP
doChain {check in exists $(path1)} {check out line+ chainRaw/$(root1).chain} {check out exists out/$(root1).out}
#ENDLOOP
'_EOF_'
# << this line makes emacs coloring happy
cat << '_EOF_' > doChain
#!/bin/csh
axtChain $1 /iscratch/i/hg17/bothMaskedNibs /iscratch/i/bosTau1/nib/bosTau1.2bit $2 > $3
'_EOF_'
# << this line makes emacs coloring happy
chmod a+x doChain
gensub2 input.lst single gsub jobList
para create jobList
para try
para check
para push
# Completed: 46 of 46 jobs
# Average job time: 83s 1.39m 0.02h 0.00d
# Longest job: 1240s 20.67m 0.34h 0.01d
# Submission to last job: 1326s 22.10m 0.37h 0.02d
# mergesort
ssh kksilo
cd /cluster/data/bosTau1/bed/zb.hg17/axtChain
chainMergeSort run1/chainRaw/*.chain > all.chain.jan3
# chainAntiRepeat
ssh kksilo
cd /cluster/data/bosTau1/bed/zb.hg17/axtChain/run1
mkdir chainAntiRepeat
# test with just one
chainAntiRepeat /cluster/store5/gs.18/build35/nib /cluster/data/bosTau1/bosTau1.2bit \
    chainRaw/chr18.chain chainAntiRepeat/chr18.chain
# do them all
foreach f (chainRaw/*.chain)
    set f1 = $f:t
    echo $f1
    chainAntiRepeat /cluster/store5/gs.18/build35/nib /cluster/data/bosTau1/bosTau1.2bit \
        $f chainAntiRepeat/$f1
end
# mergesort again
ssh kksilo
cd /cluster/data/bosTau1/bed/zb.hg17/axtChain
chainMergeSort run1/chainAntiRepeat/*.chain > all.chain.jan5
gzip all.chain.jan3
# split
mkdir chain
chainSplit chain all.chain.jan5
# look at the distribution
foreach f (chain/*.chain)
    grep chain $f | awk '{print $2;}' | sort -nr > /tmp/score.$f:t:r
    echo $f:t:r
    textHistogram -binSize=5000 /tmp/score.$f:t:r
    echo ""
end
# see files histogram.out and histogram.interesting
# run chainFilter
chainFilter -minScore=5000 all.chain.jan5 > all.chain.jan5.filtered
gzip all.chain.jan5
# split
rm chain/*
chainSplit chain all.chain.jan5.filtered
gzip all.chain.jan5.filtered
# load
ssh hgwdev
cd /cluster/data/bosTau1/bed/zb.hg17/axtChain/chain
foreach i (*.chain)
    set c = $i:r
    echo loading $c
    hgLoadChain hg17 ${c}_chainBosTau1 $i
end
# featureBits -chrom=chr1 hg17 chainBosTau1Link
# 103272818 bases of 222827847 (46.346%) in intersection
# featureBits -chrom=chr2 hg17 chainBosTau1Link
# 105920345 bases of 237506229 (44.597%) in intersection
# featureBits -chrom=chr3 hg17 chainBosTau1Link
# 89582887 bases of 194635740 (46.026%) in intersection
# featureBits -chrom=chr4 hg17 chainBosTau1Link
# 77513949 bases of 187161218 (41.416%) in intersection
# featureBits -chrom=chr5 hg17 chainBosTau1Link
# 80428726 bases of 177702766 (45.260%) in intersection
# featureBits -chrom=chr6 hg17 chainBosTau1Link
# 71830264 bases of 167317699 (42.930%) in intersection
# featureBits -chrom=chr7 hg17 chainBosTau1Link
# 64561289 bases of 154759139 (41.717%) in intersection
# featureBits
-chrom=chr8 hg17 chainBosTau1Link # 55896735 bases of 142612826 (39.195%) in intersection # featureBits -chrom=chr9 hg17 chainBosTau1Link # 52068957 bases of 117781268 (44.208%) in intersection # featureBits -chrom=chr10 hg17 chainBosTau1Link # 57427282 bases of 131613628 (43.633%) in intersection # featureBits -chrom=chr11 hg17 chainBosTau1Link # 58412709 bases of 131130853 (44.545%) in intersection # featureBits -chrom=chr12 hg17 chainBosTau1Link # 56076163 bases of 130259811 (43.049%) in intersection # featureBits -chrom=chr13 hg17 chainBosTau1Link # 37951944 bases of 95559980 (39.715%) in intersection # featureBits -chrom=chr14 hg17 chainBosTau1Link # 39896970 bases of 88290585 (45.188%) in intersection # featureBits -chrom=chr15 hg17 chainBosTau1Link # 37507979 bases of 81341915 (46.112%) in intersection # featureBits -chrom=chr16 hg17 chainBosTau1Link # 33883573 bases of 78884754 (42.953%) in intersection # featureBits -chrom=chr17 hg17 chainBosTau1Link # 31871034 bases of 77800220 (40.965%) in intersection # featureBits -chrom=chr18 hg17 chainBosTau1Link # 30359555 bases of 74656155 (40.666%) in intersection # NET # run in stages to avoid memory problems ssh kolossus cd /cluster/data/bosTau1/bed/zb.hg17/axtChain # PRE /cluster/bin/x86_64/chainPreNet all.chain.jan5.filtered ../S1.len ../S2.len chainPreNet.out # chainNet /cluster/bin/x86_64/chainNet chainPreNet.out \ -minSpace=1 ../S1.len ../S2.len bosTau1.net.raw /dev/null # syntenic (using revision 1.6) /cluster/home/heather/bin/x86_64/netSyntenic bosTau1.net.raw bosTau1.net.syn # memory usage 2757492736, utime 13404 s/100, stime 616 # backup/compress ssh kksilo cd /cluster/data/bosTau1/bed/zb.hg17/axtChain gzip bosTau1.net.raw cp bosTau1.net.syn bosTau1.net.syn.backup # netClass # takes about 4 hours ssh hgwdev cd /cluster/data/bosTau1/bed/zb.hg17/axtChain netClass -noAr bosTau1.net.syn hg17 bosTau1 bosTau1.net # backups ssh kksilo cp bosTau1.net bosTau1.net.backup rm bosTau1.net.syn.backup # load ssh hgwdev cd /cluster/data/bosTau1/bed/zb.hg17/axtChain netFilter -minGap=10 bosTau1.net | hgLoadNet hg17 netBosTau1 stdin rm bosTau1.net.backup # index has NULL cardinality; analyze to fix hgsql hg17 analyze table netBosTau1 # generate axts ssh kksilo cd /cluster/data/bosTau1/bed/zb.hg17 mkdir axtNet # split first (not required?) cd axtChain mkdir net netSplit bosTau1.net.syn net cd net foreach i (*.net) netToAxt $i ../chain/$i:r.chain /cluster/data/hg17/nib /cluster/data/bosTau1/bosTau1.2bit ../../axtNet/$i:r.axt end gzip bosTau1.net.syn gzip bosTau1.net # axtSort (takes about 5 minutes) ssh kksilo cd /cluster/data/bosTau1/bed/zb.hg17 mkdir axtNetSort foreach f ( axtNet/*.axt ) set c = $f:t:r echo "axtSort on $c" axtSort $f axtNetSort/$c.axt end # make maf files mkdir mafNet foreach f (axtNetSort/*.axt) set c = $f:t:r echo "axtToMaf on $c" axtToMaf $f /cluster/data/hg17/chrom.sizes /cluster/data/bosTau1/chrom.sizes mafNet/$c.maf -tPrefix=hg17. -qPrefix=bosTau1. end # MAKE VSBOSTAU1 DOWNLOADABLES (DONE Feb. 15, 2005 Heather) ssh hgwdev cd /usr/local/apache/htdocs/goldenPath/hg17 mkdir vsBosTau1 cd vsBosTau1 mkdir axtNet cd /cluster/data/bosTau1/bed/zb.hg17/axtChain cp -p all.chain.gz /usr/local/apache/htdocs/goldenPath/hg17/vsBosTau1/cow.chain.gz cp -p bosTau1.net.gz /usr/local/apache/htdocs/goldenPath/hg17/vsBosTau1/cow.net.gz cd ../axtNet cp -p * /usr/local/apache/htdocs/goldenPath/hg17/vsBosTau1/axtNet cd /usr/local/apache/htdocs/goldenPath/hg17/vsBosTau1 # Make a README.txt which explains the files & formats. 
md5sum *.gz > md5sum.txt
cd axtNet
md5sum *.gz > md5sum.txt

# YALE PSEUDOGENES (started Robert Baertsch, finished JK 2/21/05)
ssh hgwdev
cd /cluster/data/hg17/bed
mkdir pseudoYale
cd pseudoYale
# Place file obtained from Mark Gerstein at yale in pseudoYale.gtf
ldHgGene hg17 pseudoYale pseudoYale.gtf
# Note - I'm guessing how this goes. Robert left no record. -jk

# added xenoRefGene track (markd ~2005-02-20)
# add to /cluster/data/genbank/genbank.conf:
hg17.refseq.mrna.xeno.load = yes
hg17.refseq.mrna.xeno.loadDesc = yes

# BUILD ccdsGene and ccdsInfo tables (markd 2005-02-25)
# download files to the genbank data area, as this will eventually
# be done automatically as part of the genbank build process.
cd /cluster/data/genbank
mkdir -p data/ccds/hg17/2005-02-25
cd data/ccds/hg17/2005-02-25
# get the basic text dumps of the data, rather than the database dumps
wget -r ftp://ftp.ncbi.nlm.nih.gov/pub/CCDS/
# ends up with: About-NcbiHinxton.txt NcbiHinxton.txt NcbiHinxtonAllAccessions.txt
# this is a preliminary release, it contained 2 PAR genes that had
# bad coordinates and 7 genes that were determined to be pseudogenes
# at the last minute. The accessions for these 9 genes were
# placed in skip.ccds and then removed:
fgrep -v -f skip.ccds /scratch/markd/gene-sets/ncbiDb/set1.5/NcbiHinxtonAllAccessions.txt > /scratch/markd/gene-sets/ncbiDb/set1.5/NcbiHinxtonAllAccessions.cleaned.txt
# create the tab files to load in the database
/cluster/data/genbank/bin/i386/ccdsImport NcbiHinxtonAllAccessions.cleaned.txt ccdsGene.gp ccdsInfo.tab
# load ccdsInfo
hgsql hg17 <../../../../../lib/ccdsInfo.sql
hgsql -e 'load data local infile "ccdsInfo.tab" into table ccdsInfo' hg17
# load ccdsGene.gp and check
ldHgGene -predTab -genePredExt hg17 ccdsGene ccdsGene.gp
checkTableCoords hg17 -verbose=2 ccdsGene
rm *.tab
gzip -9 NcbiHinxton*.txt

# BUILD refSeqKg TABLE TO SUPPORT CCDS GENES (RE-DONE, Fan 2/26/05)
hgsql hg17 -N -e "select * from knownGene" >kg.gp
hgsql hg17 -N -e "select * from refGene" >ref.gp
overlapSelect -inCds -strand -idOutput -fraction=fraction.out -selectCds -overlapSimilarity=0.90 -selectFmt=genePred -inFmt=genePred kg.gp ref.gp refSeqKg.90.tab
cat fraction.out|sort -u >refSeqKg.tab
hgsql hg17 -e 'drop table refSeqKg'
hgsql hg17 < ~/src/hg/lib/refSeqKg.sql
hgsql hg17 -e 'load data local infile "refSeqKg.tab" into table refSeqKg'
rm fraction.out

# BUILD ccdsGene and ccdsInfo tables (markd, redone 2005-03-17)
cd /cluster/store5/genbank/data/ccds/hg17
wget ftp://ftp.ncbi.nlm.nih.gov/pub/hcds/Hs35.1/CDSTrackDB/CCDS.20050303.tar.gz
mkdir /scratch/tmp/ccds
cd /scratch/tmp/ccds
tar -zxf /cluster/store5/genbank/data/ccds/hg17/CCDS.20050303.tar.gz
# import ccds database tables
hgsql -e 'create database ccds'
hgsql ccds
hgsql -N -e "select * from knownGene" hg17 >kg.gp
hgsql -N -e "select * from refGene" hg17 >ref.gp
overlapSelect -statsOutput -strand -inCds -selectCds -overlapSimilarity=0.90 kg.gp ref.gp stdout | tail +2 | sort -u >refSeqKg.tab
hgsql hg17 -e 'drop table refSeqKg'
hgsql hg17 < ~/compbio/kent/src/hg/lib/refSeqKg.sql
hgsql hg17 -e 'load data local infile "refSeqKg.tab" into table refSeqKg'
cd ..
rm -r ccds

# COW BACENDS (Done, Heather, Mar. 21, 2005)
# COW BACENDS (DONE, Heather, Mar. 21, 2005)
ssh hgwdev
cd /cluster/data/hg17/bed
mkdir bacendsCow
cd bacendsCow
# Obtain GFF file from Denis; unzip into BACendhg17.gff
# Convert into BED 6:
makebed.pl < BACendhg17.gff > BACendhg17.bed
hgLoadBed -noBin hg17 bacendsCow BACendhg17.bed
# 53403 warnings
# add to kent/src/hg/makeDb/trackDb/human/hg17/trackDb.ra

# make map between ccds and known genes (markd 2005/03/08)
# this should be run whenever either known genes or ccds is updated
/cluster/data/genbank/bin/i386/mkCcdsGeneMap -db=hg17 -loadDb ccdsGene knownGene ccdsKgMap

# UPDATE WGRNA TRACK (DONE, 2004-12-13, Fan)
# Received updated data file, wg_track_april2005.txt, from Michel Weber by email.
cut -f 2-10 wg_track_april2005.txt |tail +2 >wg_track_april2005.tab
# Use editor to remove the last blank line.
hgLoadBed -sqlTable=/cluster/home/fanhsu/hg/lib/wgRna.sql hg17 wgRna wg_track_april2005.tab
# Asked Donna to update Reference section according to Michel's email.

## refresh vega tracks with vega build30 (done 5/4/04 Robert)
##download vega mysql tables
cd /cluster/store8/ensembl
mkdir vega30_35c
cd vega30_35c
ln /cluster/store8/ensembl/vega30_35c /cluster/data/hg17/bed/vega30 -s
for i in `cat tables` ; do wget -N ftp://ftp.ensembl.org/pub/human-30.35c/data/mysql/homo_sapiens_vega_30_35c/$i.gz ; done
wget -N ftp://ftp.ensembl.org/pub/human-30.35c/data/mysql/homo_sapiens_vega_30_35c/homo_sapiens_vega_30_35c_mysql40_compatible.sql.gz
gunzip *.gz
##create mysql database
mysql
    create database vega30
    use vega30
    source homo_sapiens_vega_30_35c_mysql40_compatible.sql
    source dropMt.sql
    source load.sql
    exit
hgsql vega30 -N -B < vegaGene.sql > vegaGene.tab
awk -f vegaGene.awk < vegaGene.tab > vegaGene.gp
ldHgGene hg17 vegaGene -predTab vegaGene.gp -gtf -genePredExt
hgsql vega30 -N -B < vegaPseudo.sql > vegaPseudo.tab
awk -f vegaPseudo.awk < vegaPseudo.tab > vegaPseudo.gp
ldHgGene hg17 vegaPseudoGene -predTab vegaPseudo.gp -gtf -genePredExt
#load processed pseudogenes
grep Processed vegaPseudo.tab > vegaProcPseudo.tab
awk -f vegaPseudo.awk < vegaProcPseudo.tab > vegaProcPseudo.gp
ldHgGene hg17 vegaProcessedPseudo -predTab vegaProcPseudo.gp -gtf -genePredExt
#load vegaInfo
hgsql vega30 -N -B < vegaGeneInfo.sql > vegaInfo.tab
hgsql vega30 -N -B < vegaPseudoInfo.sql >> vegaInfo.tab
hgsql hg17 -N -B < /cluster/home/baertsch/kent/src/hg/lib/vegaInfo.sql
echo 'load data local infile "vegaInfo.tab" into table vegaInfo' | hgsql hg17 -N -B
#load down to hg16
liftOver vegaGene.gp /gbdb/hg17/liftOver/hg17ToHg16.over.chain vegaGeneHg16.gp unMapped.gp -genePred
liftOver vegaPseudo.gp /gbdb/hg17/liftOver/hg17ToHg16.over.chain vegaPseudoGeneHg16.gp unMappedPseudo.gp -genePred
ldHgGene hg16 vegaGene -predTab vegaGeneHg16.gp -gtf
ldHgGene hg16 vegaPseudoGene -predTab vegaPseudoGeneHg16.gp -gtf
echo 'truncate table vegaInfo' | hgsql hg16 -N -B
echo 'load data local infile "vegaInfo.tab" into table vegaInfo' | hgsql hg16 -N -B

#########################################################################
# MOUSE NET/CHAINS MM6 - Info contained in makeMm6.doc (200503 Hiram)

####################################################################################
# RE-BUILD KNOWN GENES TABLES, 2ND TRIAL WITH VARIANT PROTEINS (Started 5/13/05 Fan)

# First build protein databases, sp050415 and proteins050415
# See makeProteins050415.doc for details.
# Create working subdirectories and temporary databases (kgHg17F)
ssh hgwdev
cd /cluster/store10/kg
mkdir kgHg17F
ln -s /cluster/store10/kg/kgHg17F /cluster/store6/kgDB/bed/kgHg17F
ln -s /cluster/store10/kg/kgHg17F /cluster/data/hg17/bed/kgHg17F
hgsql hg17 -e "create database kgHg17F"
hgsql hg17 -e "create database kgHg17FTemp"
mkdir /cluster/bluearc/kgDB/kgHg17F
mkdir /cluster/bluearc/kgDB/kgHg17F/protBlat
ln -s /cluster/bluearc/kgDB/kgHg17F/protBlat /cluster/store10/kg/kgHg17F/protBlat
cd /cluster/store10/kg/kgHg17F/protBlat

#################################################################
# VVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVV
# The protBlat.psl was built during the first KG II build trial
# The results are still valid, except that kgHg17E was used
# instead of kgHg17F

# Create working subdirectories and temporary databases (kgHg17E)
ssh hgwdev
cd /cluster/store10/kg
mkdir kgHg17E
ln -s /cluster/store10/kg/kgHg17E /cluster/store6/kgDB/bed/kgHg17E
ln -s /cluster/store10/kg/kgHg17E /cluster/data/hg17/bed/kgHg17E
hgsql hg17 -e "create database kgHg17E"
hgsql hg17 -e "create database kgHg17ETemp"
mkdir /cluster/bluearc/kgDB/kgHg17E
mkdir /cluster/bluearc/kgDB/kgHg17E/protBlat
ln -s /cluster/bluearc/kgDB/kgHg17E/protBlat /cluster/store10/kg/kgHg17E/protBlat
cd /cluster/store10/kg/kgHg17E/protBlat

# Get all human protein sequences
hgsql -N sp050415 -e \
    'select proteins050415.spXref3.accession,protein.val from proteins050415.spXref3,protein where division="9606" and acc=accession' \
    |awk '{print ">" $1;print $2}' >humanProt.fa

# Prepare and perform cluster run for protein/genome alignment
ssh kk
cd /cluster/data/hg17/bed/kgHg17E/protBlat
mkdir prot
faSplit sequence humanProt.fa 1000 prot/prot
ls /cluster/bluearc/kgDB/kgHg17E/protBlat/prot/* > prot.lis
ssh hgwdev
cd /cluster/data/hg17/bed/kgHg17E/protBlat
hgsql hg17 -N -e 'select chrom from chromInfo' > chrom.lis
exit
cat << '_EOF_' > gsub
#LOOP
/cluster/bin/i386/blat -noHead -t=dnax -q=prot /cluster/data/hg17/nib/$(path1).nib $(path2) {check out line+ /cluster/bluearc/kgDB/kgHg17E/protBlat/result/$(root1)_$(root2).psl}
#ENDLOOP
'_EOF_'
mkdir result
gensub2 chrom.lis prot.lis gsub jobList
para create jobList
para try
para check
para push
para check ...
# many output .psl files will be empty; the warnings about them are OK.
[kk:protBlat> para check
45494 jobs in batch
0 jobs (including everybody's) in Parasol queue.
Checking finished jobs
tracking errors: 1
crashed: 12643
ranOk: 32850
total jobs in batch: 45494
[kk:protBlat> para time
45494 jobs in batch
0 jobs (including everybody's) in Parasol queue.
Checking finished jobs
Completed: 32850 of 45494 jobs
Crashed: 12643 jobs
para.results: file not found.  paraHub can't write to this dir?
CPU time in finished jobs: 36153510s  602558.50m 10042.64h  418.44d  1.146 y
IO & Wait Time:             1585456s   26424.27m   440.40h   18.35d  0.050 y
Average job time:              1149s      19.15m     0.32h    0.01d
Longest running job:              0s       0.00m     0.00h    0.00d
Longest finished job:        155120s    2585.33m    43.09h    1.80d
Submission to last job:      276342s    4605.70m    76.76h    3.20d
# This cluster run took about 3 days. The crashed jobs are due to empty
# BLAT results; that is OK.
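# (Optional follow-up, a sketch: let parasol summarize the crashed jobs so
# they can be matched against the empty BLAT outputs noted above.)
para problems | head
ls result | wc -l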
# collect BLAT results
ssh hgwdev
cd /cluster/data/hg17/bed/kgHg17E/protBlat
mkdir result2
mkdir result3
cat chrom.lis |sed -e 's/chr/do1 chr/g' >doall
cat << '_EOF_' > do1.1
echo processing $1
cat result/$1_prot*.psl >result2/$1.psl
'_EOF_'
cat << '_EOF_' > do1.2
echo processing $1
pslReps -nohead -minCover=0.80 -minAli=0.80 -nearTop=0.002 result2/$1.psl result3/$1.psl /dev/null >>j.out
'_EOF_'
chmod +x do*
# first pass: collect the per-chromosome psl files
cp do1.1 do1
doall
# second pass: filter them with pslReps
cp do1.2 do1
doall
cat result3/*.psl >protBlat.psl
#^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
# The end of protBlat.psl build, using kgHg17E
################################################################################

############################################################################
# This part processes the variant splice proteins.
# First build variant splice protein tables.
# Get all variant isoform human protein sequences
ssh hgwdev
cd /cluster/data/swissprot/050415/build
wget --timestamp \
    ftp://us.expasy.org/databases/uniprot/current_release/knowledgebase/complete/uniprot_sprot_varsplic.fasta.gz
wget --timestamp \
    ftp://us.expasy.org/databases/uniprot/current_release/knowledgebase/complete/uniprot_trembl_varsplic.fasta.gz
gzip -d *varsplic.fasta.gz
faToTab -type=protein uniprot_trembl_varsplic.fasta splicTrembl.tab
faToTab -type=protein uniprot_sprot_varsplic.fasta splicSprot.tab
cat splicTrembl.tab splicSprot.tab >varProtein.tab
hgsql sp050415 < ~/src/hg/lib/varProtein.sql
hgsql sp050415 -e 'load data local infile "varProtein.tab" into table varProtein'
cat varProtein.tab |cut -f 1>j1
cut -f 1 j1|sed -e 's/-/\t/g' >j2
paste j1 j2 >splicProt.tab
hgsql kgHg17FTemp -e 'drop table splicProt'
hgsql kgHg17FTemp <~/src/hg/lib/splicProt.sql
hgsql kgHg17FTemp -e 'load data local infile "splicProt.tab" into table splicProt'
hgsql kgHg17FTemp -N -e \
    'select varAcc, varProtein.val from sp050415.varProtein,splicProt,proteins050415.spXref3 where accession=parAcc and varProtein.acc=splicProt.varAcc and division="9606"'| \
    awk '{print ">" $1;print $2}' >humanVarProt.fa
cd /cluster/data/hg17/bed/kgHg17F
# get all Human splicProtBlat records
hgsql hg17 -N -e \
    'select splicProtBlat.* from splicProtBlat,proteins050415.spXref3,kgHg17FTemp.splicProt where qName=splicProt.varAcc and parAcc=accession and division="9606"'\
    |cut -f 2-22 \
    >humanVarProtBlat.psl
# Combine the regular protein protBlat records with the variant protein psl records.
cd /cluster/store10/kg/kgHg17F
cat ../kgHg17E/protBlat/protBlat.psl humanVarProtBlat.psl >protBlat.psl
hgLoadPsl hg17 protBlat.psl
# Processing protBlat.psl
# load of protBlat did not go as planned: 104064 record(s), 0 row(s) skipped, 1484 warning(s) loading psl.tab
# Looked into the cause of these 1484 warnings: qBaseInsert and tBaseInsert
# have negative values in some rows, probably because these are protein
# alignments.
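# (A hedged confirmation of that diagnosis; qBaseInsert/tBaseInsert are
# standard psl columns, and this count should be in the neighborhood of the
# 1484 warnings.)
hgsql hg17 -N -e 'select count(*) from protBlat where qBaseInsert < 0 or tBaseInsert < 0'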
# create all_mrna.psl and tight_mrna.psl hgsql hg17 -N -e "select * from all_mrna" |cut -f 2-22 >all_mrna.psl pslReps -minCover=0.40 -minAli=0.97 -nearTop=0.002 \ all_mrna.psl tight_mrna.psl /dev/null # Use overlapSelect to get protein and mRNA alignment overlaps overlapSelect -statsOutput -dropped=protOut.psl -overlapThreshold=0.90 \ -selectFmt=psl -inFmt=psl tight_mrna.psl protBlat.psl protMrna.stat overlapSelect -mergeOutput -dropped=protOut.psl -overlapThreshold=0.90 -selectFmt=psl \ -inFmt=psl tight_mrna.psl protBlat.psl protMrna.out # Create protein/mRNA pair and protein lists cut -f 10,31 protMrna.out|sort -u >spMrna.tab cut -f 10 protMrna.out|sort -u >protein.lis # Load spMrna.tab into spMrna table in temp DB. hgsql kgHg17FTemp < ~/src/hg/lib/spMrna.sql hgsql kgHg17FTemp -e 'load data local infile "spMrna.tab" into table spMrna' hgsql kgHg17FTemp -e 'create index mrnaID on spMrna(mrnaID)' # Prepare and perform cluster run of protein/mRNA alignment # Get mRNA fa file. cd /cluster/data/hg17/bed/kgHg17F /cluster/data/genbank/bin/i386/gbGetSeqs -native -db=hg17 \ -gbRoot=/cluster/data/genbank genbank mrna mrna.fa # Create mrnaSeq table in kgHg17FTemp DB. faToTab mrna.fa mrnaSeq.tab hgsql kgHg17FTemp -e 'drop table mrnaSeq' hgsql kgHg17FTemp <~/src/hg/lib/mrnaSeq.sql hgsql kgHg17FTemp -e 'load data local infile "mrnaSeq.tab" into table mrnaSeq' # Prepare files for cluster run ~/src/hg/protein/KG2.sh kgHg17F hg17 050415 # Perform cluster run of protein/mRNA alignment ~/src/hg/protein/KG3.sh kgHg17F hg17 050415 # Collect cluster run results cd kgBestMrna ls out | sed -e 's/prot/do1 prot/g' >doall # create do1 with the following 2 lines: cat << '_EOF_' > do1 echo processing $1 cat out/$1/*.out >>protMrnaRaw.psl '_EOF_' chmod +x do* doall # Filter out low quality alignments pslReps -nohead -singleHit -minAli=0.9 protMrnaRaw.psl protMrnaBlat.psl /dev/null cut -f 10,14 protMrnaBlat.psl |sort -u >protMrna.lis wc protMrna.lis # Load BLAT results into temp DB. hgsql kgHg17FTemp < ~/src/hg/lib/protMrnaBlat.sql hgsql kgHg17FTemp -e 'load data local infile "protMrnaBlat.psl" into table protMrnaBlat' hgsql kgHg17FTemp -e 'create index tName on protMrnaBlat(tName)' # Create CDS files from protein/mRNA alignment results. hgsql kgHg17FTemp -N -e \ 'select qName,"_",tName,tStart+1,":",tEnd+3 from protMrnaBlat order by qName,tName,tEnd-tStart desc'\ |sed 's/\t_\t/_/g'|sed 's/\t:\t/../g' >protMrna.cds # Create protMrna.psl with proteinID_mrnaID as query ID. cut -f 22-30 ../protMrna.out > j1.tmp cut -f 32-42 ../protMrna.out > j2.tmp cut -f 10,31 ../protMrna.out|sed -e 's/\t/_/g' >j3.tmp paste j1.tmp j3.tmp j2.tmp >protMrna.psl rm j1.tmp j2.tmp j3.tmp # Run mrnaToGene to create protMrna.gp bash mrnaToGene -cdsFile=protMrna.cds protMrna.psl protMrna.gp 2>protMrna.err >protMrna.log exit # Prepare refGene and all_mrna gp files. cd .. hgsql hg17 -N -e 'select * from refGene' >ref.gp hgsql hg17 -N -e \ 'select gbCdnaInfo.acc,cds.name from gbCdnaInfo,cds,all_mrna where all_mrna.qName=gbCdnaInfo.acc and gbCdnaInfo.cds=cds.id' \ |sort -u > all_mrna.cds bash mrnaToGene -cdsFile=all_mrna.cds all_mrna.psl all_mrna.gp 2>all_mrna.err > all_mrna.log exit # Align proteins to RefSeq. 
overlapSelect -inCds -statsOutput -overlapThreshold=0.90 -selectFmt=psl -inFmt=genePred\ protBlat.psl ref.gp ref.stat overlapSelect -inCds -dropped=refOut1.gp -overlapThreshold=0.90 -selectFmt=psl -inFmt=genePred\ protBlat.psl ref.gp protRef.gp overlapSelect -mergeOutput -selectCds -dropped=protOut1.psl -overlapThreshold=0.80 -inFmt=psl\ -selectFmt=genePred ref.gp protBlat.psl protRef.out cut -f 10,22 protRef.out | sort -u >spRef.tab cut -f 10 protRef.out | sort -u >protRef.lis hgsql kgHg17FTemp -e 'drop table spRef' hgsql kgHg17FTemp <~/src/hg/lib/spRef.sql hgsql kgHg17FTemp -e 'load data local infile "spRef.tab" into table spRef' # Prepare and perform cluster runs for protein/RefSeq alignments ~/src/hg/protein/KGRef2.sh kgHg17F hg17 050415 ~/src/hg/protein/KGRef3.sh kgHg17F hg17 050415 cd kgBestRef ls out | sed -e 's/prot/do1 prot/g' >doall cat << '_EOF_' > do1 echo processing $1 cat out/$1/*.out >>protRefRaw.psl '_EOF_' chmod +x do* doall # Filter out low quality alignments. pslReps -nohead -singleHit -minAli=0.9 protRefRaw.psl protRefBlat.psl /dev/null cut -f 10,14 protRefBlat.psl |sort -u >protRef.lis wc protRef.lis hgsql kgHg17FTemp -e 'drop table protRefBlat' hgsql kgHg17FTemp < ~/src/hg/lib/protRefBlat.sql hgsql kgHg17FTemp -e 'load data local infile "protRefBlat.psl" into table protRefBlat' hgsql kgHg17FTemp -e 'create index tName on protRefBlat(tName)' # Run gene-check to filter out invalid gp entries cd /cluster/data/hg17/bed/kgHg17F cat ref.gp kgBestMrna/protMrna.gp all_mrna.gp >kgCandidate0.gp gene-check -incl-ok -ok-genepred-out kgCandidate0.passed.gp -nib-dir /cluster/data/hg17/nib kgCandidate0.gp kgCandidate0.check hgsql kgHg17FTemp -e 'drop table kgCandidate0' hgsql kgHg17FTemp < ~/src/hg/lib/kgCandidate0.sql hgsql kgHg17FTemp -e 'load data local infile "kgCandidate0.gp" into table kgCandidate0' hgsql kgHg17FTemp -e 'drop table geneCheck' hgsql kgHg17FTemp < ~/src/hg/lib/geneCheck.sql hgsql kgHg17FTemp -e 'load data local infile "kgCandidate0.check" into table geneCheck ignore 2 lines' # Run kgCheck to get all KG candidates that pass the KG gene check criteria kgCheck kgHg17FTemp hg17 kgCandidate0 geneCheck kgCandidate.tab hgsql kgHg17FTemp -e 'drop table kgCandidate' hgsql kgHg17FTemp < ~/src/hg/lib/kgCandidate.sql hgsql kgHg17FTemp -e 'load data local infile "kgCandidate.tab" into table kgCandidate' hgsql kgHg17FTemp -e 'create index alignID on kgCandidate(alignID)' # ####### NEXT TIME AROUND PUT IN AN EXTRA STEP TO BRING IN ITEMS ON A "PUT BACK" LIST # FOR SPECIAL CASES LIKE SELENOCYSTEINE, NON-AUG INITIATION CODON, RIBOSOMAL SLIPPAGE, ETC. # ####### # Construct the kgCandidateX table that has alignID in the name field. cut -f 2-10 kgCandidate.tab >j2.tmp cut -f 11 kgCandidate.tab >j1.tmp paste j1.tmp j2.tmp >kgCandidateX.tab hgsql kgHg17FTemp -e 'drop table kgCandidateX' hgsql kgHg17FTemp < ~/src/hg/lib/kgCandidateX.sql hgsql kgHg17FTemp -e 'load data local infile "kgCandidateX.tab" into table kgCandidateX' # Score protein/mRna and protein/RefSeq alignments kgResultBestMrna2 050415 kgHg17FTemp hg17|sort -u >protMrnaBlatScore.tab kgResultBestRef2 050415 kgHg17FTemp hg17|sort -u >protRefScore.tab # Combine scoring results and load them into temp DB. 
cat protMrnaBlatScore.tab protRefScore.tab >protMrnaScore.tab
hgsql kgHg17FTemp -e 'drop table protMrnaScore'
hgsql kgHg17FTemp < ~/src/hg/lib/protMrnaScore.sql
hgsql kgHg17FTemp -e 'load data local infile "protMrnaScore.tab" into table protMrnaScore'
hgsql kgHg17FTemp -e 'create index mrnaAcc on protMrnaScore(mrnaAcc)'

# Run kgGetCds to get CDS structure of each gene
kgGetCds kgHg17FTemp kgCandidateX jY.tmp
cat jY.tmp |sort -u >kgCandidateY.tab
rm jY.tmp
hgsql kgHg17FTemp -e 'drop table kgCandidateY'
hgsql kgHg17FTemp < ~/src/hg/lib/kgCandidateY.sql
hgsql kgHg17FTemp -e 'load data local infile "kgCandidateY.tab" into table kgCandidateY'

# Run kgPickPrep to replace long cds structure string with cdsId.
kgPickPrep kgHg17FTemp kgCandidateZ.tab
hgsql kgHg17FTemp -e 'drop table kgCandidateZ'
hgsql kgHg17FTemp < ~/src/hg/lib/kgCandidateZ.sql
hgsql kgHg17FTemp -e 'load data local infile "kgCandidateZ.tab" into table kgCandidateZ'
hgsql kgHg17FTemp -e 'create index cdsId on kgCandidateZ(cdsId)'

# Run kgPick to pick a representative mRNA/protein pair for each unique CDS structure.
kgPick kgHg17FTemp hg17 proteins050415 kg3.tmp dupSpMrna.tmp
sort -u dupSpMrna.tmp >dupSpMrna.tab

# Sort KG genes to make the kg3.gp table file.
~/kent/src/hg/protein/sortKg.pl kg3.tmp >kg3.gp
hgsql kgHg17FTemp -e 'drop table knownGene'
hgsql kgHg17FTemp < ~/src/hg/lib/knownGene.sql
hgsql kgHg17FTemp -e 'load data local infile "kg3.gp" into table knownGene'
hgsql hg17 -e 'drop table kg3'
hgsql hg17 < ~/src/hg/lib/kg3.sql
hgsql hg17 -e 'load data local infile "kg3.gp" into table kg3'

# Perform analysis before renaming the kg3 table to knownGene.

# Load data into hg17 knownGene table.
hgsql hg17 -e 'drop table knownGene'
hgsql hg17 < ~/src/hg/lib/knownGene.sql
hgsql hg17 -e 'load data local infile "kg3.gp" into table knownGene'

# Build knownGeneMrna and knownGenePep tables.
kgPepMrna kgHg17FTemp hg17 050415
hgsql hg17 -e 'drop table knownGeneMrna'
hgsql hg17 < ~/src/hg/lib/knownGeneMrna.sql
hgsql hg17 -e 'load data local infile "knownGeneMrna.tab" into table knownGeneMrna'
hgsql hg17 -e 'drop table knownGenePep'
hgsql hg17 < ~/src/hg/lib/knownGenePep.sql
hgsql hg17 -e 'load data local infile "knownGenePep.tab" into table knownGenePep'

# Build kgXref table
kgXref2 kgHg17FTemp 050415 hg17
hgsql hg17 -e 'drop table kgXref'
hgsql hg17 < ~/src/hg/lib/kgXref.sql
hgsql hg17 -e 'load data local infile "kgXref.tab" into table kgXref'

# Build spMrna table
hgsql hg17 -N -e 'select name, proteinID from knownGene' >kgSpMrna.tab
hgsql hg17 -e 'drop table spMrna'
hgsql hg17 <~/src/hg/lib/spMrna.sql
hgsql hg17 -e 'load data local infile "kgSpMrna.tab" into table spMrna'

# Build kgProtMap table
~/src/hg/protein/kgProtMap2.sh kgHg17F hg17 050415
# Update and clean up kgResultBestMrna2.c and then check it in.
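# (Optional sanity check, sketched: the temp and live knownGene tables were
# loaded from the same kg3.gp, so their row counts should agree.)
hgsql hg17 -N -e 'select count(*) from knownGene'
hgsql kgHg17FTemp -N -e 'select count(*) from knownGene'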
#####################################
# Build alias tables. DONE 5/18/05 Fan.

# kgAliasM reads from proteins050415.hugo.symbol, proteins050415.hugo.aliases
# proteins050415.hugo.withdraws, hg17.kgXref.kgID
# to create kgAliasM.tab and geneAlias.tab
# by picking out those kgID items from kgXref where
# kgXref.geneSymbol == hugo.symbol
kgAliasM hg17 proteins050415

# kgAliasKgXref reads from hg17.knownGene.proteinID,
# hg17.knownGene.name, hg17.kgXref.geneSymbol
# to create kgAliasKgXref.tab
kgAliasKgXref hg17

# kgAliasRefseq reads from hg17.knownGene.name,
# hg17.knownGene.proteinID, hg17.kgXref.refseq
# to create kgAliasRefseq.tab
kgAliasRefseq hg17

hgsql sp050415 -N -e 'select name,gene.val from hg17.knownGene,displayId,gene where displayId.val=proteinID and displayId.acc=gene.acc' \
    | sort -u > kgAliasP.tab
hgsql hg17 -N -e 'select name, name from knownGene' >kgAliasDup.tab
hgsql hg17 -N -e 'select mrnaID, dupMrnaID from dupSpMrna' >>kgAliasDup.tab
cat kgAliasM.tab kgAliasRefseq.tab kgAliasKgXref.tab kgAliasP.tab kgAliasDup.tab| \
    sort |uniq > kgAlias.tab
hgsql -e "drop table kgAlias;" hg17
hgsql hg17 < ~/kent/src/hg/lib/kgAlias.sql
hgsql hg17 -e 'LOAD DATA local INFILE "kgAlias.tab" into table kgAlias'

# kgProtAlias reads from hg17.knownGene.name,
# hg17.knownGene.proteinID, hg17.knownGene.alignID,
# proteins050415.spXref3.accession, proteins050415.spSecondaryID, proteins050415.pdbSP.pdb
# to create kgProtAlias.tab
# kgProtAlias hg17 050415
hgsql hg17 -N -e \
    'select kgID, spDisplayID, protAcc from kgXref where protAcc != ""'\
    | sort -u >kgProtAliasNCBI.tab
# include variant splice protein IDs
hgsql hg17 -N -e \
    'select name, proteinID, parAcc from knownGene,sp050415.varAcc where varAcc=proteinID'\
    |sort -u >kgProtAliasDup.tab
# include duplicate protein IDs from dupSpMrna table
hgsql hg17 -N -e \
    'select name, knownGene.proteinID, dupProteinID from knownGene, dupSpMrna where name=mrnaID'\
    |sort -u >>kgProtAliasDup.tab
# catch parent acc from dupProteinID too
hgsql hg17 -N -e\
    'select name, knownGene.proteinID, parAcc from knownGene,dupSpMrna,sp050415.varAcc where name=mrnaID and dupProteinID=varAcc.varAcc'\
    |sort -u >>kgProtAliasDup.tab
cat kgProtAliasNCBI.tab kgProtAlias.tab kgProtAliasDup.tab | sort -u > kgProtAliasAll.tab
echo "`date` creating table kgProtAlias"
hgsql hg17 -e "drop table kgProtAlias;"
hgsql hg17 <~/src/hg/lib/kgProtAlias.sql;
hgsql hg17 -e 'LOAD DATA local INFILE "kgProtAliasAll.tab" into table kgProtAlias;'

# Build kgSpAlias table
hgsql hg17 -e \
    'select kgXref.kgID, spID, alias from kgXref, kgAlias where kgXref.kgID=kgAlias.kgID' >j.tmp
hgsql hg17 -e \
    'select kgXref.kgID, spID, alias from kgXref, kgProtAlias where kgXref.kgID=kgProtAlias.kgID'\
    >>j.tmp
cat j.tmp|sort -u |grep -v 'kgID' >hg17.kgSpAlias.tab
rm j.tmp
hgsql hg17 -e 'drop table kgSpAlias';
hgsql hg17 < ~/src/hg/lib/kgSpAlias.sql
hgsql hg17 -e 'load data local infile "hg17.kgSpAlias.tab" into table kgSpAlias'

# MAKE FOLDUTR TABLES (DONE 2005-05-19, Fan)
# First set up directory structure and extract UTR sequence on hgwdev
ssh hgwdev
cd /cluster/data/hg17/bed
mkdir rnaStruct.2005-05-18
rm rnaStruct
ln -s rnaStruct.2005-05-18 rnaStruct
cd rnaStruct
mkdir -p utr3/split utr5/split utr3/fold utr5/fold
utrFa hg17 knownGene utr3 utr3/utr.fa
utrFa hg17 knownGene utr5 utr5/utr.fa
# Split up files and make files that define job.
ssh kk
cd /cluster/data/hg17/bed/rnaStruct
faSplit sequence utr3/utr.fa 50000 utr3/split/s
faSplit sequence utr5/utr.fa 50000 utr5/split/s
ls -1 utr3/split > utr3/in.lst
ls -1 utr5/split > utr5/in.lst
cd utr3
cat > gsub < cgapAliasSorted.tab
hgsql hg17 < ~/kent/src/hg/lib/cgapAlias.sql
hgsql hg17 -e 'load data local infile "cgapAliasSorted.tab" \
    into table cgapAlias'

# LOAD ENSEMBL GENES (DONE, 5/23/05, Fan)
# Ensembl changed things again! Please note there are two subtle changes to make it work.
mkdir /cluster/data/hg17/bed/ensembl
cd /cluster/data/hg17/bed/ensembl
mkdir new
cd new
# Get the ensembl protein data from
# http://www.ensembl.org/Homo_sapiens/martview
# Follow this sequence through the pages:
# Page 1) Make sure that the Homo_sapiens choice is selected. Hit next.
# Page 2) Uncheck the "Limit to" box in the region choice. Then hit next.
# Page 3) Choose the "Structures" box.
# Page 4) Choose GTF as the output. Choose gzip compression. Hit export.
# Save as ensemblGene.gtf.gz
# This time, there are some extra lines, like ' 1;',
# that are causing problems, so added an extra filter in the beginning
# to get rid of them.
# Ensembl handles random chromosomes differently than us, so we
# strip this data. Fortunately it just loses a couple of genes.
# Add "chr" to front of each line in the gene data gtf file to make
# it compatible with our software.
# Finally, get rid of the ".1" or ".2" after the name
cat ensemblGene.gtf |sed -e 's/\t\t/xxxxx/g' \
    |grep -v xxxxx \
    | grep -v ^6_DR51 \
    | grep -v ^DR51 \
    | grep -v ^DR52 \
    | grep -v ^DR53 \
    | grep -v _NT_ \
    | perl -wpe 's/^([0-9]|X|Y|Un|MT)/chr$1/ \
    || die "Line $. doesnt start with human chrom:\n$_"' \
    | sed -e 's/chrMT/chrM/g' \
    | sed -e 's/\..\"/\"/g' \
    >ensGene.gtf
ssh hgwdev
cd /cluster/data/hg17/bed/ensembl/new
/cluster/bin/i386/ldHgGene hg17 ensGene ensGene.gtf
# Read 33581 transcripts in 699580 lines in 1 files
# 33581 groups 25 seqs 1 sources 4 feature types
# 33581 gene predictions

# ensGtp associates geneId/transcriptId/proteinId for hgPepPred and
# hgKnownToSuper. Use ensMart to create it as above, except:
# Page 3) Choose the "Features" box. In "Ensembl Attributes", check
# Ensembl Gene ID, Ensembl Transcript ID, Ensembl Peptide ID.
# Choose Text, tab-separated as the output format. Result name ensGtp.
# Save file as ensGtp.txt.gz
gunzip ensGtp.txt.gz
hgsql hg17 -e 'drop table ensGtp'
hgsql hg17 < ~/kent/src/hg/lib/ensGtp.sql
hgsql hg17 -e 'load data local infile "ensGtp.txt" into table ensGtp ignore 1 lines'

# ensMart has some problem with the resulting ensemblPep.fa.gz, so use a
# different processing step instead:
wget --timestamping \
    ftp://ftp.ensembl.org/pub/current_human/data/fasta/pep/Homo_sapiens.NCBI35.may.pep.fa.gz
zcat Homo_sapiens.NCBI35.may.pep.fa.gz | sed -e "s/transcript:/\n>/g" | grep -v 'gene:' >ensPep.fa
faToTab -type=protein ensPep.fa ensPep.tab
hgsql hg17 -e 'drop table ensPep'
hgsql hg17 < ~/kent/src/hg/lib/ensPep.sql
hgsql hg17 -e 'load data local infile "ensPep.tab" into table ensPep'
# kept the following, just in case Ensembl fixes the problem in the future
# Load Ensembl peptides:
# Get them from ensembl as above in the gene section except for
# Page 3) Choose the "Sequences" box.
# Page 4) Transcripts/Proteins. Peptide. Format = FASTA.
# Save file as ensemblPep.fa.gz # gunzip ensemblPep.fa.gz # hgPepPred hg17 ensembl ensemblPep.fa # UPDATE GENE SORTER TABLES (AKA: FAMILY BROWSER) (STARTED - 2005-05-21, DONE 2005-05-23 - Fan) # This should be done after knownGene tables are complete from known gene # process. # # Cluster together various alt-splicing isoforms. # Creates the knownIsoforms and knownCanonical tables ssh hgwdev mkdir /cluster/data/hg17/bed/geneSorter.2005-05-21 # remove old symbolic link rm /cluster/data/hg17/bed/geneSorter ln -s /cluster/data/hg17/bed/geneSorter.2005-05-21 \ /cluster/data/hg17/bed/geneSorter cd /cluster/data/hg17/bed/geneSorter hgClusterGenes hg17 knownGene knownIsoforms knownCanonical # Extract peptides from knownGenes into fasta file # and create a blast database out of them. mkdir /cluster/data/hg17/bed/geneSorter/blastp cd /cluster/data/hg17/bed/geneSorter/blastp pepPredToFa hg17 knownGenePep known.faa # You may need to build this binary in src/hg/near/pepPredToFa /scratch/blast/formatdb -i known.faa -t known -n known # This command is in /projects/compbio/bin/$MACH/formatdb # Copy over database to bluearc rm -fr /cluster/bluearc/hg17/blastp mkdir -p /cluster/bluearc/hg17/blastp cp -p /cluster/data/hg17/bed/geneSorter/blastp/known.* \ /cluster/bluearc/hg17/blastp # Split up fasta file into bite sized chunks for cluster cd /cluster/data/hg17/bed/geneSorter/blastp mkdir split faSplit sequence known.faa 8000 split/kg # Make parasol run directory ssh kk mkdir /cluster/data/hg17/bed/geneSorter/blastp/self cd /cluster/data/hg17/bed/geneSorter/blastp/self mkdir run cd run mkdir out # Make blast script cat << '_EOF_' > blastSome #!/bin/sh BLASTMAT=/cluster/bluearc/blast229/data /cluster/bluearc/blast229/blastall \ -p blastp -d /cluster/bluearc/hg17/blastp/known -i $1 -o $2 \ -e 0.01 -m 8 -b 1000 '_EOF_' # << keep emacs happy chmod +x blastSome # Make gensub2 file cat << '_EOF_' > gsub #LOOP blastSome {check in line+ $(path1)} {check out line out/$(root1).tab} #ENDLOOP '_EOF_' # << keep emacs happy # Create parasol batch # 'ls ../../split/*.fa' is too much, hence the echo echo ../../split/*.fa | wordLine stdin > split.lst gensub2 split.lst single gsub jobList para create jobList para try # Wait a couple of minutes, and do a para check, if all is good # then do a para push # This should finish in ~15 minutes if the cluster is free. # Completed: 7739 of 7739 jobs # CPU time in finished jobs: 150459s 2507.64m 41.79h 1.74d 0.005 y # IO & Wait Time: 22325s 372.09m 6.20h 0.26d 0.001 y # Average job time: 22s 0.37m 0.01h 0.00d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 198s 3.30m 0.06h 0.00d # Submission to last job: 2019s 33.65m 0.56h 0.02d # Load into database. 
This takes about 30 minutes ssh hgwdev cd /cluster/data/hg17/bed/geneSorter/blastp/self/run/out time hgLoadBlastTab hg17 knownBlastTab *.tab # Scanning through 7739 files # Loading database with 9836439 rows # 232.300u 42.580s 23:13.41 19.7% 0+0k 0+0io 205pf+0w cd /cluster/data/hg17/bed/geneSorter # Create table that maps between known genes and RefSeq hgMapToGene hg17 refGene knownGene knownToRefSeq # may need to build this command in src/hg/near/hgMapToGene # hgsql -e "select count(*) from knownToRefSeq;" hg17 # row count changed 34667 # Create table that maps between known genes and LocusLink hgsql --skip-column-names -e "select mrnaAcc,locusLinkId from refLink" hg17 \ > refToLl.txt hgMapToGene hg17 refGene knownGene knownToLocusLink -lookup=refToLl.txt # hgsql -e "select count(*) from knownToLocusLink;" hg17 # row count changed to 34667 # Create table that maps between known genes and Pfam domains hgMapViaSwissProt hg17 knownGene name proteinID Pfam knownToPfam # hgsql -e "select count(*) from knownToPfam;" hg17 # row count changed to 36010 # Create table to map between known genes and GNF Atlas2 # expression data. hgMapToGene hg17 gnfAtlas2 knownGene knownToGnfAtlas2 '-type=bed 12' # hgsql -e "select count(*) from knownToGnfAtlas2;" hg17 # row count changed to 32381 # Create expression distance table - takes about an hour hgExpDistance hg17 hgFixed.gnfHumanAtlas2MedianRatio \ hgFixed.gnfHumanAtlas2MedianExps gnfAtlas2Distance \ -lookup=knownToGnfAtlas2 & # Have 44696 elements in hgFixed.gnfHumanAtlas2MedianRatio # Got 32381 unique elements in hgFixed.gnfHumanAtlas2MedianRatio # hgsql -e "select count(*) from gnfAtlas2Distance;" hg17 # row count changed to 32381000 # Create a table that maps between known genes and # the nice affy expression data. hgMapToGene "-type=bed 12" hg17 affyUclaNorm knownGene knownToU133 # hgsql -e "select count(*) from knownToU133;" hg17 # row count changed to 32886 # Create expression distance table. This will take about 2.5 hours cd /tmp cp -p ~/kent/src/hg/near/hgExpDistance/affyUcla.weight . time hgExpDistance hg17 affyUclaNorm affyUclaExp knownExpDistance \ -weights=affyUcla.weight -lookup=knownToU133 & # Have 43039 elements in affyUclaNorm # 211 genes, 42 weights, 26.500000 total wieght # Got 32886 unique elements in affyUclaNorm # Create table that maps between known genes and # the GNF data. cd /tmp hgMapToGene hg17 affyU95 knownGene knownToU95 # row count changed to 17501 # hgFixed.gnfHumanU95Exps argument is unused, no need to exist hgExpDistance hg17 hgFixed.gnfHumanU95MedianRatio \ hgFixed.gnfHumanU95Exps gnfU95Distance -lookup=knownToU95 & # Have 11545 elements in hgFixed.gnfHumanU95MedianRatio # Got 16450 unique elements in hgFixed.gnfHumanU95MedianRatio # row count changed to 16450000 # Create known gene mapping table and expression distance tables # for GNF Atlas 2. (The hgExpDistance takes only 10 minutes.) hgMapToGene hg17 affyGnf1h knownGene knownToGnf1h hgExpDistance hg17 hgFixed.gnfHumanAtlas2MedianRatio \ hgFixed.gnfHumanAtlas2MedianExps gnfAtlas2Distance \ -lookup=knownToGnf1h & # Have 44696 elements in hgFixed.gnfHumanAtlas2MedianRatio # Got 8814 unique elements in hgFixed.gnfHumanAtlas2MedianRatio cd /cluster/data/hg17/bed/geneSorter hgMapToGene hg17 affyU133Plus2 knownGene knownToU133Plus2 # row count changed to 35055 #### UPDATE GO DATABASE (DONE 5/21/05 Fan) # Download the terms and make the database. 
ssh hgwdev mkdir /cluster/store1/geneOntology/20050521 cd /cluster/store1/geneOntology/20050521 wget --timestamping http://www.godatabase.org/dev/database/archive/latest/go_200504-assocdb-data.gz hgsql mysql <j.tmp hgsql go050521 blastSome < gsub <split.lst #EDIT split.lst to add "../../../geneSorter/blastp/split/" in front of "kg" gensub2 split.lst single gsub spec para create spec para try, check, push, check, ... # Completed: 7739 of 7739 jobs # CPU time in finished jobs: 60973s 1016.22m 16.94h 0.71d 0.002 y # IO & Wait Time: 21292s 354.86m 5.91h 0.25d 0.001 y # Average job time: 11s 0.18m 0.00h 0.00d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 50s 0.83m 0.01h 0.00d # Submission to last job: 570s 9.50m 0.16h 0.01d # Load into database. ssh hgwdev cd /cluster/data/hg17/bed/blastp/ce2/run/out hgLoadBlastTab hg17 ceBlastTab -maxPer=1 *.tab # Scanning through 7739 files # Loading database with 25706 rows # Make mouse ortholog column using blastp on mouse known genes. # First make mouse protein database and copy it to cluster/bluearc # if it doesn't exist already # This already exists. See makeMm6.doc for procedure # the directory: /cluster/bluearc/scratch/mus/mm6/blastp should have data # Make parasol run directory ssh kk mkdir /cluster/data/hg17/bed/geneSorter/blastp/mm6 cd /cluster/data/hg17/bed/geneSorter/blastp/mm6 mkdir run cd run mkdir out # Make blast script cat << '_EOF_' > blastSome #!/bin/sh BLASTMAT=/cluster/bluearc/blast229/data /cluster/bluearc/blast229/blastall \ -p blastp -d /cluster/panasas/home/store/mm6/blastp/known \ -i $1 -o $2 -e 0.001 -m 8 -b 1 '_EOF_' # << keep emacs happy chmod a+x blastSome # Make gensub2 file cat << '_EOF_' > gsub #LOOP blastSome {check in line+ $(path1)} {check out line out/$(root1).tab} #ENDLOOP '_EOF_' # << keep emacs happy # Create parasol batch # this echo trick is used because otherwise the command line is # too long and you can not do a simple ls echo ../../split/*.fa | wordLine stdin > split.lst gensub2 split.lst single gsub jobList para create jobList para try para check para push ... etc ... # Completed: 7739 of 7739 jobs # CPU time in finished jobs: 65337s 1088.95m 18.15h 0.76d 0.002 y # IO & Wait Time: 20794s 346.56m 5.78h 0.24d 0.001 y # Average job time: 11s 0.19m 0.00h 0.00d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 80s 1.33m 0.02h 0.00d # Submission to last job: 598s 9.97m 0.17h 0.01d # Load into database. ssh hgwdev cd /cluster/data/hg17/bed/geneSorter/blastp/mm6/run/out hgLoadBlastTab hg17 mmBlastTab -maxPer=1 *.tab # Scanning through 7739 files # row count changed to 32880 # Make rat ortholog column using blastp on rat known genes. # First make rat protein database and copy it to cluster/bluearc # if it doesn't exist already # This already exists. See makeRn3.doc for procedure. 
# Files were put in this directory: /cluster/bluearc/rn3/blastp/ # Make parasol run directory ssh kk mkdir /cluster/data/hg17/bed/geneSorter/blastp/rn3 cd /cluster/data/hg17/bed/geneSorter/blastp/rn3 mkdir run cd run mkdir out # Make blast script cat << '_EOF_' > blastSome #!/bin/sh BLASTMAT=/cluster/bluearc/blast229/data /cluster/bluearc/blast229/blastall \ -p blastp -d /cluster/bluearc/rn3/blastp/known \ -i $1 -o $2 -e 0.001 -m 8 -b 1 '_EOF_' # << keep emacs happy chmod a+x blastSome # Make gensub2 file cat << '_EOF_' > gsub #LOOP blastSome {check in line+ $(path1)} {check out line out/$(root1).tab} #ENDLOOP '_EOF_' # << keep emacs happy # Create parasol batch # this echo trick is used because otherwise the command line is # too long and you can not do a simple ls echo ../../split/*.fa | wordLine stdin > split.lst gensub2 split.lst single gsub jobList para create jobList para try para check para push ... etc ... # Completed: 7739 of 7739 jobs # CPU time in finished jobs: 28325s 472.08m 7.87h 0.33d 0.001 y # IO & Wait Time: 20416s 340.27m 5.67h 0.24d 0.001 y # Average job time: 6s 0.10m 0.00h 0.00d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 24s 0.40m 0.01h 0.00d # Submission to last job: 617s 10.28m 0.17h 0.01d # Load into database. ssh hgwdev cd /cluster/data/hg17/bed/geneSorter/blastp/rn3/run/out hgLoadBlastTab hg17 rnBlastTab -maxPer=1 *.tab # Scanning through 7739 files # Loading database with 24140 rows # ZEBRAFISH BLASTP FOR GENE SORTER # Make Danio rerio (zebrafish) ortholog column using blastp on Ensembl. # First make protein database and copy it to iscratch/i # if it doesn't exist already: ssh kkstore mkdir /cluster/data/danRer2/bed/blastp cd /cluster/data/danRer2/bed/blastp wget ftp://ftp.ensembl.org/pub/current_zebrafish/data/fasta/pep/Danio_rerio.ZFISH4.may.pep.fa.gz zcat Dan*.pep.fa.gz > ensembl.faa /scratch/blast/formatdb -i ensembl.faa -t ensembl -n ensembl ssh kkr1u00 if (-e /iscratch/i/danRer2/blastp) then rm -r /iscratch/i/danRer2/blastp endif mkdir -p /iscratch/i/danRer2/blastp cp /cluster/data/danRer2/bed/blastp/ensembl.p?? /iscratch/i/danRer2/blastp iSync # Make parasol run directory ssh kk mkdir -p /cluster/data/hg17/bed/blastp/danRer2/run/out cd /cluster/data/hg17/bed/blastp/danRer2/run # Make blast script cat > blastSome < gsub < split.lst gensub2 split.lst single gsub spec para create spec para try, check, push, check, ... # Completed: 7739 of 7739 jobs # CPU time in finished jobs: 113595s 1893.26m 31.55h 1.31d 0.004 y # IO & Wait Time: 26231s 437.18m 7.29h 0.30d 0.001 y # Average job time: 18s 0.30m 0.01h 0.00d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 99s 1.65m 0.03h 0.00d # Submission to last job: 445s 7.42m 0.12h 0.01d # Load into database. ssh hgwdev cd /cluster/data/hg17/bed/blastp/danRer2/run/out hgLoadBlastTab hg17 drBlastTab -maxPer=1 *.tab # Scanning through 7739 files # Loading database with 30731 rows # Make Saccharomyces cerevisiae (yeast) ortholog column using blastp on RefSeq. 
# First make protein database and copy it to cluster/bluearc
# if it doesn't exist already
# This is already done, see makeMm3.doc for procedure
# the directory: /cluster/bluearc/sc1/blastp should have data
# Make parasol run directory
ssh kk
mkdir /cluster/data/hg17/bed/geneSorter/blastp/sc1
cd /cluster/data/hg17/bed/geneSorter/blastp/sc1
mkdir run
cd run
mkdir out
# Make blast script
cat << '_EOF_' > blastSome
#!/bin/sh
BLASTMAT=/cluster/bluearc/blast229/data /cluster/bluearc/blast229/blastall \
    -p blastp -d /cluster/bluearc/sc1/blastp/sgd \
    -i $1 -o $2 -e 0.01 -m 8 -b 1
'_EOF_'
# << keep emacs happy
chmod a+x blastSome
# Make gensub2 file
cat << '_EOF_' > gsub
#LOOP
blastSome {check in line+ $(path1)} {check out line out/$(root1).tab}
#ENDLOOP
'_EOF_'
# << keep emacs happy
# Create parasol batch
echo ../../split/*.fa | wordLine stdin > split.lst
gensub2 split.lst single gsub jobList
para create jobList
para try
para check
para push
... etc ...
# Completed: 7739 of 7739 jobs
# CPU time in finished jobs: 18630s 310.50m 5.17h 0.22d 0.001 y
# IO & Wait Time: 20776s 346.27m 5.77h 0.24d 0.001 y
# Average job time: 5s 0.08m 0.00h 0.00d
# Longest running job: 0s 0.00m 0.00h 0.00d
# Longest finished job: 15s 0.25m 0.00h 0.00d
# Submission to last job: 295s 4.92m 0.08h 0.00d
ssh hgwdev
cd /cluster/data/hg17/bed/geneSorter/blastp/sc1/run/out
hgLoadBlastTab hg17 scBlastTab -maxPer=1 *.tab
# Loading database with 16540 rows

# Make Drosophila melanogaster ortholog column using blastp on FlyBase.
# First make SwissProt protein database and copy it to cluster/bluearc
# The following section was already done.
# cd /cluster/data/dm1/bed
# mkdir blastp
# cd blastp
#wget ftp://ftp.fruitfly.org/pub/download/dmel_RELEASE3-1/FASTA/whole_genome_translation_dmel_RELEASE3-1.FASTA.gz
# zcat whole_ge*.gz | faFlyBaseToUcsc stdin flyBase.faa
# formatdb -i flyBase.faa -t flyBase -n flyBase
# if (-e /cluster/bluearc/dm1/blastp) then
#     rm -r /cluster/bluearc/dm1/blastp
# endif
# mkdir -p /cluster/bluearc/dm1/blastp
# cp /cluster/data/dm1/bed/blastp/flyBase.p?? /cluster/bluearc/dm1/blastp
# Make parasol run directory
ssh kk
mkdir /cluster/data/hg17/bed/geneSorter/blastp/dm1
cd /cluster/data/hg17/bed/geneSorter/blastp/dm1
mkdir run
cd run
mkdir out
# Make blast script
cat << '_EOF_' > blastSome
#!/bin/sh
BLASTMAT=/cluster/bluearc/blast229/data /cluster/bluearc/blast229/blastall \
    -p blastp -d /cluster/bluearc/dm1/blastp/flyBase \
    -i $1 -o $2 -e 0.01 -m 8 -b 1
'_EOF_'
# << keep emacs happy
chmod a+x blastSome
# Make gensub2 file
cat << '_EOF_' > gsub
#LOOP
blastSome {check in line+ $(path1)} {check out line out/$(root1).tab}
#ENDLOOP
'_EOF_'
# << keep emacs happy
# Create parasol batch
echo ../../split/*.fa | wordLine stdin > split.lst
gensub2 split.lst single gsub jobList
para create jobList
para try
para check
para push
... etc ...
# Completed: 7739 of 7739 jobs
# CPU time in finished jobs: 73518s 1225.30m 20.42h 0.85d 0.002 y
# IO & Wait Time: 45038s 750.63m 12.51h 0.52d 0.001 y
# Average job time: 15s 0.26m 0.00h 0.00d
# Longest running job: 0s 0.00m 0.00h 0.00d
# Longest finished job: 69s 1.15m 0.02h 0.00d
# Submission to last job: 762s 12.70m 0.21h 0.01d
# Load into database.
ssh hgwdev
cd /cluster/data/hg17/bed/geneSorter/blastp/dm1/run/out
hgLoadBlastTab hg17 dmBlastTab -maxPer=1 *.tab
# Loading database with 27212 rows

# update knownToHInv table
# Verified that there is no new release of HInv data.
hgMapToGene hg17 HInvGeneMrna knownGene knownToHInv
# count changed to 28851
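# (Optional spot check, a sketch: the freshly rebuilt mapping table should be
# non-empty and near the count noted above.)
hgsql hg17 -N -e 'select count(*) from knownToHInv'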
# The new KG process no longer needs entries in knownGeneLink (which used to
# store info for DNA-based RefSeqs), so clean out the old data in knownGeneLink.
hgsql hg17 -e "delete from knownGeneLink"

#### RE-BUILD SUPERFAMILY RELATED TABLES (DONE - 2005-05-27 - Fan)

# Download latest Superfamily data files and build the Superfamily DB
# from supfam.mrc-lmb.cam.ac.uk
mkdir /cluster/store10/superfamily/050524
ln -s /cluster/store10/superfamily/050524 /cluster/data/superfamily/050524
cd /cluster/data/superfamily/050524
# ftp over the following two files:
#     ass_22-May-2005.tab.gz
#     supfam_22-May-2005.sql.gz
gzip -d *.gz
# Load the Superfamily database
hgsql hg17 -e "create database superfam050524"
nice hgsql superfam050524 < supfam_22-May-2005.sql &
# This may take about an hour.
# Make sure to add an index on id of the des table of superfam050524.
hgsql superfam050524 -e "create index id on des(id);"
hgsql superfam050524 < ~/src/hg/lib/sfAssign.sql
hgsql superfam050524 -e 'load data local infile "ass_22-May-2005.tab" into table superfam050524.sfAssign;'

# Build or rebuild Superfamily track and create sf tables needed for PB
hgsql hg17 < ~/src/hg/lib/sfAssign.sql
cd /cluster/data/superfamily/050524
hgsql hg17 -e 'load data local infile "ass_22-May-2005.tab" into table hg17.sfAssign;'
# If hg17.sfDes already exists, drop it.
hgsql superfam050524 -N -e "select * from des" >sfDes.tab
hgsql hg17 < ~/src/hg/lib/sfDes.sql
hgsql hg17 -e 'load data local infile "sfDes.tab" into table sfDes'
# If hg17.superfamily already exists, drop it.
cd /cluster/data/hg17/bed
mkdir /cluster/data/hg17/sf.2004-1128
ln -s sf.2004-1128 sf
hgSuperfam hg17 > sf.log
# It is normal that many proteins do not have corresponding Superfamily entries.
# If hg17.sfDescription exists, drop it.
hgsql hg17 < ~/src/hg/lib/sfDescription.sql
hgsql hg17 -e 'LOAD DATA local INFILE "sfDescription.tab" into table hg17.sfDescription;'
# Finally, load the superfamily table.
hgLoadBed hg17 superfamily superfamily.tab -tab

# Create knownToSuperfamily table
# Note hs is changed into ht for this Superfamily release.
cat /cluster/data/superfamily/050524/ass_22-May-2005.tab \
    | hgKnownToSuper hg17 hs stdin
# created 25287 rows in knownToSuper

# Build tables needed by pbGlobal in proteins050415
cd /cluster/data/superfamily/050524
hgsql proteins050415 -e 'load data local infile "ass_22-May-2005.tab" into table sfAssign'
hgsql proteins050415 -e 'load data local infile "sfDes.tab" into table sfDes'
cd /cluster/store10/kg/kgHg17F
hgsql proteins050415 -e 'load data local infile "ensemblXref.tab" into table ensemblXref'
# These sf tables and ensemblXref3 are needed for non-HMR KG proteins.
# Should add content of ensemblXref3 of mm6 after it is done.
# And similarly for rn4 and possibly for other non-HMR species.

# CCDS <-> knownGene mapping needs to be updated (markd 2005-05-29)
# this should be part of the known gene build
/cluster/data/genbank/bin/i386/mkCcdsGeneMap -db=hg17 -loadDb ccdsGene knownGene ccdsKgMap

# AUGUSTUS GENES (DONE 6/1/2005 Andy)
ssh hgwdev
cd /cluster/data/hg17/bed
mkdir augustus
cd augustus/
wget http://augustus.gobics.de/predictions/hg17/hg17.allchr.augustus.gtf.gz
cp /cluster/data/dm2/bed/augustus/cleanAugustus.awk .
zcat hg17.allchr.augustus.gtf.gz | awk -f cleanAugustus.awk | gzip > hg17.allchr.augustus.clean.gtf.gz ldHgGene -gtf hg17 augustus hg17.allchr.augustus.clean.gtf.gz rm hg17.allchr.augustus.gtf.gz # MAKE Mouse Proteins track (DONE for chr13 braney ~5/25/05) ssh kkstore01 mkdir -p /cluster/data/hg17/blastDb cd /cluster/data/hg17/blastDb awk "{print \$2}" ../*/chr*/*.lft > subChr.lst for i in `cat subChr.lst` do ln -s ../*/chr*/$i.fa echo formatdb -i $i.fa -p F formatdb -i $i.fa -p F done rm *.log *.fa list cd .. for i in `cat chrom.lst`; do cat $i/chr*/*.lft ; done > jkStuff/subChr.lft ssh kkr1u00 rm -rf /iscratch/i/hg17/blastDb mkdir -p /iscratch/i/hg17/blastDb cd /cluster/data/hg17/blastDb for i in nhr nin nsq; do cp *.$i /iscratch/i/hg17/blastDb ; echo $i; done cd iSync > sync.out mkdir -p /cluster/data/hg17/bed/tblastn.mm6KG cd /cluster/data/hg17/bed/tblastn.mm6KG echo /panasas/store/hg17/blastDb/*.nsq | xargs ls -S | sed "s/\.nsq//" > query.lst # back to kkstore01 exit cd /cluster/data/hg17/bed/tblastn.mm6KG rm -rf /cluster/bluearc/hg17/bed/tblastn.mm6KG/kgfa mkdir -p /cluster/bluearc/hg17/bed/tblastn.mm6KG/kgfa split -l 560 /cluster/data/mm6/bed/blat.mm6KG/mm6KG.psl /cluster/bluearc/hg17/bed/tblastn.mm6KG/kgfa/kg ln -s /cluster/bluearc/hg17/bed/tblastn.mm6KG/kgfa kgfa cd kgfa for i in *; do pslxToFa $i $i.fa; rm $i; done cd .. ls -1S kgfa/*.fa > kg.lst rm -rf /cluster/bluearc/hg17/bed/tblastn.mm6KG/blastOut mkdir -p /cluster/bluearc/hg17/bed/tblastn.mm6KG/blastOut ln -s /cluster/bluearc/hg17/bed/tblastn.mm6KG/blastOut for i in `cat kg.lst`; do mkdir blastOut/`basename $i .fa`; done tcsh cat << '_EOF_' > blastGsub #LOOP blastSome $(path1) {check in line $(path2)} {check out exists blastOut/$(root2)/q.$(root1).psl } #ENDLOOP '_EOF_' cat << '_EOF_' > blastSome #!/bin/sh BLASTMAT=/iscratch/i/blast/data export BLASTMAT g=`basename $2` f=/tmp/`basename $3`.$g for eVal in 0.01 0.001 0.0001 0.00001 0.000001 1E-09 1E-11 do if /scratch/blast/blastall -M BLOSUM80 -m 0 -F no -e $eVal -p tblastn -d $1 -i $2 -o $f.8 then mv $f.8 $f.1 break; fi done if test -f $f.1 then if /cluster/bin/i386/blastToPsl $f.1 $f.2 then liftUp -nosort -type=".psl" -nohead $f.3 ../../jkStuff/subLiftAll.lft carry $f.2 liftUp -nosort -type=".psl" -nohead $f.4 ../../jkStuff/liftAll.lft carry $f.3 liftUp -nosort -type=".psl" -pslQ -nohead $3.tmp /cluster/data/mm6/bed/blat.mm6KG/protein.lft warn $f.4 if pslCheck -prot $3.tmp then mv $3.tmp $3 rm -f $f.1 $f.2 $f.3 $f.4 fi exit 0 fi fi rm -f $f.1 $f.2 $3.tmp $f.8 $f.3 $f.4 exit 1 '_EOF_' chmod +x blastSome gensub2 query.lst kg.lst blastGsub blastSpec ssh kk cd /cluster/data/hg17/bed/tblastn.mm6KG para create blastSpec para push # Completed: 214524 of 214524 jobs # CPU time in finished jobs: 44907411s 748456.85m 12474.28h 519.76d 1.424 y # IO & Wait Time: 712709s 11878.48m 197.97h 8.25d 0.023 y # Average job time: 213s 3.54m 0.06h 0.00d # Longest finished job: 1363s 22.72m 0.38h 0.02d # Submission to last job: 75910s 1265.17m 21.09h 0.88d # just for chr13 # completed: 55290 of 55290 jobs # cCPU time in finished jobs: 1487547s 24792.46m 413.21h 17.22d 0.047 y # cIO & Wait Time: 148854s 2480.89m 41.35h 1.72d 0.005 y # cAverage job time: 30s 0.49m 0.01h 0.00d # cLongest running job: 0s 0.00m 0.00h 0.00d # cLongest finished job: 98s 1.63m 0.03h 0.00d # cSubmission to last job: 3904s 65.07m 1.08h 0.05d cat << '_EOF_' > chainGsub #LOOP chainSome $(path1) #ENDLOOP '_EOF_' ssh kki cd /cluster/data/hg17/bed/tblastn.mm6KG tcsh cat << '_EOF_' > chainOne (cd $1; cat q."$2"* | simpleChain 
-prot -outPsl -maxGap=200000 stdin ../c.`basename $1`.$2.psl)
'_EOF_'
chmod +x chainOne
for j in blastOut/kg??; do for i in `cat ../../chrom.lst`; do echo chainOne $j chr"$i"; done ; done > chainSpec
para create chainSpec
para push
# CPU time in finished jobs: 90s 1.50m 0.03h 0.00d 0.000 y
# IO & Wait Time: 19151s 319.18m 5.32h 0.22d 0.001 y
# Average job time: 3s 0.04m 0.00h 0.00d
# Longest finished job: 5s 0.08m 0.00h 0.00d
# Submission to last job: 1642s 27.37m 0.46h 0.02d
# Completed: 7695 of 7695 jobs
# CPU time in finished jobs: 48s 0.80m 0.01h 0.00d 0.000 y
# IO & Wait Time: 18931s 315.51m 5.26h 0.22d 0.001 y
# Average job time: 2s 0.04m 0.00h 0.00d
# Longest finished job: 6s 0.10m 0.00h 0.00d
# Submission to last job: 1618s 26.97m 0.45h 0.02d
exit
# back to kkstore01
cd /cluster/data/hg17/bed/tblastn.mm6KG/blastOut
for i in kg??
do
    # keep alignments that span more than 60% of the query protein
    # (psl columns: $13,$12,$11 = qEnd,qStart,qSize)
    cat c.$i.*.psl | awk "(\$13 - \$12)/\$11 > 0.6 {print}" > c60.$i.psl
    sort -rn c60.$i.psl | pslUniq stdin u.$i.psl
    # keep near-full-length hits: matches ($1) over 90% of qSize ($11)
    awk "((\$1 / \$11) ) > 0.90 { print }" c60.$i.psl > m60.$i.psl
    echo $i
done
cat u.*.psl m60* | sort -T /tmp -k 14,14 -k 17,17n -k 17,17n | uniq > /cluster/data/hg17/bed/tblastn.mm6KG/blastMm6KG.psl
cd ..
ssh hgwdev
cd /cluster/data/hg17/bed/tblastn.mm6KG
hgLoadPsl hg17 blastMm6KG.psl
# 1425966 bases of 64944656 (2.196%)
# back to kkstore01
rm -rf blastOut
# End tblastn of mouse proteins

####################################################################################
# RE-BUILD KNOWN GENES TABLES, 3RD TRIAL WITH CORRECTED kgCheck and kgGetCds (DONE 6/5/05 Fan)

# Start from the step where gene-check is run and kgCandidate0.gp is produced.
cd /cluster/store10/kg/kgHg17F
mkdir try3
cd try3
hgsql kgHg17FTempTry3 -e 'drop table kgCandidate0'
hgsql kgHg17FTempTry3 < ~/src/hg/lib/kgCandidate0.sql
hgsql kgHg17FTempTry3 -e 'load data local infile "../kgCandidate0.gp" into table kgCandidate0'
hgsql kgHg17FTempTry3 -e 'drop table geneCheck'
hgsql kgHg17FTempTry3 < ~/src/hg/lib/geneCheck.sql
hgsql kgHg17FTempTry3 -e 'load data local infile "../kgCandidate0.check" into table geneCheck ignore 2 lines'

# Run kgCheck to get all KG candidates that pass the KG gene check criteria
kgCheck kgHg17FTempTry3 hg17 kgCandidate0 geneCheck kgCandidate.tab
hgsql kgHg17FTempTry3 -e 'drop table kgCandidate'
hgsql kgHg17FTempTry3 < ~/src/hg/lib/kgCandidate.sql
hgsql kgHg17FTempTry3 -e 'load data local infile "kgCandidate.tab" into table kgCandidate'
hgsql kgHg17FTempTry3 -e 'create index alignID on kgCandidate(alignID)'

# Construct the kgCandidateX table that has alignID in the name field.
cut -f 2-10 kgCandidate.tab >j2.tmp
cut -f 11 kgCandidate.tab >j1.tmp
paste j1.tmp j2.tmp >kgCandidateX.tab
hgsql kgHg17FTempTry3 -e 'drop table kgCandidateX'
hgsql kgHg17FTempTry3 < ~/src/hg/lib/kgCandidateX.sql
hgsql kgHg17FTempTry3 -e 'load data local infile "kgCandidateX.tab" into table kgCandidateX'

# Score protein/mRna and protein/RefSeq alignments
# kgResultBestMrna2 050415 kgHg17FTempTry3 hg17|sort -u >protMrnaBlatScore.tab
# kgResultBestRef2 050415 kgHg17FTempTry3 hg17|sort -u >protRefScore.tab
# Combine scoring results and load them into temp DB.
cat protMrnaBlatScore.tab protRefScore.tab >protMrnaScore.tab
hgsql kgHg17FTempTry3 -e 'drop table protMrnaScore'
hgsql kgHg17FTempTry3 < ~/src/hg/lib/protMrnaScore.sql
hgsql kgHg17FTempTry3 -e 'load data local infile "../protMrnaScore.tab" into table protMrnaScore'
hgsql kgHg17FTempTry3 -e 'create index mrnaAcc on protMrnaScore(mrnaAcc)'

# Run kgGetCds to get CDS structure of each gene
kgGetCds kgHg17FTempTry3 kgCandidateX jY.tmp1
cat jY.tmp1 |sort -u >kgCandidateY.tab
rm jY.tmp1
hgsql kgHg17FTempTry3 -e 'drop table kgCandidateY'
hgsql kgHg17FTempTry3 < ~/src/hg/lib/kgCandidateY.sql
hgsql kgHg17FTempTry3 -e 'load data local infile "kgCandidateY.tab" into table kgCandidateY'

# Run kgPickPrep to replace long cds structure string with cdsId.
kgPickPrep kgHg17FTempTry3 kgCandidateZ.tab
hgsql kgHg17FTempTry3 -e 'drop table kgCandidateZ'
hgsql kgHg17FTempTry3 < ~/src/hg/lib/kgCandidateZ.sql
hgsql kgHg17FTempTry3 -e 'load data local infile "kgCandidateZ.tab" into table kgCandidateZ'
hgsql kgHg17FTempTry3 -e 'create index cdsId on kgCandidateZ(cdsId)'

# Run kgPick to pick a representative mRNA/protein pair for each unique CDS structure.
kgPick kgHg17FTempTry3 hg17 proteins050415 kg3Try3.tmp dupSpMrna.tmp
cat kg3Try3.tmp | grep NM_ > jNM
cat kg3Try3.tmp | grep -v NM_ >jnoNM
cut -f 1 jnoNM | sed -e "s/_/_\n/" |grep -v _ >jnoNM1
cut -f 2-12 jnoNM >jnoNM2
paste jnoNM1 jnoNM2 > kg3Try3B.tmp
cat jNM >> kg3Try3B.tmp
sort -u dupSpMrna.tmp >dupSpMrna.tab
hgsql hg17 -e 'drop table dupSpMrna'
hgsql hg17 < ~/src/hg/lib/dupSpMrna.sql
hgsql hg17 -e 'load data local infile "dupSpMrna.tab" into table dupSpMrna'

# Add entries from the put-back list
# Obtain from Mark the put-back list, kgPutBack.lis, for human RefSeq.
hgsql kgHg17FTempTry3 -e 'drop table kgPutBack'
hgsql kgHg17FTempTry3 < ~/src/hg/lib/kgPutBack.sql
hgsql kgHg17FTempTry3 -e 'load data local infile "kgPutBack.lis" into table kgPutBack'
kgPutBack kgHg17FTempTry3 hg17 proteins050415 kgPutBack kgPutBack.gp

# Sort KG genes to make the kg3Try3.gp table file.
cat kg3Try3B.tmp kgPutBack.gp >kg3Try3C.tmp
~/kent/src/hg/protein/sortKg.pl kg3Try3C.tmp >kg3Try3.gp
# Manually edit to correct a one-line problem with O75438_BC009691
hgsql kgHg17FTempTry3 -e 'drop table knownGene'
hgsql kgHg17FTempTry3 < ~/src/hg/lib/knownGene.sql
hgsql kgHg17FTempTry3 -e 'load data local infile "kg3Try3.gp" into table knownGene'
hgsql hg17 -e 'drop table kg3Try3'
hgsql hg17 < ~/src/hg/lib/kg3Try3.sql
hgsql hg17 -e 'load data local infile "kg3Try3.gp" into table kg3Try3'

# Perform analysis before renaming the kg3Try3 table to knownGene.

# Load data into hg17 knownGene table.
hgsql hg17 -e 'drop table knownGene'
hgsql hg17 < ~/src/hg/lib/knownGene.sql
hgsql hg17 -e 'load data local infile "kg3Try3.gp" into table knownGene'

# Build knownGeneMrna and knownGenePep tables.
hgsql kgHg17FTempTry3 -e 'drop table mrnaSeq' hgsql kgHg17FTempTry3 < ~/src/hg/lib/mrnaSeq.sql hgsql kgHg17FTempTry3 -e 'load data local infile "../mrnaSeq.tab" into table mrnaSeq' kgPepMrna kgHg17FTempTry3 hg17 050415 hgsql hg17 -e 'drop table knownGeneMrna' hgsql hg17 < ~/src/hg/lib/knownGeneMrna.sql hgsql hg17 -e 'load data local infile "knownGeneMrna.tab" into table knownGeneMrna' hgsql hg17 -e 'drop table knownGenePep' hgsql hg17 < ~/src/hg/lib/knownGenePep.sql hgsql hg17 -e 'load data local infile "knownGenePep.tab" into table knownGenePep' # Build spMrna table hgsql hg17 -N -e 'select proteinID, name from knownGene' |sort -u >kgSpMrna.tab hgsql hg17 -e 'drop table spMrna' hgsql hg17 <~/src/hg/lib/spMrna.sql hgsql hg17 -e 'load data local infile "kgSpMrna.tab" into table spMrna' # Build kgXref table kgXref2 kgHg17FTempTry3 050415 hg17 hgsql hg17 -e 'drop table kgXref' hgsql hg17 < ~/src/hg/lib/kgXref.sql hgsql hg17 -e 'load data local infile "kgXref.tab" into table kgXref' # MAKE FOLDUTR TABLES # First set up directory structure and extract UTR sequence on hgwdev ssh hgwdev cd /cluster/data/hg17/bed mkdir rnaStruct.2005-06-05 rm rnaStruct ln -s rnaStruct.2005-06-05 rnaStruct cd rnaStruct mkdir -p utr3/split utr5/split utr3/fold utr5/fold utrFa hg17 knownGene utr3 utr3/utr.fa utrFa hg17 knownGene utr5 utr5/utr.fa # Split up files and make files that define job. ssh kk cd /cluster/data/hg17/bed/rnaStruct faSplit sequence utr3/utr.fa 50000 utr3/split/s faSplit sequence utr5/utr.fa 50000 utr5/split/s ls -1 utr3/split > utr3/in.lst ls -1 utr5/split > utr5/in.lst cd utr3 cat > gsub < kgAliasP.tab hgsql hg17 -N -e 'select name, name from knownGene' >kgAliasDup.tab hgsql hg17 -N -e 'select mrnaID, dupMrnaID from dupSpMrna' >>kgAliasDup.tab cat kgAliasM.tab kgAliasRefseq.tab kgAliasKgXref.tab kgAliasP.tab kgAliasDup.tab| \ sort |uniq > kgAlias.tab hgsql -e "drop table kgAlias;" hg17 hgsql hg17 < ~/kent/src/hg/lib/kgAlias.sql hgsql hg17 -e 'LOAD DATA local INFILE "kgAlias.tab" into table kgAlias' # kgProtAlias reads from hg17.knownGene.name, # hg17.knownGene.proteinID, hg17.knownGene.alignID, # proteins050415.spXref3.accession, proteins050415.spSecondaryID, proteins050415.pdbSP.pdb # to create kgProtAlias.tab # kgProtAlias hg17 050415 hgsql hg17 -N -e \ 'select kgID, spDisplayID, protAcc from kgXref where protAcc != ""'\ | sort -u >kgProtAliasNCBI.tab # include variant splice protein IDs hgsql hg17 -N -e \ 'select name, proteinID, parAcc from knownGene,sp050415.varAcc where varAcc=proteinID'\ |sort -u >kgProtAliasDup.tab # include duplicate protein IDs from dupSpMrna table hgsql hg17 -N -e \ 'select name, knownGene.proteinID, dupProteinID from knownGene, dupSpMrna where name=mrnaID'\ |sort -u >>kgProtAliasDup.tab # catch parent acc from dupProteinID too hgsql hg17 -N -e\ 'select name, knownGene.proteinID, parAcc from knownGene,dupSpMrna,sp050415.varAcc where name=mrnaID and dupProteinID=varAcc.varAcc'\ |sort -u >>kgProtAliasDup.tab cat kgProtAliasNCBI.tab kgProtAlias.tab kgProtAliasDup.tab | sort -u > kgProtAliasAll.tab echo "`date` creating table kgProtAlias" hgsql hg17 -e "drop table kgProtAlias;" hgsql hg17 <~/src/hg/lib/kgProtAlias.sql; hgsql hg17 -e 'LOAD DATA local INFILE "kgProtAliasAll.tab" into table kgProtAlias;' # Build kgSpAlias table hgsql hg17 -e \ 'select kgXref.kgID, spID, alias from kgXref, kgAlias where kgXref.kgID=kgAlias.kgID' >j.tmp hgsql hg17 -e \ 'select kgXref.kgID, spID, alias from kgXref, kgProtAlias where kgXref.kgID=kgProtAlias.kgID'\ >>j.tmp cat j.tmp|sort -u |grep 
-v 'kgID' >hg17.kgSpAlias.tab
rm j.tmp
hgsql hg17 -e 'drop table kgSpAlias';
hgsql hg17 < ~/src/hg/lib/kgSpAlias.sql
hgsql hg17 -e 'load data local infile "hg17.kgSpAlias.tab" into table kgSpAlias'

# Build KEGG pathway tables
ssh hgwdev
cd /cluster/store10/kg/kgHg17F
mkdir kegg
cd kegg
~/src/hg/protein/KGpath.sh kgHg17F hg17 050415
hgsql hg17 -e "drop table keggMapDesc"
hgsql hg17 -e "drop table keggPathway"
hgsql hg17 <~/src/hg/lib/keggMapDesc.sql
hgsql hg17 <~/src/hg/lib/keggPathway.sql
hgsql hg17 -e 'load data local infile "keggMapDesc.tab" into table keggMapDesc'
hgsql hg17 -e 'load data local infile "keggPathway.tab" into table keggPathway'

# Build CGAP pathway tables
cd ..
~/src/hg/protein/KGcgap.sh kgHg17F hg17 050415
hgsql hg17 -e "drop table cgapAlias"
hgsql hg17 -e "drop table cgapBiocDesc"
hgsql hg17 -e "drop table cgapBiocPathway"
hgsql hg17 <~/src/hg/lib/cgapAlias.sql
hgsql hg17 <~/src/hg/lib/cgapBiocDesc.sql
hgsql hg17 <~/src/hg/lib/cgapBiocPathway.sql
hgsql hg17 -e 'load data local infile "cgapAlias.tab" into table cgapAlias'
hgsql hg17 -e 'load data local infile "cgapBIOCARTAdescSorted.tab" into table cgapBiocDesc'
hgsql hg17 -e 'load data local infile "cgapBIOCARTA.tab" into table cgapBiocPathway'

# Build BioCyc pathway tables
# Download BioCyc DB, create and load bioCyc DB
# See makeBioCycDB.doc for details.
hgsql hg17 -e "drop table bioCycMapDesc"
hgsql hg17 <~/src/hg/lib/bioCycMapDesc.sql
hgsql hg17 -e 'load data local infile "bioCycMapDesc.tab" into table bioCycMapDesc'
kgBioCyc |sort -u > bioCycPathway.tab
hgsql hg17 -e "drop table bioCycPathway"
hgsql hg17 <~/src/hg/lib/bioCycPathway.sql
hgsql hg17 -e 'load data local infile "bioCycPathway.tab" into table bioCycPathway'

# CCDS <-> knownGene mapping needs to be updated (Fan redone 2005-06-05)
# this should be part of the known gene build
/cluster/data/genbank/bin/i386/mkCcdsGeneMap -db=hg17 -loadDb ccdsGene knownGene ccdsKgMap

### HG17 PROTEOME BROWSER TABLES RE-BUILD #### (DONE - 2005-06-05 - Fan)
# These are instructions for rebuilding tables
# needed for the Proteome Browser.
# DON'T START THESE UNTIL TABLES FOR KNOWN GENES AND kgProtMap table
# ARE REBUILT.
# This update is based on proteins DBs dated 050415.
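# Before starting, a quick sanity check that the known genes build left the
# prerequisite tables in place (expected counts vary by build; just make
# sure these are non-zero):
hgsql hg17 -N -e 'select count(*) from knownGene'
hgsql hg17 -N -e 'select count(*) from knownGenePep'
hgsql hg17 -N -e 'select count(*) from kgProtMap'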
# Create the working directory
ssh hgwdev
mkdir /cluster/store10/kg/kgHg17F/pb-2005-06-05
cd /cluster/data/hg17/bed
rm pb
ln -s /cluster/store10/kg/kgHg17F/pb-2005-06-05 pb
cd pb

# Move the existing PB tables by:
hgsql hg17
create database hg17Sav4;
alter table hg17.pepCCntDist rename as hg17Sav4.pepCCntDist;
alter table hg17.pepExonCntDist rename as hg17Sav4.pepExonCntDist;
alter table hg17.pepHydroDist rename as hg17Sav4.pepHydroDist;
alter table hg17.pepIPCntDist rename as hg17Sav4.pepIPCntDist;
alter table hg17.pepMolWtDist rename as hg17Sav4.pepMolWtDist;
alter table hg17.pepMwAa rename as hg17Sav4.pepMwAa;
alter table hg17.pepPi rename as hg17Sav4.pepPi;
alter table hg17.pepPiDist rename as hg17Sav4.pepPiDist;
alter table hg17.pepResDist rename as hg17Sav4.pepResDist;
alter table hg17.pbAnomLimit rename as hg17Sav4.pbAnomLimit;
alter table hg17.pbResAvgStd rename as hg17Sav4.pbResAvgStd;
alter table hg17.pbStamp rename as hg17Sav4.pbStamp;
quit

# Define pep* tables in hg17 DB
cat ~/kent/src/hg/lib/pep*.sql > pepAll.sql
# First edit out pepPred table definition, then
hgsql hg17 < pepAll.sql

# Build the pepMwAa table
hgsql proteins050415 -N -e \
"select info.acc, molWeight, aaSize from sp050415.info, sp050415.accToTaxon where accToTaxon.taxon=9606 and accToTaxon.acc = info.acc" > pepMwAa.tab
hgsql hg17 -e 'load data local infile "pepMwAa.tab" into table pepMwAa'

# Build the pepPi table
hgsql proteins050415 -e \
"select info.acc from sp050415.info, sp050415.accToTaxon where accToTaxon.taxon=9606 and accToTaxon.acc = info.acc" > protAcc.lis
hgsql hg17 -N -e 'select proteinID from knownGene where proteinID like "%-%"' | sort -u >> protAcc.lis
pbCalPi protAcc.lis sp050415 pepPi.tab
hgsql hg17 -e 'delete from pepPi'
hgsql hg17 -e 'load data local infile "pepPi.tab" into table hg17.pepPi'

# Calculate and load pep distributions
pbCalDist sp050415 proteins050415 9606 hg17 >pbCalDist.out
wc pbCalDist.out
hgsql hg17
load data local infile "pepExonCntDist.tab" into table hg17.pepExonCntDist;
load data local infile "pepCCntDist.tab" into table hg17.pepCCntDist;
load data local infile "pepHydroDist.tab" into table hg17.pepHydroDist;
load data local infile "pepMolWtDist.tab" into table hg17.pepMolWtDist;
load data local infile "pepResDist.tab" into table hg17.pepResDist;
load data local infile "pepIPCntDist.tab" into table hg17.pepIPCntDist;
load data local infile "pepPiDist.tab" into table hg17.pepPiDist;
quit

# Calculate frequency distributions
pbCalResStd sp050415 9606 hg17

# Create pbAnomLimit and pbResAvgStd tables
hgsql hg17 -e "drop table pbAnomLimit"
hgsql hg17 -e "drop table pbResAvgStd"
hgsql hg17 < ~/src/hg/lib/pbAnomLimit.sql
hgsql hg17 < ~/src/hg/lib/pbResAvgStd.sql
hgsql hg17 -e 'load data local infile "pbResAvgStd.tab" into table hg17.pbResAvgStd;'
hgsql hg17 -e 'load data local infile "pbAnomLimit.tab" into table hg17.pbAnomLimit;'

# Create pbStamp table for PB
hgsql hg17 -e "drop table pbStamp"
hgsql hg17 < ~/src/hg/lib/pbStamp.sql
hgsql hg17Sav4 -N -e 'select * from pbStamp' > pbStamp.tab
hgsql hg17 -e 'load data local infile "pbStamp.tab" into table hg17.pbStamp'

# Adjust drawing parameters for Proteome Browser stamps
# Now invoke Proteome Browser and adjust various drawing parameters (mostly
# the ymax of each stamp) if necessary, by updating the pbStamp.tab file and
# then deleting and reloading the pbStamp table.
# Perform preliminary review of Proteome Browser for hg17, then notify QA
# for formal review.
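# For the stamp adjustment step above, one way to redo the load after
# editing pbStamp.tab (this just re-runs the same load statement used above):
hgsql hg17 -e 'delete from pbStamp'
hgsql hg17 -e 'load data local infile "pbStamp.tab" into table hg17.pbStamp'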
# RE-BUILD GENE SORTER TABLES (AKA: FAMILY BROWSER) (DONE - 2005-06-04 - Fan) # This should be done after KG tables are complete from known genes build # process. # # Cluster together various alt-splicing isoforms. # Creates the knownIsoforms and knownCanonical tables ssh hgwdev mkdir /cluster/data/hg17/bed/geneSorter.2005-06-04 # remove old symbolic link rm /cluster/data/hg17/bed/geneSorter ln -s /cluster/data/hg17/bed/geneSorter.2005-06-04 /cluster/data/hg17/bed/geneSorter cd /cluster/data/hg17/bed/geneSorter hgClusterGenes hg17 knownGene knownIsoforms knownCanonical # Extract peptides from knownGenes into fasta file # and create a blast database out of them. mkdir /cluster/data/hg17/bed/geneSorter/blastp cd /cluster/data/hg17/bed/geneSorter/blastp pepPredToFa hg17 knownGenePep known.faa # You may need to build this binary in src/hg/near/pepPredToFa /scratch/blast/formatdb -i known.faa -t known -n known # This command is in /projects/compbio/bin/$MACH/formatdb # Copy over database to bluearc rm -fr /cluster/bluearc/hg17/blastp mkdir -p /cluster/bluearc/hg17/blastp cp -p /cluster/data/hg17/bed/geneSorter/blastp/known.* /cluster/bluearc/hg17/blastp # Split up fasta file into bite sized chunks for cluster cd /cluster/data/hg17/bed/geneSorter/blastp mkdir split faSplit sequence known.faa 8000 split/kg # Make parasol run directory ssh kk mkdir /cluster/data/hg17/bed/geneSorter/blastp/self cd /cluster/data/hg17/bed/geneSorter/blastp/self mkdir run cd run mkdir out # Make blast script cat << '_EOF_' > blastSome #!/bin/sh BLASTMAT=/cluster/bluearc/blast229/data /cluster/bluearc/blast229/blastall \ -p blastp -d /cluster/bluearc/hg17/blastp/known -i $1 -o $2 \ -e 0.01 -m 8 -b 1000 '_EOF_' # << keep emacs happy chmod +x blastSome # Make gensub2 file cat << '_EOF_' > gsub #LOOP blastSome {check in line+ $(path1)} {check out line out/$(root1).tab} #ENDLOOP '_EOF_' # << keep emacs happy # Create parasol batch # 'ls ../../split/*.fa' is too much, hence the echo echo ../../split/*.fa | wordLine stdin > split.lst gensub2 split.lst single gsub jobList para create jobList para try para push para check # Completed: 7735 of 7735 jobs # CPU time in finished jobs: 142764s 2379.39m 39.66h 1.65d 0.005 y # IO & Wait Time: 67623s 1127.06m 18.78h 0.78d 0.002 y # Average job time: 27s 0.45m 0.01h 0.00d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 144s 2.40m 0.04h 0.00d # Submission to last job: 392s 6.53m 0.11h 0.00d # Load into database. 
# This takes about 30 minutes
ssh hgwdev
cd /cluster/data/hg17/bed/geneSorter/blastp/self/run/out
time hgLoadBlastTab hg17 knownBlastTab *.tab
# Scanning through 7735 files
# Loading database with 9757382 rows
# 255.200u 50.520s 25:19.66 20.1% 0+0k 0+0io 247pf+0w

cd /cluster/data/hg17/bed/geneSorter
# Create table that maps between known genes and RefSeq
hgMapToGene hg17 refGene knownGene knownToRefSeq
# may need to build this command in src/hg/near/hgMapToGene
# hgsql -e "select count(*) from knownToRefSeq;" hg17
# row count changed to 34667

# Create table that maps between known genes and LocusLink
hgsql --skip-column-names -e "select mrnaAcc,locusLinkId from refLink" hg17 > refToLl.txt
hgMapToGene hg17 refGene knownGene knownToLocusLink -lookup=refToLl.txt
# hgsql -e "select count(*) from knownToLocusLink;" hg17
# row count changed to 34773

# Create table that maps between known genes and Pfam domains
hgMapViaSwissProt hg17 knownGene name proteinID Pfam knownToPfam
# hgsql -e "select count(*) from knownToPfam;" hg17
# row count changed to 29171

# Create table to map between known genes and GNF Atlas2
# expression data.
hgMapToGene hg17 gnfAtlas2 knownGene knownToGnfAtlas2 '-type=bed 12'
# hgsql -e "select count(*) from knownToGnfAtlas2;" hg17
# row count changed to 32458

# Create expression distance table - takes about an hour
hgExpDistance hg17 hgFixed.gnfHumanAtlas2MedianRatio \
hgFixed.gnfHumanAtlas2MedianExps gnfAtlas2Distance \
-lookup=knownToGnfAtlas2 &
# Have 44696 elements in hgFixed.gnfHumanAtlas2MedianRatio
# Got 32458 unique elements in hgFixed.gnfHumanAtlas2MedianRatio
# hgsql -e "select count(*) from gnfAtlas2Distance;" hg17
# row count changed to 32381000

# Create a table that maps between known genes and
# the nice affy expression data.
hgMapToGene "-type=bed 12" hg17 affyUclaNorm knownGene knownToU133
# hgsql -e "select count(*) from knownToU133;" hg17
# row count changed to 32965

# Create expression distance table. This will take about 2.5 hours
cd /tmp
cp -p ~/kent/src/hg/near/hgExpDistance/affyUcla.weight .
time hgExpDistance hg17 affyUclaNorm affyUclaExp knownExpDistance \
-weights=affyUcla.weight -lookup=knownToU133 &
# Have 43039 elements in affyUclaNorm
# 211 genes, 42 weights, 26.500000 total wieght
# Got 32965 unique elements in affyUclaNorm

# Create table that maps between known genes and
# the GNF data.
cd /tmp
hgMapToGene hg17 affyU95 knownGene knownToU95
# row count changed to 17555
# hgFixed.gnfHumanU95Exps argument is unused; it does not need to exist
hgExpDistance hg17 hgFixed.gnfHumanU95MedianRatio \
hgFixed.gnfHumanU95Exps gnfU95Distance -lookup=knownToU95 &
# Have 11545 elements in hgFixed.gnfHumanU95MedianRatio
# Got 16501 unique elements in hgFixed.gnfHumanU95MedianRatio
# row count changed to 16450000

# Create known gene mapping table and expression distance tables
# for GNF Atlas 2. (The hgExpDistance takes only 10 minutes.)
hgMapToGene hg17 affyGnf1h knownGene knownToGnf1h
hgExpDistance hg17 hgFixed.gnfHumanAtlas2MedianRatio \
hgFixed.gnfHumanAtlas2MedianExps gnfAtlas2Distance \
-lookup=knownToGnf1h &
# Have 44696 elements in hgFixed.gnfHumanAtlas2MedianRatio
# Got 8827 unique elements in hgFixed.gnfHumanAtlas2MedianRatio

cd /cluster/data/hg17/bed/geneSorter
hgMapToGene hg17 affyU133Plus2 knownGene knownToU133Plus2
# row count changed to 35139

#### UPDATE GO DATABASE (THIS PART WAS DONE 5/21/05 Fan)
# Download the terms and make the database.
ssh hgwdev mkdir /cluster/store1/geneOntology/20050521 cd /cluster/store1/geneOntology/20050521 wget --timestamping http://www.godatabase.org/dev/database/archive/latest/go_200504-assocdb-data.gz hgsql mysql <j.tmp hgsql go050521 blastSome < gsub <split.lst gensub2 split.lst single gsub spec para create spec para try, check, push, check, ... # Initial run has 13 jobs crashed. # crashed: 13 # ranOk: 7722 # para problems show the following typical message: # total jobs in batch: 7735 # job: blastSome ../../../geneSorter/blastp/split/kg5911.fa out/kg5911.tab # id: 209522384 # failure type: crash # host: kkr2u28.kilokluster.ucsc.edu # start time: Sat Jun 4 11:45:51 2005 # return: 0 # stderr: # [blastall] FATAL ERROR: blast: Unable to open input file ../../../geneSorter/blastp/split/kg5911.fa # para push again and these 13 ran fine. # Completed: 7735 of 7735 jobs # CPU time in finished jobs: 60319s 1005.32m 16.76h 0.70d 0.002 y # IO & Wait Time: 31239s 520.65m 8.68h 0.36d 0.001 y # Average job time: 12s 0.20m 0.00h 0.00d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 72s 1.20m 0.02h 0.00d # Submission to last job: 199s 3.32m 0.06h 0.00d # Load into database. ssh hgwdev cd /cluster/data/hg17/bed/blastp/ce2/run/out hgLoadBlastTab hg17 ceBlastTab -maxPer=1 *.tab # Scanning through 7735 files # Loading database with 25574 rows # Make mouse ortholog column using blastp on mouse known genes. # First make mouse protein database and copy it to /cluster/panasas # if it doesn't exist already # This already exists. See makeMm6.doc for procedure # Make parasol run directory ssh kk mkdir /cluster/data/hg17/bed/geneSorter/blastp/mm6 cd /cluster/data/hg17/bed/geneSorter/blastp/mm6 mkdir run cd run mkdir out # Make blast script cat << '_EOF_' > blastSome #!/bin/sh BLASTMAT=/cluster/bluearc/blast229/data /cluster/bluearc/blast229/blastall \ -p blastp -d /cluster/panasas/home/store/mm6/blastp/known \ -i $1 -o $2 -e 0.001 -m 8 -b 1 '_EOF_' # << keep emacs happy chmod a+x blastSome # Make gensub2 file cat << '_EOF_' > gsub #LOOP blastSome {check in line+ $(path1)} {check out line out/$(root1).tab} #ENDLOOP '_EOF_' # << keep emacs happy # Create parasol batch # this echo trick is used because otherwise the command line is # too long and you can not do a simple ls echo ../../split/*.fa | wordLine stdin > split.lst gensub2 split.lst single gsub jobList para create jobList para try para check para push ... etc ... # Completed: 7735 of 7735 jobs # CPU time in finished jobs: 85769s 1429.49m 23.82h 0.99d 0.003 y # IO & Wait Time: 20587s 343.11m 5.72h 0.24d 0.001 y # Average job time: 14s 0.23m 0.00h 0.00d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 78s 1.30m 0.02h 0.00d # Submission to last job: 206s 3.43m 0.06h 0.00d # Load into database. ssh hgwdev cd /cluster/data/hg17/bed/geneSorter/blastp/mm6/run/out hgLoadBlastTab hg17 mmBlastTab -maxPer=1 *.tab # Scanning through 7735 files # Loading database with 32951 rows # Make rat ortholog column using blastp on rat known genes. # First make rat protein database and copy it to cluster/bluearc # if it doesn't exist already # This already exists. See makeRn3.doc for procedure. 
# Files were put in this directory: /cluster/bluearc/rn3/blastp/ # Make parasol run directory ssh kk mkdir /cluster/data/hg17/bed/geneSorter/blastp/rn3 cd /cluster/data/hg17/bed/geneSorter/blastp/rn3 mkdir run cd run mkdir out # Make blast script cat << '_EOF_' > blastSome #!/bin/sh BLASTMAT=/cluster/bluearc/blast229/data /cluster/bluearc/blast229/blastall \ -p blastp -d /cluster/bluearc/rn3/blastp/known \ -i $1 -o $2 -e 0.001 -m 8 -b 1 '_EOF_' # << keep emacs happy chmod a+x blastSome # Make gensub2 file cat << '_EOF_' > gsub #LOOP blastSome {check in line+ $(path1)} {check out line out/$(root1).tab} #ENDLOOP '_EOF_' # << keep emacs happy # Create parasol batch # this echo trick is used because otherwise the command line is # too long and you can not do a simple ls echo ../../split/*.fa | wordLine stdin > split.lst gensub2 split.lst single gsub jobList para create jobList para try para check para push ... etc # Completed: 7735 of 7735 jobs # CPU time in finished jobs: 27804s 463.40m 7.72h 0.32d 0.001 y # IO & Wait Time: 30334s 505.56m 8.43h 0.35d 0.001 y # Average job time: 8s 0.13m 0.00h 0.00d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 26s 0.43m 0.01h 0.00d # Submission to last job: 119s 1.98m 0.03h 0.00d # Load into database. ssh hgwdev cd /cluster/data/hg17/bed/geneSorter/blastp/rn3/run/out hgLoadBlastTab hg17 rnBlastTab -maxPer=1 *.tab # Scanning through 7735 files # Loading database with 24030 rows # ZEBRAFISH BLASTP FOR GENE SORTER # Make Danio rerio (zebrafish) ortholog column using blastp on Ensembl. # First make protein database and copy it to iscratch/i # if it doesn't exist already: ssh kkstore mkdir /cluster/data/danRer2/bed/blastp cd /cluster/data/danRer2/bed/blastp wget ftp://ftp.ensembl.org/pub/current_zebrafish/data/fasta/pep/Danio_rerio.ZFISH4.may.pep.fa.gz zcat Dan*.pep.fa.gz > ensembl.faa /scratch/blast/formatdb -i ensembl.faa -t ensembl -n ensembl ssh kkr1u00 if (-e /iscratch/i/danRer2/blastp) then rm -r /iscratch/i/danRer2/blastp endif mkdir -p /iscratch/i/danRer2/blastp cp /cluster/data/danRer2/bed/blastp/ensembl.p?? /iscratch/i/danRer2/blastp iSync # Make parasol run directory ssh kk mkdir -p /cluster/data/hg17/bed/blastp/danRer2/run/out cd /cluster/data/hg17/bed/blastp/danRer2/run # Make blast script cat > blastSome < gsub < split.lst gensub2 split.lst single gsub spec para create spec para try, check, push, check, ... # Completed: 7735 of 7735 jobs # CPU time in finished jobs: 111467s 1857.78m 30.96h 1.29d 0.004 y # IO & Wait Time: 21159s 352.65m 5.88h 0.24d 0.001 y # Average job time: 17s 0.29m 0.00h 0.00d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 95s 1.58m 0.03h 0.00d # Submission to last job: 223s 3.72m 0.06h 0.00d # Load into database. ssh hgwdev cd /cluster/data/hg17/bed/blastp/danRer2/run/out hgLoadBlastTab hg17 drBlastTab -maxPer=1 *.tab # Scanning through 7735 files # Loading database with 30651 rows # Make Saccharomyces cerevisiae (yeast) ortholog column using blastp on RefSeq. 
# First make protein database and copy it to cluster/bluearc
# if it doesn't exist already
# This is already done, see makeMm3.doc for procedure
# the directory: /cluster/bluearc/sc1/blastp should have data

# Make parasol run directory
ssh kk
mkdir /cluster/data/hg17/bed/geneSorter/blastp/sc1
cd /cluster/data/hg17/bed/geneSorter/blastp/sc1
mkdir run
cd run
mkdir out

# Make blast script
cat << '_EOF_' > blastSome
#!/bin/sh
BLASTMAT=/cluster/bluearc/blast229/data /cluster/bluearc/blast229/blastall \
-p blastp -d /cluster/bluearc/sc1/blastp/sgd \
-i $1 -o $2 -e 0.01 -m 8 -b 1
'_EOF_'
# << keep emacs happy
chmod a+x blastSome

# Make gensub2 file
cat << '_EOF_' > gsub
#LOOP
blastSome {check in line+ $(path1)} {check out line out/$(root1).tab}
#ENDLOOP
'_EOF_'
# << keep emacs happy

# Create parasol batch
echo ../../split/*.fa | wordLine stdin > split.lst
gensub2 split.lst single gsub jobList
para create jobList
para try
para check
para push
... etc ...
# Completed: 7735 of 7735 jobs
# CPU time in finished jobs: 18194s 303.23m 5.05h 0.21d 0.001 y
# IO & Wait Time: 24452s 407.53m 6.79h 0.28d 0.001 y
# Average job time: 6s 0.09m 0.00h 0.00d
# Longest running job: 0s 0.00m 0.00h 0.00d
# Longest finished job: 16s 0.27m 0.00h 0.00d
# Submission to last job: 120s 2.00m 0.03h 0.00d

ssh hgwdev
cd /cluster/data/hg17/bed/geneSorter/blastp/sc1/run/out
hgLoadBlastTab hg17 scBlastTab -maxPer=1 *.tab
# Scanning through 7735 files
# Loading database with 16395 rows

# Make Drosophila melanogaster ortholog column using blastp on FlyBase.
# First make SwissProt protein database and copy it to cluster/bluearc
# The following section was already done.
# cd /cluster/data/dm1/bed
# mkdir blastp
# cd blastp
#wget ftp://ftp.fruitfly.org/pub/download/dmel_RELEASE3-1/FASTA/whole_genome_translation_dmel_RELEASE3-1.FASTA.gz
# zcat whole_ge*.gz | faFlyBaseToUcsc stdin flyBase.faa
# formatdb -i flyBase.faa -t flyBase -n flyBase
# if (-e /cluster/bluearc/dm1/blastp) then
# rm -r /cluster/bluearc/dm1/blastp
# endif
# mkdir -p /cluster/bluearc/dm1/blastp
# cp /cluster/data/dm1/bed/blastp/flyBase.p?? /cluster/bluearc/dm1/blastp

# Make parasol run directory
ssh kk
mkdir /cluster/data/hg17/bed/geneSorter/blastp/dm1
cd /cluster/data/hg17/bed/geneSorter/blastp/dm1
mkdir run
cd run
mkdir out

# Make blast script
cat << '_EOF_' > blastSome
#!/bin/sh
BLASTMAT=/cluster/bluearc/blast229/data /cluster/bluearc/blast229/blastall \
-p blastp -d /cluster/bluearc/dm1/blastp/flyBase \
-i $1 -o $2 -e 0.01 -m 8 -b 1
'_EOF_'
# << keep emacs happy
chmod a+x blastSome

# Make gensub2 file
cat << '_EOF_' > gsub
#LOOP
blastSome {check in line+ $(path1)} {check out line out/$(root1).tab}
#ENDLOOP
'_EOF_'
# << keep emacs happy

# Create parasol batch
echo ../../split/*.fa | wordLine stdin > split.lst
gensub2 split.lst single gsub jobList
para create jobList
para try
para check
para push
... etc ...
# Completed: 7735 of 7735 jobs
# CPU time in finished jobs: 72141s 1202.35m 20.04h 0.83d 0.002 y
# IO & Wait Time: 41717s 695.28m 11.59h 0.48d 0.001 y
# Average job time: 15s 0.25m 0.00h 0.00d
# Longest running job: 0s 0.00m 0.00h 0.00d
# Longest finished job: 57s 0.95m 0.02h 0.00d
# Submission to last job: 204s 3.40m 0.06h 0.00d

# Load into database.
ssh hgwdev
cd /cluster/data/hg17/bed/geneSorter/blastp/dm1/run/out
hgLoadBlastTab hg17 dmBlastTab -maxPer=1 *.tab
# Scanning through 7735 files
# Loading database with 27109 rows

# update knownToHInv table
# Verified that there is no new release of HInv data.
hgMapToGene hg17 HInvGeneMrna knownGene knownToHInv
# count changed to 28851

# The new KG process no longer needs entries in knownGeneLink (used to store
# info for DNA-based RefSeqs), so clean out the old data in knownGeneLink.
hgsql hg17 -e "delete from knownGeneLink"

#### RE-BUILD SUPERFAMILY RELATED TABLES (DONE - 2005-05-27 - Fan)
# Download latest Superfamily data files and build the Superfamily DB
# from supfam.mrc-lmb.cam.ac.uk
mkdir /cluster/store10/superfamily/050524
ln -s /cluster/store10/superfamily/050524 /cluster/data/superfamily/050524
cd /cluster/data/superfamily/050524
# ftp over the following two files:
#   ass_22-May-2005.tab.gz
#   supfam_22-May-2005.sql.gz
gzip -d *.gz

# Load the Superfamily database
hgsql hg17 -e "create database superfam050524"
nice hgsql superfam050524 < supfam_22-May-2005.sql &
# This may take about an hour.
# Make sure to add an index on id of the des table of superfam050524.
hgsql superfam050524 -e "create index id on des(id);"
hgsql superfam050524 < ~/src/hg/lib/sfAssign.sql
hgsql superfam050524 -e 'load data local infile "ass_22-May-2005.tab" into table superfam050524.sfAssign;'

# Build or rebuild Superfamily track and create sf tables needed for PB
hgsql hg17 < ~/src/hg/lib/sfAssign.sql
cd /cluster/data/superfamily/050524
hgsql hg17 -e 'load data local infile "ass_22-May-2005.tab" into table hg17.sfAssign;'

# If hg17.sfDes already exists, drop it.
hgsql superfam050524 -N -e "select * from des" >sfDes.tab
hgsql hg17 < ~/src/hg/lib/sfDes.sql
hgsql hg17 -e 'load data local infile "sfDes.tab" into table sfDes'

# If hg17.superfamily already exists, drop it.
cd /cluster/data/hg17/bed
mkdir /cluster/data/hg17/sf.2004-1128
ln -s sf.2004-1128 sf
hgSuperfam hg17 > sf.log
# It is normal that many proteins do not have corresponding Superfamily entries.

# If hg17.sfDescription exists, drop it.
hgsql hg17 < ~/src/hg/lib/sfDescription.sql
hgsql hg17 -e 'LOAD DATA local INFILE "sfDescription.tab" into table hg17.sfDescription;'

# Finally, load the superfamily table.
hgLoadBed hg17 superfamily superfamily.tab -tab

# Create knownToSuperfamily table
# Note hs is changed into ht for this Superfamily release.
cat /cluster/data/superfamily/050524/ass_22-May-2005.tab \
| hgKnownToSuper hg17 hs stdin
# created 32906 rows in knownToSuper

# Build tables needed by pbGlobal in proteins050415
cd /cluster/data/superfamily/050524
hgsql proteins050415 -e 'load data local infile "ass_22-May-2005.tab" into table sfAssign'
hgsql proteins050415 -e 'load data local infile "sfDes.tab" into table sfDes'
cd /cluster/store10/kg/kgHg17F
hgsql proteins050415 -e 'load data local infile "ensemblXref.tab" into table ensemblXref'
# These sf tables and ensemblXref3 are needed for non-HMR KG proteins.
# Should add content of ensemblXref3 of mm6 after it is done.
# And similarly for rn4 and possibly for other non-HMR species.
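# As a cross-check, the knownToSuper row count can be compared with the
# number reported above:
hgsql hg17 -N -e 'select count(*) from knownToSuper'
# should show 32906 for this run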
# CCDS <-> knownGene mapping need to be updated (markd 2005-05-29) # this should be part of the known gene build /cluster/data/genbank/bin/i386/mkCcdsGeneMap -db=hg17 -loadDb ccdsGene knownGene ccdsKgMap # Build targetScanS track - (DONE - 2005-06-22 Fan) # requested by: Dudekula, Dawood (NIH/NIA/IRP) DudekulaDB@grc.nia.nih.gov ssh hgwdev mkdir -p /cluster/data/hg17/bed/targetScanS cd /cluster/data/hg17/bed/targetScanS wget --timestamp http://genes.mit.edu/targetscan/tracks/targetscan.bed # Remove the first description line of targetscan.bed hgLoadBed -tab hg17 targetScanS targetscan.bed # Create/edit/check in targetScans.html and trackDb.ra under # kent/src/hg/makeDb/trackDb/human/hg17 # Update mrnaRefseq table (DONE - Fan 6/22/05) # The old table contains non-human mrna/RefSeqs. # The new table contains only human mrna/RefSeq and RefSeq/RefSeq. # First build entrez DB tables, see makeMm6.doc for details. hgsql entrez -N -e \ 'select mrna, refseq from entrezRefseq, entrezMrna, hg17.all_mrna where entrezRefseq.geneID=entrezMrna.geneID and mrna=all_mrna.qName' \ >mrnaRefseq1.tab # Include RefSeq as valid mRNA too. hgsql hg17 -N -e 'select name, name from refGene' >mrnaRefseq2.tab cat mrnaRefseq1.tab mrnaRefseq2.tab |sort -u >mrnaRefseq.tab hgsql hg17 -e 'drop table mrnaRefseq' hgsql hg17 < ~/src/hg/lib/mrnaRefseq.sql hgsql hg17 -e 'load data local infile "mrnaRefseq.tab" into table mrnaRefseq' # BUILD KNOWN GENE LIST FOR GOOGLE. DONE 6/27/05 Fan. # make knownGeneLists.html hg17GeneList.html mm5GeneList.html rm3GeneList.html cd /cluster/data/hg17/bed rm -rf knownGeneList/hg17 # Run hgKnownGeneList to generate the tree of HTML pages # under ./knownGeneList/hg17 hgKnownGeneList hg17 # copy over to /usr/local/apache/htdocs rm -rf /usr/local/apache/htdocs/knownGeneList/hg17 mkdir -p /usr/local/apache/htdocs/knownGeneList/hg17 cp -Rfp knownGeneList/hg17/* /usr/local/apache/htdocs/knownGeneList/hg17 #### Blat knownGene proteins to determine exons (DONE braney 06-30-05) ssh hgwdev cd /cluster/data/hg17/bed mkdir blat.hg17KG.2005-06-17 rm blat.hg17KG ln -s blat.hg17KG.2005-06-17 blat.hg17KG cd blat.hg17KG pepPredToFa hg17 knownGenePep known.fa hgPepPred hg17 generic blastKGPep02 known.fa grep ">" known.fa | sed "s/>//" > kgName.lst kgName hg17 kgName.lst blastKGRef02 hgsql hg17 < ~/kent/src/hg/lib/blastRef.sql echo "rename table blastRef to blastKGRef02" | hgsql hg17 echo "load data local infile 'blastKGRef02' into table blastKGRef02" | hgsql hg17 ssh kk cd /cluster/data/hg17/bed/blat.hg17KG cat << '_EOF_' > blatSome #!/bin/csh -fe /cluster/bin/i386/blat -t=dnax -q=prot -out=pslx $1 $2 $3 '_EOF_' # << keep emacs happy chmod +x blatSome ls -1S /scratch/hg/gs.18/build35/bothMaskedNibs/*.nib > human.lst mkdir kgfa cd kgfa faSplit sequence ../known.fa 3020 kg cd .. ls -1S kgfa/*.fa > kg.lst cat << '_EOF_' > blatGsub #LOOP blatSome $(path1) {check in line $(path2)} {check out line psl/$(root1)/$(root2).psl} #ENDLOOP '_EOF_' # << keep emacs happy gensub2 human.lst kg.lst blatGsub blatSpec mkdir psl cd psl foreach i (`cat ../human.lst`) mkdir `basename $i .nib` end cd .. 
para create blatSpec para push # Completed: 134320 of 134320 jobs # CPU time in finished jobs: 22196680s 369944.67m 6165.74h 256.91d 0.704 y # IO & Wait Time: 1712586s 28543.10m 475.72h 19.82d 0.054 y # Average job time: 178s 2.97m 0.05h 0.00d # Longest finished job: 7691s 128.18m 2.14h 0.09d # Submission to last job: 608750s 10145.83m 169.10h 7.05d # Completed: 133676 of 133676 jobs # CPU time in finished jobs: 29661130s 494352.16m 8239.20h 343.30d 0.941 y # IO & Wait Time: 2181179s 36352.99m 605.88h 25.25d 0.069 y # Average job time: 238s 3.97m 0.07h 0.02d # Longest job: 105972s 1766.20m 29.44h 1.23d ssh eieio cd /cluster/data/hg17/bed/blat.hg17KG pslSort dirs raw.psl /tmp psl/* pslReps -nohead -minCover=0.9 -minAli=0.9 raw.psl cooked.psl /dev/null pslUniq cooked.psl hg17KG.psl pslxToFa hg17KG.psl hg17KG_ex.fa -liftTarget=genome.lft -liftQuery=protein.lft ssh hgwdev cd /cluster/data/hg17/bed/blat.hg17KG kgName hg17 hg17KG.psl blastKGRef02 cut -f 10 hg17KG.psl > kgName.lst faSomeRecords known.fa kgName.lst hg17KG.fa hgPepPred hg17 generic blastKGPep02 hg17KG.fa #end blat proteins # MAKE Drosophila Proteins track (DONE 07-05-05 braney) ssh kk mkdir -p /cluster/data/hg17/bed/tblastn.dm2FB cd /cluster/data/hg17/bed/tblastn.dm2FB echo /panasas/store/hg17/blastDb/*.nsq | xargs ls -S | sed "s/\.nsq//" > target.lst mkdir fbfa # calculate a reasonable number of jobs calc `wc /cluster/data/dm2/bed/blat.dm2FB/dm2FB.psl|awk "{print \\\$1}"`/\(264630/`wc target.lst| awk "{print \\\$1}"`\) # 18929/(350000/5959) = 322.279746 split -l 322 /cluster/data/dm2/bed/blat.dm2FB/dm2FB.psl fbfa/fb cd fbfa for i in *; do pslxToFa $i $i.fa; rm $i; done cd .. ls -1S fbfa/*.fa > fb.lst mkdir -p /cluster/bluearc/hg17/bed/tblastn.dm2FB/blastOut ln -s /cluster/bluearc/hg17/bed/tblastn.dm2FB/blastOut for i in `cat fb.lst`; do mkdir blastOut/`basename $i .fa`; done tcsh cat << '_EOF_' > blastGsub #LOOP blastSome $(path1) {check in line $(path2)} {check out exists blastOut/$(root2)/q.$(root1).psl } #ENDLOOP '_EOF_' cat << '_EOF_' > blastSome #!/bin/sh BLASTMAT=/iscratch/i/blast/data export BLASTMAT g=`basename $2` f=/tmp/`basename $3`.$g for eVal in 0.01 0.001 0.0001 0.00001 0.000001 1E-09 1E-11 do if /scratch/blast/blastall -M BLOSUM80 -m 0 -F no -e $eVal -p tblastn -d $1 -i $2 -o $f.8 then mv $f.8 $f.1 break; fi done if test -f $f.1 then if /cluster/bin/i386/blastToPsl $f.1 $f.2 then liftUp -nosort -type=".psl" -nohead $f.3 /cluster/data/hg17/jkStuff/subLiftAll.lft warn $f.2 liftUp -nosort -type=".psl" -nohead $f.4 /cluster/data/hg17/jkStuff/liftAll.lft warn $f.3 liftUp -isPtoG -nosort -type=".psl" -pslQ -nohead $3.tmp /cluster/data/dm2/bed/blat.dm2FB/protein.lft warn $f.4 mv $3.tmp $3 rm -f $f.1 $f.2 $f.3 exit 0 fi fi rm -f $f.1 $f.2 $3.tmp $f.3 $f.8 exit 1 '_EOF_' chmod +x blastSome gensub2 target.lst fb.lst blastGsub blastSpec ssh kk cd /cluster/data/hg17/bed/tblastn.dm2FB para create blastSpec para push # Completed: 351581 of 351581 jobs # CPU time in finished jobs: 30733031s 512217.19m 8536.95h 355.71d 0.975 y # IO & Wait Time: 1035790s 17263.16m 287.72h 11.99d 0.033 y # Average job time: 90s 1.51m 0.03h 0.00d # Longest finished job: 816s 13.60m 0.23h 0.01d # Submission to last job: 135367s 2256.12m 37.60h 1.57d ssh kki cd /cluster/data/hg17/bed/tblastn.dm2FB tcsh cat << '_EOF_' > chainGsub #LOOP chainSome $(path1) $(path2) #ENDLOOP '_EOF_' cat << '_EOF_' > chainSome (cd $1; cat $2.psl | simpleChain -prot -outPsl -maxGap=150000 stdin ../c.`basename $1`.psl) '_EOF_' chmod +x chainSome ls -1dS `pwd`/blastOut/fb?? 
> chain.lst
gensub2 chain.lst single chainGsub chainSpec
para create chainSpec
para push
# Completed: 2714 of 2714 jobs
# CPU time in finished jobs: 222508s 3708.46m 61.81h 2.58d 0.007 y
# IO & Wait Time: 10577s 176.29m 2.94h 0.12d 0.000 y
# Average job time: 86s 1.43m 0.02h 0.00d
# Longest finished job: 9787s 163.12m 2.72h 0.11d

cd /cluster/data/hg17/bed/tblastn.dm2FB/blastOut
for i in fb??
do
awk "(\$13 - \$12)/\$11 > 0.6 {print}" c.$i.*.psl > c60.$i.psl
sort -rn c60.$i.psl | pslUniq stdin u.$i.psl
awk "((\$1 / \$11) ) > 0.60 { print }" c60.$i.psl > m60.$i.psl
echo $i
done
sort -u -T /tmp -k 14,14 -k 16,16n -k 17,17n u.*.psl m60* > /cluster/data/hg17/bed/tblastn.dm2FB/blastDm2FB.psl
cd ..
ssh hgwdev
cd /cluster/data/hg17/bed/tblastn.dm2FB
hgLoadPsl hg17 blastDm2FB.psl
exit
# back to kksilo
rm -rf blastOut
# End tblastn

# Build kgReactome table for KG to Reactome xref. Done 6/28/05 Fan.
ssh hgwdev
mkdir -p /cluster/store10/reactome/reactome050613
rm /cluster/data/reactome
ln -s /cluster/store10/reactome/reactome050613 /cluster/data/reactome
cd /cluster/data/reactome
wget --timestamp http://www.reactome.org/download/current/sql.gz
hgsql hg17 -e 'drop database reactome'
hgsql hg17 -e 'create database reactome'
zcat sql.gz| hgsql reactome
hgsql reactome -N -e 'select kgId, spID, DB_ID from ReferenceEntity, hg17.kgXref where identifier=spID' >kgReactome.tab;
hgsql hg17 -e 'drop table kgReactome'
hgsql hg17 < ~/src/hg/lib/kgReactome.sql
hgsql hg17 -e 'load data local infile "kgReactome.tab" into table kgReactome'

# UPDATE WGRNA TRACK (DONE, 2005-07-05, Fan)
ssh hgwdev
cd /cluster/data/hg17/bed
mv wgRna wgRna-2005-06-16
mkdir wgRna-2005-07-05
cd wgRna-2005-07-05
# Received the data file, wgtrack_july2005.txt, from Michel Weber's email
# (Michel.Weber@ibcg.biotoul.fr) and placed it under
# /cluster/data/hg17/bed/wgRna-2005-07-05.
cat wgtrack_july2005.txt|sed -e 's/ /\t/g' >wgRna.tab
# edit wgRna.tab to take out the first 5 lines of data field labels.
hgLoadBed -sqlTable=/cluster/home/fanhsu/src/hg/lib/wgRna.sql hg17 wgRna wgRna.tab

# REBUILT knownToPfam TABLE TO ALLOW KG REPRESENTED BY VARIANT SPLICE
# PROTEINS MAPPED TO PFAM (DONE 7/14/05, Fan)
# hgMapViaSwissProt.c was updated to support this.
# Create table that maps between known genes and Pfam domains
hgMapViaSwissProt hg17 knownGene name proteinID Pfam knownToPfam

## EVOFOLD (DONE, 2005-07-15, Jakob (jsp) )
# EvoFold is a new comparative method for predicting functional RNA
# secondary structures based on multiple sequence alignments. The
# predictions generated for the EvoFold track are based on the most
# conserved elements of the 8-way alignment (multiz8way). The current
# data is the result of a pilot study (ongoing research of mine), the
# procedure used to generate the data will therefore be simplified
# when forthcoming evofold tracks for other organisms are made. The
# documentation therefore skips the actual data generation, and
# instead starts with a data file I provide.
ssh -C hgwdev
mkdir -p /cluster/data/hg17/bed/evofold
cd /cluster/data/hg17/bed/evofold
cp /cluster/home/jsp/data/rnass/genome-scan/vertebrate/folds_hg17.bed foldsHg17.bed
# The folds_hg17.bed is a 9-column bed file: columns 1-6 provide
# standard bed information, column 7 is element length, column 8 is
# the RNA secondary structure in parentheses format, and column nine
# is a comma-separated list of position-specific confidence scores
# (floats).
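# For illustration only, a made-up line in this format (not actual track
# data): a 12-base element, so the structure string has 12 characters and
# the score list has 12 values:
# chr1	1000	1012	fold.1	900	+	12	((((....))))	0.9,0.9,0.8,0.7,0.9,0.5,0.5,0.9,0.7,0.8,0.9,0.9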
hgLoadBed -notItemRgb -sqlTable=/cluster/home/jsp/prog/kent/src/hg/lib/evofold.sql hg17 evofold foldsHg17.bed ########################################################################## # TRANSFRAG PHASE 2 TABLES - lifted from hg15 (Jakob Skou Pedersen) # Done: July 21, 2005 # # These tables were lifted for use in my own research, but may be used # for the 'Affymetrix Transcriptome Project Phase 2' tracks. ssh -C hgwdev mkdir -p /cluster/data/hg17/bed/transfrag cd /cluster/data/hg17/bed/transfrag # lifting transfrag tables from hg15 via hg16 to hg17 for name in A375CytosolicPolyAPlusTnFg FHs738LuCytosolicPolyAPlusTnFg HepG2CytosolicPolyAMinusTnFg HepG2CytosolicPolyAPlusTnFg HepG2NuclearPolyAMinusTnFg HepG2NuclearPolyAPlusTnFg JurkatCytosolicPolyAPlusTnFg NCCITCytosolicPolyAPlusTnFg PC3CytosolicPolyAPlusTnFg SKNASCytosolicPolyAPlusTnFg U87CytosolicPolyAPlusTnFg; do echo "select chrom, chromStart, chromEnd, name from ${name};" | hgsql hg15 | sed -e 1d > ${name}Hg15.bed liftOver ${name}Hg15.bed /cluster/data/hg15/bed/liftOver/hg15ToHg16.over.chain ${name}Hg16.bed unmappedHg16.bed liftOver ${name}Hg16.bed /cluster/data/hg16/bed/liftOver/hg16ToHg17.over.chain ${name}Hg17.bed unmappedHg17.bed echo "hg16 unmapped count for ${name}: " `grep "#" unmappedHg16.bed | wc -l | awk '{print $1}'` echo "hg17 unmapped count for ${name}: " `grep "#" unmappedHg17.bed | wc -l | awk '{print $1}'` hgLoadBed hg17 ${name} ${name}Hg17.bed # clean up rm ${name}Hg15.bed ${name}Hg16.bed unmappedHg16.bed unmappedHg17.bed done # GLADSTONE ARRAY TRACK (DONE 7/19/2005 Andy) ssh hgwdev cd /cluster/data/hg17/bed mkdir gladHumES cd gladHumES/ cp /cluster/data/hg16/bed/geneAtlas2/geneAtlas2.bed . cut -f1-12 geneAtlas2.bed > bed.hg16 liftOver bed.hg16 /gbdb/hg16/liftOver/hg16ToHg17.over.chain bed.hg17 /dev/null hgMapMicroarray bed.hg17.data hgFixed.gladHumESRatio \ -bedIn bed.hg17 #Loaded 11087 rows of expression data from hgFixed.gladHumESRatio #Mapped 10925, multiply-mapped 382, missed 23266, unmapped 162 hgLoadBed hg17 gladHumES bed.hg17.data # PHASTODDS GENESORTER COLUMN (DONE 7/28/2005 Andy) ssh kolossus cd /panasas/store/andy mkdir phastOdds cd phastOdds/ export PATH=${PATH}:/cluster/bin/phast/x86_64 mkdir sso beds cat > runChrom.sh << "_EOF_" #!/bin/bash c=$1 numDir=`echo ${c#chr} | sed 's/_random//'` ALNDIR=/cluster/data/hg17/bed/multiz10way echo msa_view $c /cluster/bin/phast/x86_64/msa_view --in-format MAF ${ALNDIR}/maf/${c}.maf --refseq /cluster/data/hg17/${numDir}/${c}.fa > /tmp/${c}.sso echo phastCons $c /cluster/bin/phast/x86_64/phastOdds -f ${ALNDIR}/cons/run.elements/ave.cons.mod -b ${ALNDIR}/cons/run.elements/ave.noncons.mod -g kg/${c}.bed /tmp/${c}.sso > /tmp/${c}.phastOdds.gtf cp /tmp/${c}.sso sso/ rm /tmp/${c}.sso cp /tmp/${c}.phastOdds.gtf gtfs/ rm /tmp/${c}.phastOdds.gtf echo $c done _EOF_ ssh hgwdev cd /panasas/store/andy/phastOdds genePredToGtf hg17 knownGene kg.gtf exit for c in `cut -f1 kg.gtf | sort | uniq`; do grep "\b${c}\b" kg.gtf > kg/${c}.gtf; done for f in kg/*.bed; do c=`basename $f .bed`; echo $c; ./runChrom.sh $c; addPhastOddsExons $f gtfs/$c.phastOdds.gtf beds/$c.bed done cat beds/* | sort -k4,4 -k1,1 -k2,2n -k3,3n > phastOdds.kg.bed cat > phastOdds.sql << "EOF" CREATE TABLE phastOdds ( bin smallint not null, # Speedup. 
chrom varchar(255) not null, # Human chromosome or FPC contig chromStart int unsigned not null, # Start position in chromosome chromEnd int unsigned not null, # End position in chromosome name varchar(255) not null, # Name of item #Indices score float not null, # phastOdds score. index(chrom(8),bin), index(name(10)) ); EOF # << hgLoadBed -sqlTable=phastOdds.sql hg17 phastOdds phastOdds.kg.bed # Actually I probably don't need that hg17 table. echo create table phastOdds select name, score from hg17.phastOdds | hgsql hgFixed echo create index nameIndex on phastOdds (name(10)) | hgsql hgFixed ########################################################################## # Illumina SNPs (Heather, July 2005) # Source: Jeff Ohmen, PhD, johmen@illumina.com, 858/232-2702 # using /cluster/store11 because /cluster/data/hg17 is on store5, # which is currently being restored cd /cluster/store11/heather/illumina fix.pl < LinkageIVbSNP.txt > illumina.bed hgLoadBed hg17 snpIllumina -tab -strict -sqlTable=snpIllumina.sql illumina.bed # Reading illumina.bed # Loaded 6008 elements of size 4 # Sorted # Saving bed.tab # Loading hg17 # note: 28 rows where chrom = "chrXY" # reload rankProp and psiBlast gene sorter tables to link with new # known genes (markd 2005-07-15) (spLoadRankProp -noMapFile=max1k.nomap hg17 rankProp -- /cluster/bluearc/markd/rankprop/results/hs.sw+tr/max1k.rankp.gz) >&max1k.hg17.out (spLoadPsiBlast hg17 spPsiBlast /cluster/bluearc/markd/rankprop/results/hs.sw+tr.eval.gz) >&pslBlast.hg17.out # BLASTZ/CHAIN/NET CANFAM2 (DONE 8/2/05 angie - REDONE 12/12/05 angie - REDONE 2/6/06 angie) # Unfortunately, this was done with a corrupted # /san/sanvol1/scratch/hg17/nib/chr5.nib the first time around; # also, a linSpecRep bug in blastz-run-ucsc has been fixed since then. # Doh, then Kate pointed out that linSpecReps were not snipped properly -- # I had omitted the BLASTZ_ABRIDGE_REPEATS line from the DEF!!! # Added an error message to doBlastzChainNet.pl to catch that next time. # Therefore I'm moving aside the previous run: mv /usr/local/apache/htdocs/goldenPath/hg17/vsCanFam2{,.bak} # And rerunning... ssh kkstore02 mkdir /cluster/data/hg17/bed/blastz.canFam2.2006-02-06 cd /cluster/data/hg17/bed/blastz.canFam2.2006-02-06 cat << '_EOF_' > DEF # human vs. dog BLASTZ_ABRIDGE_REPEATS=1 # TARGET: Human SEQ1_DIR=/san/sanvol1/scratch/hg17/nib SEQ1_SMSK=/san/sanvol1/scratch/hg17/linSpecRep.notInDog SEQ1_LEN=/cluster/data/hg17/chrom.sizes SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY: Dog SEQ2_DIR=/san/sanvol1/scratch/canFam2/nib SEQ2_SMSK=/san/sanvol1/scratch/canFam2/linSpecRep.notInHuman SEQ2_LEN=/cluster/data/canFam2/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 BASE=/cluster/data/hg17/bed/blastz.canFam2.2006-02-06 '_EOF_' # << for emacs doBlastzChainNet.pl DEF -bigClusterHub pk -smallClusterHub pk \ -workhorse pk \ -blastzOutRoot /san/sanvol1/scratch/blastzHg17CanFam2Out >& do.log & tail -f do.log rm -f /cluster/data/hg17/bed/blastz.canFam2 ln -s blastz.canFam2.2006-02-06 /cluster/data/hg17/bed/blastz.canFam2 # RE-RUN NETTOAXT, AXTTOMAF FOR CANFAM2 (DONE 10/26/05 angie) # Kate fixed netToAxt to avoid duplicated blocks, which is important # for input to multiz. Regenerate maf using commands from sub-script # netChains.csh generated by doBlastzChainNet.pl above. # Obsoleted by re-run of hg17-canFam2 above 12/12/05 angie... ssh kolossus cd /cluster/data/hg17/bed/blastz.canFam2.2005-08-01/axtChain netSplit hg17.canFam2.net.gz net chainSplit chain hg17.canFam2.all.chain.gz cd .. 
mv axtNet axtNet.orig mkdir axtNet foreach f (axtChain/net/*.net) netToAxt $f axtChain/chain/$f:t:r.chain \ /panasas/store/hg17/nib /iscratch/i/canFam2/nib stdout \ | axtSort stdin stdout \ | gzip -c > axtNet/$f:t:r.hg17.canFam2.net.axt.gz end rm -r mafNet mkdir mafNet foreach f (axtNet/*.hg17.canFam2.net.axt.gz) axtToMaf -tPrefix=hg17. -qPrefix=canFam2. $f \ /cluster/data/hg17/chrom.sizes /cluster/data/canFam2/chrom.sizes \ stdout \ | gzip -c > mafNet/$f:t:r:r:r:r:r.maf.gz end rm -r axtChain/{chain,net}/ axtNet.orig ############ # Sangamo/EIO DNaseI Hypersensitive Sites (2005-08-15 kate) # (Sangamo Biosciences and European Inst. Oncology) # Contact: Fyodor Umov (fumov@sangamo.com) cd /cluster/data/hg17/bed mkdir sangamo cd sangamo grep chr 3314_hs_sites_browser.bed | grep -v browser | \ hgLoadBed -noBin hg17 sangamoDnaseHs stdin # Loaded 3314 elements of size 6 checkTableCoords -table=sangamoDnaseHs hg17 # use "antiword" to create plain text from .doc description file # UPDATE WGRNA TRACK (DONE, 2005-08-24, Fan) ssh hgwdev cd /cluster/data/hg17/bed mkdir wgRna-2005-08-24 cd wgRna-2005-08-24 # Received the data file, wgtrack_aug2005.txt, from Michel Weber's email # (Michel.Weber@ibcg.biotoul.fr) # and place it under cd /cluster/data/hg17/bed/wgRna-2005-08-24. cut -f 2-10 wgtrack_aug2005.txt >wgRna.tab vi wgRna.tab # edit wgRna.tab to take out the first line of data field labels. hgLoadBed -sqlTable=/cluster/home/fanhsu/src/hg/lib/wgRna.sql hg17 wgRna wgRna.tab # Compared to 7/5 data, one record updated, one record dropped, one record added, out of 741 records. # Generate snpMask files (Done Heather Sept. 1, 2005) # Takes about 10-15 minutes # Consumes about 1 gig of disk ssh hgwdev cd /usr/local/apache/htdocs/goldenPath/hg17 mkdir snpMask cd snpMask foreach chrom ( `cat /cluster/data/hg17/chrom.lst` ) snpMaskChrom hg17 ${chrom} /gbdb/hg17/nib/${chrom}.nib ${chrom}.ambigs.fa gzip ${chrom}.ambigs.fa end ############################################################################# # BLASTZ Mm7 (WORKING - 2005-09-06 - Hiram) # Experiment, try the alignments without the linage specific # repeats ssh pk mkdir /cluster/data/hg17/bed/blastzMm7.2005-09-06 cd /cluster/data/hg17/bed ln -s blastzMm7.2005-09-06 blastz.mm7 cd blastzMm7.2005-09-06 cat << '_EOF_' > DEF # human vs mouse export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/cluster/home/angie/schwartzbin BLASTZ=blastz.v7.x86_64 BLASTZ_H=2000 BLASTZ_M=50 BLASTZ_ABRIDGE_REPEATS=0 # TARGET: Human Hg17 SEQ1_DIR=/cluster/bluearc/hg17/bothMaskedNibs SEQ1_LEN=/cluster/bluearc/hg17/chrom.sizes SEQ1_CTGDIR=/cluster/bluearc/hg17/bothMaskedNibs SEQ1_CTGLEN=/cluster/bluearc/hg17/chrom.sizes SEQ1_IN_CONTIGS=0 SEQ1_CHUNK=500000 SEQ1_LAP=50 # QUERY: Mouse Mm7 SEQ2_DIR=/cluster/bluearc/mm7/mm7.2bit SEQ2_LEN=/cluster/bluearc/mm7/chrom.sizes SEQ2_CTGDIR=/cluster/bluearc/mm7/mm7Chroms_RandomContigs.2bit SEQ2_CTGLEN=/cluster/bluearc/mm7/mm7Chroms_RandomContigs.sizes SEQ2_LIFT=/cluster/bluearc/mm7/Chroms_RandomContigs.lft SEQ2_IN_CONTIGS=0 SEQ2_CHUNK=3000000000 SEQ2_LAP=0 BASE=/cluster/data/hg17/bed/blastzMm7.2005-09-06 TMPDIR=/scratch/tmp '_EOF_' # happy emacs cp -p /cluster/data/hg17/chrom.sizes ./S1.len twoBitInfo /cluster/bluearc/mm7/mm7Chroms_RandomContigs.2bit S2.len # establish a screen to control this job screen time ./doBlastzChainNet.pl -stop chainMerge \ -bigClusterHub=pk \ `pwd`/DEF > toChainMerge.run.out 2>&1 & # STARTED - 2005-09-06 - 11:00 # detach from screen session: Ctrl-a Ctrl-d # to reattach to this screen session: 
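#	screen -d -r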
time $HOME/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk \ -continue=cat -stop=cat \ `pwd`/DEF > catStep.out 2>&1 & time $HOME/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk \ -continue=chainRun \ `pwd`/DEF > continueChainRun.out 2>&1 & time $HOME/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk \ -continue=chainMerge -stop=chainMerge \ `pwd`/DEF > chainMerge.out 2>&1 & time $HOME/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk \ -continue=net -stop=net \ `pwd`/DEF > net.out 2>&1 & time $HOME/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk \ -continue=load -stop=load \ `pwd`/DEF > load.out 2>&1 & time $HOME/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk \ -swap -stop=load \ `pwd`/DEF > swap.out 2>&1 & # Create plain pslChrom files to load a simple blastz track ssh kkstore02 cd /cluster/data/hg17/bed/blastzMm7.2005-09-06 mkdir -p pslChrom (cd pslParts; ls | awk -F"." '{print $1}' | sort -u) | while read C do echo -n "working ${C} ... " zcat pslParts/${C}.nib*.gz | gzip -c > pslChrom/${C}.psl.gz echo "done" done # Load those alignments ssh hgwdev cd /cluster/data/hg17/bed/blastzMm7.2005-09-06 ls pslChrom | sed -e "s/.psl.gz//" | while read T do echo "hgLoadPsl -fastLoad -noTNameIx hg17 -table=${T}_blastzMm7 pslChrom/${T}.psl.gz" hgLoadPsl -fastLoad -noTNameIx hg17 -table=${T}_blastzMm7 pslChrom/${T}.psl.gz done # After this same alignment was done with Hg17 query and Mm7 # target, came back to these swapped results in mm7 and manually loaded # the swapped tables as: chainMm7LSR, chainMm7LSRLink and # netMm7LSR # 41,223,632 total rows in the chainMm7Link split tables # 58,458,613 total rows in the chainMm7LSRLink table time featureBits hg17 chainMm7LSRLink # 959444893 bases of 2866216770 (33.474%) in intersection # real 36m30.822s # user 14m19.620s # sys 5m13.910s time featureBits hg17 chainMm7Link # 955168137 bases of 2866216770 (33.325%) in intersection # real 16m13.902s # user 10m20.780s # sys 3m42.810s # And, their intersection: ssh kolossus time HGDB_CONF=~/.hg.conf.read-only featureBits hg17 \ chainMm7LSRLink chainMm7Link # 952667399 bases of 2866216770 (33.238%) in intersection # real 38m53.448s # user 8m38.853s # sys 2m23.362s # LOAD ACEMBLY TRACK (DONE, 2005-09-12, Fan) mv /cluster/data/hg17/bed/acembly /cluster/data/hg17/bed/acembly_050217 mkdir -p /cluster/data/hg17/bed/acembly cd /cluster/data/hg17/bed/acembly # Data is obtained from Jean Thierry-Mieg mieg@ncbi.nlm.nih.gov wget --timestamping ftp://ftp.ncbi.nih.gov/repository/acedb/ncbi_35g.human.genes/acembly.ncbi_35g.genes.proteins.fasta.tar.gz wget --timestamping ftp://ftp.ncbi.nih.gov/repository/acedb/ncbi_35g.human.genes/acembly.ncbi_35g.genes.gff.tar.gz wget --timestamping ftp://ftp.ncbi.nih.gov/repository/acedb/ncbi_35g.human.genes/acembly.ncbi_35g.mrnas.fasta.tar.gz wget --timestamping ftp://ftp.ncbi.nih.gov/repository/acedb/ncbi_35g.human.genes/acembly.ncbi_35g.genes.pfamhits.tar.gz tar xvzf acembly.ncbi_35g.genes.gff.tar.gz tar xvzf acembly.ncbi_35g.genes.proteins.fasta.tar.gz cd acembly.ncbi_35.genes.gff # the acembly dataset for hg16 had problems with reverse blocks so # check for these cat << '_EOF_' > checkReversedBlocks for i in x1*.gff do echo -n "$i working ..." 
awk -F"\t" '
{
if ($4 > $5) {
printf "reverse blocks problem for $1"
printf "\n"
}
}
' $i > $i.fixed
echo " done"
done
'_EOF_'
# << this line makes emacs coloring happy
chmod +x checkReversedBlocks
./checkReversedBlocks
ls -l *.fixed
# all *.fixed files are empty so remove - there is no reversing of blocks
rm *.fixed

foreach f (x1.acemblygenes.*.gff)
set c=$f:r:e
egrep '^[a-zA-Z0-9]+\|NT_[0-9][0-9][0-9][0-9][0-9][0-9]' $f | \
perl -wpe 's/^(\w+)\|(\w+)/$1\/$2/' > ctg-chr${c}_random.gff
if (-e ../../../$c/lift/random.lft) then
liftUp chr${c}_random.gff ../../../$c/lift/random.lft warn \
ctg-chr${c}_random.gff
endif
grep -v ^$c\| $f | grep -v ^Hs | perl -wpe 's/^/chr/;' | \
grep -v "^chr//" > chr$c.gff
echo "done $c"
end

#- Load into database - use extended genePred
ssh hgwdev
cd /cluster/data/hg17/bed/acembly
# Reloaded without -genePredExt 1/6/05:
ldHgGene -gtf hg17 acembly acembly.ncbi_35.genes.gff/chr*.gff
# for entry with 28212470 from chr6.gff, change to chr6
# and for 29124352 in chr6.gff, change to chr6 (1/13/05)
echo 'update acembly set chrom = "chr6" where chrom = "chr28212470";' \
| hgsql hg17
echo 'update acembly set chrom = "chr6" where chrom = "chr29124352";' \
| hgsql hg17
# checkTableCoords and runGeneCheck to check data
checkTableCoords hg17 acembly
hgPepPred hg17 generic acemblyPep \
acembly.ncbi_35.genes.proteins.fasta/*.fasta

# create table of Acembly gene classifications
cd /cluster/data/hg17/bed/acembly/acembly.ncbi_35.genes.gff
rm acemblyClass.tab
foreach f (x1.acemblygenes.*.gff)
cut -f 9 $f |sed -e 's/;/\t/g' |sed -e 's/transcript_id //' >j.tmp
cut -f 2 j.tmp >j2.tmp
cut -f 3 j.tmp >j3.tmp
paste j3.tmp j2.tmp|sed -e 's/Main_gene/main/g' |sed -e 's/Putative_gene/putative/g' |sed -e 's/ //g' >>acemblyClass.tab
end
rm *.tmp
hgsql hg17 -e 'drop table acemblyClass'
hgsql hg17 < ~/src/hg/lib/acemblyClass.sql
hgsql hg17 -e 'load data local infile "acemblyClass.tab" into table acemblyClass'
hgsql hg17 -e 'delete from acemblyClass where class!="main" and class!="putative"'

# build acemblyPep table
hgPepPred hg17 generic acemblyPep \
acembly.ncbi_35.genes.proteins.fasta/*.fasta
# Please note, per email from Jean Thierry-Mieg on 9/9/05,
# there are AceView genes (~10,000) without corresponding
# protein sequences. They will fix it next time.

###########################################################################
# LOADING AFFYTXNPHASE2 TRACK (sugnet)
# cd to where data is downloaded.
cd /cluster/store10/sugnet/affyTranscription/graphs/transcriptome.affymetrix.com/download/publication/polyA_minus/graphs
# lift data from hg16 to hg17. This takes a long time.
./liftWigFilesHg16ToHg17.sh
# make the .wib and .wig files. This takes a long time.
./makeWibWigHg17.sh
# Copy .wib files to /cluster/data/hg17
mkdir /cluster/data/hg17/bed/affyTxnPhase2/wigData/
cp `find ./ -name "*.hg17.wib"` /cluster/data/hg17/bed/affyTxnPhase2/wigData/
chmod 775 /cluster/data/hg17/bed/affyTxnPhase2/wigData/
chmod 664 /cluster/data/hg17/bed/affyTxnPhase2/wigData/*
# Make gbdb entry
mkdir /gbdb/hg17/wib/affyTxnPhase2
chmod 775 /gbdb/hg17/wib/affyTxnPhase2
cd /gbdb/hg17/wib/affyTxnPhase2
ln -s /cluster/data/hg17/bed/affyTxnPhase2/wigData/* .
cd - # Load the database tables (using bash) this takes a while for file in `find ./ -name "*hg17.wig"`; do base=`basename $file .hg17.wig` echo "Doing ${base}Txn" hgLoadWiggle -pathPrefix=/gbdb/hg17/wib/affyTxnPhase2 hg17 ${base}Txn $file done # Do the transfrags cd ../transfrags ./liftHg16ToHg17.sh ./loadHg17Tnfg.sh # End of affyTxnPhase2 ########################################################################### # Creating download files for the affyTxnPhase2 data # (DONE - 2006-11-20 - Hiram) # Copy all of the data above to a temporary /san/sanvol1/scratch/ # location, and run the following script: #!/bin/sh mkdir -p rawData/hg17 TOP=`pwd` export TOP for dir in `find ./ -type d | grep '_' | grep -v A375_cytosolic_polyAPlus | grep -v FHs738Lu_cytosolic_polyAPlus | grep -v HepG2_CytosolVsNucleusDifferenceGraphs | grep -v HepG2_cytosolic_polyAPlus | grep -v HepG2_cytosolic_polyAMinus | sed -e "s#^./##"`; do base=`echo $dir | sed -e 's/\.\///; s/\//_/g' | sed -e 's/polyA-/polyAMinus/g' | sed -e 's/-/_/g' | sed -e 's/\+/Plus/g' | $TOP/changeName.pl` RAW=$TOP/rawData/hg17/$base.data echo $RAW cd $dir; zcat `ls -1 *hg17.bed.gz` | bedSort stdin stdout | cut -f 1,2,3,4 | grep chr | $TOP/avgerizeBed.pl > $RAW cd $TOP; done # Then copy the rawData/hg17/ results directory back to: /cluster/data/hg17/bed/affyTxnPhase2/rawResults/ # And deliver to hgdownloads via symlinks on hgwdev: cd /usr/local/apache/htdocs/goldenPath/hg17/affyTxnPhase2/ # to: ln -s /cluster/data/hg17/bed/affyTxnPhase2/rawData/*.data.gz . # Remove the san scratch data ########################################################################### # ALTGRAPHX TRACK (sugnet) /cluster/store1/sugnet/altSplice mkdir hg17-2005.03.28 # First get the RNA clusters cd hg17-2005.03.28 # Don't use RAGE libraries for clone bounds. ~/kent/src/hg/geneBounds/clusterRna/generateRageAccList.csh hg17 rage.libs # Make spec file to run. foreach c (`echo 'select chrom from chromInfo' | hgsql hg17 | grep -v chrom`) set out = chrom/$c.bed echo "clusterRna -mrnaExclude=hg17.rage.libs hg17 /dev/null $out -chrom=$c" >> clusterRna.spec end # Tried running it on the minicluster, but can't connect to the # cluster accounts so run it from here on hgwdev. ./clusterRna.spec >& clusterRna.log cd .. # Make script to setup parasol job file for raw altGraphX files on human cat << '_EOF_' > makeRun.sh #!/bin/sh for chrom in `echo "select chrom from chromInfo" | hgsql hg17 | grep -v chrom`; do echo 'echo "Doing $chrom"' echo "/cluster/home/sugnet/bin/i386/altSplice -db=hg17 -beds=rnaCluster/chrom/$chrom.bed -agxOut=agxs/hg17.$chrom.agx -consensus -weightMrna -localMem -minAli=.95 -minCover=.5 -chromNib=/cluster/data/hg17/nib/$chrom.nib" done '_EOF_' # << this line makes emacs coloring happy mkdir agxs chmod 755 makeRun.sh # Minicluster down, have to run on hgwdev. ./makeRun.sh > toRun.sh chmod 755 toRun.sh ./toRun.sh >& toRun.log & cat agxs/*.agx > hg17.agx # make raw altGraphX files for mouse mkdir ../mm5-2005.03.28/ cd ../mm5-2005.03.28/ # make the rnaClusters mkdir rnaCluster cd rnaCluster/ mkdir chrom # Don't use RAGE libraries for clone bounds. ~/jk/hg/geneBounds/clusterRna/generateRageAccList.csh mm5 rage.libs # Doing select on mm5 into mm5.rage.libs # Done. # Make spec file to run. 
foreach c (`echo 'select chrom from chromInfo' | hgsql mm5 | grep -v chrom`)
set out = chrom/$c.bed
echo "clusterRna -mrnaExclude=mm5.rage.libs mm5 /dev/null $out -chrom=$c" >> clusterRna.spec
end
# Tried running it on the minicluster, but can't connect to the
# cluster accounts so run it from here on hgwdev.
chmod 755 clusterRna.spec
./clusterRna.spec >& clusterRna.log &

# Make the gene bounds in rnaCluster.
mkdir agxs
# This script generates the jobs, one per chromosome.
cat << '_EOF_' > makeRun.sh
#!/bin/sh
for chrom in `echo "select chrom from chromInfo" | hgsql mm5 | grep -v chrom`; do
echo 'echo "Doing $chrom"'
echo "/cluster/home/sugnet/bin/i386/altSplice -db=mm5 -beds=rnaCluster/chrom/$chrom.bed -agxOut=agxs/mm5.$chrom.agx -consensus -weightMrna -localMem -minAli=.95 -minCover=.5 -chromNib=/cluster/data/mm5/nib/$chrom.nib"
done
'_EOF_'
chmod 755 makeRun.sh
./makeRun.sh > toRun.sh
chmod 755 toRun.sh
./toRun.sh >& toRun.log &
# Takes an hour or so...
# Consolidate all of the records in a single file.
cat agxs/*.agx > mm5.agx

# Make the orthologous splicing graphs.
mkdir orthoSpliceExoniphy
cd orthoSpliceExoniphy/
# Get the exoniphy exons...
echo "select chrom, txStart, txEnd, name, id, strand from exoniphy order by chrom, txStart;" | hgsql hg17 | grep -v txStart > hg17.exoniphy.bed
# Set up the commands for the orthosplice run.
echo 'select chrom, size from chromInfo' | hgsql hg17 | grep -v chrom > chromSizes.tab
ln -s /cluster/data/hg17/bed/blastz.mm5/axtChain/mouseNet/ nets
ln -s /cluster/data/hg17/bed/blastz.mm5/axtChain/chain/ chains
mkdir agx report logs
cat << '_EOF_' > makeRun.sh
#!/usr/bin/perl -w
open(IN, 'chromSizes.tab') or die "Can't open chromSizes.tab\n";
while(<IN>) {
chomp;
@w = split;
print "/cluster/home/sugnet/bin/i386/orthoSplice -chromSize=$w[1] -exonFile=hg17.exoniphy.bed -trumpNum=3 -chrom=$w[0] -altInFile=../agxs/hg17.$w[0].agx -orthoAgxFile=../../mm5-2005.03.28/mm5.agx -db=hg17 -orthoDb=mm5 -netFile=nets/$w[0].net -chainFile=chains/$w[0].chain -commonFile=agx/$w[0].hg17.mm5.cons.t3.agx -reportFile=report/$w[0].hg17.report -edgeFile=report/$w[0].hg17.edge.report >& logs/$w[0].test.log\n";
}
'_EOF_'
# << emacs
./makeRun.sh > orthoSplice.para.spec
ssh kki
cd /cluster/store1/sugnet/altSplice/hg17-2005.03.28/orthoSpliceExoniphy
para create orthoSplice.para.spec
para push
cat agx/*.agx > hg17.mm5.t3.exoniphy.agx
# Make bed file
agxToBed hg17.mm5.t3.exoniphy.agx hg17.mm5.t3.exoniphy.bed
# Load up files
hgLoadBed hg17 agxBed hg17.mm5.t3.exoniphy.bed
hgLoadBed -notItemRgb -sqlTable=/cluster/home/sugnet/kent/src/hg/lib/altGraphX.sql hg17 altGraphX hg17.mm5.t3.exoniphy.agx
# end altGraphX track

# EXONWALK TRACK (sugnet)
# make altGraphX track (see above)
cd /cluster/store1/sugnet/altSplice/hg17-2005.03.28/orthoSpliceExoniphy
cd exonWalk
mkdir beds
# Make parasol script.
foreach file (`ls ../agx/*.agx`)
set base=`basename $file .agx`
echo "/cluster/home/sugnet/bin/i386/exonWalk db=hg17 minPercent=0 trumpSize=100000 $file beds/$base.bed" >> exonWalk.para.spec
end
para create exonWalk.para.spec
para push
cat beds/*.bed > hg17.mm5.cons.t3.exoniphy.bed

# Predict orfs
mkdir orfs
cd orfs
mkdir bedOrf beds fa borf
cp ~/store1/altSplice/hg17-2005.01.09/orthoSpliceExonify/exonWalk/orfs.mrna2/*.sh ./
splitFile ../../hg17.mm5.cons.t3.exoniphy.bed 500 exonWalk.
cat << '_EOF_' > makeFa.sh
#!/bin/sh
for file in "$@"
do
    base=`basename $file`
    echo "Doing $file"
    echo "sequenceForBed -db=hg17 -bedIn=$file -fastaOut=fa/$base.fa "
    sequenceForBed -db=hg17 -bedIn=$file -fastaOut=fa/$base.fa
done
'_EOF_'
chmod 755 makeFa.sh
makeFa.sh beds/*
# Run borf lots of times...
makeSpec.sh beds/* > para.spec
para create para.spec
para push
mkdir genePred
cat << '_EOF_' > makeGenePred.sh
#!/bin/sh
for file in "$@"
do
    base=`basename $file`
    /cluster/home/sugnet/bin/i386/borfMatcher -keepNmd beds/$base borf/$base.borf bedOrf/$base.bed genePred/$base.gp
done
'_EOF_'
# << this line makes emacs coloring happy
chmod 755 makeGenePred.sh
makeGenePred.sh beds/*
cat beds/* > hg17.mm5.exonWalk.bed
cat genePred/*.gp > hg17.mm5.exonWalk.gp
wc *.bed *.gp
#  155470 1865640 29956585 hg17.mm5.exonWalk.bed
#   98433  984330 32495119 hg17.mm5.exonWalk.gp
# Load it into the database.
ldHgGene -predTab hg17 exonWalk hg17.mm5.exonWalk.gp
# end exonWalk
####################################################################
### hapmapRecombRate (Daryl; September 19, 2005)
# Lifted from hg16; see makeHg16.doc for details
# Update (Jen; October 25, 2005)
# Data points that lifted to chroms other than 1-22 + X were removed
# before release to RR (confirmed with Daryl):
#	chr4_random:   11 data points
#	chr6_hla_hap1: 25 data points
### hapmapRecombHotspot (Daryl; September 19, 2005)
# Lifted from hg16; see makeHg16.doc for details
### HapMap SNPs (Daryl; February 4, 2006)
# most of this work was done in October and November 2005 for the
# ENCODE workshop
cd /cluster/store4/gs.17/build34/bed/hapmap/frequencies/2005-10/non-redundant/hapmapSnps
ln -sf ../hg17.daf.all/daf.txt.gz .
ln -sf ../hg17.panTro1.rheMac1.txt.gz .
zcat hg17.panTro1.rheMac1.txt | grep -v chrom | sort >! hg17.panTro1.rheMac1.sort.txt
zcat daf.txt | grep -v chrom | sort >! daf.sort.txt
# check that order matches; should be empty
paste hg17.panTro1.rheMac1.sort.txt daf.sort.txt | awk '$1!=$17||$2!=$18||$3!=$19||$4!=$20||$5!=$21||$6!=$22||$7!=$23||$8!=$24||$11!=$25||$12!=$27||$15!=$26||$16!=$28{print $0;}'
paste hg17.panTro1.rheMac1.sort.txt daf.sort.txt | awk '{printf "%s\t%d\t%d\t%s\t0\t%c\t%c\t%c\t%c\t%c\t%d\t%d\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\n",$1,$2,$3,$4,$6,$7,$8,$11,$15,$12,$16,$29,$30,$31,$32,$33}' >! hapmapSnps.bed
hgLoadBed hg17 hapmapSnps -sqlTable=hapmapSnps.sql hapmapSnps.bed
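# A quick post-load sanity check, in the spirit of the rel21a checks
# below (an assumed step; the count was not recorded here):
hgsql hg17 -e "select count(*) from hapmapSnps"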
############################################################################################
# HapMap SNPs rel21a (Feb. 2007, Heather)
# June 2007 [partial fix of hapmapAllelesSummary released 6/25/07:
# using hg17 instead of hg18 liftOver files... for most but not all
# chroms! :( not documented below; error found by user]
# 1/11/08, 1/24/08 (angie): regenerated hapmapAllelesSummary with corrected
# hapmapAllelesChimp.
# get files for each chrom, for each population
# these contain data for all individuals
# not using the JPT+CHB files
ssh kkstore05
cd /cluster/store12/snp/hapmap/rel21a/genotypes/2007-01/rs_strand/non-redundant/zips
wget http://www.hapmap.org/downloads/genotypes/2007-01/rs_strand/non-redundant/*
# get population data (needed to define children in CEU and YRI trios)
cd /cluster/store12/snp/hapmap
wget http://www.hapmap.org/downloads/samples_individuals/*gz
gunzip pedinfo2sample_CEU.txt.gz
filterPedigree.pl < pedinfo2sample_CEU.txt > CEU.filtered
cp CEU.filtered rel21a/genotypes/2007-01/rs_strand/non-redundant/CEU.list
gunzip pedinfo2sample_YRI.txt.gz
filterPedigree.pl < pedinfo2sample_YRI.txt > YRI.filtered
cp YRI.filtered rel21a/genotypes/2007-01/rs_strand/non-redundant/YRI.list
# filterPedigree.pl:
#!/usr/bin/env perl
while (<>) {
    my @fields = split;
    if ($fields[2] == 0 && $fields[3] == 0) {
        @subfields = split /:/, $fields[6];
        print $subfields[4];
        print "\n";
    }
}
cd /cluster/store12/snp/hapmap/rel21a/genotypes/2007-01/rs_strand/non-redundant
zcat zips/*chr22_CEU* | head -1 > header.CEU
zcat zips/*chr22_YRI* | head -1 > header.YRI
# add carriage returns to header.CEU and header.YRI
grep -n -f CEU.list header.CEU > offsets.CEU
grep -n -f YRI.list header.YRI > offsets.YRI
# delete ids in offsets.CEU and offsets.YRI so just column numbers remain
# for each population, combine all chroms, and combine all individuals
# for CEU and YRI, filter out children from trios
# This creates CEU.merge, CHB.merge, JPT.merge, YRI.merge
./merge.csh
#!/bin/tcsh
rm -f CEU.merge
rm -f CHB.merge
rm -f JPT.merge
rm -f YRI.merge
foreach chrom (`cat chrom.list`)
    echo $chrom
    # CEU
    echo "CEU"
    set fileName=`echo $chrom | awk '{printf "zips/genotypes_%s_CEU_r21a_nr.txt.gz", $1}'`
    zcat $fileName | filterCEU.pl >> CEU.merge
    # CHB
    echo "CHB"
    set fileName=`echo $chrom | awk '{printf "zips/genotypes_%s_CHB_r21a_nr.txt.gz", $1}'`
    zcat $fileName | filterCHB.pl >> CHB.merge
    # JPT
    echo "JPT"
    set fileName=`echo $chrom | awk '{printf "zips/genotypes_%s_JPT_r21a_nr.txt.gz", $1}'`
    zcat $fileName | filterJPT.pl >> JPT.merge
    # YRI
    echo "YRI"
    set fileName=`echo $chrom | awk '{printf "zips/genotypes_%s_YRI_r21a_nr.txt.gz", $1}'`
    zcat $fileName | filterYRI.pl >> YRI.merge
end
# Below is filterCEU.pl
# The others are very similar: YRI uses "offsets.YRI"
# CHB and JPT just read the input directly
#!/usr/bin/env perl
# read in a list of the columns that we are keeping
sub initList {
    open LIST, "offsets.CEU";
    chomp(@list = <LIST>);
    close LIST;
    $listSize = @list;
}
&initList;
while (<>) {
    my @fields = split;
    # skip header
    if ($fields[0] eq "rs#") { next; }
    # chrom
    print $fields[2]; print " ";
    # position: add zero-based start coord
    print $fields[3] - 1; print " ";
    print $fields[3]; print " ";
    # rsId
    print $fields[0]; print " ";
    # score
    print "0 ";
    # strand
    print $fields[4]; print " ";
    # observed
    print $fields[1]; print " ";
    @alleles = ();
    for ( my $loop = 0; $loop < $listSize; $loop++ ) {
        push (@alleles, $fields[$list[$loop]-1]);
    }
    # N is used for missing data
    $nCount = 0;
    # counts
    $aCountHomo = 0; $cCountHomo = 0; $gCountHomo = 0; $tCountHomo = 0;
    $aCountHetero = 0; $cCountHetero = 0; $gCountHetero = 0; $tCountHetero = 0;
    foreach $allele (@alleles) {
        $parent1 = substr($allele, 0, 1);
        $parent2 = substr($allele, 1, 1);
        # Ns must be together
        if ($parent1 eq "N" && $parent2 ne "N") { die "Unexpected input"; }
        if ($parent2 eq "N" && $parent1 ne "N") { die "Unexpected input"; }
        if ($parent1 eq "N" && $parent2 eq "N") { $nCount++; next; }
        if ($parent1 eq "A" && $parent2 eq "A") { $aCountHomo = $aCountHomo + 2; next; }
        if ($parent1 eq "C" && $parent2 eq "C") { $cCountHomo = $cCountHomo + 2; next; }
        if ($parent1 eq "G" && $parent2 eq "G") { $gCountHomo = $gCountHomo + 2; next; }
        if ($parent1 eq "T" && $parent2 eq "T") { $tCountHomo = $tCountHomo + 2; next; }
        if ($parent1 eq "A") { $aCountHetero++; }
        if ($parent1 eq "C") { $cCountHetero++; }
        if ($parent1 eq "G") { $gCountHetero++; }
        if ($parent1 eq "T") { $tCountHetero++; }
        if ($parent2 eq "A") { $aCountHetero++; }
        if ($parent2 eq "C") { $cCountHetero++; }
        if ($parent2 eq "G") { $gCountHetero++; }
        if ($parent2 eq "T") { $tCountHetero++; }
    }
    print "A "; print $aCountHomo; print " "; print $aCountHetero; print " ";
    print "C "; print $cCountHomo; print " "; print $cCountHetero; print " ";
    print "G "; print $gCountHomo; print " "; print $gCountHetero; print " ";
    print "T "; print $tCountHomo; print " "; print $tCountHetero; print " ";
    print "\n";
}
# << emacs
# Switch to C programs from kent/src/hg/snp/snpLoad.
# Determine allele1 and allele2 (set allele2 to "none" if monomorphic)
# Alleles are in alphabetical order
# Calculate score (minor allele frequency)
# Log and skip if wrong number of elements in row
# Log and skip if triallelic or quadallelic
# Log and skip degenerate case (no alleles)
# No errors this run
# Still running on kkstore05
# Could rename "hapmap1" to "hapmapCondense"
/cluster/home/heather/kent/src/hg/snp/snpLoad/hapmap1 CEU.merge CEU.condense
wc -l hapmap1.log
/cluster/home/heather/kent/src/hg/snp/snpLoad/hapmap1 CHB.merge CHB.condense
wc -l hapmap1.log
/cluster/home/heather/kent/src/hg/snp/snpLoad/hapmap1 JPT.merge JPT.condense
wc -l hapmap1.log
/cluster/home/heather/kent/src/hg/snp/snpLoad/hapmap1 YRI.merge YRI.condense
wc -l hapmap1.log
# save some space
gzip *merge
# load
ssh hgwdev
cd /cluster/store12/snp/hapmap/rel21a/genotypes/2007-01/rs_strand/non-redundant
cp /cluster/home/heather/kent/src/hg/lib/hapmapSnps.sql .
# modify hapmapSnps for 4 populations hgLoadBed hg17 hapmapSnpsCEU -sqlTable=hapmapSnpsCEU.sql CEU.condense hgLoadBed hg17 hapmapSnpsCHB -sqlTable=hapmapSnpsCHB.sql CHB.condense hgLoadBed hg17 hapmapSnpsJPT -sqlTable=hapmapSnpsJPT.sql JPT.condense hgLoadBed hg17 hapmapSnpsYRI -sqlTable=hapmapSnpsYRI.sql YRI.condense # save some more space ssh kkstore05 cd /cluster/store12/snp/hapmap/rel21a/genotypes/2007-01/rs_strand/non-redundant gzip *condense # sanity check mysql> select count(*) from hapmapSnpsCEU where homoCount1 + homoCount2 + heteroCount = 0; +----------+ | count(*) | +----------+ | 0 | +----------+ mysql> select count(*) from hapmapSnpsCHB where homoCount1 + homoCount2 + heteroCount = 0; +----------+ | count(*) | +----------+ | 0 | +----------+ mysql> select count(*) from hapmapSnpsJPT where homoCount1 + homoCount2 + heteroCount = 0; +----------+ | count(*) | +----------+ | 0 | +----------+ mysql> select count(*) from hapmapSnpsYRI where homoCount1 + homoCount2 + heteroCount = 0; +----------+ | count(*) | +----------+ | 0 | +----------+ mysql> select max(score) from hapmapSnpsCEU; +------------+ | max(score) | +------------+ | 500 | +------------+ # create indexes mysql> alter table hapmapSnpsCEU add index name (name); mysql> alter table hapmapSnpsCEU add index chrom (chrom, bin); mysql> alter table hapmapSnpsCHB add index name (name); mysql> alter table hapmapSnpsCHB add index chrom (chrom, bin); mysql> alter table hapmapSnpsJPT add index name (name); mysql> alter table hapmapSnpsJPT add index chrom (chrom, bin); mysql> alter table hapmapSnpsYRI add index name (name); mysql> alter table hapmapSnpsYRI add index chrom (chrom, bin); # 2nd step in processing: create hapmapSnpsCombined ssh hgwdev cd /cluster/data/hg17/bed/hapmap/rel21a /cluster/home/heather/kent/src/hg/snp/snpLoad/hapmap2 hg17 hgLoadBed hg17 hapmapSnpsCombined -sqlTable=/cluster/home/heather/kent/src/hg/lib/hapmapSnpsCombined.sql hapmapSnpsCombined.tab # create indexes (not used by browser) mysql> alter table hapmapSnpsCombined add index name (name); mysql> alter table hapmapSnpsCombined add index chrom (chrom, bin); # errors # nothing that isn't biallelic # nothing with mixed positions # over 500K that were not available in all 4 populations # YRI: 187,485 # CEU: 129,359 # CHB and JPT: 97,095 # Also, 2 strand corrections done grep -v missing hapmap2.errors # different strands for rs1621378 # different strands for rs5768 # cleanup to save space rm hapmapSnpsCombined.tab # monomorphism YRI 867,835 CEU 1,252,743 CHB 1,496,438 JPT 1,539,094 combined 607,393 # observed strings # why is A/T different from other transversions? 
A/G 1,344,043
C/T 1,344,542
A/C   352,875
A/T   275,670
C/G   354,299
G/T   354,149
triallelic  1,370
quadallelic   403
other       1,226
# some details on the others:
125 -/A/T
124 -/A/G
107 -/C/T
 85 -/A/C
 79 -/G/T
 25 -/C/G
 18 -/A/C/T
 13 -/A/G/T
 12 -/A/C/G
 11 -/C/G/T
  7 (LARGEINSERTION)
  5 (LARGEDELETION)
  6 microsat
  2 het
# check for collisions (more than one SNP at the same location)
# none found
ssh hgwdev
cd /cluster/data/hg17/bed/hapmap/rel21a
/cluster/home/heather/kent/src/hg/snp/snpLoad/snpCheckCluster2 hg17 hapmapSnpsCombined > snpCheckCluster2.out
# check against hg17.snp125
ssh hgwdev
cd /cluster/data/hg17/bed/hapmap/rel21a
/cluster/home/heather/kent/src/hg/snp/snpLoad/hapmapLookup hg17 hapmapSnpsCombined snp125 snp125Exceptions
# 1817 total that are complex type from dbSNP (hapmapLookup.log)
# This is not based on observed string, only on size, class and locType
# 1176 class = mixed
#  616 class = single but locType != exact
#   11 class = named
#    6 class = insertion
#    4 class = deletion
#    2 class = microsat
#    2 class = het
# Generally if class = single the observed string is bi-allelic as expected
# Exceptions to that:
#   rs700519  quad-allelic, locType = rangeDeletion
#   rs1572672 tri-allelic, locType = between
#   rs2357412 tri-allelic, locType = range
#   rs2364671 tri-allelic, locType = rangeSubstitution
#   rs3959788 quad-allelic, locType = between
# 74 items in hapmapLookup.error
# 59 reverse complement (that's okay)
#  7 multiple alignment (6 from chrX:154,219,000-154,220,500 which is close to PAR)
#    Also rs6645103 which is PAR
mysql> select chrom, chromStart, chromEnd, strand, observed, class, locType, weight from snp125 where name = "rs6645103";
+-------------+------------+----------+--------+----------+--------+---------+--------+
| chrom       | chromStart | chromEnd | strand | observed | class  | locType | weight |
+-------------+------------+----------+--------+----------+--------+---------+--------+
| chrX_random |     273788 |   273789 | -      | C/T      | single | exact   |      3 |
| chrX        |     421141 |   421142 | +      | C/T      | single | exact   |      3 |
| chrY        |     421141 |   421142 | +      | C/T      | single | exact   |      3 |
+-------------+------------+----------+--------+----------+--------+---------+--------+
# 4 observed with dbSNP complex, hapmap biallelic
#   all positive strand, locType = between
#   all cluster errors in dbSNP
#   rs10485830
#   rs7625205 (intronic)
#   rs713582
#   rs11403115 (class = insertion)
# 3 observed mismatch
#   all dbSNP clustering error
#   rs2230624 (tri-allelic)
#   rs3963317 (monomorphic in hapmap, rangeSubstitution in dbSNP)
#   rs5017503 (monomorphic in hapmap)
# a strange one
#   rs731449
#   dbSNP strand = -, hapmap strand = +
#   dbSNP observed = G/T, hapmap observed = C/T
#   dbSNP clustering error rs2321451, which is C/T
#   hapmap monomorphic for T
#   ortho A
#   no repeats, no genes, no mRNAs, no conservation
# Counts of rows where 3 populations have one major allele, the 4th has the other
hapmapMixed hg17
# countCEU = 162931
# countCHB = 46543
# countJPT = 48791
# countYRI = 309105
# Generate summary table (used by filters)
# Summary table includes ortho allele and ortho qual score
# Summary table score is heterozygosity
# Individual zygosity is *not* preserved
ssh hgwdev
# 6/25/08: regenerated with mostly-corrected hapmapAllelesChimp
# 1/11/08 angie: regenerated with finally-corrected (I hope) hapmapAllelesChimp
# 1/24/08 angie: regenerated with finally-corrected (I hope!) hapmapAllelesChimp
cd /cluster/data/hg17/bed/hapmap/rel21a
/cluster/home/heather/kent/src/hg/snp/snpLoad/hapmapSummary hg17 hapmapSnpsCombined hapmapAllelesChimp hapmapAllelesMacaque
hgLoadBed hg17 hapmapAllelesSummary -sqlTable=/cluster/home/heather/kent/src/hg/lib/hapmapAllelesSummary.sql hapmapSummary.tab -tab
# sanity check
mysql> select count(*) from hapmapAllelesSummary where majorAlleleCountCEU > totalAlleleCountCEU;
+----------+
| count(*) |
+----------+
|        0 |
+----------+
mysql> select count(*) from hapmapAllelesSummary where majorAlleleCountCHB > totalAlleleCountCHB;
+----------+
| count(*) |
+----------+
|        0 |
+----------+
mysql> select max(score) from hapmapAllelesSummary;
+------------+
| max(score) |
+------------+
|        500 |
+------------+
mysql> select count(*), popCount from hapmapAllelesSummary group by popCount;
+----------+----------+
| count(*) | popCount |
+----------+----------+
|    52479 |        1 |
|    72977 |        2 |
|   207643 |        3 |
|  3700478 |        4 |
+----------+----------+
mysql> select count(*), isMixed from hapmapAllelesSummary group by isMixed;
+----------+---------+
| count(*) | isMixed |
+----------+---------+
|  3192896 | NO      |
|   840681 | YES     |
+----------+---------+
# histogram of heterozygosity:
  0 ************************************************************ 883400
 25 ************** 204000
 50 ************* 188703
 75 *********** 157404
100 ********** 143119
125 ********* 131575
150 ********* 126916
175 ********* 128585
200 ******** 123440
225 ******** 119815
250 ******** 120646
275 ******** 120239
300 ******** 122654
325 ********* 128233
350 ********* 130069
375 ********** 144699
400 ********** 152829
425 ************ 172513
450 *************** 225645
475 ********************************** 503166
500 5927
############################################################################################
### HapMap LD (Daryl; February 11, 2006)
## start from the genotypes files, run Haploview, reformat, and load
mkdir -p /san/sanvol1/hg17/bed/hapmap/genotypes/2006-01/non-redundant/para
cd /san/sanvol1/hg17/bed/hapmap/genotypes/2006-01/non-redundant
# wget all genotype data:
# ftp://www.hapmap.org/genotypes/2006-01/non-redundant/genotypes_chr*_*.b35.txt.gz
# Haploview had to be recompiled because there was a missing JPT sample in the ped file
##runHaploview.csh
#!/bin/csh
if ( $#argv < 2 ) then
    echo "usage: $0 <path> <file> [memory]"
    echo "       $0 /cluster/bin/foo bar.gz 2G"
    exit 1
endif
set path = $1
set file = $2
set root = $file:r
set memFlag = ""
if ( $#argv >= 3 ) then
    set memFlag = "-Xmx$3"
endif
cd /scratch
/bin/cp -f $path/$file .
/bin/gunzip -f $file
/usr/java/jre1.5.0_06/bin/java -d64 $memFlag -jar /cluster/home/daryl/haploview/haploview/Haploview.jar -c -d -n -maxDistance 250 -a $root >&! $root.log
/bin/gzip -f $root.LD $root.CHECK >>& $root.log
/bin/mv -f $root.LD.gz $root.CHECK.gz $root.log $path/
/bin/rm -f $root*
###
cd para
set hv = /cluster/home/daryl/scripts/runHaploview.csh
set ldDir = /cluster/store5/gs.18/build35/bed/hapmap/genotypes/2006-01/non-redundant
foreach pop (YRI CEU CHB JPT JPT+CHB)
    foreach chrom (1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 X)
        echo $hv $ldDir genotypes_chr${chrom}_{$pop}.b35.txt.gz 4G >> jobList
    end
end
ssh pk
# para create, para try, para push -maxNode=25 ...
#Completed: 120 of 120 jobs
#CPU time in finished jobs: 1564915s 26081.91m 434.70h 18.11d 0.050 y
#IO & Wait Time: 21862s 364.37m 6.07h 0.25d 0.001 y
#Average job time: 13223s 220.39m 3.67h 0.15d
#Longest running job: 0s 0.00m 0.00h 0.00d
#Longest finished job: 40742s 679.03m 11.32h 0.47d
#Submission to last job: 104809s 1746.82m 29.11h 1.21d
#### makeDcc.pl
#!/usr/bin/perl -W
$pop = shift || die "usage: makeDcc.pl <pop> <chrom>\n";
$chrom = shift || die "usage: makeDcc.pl <pop> <chrom>\n";
$geno = "geno/genotypes_${chrom}_${pop}.b35.txt.gz";
$ld   = "ld/genotypes_${chrom}_${pop}.b35.txt.LD.gz";
$txt  = "dcc/ld_${chrom}_${pop}.b35.txt";
open(GENO,"zcat $geno | " ) || die "can't open $geno";
open(LD,  "zcat $ld | " )   || die "can't open $ld";
open(TXT, " > $txt " )      || die "can't open $txt";
<GENO>; # ignore header
while (<GENO>) {
    @fields = split / /;
    $pos{$fields[0]} = $fields[3];
}
close(GENO);
<LD>; # ignore header
while (<LD>) {
    @fields = split /\t/;
    $chromStart = $pos{$fields[0]};
    $chromEnd   = $pos{$fields[1]};
    print TXT "$chromStart $chromEnd $pop $fields[0] $fields[1] $fields[2] $fields[4] $fields[3]\n";
}
close(LD);
close(TXT);
system("gzip $txt");
####
#### makeDcc.csh
#!/bin/csh
#set path = "/cluster/home/daryl/scripts";
set path = ".";
foreach pop (CEU CHB JPT YRI JPT+CHB)
    foreach chr (1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 X Y)
        echo $path/makeDcc.pl $pop chr$chr
    end
end
####
#### makeLdBed.pl
#!/usr/bin/perl -W
sub min ($$) {
    my $a = shift @_;
    my $b = shift @_;
    if ($a<$b) {return $a;}
    return $b;
}
sub encodeDprime($) {
    my $val = shift @_;
    if ( ($val > 1) || ($val < -1) ) { die "Dprime value ($val) is out of range [-1,1]";}
    elsif ($val>=0) { $ret = ord('a') + $val*9;}
    else            { $ret = ord('A') - $val*9;}
    return chr($ret);
}
sub encodeRsquared($) {
    my $val = shift @_;
    if ( ($val > 1) || ($val < 0) ) { die "R^2 value ($val) is out of range [0,1]";}
    return encodeDprime($val);
}
sub encodeLod($$) {
    my $lod = shift @_;
    my $dPrime = shift @_;
    $ret = ord('a');
    if ($lod>=2) # high LOD
    {
        if (abs($dPrime)<0.5) { $ret = ord('y'); } # high LOD, low D' -> pink
        else { $ret += min((int($lod-abs($dPrime)-1.5)), 9) ;}
    }
    elsif (abs($dPrime)>0.99) { $ret = ord('z'); } # high D', low LOD -> blue
    return chr($ret);
}
$inDir = shift||"data";
$outDir = shift||"bed";
$foo = "";
$bar = "";
@rest = ();
@pops = ("CEU", "CHB", "JPT", "YRI", "JPT+CHB");
printf("> Starting \t" . `date` . "\n");
foreach $pop (@pops)
{
    opendir(DIR, $inDir) || die "can't open $inDir";
    if ($pop eq "JPT+CHB") { @hmFiles = grep {/^ld_/ && /_JPT/ && /CHB.b35.txt.gz$/} readdir(DIR); }
    else                   { @hmFiles = grep {/^ld_/ && /_${pop}.b35.txt.gz$/} readdir(DIR); }
    closedir(DIR);
    printf "POP:\t$pop\t$#hmFiles\n";
    foreach $hmFile (sort @hmFiles)
    {
        ($foo, $chrom, $bar) = split /_/, $hmFile;
        $chrom =~ s/chrx/chrX/;
        $chrom =~ s/chry/chrY/;
        $outfile = "$outDir/${pop}_${chrom}.hg17.bed";
        if ((-e $outfile)||(-e "$outfile.gz")) { next; }
        $tmpFile = "/tmp/${pop}_${chrom}.hg17.bed";
        printf("$inDir/$hmFile => $outfile.gz\t" . `date`);
        open(OUT, "> $tmpFile" ) || die "can't open $tmpFile";
        open(IN, "zcat $inDir/$hmFile | " ) || die "can't open $inDir/$hmFile";
        $line = <IN>;
        if (!defined $line){next;}
        chomp($line);
        ($chromStart, $chromEnd, $pop, $name, $marker2, $dprime, $rsquared, $lod, @rest) = split / /, $line;
        $ldCount = 1;
        while (<IN>)
        {
            chomp();
            ($chromStartNew, $chromEndNew, $pop, $nameNew, $marker2, $dprime, $rsquared, $lod, @rest) = split / /;
            if ($chromStart ne $chromStartNew)
            {
                $chromStart--;
                printf(OUT "$chrom\t$chromStart\t$chromEnd\t$name\t$ldCount\t$dprimeList\t$rsquaredList\t$lodList\n");
                $chromStart = $chromStartNew;
                $chromEnd   = $chromEndNew;
                $name       = $nameNew;
                $ldCount    = 1;
                $dprimeList   = encodeDprime($dprime);
                $rsquaredList = encodeRsquared($rsquared);
                $lodList      = encodeLod($lod, $dprime);
            }
            elsif ($chromEndNew-$chromStartNew<250000)
            {
                $chromEnd = $chromEndNew;
                $ldCount++;
                $dprimeList   .= encodeDprime($dprime);
                $rsquaredList .= encodeRsquared($rsquared);
                $lodList      .= encodeLod($lod, $dprime);
            }
        }
        close(IN);
        $chromStart--;
        printf(OUT "$chrom\t$chromStart\t$chromEnd\t$name\t$ldCount\t$dprimeList\t$rsquaredList\t$lodList\n");
        close(OUT);
        system("gzip $tmpFile");
        system("mv $tmpFile.gz $outDir");
    }
}
printf("> Finished \t" . `date` . "\n");
####
#### getMax.csh -- check for consistency by chrom and population
#!/bin/csh
set out = maxDist.txt
rm -f $out
touch $out
echo this takes about 4 hours to run completely >> $out
foreach f (dcc/ld_*.b35.txt.gz)
    echo -n "$f " >> $out
    zcat $f | awk '{if ($2-$1>max) max=$2-$1} END {print max}' >> $out
end
#### getSizes.csh -- should all be 249999
#!/bin/csh
set out = wcList.txt
rm -f $out
touch $out
echo "this takes about 2 hours to run completely"
foreach f (dcc/*.txt.gz)
    echo -n $f:r:r " " | sed 's/ld_//;s/chr//;s/_/\t/' >> $out
    zcat $f | cut -f1 -d " " | uniq | wc -l >> $out
end
#### load.csh
#!/bin/csh
set db = hg17
sed 's/hapmapLd/hapmapLdCeu/' ${HOME}/kent/src/hg/lib/hapmapLd.sql | hgsql ${db}
sed 's/hapmapLd/hapmapLdChb/' ${HOME}/kent/src/hg/lib/hapmapLd.sql | hgsql ${db}
sed 's/hapmapLd/hapmapLdJpt/' ${HOME}/kent/src/hg/lib/hapmapLd.sql | hgsql ${db}
sed 's/hapmapLd/hapmapLdYri/' ${HOME}/kent/src/hg/lib/hapmapLd.sql | hgsql ${db}
sed 's/hapmapLd/hapmapLdChbJpt/' ${HOME}/kent/src/hg/lib/hapmapLd.sql | hgsql ${db}
# about half an hour to an hour per population
foreach c (1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 X)
    hgLoadBed -noSort -oldTable -strict ${db} hapmapLdCeu CEU_chr${c}.${db}.bed.gz
    hgLoadBed -noSort -oldTable -strict ${db} hapmapLdChb CHB_chr${c}.${db}.bed.gz
    hgLoadBed -noSort -oldTable -strict ${db} hapmapLdJpt JPT_chr${c}.${db}.bed.gz
    hgLoadBed -noSort -oldTable -strict ${db} hapmapLdYri YRI_chr${c}.${db}.bed.gz
    hgLoadBed -noSort -oldTable -strict ${db} hapmapLdChbJpt JPT+CHB_chr${c}.${db}.bed.gz
end
rm -f bed.tab
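# For reference, makeLdBed.pl above packs each LD value into a single
# character: D' in [0,1] maps to 'a'..'j' and D' in [-1,0) to 'A'..'J';
# r^2 in [0,1] maps to 'a'..'j'; LOD maps to 'a'..'j' with two special
# cases, 'y' (high LOD, low D') and 'z' (high D', LOD < 2).  A quick
# check of the arithmetic (hypothetical one-liner, same math as
# encodeDprime):
perl -e 'printf "%s\n", chr(ord("a") + 0.5*9)'
# => "e" for D' = 0.5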
###
# AFFYHUEX1 TRACK (sugnet Wed Oct 5 12:16:42 PDT 2005)
mkdir hg17
cd hg17
pwd
# /cluster/store1/sugnet/affymetrixHumanAllExon/hg17
mkdir gff beds annot
cd gff
# download gff design files
# parse gff script...
#!/usr/bin/perl -w
if(scalar(@ARGV) == 0) {
    print STDERR "parseGff.pl - Parse out Affymetrix's gff annotation probesets
for the human all exon design.
usage:
   parseGff.pl file1.design.gff file2.design.gff ... fileN.design.gff ";
    exit(1);
}
sub splitField($) {
    my $l = shift(@_);
    my @w = split / /, $l;
    return $w[1];
}
while($file = shift(@ARGV)) {
    if(!($file =~ /(.+)\.gff/)) { die "$file doesn't have .gff suffix\n"; }
    $prefix = $1;
    print STDERR "Doing file $file.\n";
    open(IN, $file) or die "Can't open $file to read.";
    open(BED, ">../beds/$prefix.pset.bed") or die "Can't open ../beds/$prefix.pset.bed to write.";
    open(ANNOT, ">../annot/$prefix.tab") or die "Can't open ../annot/$prefix.tab to write.";
    while($line = <IN>) {
        # Only want the probeset records.
        if($line =~ /\tprobeset\t/) {
            $score = 0; $cds = 0; $bounded = 0;
            chomp($line);
            # pop off any Microsoft line endings.
            $line =~ s/\r$//;
            @words = split /\t/, $line;
            # This makes the evidence be comma separated.
            $words[8] =~ s/\" \"/,/g;
            # This gets rid of pesky quotes.
            $words[8] =~ s/\"//g;
            # Set the score based on the annotation type
            if($words[8] =~ /full/) { $score = 200; }
            elsif($words[8] =~ /extended/) { $score = 500; }
            elsif($words[8] =~ /core/) { $score = 900; }
            if($words[8] =~ /bounded/) { $score -= 200; }
            if($words[8] =~ /cds/) { $score += 100; }
            if($score <= 0) { $score = 100; }
            # Print out the annotation fields.
            @fields = split /; /,$words[8];
            $id = splitField($fields[1]);
            $f = shift(@fields);
            $f = splitField($f);
            print ANNOT "$f";
            while($f = shift(@fields)) {
                if($f =~ /^bounded/) { $bounded = 1; }
                if($f =~ /^cds/) { $cds = 1; }
                if(!($f =~ /^bounded/ || $f =~ /^cds/)) {
                    $f = splitField($f);
                    print ANNOT "\t$f";
                }
            }
            print ANNOT "\t$bounded\t$cds";
            print ANNOT "\n";
            print BED "$words[0]\t$words[3]\t$words[4]\t$id\t$score\t$words[6]\n";
        }
    }
    close(IN);
    close(BED);
    close(ANNOT);
}
./parseGff.pl *.gff
cat beds/*.bed > affyHuEx1.bed
hgLoadBed hg17 affyHuEx1 affyHuEx1.bed -strict
cat annot/*.tab > affyHuEx1.annot.tab
# Contents of affyHuEx1Annot.sql file
CREATE TABLE affyHuEx1Annot (
    numIndependentProbes smallint not null,
    probesetId int(11) not null,
    exonClustId int(11) not null,
    numNonOverlapProbes smallint not null,
    probeCount smallint not null,
    transcriptClustId int(11) not null,
    probesetType smallint not null,
    numXHybeProbe smallint not null,
    psrId int(11) not null,
    level varchar(10) not null,
    evidence varchar(255) not null,
    bounded smallint not null,
    cds smallint not null,
    PRIMARY KEY (probesetId)
);
hg17S -A < affyHuEx1Annot.sql
echo "load data local infile 'affyHuEx1.annot.tab' into table affyHuEx1Annot;" | hg17S -A
# end AFFYHUEX1 track
##########################################################################
# AFFY HUEX1 OFF-BY-ONE FIX (Andy 2006-12-14)
ssh hgwdev
cd /cluster/data/hg17/bed/affyHumanExon
echo "select * from affyHuEx1" | hgsql hg17 | \
    tail +2 | awk 'BEGIN{OFS="\t"}{print $2,$3-1,$4,$5,$6,$7}' \
    > affyHuEx1.fixed.bed
hgLoadBed hg17 affyHuEx1 affyHuEx1.fixed.bed
##########################################################################
# NSCAN composite track - (2005-09-29 markd) loaded proteins 2005-10-13
cd /cluster/data/hg17/bed/nscan/
# obtained NSCAN and NSCAN-EST predictions from Michael Brent's group
# at WUSTL
wget http://genome.cse.wustl.edu/predictions/human/hg17_nscan_mm5_9_14_2005/hg17_nscan_mm5_9_14_2005.tar.gz
tar -zxf hg17_nscan_mm5_9_14_2005.tar.gz
wget http://genome.cse.wustl.edu/predictions/human/NCBI35_NSCAN_EST_4-16-2005.tar
gzip -9 NCBI35_NSCAN_EST_4-16-2005.tar
# change protein fasta file to have transcript id in header
foreach f (chr_ptx/*.ptx)
    awk '/^>/{$0=$1".a"}{print $0}' $f >$f.fix
end
foreach f (NCBI35_NSCAN_EST_4-16-2005/chr_ptx/*.ptx)
    awk '/^>/{$0=$1".a"}{print $0}' $f >$f.fix
end
# load tracks.  Note that these have *utr features, rather than
# exon features.  Currently ldHgGene creates separate genePred exons
# for these.
ldHgGene -gtf -genePredExt hg17 nscanGene chr_gtf/chr*.gtf
hgPepPred hg17 generic nscanPep chr_ptx/chr*.fix
rm -rf chr_* *.tab
ldHgGene -gtf -genePredExt hg17 nscanEstGene NCBI35_NSCAN_EST_4-16-2005/chr_gtf/chr*.gtf
hgPepPred hg17 generic nscanEstPep NCBI35_NSCAN_EST_4-16-2005/chr_ptx/chr*.fix
rm -rf NCBI35_NSCAN_EST_4-16-2005 *.tab
# update trackDb; need a hg17-specific page to describe informants
#   human/hg17/nscan.html
#   human/hg17/trackDb.ra
##########################################################################
# NHGRI DNASE I HYPERSENSITIVE SITES (2005-10-05 kate)
# Submitted by Greg Crawford via web site,
#   http://research.nhgri.nih.gov/DNaseHS/May2005/
# In addition, a file containing the 'randoms' was FTP'ed by Greg
# Submitted for hg16 -- lifted to hg17.
# Details of hg16 data prep are in makeHg16.doc
mkdir /cluster/data/hg17/bed/nhgri
cd /cluster/data/hg17/bed/nhgri
cp /cluster/data/hg16/bed/nhgri/hs.bed hs.hg16.bed
liftOver hs.hg16.bed /gbdb/hg16/liftOver/hg16ToHg17.over.chain \
    hs.hg17.bed hs.unmapped
grep '^chr' hs.unmapped | wc -l
# 8 unmapped
hgLoadBed hg17 nhgriDnaseHs hs.hg17.bed
# Loaded 14216 elements of size 5
checkTableCoords hg17 nhgriDnaseHs
# UPDATE WGRNA TRACK (DONE, 2005-10-20, Fan)
ssh hgwdev
cd /cluster/data/hg17/bed
mkdir wgRna-2005-10-20
cd wgRna-2005-10-20
# Received the data file, wgtrack_no_bin_oct2005.txt, from Michel Weber's email
# (Michel.Weber@ibcg.biotoul.fr)
# and placed it under /cluster/data/hg17/bed/wgRna-2005-10-20.
cp wgtrack_no_bin_oct2005.txt wgRna.tab
vi wgRna.tab
# edit wgRna.tab to take out the first line of data field labels.
hgLoadBed -sqlTable=/cluster/home/fanhsu/src/hg/lib/wgRna.sql hg17 wgRna wgRna.tab
# Compared to 8/24/05 data, a few records were changed.
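# The manual vi edit above could equally be scripted; a minimal
# equivalent (an assumed alternative, not part of the original run):
tail -n +2 wgtrack_no_bin_oct2005.txt > wgRna.tab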
##########################################################################
# REBUILD hg17.gnfAtlas2Distance TABLE. SOMEHOW IT HAD MUCH FEWER RECORDS.
# (DONE 10/27/05, Fan)
# Create table to map between known genes and GNF Atlas2
# expression data.
hgMapToGene hg17 gnfAtlas2 knownGene knownToGnfAtlas2 '-type=bed 12'
# Create expression distance table - takes about an hour
hgExpDistance hg17 hgFixed.gnfHumanAtlas2MedianRatio \
    hgFixed.gnfHumanAtlas2MedianExps gnfAtlas2Distance \
    -lookup=knownToGnfAtlas2 &
# hgsql -e "select count(*) from gnfAtlas2Distance;" hg17
# row count changed to 32458000
##########################################################################
# BUILD ALLEN BRAIN TRACK (DONE 10/29/05 JK)
# Make the working directory
ssh hgwdev
cd /cluster/data/hg17/bed
mkdir allenBrain
cd allenBrain
# Remap the probe alignments from mm7 to hg17
zcat /cluster/data/mm7/bed/bedOver/mm7.hg17.over.chain.gz \
    | pslMap -chainMapFile -swapMap \
      /cluster/data/mm7/bed/allenBrain/allenBrainAli.psl stdin stdout \
    | sort -k 14,14 -k 16,16n > unscored.psl
pslRecalcMatch unscored.psl /cluster/data/hg17/nib \
    /cluster/data/mm7/bed/allenBrain/allProbes.fa allenBrainAli.psl
# Load the database
hgsql hg17 < ~/kent/src/hg/lib/allenBrainUrl.sql
hgsql hg17 -e 'load data local infile "/cluster/data/mm7/bed/allenBrain/allenBrainUrl.tab" into table allenBrainUrl;'
hgLoadPsl hg17 allenBrainAli.psl
mkdir /gbdb/hg17/allenBrain
ln -s /cluster/data/mm7/bed/allenBrain/allProbes.fa /gbdb/hg17/allenBrain/allProbes.fa
hgLoadSeq hg17 /gbdb/hg17/allenBrain/allProbes.fa
# Make mapping between known genes and allenBrain
hgMapToGene hg17 allenBrainAli -type=psl knownGene knownToAllenBrain
##########################################################################
# BUILD NIBB IMAGE PROBES (DONE 11/07/05 JK)
# Make directory on san for cluster job and copy in sequence
ssh pk
mkdir /san/sanvol1/scratch/hg17/nibbPics
cd /san/sanvol1/scratch/hg17/nibbPics
cp /cluster/data/xenTro1/bed/nibbPics/nibbImageProbes.fa .
# Make parasol job dir and sequence list files
mkdir run
cd run
mkdir psl
ls -1 /cluster/sanvol1/scratch/hg17/nib/*.nib > genome.lst
echo ../nibbImageProbes.fa > rna.lst
# Create parasol gensub file
cat << '_EOF_' > gsub
#LOOP
blatz -rna -minScore=6000 -out=psl $(path1) $(path2) psl/$(root1)_$(root2).psl
#ENDLOOP
'_EOF_'
# Create parasol batch
gensub2 genome.lst rna.lst gsub spec
para create spec
# Do para try/push/time etc.
#Completed: 46 of 46 jobs
#CPU time in finished jobs: 11818s 196.97m 3.28h 0.14d 0.000 y
#IO & Wait Time: 145s 2.41m 0.04h 0.00d 0.000 y
#Average job time: 260s 4.33m 0.07h 0.00d
#Longest running job: 0s 0.00m 0.00h 0.00d
#Longest finished job: 1022s 17.03m 0.28h 0.01d
#Submission to last job: 1060s 17.67m 0.29h 0.01d
# Sort and filter
catDir psl | sort -k 10 \
    | pslReps stdin stdout /dev/null -nohead -minAli=0.60 -nearTop=0.001 -minCover=0.10 -minNearTopSize=80 \
    | sort -k 14,14 -k 16,16n \
    | sed 's/..\/..\/nib\/chr/chr/' \
    | sed 's/.nib//' > ../nibbImageProbes.psl
# Make bed file and copy in stuff
ssh hgwdev
cd /cluster/data/hg17/bed
mkdir nibbPics
cd nibbPics
cp /cluster/data/xenTro1/bed/nibbPics/nibbImageProbes.fa .
cp /san/sanvol1/scratch/hg17/nibbPics/nibbImageProbes.psl .
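# Before loading, the filtered alignments can be checked for internal
# consistency (an assumed extra step, not in the original run;
# pslCheck is part of the kent source tree):
pslCheck nibbImageProbes.psl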
# Load into database ln -s /cluster/data/xenTro1/bed/nibbPics/nibbImageProbes.fa /gbdb/hg17/nibbImageProbes.fa hgLoadSeq hg17 /gbdb/hg17/nibbImageProbes.fa hgLoadPsl hg17 nibbImageProbes.psl ########################################################################### # EXONIPHY WITH DOG (acs, 11/22/05) -- MM7, RN3, CANFAM2, HG17 # first build 4-way multiz alignment from syntenic nets (helps reduce # false positive predictions due to paralogous alignments) # (prepare mafNet files from syntenic nets and copy to # /cluster/bluearc/hg17/mafNetSyn; do this for mm7, rn3, canFam2, # and galGal2) # make output dir and run dir ssh pk cd /cluster/data/hg17/bed/multiz.hg17Mm7Rn3CanFam2 mkdir -p mafSyn runSyn cd runSyn # create scripts to run multiz on cluster cat > oneMultiz.csh << 'EOF' #!/bin/csh -fe set c = $1 set multi = /scratch/$user/multiz.hg17Mm7Rn3CanFam2.$c set pairs = /cluster/bluearc/hg17/mafNetSyn # special mode -- # with 1 arg, cleanup if ($#argv == 1) then rm -fr $multi exit endif # special mode -- # with 3 args, saves an alignment file if ($#argv == 3) then cp $multi/$2/$c.maf $3 exit endif set s1 = $2 set s2 = $3 set flag = $4 # locate input files -- in pairwise dir, or multiple dir set d1 = $multi set d2 = $multi if (-d $pairs/$s1) then set d1 = $pairs endif if (-d $pairs/$s2) then set d2 = $pairs endif set f1 = $d1/$s1/$c.maf set f2 = $d2/$s2/$c.maf # write to output dir set out = $multi/${s1}${s2} mkdir -p $out # check for empty input file if (-s $f1 && -s $f2) then echo "Aligning $f1 $f2 $flag" /cluster/bin/penn/multiz.v10.5 $f1 $f2 $flag > $out/$c.tmp.maf echo "Ordering $c.maf" /cluster/bin/penn/maf_project $out/$c.tmp.maf hg17.$c > $out/$c.maf else if (-s $f1) then cp $f1 $out else if (-s $f2) then cp $f2 $out endif 'EOF' # << for emacs chmod +x oneMultiz.csh cat > allMultiz.csh << 'EOF' #!/bin/csh -fe set c = $1 oneMultiz.csh $c mm7 rn3 0 oneMultiz.csh $c mm7rn3 canFam2 1 # get final alignment file oneMultiz.csh $c mm7rn3canFam2 /cluster/data/hg17/bed/multiz.hg17Mm7Rn3CanFam2/mafSyn/$c.maf #cleanup oneMultiz.csh $c 'EOF' # << for emacs chmod +x allMultiz.csh cat > gsub << 'EOF' #LOOP allMultiz.csh $(root1) {check out line+ /cluster/data/hg17/bed/multiz.hg17Mm7Rn3CanFam2/mafSyn/$(root1).maf} #ENDLOOP 'EOF' # << for emacs cut -f 1 /cluster/data/hg17/chrom.sizes > chrom.lst set path = (/parasol/bin $path);rehash gensub2 chrom.lst single gsub jobList para create jobList # 46 jobs para try; para check para push # build chromosome-by-chromosome SS files cd /cluster/data/hg17/bed/multiz.hg17Mm7Rn3CanFam2 mkdir run-ss-syn cd run-ss-syn mkdir -p /cluster/bluearc/hg17/multiz.hg17Mm7Rn3CanFam2/ssSyn cat > makeSS.csh << 'EOF' #!/bin/csh -fe set c = $1 /cluster/bin/phast/msa_view -i MAF -o SS /cluster/bluearc/hg17/multiz.hg17Mm7Rn3CanFam2/mafSyn/$c.maf --refseq /cluster/bluearc/hg17/chrom/$c.fa | gzip -c > /cluster/bluearc/hg17/multiz.hg17Mm7Rn3CanFam2/ssSyn/$c.ss.gz 'EOF' # << for emacs chmod +x makeSS.csh rm -f jobList foreach chr (`cut -f 1 /cluster/data/hg17/chrom.sizes`) echo "makeSS.csh $chr" >> jobList end para create jobList # 46 jobs para try; para check para push # now train hmm, with indel model # note: commands below require bash # first get a clean set of genes for training (with --indel-strict) mkdir -p /cluster/data/hg17/bed/exoniphy/train cd /cluster/data/hg17/bed/exoniphy/train mkdir -p stats genes CHROMS="chr1 chr2 chr3 chr4 chr5 chr6 chr7 chr8 chr9 chr10 chr11 chr12 chr13 chr14 chr15 chr16 chr17 chr18 chr19 chr20 chr21 chr22" for chr in ${CHROMS} ; do echo $chr zcat 
/cluster/bluearc/hg17/multiz.hg17Mm7Rn3CanFam2/ssSyn/$chr.ss.gz | clean_genes genes/refGene.$chr.gff - --stats stats/$chr.stats --conserved --indel-strict --groupby exon_id --offset3 4 --offset5 4 > genes/refGene.$chr.clean.gff done # get conserved noncoding seqs and add to GFFs mkdir -p cns for chr in ${CHROMS} ; do echo $chr featureBits -bed=cns/$chr.bed -chrom=$chr hg17 phastConsElementsPaper \!knownGene:exon:100 \!refGene:exon:100 \!mrna \!ensGene \!intronEst \!twinscan cp genes/refGene.$chr.clean.gff genes/refGene.$chr.withCNS.gff awk '{printf "%s\tphastCons\tCNS\t%d\t%d\t.\t.\t.\texon_id \"CNS.%s\"\n", $1, $2+1, $3, $4}' cns/$chr.bed >> genes/refGene.$chr.withCNS.gff done # now train HMM # note: actually have to unzip SS files before this step rm -f alns gffs for chr in ${CHROMS} ; do echo /cluster/bluearc/hg17/multiz.hg17Mm7Rn3CanFam2/ssSyn/$chr.ss >> alns echo genes/refGene.$chr.withCNS.gff >> gffs done hmm_train -m '*alns' -c ~/phast/data/exoniphy/default.cm -g '*gffs' -R exon_id -i SS -I CDS,background,CNS,5\'splice,3\'splice,prestart -t "((hg17,(mm7,rn3)),canFam2)" > indels.hmm # training complete; now run exoniphy genome-wide # first need to split up alignments mkdir -p /cluster/data/hg17/bed/exoniphy/test/run-split cd /cluster/data/hg17/bed/exoniphy/test/run-split cat > doSplit.csh << 'EOF' #!/bin/csh -fe set c = $1 mkdir -p /cluster/bluearc/hg17/multiz.hg17Mm7Rn3CanFam2/ssSynFrags/$c /cluster/bin/phast/msa_split /cluster/bluearc/hg17/multiz.hg17Mm7Rn3CanFam2/mafSyn/$c.maf --refseq /cluster/bluearc/hg17/chrom/$c.fa -i MAF --windows 100000,0 --between-blocks 5000 --min-informative 1000 --out-format SS --out-root /cluster/bluearc/hg17/multiz.hg17Mm7Rn3CanFam2/ssSynFrags/$c/$c --tuple-size 3 gzip /cluster/bluearc/hg17/multiz.hg17Mm7Rn3CanFam2/ssSynFrags/$c/$c*.ss 'EOF' # << for emacs chmod +x doSplit.csh rm -f jobList for file in /cluster/bluearc/hg17/multiz.hg17Mm7Rn3CanFam2/mafSyn/chr*.maf ; do echo doSplit.csh `basename $file .maf` >> jobList ; done para create jobList # 43 jobs para try; para check para push # now set up exoniphy run mkdir -p /cluster/data/hg17/bed/exoniphy/test/run-exoniphy cd /cluster/data/hg17/bed/exoniphy/test/run-exoniphy cp -p ../../train/indels.hmm /cluster/bluearc/hg17/exoniphy/training mkdir -p /cluster/bluearc/hg17/exoniphy/GFF cat > doExoniphy.sh << 'EOF' #!/usr/local/bin/bash root=`basename $1 .ss.gz` chrom=`echo $root | awk -F\. '{print $1}'` no=`echo $root | awk 'BEGIN{FS="[-.]"} {printf "%d\n", ($2+10000)/100000}'` if [ ! 
-d /cluster/bluearc/hg17/exoniphy/GFF/$chrom ] ; then mkdir -p /cluster/bluearc/hg17/exoniphy/GFF/$chrom fi zcat $1 | /cluster/bin/phast/exoniphy - --hmm /cluster/bluearc/hg17/exoniphy/training/indels.hmm --reflect-strand --extrapolate default --score --indels --alias "hg17=human; mm7=mouse; rn3=rat; canFam2=dog" --seqname $chrom --idpref $chrom.$no > /cluster/bluearc/hg17/exoniphy/GFF/$chrom/$root.gff 'EOF' # << for emacs chmod +x doExoniphy.sh rm -f jobList for dir in /cluster/bluearc/hg17/multiz.hg17Mm7Rn3CanFam2/ssSynFrags/* ; do find $dir -name '*.ss.gz' | awk '{printf "doExoniphy.sh %s\n", $1}' >> jobList ; done para create jobList # 27070 jobs para try; para check para push #Completed: 27059 of 27070 jobs #Crashed: 11 jobs #CPU time in finished jobs: 8573545s 142892.41m 2381.54h 99.23d 0.272 y #IO & Wait Time: 73412s 1223.54m 20.39h 0.85d 0.002 y #Average job time: 320s 5.33m 0.09h 0.00d #Longest running job: 0s 0.00m 0.00h 0.00d #Longest finished job: 593s 9.88m 0.16h 0.01d #Submission to last job: 22823s 380.38m 6.34h 0.26d # crashed jobs all on random chroms, chrM, etc., and appear to be # due to all species not being present; okay to ignore # collect predictions and create track rm -f exoniphy.gff for dir in /cluster/bluearc/hg17/exoniphy/GFF/chr* ; do \ rm -f files.* tmp.gff ;\ find $dir -name "chr*.gff" > files ;\ split -l 1000 files files. ;\ for l in files.* ; do cat `cat $l` >> tmp.gff ; done ;\ refeature --sort tmp.gff >> exoniphy.gff ;\ done ldHgGene -genePredExt -gtf hg17 exoniphyDog exoniphy.gff # COW SYNTENY (Done, Heather, Dec. 2005) # Data from Harris A. Lewin ssh hgwdev cd /cluster/data/hg17/bed mkdir syntenyCow cd syntenyCow hgLoadBed -noBin hg17 syntenyCow syntenyCow.bed # add to kent/src/hg/makeDb/trackDb/human/hg17/trackDb.ra ########################################################################### # New Conservation track (WORKING 2005-12-15 kate) # Pairwise alignments needed for: monDom2, danRer3, bosTau2 # Use existing alignments for: # macaque_rheMac1 # rat_rn3 # mouse_mm7 # dog_canFam2 # chicken_galGal2 # xenopus_xenTro1 # fugu_fr1 # rabbit_oryCun1 # armadillo_dasNov1 # elephant_loxAfr1 # tenrec_echTel1 # tetraodon_tetNig1 ######################################################################### # BLASTZ danRer3 (DONE - 2005-12-20 kate) # Includes both randoms ssh pk mkdir /cluster/data/hg17/bed/blastz.danRer3.2005-12-20 cd /cluster/data/hg17/bed ln -s blastz.danRer3.2005-12-20 blastz.danRer3 cd blastz.danRer3 cat << 'EOF' > DEF # human target, zebrafish query export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/cluster/home/angie/schwartzbin:/parasol/bin BLASTZ=blastz.v7.x86_64 BLASTZ_ABRIDGE_REPEATS=1 # use parameters suggested for human-fish evolutionary distance # recommended in doBlastzChainNet.pl help # (previously used for hg16-fr1, danrer1-mm5) BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=6000 BLASTZ_K=2200 BLASTZ_Q=/san/sanvol1/scratch/blastz/HoxD55.q # TARGET: Human hg17 SEQ1_DIR=/san/sanvol1/scratch/hg17/nib SEQ1_SMSK=/cluster/bluearc/hg17/linSpecRep.notInZebrafish SEQ1_CHUNK=10000000 SEQ1_LAP=10000 SEQ1_LEN=/san/sanvol1/scratch/hg17/chrom.sizes # QUERY: zebrafish danRer3 # Use all chroms, including both randoms (chrUn and chrNA) SEQ2_DIR=/san/sanvol1/scratch/danRer3/nib SEQ2_SMSK=/san/sanvol1/scratch/danRer3/linSpecRep.notInOthers SEQ2_LEN=/cluster/bluearc/danRer3/chrom.sizes SEQ2_CHUNK=30000000 SEQ2_LAP=1000 BASE=/cluster/data/hg17/bed/blastz.danRer3.2005-12-20 TMPDIR=/scratch/tmp 'EOF' # << happy emacs 
/cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
    -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
    -stop=net \
    `pwd`/DEF >& blastz.out &
# mistakenly started this in blastz.danRer3.2005-12-18 dir --
# need to move DEF file and blastz.out to 2005-12-20 dir.
# bogus stop at net step -- thinks it can't find chains.
# I'm just restarting there
/cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
    -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
    -continue=net \
    `pwd`/DEF >& blastz.2.out &
# stopped because vsDanRer3 downloads already there from
# previous run.
ssh hgwdev "rm -fr /usr/local/apache/htdocs/goldenPath/hg17/vsDanRer3"
/cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
    -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
    -continue=download \
    `pwd`/DEF >& blastz.3.out &
# measurements
ssh hgwdev "featureBits hg17 chainDanRer2Link" >& danRer2.fb; cat danRer2.fb
# 70696998 bases of 2866216770 (2.467%) in intersection
ssh hgwdev "featureBits hg17 chainDanRer3Link" >& danRer3.fb; cat danRer3.fb
# 55625762 bases of 2866216770 (1.941%) in intersection
# not sure why there's lower coverage from the newer assembly.
# It's possibly due to different parameters used in the other
# alignment.  Rachel is experimenting with hg18/danRer3, and
# if warranted, we might replace this later
#########################################################################
# BLASTZ bosTau2 (DONE - 2005-12-19 kate)
ssh pk
mkdir /cluster/data/hg17/bed/blastz.bosTau2.2005-12-19
cd /cluster/data/hg17/bed
rm blastz.bosTau2
ln -s blastz.bosTau2.2005-12-19 blastz.bosTau2
cd blastz.bosTau2
cat << 'EOF' > DEF
# human vs. cow
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn/x86_64:/cluster/bin/x86_64:/parasol/bin
BLASTZ=blastz.x86_64
# using the parameter used when not doing lineage-specific repeat
# abridging.  This parameter restricts the number of matches used by
# dynamic masking.  (We can't currently use LSR repeat abridging
# when either assembly sequence is in .2bit).
BLASTZ_M=50
# TARGET: Human (hg17)
SEQ1_DIR=/san/sanvol1/scratch/hg17/nib
SEQ1_LEN=/san/sanvol1/scratch/hg17/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Cow (bosTau2)
# chunk it as we can't do whole-genome on 2bits
SEQ2_DIR=/scratch/hg/bosTau2/bosTau2.noBin0.2bit
SEQ2_LEN=/scratch/hg/bosTau2/noBin0.sizes
SEQ2_CHUNK=300000000
SEQ2_LAP=10000
BASE=/cluster/data/hg17/bed/blastz.bosTau2.2005-12-19
TMPDIR=/scratch/tmp
'EOF'
# << happy emacs
# use chain parameters for "close" species
/cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
    -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
    `pwd`/DEF >& blastz.out &
ssh hgwdev "featureBits hg17 chainBosTau1Link" >& bosTau1.fb; cat bosTau1.fb
ssh hgwdev "featureBits hg17 chainBosTau2Link" >& bosTau2.fb; cat bosTau2.fb
# swapping to get the lift over file in the other direction (Hiram)
ssh pk
mkdir /cluster/data/bosTau2/bed/blastz.hg17.swap
cd /cluster/data/bosTau2/bed
ln -s blastz.hg17.swap blastz.hg17
cd blastz.hg17.swap
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
    -swap -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
    /cluster/data/hg17/bed/blastz.bosTau2.2005-12-19/DEF > swap.out 2>&1 &
# this failed during the load of the tables, but that is OK, we
# just wanted the liftOver files from this
# manually cleaned this up since the run failed during the MySQL
# load due to out-of-space problems.  These tables do not need to
# be loaded anyway.
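# The point of the swap was the reciprocal liftOver chain; once built,
# it is used like any other liftOver file, e.g. (hypothetical input and
# output names, with the conventional /gbdb path for the chain):
liftOver annots.bosTau2.bed /gbdb/bosTau2/liftOver/bosTau2ToHg17.over.chain \
    annots.hg17.bed annots.unmapped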
ssh kkstore02
cd /cluster/data/bosTau2/bed/blastz.hg17.swap
rm -fr psl/
rm -fr axtChain/run/chain/
rm -f axtChain/noClass.net
rm -fr axtChain/net/
rm -fr axtChain/chain/
#########################################################################
# BLASTZ rheMac2 (2006-02-08 kate)
ssh pk
mkdir /cluster/data/hg17/bed/blastz.rheMac2.2006-02-08
cd /cluster/data/hg17/bed
ln -s blastz.rheMac2.2006-02-08 blastz.rheMac2
cd blastz.rheMac2
cat << 'EOF' > DEF
# macaca mulatta vs. hg17
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn/x86_64:/cluster/bin/x86_64:/parasol/bin
ALIGN=blastz-run
BLASTZ=blastz.v7.x86_64
# TARGET - hg17
SEQ1_DIR=/san/sanvol1/scratch/hg17/nib
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
SEQ1_LEN=/san/sanvol1/scratch/hg17/chrom.sizes
# QUERY - macaca mulatta
SEQ2_DIR=/san/sanvol1/scratch/rheMac2/rheMac2.2bit
SEQ2_CHUNK=5000000
SEQ2_LAP=0
SEQ2_LEN=/san/sanvol1/scratch/rheMac2/rheMac2.sizes
BASE=/san/sanvol1/scratch/hg17/blastz.rheMac2/
RAW=$BASE/raw
TMPDIR=/scratch/tmp
'EOF'
# << happy emacs
# use chain parameters for "close" species
/cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
    -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
    `pwd`/DEF >& blastz.out &
/cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
    -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
    -continue=chainRun \
    `pwd`/DEF >& continueChainRun.out &
# NOTE: must set -fileServer (e.g. to pk) if using base dir on SAN
/cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
    -bigClusterHub=pk -fileServer=pk \
    -chainMinScore=3000 -chainLinearGap=medium \
    -continue=chainMerge \
    `pwd`/DEF >& continueChainMerge.out &
# netClass was crashing as it expected a bin in the
# unsplit gap table.  Robert added the bin field.
/cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
    -bigClusterHub=pk -fileServer=pk \
    -continue=download \
    `pwd`/DEF >& continueDownload.out &
ssh hgwdev "featureBits hg17 chainRheMac1Link" >& rheMac1.fb; cat rheMac1.fb
ssh hgwdev "featureBits hg17 chainRheMac2Link" >& rheMac2.fb; cat rheMac2.fb
ssh kkstore02
cd /cluster/data/hg17/bed/blastz.rheMac2
cp -rp mafNet /san/sanvol1/scratch/hg17/mafNet/rheMac2
# SWAP CHAIN AND NET ALIGNMENTS OVER TO RHESUS (rheMac2)
# CREATE CHAIN AND NET TRACKS, AXTNET, MAFNET, LIFTOVER AND ALIGNMENT DOWNLOADS
# (DONE, 2006-03-22, hartera)
# Do the swap of hg17/rheMac2 alignments over to rheMac2 to produce
# rheMac2/hg17 alignments.
ssh pk
cd /cluster/data/hg17/bed/blastz.rheMac2
# use chain parameters for "close" species
/cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
    -swap -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
    `pwd`/DEF >& swap.log &
# Took about 3 hours 40 minutes to run.
#############################################################################
# 17-WAY MULTIZ ALIGNMENTS (DONE - 2005-12-20 kate)
#
# redo fix overlaps from xenTro1 and tetNig1 (2006-04-08 kate)
# copy net mafs to cluster-friendly storage for multiz run (2006-01-25 kate)
ssh kkstore01
cd /cluster/data/hg17/bed/blastz.monDom2
cp -rp mafNet /san/sanvol1/scratch/hg17/mafNet/monDom2
ssh kkstore02
cd /cluster/data/hg17/bed
mkdir -p multiz17way.2005-12-20
ln -s multiz17way.2005-12-20 multiz17way
cd multiz17way
# copy MAF's to cluster-friendly server
# These MAF's already on bluearc:
#   canFam2, fr1, galGal2, panTro1, rn3
mkdir -p /san/sanvol1/scratch/hg17/mafNet
cd /san/sanvol1/scratch/hg17/mafNet
ln -s /cluster/bluearc/hg17/mafNet/{*} .
# copy others foreach s (rheMac1 oryCun1 dasNov1 \ loxAfr1 bosTau2 monDom1 xenTro1 tetNig1 danRer3) echo $s cp -Rp /cluster/data/hg17/bed/blastz.$s/mafNet $s end # a few more set s = echTel1 cp -Rp /cluster/data/hg17/bed/blastz.$s/mafNet $s set s = mm7 cp -Rp /cluster/data/hg17/bed/blastz.$s/mafNet $s set s = canFam2 cp -Rp /cluster/data/hg17/bed/blastz.$s/mafNet $s set s = rheMac2 cp -Rp /cluster/data/hg17/bed/blastz.$s/mafNet $s # thanks for the tree, Hiram! Taken from mm7 17way... # Hiram says this is derived from the latest ENCODE # tree, with some species removed and branch lengths # adjusted. The ENCODE tree from the Sept. freeze is: # ftp://kronos.nhgri.nih.gov/pub/outgoing/elliott/msa/SEP-2005/phylo/tree_4d.tba.v2.nh cd /cluster/data/hg17/bed/multiz17way cat << '_EOF_' > 17way.nh ((((((((( (human_hg17:0.006690,chimp_panTro1:0.007571):0.024272, macaque_rheMac2:0.0592):0.023960, ((rat_rn3:0.081728,mouse_mm7:0.077017):0.229273, rabbit_oryCun1:0.206767):0.1065):0.023026, (cow_bosTau2:0.159182,dog_canFam2:0.147731):0.039450):0.028505, armadillo_dasNov1:0.149862):0.015994, (elephant_loxAfr1:0.104891,tenrec_echTel1:0.259797):0.040371):0.218400, monodelphis_monDom2:0.371073):0.189124, chicken_galGal2:0.454691):0.123297, xenopus_xenTro1:0.782453):0.156067, ((tetraodon_tetNig1:0.199381,fugu_fr1:0.239894):0.492961, zebrafish_danRer3:0.782561):0.156067); '_EOF_' /cluster/bin/phast/draw_tree 17way.nh > 17way.ps /cluster/bin/phast/all_dists 17way.nh > 17way.distances.txt grep hg17 17way.distances.txt | sort -k3,3n | \ awk '{printf ("%.4f\t%s\n", $3, $2)}' > distances.txt # edit distances.txt to include featureBits, and chain parameters # from blastz run. cat distances.txt # 0.0143 chimp_panTro1 # 0.0902 macaque_rheMac2 # 0.2563 armadillo_dasNov1 # 0.2651 dog_canFam2 # 0.2677 elephant_loxAfr1 # 0.2766 cow_bosTau2 # 0.3682 rabbit_oryCun1 # 0.4226 tenrec_echTel1 # 0.4677 mouse_mm7 # 0.4724 rat_rn3 # use loose chain params and score from here, down (5000) # 0.7119 monodelphis_monDom1 # 0.9847 chicken_galGal2 # 1.4357 xenopus_xenTro1 # 1.6577 tetraodon_tetNig1 # 1.6983 fugu_fr1 # 1.7480 zebrafish_danRer3 # the order in the browser display will be by tree topology, # not by distance, so it will be: # >> # 0.0143 chimp_panTro1 # >> # 0.0902 macaque_rheMac2 # >> # 0.4677 mouse_mm7 # >> # 0.4724 rat_rn3 # >> # 0.3682 rabbit_oryCun1 # >> # 0.2651 dog_canFam2 # >> # 0.2766 cow_bosTau2 # >> # 0.2563 armadillo_dasNov1 # >> # 0.2677 elephant_loxAfr1 # >> # 0.4226 tenrec_echTel1 # >> # 0.7119 monodelphis_monDom1 # >> # 0.9847 chicken_galGal2 # >> # 1.4357 xenopus_xenTro1 # >> # 1.6577 tetraodon_tetNig1 # >> # 1.6983 fugu_fr1 # >> # 1.7480 zebrafish_danRer3 # make output dir and run dir ssh pk cd /cluster/data/hg17/bed/multiz17way.2005-12-20 # create species list and stripped down tree for autoMZ sed 's/[a-z][a-z]*_//g; s/:[0-9\.][0-9\.]*//g; s/;//' 17way.nh > tmp.nh echo `cat tmp.nh` > tree-commas.nh echo `cat tree-commas.nh` | sed 's/ //g; s/,/ /g' > tree.nh sed 's/[()]//g; s/,/ /g' tree.nh > species.lst mkdir -p maf run cd run # stash binaries mkdir penn cp -p /cluster/bin/penn/multiz.v11.x86_64/multiz-tba/multiz penn cp -p /cluster/bin/penn/multiz.v11.x86_64/multiz-tba/maf_project penn cp -p /cluster/bin/penn/multiz.v11.x86_64/multiz-tba/autoMZ penn cat > autoMultiz.csh << 'EOF' #!/bin/csh -ef set db = hg17 set c = $1 set maf = $2 set run = `pwd` set tmp = /scratch/tmp/$db/multiz.$c set pairs = /san/sanvol1/scratch/$db/mafNet rm -fr $tmp mkdir -p $tmp cp ../{tree.nh,species.lst} $tmp pushd $tmp foreach s 
(`cat species.lst`) set in = $pairs/$s/$c.maf set out = $db.$s.sing.maf if ($s == hg17) then continue endif if (-e $in.gz) then zcat $in.gz > $out else if (-e $in) then cp $in $out else echo "##maf version=1 scoring=autoMZ" > $out endif end set path = ($run/penn $path); rehash $run/penn/autoMZ + T=$tmp E=$db "`cat tree.nh`" $db.*.sing.maf $c.maf popd cp $tmp/$c.maf $maf rm -fr $tmp 'EOF' # << happy emacs chmod +x autoMultiz.csh cat << 'EOF' > spec #LOOP ./autoMultiz.csh $(root1) {check out line+ /cluster/data/hg17/bed/multiz17way.2005-12-20/maf/$(root1).maf} #ENDLOOP 'EOF' # << happy emacs awk '{print $1}' /cluster/data/hg17/chrom.sizes > chrom.lst # REDO FOR OVERLAPS (2006-04-07 kate) mv ../maf ../maf.old # edit spec file to fix maf dir path gensub2 chrom.lst single spec jobList para create jobList # 46 files para try para check para push para time > run.time # 36 hrs (not typical -- previous runs were ~16 hrs) # PHASTCONS CONSERVATION (2006-01-05 kate) # Redone when multiz redone to fix overlaps (2006-04-12) # This process is distilled from Hiram and Adam's experiments # on mouse (mm7) 17way track. Many parameters are now fixed, without # being experimentally derived, either because the experiments # were lengthy and produced similar results, or because they # weren't runnable given the alignment size. # These parameters are: # --rho # --expected-length # --target-coverage # Also, instead of generating cons and noncons tree models, # we use a single, pre-existing tree model -- Elliot Margulies' model # from the (37-way) ENCODE alignments. # # NOTE: Redone 3/20/06, adding rheMac2 to non-informative options, # by recommendation of Adam Siepel, to correct unwanted # high conservation in regions with primate-only alignments # NOTE: reusing cluster-friendly chrom fasta files created earlier #cd /cluster/data/hg17 #foreach f (`cat chrom.lst`) #echo $f #cp $f/*.fa /cluster/bluearc/hg17/chrom #end # Split chromosome MAF's into windows and use to generate # "sufficient statistics" (ss) files for phastCons input # NOTE: as the SAN fs has lotsa space, we're leaving these # big (temp) files unzipped, to save time during phastCons run. # Note also the larger chunk sizes from previous runs -- this # reduces run-time on the split, slows down the actual phastCons # enough so jobs don't crash (jobs are very quick, just a minute # or so), and according to Adam, will produce better results. # The previous small chunks were probably required by # the phyloFit step, which we are no longer using for the # human alignments. ssh pk mkdir /cluster/data/hg17/bed/multiz17way.2005-12-20/cons cd /cluster/data/hg17/bed/multiz17way.2005-12-20/cons cp /san/sanvol1/scratch/mm7/cons/elliotsEncode.mod . 
# edit, changing rheMac1 -> rheMac2 mkdir run.split cd run.split set WINDOWS = /san/sanvol1/scratch/hg17/multiz17way.2005-12-20/cons/ss rm -fr $WINDOWS mkdir -p $WINDOWS cat << 'EOF' > doSplit.csh #!/bin/csh -ef set MAFS = /cluster/data/hg17/bed/multiz17way.2005-12-20/maf set WINDOWS = /san/sanvol1/scratch/hg17/multiz17way.2005-12-20/cons/ss cd $WINDOWS set c = $1 echo $c rm -fr $c mkdir $c /cluster/bin/phast/$MACHTYPE/msa_split $MAFS/$c.maf -i MAF \ -M /cluster/bluearc/hg17/chrom/$c.fa \ -o SS -r $c/$c -w 10000000,0 -I 1000 -B 5000 echo "Done" >> $c.done 'EOF' # << happy emacs chmod +x doSplit.csh rm -f jobList foreach f (../../maf/*.maf) set c = $f:t:r echo "doSplit.csh $c {check out line+ $WINDOWS/$c.done}" >> jobList end para create jobList # 46 jobs para try para check para push # CPU time in finished jobs: 9511s 158.52m 2.64h 0.11d 0.000 y # IO & Wait Time: 5391s 89.85m 1.50h 0.06d 0.000 y # Average job time: 324s 5.40m 0.09h 0.00d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 2354s 39.23m 0.65h 0.03d # Submission to last job: 2358s 39.30m 0.66h 0.03d # check tree model on 5MB chunk, using params recommended by Adam, # (to verify branch lengths on 2X species) # he ok'ed the results -- not necessary for next human run ssh kolossus cd /cluster/data/hg17/bed/multiz17way.2005-12-20/cons /cluster/bin/phast/$MACHTYPE/phyloFit -i SS -E -p MED -s HKY85 \ --tree "`cat ../tree-commas.nh`" \ /san/sanvol1/scratch/hg17/multiz17way.2005-12-20/cons/ss/chr7/chr7.115000658-120000000.ss \ -o phyloFit.tree # Run phastCons # This job is I/O intensive in its output files, thus it is all # working over in /scratch/tmp/ cd .. mkdir run.cons cd run.cons cat > doPhast.csh << 'EOF' #!/bin/csh -fe set c = $1 set f = $2 set len = $3 set cov = $4 set rho = $5 set tmp = /scratch/tmp/$f mkdir -p $tmp set san = /san/sanvol1/scratch/hg17/multiz17way.2005-12-20/cons cp -p $san/ss/$c/$f.ss ../elliotsEncode.mod $tmp pushd $tmp > /dev/null /cluster/bin/phast/$MACHTYPE/phastCons $f.ss elliotsEncode.mod \ --rho $rho --expected-length $len --target-coverage $cov --quiet \ --not-informative panTro1,rheMac2 \ --seqname $c --idpref $c --viterbi $f.bed --score > $f.pp popd > /dev/null mkdir -p $san/pp/$c $san/bed/$c sleep 1 mv $tmp/$f.pp $san/pp/$c mv $tmp/$f.bed $san/bed/$c rm -fr $tmp 'EOF' # emacs happy chmod a+x doPhast.csh # root1 == chrom name, file1 == ss file name without .ss suffix # Create gsub file cat > template << 'EOF' #LOOP doPhast.csh $(root1) $(file1) 14 .008 .28 #ENDLOOP 'EOF' # happy emacs # Create parasol batch and run it pushd /san/sanvol1/scratch/hg17/multiz17way.2005-12-20/cons ls -1 ss/chr*/chr*.ss | sed 's/.ss$//' > \ /cluster/data/hg17/bed/multiz17way/cons/run.cons/in.list popd gensub2 in.list single template jobList para create jobList # 333 jobs para try para check para push # NOTE: these jobs go fast -- some crashed apparently having # difficulty accessing input files. 
Just restart them and # they work #CPU time in finished jobs: 15520s 258.67m 4.31h 0.18d 0.000 y #IO & Wait Time: 15796s 263.27m 4.39h 0.18d 0.001 y #Average job time: 94s 1.57m 0.03h 0.00d #Longest running job: 0s 0.00m 0.00h 0.00d #Longest finished job: 180s 3.00m 0.05h 0.00d #Submission to last job: 48266s 804.43m 13.41h 0.56d # create Most Conserved track ssh kolossus cd /san/sanvol1/scratch/hg17/multiz17way.2005-12-20/cons # The sed's and the sort get the file names in chrom,start order # (Hiram tricks -- split into columns on [.-/] with # identifying x,y,z, to allow column sorting and # restoring the filename. Warning: the sort column # will depend on how deep you are in the dir find ./bed -name "chr*.bed" | \ sed -e "s/\// x /g" -e "s/\./ y /g" -e "s/-/ z /g" | \ sort -k7,7 -k9,9n | \ sed -e "s/ x /\//g" -e "s/ y /\./g" -e "s/ z /-/g" | \ xargs cat | \ awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", $1, $2, $3, $5, $5;}' | \ /cluster/bin/scripts/lodToBedScore /dev/stdin > mostConserved.bed # ~ 1 minute cp -p mostConserved.bed /cluster/data/hg17/bed/multiz17way/cons # load into database ssh hgwdev cd /cluster/data/hg17/bed/multiz17way/cons hgLoadBed -strict hg17 phastConsElements17way mostConserved.bed # Loaded 2212445 elements of size 5 # compare with previous tracks hgsql hg17 -e "select count(*) from phastConsElements10way" # 2011952 hgsql hg17 -e "select count(*) from phastConsElements" # 1601903 # Try for 5% overall cov, and 70% CDS cov (used elen=14, tcov=.008, rho=.28) featureBits hg17 -enrichment refGene:cds phastConsElements17way # refGene:cds 1.065%, phastConsElements17way 5.116%, both 0.759%, cover 71.27%, enrich 13.93x # compare with previous tracks featureBits hg17 -enrichment refGene:cds phastConsElements10way # refGene:cds 1.062%, phastConsElements10way 5.003%, both 0.734%, cover 69.18%, enrich 13.83x featureBits hg17 -enrichment refGene:cds phastConsElements # refGene:cds 1.062%, phastConsElements 4.810%, both 0.771%, cover 72.65%, enrich 15.11x # experiments # previous tracks featureBits hg17 -enrichment refGene:cds -chrom=chr7 phastConsElements # refGene:cds 0.873%, phastConsElements 4.497%, both 0.630%, cover 72.10%, enrich 16.04x hgsql hg17 -e "select count(*) from phastConsElements where chrom='chr7'" # 81785 featureBits hg17 -enrichment refGene:cds -chrom=chr7 phastConsElements10way # refGene:cds 0.873%, phastConsElements10way 4.700%, both 0.602%, cover 68.94%, enrich 14.67x hgsql hg17 -e "select count(*) from phastConsElements10way where chrom='chr7'" # 102959 # len=13, cov=.007, rho=.27 # looks best -- similar chr7 measurements to previous tracks featureBits hg17 -enrichment refGene:cds -chrom=chr7 phastConsElements17way_13_007_27 # refGene:cds 0.874%, phastConsElements17way_13_007_27 4.854%, both 0.607%, cover 69.43%, enrich 14.31x hgsql hg17 -e "select count(*) from phastConsElements17way_13_007_27 where chrom='chr7'" featureBits hg17 -enrichment refGene:cds -chrom=chr7 phastConsElements17way_13_005_28 # refGene:cds 0.873%, phastConsElements17way_13_005_28 4.802%, both 0.612%, cover 70.12%, enrich 14.60x hgsql hg17 -e "select count(*) from phastConsElements17way_13_005_28 where chrom='chr7'" # 95203 # experiments with other parameters, below # len=15, cov=.10 featureBits hg17 -enrichment refGene:cds -chrom=chr7 phastConsElements17way_15_10 # refGene:cds 0.873%, phastConsElements17way 7.989%, both 0.627%, cover 71.77%, enrich 8.98x hgsql hg17 -e "select count(*) from phastConsElements17way_15_10 where chrom='chr7'" # 217767 # => too much overall covg, and too many 
elements # len=15, cov=.05 featureBits hg17 -enrichment refGene:cds -chrom=chr7 phastConsElements17way_15_05 # refGene:cds 0.873%, phastConsElements17way_15_05 6.880%, both 0.627%, cover 71.77%, enrich 10.43x hgsql hg17 -e "select count(*) from phastConsElements17way_15_05 where chrom='chr7'" # 166868 # len=15, cov=.01 # These values were used by Elliott for ENCODE featureBits hg17 -enrichment refGene:cds -chrom=chr7 phastConsElements17way_15_01 # refGene:cds 0.873%, phastConsElements17way_15_01 5.721%, both 0.628%, cover 71.89%, enrich 12.57x hgsql hg17 -e "select count(*) from phastConsElements17way_15_01 where chrom='chr7'" # 106034 # len=20, cov=.01 featureBits hg17 -enrichment refGene:cds -chrom=chr7 phastConsElements17way_20_01 # refGene:cds 0.873%, phastConsElements17way_20_01 7.751%, both 0.634%, cover 72.56%, enrich 9.36x hgsql hg17 -e "select count(*) from phastConsElements17way_20_01 where chrom='chr7'" # 106005 # -> wrong direction on coverage # len=10, cov=.01 featureBits hg17 -enrichment refGene:cds -chrom=chr7 phastConsElements17way_10_01 # refGene:cds 0.873%, phastConsElements17way_10_01 4.653%, both 0.616%, cover 70.48%, enrich 15.15x hgsql hg17 -e "select count(*) from phastConsElements17way_10_01 where chrom='chr7'" # 108279 # => looks good on coverage and element count, check smoothness in browser # => undersmoothed # len=10, cov=.05 featureBits hg17 -enrichment refGene:cds -chrom=chr7 phastConsElements17way_10_05 # refGene:cds 0.873%, phastConsElements17way_10_05 5.365%, both 0.615%, cover 70.44%, enrich 13.13x hgsql hg17 -e "select count(*) from phastConsElements17way_10_05 where chrom='chr7'" # 178372 # => fragmented elements # len=15, cov=.005 featureBits hg17 -enrichment refGene:cds -chrom=chr7 phastConsElements17way_15_005 # refGene:cds 0.873%, phastConsElements17way_15_005 5.444%, both 0.628%, cover 71.93%, enrich 13.21x hgsql hg17 -e "select count(*) from phastConsElements17way_15_005 where chrom='chr7'" # 90855 # len=20, cov=.005 featureBits hg17 -enrichment refGene:cds -chrom=chr7 phastConsElements17way_20_005 # refGene:cds 0.873%, phastConsElements17way_20_005 7.373%, both 0.634%, cover 72.61%, enrich 9.85x hgsql hg17 -e "select count(*) from phastConsElements17way_20_005 where chrom='chr7'" # 91858 # len=17, cov=.005 rho=.3 featureBits hg17 -enrichment refGene:cds -chrom=chr7 phastConsElements17way_17_005 # refGene:cds 0.873%, phastConsElements17way_17_005 6.126%, both 0.631%, cover 72.24%, enrich 11.79x hgsql hg17 -e "select count(*) from phastConsElements17way_17_005 where chrom='chr7'" # 91243 # len=12, cov=.01, rho=.28 -panTro1 featureBits hg17 -enrichment refGene:cds -chrom=chr7 phastConsElements17way_12_01_28_p # refGene:cds 0.873%, phastConsElements17way_12_01_28_p 4.829%, both 0.612%, cover 70.02%, enrich 14.50x hgsql hg17 -e "select count(*) from phastConsElements17way_12_01_28_p where chrom='chr7'" # 123638 # len=13, cov=.01, rho=.25 -panTro1 featureBits hg17 -enrichment refGene:cds -chrom=chr7 phastConsElements17way_13_01_25_p # refGene:cds 0.873%, phastConsElements17way_13_01_25_p 4.793%, both 0.594%, cover 67.99%, enrich 14.19x hgsql hg17 -e "select count(*) from phastConsElements17way_13_01_25_p where chrom='chr7'" # 131895 # len=14, cov=.008, rho=.28 featureBits hg17 -enrichment refGene:cds -chrom=chr7 phastConsElements17way_14_008_28 # refGene:cds 0.874%, phastConsElements17way_14_008_28 5.227%, both 0.615%, cover 70.37%, enrich 13.46x hgsql hg17 -e "select count(*) from phastConsElements17way_14_008_28 where chrom='chr7'" # 106071 # Create 
merged posterior probability file and wiggle track data files
# pk is currently closer to the san than any other machine
ssh pk
cd /san/sanvol1/scratch/hg17/multiz17way.2005-12-20/cons/
# sort by chromName, chromStart so that items are in numerical order
# for wigEncode
#next time try Angie's simpler sort, below
find ./pp -name "chr*.pp" | \
sed -e "s/\// x /g" -e "s/\./ y /g" -e "s/-/ z /g" | \
sort -k7,7 -k9,9n | \
sed -e "s/ x /\//g" -e "s/ y /\./g" -e "s/ z /-/g" | \
xargs cat | \
nice wigEncode stdin phastCons17way.wig phastCons17way.wib
# about 23 minutes for above
# GOT HERE ON REDO
# NOTE: remember to flip /gbdb link from cons.old to cons
#foreach chr (`awk '{print $1}' /cluster/data/hg17/chrom.sizes`)
#echo $chr
set chr = chr22
cat `ls -1 pp/$chr/$chr.*.pp | sort -t\. -k2,2n` \
| nice wigEncode stdin phastCons17wayNewChr22.wig phastCons17wayNewChr22.wib
#end
date
cp -p phastCons17way.wi? /cluster/data/hg17/bed/multiz17way/cons
# Load gbdb and database with wiggle.
ssh hgwdev
cd /cluster/data/hg17/bed/multiz17way/cons
ln -s /cluster/data/hg17/bed/multiz17way/cons/phastCons17way.wib \
/gbdb/hg17/multiz17way/phastCons17way.wib
hgLoadWiggle -pathPrefix=/gbdb/hg17/multiz17way hg17 \
phastCons17way phastCons17way.wig
############################################################################
## Run phastCons on Placental mammals
ssh pk
cd /cluster/data/hg17/bed/multiz17way.2005-12-20/cons
mkdir placental
mkdir run.cons.alt
cd run.cons.alt
# create pruned trees
set tree_doctor = /cluster/bin/phast/tree_doctor
sed 's/ /,/g' ../../species.lst
# hg17,panTro1,rheMac2,rn3,mm7,oryCun1,bosTau2,canFam2,dasNov1,loxAfr1,echTel1,monDom2,galGal2,xenTro1,tetNig1,fr1,danRer3
mkdir placental
$tree_doctor ../elliotsEncode.mod \
--prune-all-but=hg17,panTro1,rheMac2,rn3,mm7,oryCun1,bosTau2,canFam2,dasNov1,loxAfr1,echTel1 \
> placental/placental.mod
cat > doPhast.csh << 'EOF'
#!/bin/csh -fe
set c = $1
set f = $2
set len = $3
set cov = $4
set rho = $5
set grp = $6
set tmp = /scratch/tmp/hg17/$grp/$f
mkdir -p $tmp
set san = /san/sanvol1/scratch/hg17/multiz17way.2005-12-20/cons
cp -p $san/ss/$c/$f.ss $grp/$grp.mod $tmp
pushd $tmp > /dev/null
/cluster/bin/phast/$MACHTYPE/phastCons $f.ss $grp.mod \
--rho $rho --expected-length $len --target-coverage $cov --quiet \
--not-informative panTro1,rheMac2 \
--seqname $c --idpref $c --viterbi $f.bed --score > $f.pp
popd > /dev/null
mkdir -p $san/$grp/pp/$c $san/$grp/bed/$c
sleep 1
mv $tmp/$f.pp $san/$grp/pp/$c
mv $tmp/$f.bed $san/$grp/bed/$c
rm -fr $tmp
'EOF'
# << emacs happy
chmod a+x doPhast.csh
# Create gsub file
cat > template << 'EOF'
#LOOP
# template for 5% cov
doPhast.csh $(root1) $(file1) 14 .2 .28 placental
#ENDLOOP
'EOF'
cat > template << 'EOF'
#LOOP
# template same as vertebrate
doPhast.csh $(root1) $(file1) 14 .008 .28 placental
#ENDLOOP
'EOF'
# happy emacs
# Create parasol batch and run it
pushd /san/sanvol1/scratch/hg17/multiz17way.2005-12-20/cons
ls -1 ss/chr*/chr*.ss | sed 's/.ss$//' > \
/cluster/data/hg17/bed/multiz17way/cons/run.cons.alt/in.list
popd
gensub2 in.list single template jobList
para create jobList
# 333 jobs
para try
para check
para push
#.2
#CPU time in finished jobs:      15164s     252.74m     4.21h    0.18d  0.000 y
#IO & Wait Time:                 14852s     247.53m     4.13h    0.17d  0.000 y
#Average job time:                  90s       1.50m     0.03h    0.00d
#Longest running job:                0s       0.00m     0.00h    0.00d
#Longest finished job:             170s       2.83m     0.05h    0.00d
#Submission to last job:         86364s    1439.40m    23.99h    1.00d
#.008
#CPU time in finished jobs:      13712s     228.53m     3.81h    0.16d  0.000 y
#IO & Wait Time:                 14407s     240.12m     4.00h    0.17d  0.000 y
#Average job time:                  84s       1.41m     0.02h    0.00d
#Longest running job:                0s       0.00m     0.00h    0.00d
#Longest finished job:             159s       2.65m     0.04h    0.00d
#Submission to last job:          5291s      88.18m     1.47h    0.06d
ssh pk
cd /san/sanvol1/scratch/hg17/multiz17way.2005-12-20/cons/placental
# The sed's and the sort get the file names in chrom,start order
# (Hiram tricks -- split into columns on [.-/] with
# identifying x,y,z, to allow column sorting and
# restoring the filename. Warning: the sort column
# will depend on how deep you are in the dir
find ./bed -name "chr*.bed" | \
sed -e "s/\// x /g" -e "s/\./ y /g" -e "s/-/ z /g" | \
sort -k7,7 -k9,9n | \
sed -e "s/ x /\//g" -e "s/ y /\./g" -e "s/ z /-/g" | \
xargs cat | \
awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", $1, $2, $3, $5, $5;}' | \
/cluster/bin/scripts/lodToBedScore /dev/stdin > mostConserved.bed
# ~ 1 minute
cp -p mostConserved.bed /cluster/data/hg17/bed/multiz17way/cons/placental
# load into database
ssh hgwdev
cd /cluster/data/hg17/bed/multiz17way/cons/placental
hgLoadBed -strict hg17 phastConsElementsPlacental mostConserved.bed
# .2
# Loaded 3775983 elements of size 5
# .008
# Loaded 1290060 elements of size 5
# compare with vertebrate cons
hgsql hg17 -e "select count(*) from phastConsElements17way"
# 2212445
featureBits hg17 -enrichment refGene:cds phastConsElementsPlacental
featureBits hg17 -enrichment refGene:cds phastConsElementsPlacental_14_2_28
featureBits hg17 -enrichment refGene:cds phastConsElements17way
featureBits hg17 -enrichment refGene:cds phastConsElementsPlacental
# refGene:cds 1.070%, phastConsElementsPlacental 3.844%, both 0.667%, cover 62.32%, enrich 16.21x
# refGene:cds 1.069%, phastConsElementsPlacental_14_008_28 3.844%, both 0.667%, cover 62.37%, enrich 16.22x
featureBits hg17 -enrichment refGene:cds phastConsElementsPlacental_14_2_28
#refGene:cds 1.070%, phastConsElementsPlacental_14_2_28 5.223%, both 0.691%, cover 64.62%, enrich 12.37x
featureBits hg17 -enrichment refGene:cds phastConsElements17way
#refGene:cds 1.070%, phastConsElements17way 5.116%, both 0.763%, cover 71.27%, enrich 13.93x
# Create merged posterior probability file and wiggle track data files
# pk is currently closer to the san than any other machine
ssh pk
cd /san/sanvol1/scratch/hg17/multiz17way.2005-12-20/cons/placental
# sort by chromName, chromStart so that items are in numerical order
# for wigEncode
#next time try Angie's simpler sort, below
find ./pp -name "chr*.pp" | \
sed -e "s/\// x /g" -e "s/\./ y /g" -e "s/-/ z /g" | \
sort -k7,7 -k9,9n | \
sed -e "s/ x /\//g" -e "s/ y /\./g" -e "s/ z /-/g" | \
xargs cat | \
nice wigEncode stdin phastConsPlacental.wig phastConsPlacental.wib
# about 23 minutes for above
# GOT HERE ON REDO
# NOTE: remember to flip /gbdb link from cons.old to cons
cp -p phastConsPlacental.wi? \
/cluster/data/hg17/bed/multiz17way/cons/placental
# Load gbdb and database with wiggle.
ssh hgwdev cd /cluster/data/hg17/bed/multiz17way/cons/placental ln -s \ /cluster/data/hg17/bed/multiz17way/cons/placental/phastConsPlacental.wib \ /gbdb/hg17/multiz17way/phastConsPlacental.wib hgLoadWiggle -pathPrefix=/gbdb/hg17/multiz17way hg17 \ phastConsPlacental phastConsPlacental.wig ############################################################################ ## Run phastCons on subgroups (mammals, placentals, and w/o low-cov) ssh pk cd /cluster/data/hg17/bed/multiz17way.2005-12-20/cons mkdir run.cons.groups cd run.cons.groups # create pruned trees set tree_doctor = /cluster/bin/phast/tree_doctor sed 's/ /,/g' ../../species.lst # hg17,panTro1,rheMac2,rn3,mm7,oryCun1,bosTau2,canFam2,dasNov1,loxAfr1,echTel1,monDom2,galGal2,xenTro1,tetNig1,fr1,danRer3 $tree_doctor ../elliotsEncode.mod \ --prune-all-but=hg17,panTro1,rheMac2,rn3,mm7,bosTau2,canFam2,monDom2,galGal2,xenTro1,tetNig1,fr1,danRer3 \ > vertebrate-high.mod $tree_doctor ../elliotsEncode.mod \ --prune-all-but=hg17,panTro1,rheMac2,rn3,mm7,bosTau2,canFam2,monDom2 \ > mammal-high.mod $tree_doctor ../elliotsEncode.mod \ --prune-all-but=hg17,panTro1,rheMac2,rn3,mm7,bosTau2,canFam2 \ > placental-high.mod $tree_doctor ../elliotsEncode.mod \ --prune-all-but=hg17,panTro1,rheMac2,rn3,mm7,oryCun1,bosTau2,canFam2,dasNov1,loxAfr1,echTel1,monDom2 \ > mammal.mod $tree_doctor ../elliotsEncode.mod \ --prune-all-but=hg17,panTro1,rheMac2,rn3,mm7,oryCun1,bosTau2,canFam2,dasNov1,loxAfr1,echTel1 \ > placental.mod foreach f (*.mod) mkdir $f:r mv $f $f:r end cat > doPhast.csh << 'EOF' #!/bin/csh -fe set c = $1 set f = $2 set len = $3 set cov = $4 set rho = $5 set grp = $6 set tmp = /scratch/tmp/hg17/$grp/$f mkdir -p $tmp set san = /san/sanvol1/scratch/hg17/multiz17way.2005-12-20/cons cp -p $san/ss/$c/$f.ss $grp/$grp.mod $tmp pushd $tmp > /dev/null /cluster/bin/phast/$MACHTYPE/phastCons $f.ss $grp.mod \ --rho $rho --expected-length $len --target-coverage $cov --quiet \ --not-informative panTro1,rheMac2 \ --seqname $c --idpref $c --viterbi $f.bed --score > $f.pp popd > /dev/null mkdir -p $san/$grp/pp/$c $san/$grp/bed/$c sleep 1 mv $tmp/$f.pp $san/$grp/pp/$c mv $tmp/$f.bed $san/$grp/bed/$c rm -fr $tmp 'EOF' # emacs happy chmod a+x doPhast.csh # root1 == chrom name, file1 == ss file name without .ss suffix # Create gsub file cat > template << 'EOF' #LOOP doPhast.csh $(root1) $(file1) 14 .21 .28 placental-high doPhast.csh $(root1) $(file1) 14 .2 .28 placental doPhast.csh $(root1) $(file1) 14 .11 .28 mammal doPhast.csh $(root1) $(file1) 14 .1 .28 mammal-high doPhast.csh $(root1) $(file1) 14 .0028 .28 vertebrate-high #ENDLOOP 'EOF' # happy emacs # Create parasol batch for just chr7 (for test purposes) and run it pushd /san/sanvol1/scratch/hg17/multiz17way.2005-12-20/cons ls -1 ss/chr7/chr*.ss | sed 's/.ss$//' > \ /cluster/data/hg17/bed/multiz17way/cons/run.cons.groups/in.list popd gensub2 in.list single template jobList para create jobList # 80 jobs para try para check para push # 24 minutes ## create Alt Most Conserved track ssh hgwdev cd /san/sanvol1/scratch/hg17/multiz17way.2005-12-20/cons cat > loadAltElements.csh << 'EOF' set b = /cluster/data/hg17/bed/multiz17way/cons/run.cons.groups foreach d (mammal* placental* vertebrate*) echo $d cd $d find ./bed -name "chr*.bed" | \ sed -e "s/\// x /g" -e "s/\./ y /g" -e "s/-/ z /g" | \ sort -k7,7 -k9,9n | \ sed -e "s/ x /\//g" -e "s/ y /\./g" -e "s/ z /-/g" | \ xargs cat | \ awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", $1, $2, $3, $5, $5;}' | \ /cluster/bin/scripts/lodToBedScore /dev/stdin \ > $b/$d/mostConserved.bed set 
table = `echo $d | perl -wpe "s/(.*)/phastConsElements\u$1/;s/-(.*)/\u$1/"`
hgLoadBed -strict hg17 $table $b/$d/mostConserved.bed
featureBits hg17 -enrichment refGene:cds -chrom=chr7 $table
cd ..
end
'EOF'
csh loadAltElements.csh >&! loadAltElements.log &
grep refGene loadAltElements.log | sort -n -k4
# refGene:cds 0.884%, phastConsElementsPlacentalHigh 4.828%, both 0.606%, cover 68.51%, enrich 14.19x
# refGene:cds 0.884%, phastConsElementsMammal 4.869%, both 0.580%, cover 65.62%, enrich 13.48x
# refGene:cds 0.884%, phastConsElementsMammalHigh 4.887%, both 0.624%, cover 70.60%, enrich 14.45x
# refGene:cds 0.884%, phastConsElementsPlacental 4.904%, both 0.558%, cover 63.14%, enrich 12.88x
# refGene:cds 0.884%, phastConsElementsVertebrateHigh 4.965%, both 0.652%, cover 73.74%, enrich 14.85x
featureBits hg17 -enrichment refGene:cds -chrom=chr7 phastConsElements17way
# refGene:cds 0.884%, phastConsElements17way 4.851%, both 0.623%, cover 70.48%, enrich 14.53x
ssh kkstore02
cd /cluster/data/hg17/bed/multiz17way/cons/run.cons.groups
cat > makeAltWiggle.csh << 'EOF'
set b = `pwd`
set san = /san/sanvol1/scratch/hg17/multiz17way/cons
pushd $san
foreach d (mammal* placental* vertebrate*)
echo $d
cd $d
set table = `echo $d | perl -wpe 's/(.*)/phastCons\u$1/;s/-(.*)/\u$1/'`
echo $table
find ./pp -name "chr*.pp" | \
sed -e "s/\// x /g" -e "s/\./ y /g" -e "s/-/ z /g" | \
sort -k7,7 -k9,9n | \
sed -e "s/ x /\//g" -e "s/ y /\./g" -e "s/ z /-/g" | \
xargs cat | \
nice wigEncode stdin $table.wig $table.wib
mv $table.wig $table.wib $b/$d
cd ..
end
popd
'EOF'
csh makeAltWiggle.csh >&! makeAltWiggle.log &
ssh hgwdev
cd /cluster/data/hg17/bed/multiz17way/cons/run.cons.groups
cat > loadAltWiggle.csh << 'EOF'
set b = `pwd`
foreach d (mammal* placental* vertebrate*)
echo $d
cd $d
set table = `echo $d | perl -wpe 's/(.*)/phastCons\u$1/;s/-(.*)/\u$1/'`
echo $table
ln -s `pwd`/$table.wib /gbdb/hg17/multiz17way
hgLoadWiggle -pathPrefix=/gbdb/hg17/multiz17way hg17 $table $table.wig
cd ..
end
'EOF'
csh loadAltWiggle.csh >&! loadAltWiggle.log &
# Create parasol batch for just chr7 (for test purposes) and run it
pushd /san/sanvol1/scratch/hg17/multiz17way.2005-12-20/cons
ls -1 ss/chr7/chr*.ss | sed 's/.ss$//' > \
/cluster/data/hg17/bed/multiz17way/cons/run.cons.groups/in.list
popd
gensub2 in.list single template jobList
para create jobList
# 80 jobs
para try
para check
para push
# 24 minutes
# Downloads (2006-02-22 kate)
ssh hgwdev
cd /cluster/data/hg17/bed/multiz17way
mkdir mafDownloads
cd mafDownloads
# upstream mafs
cat > mafFrags.csh << 'EOF'
date
foreach i (1000 2000 5000)
echo "making upstream$i.maf"
nice featureBits hg17 refGene:upstream:$i -fa=/dev/null -bed=up.bad
awk -F '\t' '{printf("%s\t%s\t%s\t%s\t%s\t%s\n", $1, $2, $3, substr($4, 0, 9), 0, $5)}' up.bad > up.bed
rm up.bad
nice mafFrags hg17 multiz17way up.bed upstream$i.maf \
-orgs=../species.lst
nice gzip upstream$i.maf
rm up.bed
end
date
'EOF'
time csh mafFrags.csh >&! mafFrags.log &
# ~1 hour
ssh kkstore02
cd /cluster/data/hg17/bed/multiz17way/mafDownloads
cat > downloads.csh << 'EOF'
date
foreach f (../maf/chr*.maf)
set c = $f:t:r
echo $c
nice gzip -c $f > $c.maf.gz
end
md5sum *.gz > md5sum.txt
date
'EOF'
time csh downloads.csh >&! downloads.log
# ~2 hours
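# (Aside, not part of the original run: two idioms used repeatedly above.
# The csh modifiers in "set c = $f:t:r" strip the directory (:t, tail) and
# the last extension (:r, root), so f=../maf/chr21.maf gives c=chr21.
# The perl substitution that builds table names upper-cases the character
# following \u and drops the hyphen, e.g.
#   echo mammal-high | perl -wpe 's/(.*)/phastCons\u$1/;s/-(.*)/\u$1/'
# prints phastConsMammalHigh.)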
# GOT HERE
ssh hgwdev
set dir = /usr/local/apache/htdocs/goldenPath/hg17/multiz17way
mkdir $dir
ln -s /cluster/data/hg17/bed/multiz17way/mafDownloads/{*.gz,md5sum.txt} $dir
cp /usr/local/apache/htdocs/goldenPath/mm7/multiz17way/README.txt $dir
# edit README
# PHASTCONS SCORES DOWNLOADABLES FOR 17WAY (2006-03-20 kate)
ssh kkstore02
cd /cluster/data/hg17/bed/multiz17way
mkdir phastConsDownloads
cd phastConsDownloads
cat > downloads.csh << 'EOF'
date
cd /san/sanvol1/scratch/hg17/multiz17way.2005-12-20/cons/pp
foreach chr (`awk '{print $1}' /cluster/data/hg17/chrom.sizes`)
echo $chr
cat `ls -1 $chr/$chr.*.pp | sort -t\. -k2,2n` \
| nice gzip -c \
> /cluster/data/hg17/bed/multiz17way/phastConsDownloads/$chr.gz
end
date
'EOF'
csh downloads.csh >&! downloads.log &
# ~20 minutes
# << happy emacs
ssh hgwdev
cd /cluster/data/hg17/bed/multiz17way/phastConsDownloads
md5sum *.gz > md5sum.txt
set dir = /usr/local/apache/htdocs/goldenPath/hg17/phastCons17way
mkdir $dir
ln -s /cluster/data/hg17/bed/multiz17way/phastConsDownloads/{*.gz,md5sum.txt} $dir
cp /usr/local/apache/htdocs/goldenPath/hg17/phastCons/README.txt $dir
# edit
# UPDATE MONKEY DOWNLOADS (2006-01-12 kate)
# EXTRACT AXT'S AND MAF'S FROM THE RheMac1 NET
# The chr1 was hugely oversized -- the others were OK, but
# axt's were numbered oddly.
ssh kkstore2
cd /cluster/data/hg17/bed/blastz.rheMac1/axtChain
gunzip -c hg17.rheMac1.net.gz | netSplit stdin humanNet
gunzip -c hg17.rheMac1.all.chain.gz | chainSplit chain stdin
mkdir ../axtNet.new ../mafNet.new
cat > makeMaf.csh << 'EOF'
foreach f (humanNet/chr*.net)
set c = $f:t:r
echo "axtNet on $c"
netToAxt humanNet/$c.net chain/$c.chain /cluster/data/hg17/nib /cluster/data/rheMac1/rheMac1.2bit stdout | axtSort stdin ../axtNet.new/$c.axt
axtToMaf ../axtNet.new/$c.axt \
/cluster/data/hg17/chrom.sizes /cluster/data/rheMac1/chrom.sizes \
../mafNet.new/$c.maf -tPrefix=hg17. -qPrefix=rheMac1.
end
cp -rp ../mafNet.new /san/sanvol1/scratch/hg17/mafNet/rheMac1.new
'EOF'
csh makeMaf.csh >&! makeMaf.log &
tail -100f makeMaf.log
pushd /san/sanvol1/scratch/hg17/mafNet
rm -fr rheMac1
mv rheMac1.new rheMac1
popd
rm -fr axtNet
mv axtNet.new axtNet
cd axtNet
nice gzip *.axt
md5sum *.gz > md5sum.txt
# cleanup
cd ..
rm -fr chain humanNet
ssh hgwdev
ln -s /cluster/data/hg17/bed/blastz.rheMac1/axtNet \
/usr/local/apache/htdocs/goldenPath/rheMac1/axtNet
# Request push to downloads server
# UPDATE OPOSSUM DOWNLOADS (2006-01-17 kate)
# Fix overlaps
ssh kkstore2
cd /cluster/data/hg17/bed/blastz.monDom1
mv axtNet axtNet.old
mv mafNet mafNet.old
mkdir axtNet mafNet
cd axtChain/chain
nice gunzip *.gz
cd ..
nice gunzip -c human.net.gz | netSplit stdin humanNet
cat > makeMaf.csh << 'EOF'
foreach f (humanNet/chr*.net)
set c = $f:t:r
echo "axtNet on $c"
netToAxt humanNet/$c.net chain/$c.chain /cluster/data/hg17/nib /cluster/data/monDom1/monDom1.2bit stdout | axtSort stdin ../axtNet/$c.axt
axtToMaf ../axtNet/$c.axt \
/cluster/data/hg17/chrom.sizes /cluster/data/monDom1/chrom.sizes \
../mafNet/$c.maf -tPrefix=hg17. -qPrefix=monDom1.
end
cp -rp ../mafNet /san/sanvol1/scratch/hg17/mafNet/monDom1.new
'EOF'
csh makeMaf.csh >&! makeMaf.log &
tail -100f makeMaf.log
pushd /san/sanvol1/scratch/hg17/mafNet
rm -fr monDom1
mv monDom1.new monDom1
popd
rm -fr axtNet
mv axtNet.new axtNet
cd axtNet
nice gzip *.axt
md5sum *.gz > md5sum.txt
# cleanup
cd ..
rm -fr chain humanNet ssh hgwdev ln -s /cluster/data/hg17/bed/blastz.monDom1/axtNet \ /usr/local/apache/htdocs/goldenPath/monDom1/axtNet # Request push to downloads server # UPDATE COW DOWNLOADS (2006-01-17 kate) # Fix overlaps ssh kkstore2 cd /cluster/data/bosTau1/bed/zb.hg17 mv axtNet axtNet.old mv mafNet mafNet.old mkdir axtNet mafNet cat > makeMaf.csh << 'EOF' foreach f (net/chr*.net) set c = $f:t:r echo "axtNet on $c" netToAxt net/$c.net chain/$c.chain /cluster/data/hg17/nib /cluster/data/bosTau1/bosTau1.2bit stdout | axtSort stdin ../axtNet/$c.axt axtToMaf ../axtNet/$c.axt \ /cluster/data/hg17/chrom.sizes /cluster/data/bosTau1/chrom.sizes \ ../mafNet/$c.maf -tPrefix=hg17. -qPrefix=bosTau1. end 'EOF' csh makeMaf.csh >&! makeMaf.log & tail -100f makeMaf.log cd axtNet nice gzip *.axt md5sum *.gz > md5sum.txt ssh hgwdev ln -s /cluster/data/hg17/bed/blastz.bosTau1/axtNet \ /usr/local/apache/htdocs/goldenPath/bosTau1/axtNet # Request push to downloads server ##### UPDATE hg17 knownToVisiGene (2006-01-21 galt) # Create table that maps between known genes and visiGene database # mapping to other species such as mouse, zebrafish, frog # requires visiGene probe track vgImageProbes be created first knownToVisiGene hg17 -fromProbePsl=vgImageProbes ##### UPDATE hg17 mmBlastTab (2006-01-22 galt) # Make the protein seqs from mm7.knownGenePep cd /cluster/data/hg17/bed/geneSorter/blastp mkdir mm7 cd mm7 pepPredToFa mm7 knownGenePep known.faa # You may need to build this binary in src/hg/near/pepPredToFa /cluster/bluearc/blast229/formatdb -i known.faa -t known -n known mkdir -p /cluster/panasas/home/store/mm7/blastp/ cp known.* /cluster/panasas/home/store/mm7/blastp/ # Make parasol run directory ssh kk cd /cluster/data/hg17/bed/geneSorter/blastp/mm7 mkdir run cd run mkdir out # Make blast script # NOTE!! left off " b 1" from the end of the script because # we wanted to be able to get the near-best, not just the best one. cat << '_EOF_' > blastSome #!/bin/sh BLASTMAT=/cluster/bluearc/blast229/data /cluster/bluearc/blast229/blastall \ -p blastp -d /cluster/panasas/home/store/mm7/blastp/known \ -i $1 -o $2 -e 0.001 -m 8 '_EOF_' # << keep emacs happy chmod a+x blastSome # Make gensub2 file cat << '_EOF_' > gsub #LOOP blastSome {check in line+ $(path1)} {check out line out/$(root1).tab} #ENDLOOP '_EOF_' # << keep emacs happy # Create parasol batch # this echo trick is used because otherwise the command line is # too long and you can not do a simple ls echo ../../split/*.fa | wordLine stdin > split.lst gensub2 split.lst single gsub jobList para create jobList para try para check para push ... etc ... Completed: 7735 of 7735 jobs CPU time in finished jobs: 97096s 1618.26m 26.97h 1.12d 0.003 y IO & Wait Time: 564656s 9410.94m 156.85h 6.54d 0.018 y Average job time: 86s 1.43m 0.02h 0.00d Longest running job: 0s 0.00m 0.00h 0.00d Longest finished job: 240s 4.00m 0.07h 0.00d Submission to last job: 1272s 21.20m 0.35h 0.01d # Load into database. ssh hgwdev cd /cluster/data/hg17/bed/geneSorter/blastp/mm7/run/out hgLoadBlastTab hg17 mmBlastTab -maxPer=1 *.tab Scanning through 7735 files Loading database with 33306 rows # changed mm6 to mm7 in src/hg/hgGene/hgGeneData/Human/hg17/otherOrgs.ra # and checked it in. 
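# (Aside, not part of the original run: blastall -m 8 writes one
# tab-separated line per HSP with 12 columns -- query id, subject id,
# percent identity, alignment length, mismatches, gap openings, query
# start/end, subject start/end, e-value, bit score -- which is the tabular
# format hgLoadBlastTab reads; -maxPer=1 keeps at most one hit per query.)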
# hgLoadBlastTab hg17 mmBlastTabTopN -maxPer=250 *.tab # (not done, this was only used for research) # hgLoadBlastTab hg17 mmBlastNearBest -topPercent=5 *.tab > hgMmNearBest.stats # (this will be the new way to go) Reading seq lengths from hg17.knownGenePep Finding max gene combined-coverage scores in 7735 files Scanning through 7735 files Loading database with 51520 rows ########################################################################## # MYTOUCH FIX - jen - 2006-01-24 sudo mytouch hg17 gencodeGeneClassJun05 0508301200.00 note - gencodeGeneClassJun05 table on dev only sudo mytouch hg17 knownGeneLink 0506050000.00 sudo mytouch hg17 ensGtp 0505241200.00 sudo mytouch hg17 ccdsInfo 0505241200.00 ########################################################################## # BLASTZ OPOSSUM monDom2 (WORKING - 2006-01-23 - Hiram) ssh kk # running out of disk space on store5: [hiram@kk /cluster/data/hg17/bed] df -h . #Filesystem Size Used Avail Use% Mounted on # 1.5T 1.3T 79G 95% /cluster/store5 # So, keep this elsewhere, and symlink it: cd /cluster/data/hg17/bed ln -s /cluster/store9/hg17/bed/blastzMonDom2.2006-01-23 \ ./blastzMonDom2.2006-01-23 ln -s blastzMonDom2.2006-01-23 blastz.monDom2 cd /cluster/data/hg17/bed/blastzMonDom2.2006-01-23 cat << '_EOF_' > DEF # human vs. opossum export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/i386:/parasol/bin BLASTZ=blastz.v7 # Specific settings for chicken (per Webb email to Brian Raney) BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=10000 BLASTZ_K=2200 BLASTZ_Q=/cluster/data/blastz/HoxD55.q BLASTZ_ABRIDGE_REPEATS=0 # TARGET: Human (hg17) SEQ1_DIR=/scratch/hg/hg17/bothMaskedNibs SEQ1_LEN=/cluster/data/hg17/chrom.sizes SEQ1_IN_CONTIGS=0 SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY: Opossum monDom2 SEQ2_DIR=/scratch/hg/monDom2/monDom2.2bit SEQ2_LEN=/scratch/hg/monDom2/chrom.sizes SEQ2_IN_CONTIGS=0 SEQ2_CHUNK=30000000 SEQ2_LIMIT=100 SEQ2_LAP=0 BASE=/cluster/data/hg17/bed/blastzMonDom2.2006-01-23 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs cd /cluster/data/hg17/bed/blastzMonDom2.2006-01-23 time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=kk -chainMinScore=5000 -chainLinearGap=loose \ `pwd`/DEF > blastz.out 2>&1 & # real 1122m44.191s # failed during the load of chr19 # hgLoadChain hg17 chr19_chainMonDom2 chr19.chain # Out of memory needMem - request size 56 bytes # So, go to kolossus: ssh kolossus # There isn't any hg17 db here yet, get it established with a # chromInfo and a 2bit sequence: hgsql -e "create database hg17;" mysql cd /cluster/data/hg17 twoBitInfo hg17.2bit stdout | awk '{printf "%s\t%s\t/gbdb/hg17/hg17.2bit\n", $1,$2}' \ > chromInfo.kolossus.tab hgsql hg17 < $HOME/kent/src/hg/lib/chromInfo.sql hgsql hg17 \ -e 'load data local infile "chromInfo.kolossus.tab" into table chromInfo;' # it appears /gbdb/hg17 already exists ln -s /cluster/data/hg17/hg17.2bit /gbdb/hg17/hg17.2bit # now, loading only chr19: cd /cluster/data/hg17/bed/blastzMonDom2.2006-01-23/axtChain hgLoadChain hg17 chr19_chainMonDom2 chain/chr19.chain # real 33m31.689s # while that is running, back on hgwdev, get the other chains loaded ssh hgwdev cd /cluster/data/hg17/bed/blastzMonDom2.2006-01-23/axtChain cp loadUp.csh loadUp.noChr19.csh # change the foreach line to eliminate the chr19.chain: diff loadUp.csh loadUp.noChr19.csh < foreach f (*.chain) --- > foreach f (`ls *.chain | grep -v chr19.chain`) # And then run that script time ./loadUp.noChr19.csh > load.noChr19.out 2>&1 # real 76m8.757s # When the kolossus load finishes, email to 
push-request and ask # for the two tables to be pushed from kolossus to hgwdev: # chr19_chainMonDom2 # chr19_chainMonDom2Link # then, continuing: time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -continue=download -bigClusterHub=pk -chainMinScore=5000 \ -chainLinearGap=loose `pwd`/DEF > download.out 2>&1 & # real 2m42.505s # now, back on kolossus to run a featurebits time featureBits hg17 chainMonDom2Link >fb.hg17.chainMonDom2Link 2>&1 # 355119482 bases of 2866216770 (12.390%) in intersection featureBits hg17 chainMonDom1Link # 456069062 bases of 2866216770 (15.912%) in intersection # Then, to swap the results: ssh kk cd /cluster/data/hg17/bed/blastz.monDom2 time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -swap -bigClusterHub=pk -chainMinScore=5000 \ -chainLinearGap=loose `pwd`/DEF > swap.out 2>&1 & # running 2006-01-30 11:25 # real 47m27.082s # failed during the load - as with the Hg18 experiment, something # is really huge about these results. ######################################################################### # BUILD MAF ANNOTATION FOR MULTIZ17WAY (kate 2006-02-16) # Redo to fix overlaps (2006-04-09 kate) # rebuild frames to get bug fix, using 1-pass maf methodology (2006-06-09 markd) ssh kkstore01 cd /cluster/data/rheMac2 twoBitInfo -nBed rheMac2.2bit rheMac2.N.bed ssh kkstore02 cd /cluster/data/hg17/bed/multiz17way mkdir anno cd anno mkdir maf run cd run rm sizes nBeds foreach i (`cat /cluster/data/hg17/bed/multiz17way/species.lst`) ln -s /cluster/data/$i/chrom.sizes $i.len ln -s /cluster/data/$i/$i.N.bed $i.bed echo $i.bed >> nBeds echo $i.len >> sizes end rm jobs.csh echo date > jobs.csh foreach i (../../maf/*.maf) echo nice mafAddIRows -nBeds=nBeds -sizes=sizes $i /cluster/data/hg17/hg17.2bit ../maf/`basename $i` >> jobs.csh echo "echo $i" >> jobs.csh end echo date >> jobs.csh # do smaller jobs first tac jobs.csh > jobsRev.csh mv jobsRev.csh jobs.csh csh jobs.csh >&! jobs.log & # 1.5 hrs. # 9 hours for redo -- something wrong ? ssh hgwdev cd /cluster/data/hg17/bed/multiz17way/anno/maf mkdir -p /gbdb/hg17/multiz17way/anno/maf ln -s /cluster/data/hg17/bed/multiz17way/anno/maf/*.maf \ /gbdb/hg17/multiz17way/anno/maf cat > loadMaf.csh << 'EOF' date hgLoadMaf -pathPrefix=/gbdb/hg17/multiz17way/anno/maf \ hg17 multiz17way date 'EOF' csh loadMaf.csh >&! loadMaf.log & # load summary table on kolossus, as it crashes on hgwdev ssh kolossus cd /cluster/data/hg17/bed/multiz17way/anno/maf cat *.maf | \ nice hgLoadMafSummary hg17 -minSize=30000 -mergeGap=1500 \ -maxSize=200000 multiz17waySummary stdin # Created 3212623 summary blocks from 114139253 components and 17522217 mafs from stdin # request push to hgwdev # Dropped unused indexes (2006-05-09 kate) # NOTE: this is not required in the future, as the loader # has been fixed to not generate these indexes hgsql hg17 -e "alter table multiz17waySummary drop index chrom_2" hgsql hg17 -e "alter table multiz17waySummary drop index chrom_3" ssh kkstore02 cd /cluster/data/hg17/bed/multiz17way set sanDir = /san/sanvol1/scratch/hg17/multiz17way/frames mkdir -p $sanDir/maf cp -rp maf/* $sanDir/maf mkdir frames cd frames cp /cluster/data/mm7/bed/multiz17wayFrames/mkMafFrames . cp /cluster/data/mm7/bed/multiz17wayFrames/Makefile . #edit Makefile to correct species names and set and sanDir ssh hgwdev cd /cluster/data/hg17/bed/multiz17way/frames make getGenes >&! getGenes.log & # ~1 minute make getFrames >&! getFrames.log & # ~2 hours # NOTE: if jobs get hung up (e.g. 
running for hours, when # they should run for minutes, do 'para stop' so that # the 'para make' can restart the job make loadDb >&! loadDb.log & ### # rebuild frames to get bug fix, using 1-pass maf methodology # (2006-06-09 markd) ssh kkstore02 cd /cluster/data/hg17/bed/multiz17way/frames mv mafFrames/ mafFrames.old nice tcsh # easy way to get process niced (cat ../maf/*.maf | genePredToMafFrames hg17 stdin stdout bosTau2 genes/bosTau2.gp.gz canFam2 genes/canFam2.gp.gz danRer3 genes/danRer3.gp.gz fr1 genes/fr1.gp.gz galGal2 genes/galGal2.gp.gz hg17 genes/hg17.gp.gz mm7 genes/mm7.gp.gz oryCun1 genes/oryCun1.gp.gz panTro1 genes/panTro1.gp.gz rheMac2 genes/rheMac2.gp.gz rn3 genes/rn3.gp.gz xenTro1 genes/xenTro1.gp.gz | gzip >multiz17way.mafFrames.gz)>&log& ssh hgwdev cd /cluster/data/hg17/bed/multiz17way/frames hgLoadMafFrames hg17 multiz17wayFrames multiz17way.mafFrames.gz >&log& # EXTRACT LINEAGE-SPECIFIC REPEATS FOR RAT (DONE 2/8/06 angie) ssh kolossus mkdir /cluster/data/hg17/rmsk cd /cluster/data/hg17/rmsk ln -s ../*/chr*.fa.out . # Run Arian's DateRepsinRMoutput.pl to add extra columns telling # whether repeats in -query are also expected in -comp species. # Even though we already have the human-mouse linSpecReps, # extractLinSpecReps requires two columns of DateRepsinRMoutput.pl # additions. So add mouse, then ignore it. # Rat in extra column 1, Mouse in extra column 2 foreach outfl ( *.out ) echo "$outfl" /cluster/bluearc/RepeatMasker/DateRepeats \ ${outfl} -query human -comp rat -comp mouse end # Now extract rat (extra column 1), ignore mouse. cd .. mkdir linSpecRep.notInRat foreach f (rmsk/*.out_rat*_mus-musculus) set base = $f:t:r:r echo $base.out.spec /cluster/bin/scripts/extractLinSpecReps 1 $f > \ linSpecRep.notInRat/$base.out.spec end # Distribute and clean up. rsync -av linSpecRep.notInRat /san/sanvol1/scratch/hg17/ rm -r rmsk # BLASTZ/CHAIN/NET RN4 (DONE 2/10/06 angie) ssh kkstore01 mkdir /cluster/data/hg17/bed/blastz.rn4.2006-02-08 cd /cluster/data/hg17/bed/blastz.rn4.2006-02-08 cat << '_EOF_' > DEF # human vs. rat BLASTZ_ABRIDGE_REPEATS=1 # TARGET: Human SEQ1_DIR=/san/sanvol1/scratch/hg17/nib SEQ1_SMSK=/san/sanvol1/scratch/hg17/linSpecRep.notInRat SEQ1_LEN=/cluster/data/hg17/chrom.sizes SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY: Rat SEQ2_DIR=/san/sanvol1/scratch/rn4/nib SEQ2_SMSK=/san/sanvol1/scratch/rn4/linSpecRep.notInHuman SEQ2_LEN=/cluster/data/rn4/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 BASE=/cluster/data/hg17/bed/blastz.rn4.2006-02-08 '_EOF_' # << for emacs doBlastzChainNet.pl DEF -chainLinearGap medium \ -bigClusterHub pk -smallClusterHub pk -workhorse pk \ -blastzOutRoot /san/sanvol1/scratch/blastzHg17Rn4Out >& do.log & tail -f do.log rm -f /cluster/data/hg17/bed/blastz.rn4 ln -s blastz.rn4.2006-02-08 /cluster/data/hg17/bed/blastz.rn4 # UPDATE WGRNA TRACK (DONE, 2006-02-15, Fan) ssh hgwdev cd /cluster/data/hg17/bed mkdir wgRna-2006-02-15 cd wgRna-2006-02-15 # Received the data file, wg_track_hg17_feb2006_completed.txt, from Michel Weber's email # (Michel.Weber@ibcg.biotoul.fr) # and place it under cd /cluster/data/hg17/bed/wgRna-2006-02-15. cp -p wg_track_hg17_feb2006_completed.txt wgRna.tab hgLoadBed -sqlTable=/cluster/home/fanhsu/src/hg/lib/wgRna.sql hg17 wgRna wgRna.tab # Compared to previous data, 2 records deleted, 27 records added. 
######################################################################## # BLASTZ Opossum monDom4 (DONE - 2006-02-21 - 2006-02-26 - Hiram) ssh pk mkdir /cluster/data/hg17/bed/blastzMonDom4.2006-02-21 cd /cluster/data/hg17/bed ln -s blastzMonDom4.2006-02-21 blastz.monDom4 cd blastzMonDom4.2006-02-21 cat << '_EOF_' > DEF # human vs. opossum export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/parasol/bin BLASTZ=blastz.v7.x86_64 # settings for more distant organism alignments BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=10000 BLASTZ_K=2200 BLASTZ_M=50 BLASTZ_Q=/cluster/data/blastz/HoxD55.q # TARGET: Human (hg17) SEQ1_DIR=/scratch/hg/hg17/bothMaskedNibs SEQ1_LEN=/cluster/data/hg17/chrom.sizes SEQ1_CHUNK=50000000 SEQ1_LAP=10000 # QUERY: Opossum monDom4 SEQ2_DIR=/san/sanvol1/scratch/monDom4/monDom4.2bit SEQ2_LEN=/san/sanvol1/scratch/monDom4/chrom.sizes SEQ2_CHUNK=30000000 SEQ2_LAP=0 BASE=/cluster/data/hg17/bed/blastzMonDom4.2006-02-21 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \ -continue=chainMerge `pwd`/DEF > chainMerge.out 2>&1 & time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \ -swap `pwd`/DEF > swap.out 2>&1 & time nice -n +19 featureBits hg17 chainMonDom4Link \ > fb.hg17.chainMonDom4Link 2>&1 time nice -n +19 featureBits monDom4 chainHg17Link \ > fb.monDom4.chainHg17Link 2>&1 ######################################################################## ## Measuring MonDom4 chain pile ups (DONE - 2006-02-26 - Hiram) ssh kkstore02 cd /cluster/data/hg17/bed/blastz.monDom4/axtChain # extract coordinates on the target genome of the chains zcat hg17.monDom4.all.chain.gz | grep "^chain " \ | awk '{printf "%s\t%s\t%s\t%s\t%s\n", $3, $6, $7, $5, $2}' \ | gzip -c > target.chain.bed.gz # turn that into a wiggle graph with bedItemOverlapCount # use HGDB_CONF for read-only access to the hg17 DB in bedItemOverlapCount # it wants to read chromInfo ... 
export HGDB_CONF=~/.hg.conf.read-only # ignore chains longer than 1,000,000 zcat target.chain.bed.gz | awk '$3-$2<1000000 {print}' \ | sort -k1,1 -k2,2n \ | bedItemOverlapCount hg17 stdin \ | wigEncode stdin monDom4PileUps.wig monDom4PileUps.wib # Do the same for the query coordinates to find out where these # chains are coming from zcat hg17.monDom4.all.chain.gz | grep "^chain " \ | awk '{printf "%s\t%s\t%s\t%s\t%s\n", $8, $11, $12, $10, $2}' \ | gzip -c > query.chain.bed.gz zcat query.chain.bed.gz | awk '$3-$2<1000000 {print}' \ | sort -k1,1 -k2,2n \ | bedItemOverlapCount monDom4 stdin \ | wigEncode stdin hg17PileUps.wig hg17PileUps.wib # load those wiggles ssh hgwdev cd /cluster/data/hg17/bed/blastz.monDom4/axtChain ln -s `pwd`/monDom4PileUps.wib /gbdb/hg17/wib ln -s `pwd`/hg17PileUps.wib /gbdb/monDom4/wib hgLoadWiggle -verbose=2 hg17 monDom4PileUp monDom4PileUps.wig hgLoadWiggle -verbose=2 monDom4 hg17PileUps hg17PileUps.wig # add wiggle track type entries to the respective trackDb.ra files # UPDATE hg17 knownToVisiGene (2006-03-07 galt) # Create table that maps between known genes and visiGene database # mapping to other species such as mouse, zebrafish, frog # requires visiGene probe track vgImageProbes be created first knownToVisiGene hg17 -fromProbePsl=vgImageProbes ############################################################################ # Add Landmark track (2006-03-08 giardine) # Note: This track is for regulatory regions and other landmarks that are not #included in other tracks. It is being gathered from the locus experts #that are contributing data to the Human Mutation track. This should #be helpful in understanding the data in the mutation track. #table definitions for autoSql autoSql landmark.as landmark -dbLink #change index on bin to normal index not primary key #move bin in struct so works as bed 4+ #copy autoSql files to hg/lib and hg/inc (add .o file to makefile) #cat together landmark files from sources in landmark.bed then sort grep "^chr" landmark.bed | sort -k1,1 -k2,2n > sortedLandmark.bed #loading hgsql hg17 < landmark.sql hgLoadBed hg17 landmark sortedLandmark.bed -noSort -oldTable -tab #add to trackDb.ra file (human hg17 level) #changed landmark track to provide links and attributes in prep for ORegAnno #data. Got set of test data by grabbing their .gff file used for custom #tracks and converting to bed, then to landmarks format. 
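# (Illustration only, not the actual landmark.as used here: an autoSql
# spec for a bed 4+ style table generally looks like
#   table landmark
#   "Regulatory regions and other landmarks"
#   (
#   string chrom;      "Reference sequence chromosome"
#   uint   chromStart; "Start position in chromosome"
#   uint   chromEnd;   "End position in chromosome"
#   string name;       "Name of landmark"
#   ushort bin;        "Index field for range queries, moved to end of struct"
#   )
# autoSql generates matching .sql, .c and .h files, and the -dbLink flag
# used above adds MySQL load/save routines.)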
cd humPhen/landmarkData/June06/ #convert data to new formats then cat newLandmark.txt landmarkORA.txt > allLandmarks.txt grep "^chr" allLandmarks.txt | sort -k1,1 -k2,2n > sortedAllLandmark.txt #start new tables cd humPhen/kent/src/hg/lib/ autoSql landmark.as landmark -dbLink #move bin in .h file to end of structure, to make load work mv landmark.h ../inc/landmark.h #change primary key to indexes where not unique, add index on landmarkId #limit name, landmarkType, raKey size to 64 hgsql -e "drop table landmark;" hg17 hgsql hg17 < landmark.sql cd ~giardine/humPhen/landmarkData/June06/ hgLoadBed hg17 landmark sortedAllLandmark.txt -noSort -oldTable -tab hgsql hg17 load data local infile "landmarkAttrORA.txt" into table landmarkAttr; load data local infile "landmarkAttrLinkORA.txt" into table landmarkAttrLink; load data local infile "landmarkAttrCat.txt" into table landmarkAttrCat; cd ../../kent/src/ make clean make libs cd hg make cgi cd makeDb/trackDb make DBS=hg17 update #test in hgwdev-giardine #redo landmarks, moving categories out of database convertORAformat < ORegAnnoBed #start new tables cd humPhen/kent/src/hg/lib/ autoSql landmark.as landmark -dbLink #move bin in .h file to end of structure, to make load work mv landmark.h ../inc/landmark.h #change primary key to indexes, add primary key on landmarkId #limit name, landmarkType, raKey size to 64 #only need to reload attributes rest of data & tables same hgsql -e "drop table landmarkAttr;" hg17 hgsql -e "drop table landmarkAttrCat;" hg17 cd ../../../../landmarkData/June06/ hgsql hg17 #cut and paste in create table landmarkAttr load data local infile "landmarkAttrORA.txt" into table landmarkAttr; #Records: 2028 Deleted: 0 Skipped: 0 Warnings: 8 ??? cd ../../kent/src/ make clean make libs cd hg make cgi cd makeDb/trackDb make DBS=hg17 update #test in hgwdev-giardine ############################################################################ # hg15 -> hg17 LIFTOVER CHAINS (STARTED 3/9/06, DONE 3/10/06 Fan) # I used a size of 10kb instead of 3kb for the split (blat query) sizes in # hg17. This had a huge affect on the amount of hits in the blat, which # then had a huge effect on the amount of chains. I should also mention # that hg17 chromosomes chr1 and chr2 were split further # into more than a single query file. This helped a LOT in avoiding # cluster hippos classically associated with those chroms. ######## LIFTOVER PREPARATION # Split up hg17 ssh pk cd /san/sanVol1/scratch/hg17 mkdir -p liftSplits/{split,lift} bash for fa in /cluster/data/hg17/?{,?,*hap*}/*.fa; do c=`basename $fa .fa` echo $c faSplit -lift=liftSplits/lift/${c}.lft size $fa -oneFile 10000 liftSplits/split/$c done mkdir -p biggerSplits/split cd biggerSplits/ ln -s ../liftSplits/lift cd split/ ln -s ../../liftSplits/split/* . faSplit sequence chr1.fa 5 chr1_ faSplit sequence chr2.fa 5 chr2_ rm chr{1,2}.fa # Make some dirs cd /san/sanVol1/scratch mkdir -p hg15 # Copy 11.ooc files to hg15 subdirectory. cp -p /cluster/store5/gs.16/build33/11.ooc hg15 ## First, copy over scripts. 
(Already done before) # mkdir -p /san/sanVol1/scratch/fan # cp -p /san/sanVol1/scratch/fan/*.sh /san/sanVol1/scratch/fan # cp /san/sanVol1/scratch/andy/psl.header /san/sanVol1/scratch/fan ######## LIFTOVER BLATTING # HG15 ssh pk cd /cluster/data/hg15 # makeLoChain-align hg15 /scratch/hg/hg15/bothMaskedNibs hg17 \ makeLoChain-align hg15 /scratch/hg/hg15/chromTrfMixedNib hg17 \ /san/sanVol1/scratch/hg17/biggerSplits/split cd bed mv blat.hg17.2006-03-09 /san/sanVol1/scratch/hg15 cd /san/sanVol1/scratch/hg15/blat.hg17.2006-03-09/run/ sed 's/^blat/blat.sh/; s/\}.*$/}/' spec | awk '{print "/san/sanVol1/scratch/fan/" $0 " hg15ToHg17"}' > newspec para create newspec para try para push # Saw some failures, keep pushing again, they finally all finished. # The problems were all from one node. # Used "para remove machine ..." to remove that node from the cluster. # Completed: 2376 of 2376 jobs # CPU time in finished jobs: 626355s 10439.25m 173.99h 7.25d 0.020 y # IO & Wait Time: 49512s 825.20m 13.75h 0.57d 0.002 y # Average job time: 284s 4.74m 0.08h 0.00d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 3693s 61.55m 1.03h 0.04d # Submission to last job: 4165s 69.42m 1.16h 0.05d ######## LIFTOVER CHAINING # LIFTING ssh pk cd /san/sanVol1/scratch/fan cp mm7SplitLift.sh hg17SplitLift.sh # change andy to fan, mm7 to hg17, and chrX to chr2, and remove chrUn_random vi hg17SplitLift.sh cat << 'EOF' > hg17ChainMergeSplit.sh #!/bin/bash cp -r chainRaw/ /scratch/fan/hg17Lifts pushd /scratch/fan/hg17Lifts mkdir chain /cluster/bin/x86_64/chainMergeSort chainRaw/*.chain | /cluster/bin/x86_64/chainSplit chain stdin cp -r chain `dirs +1` rm -rf chain chainRaw 'EOF' chmod +x hg17ChainMergeSplit.sh # HG15 cd /san/sanVol1/scratch/hg15/blat.hg17.2006-03-09/raw /san/sanVol1/scratch/fan/hg17SplitLift.sh cd ../ mkdir chainRun chainRaw cd chainRun cat > gsub << 'EOF' #LOOP /cluster/bin/x86_64/axtChain -verbose=0 -linearGap=medium -psl $(path1) /scratch/hg/hg15/chromTrfMixedNib /san/sanVol1/scratch/hg17/nib {check out line+ ../chainRaw/$(root1).chain} #ENDLOOP 'EOF' ls -1S ../psl/*.psl > in.lst gensub2 in.lst single gsub spec para create spec para push para time # Completed: 46 of 46 jobs # CPU time in finished jobs: 3546s 59.10m 0.98h 0.04d 0.000 y # IO & Wait Time: 895s 14.92m 0.25h 0.01d 0.000 y # Average job time: 97s 1.61m 0.03h 0.00d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 270s 4.50m 0.07h 0.00d # Submission to last job: 270s 4.50m 0.07h 0.00d ######### CHAINMERGE/NET/NETSUBSET ssh kolossus mkdir -p /scratch/fan/hg17Lifts cd /scratch/fan/hg17Lifts cp -r /san/sanvol1/scratch/hg15/blat.hg17.2006-03-09/chainRaw/ . mkdir chain /cluster/bin/x86_64/chainMergeSort chainRaw/* | /cluster/bin/x86_64/chainSplit chain stdin # about 30 minutes. 
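# (Aside, not part of the original run: chainMergeSort combines the
# per-job chain files into one score-sorted stream, and chainSplit then
# writes one chain file per target chromosome, which is the per-chrom
# layout the chainNet jobs in netOver.sh below expect.)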
cp -rp chain /san/sanvol1/scratch/hg15/blat.hg17.2006-03-09/ rm -rf chain rm -rf chainRaw ssh pk cd /san/sanvol1/scratch/fan cat << 'EOF' > netOver.sh #!/bin/bash chain=$1 chrom=`basename $chain .chain` sizesHGOld=$2 sizesHG17=/cluster/data/hg17/chrom.sizes chainDir=`dirname $chain` blatDir=`dirname $chainDir` net=${blatDir}/net/${chrom}.net over=${blatDir}/over/${chrom}.over mkdir -p ${blatDir}/{over,net} /cluster/bin/x86_64/chainNet $chain $sizesHGOld $sizesHG17 $net /dev/null /cluster/bin/x86_64/netChainSubset $net $chain $over 'EOF' # << emacs chmod +x netOver.sh mkdir netRun cd netRun/ find /san/sanVol1/scratch/hg15/blat.hg17.2006-03-09/chain -name "*.chain" \ | awk '{print "/san/sanVol1/scratch/fan/netOver.sh " $1 " /cluster/data/hg15/chrom.sizes"}' >> spec para create spec para push para time # Completed: 44 of 44 jobs # CPU time in finished jobs: 427s 7.12m 0.12h 0.00d 0.000 y # IO & Wait Time: 248s 4.13m 0.07h 0.00d 0.000 y # Average job time: 15s 0.26m 0.00h 0.00d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 29s 0.48m 0.01h 0.00d # Submission to last job: 46s 0.77m 0.01h 0.00d # seems much faster than mm7. ########## FINISHING ssh hgwdev # HG15 cd /san/sanvol1/scratch/hg15/blat.hg17.2006-03-09/over cat * >> ../hg15ToHg17.over.chain cd ../ rm -rf psl/ net/ chain/ chainRaw/ over/ cd ../ cp -rp blat.hg17.2006-03-09/ /cluster/data/hg15/bed cd /cluster/data/hg15/bed ln -s blat.hg17.2006-03-09 blat.hg17 ln -s `pwd`/blat.hg17/hg15ToHg17.over.chain liftOver/hg15ToHg17.over.chain ln -s `pwd`/liftOver/hg15ToHg17.over.chain /gbdb/hg15/liftOver/hg15ToHg17.over.chain mkdir -p /usr/local/apache/htdocs/goldenPath/hg15/liftOver cd /usr/local/apache/htdocs/goldenPath/hg15/liftOver cp /gbdb/hg15/liftOver/hg15ToHg17.over.chain . gzip hg15ToHg17.over.chain hgAddLiftOverChain hg15 hg17 /gbdb/hg15/liftOver/hg15ToHg17.over.chain # UPDATED hg17.knownToVisiGene (2006-03-14 galt) # after making sure hg17.vgAllProbes was up to date (see makeVisiGene.doc) ssh hgwdev knownToVisiGene hg17 -fromProbePsl=vgAllProbes ######################################################################## ### microRNA targets tracks (DONE - 2006-03-17 - 2006-04-27 - Hiram) ### from: http://pictar.bio.nyu.edu/ Rajewsky Lab ### Nikolaus Rajewsky nr@scarbo.bio.nyu.edu ### Yi-Lu Wang ylw205@nyu.edu ### dg@thp.Uni-Koeln.DE ssh hgwdev mkdir /cluster/data/hg17/bed/picTar cd /cluster/data/hg17/bed/picTar wget --timestamping \ 'http://pictar.bio.nyu.edu/ucsc/new_mammals_bed' -O newMammals.bed wget --timestamping \ 'http://pictar.bio.nyu.edu/ucsc/new_mammals_chicken_bed' \ -O newMammalsChicken.bed grep -v "^track" newMammals.bed \ | hgLoadBed -strict hg17 picTarMiRNA4Way stdin # Loaded 205263 elements of size 9 grep -v "^track" newMammalsChicken.bed \ | hgLoadBed -strict hg17 picTarMiRNA5Way stdin # Loaded 43081 elements of size 9 nice -n +19 featureBits hg17 picTarMiRNA4Way # 608549 bases of 2866216770 (0.021%) in intersection nice -n +19 featureBits hg17 picTarMiRNA5Way # 109059 bases of 2866216770 (0.004%) in intersection ############################################################################ # dbSNP BUILD 125 (Heather, March 2006) # Set up directory structure ssh kkstore02 cd /cluster/data/dbSnp mkdir 125 cd 125 mkdir shared mkdir shared/data mkdir shared/schema mkdir organisms mkdir organisms/human_9606 mkdir organisms/human_9606/rs_fasta mkdir organisms/human_9606/database mkdir organisms/human_9606/database/organism_data mkdir organisms/human_9606/database/organism_data/hg17 mkdir 
organisms/human_9606/database/schema
# Get data from NCBI
# Shared data includes data dictionary,
# Shared data includes defined types such as validity, class, function, locType
# Actually this is independent of hg17 build and should go in separate makeDoc
cd shared/data
ftp ftp.ncbi.nih.gov
cd snp/database/organism_shared_data
mget *.gz
cd ../schema
ftp ftp.ncbi.nih.gov
cd snp/database/schema/shared_schema
mget *.gz
# using headers of fasta files for molType, class and observed
cd ../organisms/human_9606/rs_fasta
ftp ftp.ncbi.nih.gov
cd snp/organisms/human_9606/rs_fasta
mget *.gz
cd ../database/organism_data/hg17
ftp ftp.ncbi.nih.gov
cd snp/organisms/human_9606/database/organism_data
# ContigLoc table has coords, orientation, loc_type, and refNCBI allele
get b125_SNPContigLoc_35_1.bcp.gz
# ContigLocusId has function
get b125_SNPContigLocusId_35_1.bcp.gz
get b125_ContigInfo_35_1.bcp.gz
# MapInfo has alignment weights
get b125_SNPMapInfo_35_1.bcp.gz
# SNP has validation status and heterozygosity
get SNP.bcp.gz
# done with FTP
# rename
mv b125_SNPContigLoc_35_1.bcp.gz ContigLoc.gz
mv b125_SNPContigLocusId_35_1.bcp.gz ContigLocusId.gz
mv b125_ContigInfo_35_1.bcp.gz ContigInfo.gz
mv b125_SNPMapInfo_35_1.bcp.gz MapInfo.gz
mv SNP.bcp.gz SNP.gz
# edit table descriptions
cd /cluster/data/dbSnp/125/organisms/human_9606/database/schema
# get CREATE statements from human_9606_table.sql for our 5 tables
# store in table.tmp
# convert and rename tables
sed -f 'mssqlToMysql.sed' table.tmp > table2.tmp
rm table.tmp
sed -f 'tableRename.sed' table2.tmp > table.sql
rm table2.tmp
# get header lines from rs_fasta
cd /cluster/data/dbSnp/125/organisms/human_9606/rs_fasta
/bin/csh gnl.csh
# load on kkr5u00
ssh kkr5u00
hgsql -e 'create database dbSnpHumanBuild125' mysql
cd /cluster/data/dbSnp/125/organisms/human_9606/database/schema
hgsql dbSnpHumanBuild125 < table.sql
cd ../organism_data/hg17
/bin/csh load.csh
# note rowcount
# ContigLoc      24135144
# SNP            10430754
# MapInfo        10271016
# ContigLocusId   9539145
# create working /scratch dir
cd /scratch/snp
mkdir 125
cd 125
mkdir human
cd human
# get hg17 ctgPos, load into dbSnpHumanBuild125, compare contig list between ctgPos and ContigInfo
# get gnl files
cp /cluster/data/dbSnp/125/organisms/human_9606/rs_fasta/*.gnl .
# examine ContigInfo for group_term and edit pipeline.csh
# use "ref_haplotype"
# filter ContigLoc into ContigLocFilter
# this gets rid of alternate assemblies and poor quality alignments
# uses ContigInfo and MapInfo (weight == 10 || weight == 3)
# assumes all contigs are positively oriented
# will abort if not true
mysql> desc ContigLocFilter;
# +---------------+-------------+------+-----+---------+-------+
# | Field         | Type        | Null | Key | Default | Extra |
# +---------------+-------------+------+-----+---------+-------+
# | snp_id        | int(11)     | NO   |     |         |       |
# | ctg_id        | int(11)     | NO   |     |         |       |
# | chromName     | varchar(32) | NO   |     |         |       |
# | loc_type      | tinyint(4)  | NO   |     |         |       |
# | phys_pos_from | int(11)     | NO   |     |         |       |
# | phys_pos      | varchar(32) | YES  |     | NULL    |       |
# | orientation   | tinyint(4)  | NO   |     |         |       |
# | allele        | blob        | YES  |     | NULL    |       |
# +---------------+-------------+------+-----+---------+-------+
/cluster/home/heather/kent/src/hg/snp/snpLoad/snpContigLocFilter dbSnpHumanBuild125 ref_haplotype
# note rowcount
# ContigLocFilter 10113426
# how many are positive strand? hopefully 90%
mysql> select count(*) from ContigLocFilter where orientation = 0;
# 9161012
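# (Sketch, not the original check: the contig-list comparison between
# ctgPos and ContigInfo mentioned above can be done in SQL once both are
# loaded, e.g.
#   select contig from ctgPos where contig not in
#       (select contig_acc from ContigInfo);
# an empty result means every ctgPos contig is covered.  The ContigInfo
# column name here is an assumption.)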
# filter ContigLocusId into ContigLocusIdFilter
/cluster/home/heather/kent/src/hg/snp/snpLoad/snpContigLocusIdFilter dbSnpHumanBuild125 ref_haplotype
# note rowcount
# ContigLocusIdFilter 5352542
# condense ContigLocusIdFilter into ContigLocusIdCondense (one SNP can have multiple functions)
# assumes SNPs are in numerical order
# will errAbort if not true
/cluster/home/heather/kent/src/hg/snp/snpLoad/snpContigLocusIdCondense dbSnpHumanBuild125
# note rowcount
# expect about 50% for human
# ContigLocusIdCondense 4129899
# could delete ContigLocusIdFilter table here
# create chrN_snpFasta tables from *.gnl files
# snpLoadFasta.error will report all SNPs with "lengthTooLong"
# here we have 4428 SNPs with lengthTooLong
# these are noted as ObservedNotAvailable
/cluster/home/heather/kent/src/hg/snp/snpLoad/snpLoadFasta dbSnpHumanBuild125
# split ContigLocFilter by chrom (could start using pipeline.csh here)
# pipeline.csh takes about 35 minutes to run
# create the first chrN_snpTmp
# we will reuse this table name, adding/changing columns as we go
# at this point chrN_snpTmp will have the same description as ContigLocFilter
# this opens a file handle for every chrom, so will not scale to scaffold-based assemblies
/cluster/home/heather/kent/src/hg/snp/snpLoad/snpSplitByChrom dbSnpHumanBuild125 ref_haplotype
# generate true coords using loc_type
# possible errors logged to snpLocType.error:
# "Missing quotes in phys_pos for range"
# "Chrom end <= chrom start for range"
# "Wrong size for exact"
# "Unknown locType"
# "Unable to get chromEnd"
# We got none of these
# possible exceptions logged to snpLocType.exceptions:
# RefAlleleWrongSize
# this run got just 40
# morph chrN_snpTmp
mysql> desc chr1_snpTmp;
# +-------------+------------+------+-----+---------+-------+
# | Field       | Type       | Null | Key | Default | Extra |
# +-------------+------------+------+-----+---------+-------+
# | snp_id      | int(11)    | NO   |     |         |       |
# | ctg_id      | int(11)    | NO   |     |         |       |
# | chromStart  | int(11)    | NO   |     |         |       |
# | chromEnd    | int(11)    | NO   |     |         |       |
# | loc_type    | tinyint(4) | NO   |     |         |       |
# | orientation | tinyint(4) | NO   |     |         |       |
# | allele      | blob       | YES  |     | NULL    |       |
# +-------------+------------+------+-----+---------+-------+
/cluster/home/heather/kent/src/hg/snp/snpLoad/snpLoctype dbSnpHumanBuild125 ref_haplotype
# expand allele as necessary
# report syntax errors to snpExpandAllele.errors
# this run had 63 of these
# possible exceptions logged to snpExpandAllele.exceptions:
# RefAlleleWrongSize
# this run has 512
/cluster/home/heather/kent/src/hg/snp/snpLoad/snpExpandAllele dbSnpHumanBuild125 ref_haplotype
# the next few steps prepare for working in UCSC space
# sort by position
/cluster/home/heather/kent/src/hg/snp/snpLoad/snpSort dbSnpHumanBuild125 ref_haplotype
# get hg17 nib files
# get hg17 chromInfo, load into dbSnpHumanBuild125 with edited path
hgsql -e "rename table chrMT_snpTmp to chrM_snpTmp" dbSnpHumanBuild125
# lookup reference allele in nibs
# keep reverse complement to use in error checking (snpCheckAlleles)
# check here for SNPs larger than 1024
# errAbort if detected
/cluster/home/heather/kent/src/hg/snp/snpLoad/snpRefUCSC dbSnpHumanBuild125
# morph chrN_snpTmp
mysql> desc chr1_snpTmp;
# +--------------------+------------+------+-----+---------+-------+
# | Field              | Type       | Null | Key | Default | Extra |
# +--------------------+------------+------+-----+---------+-------+
# | snp_id             | int(11)    | NO   |     |         |       |
# | ctg_id             | int(11)    | NO   |     |         |       |
# | chromStart         | int(11)    | NO   |     |         |       |
# | chromEnd           | int(11)    | NO   |     |         |       |
# | loc_type           | tinyint(4) | NO   |     |         |       |
# | orientation        | tinyint(4) | NO   |     |         |       |
# | allele             | blob       | YES  |     | NULL    |       |
# | refUCSC            | blob       | YES  |     | NULL    |       |
# | refUCSCReverseComp | blob       | YES  |     | NULL    |       |
# +--------------------+------------+------+-----+---------+-------+
# compare allele from dbSNP to refUCSC
# locType between is excluded from this check
# log exceptions to snpCheckAllele.exceptions
# if SNP is positive strand, expect allele == refUCSC
# log RefAlleleMismatch if not
# if SNP is negative strand, if not allele == refUCSC, then check for allele == refUCSCReverseComp
# If allele == refUCSCRevComp, log RefAlleleNotRevComp
# If allele doesn't match either of refUCSC or refUCSCReverseComp, log RefAlleleMismatch
# This run we got:
# 0 RefAlleleMismatch
# 49763 RefAlleleNotRevComp
/cluster/home/heather/kent/src/hg/snp/snpLoad/snpCheckAlleles dbSnpHumanBuild125
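# (Aside, not part of the original run: the two counts quoted above can be
# double-checked against the exception file, e.g.
#   grep -c RefAlleleNotRevComp snpCheckAlleles.exceptions
# should print 49763, assuming one exception per line in that file.)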
# compare allele from dbSNP to refUCSC
# locType between is excluded from this check
# log exceptions to snpCheckAllele.exceptions
# if SNP is positive strand, expect allele == refUCSC; log RefAlleleMismatch if not
# if SNP is negative strand and not allele == refUCSC, then check for
# allele == refUCSCReverseComp:
#   if allele == refUCSCReverseComp, log RefAlleleNotRevComp
#   if allele matches neither refUCSC nor refUCSCReverseComp, log RefAlleleMismatch
# This run we got:
#   0 RefAlleleMismatch
#   49763 RefAlleleNotRevComp
/cluster/home/heather/kent/src/hg/snp/snpLoad/snpCheckAlleles dbSnpHumanBuild125

# add class, observed and molType from chrN_snpFasta tables
# log errors to snpReadFasta.errors
# errors detected: no data available, duplicate data
# This run we got:
#   49 no data available
#   226048 duplicate
# chrN_snpFasta has class = 'in-del'
# we promote this to 'deletion' for locType 1&2 and to 'insertion' for locType 3
/cluster/home/heather/kent/src/hg/snp/snpLoad/snpReadFasta dbSnpHumanBuild125

# morph chrN_snpTmp
# +--------------------+---------------+------+-----+---------+-------+
# | Field              | Type          | Null | Key | Default | Extra |
# +--------------------+---------------+------+-----+---------+-------+
# | snp_id             | int(11)       | NO   |     |         |       |
# | chromStart         | int(11)       | NO   |     |         |       |
# | chromEnd           | int(11)       | NO   |     |         |       |
# | loc_type           | tinyint(4)    | NO   |     |         |       |
# | class              | varchar(255)  | NO   |     |         |       |
# | orientation        | tinyint(4)    | NO   |     |         |       |
# | molType            | varchar(255)  | NO   |     |         |       |
# | allele             | blob          | YES  |     | NULL    |       |
# | refUCSC            | blob          | YES  |     | NULL    |       |
# | refUCSCReverseComp | blob          | YES  |     | NULL    |       |
# | observed           | blob          | YES  |     | NULL    |       |
# +--------------------+---------------+------+-----+---------+-------+

# generate exceptions for class and observed:
#   SingleClassBetweenLocType
#   SingleClassRangeLocType
#   NamedClassWrongLocType
#   ObservedNotAvailable
#   ObservedWrongFormat
#   ObservedWrongSize
#   ObservedMismatch
#   RangeSubstitutionLocTypeExactMatch
#   SingleClassTriAllelic
#   SingleClassQuadAllelic
# This will also detect IUPAC symbols in allele
/cluster/home/heather/kent/src/hg/snp/snpLoad/snpCheckClassAndObserved dbSnpHumanBuild125

# add function
/cluster/home/heather/kent/src/hg/snp/snpLoad/snpFunction dbSnpHumanBuild125

# add validation status and heterozygosity
# log error if validation status > 31 or missing; this run we got 8 missing
/cluster/home/heather/kent/src/hg/snp/snpLoad/snpSNP dbSnpHumanBuild125

# generate chrN_snp125 and snp125Exceptions tables
cp snpCheckAlleles.exceptions snpCheckAlleles.tab
cp snpCheckClassAndObserved.exceptions snpCheckClassAndObserved.tab
cp snpExpandAllele.exceptions snpExpandAllele.tab
cp snpLocType.exceptions snpLocType.tab
/cluster/home/heather/kent/src/hg/snp/snpLoad/snpFinalTable dbSnpHumanBuild125

# PAR SNPs
/cluster/home/heather/kent/src/hg/snp/snpLoad/snpPAR dbSnpHumanBuild125
hgsql -e 'load data local infile "snpPARexceptions.tab" into table snp125Exceptions' dbSnpHumanBuild125

# concat into snp125.tab
# cat chr*_snp125.tab >> snp125.tab
/bin/sh concat.sh

# load
hgsql dbSnpHumanBuild125 < /cluster/home/heather/kent/src/hg/lib/snp125.sql
hgsql -e 'load data local infile "snp125.tab" into table snp125' dbSnpHumanBuild125
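# quick load check (not in the original run): the table row count should
# match the concatenated file
wc -l snp125.tab
hgsql dbSnpHumanBuild125 -e 'select count(*) from snp125'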
# check for multiple alignments
/cluster/home/heather/kent/src/hg/snp/snpLoad/snpMultiple dbSnpHumanBuild125
mysql> load data local infile 'snpMultiple.tab' into table snp125Exceptions;

# run and review snpCompareLoctype (currently tuned for 124/125 differences)
/cluster/home/heather/kent/src/hg/snp/snpLoad/snpCompareLoctype dbSnpHumanBuild125 snp124subset snp125
# cat snpCompareLoctypeCounts.out
# exactToExact = 8310192
# exactToBetween = 107956
# exactToRange = 16200
# betweenToBetween = 206224
# betweenToExact = 4012
# betweenToRange = 715
# rangeToRange = 98648
# rangeToBetween = 3151
# rangeToExact = 6198
# oldToNew = 10224
# 12043 coord changes in exact (.1%)
#   1370 moved to different chroms
# 3664 coord changes in between (1.7%)
#   2260 off-by-one
#   13 moved to different chroms
# 22198 coord changes in range (22.5%)
#   19548 look like fixes: observedLengthOld != coordSpanOld && observedLengthNew == coordSpanNew
#   1296 look like errors: observedLengthOld == coordSpanOld && observedLengthNew != coordSpanNew

# load on hgwdev
cp snp125.tab /cluster/home/heather/transfer/snp
hgsql dbSnpHumanBuild125 -e 'select * from snp125Exceptions' > /cluster/home/heather/transfer/snp/snp125Exceptions.tab
ssh hgwdev
mysql> load data local infile 'snp125.tab' into table snp125;
# create indexes
mysql> alter table snp125 add index name (name);
mysql> alter table snp125 add index chrom (chrom, bin);
mysql> load data local infile 'snp125Exceptions.tab' into table snp125Exceptions;
mysql> alter table snp125Exceptions add index name (name);

# create snp125ExceptionDesc table
cd /cluster/data/dbSnp
# add counts to exception.template
hgsql hg17 < snp125ExceptionDesc.sql
mysql> load data local infile 'exception.template' into table snp125ExceptionDesc;

#######
# Add new case for ObservedWrongSize (Heather June 9, 2006)
# revisions 1.25 and 1.26 kent/src/hg/snp/snpLoad/snpCheckClassAndObserved.c
ssh kkr5u00
cd /scratch/snp/125/human
/bin/csh pipeline.csh
# wait 35 minutes
grep ObservedWrongSize snpCheckClassAndObserved.exceptions > ObservedWrongSize
grep ObservedWrongSize snpPARexceptions.tab >> ObservedWrongSize
cp ObservedWrongSize /cluster/home/heather/transfer/snp
ssh hgwdev
hgsql -e 'alter table snp125Exceptions drop index name' hg17
hgsql -e 'load data local infile "/cluster/home/heather/transfer/snp/ObservedWrongSize" into table snp125Exceptions' hg17
hgsql -e 'alter table snp125Exceptions add index name (name)' hg17
# fix counts
hgsql -e 'select count(*), exception from snp125Exceptions group by exception' hg17
+----------+------------------------------------+
| count(*) | exception                          |
+----------+------------------------------------+
|   785903 | MultipleAlignments                 |
|      623 | NamedClassWrongLocType             |
|     7686 | ObservedMismatch                   |
|     4333 | ObservedNotAvailable               |
|       97 | ObservedWrongFormat                |
|    73558 | ObservedWrongSize                  |
|      466 | RangeSubstitutionLocTypeExactMatch |
|       62 | RefAlleleMismatch                  |
|    99849 | RefAlleleNotRevComp                |
|     1278 | RefAlleleWrongSize                 |
|    20749 | SingleClassBetweenLocType          |
|     2306 | SingleClassQuadAllelic             |
|    15639 | SingleClassRangeLocType            |
|    19330 | SingleClassTriAllelic              |
+----------+------------------------------------+
# edit /cluster/data/dbSNP/exception.template (need to automate this)
hgsql -e 'delete from snp125ExceptionDesc' hg17
hgsql -e 'load data local infile "/cluster/data/dbSNP/exception.template" into table snp125ExceptionDesc' hg17
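# The "add counts to exception.template" edit above is manual; a sketch of
# how it could be automated (assumes snp125ExceptionDesc pairs each exception
# name with a count and description -- check snp125ExceptionDesc.sql first):
hgsql hg17 -N -e 'select exception, count(*) from snp125Exceptions group by exception' > exceptionCounts.tab
# then merge exceptionCounts.tab into exception.template before loading it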
###########################
# add rs_fasta to seq/extFile (Heather Nov 2006)
# use 126 rs_fasta files because I didn't save the 125 version
ssh hgwdev
mkdir /gbdb/hg17/snp
ln -s /cluster/store12/snp/126/human/rs_fasta/snp.fa /gbdb/hg17/snp/snp.fa
cd /cluster/store12/snp/126/human/rs_fasta
hgLoadSeq hg17 /gbdb/hg17/snp/snp.fa
# clean up after hgLoadSeq
rm seq.tab
# look up id in extFile
# move into separate table
hgsql hg17 < snpSeq.sql
hgsql -e 'insert into snpSeq select acc, file_offset from seq where extFile = 33852294' hg17
hgsql -e 'delete from seq where extFile = 33852294' hg17
hgsql -e 'alter table snpSeq add index acc (acc)' hg17

#############################################################
# Get panTro2 and rheMac2 allele for all SNPs (Heather, Dec 2006, Feb 2007 and
# June 2007 [partial fix released 6/25/07: using hg17 instead of hg18 liftOver
# files... for most but not all chroms! :( not documented below; error found
# by user]
# 1/11/08 (angie): re-running panTro2Qual and subsequent chimp & summary
# steps, so hg17 liftOver files will have been used for all outputs.
# Deletions will probably lift okay
# The insertions have start == end so none of them will lift
# 1/24/08 (angie): constant quality score of 98 for chimp chr{21,M,Y,Y_random}
# was previously put in score field -- corrected to orthoScore.
ssh hgwdev
cd /san/sanvol1/snp/liftOver/hg17
mkdir panTro2All
mkdir rheMac2All
mkdir input
cd input
hgsql -N -e 'select chrom, chromStart, chromEnd, name, score, strand from snp125' hg17 > snp125.bed
lineFileSplit snp125.bed lines 100000 snp-
ln -s /san/sanvol1/snp/liftOver/hg17/input /san/sanvol1/snp/liftOver/hg17/panTro2All/input
ln -s /san/sanvol1/snp/liftOver/hg17/input /san/sanvol1/snp/liftOver/hg17/rheMac2All/input
cd ../panTro2All
./makeJobList.csh
mkdir output
mkdir unmapped
cd ../rheMac2All
./makeJobList.csh
mkdir output
mkdir unmapped

# cluster run
ssh pk
cd /san/sanvol1/snp/liftOver/hg17/panTro2All
para create jobList; para try; para check; para push
para time
# Completed: 108 of 108 jobs
# CPU time in finished jobs:      67758s    1129.29m    18.82h    0.78d  0.002 y
# IO & Wait Time:                   961s      16.02m     0.27h    0.01d  0.000 y
# Average job time:                 636s      10.60m     0.18h    0.01d
# Longest running job:                0s       0.00m     0.00h    0.00d
# Longest finished job:            1543s      25.72m     0.43h    0.02d
# Submission to last job:         61513s    1025.22m    17.09h    0.71d
cd /san/sanvol1/snp/liftOver/hg17/rheMac2All
para create jobList; para try; para check; para push
para time
# Completed: 108 of 108 jobs
# CPU time in finished jobs:       1833s      30.56m     0.51h    0.02d  0.000 y
# IO & Wait Time:                  1744s      29.06m     0.48h    0.02d  0.000 y
# Average job time:                  33s       0.55m     0.01h    0.00d
# Longest running job:                0s       0.00m     0.00h    0.00d
# Longest finished job:              82s       1.37m     0.02h    0.00d
# Submission to last job:         59987s     999.78m    16.66h    0.69d

# add sequence
# next time do this at the same time as lift
cd /san/sanvol1/snp/liftOver/hg17
mkdir panTro2Seq
mkdir panTro2Seq/input
mkdir panTro2Seq/output
cp panTro2All/output/snp*out panTro2Seq/input
cd panTro2Seq
./makeJobList.csh
cat << 'EOF' > makeJobList.csh
#!/bin/tcsh
rm -f jobList
foreach fileName (`ls input/*`)
    set baseName = $fileName:t
    echo $baseName
    echo "/cluster/home/heather/kent/src/hg/snp/snpLoad/fetchSeq $fileName /scratch/hg/panTro2/panTro2.2bit output/$baseName" >> jobList
end
'EOF'
cd /san/sanvol1/snp/liftOver/hg17
mkdir rheMac2Seq
mkdir rheMac2Seq/input
mkdir rheMac2Seq/output
cp rheMac2All/output/snp*out rheMac2Seq/input
cd rheMac2Seq
cat << 'EOF' > makeJobList.csh
#!/bin/tcsh
rm -f jobList
foreach fileName (`ls input/*`)
    set baseName = $fileName:t
    echo $baseName
    echo "/cluster/home/heather/kent/src/hg/snp/snpLoad/fetchSeq $fileName /scratch/hg/rheMac2/rheMac2.2bit output/$baseName" >> jobList
end
'EOF'
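# (the rheMac2Seq jobList was presumably generated the same way as for
# panTro2Seq above:)
./makeJobList.csh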
# cluster run for sequence
ssh pk
cd /san/sanvol1/snp/liftOver/hg17/panTro2Seq
para create jobList; para try; para check; para push
para time
# Completed: 108 of 108 jobs
# CPU time in finished jobs:      30509s     508.48m     8.47h    0.35d  0.001 y
# IO & Wait Time:                   325s       5.42m     0.09h    0.00d  0.000 y
# Average job time:                 286s       4.76m     0.08h    0.00d
# Longest running job:                0s       0.00m     0.00h    0.00d
# Longest finished job:             551s       9.18m     0.15h    0.01d
# Submission to last job:          1195s      19.92m     0.33h    0.01d
cd /san/sanvol1/snp/liftOver/hg17/rheMac2Seq
para create jobList; para try; para check; para push
para time
# Completed: 108 of 108 jobs
# CPU time in finished jobs:      28517s     475.28m     7.92h    0.33d  0.001 y
# IO & Wait Time:                   576s       9.61m     0.16h    0.01d  0.000 y
# Average job time:                 269s       4.49m     0.07h    0.00d
# Longest running job:                0s       0.00m     0.00h    0.00d
# Longest finished job:             509s       8.48m     0.14h    0.01d
# Submission to last job:          1166s      19.43m     0.32h    0.01d

# quality scores
# This takes about 24 hours for each species!!  Ugh.
# Solution is to use -bedFile argument to hgWiggle
ssh hgwdev
cd /san/sanvol1/snp/liftOver/hg17
cd panTro2Seq/output
cat << 'EOF' > concat.csh
#!/bin/tcsh
rm -f all.out
foreach fileName (`ls snp*.out`)
    cat $fileName >> all.out
end
'EOF'
sort all.out > all.sort
rm all.out
cd /san/sanvol1/snp/liftOver/hg17
mkdir panTro2Qual
cp panTro2Seq/output/all.sort panTro2Qual
cd panTro2Qual
mkdir input
splitFileByColumn all.sort input
mkdir output
# If we do this again, we should write a c program to read qac files into
# memory -- much faster than one hgWiggle process per line.
cat << 'EOF' > addQual.pl
#!/usr/bin/perl -W

$db = shift;
$chromName = shift;

while (<>) {
    my @fields = split;
    my $chrom = $fields[0];
    my $chromStart = $fields[1];
    my $chromEnd = $fields[2];
    my $name = $fields[3];
    my $strand = $fields[5];
    my $allele = $fields[6];
    $cmd = "hgWiggle -db=$db -chrom=$chromName -position=$chrom:$chromStart-$chromStart -rawDataOut quality";
    open(RESULT, "$cmd |") or die "can't start '$cmd'\n";
    while ($line = <RESULT>) {
        $score = int($line);
        print "$chrom\t$chromStart\t$chromEnd\t$name\t$score\t$strand\t$allele\n";
    }
}
'EOF'
cat << 'EOF' > getQual.csh
#!/bin/tcsh
foreach fileName (`ls input/*`)
    set chromName = $fileName:t:r
    echo $chromName
    addQual.pl panTro2 $chromName < $fileName > output/$chromName
end
'EOF'
# << emacs
./getQual.csh
cd rheMac2Seq/output
concat.csh
sort all.out > all.sort
rm all.out
cd /san/sanvol1/snp/liftOver/hg17
mkdir rheMac2Qual
cp rheMac2Seq/output/all.sort rheMac2Qual
cd rheMac2Qual
mkdir input
splitFileByColumn all.sort input
mkdir output
./getQual.csh
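# Both getQual runs fork one hgWiggle per SNP, hence the ~24 hours per
# species.  Per the note above, the -bedFile argument to hgWiggle should do
# this in one pass per chrom; untested sketch (exact flag usage unverified):
#   hgWiggle -db=panTro2 -chrom=chr1 -bedFile=input/chr1 -rawDataOut quality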
# concatenate, merge and load
# chimp has no qual scores for chr21, chrY and chrM, just use seq files
cd /san/sanvol1/snp/liftOver/hg17/panTro2Qual/output
grep chr21 ../../panTro2Seq/output/all.sort > chr21
grep chrY ../../panTro2Seq/output/all.sort | grep -v random > chrY
grep chrY ../../panTro2Seq/output/all.sort | grep random > chrY_random
grep chrM ../../panTro2Seq/output/all.sort > chrM

#-----------------------------------------------------------------------------
# 1/11/08: replace outputs for chroms that apparently were skipped in the June
# run, and re-run subsequent steps for chimp.
cd /san/sanvol1/snp/liftOver/hg17/panTro2Qual
mv output output-jun25
foreach f (output-jun25/chr*)
    if ( "X"`cmp $f output-feb26/$f:t` == "X" ) then
        echo $f:t
    endif
end
#chr21
#chrM
#chrY
#chrY_random
# <<-- those are the ones that may not have actually been regenerated.
# It appears that the Feb. outputs, instead of the June Seq files, were copied
# to the June output for those chroms.  oops!
# As a minor improvement, skip duplicate rows instead of just copying.
foreach chr (chr21 chrM chrY chrY_random)
    echo $chr
    uniq input/$chr.sort > output/$chr
end
# << emacs
mkdir output-jun25-incorrect
mv output-jun25/chr{21,M,Y,Y_random} output-jun25-incorrect
cat output-jun25/chr* output/chr* > output/qual.tab
# end 1/11/08 fix-specific; proceeding to post-concat.csh chimp steps.
#-----------------------------------------------------------------------------
./concat.csh
/cluster/home/heather/kent/src/hg/snp/snpLoad/snpOrtho hg17 snp125 qual.tab
hgLoadBed hg17 snp125OrthoPanTro2 snpOrtho.tab -tab -sqlTable=/cluster/home/heather/kent/src/hg/lib/snp125OrthoPanTro2.sql
#Loaded 9591230 elements of size 17
# previously 9590961
# add index
hgsql hg17
alter table snp125OrthoPanTro2 add index name (name);
alter table snp125OrthoPanTro2 add index chrom (chrom, bin);
# 1/24/08: these used to set score; should have set orthoScore all along.
# tweak to match panTro2 assembly
update snp125OrthoPanTro2 set orthoScore = 98 where orthoChrom = "chr21";
#Query OK, 129170 rows affected (25.37 sec)
#Rows matched: 129170  Changed: 129170  Warnings: 0
update snp125OrthoPanTro2 set orthoScore = 98 where orthoChrom = "chrY";
#Query OK, 22081 rows affected (25.16 sec)
#Rows matched: 22081  Changed: 22081  Warnings: 0
update snp125OrthoPanTro2 set orthoScore = 98 where orthoChrom = "chrY_random";
#Query OK, 155 rows affected (25.41 sec)
#Rows matched: 155  Changed: 155  Warnings: 0

# macaque
cd /san/sanvol1/snp/liftOver/hg17/rheMac2Qual/output
./concat.csh
/cluster/home/heather/kent/src/hg/snp/snpLoad/snpOrtho hg17 snp125 qual.tab
hgLoadBed hg17 snp125OrthoRheMac2 snpOrtho.tab -tab -sqlTable=/cluster/home/heather/kent/src/hg/lib/snp125OrthoRheMac2.sql
# add index
alter table snp125OrthoRheMac2 add index name (name);
alter table snp125OrthoRheMac2 add index chrom (chrom, bin);

# get hapmap subset for chimp
# skip if lift wasn't size 1
# this run 124822 skipped
cd /cluster/data/hg17/bed/hapmap/rel21a
time /cluster/home/heather/kent/src/hg/snp/snpLoad/hapmapOrtho hg17 \
    hapmapSnpsCombined snp125OrthoPanTro2
#108.505u 16.869s 2:26.22 85.7% 0+0k 0+0io 4pf+0w
hgLoadBed hg17 hapmapAllelesChimp hapmapOrtho.tab -tab -sqlTable=/cluster/home/heather/kent/src/hg/lib/hapmapAllelesChimp.sql
#Loaded 3930564 elements of size 13
hgsql hg17 -e 'alter table hapmapAllelesChimp add index name(name); \
    alter table hapmapAllelesChimp add index chrom (chrom, bin);'

# get hapmap subset for macaque
# this run 106607 skipped
/cluster/home/heather/kent/src/hg/snp/snpLoad/hapmapOrtho hg17 hapmapSnpsCombined snp125OrthoRheMac2
hgLoadBed hg17 hapmapAllelesMacaque hapmapOrtho.tab -tab -sqlTable=/cluster/home/heather/kent/src/hg/lib/hapmapAllelesMacaque.sql
rm hapmapOrtho.tab
rm hapmapOrtho.err
rm bed.tab
alter table hapmapAllelesMacaque add index name(name);
alter table hapmapAllelesMacaque add index chrom (chrom, bin);

##############################################################################
# HapMap Recombination Rate Phase 2 (Heather Feb. 2006)
# Contacts:
#   Gil McVean [mcvean@stats.ox.ac.uk]
#   Colin Freeman [cfreeman@stats.ox.ac.uk]
#   Simon Myers [smyers@broad.mit.edu]
# Data is missing chromEnd.  I am setting chromEnd = chromStart + 1 as a
# kludge for now.
# Solution is to interpolate range but remove gaps.
## ****************************************
# This is a bad assumption about the data format -- here is a description.
## ****************************************
# The recombination rates are for the regions _between_ snps, so these
# files need to be processed slightly differently.  For each line i in
# the file (except the header and the last line), the recombination
# rate is for the position on the current line minus 1 [pos(${i})-1] to
# the position on the subsequent line [pos({$i+1})].  The precision is
# a bit obnoxious and can be truncated to 3 or 4 significant figures.
# (Note that the recombination rate on the last line is 0, as this is a
# placeholder.)  Here is an example:
#
# > head genetic_map_chr1.txt
# position COMBINED_rate(cM/Mb) Genetic_Map(cM)
# 45413 2.98182170902573 0
# 72434 2.08241435350679 0.0805718043995841
# 78032 2.08135840137317 0.0922291599505152
# 244859 2.88844902005393 0.439455937976397
# 604461 2.88749757426825 1.47814798248583
# 604484 2.88586385769306 1.47821439493004
# 605296 2.88389196108775 1.48055771638249
#
### BED format (like a bedGraph)
# chr1 45412 72434 2.982
# chr1 72433 78032 2.082
# chr1 78031 244859 2.081
# chr1 244858 604461 2.888
# chr1 604460 604484 2.887
# chr1 604483 605296 2.886
# chr1 605295 ..... 2.884
#
# See /cluster/data/hg16/bed/hapmap/recombination/Perlegen/makeBed.pl for an
# example.  /cluster/data/hg16/bed/hapmap/recombination/Perlegen/cmds.csh is
# also useful.
## ****************************************
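# Sketch of that per-interval scheme as an awk one-liner (not used in this
# build; it reproduces the BED rows above from genetic_map_chr1.txt, with the
# last line's placeholder rate dropped):
awk -v chr=chr1 '
    NR == 1 { next }
    NR > 2  { printf "%s\t%d\t%d\t%.3f\n", chr, prev - 1, $1, rate }
            { prev = $1; rate = $2 }' genetic_map_chr1.txt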
ssh hgwdev
cd /cluster/data/hg17/bed/hapmap
mkdir recombination
cd recombination
mkdir phase2
cd phase2
wget --no-check-certificate -N https://mathgen.stats.ox.ac.uk/HapMap_Phase2_rates_hotspots/HapMap_Phase2_rates_hotspots.tgz
# data also available at
# http://www.hapmap.org/downloads/recombination/2006-10_rel21_phaseII
gunzip *.tgz
tar xvf *.tar
cat << 'EOF' > makeBed.csh
#!/bin/tcsh
rm -f recomb.bed
foreach chrom (`cat chrom.list`)
    echo $chrom
    set fileName=`echo $chrom | awk '{printf "genetic_map_%s.txt", $1}'`
    makeBed.pl $chrom < $fileName >> recomb.bed
end
makeBed.pl chrX < genetic_map_chrX_par1.txt >> recomb.bed
makeBed.pl chrX < genetic_map_chrX_non-par.txt >> recomb.bed
makeBed.pl chrX < genetic_map_chrX_par2.txt >> recomb.bed
'EOF'
cat << 'EOF' > makeBed.pl
#!/usr/bin/env perl

$chromName = shift;

while (<>) {
    my @fields = split;
    # skip header
    if ($fields[0] eq "position") { next; }
    print $chromName;
    print "\t";
    print $fields[0];
    print "\t";
    print $fields[0] + 1;
    print "\t";
    my $val1000 = $fields[1] * 1000;
    my $valRound = int($val1000);
    my $newVal = $valRound / 1000.0;
    print $newVal;
    print "\n";
}
'EOF'
./makeBed.csh
hgLoadBed hg17 snpRecombRateHapmapPhase2 recomb.bed -tab -bedGraph=4
hgsql -e 'alter table snpRecombRateHapmapPhase2 add index chrom (chrom, bin)' hg17

############
# UPDATE hg17 knownToVisiGene (2006-04-05 galt)
# Create table that maps between known genes and visiGene database
# mapping to other species such as mouse, zebrafish, frog
# requires visiGene probe track vgImageProbes be created first
knownToVisiGene hg17 -fromProbePsl=vgImageProbes

#############################################################
# ADD A NEW TRACK GROUP (DONE, 6/3/06, Fan)
# Create a new track group, "phenDis".
echo 'INSERT INTO grp (name, label, priority) VALUES ("phenDis", "Phenotype and Disease Associations", 2.5)' \
    | hgsql hg17

#############################################################
# hgMut - Human Mutation track - Belinda Giardine
# list of tables by: show tables like 'hgMut%'
# summary of current load June 7, 2006
# table definitions generated with autoSql
autoSql hgMut.as hgMut -dbLink
# move bin in struct so it works as bed 4+
# hgMut.sql: change INDEXes as needed, put in enums
# shrink mutId to 64 chars, plus acc to 48
# data files and details under ~giardine/humPhen/
cd humPhen/hgMutData/April2006/
cat hgMutHbVar.txt hgMutPah.txt hgMutBgmut.txt hgMutCftr.txt hgMutARdb.txt > hgMutUnsorted.txt
grep "^chr" hgMutUnsorted.txt | sort -k1,1 -k2,2n > hgMut.bed
# create tables
hgsql hg17 < ../../hgMut.sql
# loading
hgLoadBed hg17 hgMut hgMut.bed -noSort -oldTable -tab
# load small vocab control tables
hgsql hg17 < hgMutLink.sql
hgsql hg17 < hgMutAttrClass.sql
hgsql hg17 < hgMutAttrName.sql
hgsql hg17 < hgMutSrc.sql
# from hgsql hg17:
load data local infile "hgMutExtLinkHbVar.txt" into table hgMutExtLink;
load data local infile "hgMutExtLinkARdb.txt" into table hgMutExtLink;
load data local infile "hgMutExtLinkBgmut.txt" into table hgMutExtLink;
load data local infile "hgMutExtLinkCFTR.txt" into table hgMutExtLink;
load data local infile "hgMutExtLinkPah.txt" into table hgMutExtLink;
load data local infile "hgMutExtLinkSP.txt" into table hgMutExtLink;
load data local infile "hgMutAttrHbVar2.txt" into table hgMutAttr;
load data local infile "hgMutAttrHbvarProt2.txt" into table hgMutAttr;
load data local infile "hgMutAttrARdb.txt" into table hgMutAttr;
load data local infile "hgMutAttrARdbProt.txt" into table hgMutAttr;
load data local infile "hgMutAliasHbVar.txt" into table hgMutAlias;
load data local infile "hgMutAliasARdb.txt" into table hgMutAlias;
load data local infile "hgMutAliasBgmut.txt" into table hgMutAlias;
load data local infile "hgMutAliasPah.txt" into table hgMutAlias;
load data local infile "hgMutExtLinkHbVarOmim.txt" into table hgMutExtLink;
load data local infile "hgMutAttrLink.txt" into table hgMutAttrLink;
load data local infile "hgMutAttrSP.txt" into table hgMutAttr;

#############################################################
# gv* Belinda Giardine
# These tables are to replace the hgMut tables
# Most data is converted by me (on PSU machines) to loadable format and copied.
# The Swiss-Prot/UniProt data is generated from the UniProt database at UCSC,
# using perl scripts and table dumps.
# scripts in kent/src/hg/utils/gvParsers/swissProt/
# everything redone to not depend on the dv track in July 2006
# make list of variants from Swiss-Prot (make sure featureClass 23 is variant)
hgsql -N uniProt > spVars.txt < kgProtMapDump.txt < kgProtMapUniq.txt
# check variables for output and input file names
computeSpVars > errors.txt
# errors.txt will list variants that couldn't be mapped
# July 18, 2006
# 37 gaps, 564 proteins (2228 variants) not in kgProtMap (test one did align)
# found 22389
# Swiss-Prot attributes:
hgsql hg17 < listSPconnections.sql > listSPconnections.txt
hgsql proteome < listSpXref2.sql > listSpXref2.txt
convertOmimTitle > gvLinkSPomim.txt
hgsql hg17 < listGeneVals.sql > listGeneVals.txt
convertDisPoly > gvLinkSPuni.txt
cat gvLinkSPuni.txt gvLinkSPomim.txt > gvLinkSp.txt
cp gvLinkSp.txt ../../../gv/gvData/

# creating gv* tables and loading
# June 27, 2006
autoSql gv.as gv -dbLink
# edit indexes and string lengths in .sql file: id=48, srcId=48, raKey=48,
# attrType=48, primary key=index on bin and attr and link ids (id, attrType
# for attrs)
# do enums
# add unique index, to prevent doubles:
#   UNIQUE KEY (chrom(12), chromStart, chromEnd, name)
# added id field to gvPos struct so can keep ID when changing name
#   char *id;   /* Added field to hold ID if change name */
# set to null in gv.c file

# reload data July 2006 with more data and corrected Swiss-Prot data
# also moved gv* (except gvPos) and omimTitle to hgFixed
# prep data: concatenate all the gvPos data, sort
cat gvPosSP.txt gvPosHbVar.txt gvPosARdb.txt gvPosBgmut.txt gvPosCftr.txt gvPosPah.txt gvPosSrd5a2.txt gvPosBrca.txt > gvPosAll.txt
grep "^chr" gvPosAll.txt | sort -k1,1 -k2,2n > gvPosSortedHg17.bed
# load tables
hgLoadBed hg17 gvPos gvPosSortedHg17.bed -noSort -oldTable -tab
hgsql hg17 < gvSrc.sql
hgsql hg17
load data local infile "gvBrca.txt" into table gv;
load data local infile "gvAttrBrca.txt" into table gvAttr;
load data local infile "gvLinkBrca.txt" into table gvLink;
load data local infile "gvLinkSP.txt" into table gvLink;
load data local infile "gvLinkSPgene.txt" into table gvLink;
load data local infile "gvSP.txt" into table gv;
load data local infile "gvAttrSP.txt" into table gvAttr;
load data local infile "gvAttrLongSP.txt" into table gvAttrLong;
load data local infile "gvLinkHbVar.txt" into table gvLink;
load data local infile "gvHbVar.txt" into table gv;
load data local infile "gvAttrHbVar.txt" into table gvAttr;
load data local infile "gvARdb.txt" into table gv;
load data local infile "gvAttrARdb.txt" into table gvAttr;
load data local infile "gvBgmut.txt" into table gv;
load data local infile "gvAttrBgmut.txt" into table gvAttr;
load data local infile "gvAttrLongBgmut.txt" into table gvAttrLong;
load data local infile "gvLinkBgmut.txt" into table gvLink;
load data local infile "gvCftr.txt" into table gv;
load data local infile "gvAttrCftr.txt" into table gvAttr;
load data local infile "gvPah.txt" into table gv;
load data local infile "gvAttrPah.txt" into table gvAttr;
load data local infile "gvLinkPah.txt" into table gvLink;
load data local infile "gvSrd5a2.txt" into table gv;
load data local infile "gvAttrSrd5a2.txt" into table gvAttr;
load data local infile "gvAttrConservedDisease.txt" into table gvAttr;

# get disease association predictions for conserved variants
# get a list of variants that are already done
hgsql -N hg17 > gvWithDiseaseStatus.txt < gvAttrConservedDisease.txt

# Belinda Giardine Sept 2006
# reload tables, removed ones with sequence mismatches, added label and strand
# added new LSDB BTKbase
# Sequence mismatches were determined by using the position in the reference
# sequence to fetch the sequence affected by the variant.  Then for
# substitutions and deletions with the deleted nts listed, the sequence was
# compared.  Insertions and large deletions could not be checked.

# Belinda Giardine Dec 2006
# reload tables, additions to previous sources and more IDbases
# details in hg18 doc

# Belinda Giardine Jan 2007
# reload tables, additions and corrections, details in hg18 doc

#############################################################
# Illumina Hap300 (Heather, July 2006)
ssh hgwdev
cd /cluster/data/hg17/bed
mkdir illumina
cd illumina
trim.pl < Illumina_HumanHap300_SNPlist_01.13.2006.txt > trim.out
hgsql hg17 < illuminaTmp.sql
hgsql -e "load data local infile 'trim.out' into table illuminaTmp" hg17
# illuminaLookup generates bin
/cluster/home/heather/kent/src/hg/snp/snpLoad/illuminaLookup hg17 illuminaTmp snp125 snp125Exceptions illuminaLookup.out illuminaLookup.err
# errors:
#   unexpected chrom chr1 for snp rs1291584
#   unexpected chrom chr17 for snp rs3826555
#   unexpected locType between for snp rs2036773
#   unexpected locType between for snp rs2249255
#   unexpected locType between for snp rs8051412
#   unexpected locType between for snp rs1017238
#   unexpected locType between for snp rs5019493
# 16 with locType = range*
# 402 not found!
# None that have multiple alignments.
hgsql hg17 < snpArrayIllumina300.sql
hgsql -e "load data local infile 'illuminaLookup.out' into table snpArrayIllumina300" hg17
hgsql -e "alter table snpArrayIllumina300 add index name (name)" hg17
hgsql -e "alter table snpArrayIllumina300 add index chrom (chrom, bin)" hg17

#############################################################
# Illumina Hap550 and Hap650 (Heather, April 2007)
# Transfer from hg18 for Bert Gold at NCI
ssh hgwdev
cd /cluster/data/hg17/bed/illumina
hgsql hg18 < getHg18-550.sql > 550.hg18
hgsql hg18 < getHg18-650.sql > 650.hg18
# get name, chrom, chromStart, chromEnd, strand, observed from snp125
# where class = "single" and locType = "exact" and chromEnd = chromStart + 1
# Including tri/quad allelic and multiple-aligning for now
hgsql hg17 < getHg17.sql > snp125single.hg17
# sort and join
sort 550.hg18 > 550.hg18.sort
sort 650.hg18 > 650.hg18.sort
sort snp125single.hg17 > snp125single.hg17.sort
# 560704 lines in 550.join
# 660137 lines in 650.join
# 687 lines in 550.missing
# 706 lines in 650.missing
join 550.hg18.sort snp125single.hg17.sort > 550.join
join 650.hg18.sort snp125single.hg17.sort > 650.join
join -v 1 550.hg18.sort snp125single.hg17.sort > 550.missing
join -v 1 650.hg18.sort snp125single.hg17.sort > 650.missing
# fix column order
awk '{print $2, $3, $4, $1, 0, $5, $6}' 550.join > 550.bed
awk '{print $2, $3, $4, $1, 0, $5, $6}' 650.join > 650.bed
# load
hgLoadBed hg17 snpArrayIllumina550 550.bed -sqlTable=snpArrayIllumina550.sql
hgLoadBed hg17 snpArrayIllumina650 650.bed -sqlTable=snpArrayIllumina650.sql
# indices
mysql> alter table snpArrayIllumina550 add index name (name);
mysql> alter table snpArrayIllumina550 add index chrom (chrom, bin);
mysql> alter table snpArrayIllumina650 add index name (name);
mysql> alter table snpArrayIllumina650 add index chrom (chrom, bin);
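# consistency check (not in the original run): joined + missing should
# account for each hg18 input list, assuming rs names are unique on both
# sides of the join:
#   560704 + 687 = 561391 lines expected in 550.hg18
#   660137 + 706 = 660843 lines expected in 650.hg18
wc -l 550.hg18 650.hg18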
#############################################################
# Affy 500K (Heather, September 2006)
# look up rsId using position
ssh hgwdev
cd /cluster/data/hg17/bed/snp/affyData/500K
# awk to create bed format from tsv files
/bin/csh cmds.csh
hgsql hg17 < affy250Nsp.sql
hgsql hg17 < affy250Sty.sql
hgsql -e "load data local infile 'Mapping250K_Nsp.bed' into table affy250Nsp" hg17
hgsql -e "load data local infile 'Mapping250K_Sty.bed' into table affy250Sty" hg17
# look up dbSNP rsIDs using position

# affy250Nsp
# 4311 missing, 7276 multiple
/cluster/home/heather/kent/src/hg/snp/snpLoad/affyLookup hg17 affy250Nsp snp125
mv affyLookup.out affy250Nsp.bed
mv affyLookup.err affy250Nsp.err
hgsql hg17 < snpArrayAffy250Nsp.sql
hgLoadBed hg17 snpArrayAffy250Nsp affy250Nsp.bed -sqlTable=snpArrayAffy250Nsp.sql -tab
hgsql -e "alter table snpArrayAffy250Nsp add index name (name)" hg17
hgsql -e "alter table snpArrayAffy250Nsp add index chrom (chrom, bin)" hg17

# affy250Sty
# 3540 missing, 6901 multiple
/cluster/home/heather/kent/src/hg/snp/snpLoad/affyLookup hg17 affy250Sty snp125
mv affyLookup.out affy250Sty.bed
mv affyLookup.err affy250Sty.err
hgsql hg17 < snpArrayAffy250Sty.sql
hgLoadBed hg17 snpArrayAffy250Sty affy250Sty.bed -sqlTable=snpArrayAffy250Sty.sql -tab
hgsql -e "alter table snpArrayAffy250Sty add index name (name)" hg17
hgsql -e "alter table snpArrayAffy250Sty add index chrom (chrom, bin)" hg17

#############################################################
# Affy 10K (Sept. 2006, Heather)
# look up rsId using position
ssh hgwdev
cd /cluster/data/hg17/bed/snp/affyData/10K100Kagain

# affy10
# 14 missing, 807 multiple
cp affy10K.txt affy10Temp.bed
hgLoadBed hg17 affy10Temp affy10Temp.bed -sqlTable=affy10Temp.sql -tab -noBin
/cluster/home/heather/kent/src/hg/snp/snpLoad/affyLookup hg17 affy10Temp snp125
mv affyLookup.out affy10.bed
mv affyLookup.err affy10.err
hgLoadBed hg17 snpArrayAffy10 affy10.bed -sqlTable=snpArrayAffy10.sql -tab

# affy10v2
# 12 missing, 716 multiple
hgLoadBed hg17 affy10v2Temp affy10v2Temp.bed -sqlTable=affy10v2Temp.sql -tab -noBin
/cluster/home/heather/kent/src/hg/snp/snpLoad/affyLookup hg17 affy10v2Temp snp125
mv affyLookup.out affy10v2.bed
mv affyLookup.err affy10v2.err
hgLoadBed hg17 snpArrayAffy10v2 affy10v2.bed -sqlTable=snpArrayAffy10v2.sql -tab

# affy50HindIII
# 156 missing, 1396 multiple
hgLoadBed hg17 affy50HindIIITemp affy50HindIII.bed -sqlTable=affy50HindIIITemp.sql -tab -noBin
/cluster/home/heather/kent/src/hg/snp/snpLoad/affyLookup hg17 affy50HindIIITemp snp125
mv affyLookup.out affy50HindIII.bed
mv affyLookup.err affy50HindIII.err
hgLoadBed hg17 snpArrayAffy50HindIII affy50HindIII.bed -sqlTable=snpArrayAffy50HindIII.sql -tab
hgsql -e "alter table snpArrayAffy50HindIII add index name (name)" hg17
hgsql -e "alter table snpArrayAffy50HindIII add index chrom (chrom, bin)" hg17

# affy50XbaI
# 115 missing, 1745 multiple
hgLoadBed hg17 affy50XbaITemp affy50XbaI.bed -sqlTable=affy50XbaITemp.sql -tab -noBin
/cluster/home/heather/kent/src/hg/snp/snpLoad/affyLookup hg17 affy50XbaITemp snp125
mv affyLookup.out affy50XbaI.bed
mv affyLookup.err affy50XbaI.err
hgLoadBed hg17 snpArrayAffy50XbaI affy50XbaI.bed -sqlTable=snpArrayAffy50XbaI.sql -tab
hgsql -e "alter table snpArrayAffy50XbaI add index name (name)" hg17
hgsql -e "alter table snpArrayAffy50XbaI add index chrom (chrom, bin)" hg17
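# Note: the 10K tables above were loaded without the name/chrom indexes the
# other array tables get; if needed, the same pattern applies:
hgsql -e "alter table snpArrayAffy10 add index name (name)" hg17
hgsql -e "alter table snpArrayAffy10 add index chrom (chrom, bin)" hg17
hgsql -e "alter table snpArrayAffy10v2 add index name (name)" hg17
hgsql -e "alter table snpArrayAffy10v2 add index chrom (chrom, bin)" hg17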
"http://www.bx.psu.edu/~james/esperr_rp_7way_scores/genome_scores_hg17/chr${C}.scores.truncated.bz2" done # real 115m1.855s wget --timestamping \ "http://www.bx.psu.edu/~james/esperr_rp_7way_scores/genome_scores_hg17/trackDb.html" -O description.html time for C in 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 X Y do bzcat chr${C}.scores.truncated.bz2 done | wigEncode -noOverlap stdin regPotential7X.wig regPotential7X.wib # Converted stdin, upper limit 1.00, lower limit -0.00 # real 33m48.487s # Loading the table on hgwdev ssh hgwdev cd /cluster/data/hg17/bed/regPotential7X ln -s /cluster/data/hg17/bed/regPotential7X/regPotential7X.wib \ /gbdb/hg17/wib/regPotential7X.wib # using the tmpDir is faster since it is on local disk and it will # clean up any temporary .tab file it creates there time hgLoadWiggle -tmpDir=/scratch/tmp \ hg17 regPotential7X regPotential7X.wig # How about a histogram of the data. ssh kolossus cd /cluster/data/hg17/bed/regPotential7X time hgWiggle -verbose=2 -doHistogram -hBinSize=0.01 -hBinCount=100 \ -hMinVal=0.0 -db=hg17 regPotential7X > histogram.data 2>&1 # real 2m48.810s # 73 % of the data values are zero # create download gzip files from the bz2 files: for F in chr*.scores.truncated.bz2 do C=`echo $F | awk -F'.' '{print $1}'` echo -n "${C}.regPotential7X.hg17.gz working ... " bzcat ${F} | gzip > ${C}.regPotential7X.hg17.gz echo done ######################################################################### ####### RE-BUILD RGD HUMAN QTL TRACKS (DONE 06/21/06 Fan) ############## # DELETED RECORD FROM rgdQtlLink SO CONSISTENT WITH REMOVAL FROM rgdQtl # (DONE, 2006-06-30, hartera) ssh hgwdev mkdir -p /cluster/store8/rgd/human12062005 rm /cluster/data/hg17/bed/rgdQtl ln -s /cluster/store8/rgd/human12062005 /cluster/data/hg17/bed/rgdQtl cd /cluster/data/hg17/bed/rgdQtl # download data files from RGD wget --timestamp ftp://rgd.mcw.edu/pub/RGD_genome_annotations/human/rgd_human_qtl_12062005.gff # remove extra line feed character at the end of lines rmLf rgd_human_qtl_12062005.gff > rgdQtl.gff # create rgdQtl.tab awk '{print $1"\t"$4"\t"$5"\t"$10}' rgdQtl.gff |sed -e 's/Chr/chr/g'| \ sed -e 's/"//g' |sed -e 's/RGD://g' | sed -e 's/;//g' |sort -u > rgdQtl.tab # create rgdQtlLink.tab cat rgdQtl.gff |cut -f 9 |sed -e 's/; Note /\t/g'|\ sed -e 's/Alignment //' |sed -e 's/;Note /\t/' |\ sed -e 's/"//g' |sed -e 's/RGD://' >j.tmp cut -f 2 j.tmp >j.1 cut -f 1,3 j.tmp >j.2 paste j.1 j.2 |sort -u >rgdQtlLink.tab rm j.1 j.2 j.tmp # load rgdQtl table hgLoadBed hg17 rgdQtl rgdQtl.tab # check rgdQtl table checkTableCoords hg17 rgdQtl # Go the following error messages: #hg17.rgdQtl has 2 records with end > chromSize. #hg17.rgdQtl has 2 records with end > chromSize. #hg17.rgdQtl has 2 records with end > chromSize. #hg17.rgdQtl has 1 records with end > chromSize. #hg17.rgdQtl has 2 records with end > chromSize. #hg17.rgdQtl has 3 records with end > chromSize. #hg17.rgdQtl has 1 records with end > chromSize. #hg17.rgdQtl has 3 records with end > chromSize. #hg17.rgdQtl has 2 records with end > chromSize. #hg17.rgdQtl has 1 records with end > chromSize. #hg17.rgdQtl has 3 records with end > chromSize. #hg17.rgdQtl has 2 records with end > chromSize. #hg17.rgdQtl has 1 records with end > chromSize. #hg17.rgdQtl has 1 records with end > chromSize. #hg17.rgdQtl has 1 records with end > chromSize. #hg17.rgdQtl has 1 records with end > chromSize. #hg17.rgdQtl has 1 records with end < start. 
hgsql hg17 -N -e 'select "do1", name, c.size from rgdQtl r, chromInfo c where chromEnd > c.size and r.chrom=c.chrom' > doall
cat << '_EOF_' > do1
hgsql hg17 -e "update rgdQtl set chromEnd = '${2}' where name='${1}'"
'_EOF_'
chmod +x do*
doall
checkTableCoords hg17 rgdQtl
#hg17.rgdQtl has 1 records with end < start.
hgsql hg17 -e 'select * from rgdQtl where chromEnd < chromStart'
# bin   chrom   chromStart   chromEnd   name
# 9     chr10   7135612      371019     BW63_H
# 0     chr20   77628133     5242324    AASTH39_H
# Don't know why checkTableCoords only catches one of the two errors.
hgsql hg17 -e "update rgdQtl set chromStart = 271019 where name='BW63_H'"
hgsql hg17 -e "update rgdQtl set chromEnd = 7135612 where name='BW63_H'"
# Delete the following record.  The RGD QTL is very questionable.
hgsql hg17 -e "delete from rgdQtl where name='AASTH39_H'"
# load rgdQtlLink table
hgsql hg17 -e "drop table hg17.rgdQtlLink;"
hgsql hg17 < ~/kent/src/hg/lib/rgdQtlLink.sql
hgsql hg17 -e 'load data local infile "rgdQtlLink.tab" into table hg17.rgdQtlLink;'
# Delete the record from rgdQtlLink table that was removed from the rgdQtl
# table above.  (hartera, 2006-06-30)
hgsql hg17 -e "delete from rgdQtlLink where name='AASTH39_H'"

########################################################################
#########################################################################
# Reload omimTitle table  Belinda Giardine June 28, 2006
# fetched omim.txt.Z from OMIM downloads
# parse out title lines (*FIELD* TI)
convertTitle < omim.txt > omimTitle.txt
# load into omimTitle table
truncate table omimTitle;
load data local infile "omimTitle.txt" into table omimTitle;

#############################################################
# Lift SV track from hg16 (Heather, July 2006)
# hg16 SV track is comprised of 7 subtracks:
#   cnpFosmid, cnpSebat, cnpIafrate, cnpSharp, delConrad, delMccarroll, delHinds
# Use the same table formats as hg16; pre-create
# (No bin for del tables)
cd /cluster/data/hg17/bed
mkdir svMixed
cd svMixed
# I got hg17 coords from Andy Sharp for cnpFosmid and delHinds
trimFosmid.pl < cnpFosmid.txt > cnpFosmid.bed
hgLoadBed -tab hg17 cnpFosmid cnpFosmid.bed
hinds.pl < hinds.txt > delHinds.bed
hgLoadBed -tab -noBin hg17 delHinds delHinds.bed

# (7-27-2006 Brooke Rhead -- edited the cnpFosmid table)
# According to Andy Sharp, the name='Gap' items should be removed from
# cnpFosmid.  I dumped the table, removed the 'Gap' lines, then dumped the
# table again.
cd /cluster/data/hg17/bed/svMixed
hgsql hg17 -e "select * from cnpFosmid" > cnpFosmid_withGaps.bed
hgsql hg17 -e "delete from cnpFosmid where name='Gap'"
hgsql hg17 -e "select * from cnpFosmid" > cnpFosmid_withoutGaps.bed

# Simple lifts for delMccarroll
cat << '_EOF_' > liftMccarroll.csh
#!/bin/csh
hgsql -N -e 'select * from delMccarroll' hg16 > delMccarroll.hg16
liftOver -minMatch=0.7 delMccarroll.hg16 /gbdb/hg16/liftOver/hg16ToHg17.over.chain.gz delMccarroll.bed delMccarroll.err
hgLoadBed -sqlTable=delMccarroll.sql -tab -noBin hg17 delMccarroll delMccarroll.bed
'_EOF_'

# Lift both chromStart/chromEnd and thickStart/thickEnd for delConrad and join
cat << '_EOF_' > liftConrad.csh
#!/bin/csh
hgsql -N -e 'select chrom, chromStart, chromEnd, name, score, strand from delConrad' hg16 > delConrad.hg16.1
hgsql -N -e 'select chrom, thickStart, thickEnd, name, score, strand from delConrad' hg16 > delConrad.hg16.2
liftOver -minMatch=0.7 delConrad.hg16.1 /gbdb/hg16/liftOver/hg16ToHg17.over.chain.gz delConrad.tmp.1 delConrad.err.1
liftOver -minMatch=0.7 delConrad.hg16.2 /gbdb/hg16/liftOver/hg16ToHg17.over.chain.gz delConrad.tmp.2 delConrad.err.2
trimConrad.pl < delConrad.tmp.1 > delConrad.trim.1
trimConrad.pl < delConrad.tmp.2 > delConrad.trim.2
sort delConrad.trim.1 > delConrad.sort.1
sort delConrad.trim.2 > delConrad.sort.2
join delConrad.sort.1 delConrad.sort.2 > delConrad.join
awk '{print $2, $3, $4, $1, 1000, $5, $7, $8}' delConrad.join > delConrad.bed
hgLoadBed -sqlTable=delConrad.sql -noBin hg17 delConrad delConrad.bed
'_EOF_'

# Andy Sharp says the Sebat data has already been lifted, so be conservative here
# Create hg16.cnpSebatLiftCandidate that excludes 5 rows that had wild proliferations
cat << '_EOF_' > liftSebat.csh
hgsql -N -e 'select chrom, chromStart, chromEnd, name, probes, individuals from cnpSebatLiftCandidate' hg16 > cnpSebat.hg16
liftOver -minMatch=0.7 -bedPlus=4 cnpSebat.hg16 /gbdb/hg16/liftOver/hg16ToHg17.over.chain.gz cnpSebat.bed cnpSebat.err
hgLoadBed -sqlTable=cnpSebat.sql -tab hg17 cnpSebat cnpSebat.bed
'_EOF_'

# For Andy's data, use bacEndPairs first, then lift the remainder
cat << '_EOF_' > liftSharp.csh
# assumes a copy of hg16.cnpSharp in hg17.cnpSharpHg16Copy
/cluster/home/heather/kent/src/hg/snp/snpLoad/cnpLookup hg17 bacEndPairs cnpSharpHg16Copy cnpSharpLookup.out cnpSharpLookup.lift cnpSharpLookup.log
sed -e 's/Gain and Loss/GainAndLoss/' cnpSharpLookup.lift > cnpSharpLookup.lift.fix
mv cnpSharpLookup.lift.fix cnpSharpLookup.lift
liftOver -minMatch=0.7 -bedPlus=4 cnpSharpLookup.lift /gbdb/hg16/liftOver/hg16ToHg17.over.chain.gz cnpSharp.bed cnpSharp.err
sed -e 's/GainAndLoss/Gain And Loss/' cnpSharp.bed > cnpSharp.bed.fix
mv cnpSharp.bed.fix cnpSharp.bed
hgLoadBed -tab -sqlTable=cnpSharp.sql hg17 cnpSharp cnpSharpLookup.out
hgLoadBed -tab -oldTable hg17 cnpSharp cnpSharp.bed
'_EOF_'

# For the Iafrate data, the BAC End lookup wasn't good, so just lift
# Create hg16.cnpIafrateLiftCandidate that excludes 2 rows that had wild proliferations
cat << '_EOF_' > liftIafrate.csh
hgsql -N -e 'select chrom, chromStart, chromEnd, name, variationType, score from cnpIafrateLiftCandidate' hg16 > cnpIafrate.hg16
sed -e 's/Gain and Loss/GainAndLoss/' cnpIafrate.hg16 > cnpIafrate.hg16.fix
mv cnpIafrate.hg16.fix cnpIafrate.hg16
liftOver -minMatch=0.7 -bedPlus=4 cnpIafrate.hg16 /gbdb/hg16/liftOver/hg16ToHg17.over.chain.gz cnpIafrate.bed cnpIafrate.err
sed -e 's/GainAndLoss/Gain And Loss/' cnpIafrate.bed > cnpIafrate.bed.fix
mv cnpIafrate.bed.fix cnpIafrate.bed
hgLoadBed -sqlTable=cnpIafrate.sql -tab hg17 cnpIafrate cnpIafrate.bed
'_EOF_'
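# Why the GainAndLoss sed dance in the scripts above: liftOver with -bedPlus
# splits fields on whitespace, so the multi-word variationType value
# "Gain and Loss" has to be collapsed to a single token before lifting and
# restored afterward.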
##############################################################################
# Add HapMap CNVRs from Matt Hurles (Heather Dec 2006)
ssh hgwdev
cd /cluster/data/hg17/bed/svRedon
# File from Matthew Hurles (meh@sanger.ac.uk) was essentially bed 4
# I decided to use bed 6 with score always 0 and strand always +
awk '{printf "%s\t%d\t%d\tcnp%s\t0\t%s\n", $1, $4, $5, $3, $7}' input.gff > input.bed
hgLoadBed hg17 cnpRedon input.bed

##############################################################################
# dbRIP POLYALUL1SVA track added (2006-07-14 - DONE - Hiram)
# dbRIP polyAluL1SVA
# Data provider: Dr. Liang at the Liang lab:
#   http://falcon.roswellpark.org/index.html
#   Ping.Liang@roswellpark.org
# Adding this track is a new data type in our browser.
# data definitions for dbRIP and polyGenotype were added to
# the hg/lib/ directory:
# -rw-rw-r--  1    351 Jul 13 12:20 polyGenotype.as
# -rw-rw-r--  1    694 Jul 13 12:22 polyGenotype.sql
# -rw-rw-r--  1   6398 Jul 13 12:22 polyGenotype.c
# -rw-rw-r--  1    980 Jul 10 17:59 dbRIP.as
# -rw-rw-r--  1  11408 Jul 13 11:16 dbRIP.c
# -rw-rw-r--  1   1578 Jul 13 12:06 dbRIP.sql
# With associated .h files in hg/inc/
# -rw-rw-r--  1   4600 Jul 10 18:00 dbRIP.h
# -rw-rw-r--  1   4375 Jul 13 16:16 polyGenotype.h
# Changes in hgTracks and hgc to make this track appear as it does
# at their browser:
#   http://falcon.roswellpark.org:9090/cgi-bin/hgTables
# For this first instance of the track, the data was obtained
# directly from their Genome browser via the tables browser,
# dumping the tables:
#   hg17.polyAluL1 and hg17.polyGenotype
# saving these data dumps to:
# (after a couple of versions were used ...)
ssh hgwdev
mkdir /cluster/data/hg17/bed/dbRIP
cd /cluster/data/hg17/bed/dbRIP
# -rw-rw-r--  1 994485 Aug  1 16:03 dbRIP.2006-08-01.txt.gz
# -rw-rw-r--  1  18532 Aug  1 16:05 polyGenotype.2006-08-01.txt.gz
# Rearrange their data columns to more closely match the
# standard BED definitions, and split into three different
# data sets:
zcat dbRIP.2006-08-01.txt.gz | headRest 1 stdin | awk -F'\t' '
{
    chromStart = $6
    chromStart -= 1
    chromEnd = $7
    if (match($1,"^RIP_SVA_.*")) {
        printf "%s\t%s\t%s\t%s\t0\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n", $5, chromStart, chromEnd, $1, $12, $2, $3, $4, $8, $9, $10, $11, $15, $13, $14, $16, $17, $18, $19, $20, $21
    }
}' | sort -k1,1 -k2,2n > dbRIP.SVA.txt
zcat dbRIP.2006-08-01.txt.gz | headRest 1 stdin | awk -F'\t' '
{
    chromStart = $6
    chromStart -= 1
    chromEnd = $7
    if (match($1,"^RIP_L1_.*")) {
        printf "%s\t%s\t%s\t%s\t0\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n", $5, chromStart, chromEnd, $1, $12, $2, $3, $4, $8, $9, $10, $11, $15, $13, $14, $16, $17, $18, $19, $20, $21
    }
}' | sort -k1,1 -k2,2n > dbRIP.L1.txt
zcat dbRIP.2006-08-01.txt.gz | headRest 1 stdin | awk -F'\t' '
{
    chromStart = $6
    chromStart -= 1
    chromEnd = $7
    if (match($1,"^RIP_Alu_.*")) {
        printf "%s\t%s\t%s\t%s\t0\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n", $5, chromStart, chromEnd, $1, $12, $2, $3, $4, $8, $9, $10, $11, $15, $13, $14, $16, $17, $18, $19, $20, $21
    }
}' | sort -k1,1 -k2,2n > dbRIP.Alu.txt
# Create three specific sql table create definitions:
sed -e "s/dbRIP/dbRIP_SVA/" $HOME/kent/src/hg/lib/dbRIP.sql > dbRIP_SVA.sql
sed -e "s/dbRIP/dbRIP_L1/" $HOME/kent/src/hg/lib/dbRIP.sql > dbRIP_L1.sql
sed -e "s/dbRIP/dbRIP_Alu/" $HOME/kent/src/hg/lib/dbRIP.sql > dbRIP_Alu.sql
# And loading those three data tables:
hgLoadBed -verbose=3 -maxChromNameLength=6 -strict -tab -notItemRgb \
    -sqlTable=dbRIP_SVA.sql hg17 dbRIP_SVA dbRIP.SVA.txt
hgLoadBed -verbose=3 -maxChromNameLength=6 -strict -tab -notItemRgb \
    -sqlTable=dbRIP_L1.sql hg17 dbRIP_L1 dbRIP.L1.txt
hgLoadBed -verbose=3 -maxChromNameLength=6 -strict -tab -notItemRgb \
    -sqlTable=dbRIP_Alu.sql hg17 dbRIP_Alu dbRIP.Alu.txt
# And an associated table of genotype frequencies.
# Add three extra columns to the original data to provide a better handle
# on MySQL lookups for allele frequency:
hgsql hg17 -e "drop table polyGenotype;"
hgsql hg17 < $HOME/kent/src/hg/lib/polyGenotype.sql
zcat polyGenotype.2006-08-01.txt.gz | headRest 1 stdin | awk -F'\t' '
{
    sampleSize = $3 + $4 + $5
    plus = ($3 * 2) + $4
    minus = ($5 * 2) + $4
    if ((plus + minus) < 1) {
        alleleFreq = 0
    } else {
        alleleFreq = plus / (plus + minus)
    }
    if (sampleSize > 0) {
        heteroZyg = (2 * alleleFreq * (1.0 - alleleFreq)) * ((sampleSize * 2)/((sampleSize * 2) - 1))
    } else {
        heteroZyg = 2 * alleleFreq * (1.0 - alleleFreq)
    }
    printf "%s\t%s\t%d\t%d\t%d\t%.3f\t%.3f\t%.3f\n", $1, $2, $3, $4, $5, sampleSize, alleleFreq, heteroZyg
}' > polyGenotype.txt
hgsql hg17 -e \
    'load data local infile "polyGenotype.txt" into table polyGenotype;'
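# Worked example of the computation above (values invented for illustration):
# for a row with $3=10 homozygous(+), $4=5 heterozygous, $5=5 homozygous(-):
#   sampleSize = 20, plus = 25, minus = 15
#   alleleFreq = 25 / 40 = 0.625
#   heteroZyg  = 2 * 0.625 * 0.375 * (40 / 39) = 0.481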
# A composite track was added to human/hg17/trackDb.ra to contain
# these three tracks, and search methods to get the name column
# participating in the search.  Need to figure out how to get some
# of the other text-rich columns participating in the search.

##############################################################################
# hg17 -> hg15 LIFTOVER CHAINS (DONE 7/27/06 Fan)

# I used a size of 10kb instead of 3kb for the split (blat query) sizes in
# hg15.  This had a huge effect on the number of hits in the blat, which
# then had a huge effect on the number of chains.  I should also mention
# that hg15 chromosomes chr1 and chr2 were split further
# into more than a single query file.  This helped a LOT in avoiding
# cluster hippos classically associated with those chroms.

######## LIFTOVER PREPARATION

# Split up hg15
ssh pk
cd /san/sanVol1/scratch/hg15
mkdir -p liftSplits/{split,lift}
bash
for fa in /cluster/data/hg15/?{,?}/*.fa; do
    c=`basename $fa .fa`
    echo $c
    faSplit -lift=liftSplits/lift/${c}.lft size $fa -oneFile 10000 liftSplits/split/$c
done
mkdir -p biggerSplits/split
cd biggerSplits/
ln -s ../liftSplits/lift
cd split/
ln -s ../../liftSplits/split/* .
faSplit sequence chr1.fa 5 chr1_
faSplit sequence chr2.fa 5 chr2_
rm chr{1,2}.fa

# Make some dirs
# cd /san/sanVol1/scratch
# mkdir -p hg17
# Copy 11.ooc files to hg17 subdirectory.
# cp -p /cluster/store5/gs.16/build33/11.ooc hg17

## First, copy over scripts.  (Already done before)
# mkdir -p /san/sanVol1/scratch/fan
# cp -p /san/sanVol1/scratch/fan/*.sh /san/sanVol1/scratch/fan
# cp /san/sanVol1/scratch/andy/psl.header /san/sanVol1/scratch/fan

######## LIFTOVER BLATING

# HG17
ssh kk
cd /cluster/data/hg17
#makeLoChain-align hg17 /scratch/hg/hg17/nib hg15 /san/sanVol1/scratch/hg15/biggerSplits/split
makeLoChain-align hg17 /scratch/hg/hg17/bothMaskedNibs hg15 /san/sanVol1/scratch/hg15/liftOver/biggerSplits/split
# Completed: 2392 of 2392 jobs
# CPU time in finished jobs:   25651277s  427521.28m  7125.35h  296.89d  0.813 y
# IO & Wait Time:                 74118s    1235.30m    20.59h    0.86d  0.002 y
# Average job time:               10755s     179.25m     2.99h    0.12d
# Longest running job:                0s       0.00m     0.00h    0.00d
# Longest finished job:           82545s    1375.75m    22.93h    0.96d
# Submission to last job:         82579s    1376.32m    22.94h    0.96d

ssh kkstore02
cd /cluster/data/hg17
cd bed
mv blat.hg15.2006-07-25 /san/sanVol1/scratch/hg17

ssh pk
cd /san/sanVol1/scratch/hg17/blat.hg15.2006-07-25/run/
sed 's/^blat/blat.sh/; s/\}.*$/}/' spec | awk '{print "/san/sanVol1/scratch/fan/" $0 " hg17ToHg15"}' > newspec
para create newspec
para try
para push
# Completed: 2392 of 2392 jobs
# CPU time in finished jobs:     612316s   10205.26m   170.09h    7.09d  0.019 y
# IO & Wait Time:                 12421s     207.02m     3.45h    0.14d  0.000 y
# Average job time:                 261s       4.35m     0.07h    0.00d
# Longest running job:                0s       0.00m     0.00h    0.00d
# Longest finished job:            3524s      58.73m     0.98h    0.04d
# Submission to last job:          3588s      59.80m     1.00h    0.04d

######## LIFTOVER CHAINING

# LIFTING
ssh pk
cd /san/sanVol1/scratch/fan
cp mm7SplitLift.sh hg15SplitLift.sh
# change andy to fan, mm7 to hg15, and chrX to chr2, and remove chrUn_random
vi hg15SplitLift.sh
cat << 'EOF' > hg15ChainMergeSplit.sh
#!/bin/bash
cp -r chainRaw/ /scratch/fan/hg15Lifts
pushd /scratch/fan/hg15Lifts
mkdir chain
/cluster/bin/x86_64/chainMergeSort chainRaw/*.chain | /cluster/bin/x86_64/chainSplit chain stdin
cp -r chain `dirs +1`
rm -rf chain chainRaw
'EOF'
chmod +x hg15ChainMergeSplit.sh

# HG17
cd /san/sanVol1/scratch/hg17/blat.hg15.2006-07-25/raw
/san/sanVol1/scratch/fan/hg15SplitLift.sh
# There was an extra file, nib22.fa, under /cluster/data/hg15/nib, which
# should not be there:
# -rw-rw-r--  1 2429 protein 50466533 May 20  2003 nib22.fa
# This caused hg15SplitLift.sh to end abnormally.
cd ../
mkdir chainRun chainRaw
cd chainRun
cat > gsub << 'EOF'
#LOOP
/cluster/bin/x86_64/axtChain -verbose=0 -linearGap=medium -psl $(path1) /scratch/hg/hg17/bothMaskedNibs /san/sanVol1/scratch/hg15/nib {check out line+ ../chainRaw/$(root1).chain}
#ENDLOOP
'EOF'
ls -1S ../psl/*.psl > in.lst
gensub2 in.lst single gsub spec
para create spec
para try
para push
para time
# Completed: 44 of 44 jobs
# CPU time in finished jobs:       3596s      59.94m     1.00h    0.04d  0.000 y
# IO & Wait Time:                   919s      15.31m     0.26h    0.01d  0.000 y
# Average job time:                 103s       1.71m     0.03h    0.00d
# Longest running job:                0s       0.00m     0.00h    0.00d
# Longest finished job:             274s       4.57m     0.08h    0.00d
# Submission to last job:           284s       4.73m     0.08h    0.00d

######### CHAINMERGE/NET/NETSUBSET
ssh kolossus
mkdir -p /scratch/fan/hg15Lifts
cd /scratch/fan/hg15Lifts
cp -r /san/sanvol1/scratch/hg17/blat.hg15.2006-07-25/chainRaw/ .
mkdir chain
/cluster/bin/x86_64/chainMergeSort chainRaw/* | /cluster/bin/x86_64/chainSplit chain stdin
# about 30 minutes.
cp -rp chain /san/sanvol1/scratch/hg17/blat.hg15.2006-07-25/
rm -rf chain
rm -rf chainRaw

ssh pk
cd /san/sanvol1/scratch/fan
cat << 'EOF' > netOver.sh
#!/bin/bash
chain=$1
chrom=`basename $chain .chain`
sizesHGOld=$2
sizesHG15=/cluster/data/hg15/chrom.sizes
chainDir=`dirname $chain`
blatDir=`dirname $chainDir`
net=${blatDir}/net/${chrom}.net
over=${blatDir}/over/${chrom}.over
mkdir -p ${blatDir}/{over,net}
/cluster/bin/x86_64/chainNet $chain $sizesHGOld $sizesHG15 $net /dev/null
/cluster/bin/x86_64/netChainSubset $net $chain $over
'EOF'
chmod +x netOver.sh
mkdir netRun
cd netRun/
find /san/sanVol1/scratch/hg17/blat.hg15.2006-07-25/chain -name "*.chain" \
    | awk '{print "/san/sanVol1/scratch/fan/netOver.sh " $1 " /cluster/data/hg17/chrom.sizes"}' > spec
para create spec
para push
para time
# Completed: 46 of 46 jobs
# CPU time in finished jobs:        438s       7.30m     0.12h    0.01d  0.000 y
# IO & Wait Time:                   118s       1.97m     0.03h    0.00d  0.000 y
# Average job time:                  12s       0.20m     0.00h    0.00d
# Longest running job:                0s       0.00m     0.00h    0.00d
# Longest finished job:              28s       0.47m     0.01h    0.00d
# Submission to last job:            67s       1.12m     0.02h    0.00d

########## FINISHING
ssh hgwdev
# HG17
cd /san/sanvol1/scratch/hg17/blat.hg15.2006-07-25/over
cat * >> ../hg17ToHg15.over.chain
cd ../
rm -rf psl/ net/ chain/ chainRaw/ over/
cd ../
cp -rp blat.hg15.2006-07-25/ /cluster/data/hg17/bed
cd /cluster/data/hg17/bed
ln -s blat.hg15.2006-07-25 blat.hg15
ln -s `pwd`/blat.hg15/hg17ToHg15.over.chain liftOver/hg17ToHg15.over.chain
ln -s `pwd`/liftOver/hg17ToHg15.over.chain /gbdb/hg17/liftOver/hg17ToHg15.over.chain
mkdir -p /usr/local/apache/htdocs/goldenPath/hg17/liftOver
gzip /cluster/store5/gs.18/build35/bed/blat.hg15/hg17ToHg15.over.chain
ln -s /cluster/store5/gs.18/build35/bed/blat.hg15/hg17ToHg15.over.chain.gz /gbdb/hg17/liftOver/
cp -p /cluster/store5/gs.18/build35/bed/blat.hg15/hg17ToHg15.over.chain.gz /cluster/data/hg17/bed/liftOver
cd /usr/local/apache/htdocs/goldenPath/hg17/liftOver
ln -s /cluster/store5/gs.18/build35/bed/blat.hg15/hg17ToHg15.over.chain.gz hg17ToHg15.over.chain.gz
hgAddLiftOverChain hg17 hg15

############################################################################
############################################################################
# Reload CCDS (ccdsInfo, ccdsGene, ccdsKgMap) (2006-08-15 markd)
cd /cluster/data/genbank/data/ccds/hg17
ftp ftp-private.ncbi.nih.gov  (user ccds, needs password)
ftp> get CCDS.20060815.tar.gz
mkdir /scratch/tmp/ccds
cd /scratch/tmp/ccds
tar -zxf /cluster/data/genbank/data/ccds/hg17/CCDS.20060815.tar.gz
# import ccds database tables
hgsql -e 'create database ccds'
hgsql ccds j.kg1
hgsql hg17 -N -e 'select kgId,hprdId from hprdToUniProt, kgXref where uniProtId=spId' > j.kg2
cat j.kg1 j.kg2 | sort -u > knownToHprd.tab
wc knownToHprd.tab
hgsql hg17 -e 'drop table knownToHprd'
hgsql hg17 < ~/src/hg/lib/knownToHprd.sql
hgsql hg17 -e 'load data local infile "knownToHprd.tab" into table knownToHprd'
hgsql hg17 -e 'select count(*) from knownToHprd'
# 19,345 records created.
# remove temporary files.
rm j*
# Do the same for hg17.  See hg17.txt for details.

############################################################################
# Reload CCDS (ccdsInfo, ccdsGene, ccdsKgMap) (2006-09-20 markd)
# Reloaded due to bug that results in multiple versions of the same accession
# in the ccdsInfo table.
cd /cluster/data/genbank/data/ccds/hg17
ftp ftp-private.ncbi.nih.gov  (user ccds, needs password)
get CCDS.20060920.tar.gz
mkdir /scratch/tmp/ccds
cd /scratch/tmp/ccds
tar -zxf /cluster/data/genbank/data/ccds/hg17/CCDS.20060920.tar.gz
# import ccds database tables
hgsql -e 'drop database ccds; create database ccds'
hgsql ccds 0'|\
sed -e 's/chr\t/chr/' |grep -v "chr\." |grep -v " "|sort -u >gad.bed
hgLoadBed hg17 gad gad.bed

#####################################################################
# YALE TRANSCRIPTIONALLY ACTIVE REGIONS (TARs/TransFrags) TRACK IDENTIFIED
# USING A WHOLE GENOME TILING ARRAY (DONE, 2006-10-20, hartera)
# Data is from the paper: Bertone et al. Science 24 December 2004:
# Vol. 306. no. 5705, pp. 2242 - 2246.  From Mark Gerstein's lab at Yale.
# Contact at Yale: Joel S. Rozowsky, joel.rozowsky@yale.edu
# The data consist of Transcriptionally Active Regions (TARs or TransFrags)
# found using Affymetrix genome tiling arrays.  The data is from the lab
# of Mark Gerstein at Yale.
ssh kkstore02
mkdir /cluster/data/hg17/bed/yaleBertoneTars/
cd /cluster/data/hg17/bed/yaleBertoneTars/
# download Bertone et al. data from this URL:
# http://dart.gersteinlab.org/cgi-bin/ar/download.cgi?ID=TAR_data_NCBI31.txt
# and put it in this directory.
# The sequences used to design the microarrays were from
# UCSC hg13/NCBI Build 31 so the sequences
# should be aligned again using Blat since this is probably better
# than using liftOver across so many assemblies.
# Get sequences from TARs file and put in FASTA format:
# Remove characters from Windows:
dos2unix TAR_data_NCBI31.txt
# The TARs are in order of IDs in the file so the first TAR has ID 1, the
# second is 2, up to the last which is 17517.  These IDs are used to link
# to the DART database of TARs at Yale so use these IDs in the FASTA
# header lines.  Need to add "TAR" as prefix to ID so that it is unique
# in the seq table.
awk 'BEGIN {FS="\t";n=0;}{if ($1 ~ /^chr/) print ">TAR"n"\n"$14"\n";n++;}' \
    TAR_data_NCBI31.txt > yaleBertoneTARSeqs.fa
ssh pk
mkdir -p /san/sanvol1/scratch/hg17/TARs/
cp /cluster/data/hg17/bed/yaleBertoneTars/yaleBertoneTARSeqs.fa \
    /san/sanvol1/scratch/hg17/TARs/
# Set up to Blat the TAR sequences against hg17
cd /cluster/data/hg17/bed/yaleBertoneTars
ls -1 /san/sanvol1/scratch/hg17/TARs/yaleBertoneTARSeqs.fa > tars.lst
ls -1 /san/sanvol1/scratch/hg17/nib/*.nib > genome.lst
# output dir
mkdir psl
cat << '_EOF_' > template.sub
#LOOP
/cluster/bin/x86_64/blat -repeats=lower -minIdentity=90 -ooc=/san/sanvol1/scratch/hg17/11.ooc $(path1) $(path2) {check out line+ psl/$(root1)_$(root2).psl}
#ENDLOOP
'_EOF_'
# << for emacs
gensub2 genome.lst tars.lst template.sub para.spec
para create para.spec
para try, para check, para push ...
para time
# Completed: 46 of 46 jobs
# CPU time in finished jobs:        429s       7.16m     0.12h    0.00d  0.000 y
# IO & Wait Time:                   153s       2.54m     0.04h    0.00d  0.000 y
# Average job time:                  13s       0.21m     0.00h    0.00d
# Longest running job:                0s       0.00m     0.00h    0.00d
# Longest finished job:              38s       0.63m     0.01h    0.00d
# Submission to last job:           107s       1.78m     0.03h    0.00d
# sort and then filter
pslSort dirs raw.psl tmp psl
# use these parameters as for Genbank alignments of native mRNAs
# for finished assemblies.
pslCDnaFilter -minId=0.96 -minCover=0.25 -localNearBest=0.001 \
    -minQSize=20 -minNonRepSize=16 -ignoreNs -bestOverlap \
    raw.psl yaleBertoneTars.psl
#                         seqs    aligns
#             total:     17512     37530
# drop minNonRepSize:      121       254
#     drop minIdent:      3827     14532
#     drop minCover:       571       897
#        weird over:       232       837
#        kept weird:       197       201
#    drop localBest:      2359      3896
#              kept:     17498     17951
# 99.9% were kept.
# check how many aligned
grep '>' yaleBertoneTARSeqs.fa | wc -l
# 17517
# 99.89% of the original set of sequences are in this filtered PSL file.
pslCheck yaleBertoneTars.psl
# psl is ok
# load into database
ssh hgwdev
cd /cluster/data/hg17/bed/yaleBertoneTars
hgLoadPsl hg17 yaleBertoneTars.psl
# Add sequences to /gbdb/hg17 and to seq and extFile tables.
mkdir -p /gbdb/hg17/yaleTARs/
ln -s /cluster/data/hg17/bed/yaleBertoneTars/yaleBertoneTARSeqs.fa \
    /gbdb/hg17/yaleTARs/
hgLoadSeq hg17 /gbdb/hg17/yaleTARs/yaleBertoneTARSeqs.fa
# trackDb.ra entry is in trackDb/human/trackDb.ra and
# a description exists already as this track is also on hg18.

######################################################################
## reload tfbsCons table - it was based on a newer version of tfbs names that
# are not yet public domain (2006-11-03 - Hiram)
mkdir /cluster/data/hg17/bed/tfbsCons
cd /cluster/data/hg17/bed/tfbsCons
cp -p /cluster/store6/weirauch/TFLOC/hg17/tfbsConsSites.bed .
hgLoadBed -strict hg17 tfbsConsSites \
    -sqlTable=$HOME/kent/src/hg/lib/tfbsConsSites.sql \
    tfbsConsSites.bed -tab
# this leads to a bunch of extra names in Factors
hgsql -N -e "select name from tfbsConsSites;" hg17 | sort -u > names.new
hgsql -N -e "select name from tfbsConsFactors;" hg17 \
    | sort -u > names.factors
comm -13 names.new names.factors > names.extra.factors
for N in `cat names.extra.factors`
do
    echo "delete from tfbsConsFactors where name=\"${N}\";" hg17
    hgsql -e "delete from tfbsConsFactors where name=\"${N}\";" hg17
done

#########################################################
# BUILD GAD TRACK (Re-Re-Done, 12/12/06, Fan)
mkdir /cluster/store12/gad061211
rm /cluster/data/gad
ln -s /cluster/store12/gad061211 /cluster/data/gad
# Receive "GAD-Hg17DATA.txt" from GAD/NIA
# contact person: Shenoy, Narmada, shenoyn@grc.nia.nih.gov
hgsql hg17 -e 'drop table gadAll'
hgsql hg17 <~/src/hg/lib/gadAll.sql
hgsql hg17 -e 'load data local infile "GAD-Hg17DATA.txt" into table gadAll ignore 1 lines'
hgsql hg17 -e 'create index geneSymbol on gadAll(geneSymbol(10))'
# create gad table
hgsql hg17 -N -e \
    'select "chr",chromosome, chromStart, chromEnd, geneSymbol from gadAll where chromStart <>0 and chromosome<>""'|\
    sed -e 's/chr\t/chr/' |grep -v "chr\." |grep -v " "|sort -u >gad.bed
hgLoadBed hg17 gad gad.bed

##########################################################################
# xxBlastTab - Help filter out unwanted paralogs (Galt 2007-01-11)
#
# Background: The xxBlastTab tables are made with a simple blastall
# (blastp with -b 1) which chooses the best match. Unfortunately this
# means that if there is no proper match it will still pick something
# even though it's probably not orthologous. This is especially a problem
# in organisms like rat knownGene which has only 30% gene coverage.
# The strategy here is to filter our xxBlastTab using synteny mappings from
# the chains. This is done by simply taking $db.kg and using /gbdb/$db chains
# and pslMap to lift the genes to the target xx assembly. Then hgMapToGene
# will find which of those mapped ids have good overlap with xx.knownGene.
# The final mapping is then created by doing an inner join between
# the traditional xxBlastTab and the mapping table produced above.
# Then simply drop the old table and rename the new table.
# (see the sketch after the synBlastp runs below)
#
# We are starting with xxBlastTab tables already built in the usual way with
# blastall/blastp, probably with the doHgNearBlastp.pl script.
#
# I created a new utility script called synBlastp.csh since I have to do this
# several times.
#
# we want to update hg17 for rat and mouse,
# so check ./hgGeneData/Human/hg17/otherOrgs.ra for current settings
ssh hgwdev
synBlastp.csh hg17 rn3
#hg17.rnBlastTab results:
#new number of unique query values:
#10728
#new number of unique target values
#5177
#old number of unique query values:
#24030
#old number of unique target values
#5535
synBlastp.csh hg17 mm7
#new number of unique query values:
#25506
#new number of unique target values
#13462
#old number of unique query values:
#32951
#old number of unique target values
#14803
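# FYI: the inner-join/swap step described above amounts to roughly the
# following (a sketch only; synBlastp.csh is the real implementation.
# knownToRnSyn is a hypothetical name standing in for the pslMap/hgMapToGene
# mapping table, and blastTab tables carry query/target columns):
hgsql hg17 -e 'create table rnBlastTabNew select b.* from rnBlastTab b, knownToRnSyn m where b.query = m.name and b.target = m.value'
hgsql hg17 -e 'drop table rnBlastTab'
hgsql hg17 -e 'rename table rnBlastTabNew to rnBlastTab'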
#####################################################################
##########################################################################
# GenBank gbMiscDiff table (markd 2007-01-10)
# Supports `NCBI Clone Validation' section of mgcGenes details page
# genbank release 157.0 now contains misc_diff fields for MGC clones
# reloading mRNAs results in gbMiscDiff table being created.
./bin/gbDbLoadStep -reload -srcDb=genbank -type=mrna hg17

############
# UPDATE hg17 knownToVisiGene (DONE galt 2007-02-15)
# Create table that maps between known genes and visiGene database
# mapping to other species such as mouse, zebrafish, frog
# requires visiGene probe track vgImageProbes be created first
knownToVisiGene hg17 -fromProbePsl=vgImageProbes

#########################################################
# Chimp Paralogy data from Eichler's lab (DONE Heather Feb. 2007)
cd /cluster/data/hg17/bed/eichler
hgLoadBed hg17 chimpParalogy chimpParalogy.bed -tab -sqlTable=chimpParalogy.sql

############################################################################
# Reload CCDS (ccdsInfo, ccdsGene, ccdsKgMap) (2007-03-02 markd)
cd /cluster/data/genbank/data/ccds/
ftp ftp-private.ncbi.nih.gov (user ccds, needs password)
get CCDS.20070228.tar.gz
mkdir /scratch/tmp/ccds
cd /scratch/tmp/ccds
tar -zxf /cluster/data/genbank/data/ccds/CCDS.20070228.tar.gz
# import ccds database tables
/cluster/data/genbank/bin/x86_64/ccdsImport ccds data/*.txt
# create and load ccdsGene and ccdsInfo tables from imported database
/cluster/data/genbank/bin/x86_64/ccdsMkTables -loadDb ccds hg17 ccdsInfo ccdsGene
/cluster/data/genbank/bin/x86_64/mkCcdsGeneMap -db=hg17 -loadDb ccdsGene knownGene ccdsKgMap
checkTableCoords hg17 -verbose=2 ccdsGene
joinerCheck -database=hg17 -identifier=ccdsGeneId ~/compbio/genbank/kent/src/hg/makeDb/schema/all.joiner
# build initial version of ccdsMgcMap table, updated by nightly genbank update
/cluster/data/genbank/bin/x86_64/mkCcdsGeneMap -loadDb -db=hg17 ccdsGene mgcGenes ccdsMgcMap
# load trackDb
cd kent/src/hg/makeDb/trackDb
make alpha
# check in browser
# request push of ccdsGene ccdsInfo ccdsKgMap ccdsMgcMap
# << emacs
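# FYI: a quick check for the duplicate-accession bug noted in the
# 2006-09-20 reload above (sketch; assumes ccdsInfo stores versioned
# accessions in its mrnaAcc column):
hgsql hg17 -N -e 'select substring_index(mrnaAcc,".",1) acc, count(distinct mrnaAcc) n from ccdsInfo group by acc having n > 1'
# a clean reload should produce no rows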
#####################################################
# Vista Enhancers (galt 2007-02-23 done)
#
# Vista from Lawrence-Berkeley has assayed
# 301 human conserved non-coding intra- and inter-
# genic elements for their ability to promote
# lacZ in mouse embryos. A positive looks like
# a mouse with a purple spine.
#
# They provided a custom track with two tracks for pos and neg.
# http://enhancer-test.lbl.gov/cgi-bin/customTrack.cgi
# I am combining the tracks into one with high score for pos.
#
cd /cluster/data/hg17/bed
mkdir vistaEnhancers
cd vistaEnhancers
wget -O custTrk "http://enhancer-test.lbl.gov/cgi-bin/customTrack.cgi"
cat custTrk | head -116 | tail +2 > pos
cat custTrk | tail +118 > neg
cat pos | gawk '{print $1"\t"$2"\t"$3"\t"$4"\t900"}' > bed5
cat neg | gawk '{print $1"\t"$2"\t"$3"\t"$4"\t200"}' >> bed5
wc -l bed5
#301 bed5
hgLoadBed hg17 vistaEnhancers bed5
#Loaded 301 elements of size 5
# add to human/trackDb.ra
track vistaEnhancers
shortLabel Vista Enhancers
longLabel Vista HMR-Conserved Non-coding Human Enhancers from LBNL
group regulation
priority 93
visibility hide
color 50,70,120
type bed 5 .
useScore 1
url http://enhancer-test.lbl.gov/cgi-bin/imagedb.pl?form=presentation&show=1&experiment_id=$$

###
# UPDATES (2007-10-18, conodera)
# see also /projects/compbiousr/wet/browser/vista_enhancer/17Oct2007/Makefile
cd /projects/compbiousr/wet/browser/vista_enhancer/
# download data file from the vista browser (coordinates are for hg17)
# http://enhancer.lbl.gov/cgi-bin/imagedb.pl?show=1;search.result=yes;form=search;search.form=no;action=search;search.sequence=1
# save as enhancerbrowser.datadownload.txt
# give elements with positive label a score of 900,
# give elements with negative label a score of 200.
# print to 5-field bed file (Makefile rule):
vista_enhancer.hg17.txt: enhancerbrowser.datadownload.txt
	grep ">" $< \
	| sed -e 's/>//' \
	| tr :- ' ' \
	| sed -e 's/positive/900/' \
	| sed -e 's/negative/200/' \
	| awk '{print $$1"\t"$$2"\t"$$3"\telement_"$$6"\t"$$8}' \
	> $@; \
	hgLoadBed hg17 vistaEnhancers vista_enhancer.hg17.txt;
# loaded 446 elements of length 5

#########################################################################
# EPONINE-TSS (TRANSCRIPTION START SITE) PREDICTION
# (DONE, 2007-03-08, hartera)
# The Eponine software is version 2 and has not changed in several years
# (contact: Thomas Down at Sanger, td2@sanger.ac.uk). The version downloaded
# for hg16 should be the same as the current version but download again just
# to check. The application includes the TSS model file: eponine-tss2.xml
ssh kkstore02
# Eponine runs fine on a 2.5Mb contig, but barfs on a much larger contig;
# chop up sequence at gaps into ~2.5Mb chunks for cluster run.
mkdir /san/sanvol1/scratch/hg17/chunks
cd /cluster/data/hg17
foreach f (?{,?}/NT_*/NT_??????.fa)
    set ctg = $f:t:r
    /cluster/bin/x86_64/faSplit -minGapSize=10 \
      -lift=/san/sanvol1/scratch/hg17/chunks/${ctg}.lft \
      gap $f 2500000 /san/sanvol1/scratch/hg17/chunks/${ctg}.chunk
end
# seems to ignore the chunk part of the file name
mkdir /cluster/data/hg17/bed/eponine
cd /cluster/data/hg17/bed/eponine
wget --timestamping \
    http://www.sanger.ac.uk/Software/analysis/eponine/eponine-scan.jar
# file has the same date and same size as the one downloaded for hg16
# the script requires all of the path settings found in my .tcshrc file.
# Using only: set path = (/usr/java/jre1.5.0_06/bin $path)
# as in the doEpo file for hg16 does not work.
cat << '_EOF_' > doEpo
#!/bin/csh -ef
set path = (/usr/java/jre1.5.0_06/bin /bin /usr/bin /usr/X11R6/bin \
/usr/local/bin . /cluster/home/hartera/bin/x86_64 \
/cluster/bin/x86_64 /projects/compbio/bin/x86_64 \
/projects/compbio/bin /projects/compbio/bin/x86_64-linux \
/cluster/bin/scripts)
java -jar ./eponine-scan.jar -threshold 0.999 -seq $1 > $2
'_EOF_'
# << emacs
chmod a+x doEpo
cp /dev/null jobList
foreach f (/san/sanvol1/scratch/hg17/chunks/NT*.fa)
    echo "./doEpo {check in line+ $f} {check out exists out/$f:t:r.gff}" \
      >> jobList
end
mkdir out
ssh pk
cd /cluster/data/hg17/bed/eponine
/parasol/bin/para create jobList
/parasol/bin/para try, check, push, check etc.....
/parasol/bin/para time
# Completed: 1415 of 1415 jobs
# CPU time in finished jobs: 104501s 1741.68m 29.03h 1.21d 0.003 y
# IO & Wait Time: 6594s 109.91m 1.83h 0.08d 0.000 y
# Average job time: 79s 1.31m 0.02h 0.00d
# Longest running job: 0s 0.00m 0.00h 0.00d
# Longest finished job: 127s 2.12m 0.04h 0.00d
# Submission to last job: 488s 8.13m 0.14h 0.01d
# lift chunks -> contigs
mkdir contigs/
foreach l (/san/sanvol1/scratch/hg17/chunks/*.lft)
    set ctg = $l:t:r
    liftUp contigs/$ctg.gff $l warn out/${ctg}*.gff
end
# lift contigs -> chrom
liftUp eponine.gff /cluster/data/hg17/jkStuff/liftAll.lft \
    warn contigs/NT_*.gff
# Translate to bed 4 + float-score -- it would be a shame to lose
# those scores in genePred or bed 5 (int score)
awk 'BEGIN {i=0;} \
  {printf "%s\t%d\t%d\t%s.%d\t%s\t%s\n", $1, $4-1, $5, $1, i, $6, $7; \
  i = i + 1;}' \
    eponine.gff > eponine.bed
# load up
ssh hgwdev
cd /cluster/data/hg17/bed/eponine
sed -e 's/bed6FloatScore/eponine/g' \
    $HOME/kent/src/hg/lib/bed6FloatScore.sql > eponine.sql
hgLoadBed hg17 eponine eponine.bed -tab -sqlTable=eponine.sql
# Loaded 61013 elements of size 6
# trackDb.ra entry and eponine.html already exist in trackDb directory.

###########################################################################
# ACEScan Track (DONE 2007-03-15 Andy)
ssh hgwdev
cd /cluster/data/hg17/bed
mkdir acescan
cd acescan/
cp ~/acescan.gff .
tail +2 acescan.gff > acescan.nh.gff
ldHgGene -out=gp hg17 acescan acescan.nh.gff
rm *.gff
ldHgGene -predTab hg17 acescan acescan.hg17.gp

###########################################################################
# augustusHints track (DONE 2007-4-5 Mario)
mkdir -p /cluster/data/hg17/bed/augustus/usingHints/predictions/Trefseq.hmRNA.hsEST.R.Xp.final
cd /cluster/data/hg17/bed/augustus/usingHints/predictions/Trefseq.hmRNA.hsEST.R.Xp.final
wget http://augustus.gobics.de/predictions/hg17/usingEvidence/augustus.hg17.Trefseq.hmRNA.hsEST.R.Xp.gff
wget http://augustus.gobics.de/predictions/hg17/usingEvidence/augustus.hg17.Trefseq.hmRNA.hsEST.R.Xp.pep.aa
ldHgGene -bin hg17 augustusHints augustus.hg17.Trefseq.hmRNA.hsEST.R.Xp.gff
hgPepPred hg17 generic augustusHintsPep augustus.hg17.Trefseq.hmRNA.hsEST.R.Xp.pep.aa

###########################################################################
# augustus de novo track (DONE 2007-4-5 Mario)
mkdir -p /cluster/data/hg17/bed/augustus/usingHints/predictions/Xp.RA.it
cd /cluster/data/hg17/bed/augustus/usingHints/predictions/Xp.RA.it
wget http://augustus.gobics.de/predictions/hg17/deNovo/augustus.hg17.Xp.RA.it.gff
wget http://augustus.gobics.de/predictions/hg17/deNovo/augustus.hg17.Xp.RA.it.pep.aa
ldHgGene -bin hg17 augustusXRA augustus.hg17.Xp.RA.it.gff
hgPepPred hg17 generic augustusXRAPep augustus.hg17.Xp.RA.it.pep.aa

###########################################################################
# SwitchDB TSS Track (DONE 2007-04-12 Andy)
ssh hgwdev
mkdir /cluster/data/hg17/bed/switchDbTss
cd /cluster/data/hg17/bed/switchDbTss
# (obtained from Nathan Trinklein)
cp ~/all_tss_switchdb_psgene.gz .
gunzip all_tss_switchdb_psgene.gz
cat << "EOF" > reformat.awk
BEGIN{FS="\t"}
{
  if (NR > 1) {
    if ($9 !~ "^PSEUDO.*") {
      pseudo = "none";
    } else {
      pseudo = $9;
    }
    printf("%s\t%d\t%d\t%s\t1000\t%s\t%s\t%s\t%s\t%s\t%s\n", $2, $8, $8+1, $6, $5, $7, $1, $3, $4, pseudo);
  }
}
EOF
awk -f reformat.awk all_tss_switchdb_psgene > switchDbTss.bed
ln -s ~/kent/src/hg/lib/switchDbTss.sql
hgLoadBed -sqlTable=switchDbTss.sql hg17 switchDbTss switchDbTss.bed

############################################################################
# enable ORFeome track build. (markd 2007-05-02)
cd ~/kent/src/hg/makeDb/genbank
cvs update -d etc
# edit etc/genbank.conf to add
hg17.orfeomeTables.hgwdev = yes
hg17.orfeomeTables.hgwbeta = yes
# will need to enable for rr later. In the future, this can just be enabled
# as part of the normal genbank build. Change above to:
hg17.orfeomeTables.default = yes

###########################################################################
# Transcriptome Phase 3 tracks (Andy 2007-06-10)
ssh hgwdev
bash
cd /san/sanVol1/scratch/andy
mkdir transcriptome
cd transcriptome/
cp /var/ftp/encode/Affy_transcriptome_phase3.tar .
tar xfv Affy_transcriptome_phase3.tar
find . -name '*.bz2' -exec bunzip2 '{}' \;
cat > processWig.sh << "EOF"
#!/bin/bash
theDir=`dirname $1`;
theFile=`basename $1`;
table=affyTxnPhase3${theFile%.sig.wig};
tmp=/scratch/tmp/trans3rdPhase.$$
mkdir $tmp
cp $1 $tmp
pushd $tmp
head -n1 $theFile > $table.sig.track.txt
tail +2 $theFile > tmp; mv tmp $theFile
wigEncode $theFile $table.wig $table.wib
popd
cp $tmp/${table}.* $theDir
rm -rf $tmp
EOF
chmod +x processWig.sh
cat > gsub << "EOF"
#LOOP
./processWig.sh {check in line+ $(path1)} {check out exists $(dir1)/$(root1).track.txt}
#ENDLOOP
EOF
find . \
-name '*.sig.wig' > wig.lst gensub2 wig.lst single gsub spec ssh pk cd /san/sanVol1/scratch/andy/transcriptome para create spec para push exit cd /cluster/data/hg17/bed mkdir transcriptome3rdPhase/{wig,wib,bed} cd transcriptome3rdPhase/wib/ cp /san/sanVol1/scratch/andy/transcriptome/graphs/human_{long,short}_rna/affyTxnPhase3*.wib . pushd /gbdb/hg17/wib ln -s /cluster/data/hg17/bed/transcriptome3rdPhase/wib/* . popd cd ../wig/ cp /san/sanVol1/scratch/andy/transcriptome/graphs/human_{long,short}_rna/affyTxnPhase3*.wig . for f in *; do hgLoadWiggle hg17 ${f%.wig} $f done cd ../bed for f in /san/sanVol1/scratch/andy/transcriptome/transfrags/human_{long,short}_rna/*; do newName=`basename $f`; newName=${newName%.bz2}; bzcat $f | tail +2 > $newName; tab=affyTxnPhase3Frags${newName%.bed}; hgLoadBed hg17 $tab $newName; done ####################################################################### # CLEANUP OF DANRER1 BLASTZ SWAP (DONE, 2007-06-25, hartera) ssh kkstore02 cd /cluster/data/hg17/bed/blastz.danRer1.swap/axtChain rm -r run1 cd ../mafNet.new gzip *.maf # we don't tend to keep the Blastz PSLs anymore and this is an old # zebrafish assembly so remove these. cd ../ rm -r pslChrom # this removed 1.2 G of data. ####################################################################### # CLEANUP OF ACEMBLY_050217 DATA (DONE, 2007-06-25, hartera) ssh kkstore02 cd /cluster/store5/gs.18/build35/bed/acembly_050217 rm GeneCheck.out GeneCheck2 acembly acembly.chk acembly.details \ chrom.out genePred.tab hg16.name hg16Pep.name cd acembly_gene_lists rm test transcripts.names *.bak main_gene.list.IDsort mp.ids mp.sort ptest \ maintest gid.tab gid.tab.sort genesGffs.ids genesGffs.ids.uniq cd ../ # remove fasta files as included in gzipped tar file rm -r acembly.ncbi_35.genes.proteins.fasta cd acembly.ncbi_35.genes.gff gzip *.gff ####################################################################### # CLEANUP OF DANRER2 BLASTZ SWAP (DONE, 2007-06-25, hartera) ssh kkstore02 cd /cluster/store5/gs.18/build35/bed/blastz.danRer2.2004-12-08 # remove old axtChrom directory rm -r axtChrom.orig cd axtChain # chain directories can be recreated from all.chain files so remove rm -r chain chainAR # gzip net files gzip net/*.net # gzip .over files gzip over/*.over # removed ~1.3 G data ############################################################################# # Duke DNaseI HS (2007-06-26 kate) # # Submitted by Terry Furey # in collaboration with Greg Crawford # Resubmitted 9/26/07 from FTP site # Resubmitted 10/25/07 from FTP site ssh kkstore02 cd /cluster/data/hg17/bed # download 19GB archive from Duke site, password protected, # user=ucsc, password=dnase mkdir -p dukeDnase/2007-10-25/lab cd dukeDnase/2007-10-25/lab sftp ucsc@sftp.igsp.duke.edu mget * # dukeDnaseHsCd4.bed # dukeDnaseHsCd4Wiggle.tgz # unpack and load wiggle (signal) data nice tar xvfz dukeDnaseHsCd4Wiggle.tgz # packaged as chr*_dukeDnaseHsCd4Wiggle.out # fixedStep 1 files # create wiggle and load into database cd .. cat lab/chr*.out | nice wigEncode stdin \ dukeDnaseCd4Signal.wig dukeDnaseCd4Signal.wib >&! 
wigencode.log &
# upper limit 25.74, lower limit -0.66
ssh hgwdev
cd /cluster/data/hg17/bed/dukeDnase/2007-10-25
rm -f /gbdb/hg17/wib/dukeDnaseCd4Signal.wib
ln -s /cluster/data/hg17/bed/dukeDnase/2007-10-25/dukeDnaseCd4Signal.wib \
    /gbdb/hg17/wib
nice hgLoadWiggle hg17 dukeDnaseCd4Signal -pathPrefix=/gbdb/hg17/wib \
    dukeDnaseCd4Signal.wig
# load bed file (sites)
ssh hgwdev
cd /cluster/data/hg17/bed/dukeDnase/2007-10-25/
set table = dukeDnaseCd4Sites
sed "s/bed5FloatScore/$table/" ~/kent/src/hg/lib/bed5FloatScore.sql > \
    $table.sql
hgsql hg17 -e "DROP TABLE IF EXISTS $table"
hgsql hg17 < $table.sql
hgLoadBed -sqlTable=$table.sql hg17 $table lab/dukeDnaseHsCd4.bed
# Loaded 95723 elements of size 6
# min value: 0.000103164
# max value: 25.7442
# textHistogram -col=5 lab/dukeDnaseHsCd4.bed -binSize=50
#  300 ******************************** 11789
#  350 ************************************************************ 22253
#  400 ********************************************* 16854
#  450 ********************************* 12333
#  500 ************************* 9179
#  550 ********************* 7870
#  600 ************* 4987
#  650 ********* 3271
#  700 ******** 2789
#  750 ****** 2153
#  800 **** 1303
#  850 ** 567
#  900 * 219
#  950 85
# 1000 71

###########################################################################
# Stanford ChIP-seq (Apr - July 2007, Heather)
# Submitted 2007-03-14 by David Johnson
# 25bp tags (Solexa sequencing of IP fragments)
# genome-wide, but funded by ENCODE, hence the location of the data
ssh hgwdev
cd /cluster/data/encode/stanford
mkdir -p 2007-03-14/lab
cd 2007-03-14/lab
sort NRSF_chipseq_hg17.bed > data.bed
sort NRSF_chipseq_control_hg17.bed > control.bed
fix.pl < data.bed > fix.bed
fix.pl < control.bed > control_fix.bed
hgLoadBed hg17 stanfordNRSFEnriched fix.bed -tab
hgLoadBed hg17 stanfordNRSFControl control_fix.bed -tab

############################################################################
# Stanford ChIP/chip
# Submitted 2007-07-11 by David Johnson (seasquirtdoctor@gmail.com)
# Replaces submission from 2007-03-23
# 12 subtracks
# genome-wide, but funded by ENCODE, hence the location of the data
ssh hgwdev
cd /cluster/data/encode/stanford/2007-07-11/lab
# Dave gave us bed 5, we need bed 4
./shrink.sh
./load.sh

#########################################################################
# REGULATORY POTENTIAL 7X UPDATED (DONE - 2007-08-01 - Hiram)
# download data from "James Taylor"
ssh kkstore02
mkdir /cluster/data/hg17/bed/regPotential7X.update
cd /cluster/data/hg17/bed/regPotential7X.update
## In theory, only chr4, chr8, chr9 and chrY were updated; fetch them
## all and verify against ../regPotential7X
for C in 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 X Y
do
    wget --timestamping \
"http://www.bx.psu.edu/~james/esperr_rp_7way_scores/genome_scores_hg17/chr${C}.scores.truncated.bz2"
    echo "DONE - chr${C}.scores.bz2"
done
# create download gzip files from the bz2 files:
time for F in chr*.scores.truncated.bz2
do
    C=`echo $F | awk -F'.' '{print $1}'`
    echo -n "${C}.regPotential7X.hg17.gz working ... "
    bzcat ${F} | gzip > ${C}.regPotential7X.hg17.gz
    echo "done"
done
time for C in 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 X Y
do
    zcat chr${C}.regPotential7X.hg17.gz
done | wigEncode -noOverlap stdin regPotential7X.wig regPotential7X.wib
# Converted stdin, upper limit 1.00, lower limit -0.00
# real 16m51.215s
# Loading the table on hgwdev
ssh hgwdev
cd /cluster/data/hg17/bed/regPotential7X.update
mkdir /gbdb/hg17/wib/061116
ln -s /cluster/data/hg17/bed/regPotential7X.update/regPotential7X.wib \
    /gbdb/hg17/wib/061116/regPotential7X.wib
# using the tmpDir is faster since it is on local disk and it will
# clean up any temporary .tab file it creates there
time nice -n +19 hgLoadWiggle -tmpDir=/scratch/tmp \
    -pathPrefix=/gbdb/hg17/wib/061116 hg17 regPotential7X regPotential7X.wig
# real 0m40.523s
# How about a histogram of the data.
ssh kolossus
cd /cluster/data/hg17/bed/regPotential7X.update
time nice -n +19 hgWiggle -verbose=2 -doHistogram -hBinSize=0.01 \
    -hBinCount=100 -hMinVal=0.0 -db=hg17 regPotential7X > histogram.data 2>&1
# real 3m3.829s
# 73 % of the data values are zero

# renaming file directory -- kuhn 08-17-2007
cd /gbdb/hg17/wib
mv 061116 regPot061116
hgsql -e 'update regPotential7X set file = "/gbdb/hg17/wib/regPot061116/regPotential7X.wib"' hg17
# Query OK, 2366123 rows affected (31.46 sec)
# Rows matched: 2366123  Changed: 2366123  Warnings: 0

###########################################################################
## Create gc5Base download raw data file (DONE - 2007-08-29 - Hiram)
ssh kkstore02
cd /cluster/data/hg17/bed/gc5Base
hgGcPercent -wigOut -doGaps -file=stdout -win=5 \
    hg17 /cluster/data/hg17/hg17.2bit 2> /dev/null \
    | gzip > hg17.gc5Base.txt.gz
ssh hgwdev
mkdir /usr/local/apache/htdocs/goldenPath/hg17/gc5Base
cd /usr/local/apache/htdocs/goldenPath/hg17/gc5Base
ln -s /cluster/data/hg17/bed/gc5Base/hg17.gc5Base.txt.gz .

############################################################################
# INDEL-BASED CONSERVATION TRACK (DONE, 2007-10-02 - 2007-10-03, hartera)
# Data from Gerton Lunter (gerton.lunter@anat.ox.ac.uk), MRC
# Functional Genetics Unit, University of Oxford, United Kingdom.
# Data is from the paper:
# Lunter G, Ponting CP and Hein J. Genome-wide identification of human
# functional DNA using a neutral indel model. PLoS Comput Biol. 2006
# Jan;2(1):e5.
ssh kkstore02
mkdir -p /cluster/data/hg17/bed/consIndels/data
cd /cluster/data/hg17/bed/consIndels/
# Add a README.indels with the e-mail from Gerton Lunter, copied over
# from the hg18 consIndels build
cp /cluster/data/hg18/bed/consIndels/README.indels .
# get the data
cd data
wget --timestamping \
    http://wwwfgu.anat.ox.ac.uk/~gerton/IPS/IPSs.zip
# 15 Mb zip file in GFF format. This contains data for hg17
# comparing it to mm5 (NCBI Build 33) and
# canFam1 (Broad Institute, July 2004). The chr*.mm5.GFF data is old
# data that can be removed.
unzip IPSs.zip
cd /cluster/data/hg17/bed/consIndels
rm ./data/*mm5.GFF
foreach f (./data/*.GFF)
    set r = $f:r
    echo $r
    grep -v "track" $f > ${r}NoHeader.gff
end
# strip off the end of the name e.g. IGS0001:p=.26
# so that the name displayed is short - IGS0001.1.
# The score field is used to determine colouring and this is
# calculated from the FDR.
ssh kkstore02
cd /cluster/data/hg17/bed/consIndels
perl -pi.bak -e \
's/(IGS[0-9a-z]+\.?[0-9XY]*):p=?> consIndelsHg17Mm5CanFam1.bed
end
# load data
ssh hgwdev
cd /cluster/data/hg17/bed/consIndels
hgLoadBed hg17 consIndelsHg17Mm5CanFam1 consIndelsHg17Mm5CanFam1.bed
# Loaded 593298 elements of size 5
# Get the IDs, posterior probabilities (p) for the segment being neutral,
# and the FDR from the original GFFs for a separate table. Some items
# have p<.001. Can not do Table Browser queries restricting
# p to <, =, or > a specified value unless all values are floats.
# Contacted the data contributor, Gerton Lunter, and he said it would be
# ok to change all p<.001 to p=0.0005
ssh kkstore02
cd /cluster/data/hg17/bed/consIndels/
awk '{if ($1 !~ /random/) print $1;}' /cluster/data/hg17/chrom.sizes \
    | sed -e 's/chr//' | sort -n > chrom.lst
grep -v 'hap' chrom.lst > tmp2
tail +4 tmp2 > tmp3
echo "X\nY\n" >> chrom.lst
rm tmp2 tmp3
# chrom.lst has a list of chroms 1-22, then X and Y
foreach c (`cat chrom.lst`)
    echo $c
    foreach f (./data/chr${c}.GFF)
        echo $f
        awk 'BEGIN {FS="\t"} {OFS="\t"}{if ($9 ~ /IGS/) print $9,$6;}' $f \
          | sed -e 's/:/\t/' \
          | sed -e 's/p=\./0\./' | sed -e 's/p<\.001/0\.0005/' \
          >> consIndelsConf.txt
    end
end
# Add the FDR.
# For this set, there is no false discovery rate (FDR) field but it
# can be related to the score. If the score is 999 then the FDR is 1% (0.01)
# and if the score is 500 then the FDR is 10% (0.10). Score is in column 6.
# there are no GFF files for the haplotype chroms
awk 'BEGIN {FS="\t"} {OFS="\t"} {if ($3 ~ /500/) print $1, $2, "0.10"; else if ($3 ~ /999/) print $1, $2, "0.01";}' consIndelsConf.txt > consIndelsHg17Mm5CanFam1Conf.txt
# Create a table definition for the table of identifier, posterior
# probability and false discovery rate (FDR). Already created for the hg18
# track (see hg18.txt). It is $HOME/kent/src/hg/lib/itemConf.as.
ssh hgwdev
cd /cluster/data/hg17/bed/consIndels
hgLoadSqlTab hg17 consIndelsHg17Mm5CanFam1Conf \
    $HOME/kent/src/hg/lib/itemConf.sql \
    consIndelsHg17Mm5CanFam1Conf.txt
# check that all items are in this table.
hgsql -N -e 'select distinct(name) from consIndelsHg17Mm5CanFam1;' hg17 \
    | sort > consIndels.names.sort
hgsql -N -e 'select distinct(id) from consIndelsHg17Mm5CanFam1Conf;' hg17 \
    | sort > consIndels.idsfromConf.sort
wc -l *.sort
# 593298 consIndels.idsfromConf.sort
# 593298 consIndels.names.sort
comm -12 consIndels.names.sort consIndels.idsfromConf.sort | wc -l
# 593298
# so all element IDs are in both tables.
# cleanup
rm ./data/*.bak *.sort
# add trackDb/human/hg17/trackDb.ra entry and add description that
# was written by the data contributor. Add code to hgc.c to display
# the posterior probability and the FDR on the details page for
# track elements. Gerton Lunter provided a description for the data
# on 2007-09-12.
cd ~/kent/src/hg/makeDb/trackDb/human/hg17
cp ../hg18/consIndelsHg18Mm8CanFam2.html consIndelsHg17Mm5CanFam1.html
# check this is correct and add trackDb.ra track entry and search.
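# FYI: the details-page display is essentially this lookup; a sketch of the
# equivalent query (assumes the itemConf fields are id, probability and fdr
# as defined in itemConf.as):
hgsql hg17 -e 'select c.name, f.probability, f.fdr from consIndelsHg17Mm5CanFam1 c, consIndelsHg17Mm5CanFam1Conf f where c.name = f.id limit 5'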
##############################################################
# NIMH Bipolar Genome Graphs built-in (DONE 2007-10-04 Galt)
#
# See hg18.txt for details.
#############################################################

#############################################################
# CCC Genome Graphs (DONE 2007-Sept Andy)
#
# See hg18 make doc.

###############################################################
# Affy Transcriptome Phase 3 chrY fix (DONE 2007-12-10, Andy)
ssh kkstore05
cd /cluster/store12/hg17/bed/affyTxnPhase3/raw
zcat sRNA.affyTxnPhase3HeLaBottomStrand.wig.gz | grep -n chrY
#256994657:variableStep chrom=chrY span=1
zcat sRNA.affyTxnPhase3HeLaBottomStrand.wig.gz | head -n256994656 | gzip -c > tmp.wig.gz
mv tmp.wig.gz sRNA.affyTxnPhase3HeLaBottomStrand.wig.gz
zcat sRNA.affyTxnPhase3HeLaTopStrand.wig.gz | grep -n chrY
#256994657:variableStep chrom=chrY span=1
zcat sRNA.affyTxnPhase3HeLaTopStrand.wig.gz | head -n256994656 | gzip -c > tmp.wig.gz
mv tmp.wig.gz sRNA.affyTxnPhase3HeLaTopStrand.wig.gz
ssh kolossus
cd /cluster/store12/hg17/bed/affyTxnPhase3/raw
wigEncode sRNA.affyTxnPhase3HeLaBottomStrand.wig.gz affyTxnPhase3HeLaBottomStrand.{wig,wib}
wigEncode sRNA.affyTxnPhase3HeLaTopStrand.wig.gz affyTxnPhase3HeLaTopStrand.{wig,wib}
mv *.wig /cluster/data/hg17/bed/affyTxnPhase3/wig/
mv *.wib /cluster/data/hg17/bed/affyTxnPhase3/wib/
ssh hgwdev
cd /cluster/data/hg17/bed/affyTxnPhase3/wig
hgLoadWiggle hg17 affyTxnPhase3HeLaTopStrand{,.wig}
hgLoadWiggle hg17 affyTxnPhase3HeLaBottomStrand{,.wig}

###########################################################################
# Reload CCDS (2007-12-12 markd)
# import ccds database as described in ccds.txt
set db=hg17
# create and load ccdsGene and ccdsInfo tables from imported database
/cluster/data/genbank/bin/x86_64/ccdsMkTables -loadDb ccds ${db} ccdsInfo ccdsGene
# ccdsKgMap
/cluster/data/genbank/bin/x86_64/mkCcdsGeneMap -db=${db} -loadDb ccdsGene knownGene ccdsKgMap
# build initial version of ccdsMgcMap table, updated by nightly genbank update
/cluster/data/genbank/bin/x86_64/mkCcdsGeneMap -db=${db} -loadDb ccdsGene mgcGenes ccdsMgcMap
checkTableCoords ${db} -verbose=2 ccdsGene
# update all.joiner to include ${db} in ccdsDb
joinerCheck -database=${db} -identifier=ccdsGeneId ~/compbio/genbank/kent/src/hg/makeDb/schema/all.joiner
# request push of ccdsGene ccdsInfo ccdsKgMap ccdsMgcMap
# << emacs

############################################################################
# ADD LINKS TO GENETESTS ON hgGene DETAILS PAGE (DONE 12/12/07 Fan)
# See hg18.txt for details.

############################################################################
# Reload CCDS (2008-02-01 markd)
# import ccds database as described in ccds.txt
set db=hg17
# create and load ccdsGene and ccdsInfo tables from imported database
/cluster/data/genbank/bin/x86_64/ccdsMkTables -loadDb ccds ${db} ccdsInfo ccdsGene
# ccdsKgMap
/cluster/data/genbank/bin/x86_64/mkCcdsGeneMap -db=${db} -loadDb ccdsGene knownGene ccdsKgMap
checkTableCoords ${db} -verbose=2 ccdsGene
# update all.joiner to include ${db} in ccdsDb
joinerCheck -database=${db} -identifier=ccdsGeneId ~/compbio/genbank/kent/src/hg/makeDb/schema/all.joiner
# request push of ccdsGene ccdsInfo ccdsKgMap
# << emacs

############################################################################
# CREATE huge TABLE FOR HuGE LINK (DONE 3/6/08, Fan)
# See the HuGE section in hg18.txt for details.
############################################################################ ############################################################################ # DGV V10 (DATABASE OF GENOMIC VARIANTS) (DONE 11/10/10 angie - color change 2/22/11 #2917) # DGV V9 done 3/26/10 # DGV V8 done 8/12/09 (changed color of inverted 11/05/09 kuhn) # DGV V7 done 3/11/09 # DGV V6 thin regions dropped 2/23/09 # DGV V6 with useless thin regions done 11/12/08 # DGV V5 done 8/11/08 # DGV V4 done 5/9/08 # 11-04-2009 color change from brown to magenta: # old color # 6553700 Inversion (100,0,100) # new: # 13107400 Inversion (200,0,200) # 2/22/11 color change (Bug #2917): swap blue and red; green -> brown # Old DGV format is obsolete; see the following section. ####################################################################### # DGV BETA (DATABASE OF GENOMIC VARIANTS) (DONE 2/11/13 angie) # DGV has changed their data format, and for the time being the data are # served by a beta web site, http://dgvbeta.tcag.ca/ ; in time that will # replace their current site. set today = `date +%y%m%d` mkdir -p /hive/data/genomes/hg17/bed/dgv/$today cd /hive/data/genomes/hg17/bed/dgv/$today wget http://dgvbeta.tcag.ca/dgv/docs/NCBI35_hg17_2012-11-23.txt head -1 NCBI35_hg17*.txt #variantaccession chr start end varianttype variantsubtype reference pubmedid method platform mergeid mergedorsample frequency samplesize cohortdescription genes # It's more complicated than Gain/Loss/Complex or Inversion now (+ stray commas): cut -f 5,6 NCBI35_hg17*.txt | sort | uniq -c | head -100 # 16978 CNV # 1193 CNV "" # 27902 CNV CNV # 2179 CNV Complex # 319 CNV Deletion # 1203 CNV Duplication # 39279 CNV Gain # 1715 CNV Gain+Loss # 23850 CNV Insertion # 105187 CNV Loss # 78 OTHER # 4 OTHER "" # 1396 OTHER Inversion # 1 varianttype variantsubtype # shuffle fields into bed9+ w/itemRgb set purple = "200,0,200" set red = "200,0,0" set blue = "0,0,200" set brown = "139,69,19" tail -n +2 NCBI35_hg17*.txt \ | perl -wpe 'chomp; \ s/""//; \ ($id, $chr, $start, $end, $varType, $varSubType, $ref, $pmid, $method, $platform, \ undef, undef, undef, $sampleSize, $sampleDesc, $genes) = split("\t"); \ $start-- unless ($start == 0); \ $landmark = $genes; \ $landmark =~ s/,/, /g; \ $varSubType =~ s/^,//; $varSubType =~ s/,$//; \ $varTypeOut = "$varType ($varSubType)"; \ $ref =~ s/_/ /g; \ $method =~ s/_/ /g; $method =~ s/,/, /g; \ $sample = $sampleDesc; \ $sample .= " (sample size: $sampleSize)" if ($sampleSize); \ $method .= " ($platform)" if ($platform && $platform ne "Not Provided"); \ $rgb = "0,0,0"; \ if ($varType eq "CNV") { \ if ($varSubType eq "Gain" || $varSubType eq "Insertion" || $varSubType eq "Duplication") {\ $rgb = "'$blue'"; \ } elsif ($varSubType eq "Loss" ||$varSubType eq "Deletion") { \ $rgb = "'$red'"; \ } elsif ($varSubType eq "") { \ $varTypeOut = $varType; \ } else { \ $rgb = "'$brown'"; \ } \ } elsif ($varType eq "OTHER") { \ if ($varSubType eq "Inversion") { \ $rgb = "'$purple'"; \ } elsif ($varSubType eq "Tandem Duplication") { \ $rgb = "'$blue'"; \ } else { \ $varTypeOut = $varType; \ } \ } \ $_ = join("\t", "chr$chr", $start, $end, $id, 0, "+", \ $start, $start, $rgb, $landmark, $varTypeOut, \ $ref, $pmid, $method, $sample) . "\n";' \ > dgv.bed hgLoadBed hg17 dgv dgv.bed \ -sqlTable=$HOME/kent/src/hg/lib/dgv.sql -renameSqlTable -tab #Read 221283 elements of size 15 from dgv.bed # 2/11/13: checkTableCoords caught some coords past the end of a few # chromosomes. DGV is looking into it. 
Truncate: hgsql hg17 -e 'update dgv set chromEnd = 154824264 where chrom = "chrX" && chromEnd > 154824264' hgsql hg17 -e 'update dgv set chromEnd = 199505740 where chrom = "chr3" && chromEnd > 199505740' hgsql hg17 -e 'update dgv set chromEnd = 170975699 where chrom = "chr6" && chromEnd > 170975699' hgsql hg17 -e 'update dgv set chromEnd = 158628139 where chrom = "chr7" && chromEnd > 158628139' hgsql hg17 -e 'update dgv set chromEnd = 138429268 where chrom = "chr9" && chromEnd > 138429268' checkTableCoords -verbose=2 hg17 dgv # No output, good. ############################################################################ # KIDD/EICHLER DISCORDANT CLONE ENDS (DONE 6/10/08 angie) # 8/11/08: Added kiddEichlerToNcbi (ID xref table). ssh kkstore02 mkdir /cluster/data/hg17/bed/kiddEichlerDiscordant cd /cluster/data/hg17/bed/kiddEichlerDiscordant wget --user=uuuu --password=ppppppp \ http://eichlerlab.gs.washington.edu/kiddj/downloads/fosmids.hg17.tgz tar xvzf fosmids.hg17.tgz cd bd35 # 8 clone-end linkedFeaturesSeries tracks and one bed custom track. # bed has illegal coords (maybe for unplaced ends?). # Load the tracks (translate bacEndPairs format to bed12): ssh hgwdev cd /cluster/data/hg17/bed/kiddEichlerDiscordant/bd35 foreach f (abc*.txt) set track = `echo $f:r \ | perl -wpe 's/^(G|abc)(\d+)discordant/kiddEichlerDisc\u$1$2/ || die;'` if ($status != 0) break perl -wpe 'next if s/^#.*\n$//; \ ($c, $s, $e, $n, $sc, $st, undef, $bs, $bSt, $bSz)=split; \ @bSts = split(",", $bSt); @bSzs = split(",", $bSz); \ $s--; \ if ($n =~ /transchr/) { \ $bs = 1; \ $#bSts = 0; $#bSzs = 0; \ $bSts[0]--; $e--; \ $bSts[0] -= $s; \ } elsif ($n =~ /OEA/) { \ $bSts[0]--; \ die "bSts[0] $bSts[0] != s $s\n" if ($bSts[0] != $s); \ $bE = $bSts[0] + $bSzs[0]; \ die "bE $bE != e $e\n" if ($bE != $e); \ $bSts[0] -= $s; \ } elsif ($bs == 2) { \ $bSts[0]--; $bSts[1]--; \ if ($bSts[0] > $bSts[1]) { \ # warn "Swapping $n ($bSts[0] > $bSts[1])\n"; \ $tmp = $bSts[0]; $bSts[0] = $bSts[1]; $bSts[1] = $tmp; \ $tmp = $bSzs[0]; $bSzs[0] = $bSzs[1]; $bSzs[1] = $tmp; \ } \ if ($bSts[0] != $s) { \ # warn "Tweaking $n start from $s to $bSts[0]\n"; \ $s = $bSts[0]; \ } \ $bE0 = $bSts[0] + $bSzs[0]; \ $bE1 = $bSts[1] + $bSzs[1]; \ $bE = $bE0 > $bE1 ? $bE0 : $bE1; \ if ($bE != $e) { \ # warn "Tweaking $n end from $e to $bE\n"; \ $e = $bE; \ } \ $bSts[0] -= $s; $bSts[1] -= $s; \ } else { die "#blks is $bs for $n\n"; } \ $bSt = join(",", @bSts) . ","; $bSz = join(",", @bSzs) . ","; \ $rgb = ($n =~ /deletion/) ? "224,0,0" : \ ($n =~ /insertion/) ? "0,0,224" : \ ($n =~ /inversion/) ? "0,224,0" : \ ($n =~ /OEA/) ? "240,160,64" : "0,0,0"; \ $_ = join("\t", $c, $s, $e, $n, $sc, $st, $s, $e, $rgb, \ $bs, $bSz, $bSt) . 
"\n";' $f \ | hgLoadBed -tab hg17 $track stdin end perl -pe 'next if s/^track .*\n$//; \ ($c, $s, $e, $n, $sc, $st, $tS, $tE, $r, $bs, $bSz, $bSt) = split; \ @bSts = split(",", $bSt); @bSzs = split(",", $bSz); \ if ($n =~ /transchr/) { \ $bs = 1; \ $#bSts = 0; $#bSzs = 0; \ } elsif ($n =~ /OEA/) { \ $s--; # weird that this is required only for OEA here \ die "$n: bSts[0] $bSts[0] != 0\n" if ($bSts[0] != 0); \ $bE = $s + $bSts[0] + $bSzs[0]; \ die "$n: bE $bE != e $e\n" if ($bE != $e); \ } elsif ($bs == 2) { \ $bSts[0] += $s; $bSts[1] += $s; \ if ($bSts[0] > $bSts[1]) { \ # warn "Swapping $n ($bSts[0] > $bSts[1])\n"; \ $tmp = $bSts[0]; $bSts[0] = $bSts[1]; $bSts[1] = $tmp; \ $tmp = $bSzs[0]; $bSzs[0] = $bSzs[1]; $bSzs[1] = $tmp; \ } \ if ($bSts[0] != $s) { \ # warn "Tweaking $n start from $s to $bSts[0]\n"; \ $s = $bSts[0]; \ } \ $bE0 = $bSts[0] + $bSzs[0]; \ $bE1 = $bSts[1] + $bSzs[1]; \ $bE = $bE0 > $bE1 ? $bE0 : $bE1; \ if ($bE != $e) { \ # warn "Tweaking $n end from $e to $bE\n"; \ $e = $bE; \ } \ $bSts[0] -= $s; $bSts[1] -= $s; \ } else { die "#blks is $bs\n"; } \ $bSt = join(",", @bSts) . ","; $bSz = join(",", @bSzs) . ","; \ $tS = $s; $tE = $e; \ $rgb = ($n =~ /deletion/) ? "224,0,0" : \ ($n =~ /insertion/) ? "0,0,224" : \ ($n =~ /inversion/) ? "0,224,0" : \ ($n =~ /OEA/) ? "240,160,64" : "0,0,0"; \ $_ = join("\t", $c, $s, $e, $n, $sc, $st, $tS, $tE, $rgb, \ $bs, $bSz, $bSt) . "\n";' G248discordant.txt \ | hgLoadBed -tab hg17 kiddEichlerDiscG248 \ stdin # 8/11/08: get clone ID -> NCBI acc mapping. ssh kkstore02 mkdir /cluster/data/hg17/bed/kiddEichlerDiscordant/cloneIds cd /cluster/data/hg17/bed/kiddEichlerDiscordant/cloneIds # Saved off emailed file from Jeff Kidd to clones_used_3nov.txt.accessions; # get trace archive trace names for end reads: foreach n (7 9 10 11 12 13 14) wget http://hgsv.washington.edu/general/download/clone_mapping/ABC$n/ABC$n.conversion.gz end # ABC8 has _a and _b files: wget http://hgsv.washington.edu/general/download/clone_mapping/ABC8/ABC8_a.conversion.gz wget http://hgsv.washington.edu/general/download/clone_mapping/ABC8/ABC8_b.conversion.gz # That file is not available for G248. gunzip *.gz # Combine the relevant data from the .conversion files; keep only those # IDs that are used in the tracks. cut -f 4 ../bd35/*discordant.txt \ | egrep -v '^(#chrom|track|name)' \ | sed -e 's/,.*//' \ | sort -u > discIds.txt perl -wpe 's/^OurClone.*\n// || s/^\d+_(HUMAN|\d+_).*\n$// || \ s/^(\d+_)?(ABC|G)(\d+)(_\d\d?)?(_\d\d?)?_0*(\d+?_[A-Z]\d\d?)\.(F|FORWARD|R|REVERSE)\.\d+\t(\w+)$/$2$3_$6\t$7\t$8/ || \ warn "Parse line $.:\n$_";' \ *.conversion \ | sort > allEnds.tab grep -wFf discIds.txt allEnds.tab > discEnds.txt wc -l discIds.txt allEnds.tab discEnds.txt # 223051 discIds.txt # 17498527 allEnds.tab # 573974 discEnds.txt # discEnds.txt has 2 lines (forward & reverse) for most of its ids... # ideally we would see 2*(223051) lines in discEnds.txt. 
# Get a list of which discordant clone IDs don't have ends in *.conv*:
cut -f 1 allEnds.tab | uniq > all.tmp
comm -23 discIds.txt all.tmp > discNotInConv.txt
wc -l discNotInConv.txt
#16318 discNotInConv.txt
cat > combine.pl <<'_EOF_'
#!/usr/bin/perl -w
use strict;
my ($cloneFile, $endsFile) = @ARGV;
open(CLONES, $cloneFile) || die "Can't open $cloneFile: $!\n";
my %idInfo;
while (<CLONES>) {
  (s/^(\d+_)?(ABC|G)(\d+)(_\d\d?)?(_\d\d?)?_0*(\d+?_[A-Z]\d\d?)\t(\w+)$/$2$3_$6\t$7/ &&
   m/^(\w+)\t(\w+)/) ||
  m/^(G248\w+)\t(\w+)$/ || die "Parse line $.:$_";
  my ($id, $acc) = ($1, $2);
  $idInfo{$id}->[0] = $acc;
}
close(CLONES);
open(ENDS, $endsFile) || die "Can't open $endsFile: $!\n";
while (<ENDS>) {
  chomp;
  my ($id, $dir, $traceName) = split("\t");
  if ($dir =~ /^F/) {
    $idInfo{$id}->[1] = $traceName;
  } elsif ($dir =~ /^R/) {
    $idInfo{$id}->[2] = $traceName;
  } else {
    die "What is this \$dir: $dir ?\n";
  }
}
close(ENDS);
foreach my $id (sort keys %idInfo) {
  my $infoRef = $idInfo{$id};
  $infoRef->[0] = '' if (! defined $infoRef->[0]);
  $infoRef->[1] = 0 if (! defined $infoRef->[1]);
  $infoRef->[2] = 0 if (! defined $infoRef->[2]);
  print join("\t", $id, @{$infoRef}) . "\n";
}
'_EOF_'
# << emacs
chmod a+x combine.pl
combine.pl clones_used_3nov.txt.accessions discEnds.txt \
    | sort > kiddEichlerToNcbi.txt
# Load table:
ssh hgwdev
cd /cluster/data/hg17/bed/kiddEichlerDiscordant/cloneIds
hgLoadSqlTab hg17 kiddEichlerToNcbi \
    $HOME/kent/src/hg/lib/kiddEichlerToNcbi.sql kiddEichlerToNcbi.txt
# Add to makeDb/schema/all.joiner, then check:
runJoiner.csh hg17 kiddEichlerToNcbi $HOME/kent/src/hg/makeDb/schema

############################################################################
# TRANSMAP vertebrate.2008-05-20 build  (2008-05-24 markd)

Vertebrate-wide transMap alignments were built. Tracks are created and
loaded by a single Makefile. This is available from:

   svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2008-05-20

see doc/builds.txt for specific details.
############################################################################

############################################################################
# KIDD/EICHLER VALIDATED SITES (DONE 6/11/08 angie)
ssh hgwdev
mkdir /cluster/data/hg17/bed/kiddEichlerValid
cd /cluster/data/hg17/bed/kiddEichlerValid
wget http://hgsv.washington.edu/general/download/validated_sites/Kidd_2008_sample_level_valided_sites.xls
# Open in Excel, save as Kidd_2008_sample_level_valided_sites.txt,
# move first 9 lines to Kidd_2008_sample_level_valided_sites.header.
# Split into one file per individual:
foreach id (Abc7 Abc8 Abc9 Abc10 Abc11 Abc12 Abc13 Abc14 G248)
  set ID = `echo $id | tr 'a-z' 'A-Z'`
  grep ${ID}_ Kidd_2008_sample_level_valided_sites.txt \
  | perl -wpe 'chomp; s/\r//; ($c, $s, $e, $n, $t) = split; \
       $rgb = ($n =~ /deletion/) ? "224,0,0" : \
              ($n =~ /insertion/) ? "0,0,224" : \
              ($n =~ /inversion/) ? "0,224,0" : "0,0,0"; \
       $t =~ s/:/,/g; \
       $n =~ s/^'$ID'_//; $n = "$n,$t"; \
       $_ = join("\t", $c, $s, $e, $n, "0", "+", $s, $e, $rgb) . \
"\n";' \
  | hgLoadBed -tab hg17 kiddEichlerValid$id stdin
end

################################################
# SPLIT EXPRESSION & REGULATION GROUPS
# (2008-09-09 kate)
echo "insert into grp (name, label, priority, defaultIsClosed) values ('expression', 'Expression', 4.5, 1)" | hgsql hg17
echo "update grp set label='Regulation' where name='regulation'" | hgsql hg17

################################################
# AUTOMATE UPSTREAM FILE CREATION (2008-10-15 markd)
update genbank.conf:
hg17.upstreamGeneTbl = refGene
hg17.upstreamMaf = multiz17way /hive/data/genomes/hg17/bed/multiz17way/species.lst

#############################################################################
# MAKE PCR TARGET FOR UCSC GENES (DONE 11/4/08 angie)
ssh hgwdev
mkdir /cluster/data/hg17/bed/mrnaPcr
cd /cluster/data/hg17/bed/mrnaPcr
# First, get consistent FA and PSL for UCSC Genes.
genePredToBed /cluster/data/hg17/bed/kgHg17F/try3/kg3Try3.gp > ucscGenes.bed
hgsql hg17 -NBe 'select kgId,geneSymbol from kgXref' \
    | perl -wpe 's/^(\S+)\t(\S+)/$1\t${1}__$2/ || die;' \
    > idSub.txt
subColumn 4 ucscGenes.bed idSub.txt ucscGenesIdSubbed.bed
sequenceForBed -keepName -db=hg17 -bedIn=ucscGenesIdSubbed.bed \
    -fastaOut=stdout \
    | faToTwoBit -ignoreDups stdin kgTargetSeq.2bit
cut -f 1-10 /cluster/data/hg17/bed/kgHg17F/try3/kg3Try3.gp \
    | genePredToFakePsl hg17 stdin kgTargetAli.psl /dev/null
# Load up the UCSC Genes target PSL table and put 2bit in /gbdb:
cd /cluster/data/hg17/bed/mrnaPcr
hgLoadPsl hg17 kgTargetAli.psl
mkdir /gbdb/hg17/targetDb
ln -s /cluster/data/hg17/bed/mrnaPcr/kgTargetSeq.2bit /gbdb/hg17/targetDb/
# Ask cluster-admin to start an untranslated, -stepSize=5 gfServer on
# /gbdb/hg17/targetDb/kgTargetSeq.2bit .
ssh hgwdev
# Add records to hgcentraltest blatServers and targetDb:
hgsql hgcentraltest -e \
    'INSERT into blatServers values ("hg17Kg", "blat13", 17797, 0, 1);'
hgsql hgcentraltest -e \
    'INSERT into targetDb values("hg17Kg", "UCSC Genes", \
    "hg17", "kgTargetAli", "", "", \
    "/gbdb/hg17/targetDb/kgTargetSeq.2bit", 1, now(), "");'

#############################################################################
# fox2ClipSeq from Gene Yeo (DONE - 2009-01-08 - Hiram)
mkdir /hive/data/genomes/hg17/bed/fox2ClipSeq
cd /hive/data/genomes/hg17/bed/fox2ClipSeq
# fetch data
wget --timestamping \
    'http://www.snl.salk.edu/~geneyeo/stuff/FOX2.rmsk.BED.gz' \
    -O FOX2.rmsk.BED.gz
# remove track line and sort
zcat FOX2.rmsk.BED.gz | grep -v "^track" | sort -k1,1 -k2,2n \
    | gzip > sorted.bed.gz
# separate strand data, and turn the positive into blue
zcat sorted.bed.gz | awk '$6 == "+"' | sed -e "s/255,0,0/0,0,255/" \
    | gzip > forwardStrand.bed.gz
zcat sorted.bed.gz | awk '$6 == "-"' | gzip > reverseStrand.bed.gz
# turn into wiggle density plot
zcat forwardStrand.bed.gz | bedItemOverlapCount hg17 stdin \
    | wigEncode stdin fox2ClipSeqDensityForwardStrand.wig \
        fox2ClipSeqDensityForwardStrand.wib
# Converted stdin, upper limit 2401.00, lower limit 1.00
zcat reverseStrand.bed.gz | bedItemOverlapCount hg17 stdin \
    | wigEncode stdin fox2ClipSeqDensityReverseStrand.wig \
        fox2ClipSeqDensityReverseStrand.wib
# Converted stdin, upper limit 1406.00, lower limit 1.00
# and load tables
zcat forwardStrand.bed.gz reverseStrand.bed.gz \
    | hgLoadBed hg17 fox2ClipSeq stdin
# Loaded 4418298 elements of size 9
ln -s `pwd`/*.wib /gbdb/hg17/wib
hgLoadWiggle hg17 fox2ClipSeqDensityForwardStrand \
    fox2ClipSeqDensityForwardStrand.wig
hgLoadWiggle hg17 fox2ClipSeqDensityReverseStrand \
    fox2ClipSeqDensityReverseStrand.wig
# add composite track
# definitions to makeDb/trackDb/human/trackDb.ra

#############################################################################
# LIFTOVER TO Hg19 (DONE - 2009-04-24 - Hiram)
mkdir /hive/data/genomes/hg17/bed/blat.hg19.2009-04-24
cd /hive/data/genomes/hg17/bed/blat.hg19.2009-04-24
# -debug run to create run dir, preview scripts...
doSameSpeciesLiftOver.pl -debug hg17 hg19
# Real run:
time nice -n +19 doSameSpeciesLiftOver.pl -verbose=2 \
    -bigClusterHub=pk -dbHost=hgwdev -workhorse=hgwdev \
    hg17 hg19 > do.log 2>&1 &
# real 84m19.022s
#############################################################################

############################################################################
# TRANSMAP vertebrate.2009-09-13 build  (2009-09-20 markd)

Vertebrate-wide transMap alignments were built. Tracks are created and
loaded by a single Makefile. This is available from:

   svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2009-09-13

see doc/builds.txt for specific details.
############################################################################

############################################################################
# UPDATE KEGG TABLES (DONE, Fan, 6/18/10)
mkdir -p /hive/data/genomes/hg17/bed/pathways/kegg
cd /hive/data/genomes/hg17/bed/pathways/kegg
wget --timestamping ftp://ftp.genome.jp/pub/kegg/pathway/map_title.tab
cat map_title.tab | sed -e 's/\t/\thsa\t/' > j.tmp
cut -f 2 j.tmp >j.hsa
cut -f 1,3 j.tmp >j.1
paste j.hsa j.1 |sed -e 's/\t//' > keggMapDesc.tab
rm j.hsa j.1
rm j.tmp
hgsql hg17 -e 'drop table keggMapDesc'
hgsql hg17 < ~/kent/src/hg/lib/keggMapDesc.sql
hgsql hg17 -e 'load data local infile "keggMapDesc.tab" into table keggMapDesc'
wget --timestamping ftp://ftp.genome.jp/pub/kegg/genes/organisms/hsa/hsa_pathway.list
cat hsa_pathway.list| sed -e 's/path://'|sed -e 's/:/\t/' > j.tmp
hgsql hg17 -e 'drop table keggPathway'
hgsql hg17 < ~/kent/src/hg/lib/keggPathway.sql
hgsql hg17 -e 'load data local infile "j.tmp" into table keggPathway'
hgsql hg17 -N -e \
    'select name, locusID, mapID from keggPathway p, knownToLocusLink l where p.locusID=l.value' \
    >keggPathway.tab
hgsql hg17 -e 'delete from keggPathway'
hgsql hg17 -e 'load data local infile "keggPathway.tab" into table keggPathway'
rm j.tmp

############################################################################
# Add KEGG column to hg17 Gene Sorter (Done, Fan, 6/18/2010)
mkdir -p /hive/data/genomes/hg17/bed/geneSorter
cd /hive/data/genomes/hg17/bed/geneSorter
hgsql hg17 -N -e 'select kgId, mapID, mapID, "+", locusID from keggPathway' \
    | sort -u | sed -e 's/\t+\t/+/' > knownToKeggEntrez.tab
hgsql hg17 -e 'drop table knownToKeggEntrez'
hgsql hg17 < ~/kent/src/hg/lib/knownToKeggEntrez.sql
hgsql hg17 -e 'load data local infile "knownToKeggEntrez.tab" into table knownToKeggEntrez'
#############################################################################
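# FYI: a quick consistency check on the tables just loaded (sketch; uses the
# same kgId/mapID/locusID columns selected above). Since knownToKeggEntrez.tab
# is the sort -u of those columns, the two counts below should match:
hgsql hg17 -N -e 'select count(*) from knownToKeggEntrez'
hgsql hg17 -N -e 'select count(distinct kgId, mapID, locusID) from keggPathway'
#############################################################################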