# for emacs: -*- mode: sh; -*-

# This file describes how we made the browser database on
# NCBI build 34 (July 18, 2003 freeze)

# HOW TO BUILD AN ASSEMBLY FROM NCBI FILES
# ----------------------------------------

# Make the gs.17 directory and its build34, agp and ffa subdirectories.
    mkdir /cluster/store4/gs.17
    mkdir /cluster/store4/gs.17/build34
    mkdir /cluster/store4/gs.17/agp
    mkdir /cluster/store4/gs.17/ffa

# Make a symbolic link from /cluster/store1 to this location
    cd /cluster/store1
    ln -s /cluster/store4/gs.17 ./gs.17

# Make a symbolic link from your home directory to the build dir:
    ln -s /cluster/store4/gs.17/build34 ~/oo

# NCBI download site (user and password from /cse/guests/kent/buildHg6.doc):
    ftp ftp.ncbi.nih.gov
    cd build_34

# Download all finished agp's and fa's into gs.17/agp
    mget chr*.agp
    mget chr*.fa.gz
    gunzip *.gz

# Download contig agp's into gs.17/build34
    get ref_placed.agp      # used to be in reference.agp
    get ref_unplaced.agp    # used to be in reference.agp
    get DR51.agp
    get PAR.agp             # new for this build - PAR regions added to chrY
    cat ref_placed.agp ref_unplaced.agp DR51.agp > ncbi_build34.agp

# Download contig fa's into gs.17/ffa
    get ref_placed.fa.gz    # used to be in reference.fa
    get ref_unplaced.fa.gz  # used to be in reference.fa
    get DR51.fa.gz
    get PAR.fa.gz           # new for this build - PAR regions added to chrY
    get sequence.inf
    cat ref_placed.fa ref_unplaced.fa DR51.fa > ncbi_build34.fa

# Download assembly related files into gs.17/build34
    get seq_contig.md
    get contig_overlaps.agp

# Download questionable join certificates file
    get e-certificates.txt
    mkdir certificates
    mv e-certificates.txt certificates

# Save a copy of the original seq_contig.md file
    cp seq_contig.md seq_contig.md.orig

# For build34, edit the seq_contig.md file to remove the alternative chr7
# sequence supplied by the Toronto group: NT_079590, NT_079591, NT_079592,
# NT_079593, NT_079594, NT_079595, NT_079596, NT_079597

# Edit seq_contig.md to make the DR51 alternative haplotype look like a
# chr6_random sequence:
#   9606 6 32491690 32629063 + NG_002432 GI:28212469 CONTIG DR51 1
# to
#   9606 6|NG_002432 1 137374 + NG_002432 GI:28212469 CONTIG DR51 1
# Move this edited DR51 line next to the other chr6_random contigs (for
# creating the lift file)

# Sanity check
    /cluster/bin/i386/checkYbr build34/ncbi_build34.agp ffa/ncbi_build34.fa \
        build34/seq_contig.md

# Convert fa files into UCSC style fa files and place in "contigs" directory
# inside the gs.17/build34 directory
    cd build34
    mkdir contigs
    /cluster/bin/i386/faNcbiToUcsc -split -ntLast ../ffa/ncbi_build34.fa \
        contigs

# Copy over chrM contig from previous version
    cd ~/oo
    cp -r gs.17/build33/M .

# Determine the chromosome sizes from agps
    /cluster/bin/scripts/getChromSizes ../agp

# Create lift files (this will create chromosome directory structure) and
# inserts file
    /cluster/bin/scripts/createNcbiLifts -s chrom_sizes seq_contig.md .

# Create contig agp files (will create contig directory structure)
    /cluster/bin/scripts/createNcbiCtgAgp seq_contig.md ncbi_build34.agp .

# Create chromosome random agp files.
    /cluster/bin/scripts/createNcbiChrAgp -randomonly .

# Copy the original chrN.agp files from the gs.17/agp directory
# into each of the chromosome directories since they contain better
# gap information.  Delete the comments at top from these.  (One way
# to script this step is sketched below.)
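# A minimal sketch of that copy/strip step, assuming the per-chromosome
# directories created by createNcbiLifts above; the foreach glob and the
# grep filter here are illustrative, not part of the original procedure:
    cd /cluster/store4/gs.17/build34
    foreach c ( ?{,?} )
        grep -v "^#" ../agp/chr$c.agp > $c/chr$c.agp
    end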
# Distribute contig .fa to appropriate directory (assumes all files
# are in "contigs" directory).

# Create global data link for everyone.  No more home directory
# links required.
    ln -s /cluster/store4/gs.17/build34 /cluster/data/hg16
    cd /cluster/data/hg16
    /cluster/bin/scripts/distNcbiCtgFa contigs .
    rm -r contigs

# Copy over jkStuff from previous build (??)
    mkdir jkStuff
    cp /cluster/store1/gs.17/build33/jkStuff/*.sh jkStuff
    cp /cluster/store1/gs.17/build33/jkStuff/*.csh jkStuff
    cp /cluster/store1/gs.17/build33/jkStuff/*.gsub jkStuff

# Create contig gl files
    /cluster/bin/i386/agpToGl contig_overlaps.agp . -md=seq_contig.md

# Create chromosome gl files
    jkStuff/liftGl.sh contig.gl

# Files ready for repeat-masking and trf

# CREATE STS/FISH/BACENDS/CYTOBANDS DIRECTORY STRUCTURE AND SETUP
# (DONE 2003-07-23 Terry)
# Create directory structure to hold information for these tracks
    cd /projects/hg2/booch/psl/
# Change Makefile parameters for OOVERS, GSVERS, PREVGS, PREVOO
    make new

# Update all Makefiles with latest OOVERS and GSVERS, DATABASE, and
# locations of .fa files.  (A scripted version of this edit is sketched
# below.)  Makefiles in:
#   /gs.17/build33/
#   /gs.17/build33/bacends
#   /gs.17/build33/cytobands
#   /gs.17/build33/cytoPlots
#   /gs.17/build33/fish
#   /gs.17/build33/fosends
#   /gs.17/build33/g2g
#   /gs.17/build33/geneticPlots
#   /gs.17/build33/primers
#   /gs.17/build33/recombrate
#   /gs.17/build33/sts
#   /gs.17/build33/stsPlots
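# The same version variables recur in each Makefile listed above, so the
# edit can be scripted.  A sketch, assuming "make new" created the
# gs.17/build34 tree and that the Makefiles use plain "VAR = value"
# assignments (the variable format and values below are assumptions to
# verify against the real Makefiles before running):
    cd /projects/hg2/booch/psl/gs.17/build34
    foreach m ( Makefile */Makefile )
        perl -pi -e 's/^OOVERS\s*=.*/OOVERS = build34/; s/^GSVERS\s*=.*/GSVERS = gs.17/; s/^DATABASE\s*=.*/DATABASE = hg16/' $m
    end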
# Create accession_info file *****
    make accession_info.rdb

# UPDATE STS INFORMATION (DONE 2003-07-23 Terry)
# Download and unpack updated information from dbSTS:
    cd /projects/hg2/booch/psl/update
    wget ftp://ftp.ncbi.nih.gov/repository/dbSTS/dbSTS.sts
    wget ftp://ftp.ncbi.nih.gov/repository/dbSTS/dbSTS.aliases
    wget ftp://ftp.ncbi.nih.gov/blast/db/FASTA/sts.Z
    mv sts.Z dbSTS.FASTA.dailydump.Z
    gunzip dbSTS.FASTA.dailydump.Z

# Make new directory for this info and move files there
    mkdir /cluster/store1/sts.8
    cp all.STS.fa /cluster/store1/sts.8
    cp all.primers /cluster/store1/sts.8
    cp all.primers.fa /cluster/store1/sts.8

# Copy new files to cluster
    ssh kkstore
    cd /cluster/store1/sts.8
    cp /cluster/store1/sts.8/*.* /scratch/hg/STS
# Ask for propagation from sysadmin

# Load the sequences into the database (after database created)
    ssh hgwdev
    mkdir /gbdb/hg16/sts.8
    cd /gbdb/hg16/sts.8
    ln -s /cluster/store1/sts.8/all.STS.fa ./all.STS.fa
    ln -s /cluster/store1/sts.8/all.primers.fa ./all.primers.fa
    cd /cluster/store2/tmp
    hgLoadRna addSeq hg16 /gbdb/hg16/sts.8/all.STS.fa
    hgLoadRna addSeq hg16 /gbdb/hg16/sts.8/all.primers.fa

# CREATE STS MARKER ALIGNMENTS (DONE 2003-08-03 Terry)
# Create full sequence alignments
    ssh kk
    cd /cluster/home/booch/sts
# Update Makefile with latest OOVERS and GSVERS and run cluster jobs
    make new
    make jobList
    para create jobList
    para push
# wait until alignments done
    make stsMarkers.psl
# Copy files to final destination and remove originals
    make copy.assembly
    make clean

# Create primer alignments
    ssh kk
    cd /cluster/home/booch/primers
# Update Makefile with latest OOVERS and GSVERS and run cluster jobs
    make new
    make jobList.scratch
    para create jobList
    para push
# Do an initial quick filter of results (takes a while, still) and create
# final file - best done on eieio since its disks are local
    ssh eieio
    make filter
    make primers.psl
# Copy files to final destination and remove
    make copy.assembly
    make clean

# Create ePCR alignments
    ssh kk
    cd /cluster/home/booch/epcr
# Update Makefile with latest OOVERS and GSVERS
    make new
    make jobList
    para create jobList
    para push
    make all.epcr
# Copy files to final destination and remove
    make copy.assembly
    make clean

# CREATE AND LOAD STS MARKERS TRACK (DONE 2003-08-03 Terry)
# Copy in current stsInfo2.bed and stsAlias.bed files
    cd /projects/hg2/booch/psl/gs.17/build33
    cp ../update/stsInfo2.bed .
    cp ../update/stsAlias.bed .

# Create final version of sts sequence placements
    ssh kks00
    cd /projects/hg2/booch/psl/gs.17/build33/sts
    make stsMarkers.final

# Create final version of primers placements
# Make sure PRIMERS variable in Makefile is pointing to current version
    cd /projects/hg2/booch/psl/gs.17/build33/primers
    make primers.final

# Create bed file
    cd /projects/hg2/booch/psl/gs.17/build33
    make stsMap.bed

# Create database tables
    ssh hgwdev
    cd /projects/hg2/booch/psl/tables
    hgsql hg16 < all_sts_primer.sql
    hgsql hg16 < all_sts_seq.sql
    hgsql hg16 < stsAlias.sql
    hgsql hg16 < stsInfo2.sql
    hgsql hg16 < stsMap.sql

# Load the tables
    cd /projects/hg2/booch/psl/gs.17/build34/sts/
    echo 'load data local infile "stsMarkers.psl.filter.lifted" into table all_sts_seq;' | hgsql hg16
    cd /projects/hg2/booch/psl/gs.17/build34/primers/
    echo 'load data local infile "primers.psl.filter.lifted" into table all_sts_primer;' | hgsql hg16
    cd /projects/hg2/booch/psl/gs.17/build34/
    echo 'load data local infile "stsAlias.bed" into table stsAlias;' | hgsql hg16
    echo 'load data local infile "stsInfo2.bed" into table stsInfo2;' | hgsql hg16
    echo 'load data local infile "stsMap.bed" into table stsMap;' | hgsql hg16

# CREATE AND LOAD RECOMBINATION RATE TRACK (DONE 2003-08-05 Terry)
# (must be done after STS Markers track)
# Create bed file
    cd /projects/hg2/booch/psl/gs.17/build34/recombrate
    make recombRate.bed
# Create database table
    ssh hgwdev
    cd /projects/hg2/booch/psl/tables
    hgsql hg16 < recombRate.sql
# Load the table
    cd /projects/hg2/booch/psl/gs.17/build34/recombrate/
    echo 'load data local infile "recombRate.bed" into table recombRate;' | hgsql hg16

# UPDATE BACEND SEQUENCES (DONE 2003-07-23 Terry)
# **** Sequences were determined to not have changed since bacends.4 ****
# **** No new sequences downloaded - see makeHg15.doc for download ****
# **** instructions ****
# Load the sequences into the database (after database created)
    ssh hgwdev
    mkdir /gbdb/hg16/bacends.4
    cd /gbdb/hg16/bacends.4
    ln -s /cluster/store1/bacends.4/BACends.fa ./BACends.fa
    cd /cluster/store2/tmp
    hgLoadRna addSeq hg16 /gbdb/hg16/bacends.4/BACends.fa

# BACEND SEQUENCE ALIGNMENTS (DONE 2003-08-01 Terry)
# (alignments done without RepeatMasking)
# Create full sequence alignments
    ssh kk
    cd /cluster/home/booch/bacends
# Update Makefile with latest OOVERS and GSVERS and run cluster jobs
    make new
    make jobList
    para create jobList
    para push
# Compile alignments and lift the files (takes a while)
    ssh eieio
    make bacEnds.psl.lifted
# Copy files to final destination and remove
# (may want to wait until sure they're OK)
    make copy.assembly
    make clean

# BACEND PAIRS TRACK (DONE 2003-08-01 Terry)
# Add /projects/compbiousr/booch/booch/scripts to your path (see below).
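# In csh/tcsh that is a one-liner, e.g. for your .cshrc:
    set path = ( /projects/compbiousr/booch/booch/scripts $path )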
# Update Makefile with new location of pairs/singles files, if
# necessary (DONE)
    cd /projects/hg2/booch/psl/gs.17/build33/bacends
# Make initial file of alignments
    make bacEnds.rdb
# Try to fish out more pairs
    make bacEndsMiss.psl
# Re-make bacEnds.rdb with new info
    make bacEnds.rdb
# Create bacEndPairs track file
    make bacEndPairs.bed
# Create bacEndPairsBad and bacEndPairsLong files
    make bacEndPairsBad.bed
# Create psl file to load
    make bacEnds.load.psl

# Create database tables
    ssh hgwdev
    cd /projects/hg2/booch/psl/tables
    hgsql hg16 < all_bacends.sql
    hgsql hg16 < bacEndPairs.sql
    hgsql hg16 < bacEndPairsBad.sql
    hgsql hg16 < bacEndPairsLong.sql

# Load the tables
    cd /projects/hg2/booch/psl/gs.17/build34/bacends/
    echo 'load data local infile "bacEnds.load.psl" into table all_bacends;' | hgsql hg16
    echo 'load data local infile "bacEndPairs.bed" into table bacEndPairs;' | hgsql hg16
    echo 'load data local infile "bacEndPairsBad.bed" into table bacEndPairsBad;' | hgsql hg16
    echo 'load data local infile "bacEndPairsLong.bed" into table bacEndPairsLong;' | hgsql hg16

# FOSEND SEQUENCE ALIGNMENTS (DONE 2003-08-03 Terry)
# Create full sequence alignments
    ssh kk
    cd /cluster/home/booch/fosends
# Update Makefile with latest OOVERS and GSVERS and run cluster jobs
    make new
    make jobList
    para create jobList
    para push
# Compile alignments and lift the files (takes a while)
    ssh eieio
    cd /cluster/home/booch/fosends
    make fosEnds.psl.lifted
# Copy files to final destination and remove
    make copy.assembly
    make clean

# FOSEND PAIRS TRACK (DONE 2003-08-01 Terry)
# Update Makefile with location of pairs files, if necessary
    ssh kks00
    cd /projects/hg2/booch/psl/gs.17/build33/fosends
# Make initial file of alignments
    make fosEnds.rdb
# Try to fish out more pairs
    make fosEndsMiss.psl
# Re-make fosEnds.rdb with new info
    make fosEnds.rdb
# Create fosEndPairs track file
    make fosEndPairs.bed
# Create fosEndPairsBad and fosEndPairsLong files
    make fosEndPairsBad.bed
# Create psl file to load
    make fosEnds.load.psl

# Create database tables
    ssh hgwdev
    cd /projects/hg2/booch/psl/tables
    hgsql hg16 < all_fosends.sql
    hgsql hg16 < fosEndPairs.sql
    hgsql hg16 < fosEndPairsBad.sql
    hgsql hg16 < fosEndPairsLong.sql

# Load the tables
    cd /projects/hg2/booch/psl/gs.17/build34/fosends/
    echo 'load data local infile "fosEnds.load.psl" into table all_fosends;' | hgsql hg16
    echo 'load data local infile "fosEndPairs.bed" into table fosEndPairs;' | hgsql hg16
    echo 'load data local infile "fosEndPairsBad.bed" into table fosEndPairsBad;' | hgsql hg16
    echo 'load data local infile "fosEndPairsLong.bed" into table fosEndPairsLong;' | hgsql hg16

# Load the sequences (change fosends.# to match correct location)
# (done for hg15 early 4/9/2003)
    mkdir /gbdb/hg15/fosends.3
    cd /gbdb/hg15/fosends.3
    ln -s /cluster/store1/fosends.3/fosEnds.fa ./fosEnds.fa
    cd /cluster/store2/tmp
    hgLoadRna addSeq hg15 /gbdb/hg15/fosends.3/fosEnds.fa

# UPDATE FISH CLONES INFORMATION (DONE 2003-07-23 Terry)
# Download the latest info from NCBI:
# point browser at
#   http://www.ncbi.nlm.nih.gov/genome/cyto/cytobac.cgi?CHR=all&VERBOSE=ctg
# change "Show details on sequence-tag" to "yes"
# change "Download or Display" to "Download table for UNIX"
# press Submit - save as
#   /projects/hg2/booch/psl/fish/hbrc/hbrc.20030723.table
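# If the form settings can be passed as CGI parameters, the download could
# be scripted as below; this is a guess (only CHR and VERBOSE appear in the
# URL above, and the two menu settings may not be scriptable), so the
# interactive steps remain authoritative:
    wget -O /projects/hg2/booch/psl/fish/hbrc/hbrc.20030723.table \
        'http://www.ncbi.nlm.nih.gov/genome/cyto/cytobac.cgi?CHR=all&VERBOSE=ctg'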
# Format file just downloaded
    cd /projects/hg2/booch/psl/fish/
# Edit Makefile to point at file just downloaded (variables HBRC,
# HBRCFORMAT)
    make HBRC
# (Manually added 21 results from FHCRC)
# Copy it to the new freeze location
    cp /projects/hg2/booch/psl/fish/all.fish.format \
        /projects/hg2/booch/psl/gs.17/build34/fish/
# Save it as the new "gold" file
    cp all.fish.format all.fish.format.gold

# CREATE AND LOAD FISH CLONES TRACK (DONE 2003-08-08 Terry)
# (must be done after Coverage, STS markers track and BAC end pairs track)
# Extract the file with clone positions from database
    ssh hgwdev
    echo 'select * into outfile "/tmp/booch/clonePos.txt" from clonePos' \
        | hgsql hg16
    mv /tmp/booch/clonePos.txt /projects/hg2/booch/psl/gs.17/build34/fish
# Get current clone/accession information
    ssh kks00
    cd /projects/hg2/booch/psl/gs.17/build34/fish
    wget http://www.ncbi.nlm.nih.gov/genome/clone/DATA/clac.out
# Create initial placement file
    cp /projects/hg2/booch/psl/gs.17/build33/fish/extract.pl .
    make cyto.markers.bed
# Get sequences for accessions not in genome:
# go to http://www.ncbi.nlm.nih.gov/entrez/batchentrez.cgi?db=Nucleotide
# select file "/projects/hg2/booch/psl/gs.17/build33/fish/not.found.acc"
# change output to FASTA format
# download results to "/projects/hg2/booch/psl/gs.17/build33/fish/not.found.fa"
# Place sequences against genome
    make blat
# Try to incorporate new placements
    make cyto.markers.bed2
# Create bed file
    make fishClones.bed
# Create database table
    ssh hgwdev
    cd /projects/hg2/booch/psl/tables
    hgsql hg16 < fishClones.sql
# Load the table
    cd /projects/hg2/booch/psl/gs.17/build34/fish/
    echo 'load data local infile "fishClones.bed" into table fishClones;' | hgsql hg16

# CREATE AND LOAD CHROMOSOME BANDS TRACK (DONE 2003-08-08 Terry)
# (must be done after FISH Clones track)
# Create bed file
    ssh kks00
    cd /projects/hg2/booch/psl/gs.17/build34/cytobands/
    make setBands.txt
# NOTE: may get errors if inserts file out-of-sync with pctSetBands file
    make cytobands.pct.ranges
    make predict
# Create database table
    ssh hgwdev
    cd /projects/hg2/booch/psl/tables
    hgsql hg16 < cytoBand.sql
# Load the table
    cd /projects/hg2/booch/psl/gs.17/build34/cytobands/
    echo 'load data local infile "cytobands.bed" into table cytoBand;' | hgsql hg16

# Make cytoBandIdeo track for ideogram gif on hgTracks page.
# For human, cytoBandIdeo is just a replicate of the cytoBand track.
# Make the cytoBand track (above) and then:
    echo "create table cytoBandIdeo (index(chrom(10),chromStart)) as select * from cytoBand;" | hgsql hg16

# CREATING DATABASE (DONE - 2003-07-26 - Hiram)
    ssh hgwdev
# Make sure there is at least 5 gig free on hgwdev:/var/lib/mysql
    df -h /var/lib/mysql
# Filesystem            Size  Used Avail Use% Mounted on
# /dev/sda1             472G  416G   31G  93% /var/lib/mysql
# Create the database.
    echo 'create database hg16' | hgsql hg15
# make a semi-permanent read-only alias (add this to your .cshrc/.bashrc):
#   alias hg16 mysql -u hguser -phguserstuff -A hg16
# (I have not seen a use for this in any procedures ? -Hiram)
# (use 'hgsql hg16' instead)
# Initialize the relational-mrna and external sequence info tables:
    hgLoadRna new hg16
# Copy over grp table (for track grouping) from another database:
    echo "create table grp (PRIMARY KEY(NAME)) select * from hg15.grp" \
        | hgsql hg16
# Add ENCODE track group.  Move Repeats lower in priority.
    echo 'UPDATE grp SET priority=7 WHERE name="varRep"' | hgsql hg16
    echo 'INSERT INTO grp (name, label, priority) VALUES ("encode", "ENCODE Tracks", 8)' | hgsql hg16
# New ENCODE groups
    echo 'INSERT INTO grp (name, label, priority) VALUES ("encodeGenes", "ENCODE Regions and Genes", 8.1)' | hgsql hg16
    echo 'INSERT INTO grp (name, label, priority) VALUES ("encodeTxLevels", "ENCODE Transcript Levels", 8.2)' | hgsql hg16
    echo 'INSERT INTO grp (name, label, priority) VALUES ("encodeChip", "ENCODE Chromatin Immunoprecipitation", 8.3)' | hgsql hg16
    echo 'INSERT INTO grp (name, label, priority) VALUES ("encodeChrom", "ENCODE Chromosome, Chromatin and DNA Structure", 8.4)' | hgsql hg16
    echo 'INSERT INTO grp (name, label, priority) VALUES ("encodeCompGeno", "ENCODE Comparative Genomics", 8.5)' | hgsql hg16
    echo 'INSERT INTO grp (name, label, priority) VALUES ("encodeVariation", "ENCODE Variation", 8.6)' | hgsql hg16
    echo 'INSERT INTO grp (name, label, priority) VALUES ("encodeAnalysis", "ENCODE Analysis", 8.9)' | hgsql hg16

# MAKE HGCENTRALTEST ENTRY AND TRACKDB TABLE (DONE - 2003-07-26 - Hiram)
    ssh hgwdev
# Enter hg16 into hgcentraltest.dbDb so test browser knows about it:
    echo 'insert into dbDb (name, description, nibPath, organism, \
        defaultPos, active, orderKey, genome, scientificName) \
        values("hg16", "July 2003", "/gbdb/hg16/nib", "Human", \
        "chr7:26828631-26938371", 1, 10, "Human", "Homo sapiens");' \
        | hgsql -h genome-testdb hgcentraltest
# Make trackDb table so browser knows what tracks to expect:
    cd ~kent/src/hg/makeDb/trackDb
    cvs up -d -P .
# Edit that makefile to add hg16 in all the right places and do
    make update
    make alpha
    cvs commit makefile

# MAKE LIFTALL.LFT, NCBI.LFT (DONE - 2003-07-26 - Hiram)
    cd /cluster/data/hg16
    mkdir -p jkStuff
    cat ?{,?}/lift/{ordered,random}.lft > jkStuff/liftAll.lft
# Create jkStuff/ncbi.lft for lifting stuff built with the NCBI assembly.
# Note: this ncbi.lift will not lift floating contigs to chr_random coords,
# but it will show the strand orientation of the floating contigs
# (grep for '|').
    mdToNcbiLift seq_contig.md jkStuff/ncbi.lft
# If a lift file has been edited (e.g. as in 6.2.5 above), edit ncbi.lft
# to match.  If no step 6.2.5 then no editing needed.

# REPEAT MASKING (DONE - 2003-07-25 - Hiram, REDONE 2003-08-02)
# Split contigs, run RepeatMasker, lift results
# Notes:
# * Using new RepeatMasker in /cluster/bluearc/RepeatMasker030619
#   Always check for new RepeatMasker before proceeding
# * Contigs (*/N{T,G}_*/N{T,G}_*.fa) are split into 500kb chunks to make
#   RepeatMasker runs manageable on the cluster ==> results need lifting.
# * For the NCBI assembly we repeat mask on the sensitive mode setting
#   (RepeatMasker -s)

#- Split contigs into 500kb chunks:
    ssh eieio
    cd /cluster/data/hg16
    foreach chrom ( ?{,?} )
        foreach c ( $chrom/N{T,G}_?????? )
            set contig = $c:t
            echo "splitting ${chrom}/${contig}/${contig}.fa"
            faSplit size ${chrom}/${contig}/$contig.fa 500000 \
                ${chrom}/${contig}/${contig}_ -lift=${chrom}/${contig}/$contig.lft \
                -maxN=500000
        end
    end

#- Make the run directory and job list:
    cd /cluster/data/hg16
    mkdir -p jkStuff
# According to the RepeatMasker help file, no arguments are required to
# specify species because its default is set for primate (human).
# This run script saves the .tbl file to be sent to Arian.  He uses
# those for his analysis.  Sometimes he needs the .cat and .align files
# for checking problems.  Krish needs the .align files; they are large.
    cat << '_EOF_' > jkStuff/RMHuman
#!/bin/csh -fe
cd $1
pushd .
/bin/mkdir -p /tmp/hg16/$2
/bin/cp $2 /tmp/hg16/$2/
cd /tmp/hg16/$2
/cluster/bluearc/RepeatMasker030619/RepeatMasker -ali -s $2
popd
/bin/cp /tmp/hg16/$2/$2.out ./
if (-e /tmp/hg16/$2/$2.align) /bin/cp /tmp/hg16/$2/$2.align ./
if (-e /tmp/hg16/$2/$2.tbl) /bin/cp /tmp/hg16/$2/$2.tbl ./
# if (-e /tmp/hg16/$2/$2.cat) /bin/cp /tmp/hg16/$2/$2.cat ./
/bin/rm -fr /tmp/hg16/$2/*
/bin/rmdir --ignore-fail-on-non-empty /tmp/hg16/$2
/bin/rmdir --ignore-fail-on-non-empty /tmp/hg16
'_EOF_'
    # << this line makes emacs coloring happy
    chmod +x jkStuff/RMHuman

    ssh eieio
    cd /cluster/data/hg16
    mkdir RMRun
    rm -f RMRun/RMJobs
    touch RMRun/RMJobs
    foreach d ( ?{,?} )
        foreach c ( $d/N{T,G}_*/N{T,G}_*_*.fa )
            set f = $c:t
            set cc = $c:h
            set contig = $cc:t
            echo /cluster/store4/gs.17/build34/jkStuff/RMHuman \
                /cluster/store4/gs.17/build34/${d}/${contig} $f \
                '{'check out line+ /cluster/store4/gs.17/build34/${d}/${contig}/$f.out'}' \
                >> RMRun/RMJobs
        end
    end

# We have 6015 jobs in RMJobs:
    wc RMRun/RMJobs
#    6015   42105 1184896 RMRun/RMJobs

#- Do the run
    ssh kk
    cd /cluster/data/hg16/RMRun
    para create RMJobs
    para try, para check, para check, para push, para check, ...
#- While that is running, you can run TRF (simpleRepeat) on the small
#  cluster.  See the SIMPLE REPEAT section below.
# CPU time in finished jobs:   33575296s  559588.26m  9326.47h  388.60d  1.065 y
# IO & Wait Time:                238878s    3981.30m    66.36h    2.76d  0.008 y
# Average job time:                7513s     125.21m     2.09h    0.09d
# Longest job:                    18457s     307.62m     5.13h    0.21d
# Submission to last job:         55537s     925.62m    15.43h    0.64d

#- Lift up the split-contig .out's to contig-level .out's
    ssh eieio
    cd /cluster/data/hg16
    foreach d ( ?{,?}/N{T,G}_* )
        cd $d
        set contig = $d:t
        liftUp $contig.fa.out $contig.lft warn ${contig}_?{,?,??}.fa.out
        cd ../..
    end

#- Lift up RepeatMask .out files to chromosome coordinates via
#  jkStuff/liftOut2.sh, picked up from the hg15 build.  Reset the
#  liftUp command from ~kent/bin/$MACHTYPE to be from
#  /cluster/bin/i386.  Took the redirection to /dev/null off of the
#  command and capture the output here to see what errors we have.
    ./jkStuff/liftOut2.sh > liftOut2.out 2>&1 &

#- By this point, the database should have been created (above):
    ssh hgwdev
    cd /cluster/data/hg16
    hgLoadOut hg16 ?/*.fa.out ??/*.fa.out
# errors during this load:
#   Processing 2/chr2.fa.out
#   Strange perc. field -6.1 line 243430 of 2/chr2.fa.out
#   Strange perc. field -5.6 line 243430 of 2/chr2.fa.out
#   Strange perc. field -6.1 line 243432 of 2/chr2.fa.out
#   Strange perc. field -5.6 line 243432 of 2/chr2.fa.out
#   Processing 5/chr5.fa.out
#   Strange perc. field -0.3 line 4339 of 5/chr5.fa.out
#   Processing 19/chr19.fa.out
#   Strange perc. field -18.6 line 77032 of 19/chr19.fa.out

# SIMPLE REPEAT [TRF] TRACK (DONE - 2003-07-25 - Hiram)
# Distribute contigs to /iscratch/i
    ssh kkr1u00
    rm -rf /iscratch/i/gs.17/build34/contigs
    mkdir -p /iscratch/i/gs.17/build34/contigs
    cd /cluster/data/hg16
    cp -p contigs/*.fa /iscratch/i/gs.17/build34/contigs
# Make sure the total size looks like what you'd expect:
    du ./contigs /iscratch/i/gs.17/build34/contigs
#   2839768 ./contigs
#   2839768 /iscratch/i/gs.17/build34/contigs
    ~kent/bin/iSync

# Create cluster parasol job like so:
    mkdir -p /cluster/data/hg16/bed/simpleRepeat
    cd /cluster/data/hg16/bed/simpleRepeat
    mkdir trf
    cat << '_EOF_' > runTrf
#!/bin/csh -fe
#
set path1 = $1
set inputFN = $1:t
set outpath = $2
set outputFN = $2:t
mkdir -p /tmp/$outputFN
cp $path1 /tmp/$outputFN
pushd .
cd /tmp/$outputFN
/cluster/bin/i386/trfBig -trf=/cluster/bin/i386/trf $inputFN /dev/null -bedAt=$outputFN -tempDir=/tmp
popd
rm -f $outpath
cp -p /tmp/$outputFN/$outputFN $outpath
rm -fr /tmp/$outputFN/*
rmdir --ignore-fail-on-non-empty /tmp/$outputFN
'_EOF_'
    # << this line makes emacs coloring happy
    chmod +x runTrf

    cat << '_EOF_' > gsub
#LOOP
./runTrf {check in line+ $(path1)} {check out line trf/$(root1).bed}
#ENDLOOP
'_EOF_'
    # << this line makes emacs coloring happy

    ls -1S /iscratch/i/gs.17/build34/contigs/*.fa > genome.lst
    gensub2 genome.lst single gsub spec
    para create spec
    para try
    para check
    para push
    para check
# Completed: 472 of 472 jobs
# CPU time in finished jobs:      36177s     602.95m    10.05h    0.42d  0.001 y
# IO & Wait Time:                  2038s      33.97m     0.57h    0.02d  0.000 y
# Average job time:                  81s       1.35m     0.02h    0.00d
# Longest job:                     6992s     116.53m     1.94h    0.08d
# Submission to last job:         10703s     178.38m     2.97h    0.12d

# When the cluster run is done, pick up a couple of extra files not caught
# in the above sequence:
    ./runTrf /cluster/store4/gs.17/build34/M/NT_999999/NT_999999.fa \
        trf/NT_999999.bed
# That produces an empty .bed file; mark it so:
    echo "# trf run produces nothing for this one" >> trf/NT_999999.bed

    liftUp simpleRepeat.bed /cluster/data/hg16/jkStuff/liftAll.lft \
        warn trf/*.bed > lu.out 2>&1

# Load into the database:
    ssh hgwdev
    cd /cluster/data/hg16/bed/simpleRepeat
    /cluster/bin/i386/hgLoadBed hg16 simpleRepeat simpleRepeat.bed \
        -sqlTable=$HOME/kent/src/hg/lib/simpleRepeat.sql
# stringTab = 0
# Reading simpleRepeat.bed
# Loaded 627883 elements
# Sorted
# Saving bed.tab
# Loading hg16

# PROCESS SIMPLE REPEATS INTO MASK (DONE - 2003-07-27 - Hiram - REDONE 07-30)
# After the simpleRepeats track has been built, make a filtered version
# of the trf output: keep trf's with period <= 12:
    ssh eieio
    cd /cluster/data/hg16/bed/simpleRepeat
    mkdir -p trfMask
    foreach f (trf/*.bed)
        awk '{if ($5 <= 12) print;}' $f > trfMask/$f:t
    end

# Lift up filtered trf output to chrom coords as well:
    cd /cluster/data/hg16
    mkdir -p bed/simpleRepeat/trfMaskChrom
    foreach c (?{,?})
        if (-e $c/lift/ordered.lst) then
            perl -wpe 's@(\S+)@bed/simpleRepeat/trfMask/$1.bed@' \
                $c/lift/ordered.lst > $c/lift/oTrf.lst
            liftUp bed/simpleRepeat/trfMaskChrom/chr$c.bed \
                jkStuff/liftAll.lft warn `cat $c/lift/oTrf.lst`
        endif
        if (-e $c/lift/random.lst) then
            perl -wpe 's@(\S+)@bed/simpleRepeat/trfMask/$1.bed@' \
                $c/lift/random.lst > $c/lift/rTrf.lst
            liftUp bed/simpleRepeat/trfMaskChrom/chr${c}_random.bed \
                jkStuff/liftAll.lft warn `cat $c/lift/rTrf.lst`
        endif
    end

# MASK SEQUENCE BOTH REPEATMASKER AND SIMPLE REPEAT/TRF (DONE - 2003-07-27)
# - Hiram
# This used to be done right after RepeatMasking.  Now we mask with
# TRF as well, so do this after the "PROCESS SIMPLE REPEATS" step above.
    ssh eieio
    cd /cluster/data/hg16
# Make chr*.fa from contig .fa
# Copied chrFa.sh from hg15/jkStuff - reset path from ~kent to
# /cluster for the ctgToChromFa command
    tcsh ./jkStuff/chrFa.sh > chrFa.out 2>&1 &
# Copied these three scripts from hg15 - fix up their path names to
# reference /cluster/bin instead of ~kent/bin
#- Soft-mask (lower-case) the contig and chr .fa's
    tcsh ./jkStuff/makeFaMasked.sh > maFaMasked.out 2>&1
#- Make hard-masked .fa.masked files as well:
    tcsh ./jkStuff/makeHardMasked.sh > maHardMasked.out 2>&1
#- Rebuild the nib, mixedNib, maskedNib files:
    tcsh ./jkStuff/makeNib.sh > maNib.out 2>&1
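# The path fixups mentioned above can be scripted.  A sketch, assuming the
# scripts reference binaries under ~kent/bin/i386 (check each script by
# hand before and after the edit):
    cd /cluster/data/hg16/jkStuff
    perl -pi -e 's#~kent/bin/i386#/cluster/bin/i386#g' \
        chrFa.sh makeFaMasked.sh makeHardMasked.sh makeNib.sh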
# Make symbolic links from /gbdb/hg16/nib to the real nibs.
    ssh hgwdev
    mkdir -p /gbdb/hg16/nib
    foreach f (/cluster/store4/gs.17/build34/nib/chr*.nib)
        ln -s $f /gbdb/hg16/nib
    end
# Load /gbdb/hg16/nib paths into database and save size info.
    hgsql hg16 < ~/kent/src/hg/lib/chromInfo.sql
    cd /cluster/data/hg16
    hgNibSeq -preMadeNib hg16 /gbdb/hg16/nib ?{,?}/chr?{,?}{,_random}.fa
    echo "select chrom,size from chromInfo" | hgsql -N hg16 > chrom.sizes

# Copy the masked contig fa to /iscratch and /scratch,
# and everything else we will need for blastz runs, etc ...
    ssh kkr1u00
    rm -rf /iscratch/i/gs.17/build34/trfFa
    mkdir -p /iscratch/i/gs.17/build34/trfFa
    cp -p /cluster/data/hg16/?{,?}/N{T,G}_*/N{T,G}_??????.fa /iscratch/i/gs.17/build34/trfFa
    rm -rf /iscratch/i/gs.17/build34/bothMaskedNibs
    mkdir -p /iscratch/i/gs.17/build34/bothMaskedNibs
    cp -p /cluster/data/hg16/nib/*.nib /iscratch/i/gs.17/build34/bothMaskedNibs
    rm -rf /iscratch/i/gs.17/build34/rmsk
    mkdir -p /iscratch/i/gs.17/build34/rmsk
    cp -p /cluster/data/hg16/?{,?}/*.out /iscratch/i/gs.17/build34/rmsk
    ~kent/bin/iSync

#   ssh kkstore
# Since kkstore's /scratch is currently /cluster/bluearc/scratch, it is
# better to do this on eieio and copy to it:
    rm -rf /scratch/hg/gs.17/build34/trfFa
    mkdir -p /scratch/hg/gs.17/build34/trfFa
    cp -p /cluster/data/hg16/?{,?}/N{T,G}_*/N{T,G}_??????.fa /scratch/hg/gs.17/build34/trfFa
    rm -rf /scratch/hg/gs.17/build34/bothMaskedNibs
    mkdir /scratch/hg/gs.17/build34/bothMaskedNibs
    cp -p /cluster/data/hg16/nib/*.nib /scratch/hg/gs.17/build34/bothMaskedNibs
    rm -rf /scratch/hg/gs.17/build34/rmsk
    mkdir -p /scratch/hg/gs.17/build34/rmsk
    cp -p /cluster/data/hg16/?{,?}/*.out /scratch/hg/gs.17/build34/rmsk
# request rsync of kkstore /scratch

# O+O: ASSEMBLY [GOLD], GAP, COVERAGE, MAP CONTIGS TRACKS (DONE - 2003-07-27)
# Store o+o info in database.
    ssh eieio
    cd /cluster/store4/gs.17/build34
    if (-f contig_overlaps.agp) then
        jkStuff/liftGl.sh contig.gl
    else
        ssh hgwdev
        hgGoldGapGl -noGl hg16 /cluster/store4/gs.17 build34
        echo ""
        echo "*** Note from makeHg15.doc:"
        echo "Come back to this step later when we have contig_overlaps.agp\!"
    endif

    ssh hgwdev
    cd /cluster/store4/gs.17/build34
    if (-f contig_overlaps.agp) then
        hgGoldGapGl hg16 /cluster/store4/gs.17 build34
        cd /cluster/store4/gs.17
        /cluster/bin/i386/hgClonePos hg16 build34 ffa/sequence.inf \
            /cluster/store4/gs.17 -maxErr=3
    endif
    cd /cluster/store4/gs.17
# (2/27/04 angie) re-loaded -- chr{1,4,8,15}_random lift files changed
# 7/30/04.
    hgCtgPos hg16 build34

# CREATE NON-STANDARD JOIN CERTIFICATES WEB PAGE AND TABLE
# Filter certificates file to only contain those relevant to current
# assembly
    cd ~/hg16/certificates
    /cluster/bin/scripts/extractCertificates.pl e-certificates.txt ~/hg16 \
        > e-certificates.filter.txt
# Create initial web page and table for loading into database
    hgCert e-certificates.filter.txt > certificates.html
# Donna's edits to html page
# (3/2/04 angie: edit cert.tab to remove some extra tab characters in
# comments so mySql doesn't truncate them, & reload)
# Load cert table into database
    ssh hgwdev
    cd ~/hg16/certificates
    echo "drop table certificate" | hgsql hg16
    hgsql hg16 < ~/kent/src/hg/lib/certificate.sql
    echo 'load data local infile "cert.tab" into table certificate;' \
        | hgsql hg16

# AUTO UPDATE GENBANK MRNA RUN (WORKING - 2003-07-30 - Hiram)
    ssh eieio
    cd /cluster/store5/genbank
# This is a new organism, edit the etc/genbank.conf file and add:
# hg16
hg16.genome = /scratch/hg/gs.17/build34/bothMaskedNibs/chr*.nib
hg16.lift = /cluster/store4/gs.17/build34/jkStuff/liftAll.lft
hg16.genbank.est.xeno.load = yes
hg16.mgcTables.default = full
hg16.mgcTables.mgc = all
hg16.downloadDir = hg16

    ssh eieio
    cd /cluster/store5/genbank
    nice bin/gbAlignStep -iserver=no -clusterRootDir=/cluster/bluearc/genbank \
        -srcDb=genbank -type=mrna -verbose=1 -initial hg16
# Completed: 49591 of 49591 jobs
# CPU time in finished jobs:    3853288s   64221.47m  1070.36h   44.60d  0.122 y
# IO & Wait Time:                246323s    4105.38m    68.42h    2.85d  0.008 y
# Average job time:                  83s       1.38m     0.02h    0.00d
# Longest job:                    21265s     354.42m     5.91h    0.25d
# Submission to last job:         22930s     382.17m     6.37h    0.27d

# Load the results from the above
    ssh hgwdev
    cd /cluster/store5/genbank
    nice bin/gbDbLoadStep -verbose=1 -drop -initialLoad hg16

# To get this next one started, the above results need to be
# moved out of the way.  These things can be removed if there are
# no problems to debug.
    ssh eieio
    cd /cluster/bluearc/genbank/work
    mv initial.hg16 initial.hg16.genbank.mrna

    ssh eieio
    cd /cluster/store5/genbank
    nice bin/gbAlignStep -iserver=no -clusterRootDir=/cluster/bluearc/genbank \
        -srcDb=refseq -type=mrna -verbose=1 -initial hg16
# Completed: 68740 of 68740 jobs
# CPU time in finished jobs:    1253290s   20888.16m   348.14h   14.51d  0.040 y
# IO & Wait Time:                309126s    5152.10m    85.87h    3.58d  0.010 y
# Average job time:                  23s       0.38m     0.01h    0.00d
# Longest job:                    13290s     221.50m     3.69h    0.15d
# Submission to last job:         13609s     226.82m     3.78h    0.16d
# The iservers came back on-line, so use them for this run.
# The batch file can be found in:
#   /cluster/store5/genbank/work/initial.hg16/align
    ssh hgwdev
    cd /cluster/store5/genbank
    nice bin/gbDbLoadStep -verbose=1 hg16

    nice bin/gbAlignStep -srcDb=genbank -type=est -verbose=1 -initial hg16

# GC PERCENT (DONE 2003-07-31 - Hiram)
    ssh hgwdev
    mkdir -p /cluster/data/hg16/bed/gcPercent
    cd /cluster/data/hg16/bed/gcPercent
    hgsql hg16 < ~/kent/src/hg/lib/gcPercent.sql
    hgGcPercent hg16 ../../nib

# MAKE HGCENTRALTEST BLATSERVERS ENTRY (DONE - 2003-07-31 - Hiram)
    ssh hgwdev
# Substitute BBB with the correct number for the hostname:
    echo 'insert into blatServers values("hg16", "blat6", "17778", "1"); \
        insert into blatServers values("hg16", "blat6", "17779", "0");' \
        | hgsql -h genome-testdb hgcentraltest

# PRODUCING GENSCAN PREDICTIONS (DONE - 2003-08-01 - Hiram)
    ssh eieio
    mkdir -p /cluster/data/hg16/bed/genscan
    cd /cluster/data/hg16/bed/genscan
# Make 3 subdirectories for genscan to put its output files in
    mkdir -p gtf pep subopt
# Generate a list file, genome.list, of all the contigs
# *that do not have pure Ns* (due to heterochromatin, unsequencable
# stuff) which would cause genscan to run forever.
    rm -f genome.list
    touch genome.list
    foreach f ( `ls -1S /cluster/store4/gs.17/build34/?{,?}/N{T,G}_*/N{T,G}_??????.fa.masked` )
        egrep '[ACGT]' $f > /dev/null
        if ($status == 0) echo $f >> genome.list
    end
# Log into kkr1u00 (not kk!).  kkr1u00 is the driver node for the small
# cluster (kkr2u00-kkr8u00).  Genscan has problems running on the
# big cluster, due to limitation of memory and swap space on each
# processing node.
    ssh kkr1u00
    cd /cluster/data/hg16/bed/genscan
# Create template file, gsub, for gensub2.  For example (3-line file):
    cat << '_EOF_' > gsub
#LOOP
/cluster/home/hiram/bin/i386/gsBig {check in line+ $(path1)} {check out line gtf/$(root1).gtf} -trans={check out line pep/$(root1).pep} -subopt={check out line subopt/$(root1).bed} -exe=/cluster/home/fanhsu/projects/compbio/bin/genscan-linux/genscan -par=/cluster/home/fanhsu/projects/compbio/bin/genscan-linux/HumanIso.smat -tmp=/tmp -window=2400000
#ENDLOOP
'_EOF_'
    # << this line makes emacs coloring happy
    gensub2 genome.list single gsub jobList
    para create jobList
    para try
    para check
    para push
# Completed: 491 of 491 jobs (this was with only 6 CPUs available)
# CPU time in finished jobs:     216220s    3603.67m    60.06h    2.50d  0.007 y
# IO & Wait Time:                 85597s    1426.62m    23.78h    0.99d  0.003 y
# Average job time:                 615s      10.24m     0.17h    0.01d
# Longest job:                    10986s     183.10m     3.05h    0.13d
# Submission to last job:         54395s     906.58m    15.11h    0.63d
# Issue either one of the following two commands to check the
# status of the cluster and your jobs, until they are done.
    parasol status
    para check
# If there were out-of-memory problems (run "para problems"), then
# re-run those jobs by hand but change the -window arg from 2400000
# to 1200000.  In build33, this was 22/NT_011519.
# In build34 there were NO failures!
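# For reference, a by-hand re-run of one out-of-memory contig would look
# like this (NT_011519 is the build33 example mentioned above, used here
# purely for illustration; all other flags are copied from the gsub
# template, with only -window reduced):
    /cluster/home/hiram/bin/i386/gsBig \
        /cluster/store4/gs.17/build34/22/NT_011519/NT_011519.fa.masked \
        gtf/NT_011519.gtf -trans=pep/NT_011519.pep \
        -subopt=subopt/NT_011519.bed \
        -exe=/cluster/home/fanhsu/projects/compbio/bin/genscan-linux/genscan \
        -par=/cluster/home/fanhsu/projects/compbio/bin/genscan-linux/HumanIso.smat \
        -tmp=/tmp -window=1200000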
# Convert these to chromosome level files as so:
    ssh eieio
    cd /cluster/data/hg16/bed/genscan
    liftUp genscan.gtf ../../jkStuff/liftAll.lft warn gtf/N{T,G}*.gtf
    liftUp genscanSubopt.bed ../../jkStuff/liftAll.lft warn subopt/N{T,G}*.bed
    cat pep/*.pep > genscan.pep

# Load into the database as so:
    ssh hgwdev
    cd /cluster/data/hg16/bed/genscan
    ldHgGene hg16 genscan genscan.gtf
# Reading genscan.gtf
# Read 42974 transcripts in 326300 lines in 1 files
#   42974 groups 41 seqs 1 sources 1 feature types
# 42974 gene predictions
    hgPepPred hg16 generic genscanPep genscan.pep
# Processing genscan.pep
    hgLoadBed hg16 genscanSubopt genscanSubopt.bed
# stringTab = 0
# Reading genscanSubopt.bed
# Loaded 518038 elements
# Sorted
# Creating table definition for
# Saving bed.tab
# Loading hg16

# CPGISLANDS (DONE - 2003-08-01 - Hiram)
    ssh eieio
    mkdir -p /cluster/data/hg16/bed/cpgIsland
    cd /cluster/data/hg16/bed/cpgIsland
# Copy program as built for previous hg build:
    mkdir cpg_dist
    cp -p ~/hg15/bed/cpgIsland/cpg_dist/cpglh.exe ./cpg_dist
# This step used to read as below, but I do not immediately see the .tar
# file anywhere (there is a copy in ~/rn3/bed/cpgIsland):
# Build software emailed from Asif Chinwalla (achinwal@watson.wustl.edu)
#   copy the tar file to the current directory
#   tar xvf cpg_dist.tar
#   cd cpg_dist
#   gcc readseq.c cpg_lh.c -o cpglh.exe
#   cd ..
# cpglh.exe requires hard-masked (N) .fa's.
# There may be warnings about "bad character" for IUPAC ambiguous
# characters like R, S, etc.  Ignore the warnings.
    foreach f (../../?{,?}/chr?{,?}{,_random}.fa.masked)
        set fout=$f:t:r:r.cpg
        echo producing $fout...
        ./cpg_dist/cpglh.exe $f > $fout
    end

    cat << '_EOF_' > filter.awk
/* chr1\t1325\t3865\t754\tCpG: 183\t64.9\t0.7 */
/* Transforms to: (tab separated columns above, spaces below) */
/* chr1 1325 3865 CpG: 183 754 183 489 64.9 0.7 */
{
width = $3-$2;
printf("%s\t%s\t%s\t%s %s\t%s\t%s\t%0.0f\t%0.1f\t%s\n",
    $1,$2,$3,$5,$6,width,$6,width*$7*0.01,100.0*2*$6/($3-$2),$7);
}
'_EOF_'
    # << this line makes emacs coloring happy
    awk -f filter.awk chr*.cpg > cpgIsland.bed

    ssh hgwdev
    cd /cluster/data/hg16/bed/cpgIsland
    hgLoadBed hg16 cpgIsland -tab -noBin \
        -sqlTable=$HOME/kent/src/hg/lib/cpgIsland.sql cpgIsland.bed
# stringTab = 1
# Reading cpgIsland.bed
# Loaded 27596 elements
# Sorted
# Saving bed.tab
# Loading hg16

# VERIFY REPEATMASKER RESULTS (DONE - 2003-08-01 - Hiram)
# Run featureBits on hg16 and on a comparable genome build, and compare:
    ssh hgwdev
    featureBits hg16 rmsk
# --> 1388770568 bases of 2865697954 (48.462%) in intersection
# --> 1388044886 bases of 2865697954 (48.437%) in intersection
# --> 1388157103 bases of 2863665240 (48.475%) in intersection
    featureBits hg15 rmsk
# --> 1386879340 bases of 2866466359 (48.383%) in intersection
    featureBits hg13 rmsk
# --> 1383216615 bases of 2860907679 (48.349%) in intersection

# PREPARE CLUSTER FOR BLASTZ RUN (DONE - 2003-08-05 - Hiram)
    ssh eieio
# This is where kkstore /scratch is kept:
    cd /cluster/bluearc/scratch/hg/gs.17/build34/rmsk
# The following will mark each line for rat and mouse.
# Rat first will be column 1, Mouse second will be column 2.
    foreach outfl ( *.out )
        echo "$outfl"
        /cluster/bluearc/RepeatMasker030619/DateRepsinRMoutput.pl \
            ${outfl} -query human -comp rat -comp mouse
    end

# Now extract each one, 1 = Rat, 2 = Mouse
    cd /cluster/bluearc/scratch/hg/gs.17/build34
    mkdir linSpecRep.notInRat
    mkdir linSpecRep.notInMouse
    foreach f (rmsk/*.out_rat_mus)
        set base = $f:t:r:r
        echo $base.out.spec
        /cluster/bin/scripts/extractLinSpecReps 1 $f > \
            linSpecRep.notInRat/$base.out.spec
        /cluster/bin/scripts/extractLinSpecReps 2 $f > \
            linSpecRep.notInMouse/$base.out.spec
    end
# That produced no difference at all between those two targets.
# Have requested confirmation from Arian.

# BLASTZ MOUSE (DONE - 2003-08-07 - Hiram)
    ssh eieio
    cd /cluster/bluearc/mm3.RM030619
    foreach f (rmsk.spec/*.out_rat_hum)
        set base = $f:t:r:r
        echo $base.out.spec
        /cluster/bin/scripts/extractLinSpecReps 2 $f > \
            linSpecRep.notInHuman/$base.out.spec
    end

    ssh eieio
    mkdir -p /cluster/data/hg16/bed/blastz.mm3
    cd /cluster/data/hg16/bed/blastz.mm3
    cat << '_EOF_' > DEF
# mouse vs. human
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/home/angie/schwartzbin:/cluster/home/kent/bin/i386

ALIGN=blastz-run
BLASTZ=blastz
BLASTZ_H=2000
BLASTZ_ABRIDGE_REPEATS=1

# TARGET
# Human
SEQ1_DIR=/iscratch/i/gs.17/build34/bothMaskedNibs
# not used
SEQ1_RMSK=
# not used
SEQ1_FLAG=
SEQ1_SMSK=/iscratch/i/gs.17/build34/linSpecRep.notInMouse
SEQ1_IN_CONTIGS=0
SEQ1_CHUNK=10000000
SEQ1_LAP=10000

# QUERY
# Mouse
SEQ2_DIR=/iscratch/i/mm3.RM030619/mixedNib/
# not currently used
SEQ2_RMSK=
# not currently used
SEQ2_FLAG=
SEQ2_SMSK=/cluster/bluearc/mm3.RM030619/linSpecRep.notInHuman/
SEQ2_IN_CONTIGS=0
SEQ2_CHUNK=30000000
SEQ2_LAP=0

BASE=/cluster/store4/gs.17/build34/bed/blastz.mm3

DEF=$BASE/DEF
RAW=$BASE/raw
CDBDIR=$BASE
SEQ1_LEN=$BASE/S1.len
SEQ2_LEN=$BASE/S2.len
'_EOF_'
    # << this line makes emacs coloring happy

# Save the DEF file in the current standard place
    DS=`date -I`
    cp DEF ~angie/hummus/DEF.mm3-hg16.$DS

    ssh kk
    cd /cluster/data/hg16/bed/blastz.mm3
# source the DEF file to establish environment for following commands
    bash
    . ./DEF
# follow the next set of directions slavishly
    mkdir -p $BASE/run
# give up on avoiding angie's directories
# tcl script
# creates xdir.sh and joblist run/j
    ~angie/hummus/make-joblist $DEF > $BASE/run/j
# xdir.sh makes a bunch of result directories in $BASE/raw/
# based on chrom name and CHUNK size
    sh $BASE/xdir.sh
    cd $BASE/run
# now edit j to prefix path to executable name
# NOTE: we should have a controlled version of schwartz bin executables
    sed -e 's#^#/cluster/bin/penn/#' j > j2
    wc -l j*
    head j2
# make sure the j2 edits are OK, then use it:
    mv j2 j
# para create will create the file: 'batch' for the cluster run
    para create j
# 39663 jobs
    para try
    para check
    para push
# ... etc ...
# With competition on the cluster:
# Completed: 39663 of 39663 jobs
# CPU time in finished jobs:   14365996s  239433.27m  3990.55h  166.27d  0.456 y
# IO & Wait Time:                681029s   11350.48m   189.17h    7.88d  0.022 y
# Average job time:                 379s       6.32m     0.11h    0.00d
# Longest job:                     9275s     154.58m     2.58h    0.11d
# Submission to last job:         53023s     883.72m    14.73h    0.61d

# post-process blastz
    ssh eieio
    cd /cluster/data/hg16/bed/blastz.mm3
# source the DEF file again in case you are coming back to this
# (must be bash shell)
    . ./DEF
# a new run directory
    mkdir -p run.1
    mkdir -p $BASE/lav
# create a new job list to convert out files to lav
    /cluster/bin/scripts/blastz-make-out2lav $DEF $BASE \
        > run.1/jobList
    cd run.1
# make sure the job list is OK
    wc -l jobList
# 312 jobs
    head jobList
# run on cluster
    ssh kk
    cd /cluster/data/hg16/bed/blastz.mm3/run.1
    para create jobList
    para try
    para check
    para push
# etc.
# Completed: 339 of 339 jobs
# CPU time in finished jobs:      11666s     194.44m     3.24h    0.14d  0.000 y
# IO & Wait Time:                 69155s    1152.58m    19.21h    0.80d  0.002 y
# Average job time:                 238s       3.97m     0.07h    0.00d
# Longest job:                     1332s      22.20m     0.37h    0.02d
# Submission to last job:          1497s      24.95m     0.42h    0.02d

# convert lav files to axt
    ssh kk
    cd /cluster/data/hg16/bed/blastz.mm3
    mkdir axtChrom
# a new run directory
    mkdir run.2
    cd run.2
# create template file for gensub2
# usage: blastz-chromlav2axt lav-dir axt-file seq1-dir seq2-dir
    cat << '_EOF_' > gsub
#LOOP
/cluster/bin/scripts/blastz-chromlav2axt /cluster/store4/gs.17/build34/bed/blastz.mm3/lav/$(root1) {check out line+ /cluster/store4/gs.17/build34/bed/blastz.mm3/axtChrom/$(root1).axt} /iscratch/i/gs.17/build34/bothMaskedNibs /iscratch/i/mm3.RM030619/mixedNib/
#ENDLOOP
'_EOF_'
    # << this line makes emacs coloring happy
    ls -1S /cluster/store4/gs.17/build34/bed/blastz.mm3/lav > chrom.list
    gensub2 chrom.list single gsub jobList
    wc -l jobList
# 42 jobs
    head jobList
    cd /cluster/data/hg16/bed/blastz.mm3/run.2
    para create jobList
    para try
    para check
    para push
# The two crashed jobs are chr19 and chr19_random.  The chr19_random
# .fa file is almost all masked sequence, so the resulting .axt file
# is empty.  The chr19 job is simply too big.
# Completed: 40 of 42 jobs
# Crashed: 2 jobs
# CPU time in finished jobs:       1908s      31.80m     0.53h    0.02d  0.000 y
# IO & Wait Time:                 22178s     369.64m     6.16h    0.26d  0.001 y
# Average job time:                 602s      10.04m     0.17h    0.01d
# Longest job:                     1723s      28.72m     0.48h    0.02d
# Submission to last job:          1802s      30.03m     0.50h    0.02d

# To fix up the chr19 axtSort problem:
# sometimes alignments are so huge that they cause axtSort to run out
# of memory.  Run them in two passes like this:
    ssh kkr1u00
    cd /cluster/data/hg16/bed/blastz.mm3
    set base=/cluster/data/hg16/bed/blastz.mm3
    set seq1_dir=/iscratch/i/gs.17/build34/bothMaskedNibs
    set seq2_dir=/iscratch/i/mm3.RM030619/mixedNib/
    foreach c (lav/chr19)
        pushd $c
        set chr=$c:t
        set out=axtChrom/$chr.axt
        echo "Translating $chr lav to $out"
        foreach d (*.lav)
            set smallout=$d.axt
            lavToAxt $d $seq1_dir $seq2_dir stdout \
                | axtDropSelf stdin stdout \
                | axtSort stdin $smallout
        end
        cat `ls -1 *.lav.axt | sort -g` > $base/$out
        popd
    end
# Remove the empty axtChrom/chr19_random.axt file to avoid future
# processing errors.

# translate sorted axt files into psl
    ssh eieio
    cd /cluster/data/hg16/bed/blastz.mm3
    mkdir -p pslChrom
    set tbl = "blastzMm3"
    foreach f (axtChrom/chr*.axt)
        set c=$f:t:r
        echo "Processing chr $c"
        /cluster/bin/i386/axtToPsl $f S1.len S2.len pslChrom/${c}_${tbl}.psl
    end
# That takes about 20 minutes.
# chr19 came along later:
    ssh kkr1u00
    set tbl = "blastzMm3"
    foreach f (axtChrom/chr19.axt)
        set c=$f:t:r
        echo "Processing chr $c"
        /cluster/bin/i386/axtToPsl $f S1.len S2.len pslChrom/${c}_${tbl}.psl
    end

# Load database tables
    ssh hgwdev
    set tbl = "blastzMm3"
    cd /cluster/data/hg16/bed/blastz.mm3/pslChrom
    /cluster/bin/i386/hgLoadPsl -noTNameIx hg16 chr*_${tbl}.psl
# This takes 30 minutes to an hour.
# and later chr19:
    /cluster/bin/i386/hgLoadPsl -noTNameIx hg16 chr19_${tbl}.psl

# create trackDb/human/hg16 and get a trackDb.ra file started with:
# track blastzMm3
# shortLabel Mouse Blastz
# longLabel Blastz All Mouse (Feb. 03) Alignments
# group compGeno
# priority 130
# visibility hide
# color 100,50,0
# altColor 255,240,200
# spectrum on
# type psl xeno mm3
# otherDb mm3
# remake trackDb tables

# redo chr1 (featureBits shows 7% lower alignments than hg15)
# (DONE 2003-09-09 kate)
# blastz run ended prematurely -- .tmp files leftover, not moved to .out's
    ssh kk
    cd /cluster/data/hg16/bed/blastz.mm3
    bash
    . ./DEF
    cd $BASE
    mkdir run.chr1
# create job list for human chr1, with parasol output file validation
    ~angie/hummus/make-joblist $DEF | \
        /cluster/bin/scripts/blastz-clusterjob.pl $BASE | \
        grep 'run chr1.nib' | \
        sed -e 's#^#/cluster/bin/penn/#' \
        > $BASE/run.chr1/spec
    grep 'chr1/' $BASE/xdir.sh > $BASE/xdir.chr1.sh
    mv raw/chr1 raw/chr1.old
    mkdir raw/chr1
    sh xdir.chr1.sh
    cd run.chr1
    para create spec
# 2925 jobs
    para try
    para check
    para push
# ... etc ...

    ssh eieio
    bash
    cd /cluster/data/hg16/bed/blastz.mm3
    . DEF
    mv lav/chr1 lav/chr1.old
    mkdir run.chr1.lav
    /cluster/bin/scripts/blastz-make-out2lav $DEF $BASE \
        | grep 'lav chr1 ' > run.chr1.lav/jobList
    cd run.chr1.lav
    wc -l jobList
# 25 jobs
    head jobList
# run on cluster
    ssh kk
    cd /cluster/data/hg16/bed/blastz.mm3/run.chr1.lav
    para create jobList
    para try
    para check
    para push
# etc.

# convert lav files to chrom axt
    /cluster/bin/scripts/blastz-chromlav2axt \
        /cluster/data/hg16/bed/blastz.mm3/lav/chr1 \
        /cluster/data/hg16/bed/blastz.mm3/axtChrom/chr1.axt \
        /cluster/data/hg16/nib /cluster/data/mm3.RM030619/mixedNib

# translate sorted axt files into psl
    ssh eieio
    cd /cluster/data/hg16/bed/blastz.mm3
    mv pslChrom/chr1_blastzMm3.psl pslChrom/chr1_blastzMm3.psl.old
    /cluster/bin/i386/axtToPsl axtChrom/chr1.axt S1.len S2.len \
        pslChrom/chr1_blastzMm3.psl

# reload database table
    hgsql hg16 -e "drop table chr1_blastzMm3"
    ssh hgwdev
    cd /cluster/data/hg16/bed/blastz.mm3/pslChrom
    /cluster/bin/i386/hgLoadPsl -noTNameIx hg16 chr1_blastzMm3.psl

# make chain
    cd /cluster/data/hg16/bed/blastz.mm3/axtChain/run1
    mv chain/chr1.chain chain/chr1.chain.old
    mv out/chr1.out out/chr1.out.old
    axtFilter -notQ=chrUn_random /cluster/data/hg16/bed/blastz.mm3/axtChrom/chr1.axt \
        | axtChain stdin \
        /cluster/data/hg16/nib \
        /cluster/data/mm3/mixedNib chain/chr1.chain > out/chr1.out

# sort chains
    ssh eieio
    cd /cluster/data/hg16/bed/blastz.mm3/axtChain
    mv all.chain all.chain.old
    chainMergeSort run1/chain/*.chain > all.chain
    mv chain chain.old
    mkdir chain
    chainSplit chain all.chain

# reload chr1 chain into database
    ssh hgwdev
    cd /cluster/data/hg16/bed/blastz.mm3/axtChain/chain
    hgLoadChain hg16 chr1_chainMm3 chr1.chain
# Loading 510456 chains into hg16.chr1_chainMm3

# make net
    ssh eieio
    cd /cluster/data/hg16/bed/blastz.mm3/axtChain
    cd chain
    /cluster/bin/i386/chainPreNet chr1.chain /cluster/data/hg16/chrom.sizes \
        /cluster/data/mm3/chrom.sizes ../preNet/chr1.chain
    cd ..
    cd preNet
    mv ../n1/chr1.net ../n1/chr1.net.old
    /cluster/bin/i386/chainNet chr1.chain -minSpace=1 \
        /cluster/data/hg16/chrom.sizes \
        /cluster/data/mm3/chrom.sizes ../n1/chr1.net /dev/null
    cd ..
    cp hNoClass.net hNoClass.net.old
    cat n1/*.net | /cluster/bin/i386/netSyntenic stdin hNoClass.net

    ssh hgwdev
    cd /cluster/data/hg16/bed/blastz.mm3/axtChain
    netClass hNoClass.net hg16 mm3 mouse.net \
        -tNewR=/cluster/bluearc/scratch/hg/gs.17/build34/linSpecRep.notInMouse \
        -qNewR=/cluster/bluearc/mm3.RM030619/linSpecRep.notInHuman

    ssh eieio
    cd /cluster/data/hg16/bed/blastz.mm3/axtChain
    # rm -r n1 hNoClass.net
# Make a 'syntenic' subset of these with netFilter:
    mv mouseSyn.net mouseSyn.net.old
    netFilter -syn mouse.net > mouseSyn.net

# Load the nets into database
    ssh hgwdev
    cd /cluster/data/hg16/bed/blastz.mm3/axtChain
    netFilter -minGap=10 mouse.net | hgLoadNet hg16 netMm3 stdin
    netFilter -minGap=10 mouseSyn.net | hgLoadNet hg16 syntenyNetMm3 stdin

# make tight subset of net
    ssh eieio
    cd /cluster/data/hg16/bed/blastz.mm3/axtChain
    mv ../axtNet/chr1.axt ../axtNet/chr1.old.axt
    netToAxt mouseNet/chr1.net chain/chr1.chain /cluster/data/hg16/nib \
        /cluster/data/mm3.RM030619/mixedNib ../axtNet/chr1.axt
    mv ../axtTight/chr1.axt ../axtTight/chr1.axt.old
    cd ../axtNet
    subsetAxt chr1.axt ../axtTight/chr1.axt \
        /cluster/data/subsetAxt/coding.mat 3400
# translate to psl
    cd ../axtTight
    axtToPsl chr1.axt ../S1.len ../S2.len ../pslTight/chr1_blastzTightMm3.psl
# Load table into database
    ssh hgwdev
    cd /cluster/data/hg16/bed/blastz.mm3/pslTight
    hgLoadPsl -noTNameIx hg16 chr1_blastzTightMm3.psl
# $ featureBits -chrom=chr1 hg16 chr1_blastzTightMm3.psl
# 14052627 bases of 221562941 (6.342%) in intersection
# hg15: 13990547 bases of 218713898 (6.397%) in intersection

# make axtNet300
    ssh eieio
    cd /cluster/data/hg16/bed/blastz.mm3/axtChain
    netSplit mouse.net mouseNet
    mv ../axtNet300/chr1.axt ../axtNet300/chr1.axt.old
    netToAxt -maxGap=300 mouseNet/chr1.net chain/chr1.chain \
        /cluster/data/hg16/nib /cluster/data/mm3.RM030619/mixedNib \
        ../axtNet300/chr1.axt

# create 2-way maf file for humor alignment
    set multizDir = /cluster/data/hg16/bed/humor.2003-09-02
    cd /cluster/data/hg16
    set mouseDir = bed/blastz.mm3/axtNet300
    axtSort $mouseDir/chr1.axt $mouseDir/chr1.axt.sorted
    mv $mouseDir/chr1.axt.sorted $mouseDir/chr1.axt
    axtToMaf $mouseDir/chr1.axt \
        /cluster/data/hg16/chrom.sizes /cluster/data/mm3/chrom.sizes \
        $multizDir/maf/chr1.mm3.maf.unfixed -tPrefix=hg16. -qPrefix=mm3.
    /cluster/bin/scripts/fixmaf.pl \
        < $multizDir/maf/chr1.mm3.maf.unfixed > $multizDir/maf/chr1.mm3.maf
    rm $multizDir/maf/chr1.mm3.maf.unfixed
# MAKE BLASTZ BEST MOUSE MM3 (DONE - 2003-08-26 - Hiram)
# IMPORTANT NOTE - This axtBest process has been replaced by the
# chain to net to axt process.  Note the procedure below continues
# after the chain and nets have been produced (see the CHAIN and NET
# MOUSE BLASTZ sections below).
# Consolidate AXT files to chrom level, sort, pick best, make psl.
    ssh eieio
    cd /cluster/data/hg16/bed/blastz.mm3/axtChrom
    mkdir -p /cluster/bluearc/hg16/bed/blastz.mm3/axtChrom
# copy chrom axt's to bluearc, to avoid hitting fileserver too hard
    cp -p *.axt /cluster/bluearc/hg16/bed/blastz.mm3/axtChrom
# chr19 came along later:
    cp -p chr19.axt /cluster/bluearc/hg16/bed/blastz.mm3/axtChrom

    ssh kk
    cd /cluster/data/hg16/bed/blastz.mm3
    mkdir -p axtBest pslBest
    mkdir run.3
    cd run.3
# create script to filter files
    cat << '_EOF_' > doBestAxt
#!/bin/csh -f
# usage: doBestAxt chr axt-file best-file psl-file
/cluster/bin/i386/axtBest $2 $1 $3 -minScore=300
sleep 1
/cluster/bin/i386/axtToPsl $3 /cluster/data/hg16/bed/blastz.mm3/S1.len \
    /cluster/data/hg16/bed/blastz.mm3/S2.len $4
'_EOF_'
    # << this line makes emacs coloring happy
    chmod +x doBestAxt
    cd ../axtChrom
    ls -1S | sed 's/.axt$//' > ../run.3/chrom.list
    cd ../run.3
# create template for cluster job
    cat << '_EOF_' > gsub
#LOOP
doBestAxt $(root1) {check in line+ /cluster/bluearc/hg16/bed/blastz.mm3/axtChrom/$(root1).axt} {check out line+ /cluster/data/hg16/bed/blastz.mm3/axtBest/$(root1).axt} {check out line+ /cluster/data/hg16/bed/blastz.mm3/pslBest/$(root1)_blastzBestMm3.psl}
#ENDLOOP
'_EOF_'
    # << this line makes emacs coloring happy
    gensub2 chrom.list single gsub jobList
    wc -l jobList
# 41 jobs
    head jobList
    ssh kk
    cd /cluster/data/hg16/bed/blastz.mm3
    cd run.3
    para create jobList
    para try
    para check
    para push
# With the chr19 situation, went back and reran this step.
# For some unknown reason the first time it had 9 failed jobs:
# Completed: 32 of 41 jobs
# Crashed: 9 jobs
# CPU time in finished jobs:        827s      13.78m     0.23h    0.01d  0.000 y
# IO & Wait Time:                  1299s      21.65m     0.36h    0.02d  0.000 y
# Average job time:                  66s       1.11m     0.02h    0.00d
# Longest job:                      361s       6.02m     0.10h    0.00d
# Submission to last job:          1195s      19.92m     0.33h    0.01d
# And then rerunning those 9 failed jobs, only chr19 failed:
# Completed: 8 of 9 jobs
# Crashed: 1 jobs
# CPU time in finished jobs:        748s      12.47m     0.21h    0.01d  0.000 y
# IO & Wait Time:                  2290s      38.16m     0.64h    0.03d  0.000 y
# Average job time:                 380s       6.33m     0.11h    0.00d
# Longest job:                     1247s      20.78m     0.35h    0.01d
# Submission to last job:          1261s      21.02m     0.35h    0.01d

# Better yet, Jim says to be consistent, do all the chroms in
# this manner:
    ssh eieio
    cd /cluster/data/hg16/bed/blastz.mm3/axtChain
    mkdir mouseNet
    netSplit mouse.net mouseNet
    foreach n (mouseNet/chr*.net)
        set c=$n:t:r
        echo "netToAxt: $c.net -> $c.axt"
        rm -f ../axtNet/$c.axt
        netToAxt mouseNet/$c.net chain/$c.chain \
            /cluster/bluearc/scratch/hg/gs.17/build34/bothMaskedNibs \
            /cluster/data/mm3.RM030619/mixedNib \
            ../axtNet/$c.axt
        echo "Complete: $c.net -> $c.axt"
    end

    mkdir -p /cluster/data/hg16/bed/blastz.mm3/axtBest
    cd /cluster/data/hg16/bed/blastz.mm3/axtBest
    ln -s ../axtNet/chr*.axt .

# copy net axt's to download area (DONE 2003-09-24 kate)
    cd /cluster/data/hg16/bed/blastz.mm3/axtNet
    gzip *.axt
    mkdir -p /usr/local/apache/htdocs/goldenPath/hg16/vsMm3/axtNet
    cp -p *.axt.gz /usr/local/apache/htdocs/goldenPath/hg16/vsMm3/axtNet
# add README.txt file to dir, if needed

# Convert those axt files to psl
    ssh eieio
    cd /cluster/data/hg16/bed/blastz.mm3
    mkdir pslBest
    foreach a (axtBest/chr*.axt)
        set c=$a:t:r
        echo "processing $c.axt -> ${c}_blastzBestMm3.psl"
        /cluster/bin/i386/axtToPsl axtBest/${c}.axt \
            S1.len S2.len pslBest/${c}_blastzBestMm3.psl
        echo "Done: ${c}_blastzBestMm3.psl"
    end

# Load tables
    ssh hgwdev
    set base="/cluster/data/hg16/bed/blastz.mm3"
    set tbl="blastzBestMm3"
    cd $base/pslBest
    /cluster/bin/i386/hgLoadPsl -noTNameIx hg16 chr*_${tbl}.psl

# check results
# the original axtBest stuff from the axtBest operation:
#   featureBits hg16 blastzBestMm3
#   1027438291 bases of 2865248791 (35.859%) in intersection
# After going through the chain->net->axt operation:
#   featureBits hg16 blastzBestMm3
#   991468768 bases of 2865248791 (34.603%) in intersection
# And finally after fixing a blastz execution problem on chr1:
#   1007362800 bases of 2865248791 (35.158%) in intersection
#   featureBits hg15 blastzBestMm3
#   1035090465 bases of 2866466359 (36.110%) in intersection

# Make /gbdb links and add them to the axtInfo table:
    mkdir -p /gbdb/hg16/axtBestMm3
    cd /gbdb/hg16/axtBestMm3
    foreach f (/cluster/data/hg16/bed/blastz.mm3/axtNet/chr*.axt)
        ln -s $f .
    end
    cd /cluster/data/hg16/bed/blastz.mm3/axtNet
    rm -f axtInfoInserts.sql
    touch axtInfoInserts.sql
    foreach f (/gbdb/hg16/axtBestMm3/chr*.axt)
        set chr=$f:t:r
        echo "INSERT INTO axtInfo (species, alignment, chrom, fileName) \
            VALUES ('mm3','Blastz Best in Genome','$chr','$f');" \
            >> axtInfoInserts.sql
    end
    hgsql hg16 < ~/kent/src/hg/lib/axtInfo.sql
    hgsql hg16 < axtInfoInserts.sql

# MAKING THE AXTTIGHT FROM AXTBEST (DONE - 2003-08-25 - Hiram)
# After creating axtBest alignments above, use subsetAxt to get axtTight:
    ssh eieio
    cd /cluster/data/hg16/bed/blastz.mm3/axtNet
    mkdir -p ../axtTight
    tcsh
    foreach i (*.axt)
        echo $i
        subsetAxt $i ../axtTight/$i \
            ~kent/src/hg/mouseStuff/subsetAxt/coding.mat 3400
    end
# translate to psl
    cd ../axtTight
    mkdir -p ../pslTight
    foreach i (*.axt)
        set c = $i:r
        axtToPsl $i ../S1.len ../S2.len ../pslTight/${c}_blastzTightMm3.psl
        echo "Done: $i"
    end
# Load tables into database
    ssh hgwdev
    cd /cluster/data/hg16/bed/blastz.mm3/pslTight
    hgLoadPsl -noTNameIx hg16 chr*_blastzTightMm3.psl

# copy axt's to download area (DONE 2003-09-24 kate)
    cd /cluster/data/hg16/bed/blastz.mm3/axtTight
    gzip *.axt
    mkdir -p /usr/local/apache/htdocs/goldenPath/hg16/vsMm3/axtTight
    cp -p *.axt.gz /usr/local/apache/htdocs/goldenPath/hg16/vsMm3/axtTight
# add README.txt file to dir, if needed

# CHAIN MOUSE BLASTZ (DONE 2003-08-28 - Hiram)
# Run axtChain on little cluster
    ssh kkr1u00
    mkdir -p /cluster/data/hg16/bed/blastz.mm3/axtChain/run1
    cd /cluster/data/hg16/bed/blastz.mm3/axtChain/run1
    mkdir out chain
    ls -1S /cluster/data/hg16/bed/blastz.mm3/axtChrom/*.axt > input.lst
    cat << '_EOF_' > gsub
#LOOP
doChain {check in exists $(path1)} {check out line+ chain/$(root1).chain} {check out line+ out/$(root1).out}
#ENDLOOP
'_EOF_'
    # << this line makes emacs coloring happy

    cat << '_EOF_' > doChain
#!/bin/csh
axtFilter -notQ=chrUn_random $1 | axtChain stdin \
    /iscratch/i/gs.17/build34/bothMaskedNibs \
    /iscratch/i/mm3.RM030619/mixedNib $2 > $3
'_EOF_'
    # << this line makes emacs coloring happy
    chmod a+x doChain
# 41 jobs
    gensub2 input.lst single gsub jobList
    para create jobList
    para try
    para push
# ... etc ...
# Completed: 41 of 41 jobs
# CPU time in finished jobs:      31379s     522.98m     8.72h    0.36d  0.001 y
# IO & Wait Time:                 10761s     179.35m     2.99h    0.12d  0.000 y
# Average job time:                1028s      17.13m     0.29h    0.01d
# Longest job:                    10327s     172.12m     2.87h    0.12d
# Submission to last job:         10327s     172.12m     2.87h    0.12d

# now on the cluster server, sort chains
    ssh eieio
    cd /cluster/data/hg16/bed/blastz.mm3/axtChain
    chainMergeSort run1/chain/*.chain > all.chain
    chainSplit chain all.chain
# these steps take ~20 minutes
# optionally: rm run1/chain/*.chain

# Load chains into database
# next machine
    ssh hgwdev
    cd /cluster/data/hg16/bed/blastz.mm3/axtChain/chain
    foreach i (*.chain)
        set c = $i:r
        hgLoadChain hg16 ${c}_chainMm3 $i
        echo done $c
    end

# NET MOUSE BLASTZ (DONE - 2003-08-22 - Hiram)
    ssh eieio
    cd /cluster/data/hg16/bed/blastz.mm3/axtChain
    mkdir preNet
    cd chain
    foreach i (*.chain)
        echo preNetting $i
        /cluster/bin/i386/chainPreNet $i /cluster/data/hg16/chrom.sizes \
            /cluster/data/mm3/chrom.sizes ../preNet/$i
    end
# This foreach loop will take about 15 min to execute.
    cd ..
    mkdir n1
    cd preNet
    foreach i (*.chain)
        set n = $i:r.net
        echo primary netting $i
        /cluster/bin/i386/chainNet $i -minSpace=1 /cluster/data/hg16/chrom.sizes \
            /cluster/data/mm3/chrom.sizes ../n1/$n /dev/null
    end
    cd ..
cat n1/*.net | /cluster/bin/i386/netSyntenic stdin hNoClass.net # memory usage 2490523648, utime 15421 s/100, stime 3665 ssh hgwdev cd /cluster/data/hg16/bed/blastz.mm3/axtChain ~/bin/i386/netClass hNoClass.net hg16 mm3 mouse.net \ -tNewR=/cluster/bluearc/scratch/hg/gs.17/build34/linSpecRep.notInMouse \ -qNewR=/cluster/bluearc/mm3.RM030619/linSpecRep.notInHuman # If things look good do ssh eieio cd /cluster/data/hg16/bed/blastz.mm3/axtChain rm -r n1 hNoClass.net # Make a 'syntenic' subset of these with netFilter -syn mouse.net > mouseSyn.net # Load the nets into database ssh hgwdev cd /cluster/data/hg16/bed/blastz.mm3/axtChain netFilter -minGap=10 mouse.net | hgLoadNet hg16 netMm3 stdin netFilter -minGap=10 mouseSyn.net | hgLoadNet hg16 syntenyNetMm3 stdin # Add entries for net and chain to human/hg16 trackDb # MAKE HUMAN-MOUSE MM3 OVER.CHAIN FOR LIFTOVER (2004-07-09 kate) ssh eieio set chainDir = /cluster/data/hg16/bed/blastz.mm3/axtChain gunzip *.gz ssh kolossus set chainDir = /cluster/data/hg16/bed/blastz.mm3/axtChain cd $chainDir mkdir subset cat > makeSubset.csh << 'EOF' set chainDir = /cluster/data/hg16/bed/blastz.mm3/axtChain foreach f ($chainDir/chain/*.chain.gz) set c = $f:t:r:r echo subsetting $c gunzip -c $f | netChainSubset $chainDir/mouseNet/$c.net stdin \ subset/$c.chain end 'EOF' # << for emacs csh makeSubset.csh >&! makeSubset.log & tail -100f makeSubset.log cat subset/*.chain > /cluster/data/hg16/bed/liftOver/hg16Tomm3.chain # test reciprocal best on chr6 for ENr233 ssh kkstore cd /cluster/data/hg16/bed/blastz.mm3/axtChain # renumber chains to assure unique ID's, # since netting splits some (should redo the liftOver chain with new ID's) # then sort by score for netter mkdir uniqueSubset chainMergeSort subset/chr6.chain > uniqueSubset/chr6.chain mkdir swappedSubset chainSwap uniqueSubset/chr6.chain swappedSubset/chr6.chain mkdir recipBestTest cd recipBestTest chainSort ../uniqueSubset/chr6.chain stdout | \ chainNet stdin \ /cluster/data/hg16/chrom.sizes /cluster/data/mm3/chrom.sizes\ human.chr6.net mouse.chr6.net netChainSubset mouse.chr6.net ../swappedSubset/chr6.chain stdout | \ chainSwap stdin chr6.chain ssh hgwdev cd /cluster/data/hg16/bed/blastz.mm3/axtChain/recipBestTest hgLoadChain hg16 rBestChainMm3 chr6.chain # didn't filter enuff -- perhaps didn't rechain in proper direction # BLASTZ RAT (DONE - 2003-08-07 - Hiram) ssh eieio mkdir -p /cluster/data/hg16/bed/blastz.rn3 cd /cluster/data/hg16/bed/blastz.rn3 cat << '_EOF_' > DEF # rat vs. human export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/home/angie/schwartzbin:/cluster/home/kent/bin/i386 ALIGN=blastz-run BLASTZ=blastz BLASTZ_H=2000 BLASTZ_ABRIDGE_REPEATS=1 # TARGET # Human SEQ1_DIR=/iscratch/i/gs.17/build34/bothMaskedNibs # not used SEQ1_RMSK= # not used SEQ1_FLAG= SEQ1_SMSK=/iscratch/i/gs.17/build34/linSpecRep.notInRat SEQ1_IN_CONTIGS=0 SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY # Rat SEQ2_DIR=/iscratch/i/rn3/bothMaskedNibs # not currently used SEQ2_RMSK= # not currently used SEQ2_FLAG= SEQ2_SMSK=/cluster/bluearc/rat/rn3/linSpecRep.notInHuman SEQ2_IN_CONTIGS=0 SEQ2_CHUNK=30000000 SEQ2_LAP=0 BASE=/cluster/store4/gs.17/build34/bed/blastz.rn3 DEF=$BASE/DEF RAW=$BASE/raw CDBDIR=$BASE SEQ1_LEN=$BASE/S1.len SEQ2_LEN=$BASE/S2.len '_EOF_' # << this line makes emacs coloring happy # Save the DEF file in the current standard place DS=`date -I` cp DEF ~angie/hummus/DEF.rn3-hg16.$DS ssh kk cd /cluster/data/hg16/bed/blastz.rn3 # source the DEF file to establish environment for following commands . 
./DEF # follow the next set of directions slavishly mkdir -p $BASE/run # give up on avoiding angie's directories # tcl script # creates xdir.sh and joblist run/j ~angie/hummus/make-joblist $DEF > $BASE/run/j # xdir.sh makes a bunch of result directories in $BASE/raw/ # based on chrom name and CHUNK size sh $BASE/xdir.sh cd $BASE/run # now edit j to prefix path to executable name # NOTE: we should have a controlled version of schwartz bin executables sed -e 's#^#/cluster/bin/penn/#' j > j2 wc -l j* head j2 # make sure the j2 edits are OK, then use it: mv j2 j # para create will create the file: 'batch' for the cluster run para create j # 39663 jobs para try para check para push # ... etc ... # Completed: 41697 of 41697 jobs # CPU time in finished jobs: 14155946s 235932.43m 3932.21h 163.84d 0.449 y # IO & Wait Time: 1005629s 16760.49m 279.34h 11.64d 0.032 y # Average job time: 364s 6.06m 0.10h 0.00d # Longest job: 4310s 71.83m 1.20h 0.05d # Submission to last job: 35086s 584.77m 9.75h 0.41d # post-process blastz ssh kk cd /cluster/data/hg16/bed/blastz.rn3 # source the DEF file again in case you are coming back to this # (must be bash shell) . ./DEF # a new run directory mkdir -p run.1 mkdir -p $BASE/lav # create a new job list to convert out files to lav /cluster/bin/scripts/blastz-make-out2lav $DEF $BASE \ > run.1/jobList cd run.1 # make sure the job list is OK wc -l jobList # 339 jobs head jobList # run on cluster ssh kk cd /cluster/data/hg16/bed/blastz.rn3/run.1 para create jobList para try para check para push # etc. # Completed: 339 of 339 jobs # CPU time in finished jobs: 6562s 109.37m 1.82h 0.08d 0.000 y # IO & Wait Time: 154475s 2574.58m 42.91h 1.79d 0.005 y # Average job time: 475s 7.92m 0.13h 0.01d # Longest job: 924s 15.40m 0.26h 0.01d # Submission to last job: 933s 15.55m 0.26h 0.01d # convert lav files to axt ssh kk cd /cluster/data/hg16/bed/blastz.rn3 mkdir axtChrom # a new run directory mkdir run.2 cd run.2 # create template file for gensub2 # usage: blastz-chromlav2axt lav-dir axt-file seq1-dir seq2-dir cat << '_EOF_' > gsub #LOOP /cluster/bin/scripts/blastz-chromlav2axt /cluster/store4/gs.17/build34/bed/blastz.rn3/lav/$(root1) {check out line+ /cluster/store4/gs.17/build34/bed/blastz.rn3/axtChrom/$(root1).axt} /iscratch/i/gs.17/build34/bothMaskedNibs /iscratch/i/rn3/bothMaskedNibs #ENDLOOP '_EOF_' # << this line makes emacs coloring happy ls -1S /cluster/store4/gs.17/build34/bed/blastz.rn3/lav > chrom.list gensub2 chrom.list single gsub jobList wc -l jobList # 42 jobs head jobList para create jobList para try para check para push # ... etc ... 
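# (sketch) when a batch reports a crash, parasol's standard 'para
# problems' subcommand lists each crashed job with its captured error
# output, which is how a failure like the one below can be diagnosed:
para problems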
# The crashed job is again chr19_random # Completed: 41 of 42 jobs # Crashed: 1 jobs # CPU time in finished jobs: 1507s 25.12m 0.42h 0.02d 0.000 y # IO & Wait Time: 17520s 292.00m 4.87h 0.20d 0.001 y # Average job time: 464s 7.73m 0.13h 0.01d # Longest job: 1214s 20.23m 0.34h 0.01d # Submission to last job: 1214s 20.23m 0.34h 0.01d # Remove the empty axtChrom/chr19_random.axt file to avoid future # processing errors # translate sorted axt files into psl ssh eieio cd /cluster/data/hg16/bed/blastz.rn3 mkdir -p pslChrom set tbl = "blastzRn3" foreach f (axtChrom/chr*.axt) set c=$f:t:r echo "Processing chr $c" /cluster/bin/i386/axtToPsl $f S1.len S2.len pslChrom/${c}_${tbl}.psl end # That takes about 20 minutes # Load database tables ssh hgwdev set tbl = "blastzRn3" cd /cluster/data/hg16/bed/blastz.rn3/pslChrom /cluster/bin/i386/hgLoadPsl -noTNameIx hg16 chr*_${tbl}.psl # This takes 30 minutes to an hour # New entry in human/hg16/trackDb.ra # track blastzRn3 # shortLabel Rat Blastz # longLabel Merged Blastz Rat (June 03) Alignments # group compGeno # priority 142 # visibility hide # color 100,50,0 # altColor 255,240,200 # spectrum on # type psl xeno rn3 # otherDb rn3 # MAKE BLASTZ BEST RAT RN3 (DONE - 2003-08-08 - Hiram - Redone 08-26) # IMPORTANT NOTE - this axtBest process has been replaced by # the chain -> net -> netToAxt process. So, after chains and # nets have been created, pick up this best process below. # Consolidate AXT files to chrom level, sort, pick best, make psl. ssh eieio cd /cluster/data/hg16/bed/blastz.rn3/axtChrom mkdir -p /cluster/bluearc/hg16/bed/blastz.rn3/axtChrom # copy chrom axt's to bluearc, to avoid hitting fileserver too hard cp -p *.axt /cluster/bluearc/hg16/bed/blastz.rn3/axtChrom ssh kk cd /cluster/data/hg16/bed/blastz.rn3 mkdir -p axtBest pslBest mkdir run.3 cd run.3 # create script to filter files cat << '_EOF_' > doBestAxt #!/bin/csh -f # usage: doBestAxt chr axt-file best-file psl-file /cluster/bin/i386/axtBest $2 $1 $3 -minScore=300 sleep 1 /cluster/bin/i386/axtToPsl $3 /cluster/data/hg16/bed/blastz.rn3/S1.len \ /cluster/data/hg16/bed/blastz.rn3/S2.len $4 '_EOF_' # << this line makes emacs coloring happy chmod +x doBestAxt cd ../axtChrom ls -1S | sed 's/.axt$//' > ../run.3/chrom.list cd ../run.3 # create template for cluster job cat << '_EOF_' > gsub #LOOP doBestAxt $(root1) {check in line+ /cluster/bluearc/hg16/bed/blastz.rn3/axtChrom/$(root1).axt} {check out line+ /cluster/data/hg16/bed/blastz.rn3/axtBest/$(root1).axt} {check out line+ /cluster/data/hg16/bed/blastz.rn3/pslBest/$(root1)_blastzBestRn3.psl} #ENDLOOP '_EOF_' # << this line makes emacs coloring happy gensub2 chrom.list single gsub jobList wc -l jobList # 41 jobs head jobList cd /cluster/data/hg16/bed/blastz.rn3 cd run.3 para create jobList para try para check para push # 106 minutes, almost all I/O time: # Completed: 41 of 41 jobs # CPU time in finished jobs: 2225s 37.09m 0.62h 0.03d 0.000 y # IO & Wait Time: 36349s 605.81m 10.10h 0.42d 0.001 y # Average job time: 941s 15.68m 0.26h 0.01d # Longest job: 6415s 106.92m 1.78h 0.07d # Submission to last job: 6417s 106.95m 1.78h 0.07d # Better yet, Jim says to be consistent, do all the chroms in # this manner: ssh eieio cd /cluster/data/hg16/bed/blastz.rn3/axtChain mkdir ratNet netSplit rat.net ratNet mkdir ../axtNet foreach n (ratNet/chr*.net) set c=$n:t:r echo "netToAxt: $c.net -> $c.axt" rm -f ../axtNet/$c.axt netToAxt -maxGap=25 ratNet/$c.net chain/$c.chain \ /cluster/bluearc/scratch/hg/gs.17/build34/bothMaskedNibs \ 
/cluster/bluearc/rat/rn3/softNib \ ../axtNet/$c.axt echo "Complete: $c.net -> $c.axt" end mkdir -p /cluster/data/hg16/bed/blastz.rn3/axtBest cd /cluster/data/hg16/bed/blastz.rn3/axtBest ln -s ../axtNet/chr*.axt . # copy net axt's to download area (DONE 2003-09-24 kate) ssh eieio cd /cluster/data/hg16/bed/blastz.rn3/axtNet gzip *.axt ssh hgwdev mkdir -p /usr/local/apache/htdocs/goldenPath/hg16/vsRn3/axtNet cd /cluster/data/hg16/bed/blastz.rn3/axtNet cp -p *.axt.gz /usr/local/apache/htdocs/goldenPath/hg16/vsRn3/axtNet # add README.txt file to dir, if needed # Convert those axt files to psl ssh eieio cd /cluster/data/hg16/bed/blastz.rn3 mkdir pslBest foreach a (axtBest/chr*.axt) set c=$a:t:r echo "processing $c.axt -> ${c}_blastzBestRn3.psl" /cluster/bin/i386/axtToPsl axtBest/${c}.axt \ S1.len S2.len pslBest/${c}_blastzBestRn3.psl echo "Done: ${c}_blastzBestRn3.psl" end # Load tables ssh hgwdev cd /cluster/data/hg16/bed/blastz.rn3/pslBest /cluster/bin/i386/hgLoadPsl -noTNameIx hg16 chr*_blastzBestRn3.psl # check results # Via the netToAxt process: # featureBits hg16 blastzBestRn3 # 976121391 bases of 2865248791 (34.068%) in intersection # With the original axtBest process, before the netToAxt process: # featureBits hg16 blastzBestRn3 # 1002119325 bases of 2865248791 (34.975%) in intersection # Hg15 results: # featureBits hg15 blastzBestRn3 # 992724355 bases of 2866466359 (34.632%) in intersection # Make /gbdb links and add them to the axtInfo table: mkdir -p /gbdb/hg16/axtBestRn3 cd /gbdb/hg16/axtBestRn3 ln -s /cluster/data/hg16/bed/blastz.rn3/axtNet/chr*.axt . cd /cluster/data/hg16/bed/blastz.rn3/axtNet rm -f axtInfoInserts.sql touch axtInfoInserts.sql foreach f (/gbdb/hg16/axtBestRn3/chr*.axt) set chr=$f:t:r echo "INSERT INTO axtInfo (species, alignment, chrom, fileName) \ VALUES ('rn3','Blastz Best in Genome','$chr','$f');" \ >> axtInfoInserts.sql end # Already done above. 
# This table needs definition only once
# hgsql hg16 < ~/kent/src/hg/lib/axtInfo.sql
hgsql hg16 < axtInfoInserts.sql
# MAKING RAT AXTTIGHT FROM AXTBEST (DONE - 2003-08-26 - Hiram)
# After creating axtBest alignments above, use subsetAxt to get axtTight:
ssh eieio
cd /cluster/data/hg16/bed/blastz.rn3/axtNet
mkdir -p ../axtTight
tcsh
foreach i (*.axt)
echo $i
subsetAxt $i ../axtTight/$i \
~kent/src/hg/mouseStuff/subsetAxt/coding.mat 3400
end
# translate to psl
cd ../axtTight
mkdir -p ../pslTight
foreach i (*.axt)
set c = $i:r
axtToPsl $i ../S1.len ../S2.len ../pslTight/${c}_blastzTightRn3.psl
echo "Done: $i"
end
# Load tables into database
ssh hgwdev
cd /cluster/data/hg16/bed/blastz.rn3/pslTight
hgLoadPsl -noTNameIx hg16 chr*_blastzTightRn3.psl
# copy axt's to download area (DONE 2003-09-24 kate)
ssh eieio
cd /cluster/data/hg16/bed/blastz.rn3/axtTight
gzip *.axt
ssh hgwdev
mkdir -p /usr/local/apache/htdocs/goldenPath/hg16/vsRn3/axtTight
cd /cluster/data/hg16/bed/blastz.rn3/axtTight
cp -p *.axt.gz /usr/local/apache/htdocs/goldenPath/hg16/vsRn3/axtTight
# add README.txt file to dir, if needed
# CHAIN RAT BLASTZ (DONE 2003-08-08 - Hiram)
# Run axtChain on little cluster
ssh kkr1u00
cd /cluster/data/hg16/bed/blastz.rn3
mkdir -p axtChain/run1
cd axtChain/run1
mkdir out chain
ls -1S /cluster/data/hg16/bed/blastz.rn3/axtChrom/*.axt > input.lst
cat << '_EOF_' > gsub
#LOOP
doChain {check in exists $(path1)} {check out line+ chain/$(root1).chain} {check out line+ out/$(root1).out}
#ENDLOOP
'_EOF_'
# << this line makes emacs coloring happy
# axtFilter -notQ=chrUn_random $1 | axtChain stdin
cat << '_EOF_' > doChain
#!/bin/sh
axtFilter $1 | axtChain stdin \
/iscratch/i/gs.17/build34/bothMaskedNibs \
/iscratch/i/rn3/bothMaskedNibs $2 > $3
'_EOF_'
# << this line makes emacs coloring happy
chmod a+x doChain
mkdir out chain
# 41 jobs
gensub2 input.lst single gsub jobList
para create jobList
para try
para push
# ... etc ...
# With only 6 CPUs available:
# Completed: 40 of 40 jobs
# CPU time in finished jobs: 21791s 363.19m 6.05h 0.25d 0.001 y
# IO & Wait Time: 12491s 208.18m 3.47h 0.14d 0.000 y
# Average job time: 857s 14.28m 0.24h 0.01d
# Longest job: 2724s 45.40m 0.76h 0.03d
# Submission to last job: 5875s 97.92m 1.63h 0.07d
# now on the cluster server, sort chains
ssh eieio
cd /cluster/data/hg16/bed/blastz.rn3/axtChain
/cluster/bin/i386/chainMergeSort run1/chain/*.chain > all.chain
/cluster/bin/i386/chainSplit chain all.chain
# these steps take ~20 minutes
# optionally: rm run1/chain/*.chain
# Load chains into database
# next machine
ssh hgwdev
cd /cluster/data/hg16/bed/blastz.rn3/axtChain/chain
foreach i (*.chain)
set c = $i:r
hgLoadChain hg16 ${c}_chainRn3 $i
echo done $c
end
# CREATE bigZips stuff for release (DONE 2003-08-01, 08-06, 08-08 - Hiram)
# make bigZips/mrna.zip (markd 8 aug 2003)
# on hgbeta:
cd /genbank
./bin/i386/gbGetSeqs -get=seq -db=hg16 -native genbank mrna download/hg16/bigZips/mrna.fa
zip download/hg16/bigZips/mrna.zip download/hg16/bigZips/mrna.fa
rm download/hg16/bigZips/mrna.fa
ssh hgwdev
# This has to work in a different way because the GenBank data
# updates on a daily basis.
- (DONE 2003-08-09 - Hiram) cd /usr/local/apache/htdocs/goldenPath/hg16/bigZips featureBits hg16 refGene:upstream:1000 -fa=upstream1000.fa zip upstream1000.zip upstream1000.fa rm upstream1000.fa featureBits hg16 refGene:upstream:2000 -fa=upstream2000.fa zip upstream2000.zip upstream2000.fa rm upstream2000.fa featureBits hg16 refGene:upstream:5000 -fa=upstream5000.fa zip upstream5000.zip upstream5000.fa rm upstream5000.fa # MAKING MOUSE AND RAT SYNTENY (MOUSE done 2003-09-16)(RAT Done 2003-08-28) ssh hgwdev mkdir -p /cluster/data/hg16/bed/syntenyMm3 cd /cluster/data/hg16/bed/syntenyMm3 # Copy all the needed scripts from /cluster/data/hg15/bed/syntenyMouse cp -p /cluster/data/hg15/bed/syntenyMouse/*.pl . ./syntenicBest.pl -db=hg16 -table=blastzBestMm3 ./smooth.pl ./joinsmallgaps.pl ./fillgap.pl -db=hg16 -table=blastzBestMm3 ./synteny2bed.pl hgLoadBed hg16 syntenyMm3 ucsc100k.bed # And for the Rat, same thing, different directory: mkdir ../syntenyRn3 cd ../syntenyRn3 ../syntenyMm3/syntenicBest.pl -db=hg16 -table=blastzBestRn3 # smooth.pl overwrites genomeBest2phase created by the previous # run of this above. Runs quickly. ../syntenyMm3/smooth.pl # joinsmallgaps.pl overwrites genomeBest3phase created above. Runs quickly. ../syntenyMm3/joinsmallgaps.pl # fillgap.pl creates genomeBestFinal ../syntenyMm3/fillgap.pl -db=hg16 -table=blastzBestRn3 # synteny2bed.pl creates ucsc100k.bed ../syntenyMm3/synteny2bed.pl hgLoadBed hg16 syntenyRn3 ucsc100k.bed # Loaded 1537 elements # NET RAT BLASTZ (WORKING - 2003-08-11 - Hiram) ssh eieio cd /cluster/data/hg16/bed/blastz.rn3/axtChain mkdir preNet cd chain foreach i (*.chain) echo preNetting $i /cluster/bin/i386/chainPreNet $i /cluster/data/hg16/chrom.sizes \ /cluster/data/rn3/chrom.sizes ../preNet/$i end # This foreach loop will take about 15 min to execute. cd .. mkdir n1 cd preNet foreach i (*.chain) set n = $i:r.net echo primary netting $i /cluster/bin/i386/chainNet $i -minSpace=1 /cluster/data/hg16/chrom.sizes \ /cluster/data/rn3/chrom.sizes ../n1/$n /dev/null end cd .. cat n1/*.net | /cluster/bin/i386/netSyntenic stdin hNoClass.net # memory usage 2511495168, utime 15658 s/100, stime 3383 # The netClass operations requires an "ancientRepeat" table to exist # in either hg16 or rn3. So, create the table: ssh hgwdev mkdir -p /cluster/data/hg16/bed/ancientRepeat cd /cluster/data/hg16/bed/ancientRepeat # mysqldump needs write permission to this directory # and you need to use your read/write enabled user with password chmod 777 . hgsqldump --all --tab=. hg15 ancientRepeat chmod 775 . hgsql hg16 < ancientRepeat.sql mysqlimport -u -p hg16 ancientRepeat.txt # This is a hand curated table obtained from Arian. # The mouse.net argument here should have been rat.net cd /cluster/data/hg16/bed/blastz.rn3/axtChain /cluster/bin/i386/netClass hNoClass.net hg16 rn3 mouse.net \ -tNewR=/cluster/bluearc/scratch/hg/gs.17/build34/linSpecRep.notInRat \ -qNewR=/cluster/bluearc/rat/rn3/linSpecRep.notInHuman # If things look good do ssh eieio cd /cluster/data/hg16/bed/blastz.rn3/axtChain rm -r n1 hNoClass.net # The arguments here should have been rat.net and ratSyn.net # Make a 'syntenic' subset of these with netFilter -syn mouse.net > mouseSyn.net # The mouse.net argument here should have been rat.net from the # netClass operation above. 
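# (optional sanity sketch) the net file is plain text; a quick count of
# fill lines confirms netClass produced a non-trivial net before loading:
grep -c fill mouse.net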
# Load the nets into database ssh hgwdev cd /cluster/data/hg16/bed/blastz.rn3/axtChain netFilter -minGap=10 mouse.net | hgLoadNet hg16 netRn3 stdin netFilter -minGap=10 mouseSyn.net | hgLoadNet hg16 syntenyNetRn3 stdin # Add entries for net and chain to human/hg16 trackDb # MAKE HUMAN-RAT OVER.CHAIN FOR LIFTOVER (2004-07-09 kate) ssh eieio set chainDir = /cluster/data/hg16/bed/blastz.rn3/axtChain cd $chainDir/ratNet gunzip *.gz ssh kolossus cd /cluster/data/hg16/bed/liftOver mkdir hg16Torn3 cd hg16Torn3 set chainDir = /cluster/data/hg16/bed/blastz.rn3/axtChain mkdir subset cat > makeSubset.csh << 'EOF' set chainDir = /cluster/data/hg16/bed/blastz.rn3/axtChain foreach f ($chainDir/chain/*.chain) set c = $f:t:r:r echo subsetting $c netChainSubset $chainDir/ratNet/$c.net $f subset/$c.chain end 'EOF' # << for emacs csh makeSubset.csh >&! makeSubset.log & tail -100f makeSubset.log cat subset/*.chain > /cluster/data/hg16/bed/liftOver/hg16Torn3.chain # Make Known Genes Track This task has many steps and currently it is described by two documents: 1. makeProteins072003.doc describes how the protein databases, biosql072003 and proteins072003, were built 2. makeKgHg16.doc describes how the Known Genes related database tables were built for hg16. makeKgHg16.doc could be merged with makeHg16.doc after minor editing of the format style. # LIFTING REPEATMASKER .ALIGN FILES # for this work, I had to delete some comments that were in the .align files. # The edited files were # NT_008046_01.fa.align (around line 10586) # NT_008046_75.fa.align (around line 3320) # The lines I deleted are: # # These elements can be clipped out with the options is_clip or is_only. # The latter does not run the 'normal' RepeatMasker routine and positions in the current # .out file will not correspond with the -is_only reconstructed sequence. # foreach d (?{,?}/NT_??????) set c=$d:t cd $d echo $c to $c.fa.align /cluster/bin/scripts/liftRMAlign.pl $c.lft > $c.fa.align cd ../.. end foreach chr (?{,?}) cd $chr echo making symbolic links for chr$chr NT .fa.align files foreach ctg (NT_??????) ln -s $ctg/$ctg.fa.align end cd .. 
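# lift the contig-level .align files up to chromosome level wherever
# this chrom has an ordered/random lift file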
if (-e $chr/lift/ordered.lft) then
echo making $chr/chr$chr.fa.align
/cluster/bin/scripts/liftRMAlign.pl $chr/lift/ordered.lft \
> $chr/chr$chr.fa.align
endif
if (-e $chr/lift/random.lft) then
echo making $chr/chr${chr}_random.fa.align
/cluster/bin/scripts/liftRMAlign.pl $chr/lift/random.lft \
> $chr/chr${chr}_random.fa.align
endif
echo removing symbolic links for chr$chr NT .fa.align files
rm $chr/NT_??????.fa.align
end
# TWINSCAN 1.3 GENE PREDICTIONS (2003-12-12 braney)
cd /cluster/data/hg16/bed
rm -fr twinscan
mkdir twinscan.2003-12-12
ln -s twinscan.2003-12-12 twinscan
cd twinscan
tarFile=Hs-NCBI34-TS13-pseudo-masked.tgz
wget http://genes.cs.wustl.edu/predictions/human/NCBI34_TS13_pseudo/Hs-NCBI34-TS13-pseudo-masked.tgz
wget http://genes.cs.wustl.edu/predictions/human/NCBI34_TS13_pseudo/md5sum.txt
# check file transferred correctly
md5sum $tarFile | diff - md5sum.txt
tar xvfz $tarFile
unset tarFile
# pare down protein FASTA header to id and add missing .a:
foreach c (1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 X Y)
echo chr$c
perl -wpe 's/^(\>\S+)\s.*$/$1.a/' < chr_ptx/chr$c.ptx > chr_ptx/chr$c-fixed.fa
end
ldHgGene hg16 twinscan chr_gtf/chr*.gtf -gtf -genePredExt
hgPepPred hg16 generic twinscanPep chr_ptx/chr*-fixed.fa
# RAW TWINSCAN 1.3 GENE PREDICTIONS, WITHOUT FILTERING OF PSEUDOGENES
# (2004-01-11 acs)
cd /cluster/data/hg16/bed
mkdir twinscan_raw.2004-01-11
ln -s twinscan_raw.2004-01-11 twinscan_raw
cd twinscan_raw
tarFile=NCBI34_Hs_TS13_11_11_03.tgz
wget http://genes.cs.wustl.edu/predictions/human/NCBI34_TS13/$tarFile
wget http://genes.cs.wustl.edu/predictions/human/NCBI34_TS13/md5sum.txt
# check file transferred correctly
md5sum $tarFile | diff - md5sum.txt
tar xvfz $tarFile
unset tarFile
# pare down protein FASTA header to id and add missing .a:
foreach c (1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 X Y)
echo chr$c
perl -wpe 's/^(\>\S+)\s.*$/$1.a/' < chr_ptx/chr$c.ptx > chr_ptx/chr$c-fixed.fa
end
ldHgGene hg16 twinscan_raw chr_gtf/chr*.gtf -gtf
hgPepPred hg16 generic twinscanrawPep chr_ptx/chr*-fixed.fa
# LOAD GENEID GENES (DONE - 2003-09-02 - Hiram RELOADED -gtf 2004-04-02 kate)
mkdir -p /cluster/data/hg16/bed/geneid/download
cd /cluster/data/hg16/bed/geneid/download
# Now download *.gtf and *.prot from
set dir = genome.imim.es/genepredictions/H.sapiens/golden_path_200307/geneid_v1.1/
foreach c (1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 X Y Un)
wget http://$dir/chr$c.gtf
wget http://$dir/chr${c}_random.gtf
wget http://$dir/chr$c.prot
wget http://$dir/chr${c}_random.prot
end
wget http://$dir/readme
# Add missing .1 to protein id's
foreach f (*.prot)
perl -wpe 's/^(>chr\w+)$/$1.1/' $f > $f:r-fixed.prot
echo "done $f"
end
cd ..
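# (optional sketch) spot-check that the downloads are complete before
# loading the table:
wc -l download/*.gtf | tail -1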
ldHgGene hg16 geneid download/*.gtf -gtf
# Read 32255 transcripts in 281180 lines in 40 files
# 32255 groups 40 seqs 1 sources 3 feature types
# 32255 gene predictions
hgPepPred hg16 generic geneidPep download/*-fixed.prot
# QA NOTE: [ASZ 2007-10-02] sudo mytouch hg16 geneidPep 200404021400.00
# HUMAN/MOUSE/RAT ALIGNMENT USING HUMOR (MULTIZ) (IN PROGRESS 2003-08-29 kate)
# Multiple alignment with Mm3, Rn3
ssh eieio
# make mouse axtNet300
cd /cluster/data/hg16/bed/blastz.mm3/axtChain/mouseNet
mkdir -p ../../axtNet300
foreach f (chr*.net)
set c = $f:r
echo "mouse axtNet300 on $c"
netToAxt -maxGap=300 $c.net ../chain/$c.chain /cluster/data/hg16/nib /cluster/data/mm3.RM030619/mixedNib ../../axtNet300/$c.axt
end
# make rat axtNet300
cd /cluster/data/hg16/bed/blastz.rn3/axtChain/ratNet
mkdir -p ../../axtNet300
foreach f (chr*.net)
set c = $f:r
echo "rat axtNet300 on $c"
netToAxt -maxGap=300 $c.net ../chain/$c.chain /cluster/data/hg16/nib /cluster/data/rn3/nib ../../axtNet300/$c.axt
end
# create 2-way maf files
#set multizDir = /cluster/data/hg16/bed/humor.2003-09-02
set multizDir = /cluster/data/hg16/bed/humor.2003-09-08
mkdir -p $multizDir/maf
cd /cluster/data/hg16
set mouseDir = bed/blastz.mm3/axtNet300
set ratDir = bed/blastz.rn3/axtNet300
foreach c (`cut -f 1 chrom.sizes`)
echo "making mouse mafs on $c"
# NOTE: this sort should probably be earlier in the pipeline
axtSort $mouseDir/$c.axt $mouseDir/$c.axt.sorted
mv $mouseDir/$c.axt.sorted $mouseDir/$c.axt
axtToMaf $mouseDir/$c.axt /cluster/data/hg16/chrom.sizes /cluster/data/mm3/chrom.sizes $multizDir/maf/$c.mm3.maf.unfixed -tPrefix=hg16. -qPrefix=mm3.
/cluster/bin/scripts/fixmaf.pl \
< $multizDir/maf/$c.mm3.maf.unfixed > $multizDir/maf/$c.mm3.maf
echo "making rat mafs on $c"
axtSort $ratDir/$c.axt $ratDir/$c.axt.sorted
mv $ratDir/$c.axt.sorted $ratDir/$c.axt
axtToMaf $ratDir/$c.axt /cluster/data/hg16/chrom.sizes /cluster/data/rn3/chrom.sizes $multizDir/maf/$c.rn3.maf.unfixed -tPrefix=hg16. -qPrefix=rn3.
/cluster/bin/scripts/fixmaf.pl \
< $multizDir/maf/$c.rn3.maf.unfixed > $multizDir/maf/$c.rn3.maf
rm $multizDir/maf/*.unfixed
end
# copy maf files to bluearc for cluster run
set clusterDir = /cluster/bluearc/hg16/bed
mkdir $clusterDir/blastz.mm3/mafNet300
cp $multizDir/maf/*.mm3.maf $clusterDir/blastz.mm3/mafNet300
mkdir /cluster/bluearc/hg16/bed/blastz.rn3/mafNet300
cp $multizDir/maf/*.rn3.maf $clusterDir/blastz.rn3/mafNet300
# create scripts to run on cluster
# run "humor"
cd $multizDir
mkdir hmr
mkdir run
cd run
cat << EOF > doHumor.kk
/cluster/bin/penn/humor.v4 $clusterDir/blastz.mm3/mafNet300/\$1.mm3.maf $clusterDir/blastz.rn3/mafNet300/\$1.rn3.maf > $multizDir/hmr/\$1.hmr.maf
EOF
chmod +x doHumor.kk
cat << EOF > gsub
#LOOP
doHumor.kk \$(root1) {check out line+ $multizDir/hmr/\$(root1).hmr.maf}
#ENDLOOP
EOF
cd $clusterDir/blastz.mm3/mafNet300
# NOTE: probably want a better way to make the chrom list
ls *.maf | awk -F. '{print $1}' > $multizDir/run/chrom.list
cd $multizDir/run
gensub2 chrom.list single gsub jobList
# run jobs
ssh kkr9u01
#set multizDir = /cluster/data/hg16/bed/humor.2003-09-02
set multizDir = /cluster/data/hg16/bed/humor.2003-09-08
cd $multizDir/run
para create jobList
para try
para check
para push
# longest job 27 minutes
# setup external files for database reference
ssh hgwdev
mkdir -p /gbdb/hg16/humorMm3Rn3
cd /gbdb/hg16/humorMm3Rn3
foreach f ($multizDir/hmr/*.maf)
ln -s $f .
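# hgLoadMaf (below) records these /gbdb paths in the table's extFile
# reference, so the links must stay in place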
end # load into database #cd $multizDir/hmr/*.maf /cluster/bin/i386/hgLoadMaf -warn hg16 humorMm3Rn3 # copy files to download area (2003-10-24 kate) set dir = /usr/local/apache/htdocs/goldenPath/hg16/humorMm3Rn3 mkdir -p $dir cp -p /gbdb/hg16/humorMm3Rn3/*.maf $dir cd $dir gzip * # edit downloads page to add linke to humorMm3Rn3 # add pairwise mafs to downloads page (2003-11-25 kate) set dir = /usr/local/apache/htdocs/goldenPath/hg16/humorMm3Rn3 mkdir $dir/{rn3,mm3} cd /cluster/data/hg16/bed/humor/maf cp *.mm3.maf $dir/mm3 cp *.rn3.maf $dir/rn3 gzip $dir/mm3/* gzip $dir/rn3/* # Create upstream files (kent) ssh hgwdev cd /usr/local/apache/htdocs/goldenPath/hg16/humorMm3Rn3 echo hg16 mm3 rn3 > org.txt foreach i (1000 2000 5000) featureBits hg16 refGene:upstream:$i -fa=/dev/null -bed=up.bad awk -F '\t' '{printf("%s\t%s\t%s\t%s\t%s\t%s\n", $1, $2, $3, substr($4, 0, 9), 0, $5)}' up.bad > up.bed rm up.bad mafFrags hg16 humorMm3Rn3 up.bed upstream$i.maf -orgs=org.txt rm up.bed end # MAKING BLASTZ SELF (DONE - 2003-08-08 - Hiram) # The procedure for lineage spec business with self is to simply # use the actual repeat masker output for this human assembly as # the lineage specific repeats for itself. Thus, merely make # symlinks to the repeat masker out files and name them as expected # for blastz. In this case they are called notInHuman but they # really mean InHuman. Yes, it is confusing, but that's just the # nature of the game in this case. ssh eieio mkdir -p /cluster/bluearc/scratch/hg/gs.17/build34/linSpecRep.notInHuman cd /cluster/bluearc/scratch/hg/gs.17/build34/linSpecRep.notInHuman foreach f (../rmsk/*.fa.out) set base = $f:t:r:r echo $base.out.spec ln -s $f $base.out.spec end ssh eieio mkdir -p /cluster/data/hg16/bed/blastzSelf cd /cluster/data/hg16/bed/blastzSelf cat << '_EOF_' > DEF # human vs. human export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/home/angie/schwartzbin:/cluster/home/kent/bin/i386 ALIGN=blastz-run BLASTZ=blastz BLASTZ_H=2000 BLASTZ_ABRIDGE_REPEATS=1 # TARGET # Human SEQ1_DIR=/iscratch/i/gs.17/build34/bothMaskedNibs # not used SEQ1_RMSK= # not used SEQ1_FLAG= SEQ1_SMSK=/cluster/bluearc/scratch/hg/gs.17/build34/linSpecRep.notInHuman SEQ1_IN_CONTIGS=0 SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY # Human SEQ2_DIR=/iscratch/i/gs.17/build34/bothMaskedNibs # not currently used SEQ2_RMSK= # not currently used SEQ2_FLAG= SEQ2_SMSK=/cluster/bluearc/scratch/hg/gs.17/build34/linSpecRep.notInHuman SEQ2_IN_CONTIGS=0 SEQ2_CHUNK=10000000 SEQ2_LAP=10000 BASE=/cluster/store4/gs.17/build34/bed/blastzSelf DEF=$BASE/DEF RAW=$BASE/raw CDBDIR=$BASE SEQ1_LEN=$BASE/S1.len SEQ2_LEN=$BASE/S2.len '_EOF_' # << this line makes emacs coloring happy # Save the DEF file in the current standard place DS=`date -I` cp DEF ~angie/hummus/DEF.hg16-hg16.$DS ssh kk cd /cluster/data/hg16/bed/blastzSelf # source the DEF file to establish environment for following commands . 
./DEF # follow the next set of directions slavishly mkdir -p $BASE/run # give up on avoiding angie's directories # tcl script # creates xdir.sh and joblist run/j ~angie/hummus/make-joblist $DEF > $BASE/run/j # xdir.sh makes a bunch of result directories in $BASE/raw/ # based on chrom name and CHUNK size sh $BASE/xdir.sh cd $BASE/run # now edit j to prefix path to executable name # NOTE: we should have a controlled version of schwartz bin executables sed -e 's#^#/cluster/bin/penn/#' j > j2 wc -l j* # 114921 j head j2 # make sure the j2 edits are OK, then use it: mv j2 j # para create will create the file: 'batch' for the cluster run para create j # 114921 jobs para try para check para push # ... etc ... # With some cluster difficulties, bluearc hangups, etc: Completed: 114921 of 114921 jobs CPU time in finished jobs: 19898031s 331633.85m 5527.23h 230.30d 0.631 y IO & Wait Time: 42606494s 710108.24m 11835.14h 493.13d 1.351 y Average job time: 544s 9.06m 0.15h 0.01d Longest job: 111877s 1864.62m 31.08h 1.29d Submission to last job: 344744s 5745.73m 95.76h 3.99d # post-process blastz ssh eieio cd /cluster/data/hg16/bed/blastzSelf # source the DEF file again in case you are coming back to this # (must be bash shell) . ./DEF # a new run directory mkdir -p run.1 mkdir -p $BASE/lav # create a new job list to convert out files to lav /cluster/bin/scripts/blastz-make-out2lav $DEF $BASE \ > run.1/jobList cd run.1 # make sure the job list is OK wc -l jobList # 339 jobs head jobList # run on cluster ssh kk cd /cluster/data/hg16/bed/blastzSelf/run.1 para create jobList para try para check para push # etc. #Completed: 339 of 339 jobs #CPU time in finished jobs: 21101s 351.68m 5.86h 0.24d 0.001 y #IO & Wait Time: 74915s 1248.58m 20.81h 0.87d 0.002 y #Average job time: 283s 4.72m 0.08h 0.00d #Longest job: 2028s 33.80m 0.56h 0.02d #Submission to last job: 2993s 49.88m 0.83h 0.03d # convert lav files to axt ssh kk cd /cluster/data/hg16/bed/blastzSelf mkdir axtChrom # a new run directory mkdir run.2 cd run.2 # create template file for gensub2 # usage: blastz-chromlav2axt lav-dir axt-file seq1-dir seq2-dir cat << '_EOF_' > gsub #LOOP /cluster/bin/scripts/blastz-chromlav2axt /cluster/store4/gs.17/build34/bed/blastzSelf/lav/$(root1) {check out line+ /cluster/store4/gs.17/build34/bed/blastzSelf/axtChrom/$(root1).axt} /iscratch/i/gs.17/build34/bothMaskedNibs /iscratch/i/gs.17/build34/bothMaskedNibs #ENDLOOP '_EOF_' # << this line makes emacs coloring happy ls -1S /cluster/store4/gs.17/build34/bed/blastzSelf/lav > chrom.list gensub2 chrom.list single gsub jobList wc -l jobList # 42 jobs head jobList cd /cluster/data/hg16/bed/blastzSelf/run.2 para create jobList para try para check para push # We have two crashed jobs here. The data for chr7 and chr19 is # too much for the processing. Have to run those separately on # the file server eieio. 
Completed: 40 of 42 jobs
Crashed: 2 jobs
CPU time in finished jobs: 4737s 78.95m 1.32h 0.05d 0.000 y
IO & Wait Time: 57154s 952.57m 15.88h 0.66d 0.002 y
Average job time: 1547s 25.79m 0.43h 0.02d
Longest job: 7969s 132.82m 2.21h 0.09d
Submission to last job: 8029s 133.82m 2.23h 0.09d
# Fixup chr7 and chr19 by running them in two passes like this:
ssh eieio
cd /cluster/data/hg16/bed/blastzSelf
set base=/cluster/data/hg16/bed/blastzSelf
set seq1_dir=/cluster/data/hg16/nib
set seq2_dir=/cluster/data/hg16/nib
foreach c (lav/chr19 lav/chr7)
pushd $c
set chr=$c:t
set out=axtChrom/$chr.axt
echo "Translating $chr lav to $out"
foreach d (*.lav)
set smallout=$d.axt
lavToAxt $d $seq1_dir $seq2_dir stdout \
| axtDropSelf stdin stdout \
| axtSort stdin $smallout
end
cat `ls -1 *.lav.axt | sort -g` > $base/$out
popd
end
# translate sorted axt files into psl
ssh eieio
cd /cluster/data/hg16/bed/blastzSelf
# Need to drop overlaps to eliminate diagonals
# axtDropOverlap seems to drop more than axtDropSelf above
foreach f (axtChrom/chr*.axt)
set c=$f:t:r
/cluster/bin/i386/axtDropOverlap axtChrom/$c.axt \
/cluster/bluearc/hg16/bed/blastzSelf/axtChromDropped/$c.axt
echo "Done: $c"
end
cd /cluster/bluearc/hg16/bed/blastzSelf/axtChromDropped
gzip *.axt
# Needed to deliver these right away: (REMOVED 2005-01-27)
ssh hgwdev
mkdir -p /usr/local/apache/htdocs/goldenPath/hg16/vsSelf
cd /usr/local/apache/htdocs/goldenPath/hg16/vsSelf
cp -p /cluster/bluearc/hg16/bed/blastzSelf/axtChromDropped/*.axt.gz .
ssh eieio
mkdir -p /cluster/data/hg16/bed/blastzSelf/pslChrom
cd /cluster/data/hg16/bed/blastzSelf
set tbl = "blastzSelf"
foreach f (axtChrom/chr*.axt)
set c=$f:t:r
echo "Processing chr $c"
zcat /cluster/bluearc/hg16/bed/blastzSelf/axtChromDropped/${c}.axt.gz | \
/cluster/bin/i386/axtToPsl stdin S1.len S2.len pslChrom/${c}_${tbl}.psl
end
# That takes about 20 minutes
# XXXX Pick this up tomorrow, 03-09-12 with pslChromDroppedFix
# Load database tables
ssh hgwdev
set tbl = "blastzSelf"
cd /cluster/data/hg16/bed/blastzSelf/pslChrom
/cluster/bin/i386/hgLoadPsl -noTNameIx hg16 chr*_blastzSelf.psl
# This takes 30 minutes to an hour
# create trackDb/human/hg16 and get a trackDb.ra file started with:
# remake trackDb tables
# PRODUCE FUGU BLAT ALIGNMENT (IN PROGRESS 2003-08-22 kate)
# Use masked scaffolds from fr1 assembly (same sequence as
# previous BlatFugu, however it's repeat and TRF-masked).
# NOTE: can't access /iscratch/i from fileserver
ssh kk
mkdir /cluster/data/hg16/bed/blatFr1
cd /cluster/data/hg16/bed/blatFr1
mkdir psl
# next time, use N?_?????? (to pick up NG_ contigs)
foreach f (/cluster/data/hg16/?{,?}/NT_??????/NT_??????.fa)
set c=$f:t:r
echo $c
mkdir -p psl/$c
end
# special case for NG_002432
mkdir -p psl/NG_002432
# create cluster job
cd run
ls -1S /iscratch/i/fugu/trfFa/*.fa > fugu.lst
ls -1S /scratch/hg/gs.17/build34/trfFa/*.fa > human.lst
cat << 'EOF' > gsub
#LOOP
/cluster/bin/i386/blat -mask=lower -qMask=lower -q=dnax -t=dnax {check in line+ $(path1)} {check in line+ $(path2)} {check out line+ /cluster/data/hg16/bed/blatFr1/psl/$(root1)/$(root1)_$(root2).psl}
#ENDLOOP
'EOF'
# << this line makes emacs coloring happy
gensub2 human.lst fugu.lst gsub spec
para create spec
# 283798 jobs
para try
para check
para push
para check
# cd psl
# count files with alignments
# find . -not -size 427c | wc -l
# 89878
# count files with no alignments
# find .
-size 427c | wc -l # 195265 # When cluster run is done, sort alignments # into chrom directory ssh eieio cd /cluster/data/hg16/bed/blatFr1 pslCat -dir psl/N?_?????? | \ liftUp -type=.psl stdout \ /cluster/data/hg16/jkStuff/liftAll.lft warn stdin | \ pslSortAcc nohead chrom temp stdin # 15 minutes ? # Processed 855648 lines into 4 temp files # Rename to correspond with tables and load into database: ssh hgwdev cd /cluster/data/hg16/bed/blatFr1/chrom rm -f chr*_blatFr1.psl foreach i (chr?{,?}{,_random}.psl) set r = $i:r echo $r mv $i ${r}_blatFr1.psl end # Next assembly, lift fugu scaffolds to Fugu browser chrUn, # so you can link to other browser. And don't need to load sequence # liftUp -pslQ all.psl /cluster/data/fr1/fugu_v3.masked.lft warn chrom/*.psl hgLoadPsl -noTNameIx hg16 *.psl # $ featureBits hg16 blatFr1 refGene:CDS # 12787423 bases of 2865248791 (0.446%) in intersection # $ featureBits hg15 blatFugu refGene:CDS # 12427544 bases of 2866466359 (0.434%) in intersection # Edit trackDb.ra to include blatFr1 # NOTE: already in top-level trackDb.ra # Make fugu /gbdb/ symlink and load Fugu sequence data. # NOTE: don't need to do this in next assembly mkdir /gbdb/hg16/fuguSeq cd /gbdb/hg16/fuguSeq ln -s /cluster/data/fr1/fugu_v3.masked.fa # hide .tab file cd /cluster/store2/tmp hgLoadSeq hg16 /gbdb/hg16/fuguSeq/fugu_v3.masked.fa # MAKE BLASTZ BEST SELF (RE-DONE - 2003-08-28 - Hiram) # Pick up on this process below after chain and nets have been # done. This run.3 business is obsolete # Consolidate AXT files to chrom level, sort, pick best, make psl. ssh eieio cd /cluster/data/hg16/bed/blastzSelf/axtChrom mkdir -p /cluster/bluearc/hg16/bed/blastzSelf/axtChrom # copy chrom axt's to bluearc, to avoid hitting fileserver too hard cp -p *.axt /cluster/bluearc/hg16/bed/blastzSelf/axtChrom ssh kk cd /cluster/data/hg16/bed/blastzSelf mkdir -p axtBest pslBest mkdir run.3 cd run.3 # create script to filter files cat << '_EOF_' > doBestAxt #!/bin/csh -f # usage: doBestAxt chr axt-file best-file psl-file /cluster/bin/i386/axtBest $2 $1 $3 -minScore=300 sleep 1 /cluster/bin/i386/axtToPsl $3 /cluster/data/hg16/bed/blastzSelf/S1.len \ /cluster/data/hg16/bed/blastzSelf/S2.len $4 '_EOF_' # << this line makes emacs coloring happy chmod +x doBestAxt cd ../axtChrom ls -1S | sed 's/.axt$//' > ../run.3/chrom.list cd ../run.3 # create template for cluster job cat << '_EOF_' > gsub #LOOP doBestAxt $(root1) {check in line+ /cluster/bluearc/hg16/bed/blastzSelf/axtChrom/$(root1).axt} {check out line+ /cluster/data/hg16/bed/blastzSelf/axtBest/$(root1).axt} {check out line+ /cluster/data/hg16/bed/blastzSelf/pslBest/$(root1)_blastzBestMm3.psl} #ENDLOOP '_EOF_' # << this line makes emacs coloring happy gensub2 chrom.list single gsub jobList wc -l jobList # 42 jobs head jobList ssh kkr1u00 cd /cluster/data/hg16/bed/blastzSelf/run.3 para create jobList para try para check para push Completed: 38 of 42 jobs Crashed: 4 jobs CPU time in finished jobs: 1884s 31.41m 0.52h 0.02d 0.000 y IO & Wait Time: 8421s 140.34m 2.34h 0.10d 0.000 y Average job time: 271s 4.52m 0.08h 0.00d Longest job: 2061s 34.35m 0.57h 0.02d Submission to last job: 2277s 37.95m 0.63h 0.03d # Some of these files are getting too big for this operation # We will have to get back to these via the chains, nets and a # netToAxt trick # Problems: /cluster/data/hg16/bed/blastzSelf/axtBest/chr19.axt is empty /cluster/data/hg16/bed/blastzSelf/pslBest/chr19_blastzBestMm3.psl is empty Out of memory - request size 1564 bytes 
/cluster/data/hg16/bed/blastzSelf/axtBest/chr7.axt is empty /cluster/data/hg16/bed/blastzSelf/pslBest/chr7_blastzBestMm3.psl is empty Out of memory - request size 634045604 bytes /cluster/data/hg16/bed/blastzSelf/axtBest/chr1.axt is empty /cluster/data/hg16/bed/blastzSelf/pslBest/chr1_blastzBestMm3.psl is empty ut of memory - request size 984185908 bytes /cluster/data/hg16/bed/blastzSelf/axtBest/chr2.axt is empty /cluster/data/hg16/bed/blastzSelf/pslBest/chr2_blastzBestMm3.psl is empty Out of memory - request size 973662824 bytes # Here is the replacement process for the above sequence # Better yet, Jim says to be consistent, do all the chroms in # this manner: ssh eieio cd /cluster/data/hg16/bed/blastzSelf/axtChain mkdir humanNet mkdir ../axtNet netSplit human.net humanNet foreach n (humanNet/chr*.net) set c=$n:t:r echo "netToAxt: $c.net -> $c.axt" rm -f ../axtNet/$c.axt netToAxt humanNet/$c.net chain/$c.chain \ /cluster/bluearc/scratch/hg/gs.17/build34/bothMaskedNibs \ /cluster/bluearc/scratch/hg/gs.17/build34/bothMaskedNibs \ ../axtNet/$c.axt echo "Complete: $c.net -> $c.axt" end mkdir -p /cluster/data/hg16/bed/blastzSelf/axtBest cd /cluster/data/hg16/bed/blastzSelf/axtBest ln -s ../axtNet/chr*.axt . # Convert those axt files to psl ssh eieio cd /cluster/data/hg16/bed/blastzSelf mkdir pslBest foreach a (axtBest/chr*.axt) set c=$a:t:r echo "processing $c.axt -> ${c}_blastzBestSelf.psl" /cluster/bin/i386/axtToPsl axtBest/${c}.axt \ S1.len S2.len pslBest/${c}_blastzBestSelf.psl echo "Done: ${c}_blastzBestSelf.psl" end # Load tables ssh hgwdev set base="/cluster/data/hg16/bed/blastzSelf" set tbl="blastzBestSelf" cd $base/pslBest /cluster/bin/i386/hgLoadPsl -noTNameIx hg16 chr*_${tbl}.psl # check results # After going through the chain->net->axt operation: # featureBits hg16 blastzBestSelf # 1388295977 bases of 2865248791 (48.453%) in intersection # Hg15 doesn't have a BestSelf, gave this a try with the following # result: # featureBits hg15 blastzSelf # Out of memory - request size 6 bytes # Make /gbdb links and add them to the axtInfo table: mkdir -p /gbdb/hg16/axtBestSelf cd /gbdb/hg16/axtBestSelf ln -s /cluster/data/hg16/bed/blastzSelf/axtNet/chr*.axt . cd /cluster/data/hg16/bed/blastzSelf/axtNet rm -f axtInfoInserts.sql touch axtInfoInserts.sql foreach f (/gbdb/hg16/axtBestSelf/chr*.axt) set chr=$f:t:r echo "INSERT INTO axtInfo (species, alignment, chrom, fileName) \ VALUES ('hg16','Blastz Best in Genome','$chr','$f');" \ >> axtInfoInserts.sql end # This table has already been created above # hgsql hg16 < ~/kent/src/hg/lib/axtInfo.sql hgsql hg16 < axtInfoInserts.sql # MAKE BLASTZ BEST SELF (NOT NECESSARY - NOT USEFUL - NOT NEEDED - NOT DONE) # MAKING CHAIN SELF BLASTZ (DONE - 2003-08-27 - Hiram) # MAKING CHAIN SELF BLASTZ (RE-DONE - 2003-09-04 - Hiram) # 2003-09-04 - with dropped overlap axtChrom # Run axtChain on little cluster ssh kkr1u00 mkdir -p /cluster/data/hg16/bed/blastzSelf/axtChain/run1 cd /cluster/data/hg16/bed/blastzSelf/axtChain/run1 mkdir out chain ls -1S /cluster/bluearc/hg16/bed/blastzSelf/axtChromDropped/*.axt > input.lst cat << '_EOF_' > gsub #LOOP doChain {check in exists $(path1)} {check out line+ chain/$(root1).chain} {check out line+ out/$(root1).out} #ENDLOOP '_EOF_' # << this line makes emacs coloring happy # The -notQ_random (new argument to axtFilter) will omit any # *_random from the query. 
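# (note) filtering *_random from the query keeps the self chains to the
# primary chromosomes; the oversized chr19 job is pulled from this batch
# and handled separately below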
cat << '_EOF_' > doChain #!/bin/csh ~/bin/i386/axtFilter -notQ_random $1 | axtChain stdin \ /iscratch/i/gs.17/build34/bothMaskedNibs \ /iscratch/i/gs.17/build34/bothMaskedNibs $2 > $3 '_EOF_' # << this line makes emacs coloring happy chmod a+x doChain mkdir out chain gensub2 input.lst single gsub jobList # edit jobList and remove the first one that does chr19 # It is a job that would fail anyway after more than an # hour of run time. It will be done separately below para create jobList # 41 jobs para try para push # ... etc ... # Completed: 41 of 41 jobs # CPU time in finished jobs: 27107s 451.78m 7.53h 0.31d 0.001 y # IO & Wait Time: 16236s 270.60m 4.51h 0.19d 0.001 y # Average job time: 1057s 17.62m 0.29h 0.01d # Longest job: 4989s 83.15m 1.39h 0.06d # Submission to last job: 240988s 4016.47m 66.94h 2.79d # The chr19 recovery process: ssh kk mkdir -p /cluster/data/hg16/bed/blastzSelf/axtChain/run1.19 cd /cluster/data/hg16/bed/blastzSelf/axtChain/run1.19 cat << '_EOF_' > gsubQ #LOOP doChainQ.sh $(path2) $(path1) {check out line+ chain/$(root1).$(path2).chain} {check out line+ out/$(root1).$(path2).out} #ENDLOOP '_EOF_' # << this line makes emacs coloring happy cat << '_EOF_' > doChainQ.sh #!/bin/sh ~/bin/i386/axtFilter -notQ_random -q=$1 $2 | axtChain stdin \ /cluster/store4/gs.17/build34/nib \ /cluster/store4/gs.17/build34/nib $3 > $4 '_EOF_' # << this line makes emacs coloring happy chmod +x doChainQ.sh # This is a mistake, this should have been chr19.axt only ls -1S /cluster/bluearc/hg16/bed/blastzSelf/axtChromDropped/*.axt > input.lst pushd /cluster/data/hg16 ls -d ?{,?} | sed -e "s/^/chr/" | grep -v chr19 \ > /cluster/data/hg16/bed/blastzSelf/axtChain/run1.19/chrom19.lst popd mkdir out chain gensub2 input.lst chrom19.lst gsubQ spec19 para create spec19 para try para check para push ... etc ... Completed: 948 of 1050 jobs Crashed: 102 jobs CPU time in finished jobs: 45918s 765.30m 12.75h 0.53d 0.001 y IO & Wait Time: 1700328s 28338.80m 472.31h 19.68d 0.054 y Average job time: 1842s 30.70m 0.51h 0.02d Longest job: 13247s 220.78m 3.68h 0.15d Submission to last job: 13268s 221.13m 3.69h 0.15d # the "crashed 102" jobs are empty chains. # This mistakenly did them all, the input.lst should have been # chr19 only. # So, copy the chr19 results to the ../run1/chain result location cp -p chain/chr19*.chain ../run1/chain # now on the cluster server, sort chains ssh eieio cd /cluster/data/hg16/bed/blastzSelf/axtChain chainMergeSort run1/chain/*.chain > all.chain chainSplit chain all.chain # these steps take ~20 minutes # optionally: rm run1/chain/*.chain # Load chains into database # next machine ssh hgwdev cd /cluster/data/hg16/bed/blastzSelf/axtChain/chain foreach i (*.chain) set c = $i:r hgLoadChain hg16 ${c}_chainSelf $i echo done $c end # DELIVER these chain files to hgdownload (2005-01-27 - Hiram) ssh eieio cd /cluster/data/hg16/bed/blastzSelf/axtChain/chain gzip chr*.chain ssh hgwdev mkdir /usr/local/apache/htdocs/goldenPath/hg16/vsSelf cd /cluster/data/hg16/bed/blastzSelf/axtChain/chain cp -p *.chain.gz /usr/local/apache/htdocs/goldenPath/hg16/vsSelf # fixup README file, request push # NET SELF BLASTZ (RE-DONE 2003-09-09 - DONE - 2003-08-27 - Hiram) ssh eieio cd /cluster/data/hg16/bed/blastzSelf/axtChain mkdir preNet cd chain foreach i (*.chain) echo preNetting $i /cluster/bin/i386/chainPreNet $i /cluster/data/hg16/chrom.sizes \ /cluster/data/hg16/chrom.sizes ../preNet/$i end # This foreach loop will take about 15 min to execute. cd .. 
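# (optional sanity sketch) preNet should produce one output per chrom
# chain; a quick count of both directories confirms nothing was skipped:
ls chain | wc -l
ls preNet | wc -l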
mkdir n1
cd preNet
# Probably OK to make this minSpace=10, used to be 1
foreach i (*.chain)
set n = $i:r.net
echo primary netting $i
/cluster/bin/i386/chainNet $i -minSpace=10 \
/cluster/data/hg16/chrom.sizes \
/cluster/data/hg16/chrom.sizes ../n1/$n /dev/null
end
# The above takes about 5 minutes
cd ..
cat n1/*.net | /cluster/bin/i386/netSyntenic stdin hNoClass.net
# memory usage 200167424, utime 2489 s/100, stime 161
ssh hgwdev
cd /cluster/data/hg16/bed/blastzSelf/axtChain
~/bin/i386/netClass hNoClass.net hg16 hg16 human.net \
-tNewR=/cluster/bluearc/scratch/hg/gs.17/build34/linSpecRep.notInHuman \
-qNewR=/cluster/bluearc/scratch/hg/gs.17/build34/linSpecRep.notInHuman
# If things look good do
ssh eieio
cd /cluster/data/hg16/bed/blastzSelf/axtChain
rm -r n1 hNoClass.net
# Make a 'syntenic' subset of these with
netFilter -syn human.net > humanSyn.net
# Load the nets into database
ssh hgwdev
cd /cluster/data/hg16/bed/blastzSelf/axtChain
netFilter -minGap=10 human.net | hgLoadNet hg16 netSelf stdin
netFilter -minGap=10 humanSyn.net | hgLoadNet hg16 syntenyNetSelf stdin
# Add entries for net and chain to human/hg16 trackDb
# MAKING SELF AXTTIGHT FROM AXTCHROM (DONE - 2003-09-09 - Hiram)
ssh eieio
cd /cluster/data/hg16/bed/blastzSelf/axtChrom
mkdir -p /cluster/data/hg16/bed/blastzSelf/axtTight
tcsh
foreach i (*.axt)
echo $i
subsetAxt $i /cluster/data/hg16/bed/blastzSelf/axtTight/$i \
~kent/src/hg/mouseStuff/subsetAxt/90.mat 5000
end
# translate to psl
cd ../axtTight
mkdir -p ../pslTight
foreach i (*.axt)
set c = $i:r
axtToPsl $i ../S1.len ../S2.len ../pslTight/${c}_blastzTightSelf.psl
echo "Done: $i"
end
# Load tables into database
ssh hgwdev
cd /cluster/data/hg16/bed/blastzSelf/pslTight
hgLoadPsl -noTNameIx hg16 chr*_blastzTightSelf.psl
# MAKING SELF SYNTENY - Can be done after Best (NEEDS TO BE REDONE 2003-09-09)
ssh hgwdev
mkdir -p /cluster/data/hg16/bed/syntenySelf
cd /cluster/data/hg16/bed/syntenySelf
# Use the scripts that were already copied to ../syntenyMm3
# The first one takes 3 to 4 hours.
../syntenyMm3/syntenicBest.pl -db=hg16 -table=blastzBestSelf > synBest.out 2>&1
# XXXX - Running 2003-08-27 21:32
../syntenyMm3/smooth.pl
../syntenyMm3/joinsmallgaps.pl
../syntenyMm3/fillgap.pl -db=hg16 -table=blastzBestSelf
../syntenyMm3/synteny2bed.pl
# Load results
hgLoadBed hg16 syntenySelf ucsc100k.bed
# SGP GENE PREDICTIONS vs Mm4 (DONE - 2003-12-30 - Hiram)
mkdir -p /cluster/data/hg16/bed/sgp_mm4/download
cd /cluster/data/hg16/bed/sgp_mm4/download
foreach f (/cluster/data/hg16/?{,?}/chr?{,?}{,_random}.fa)
set chr = $f:t:r
wget --timestamping \
http://genome.imim.es/genepredictions/H.sapiens/golden_path_200307_x_mm4/SGP/$chr.gtf
wget --timestamping \
http://genome.imim.es/genepredictions/H.sapiens/golden_path_200307_x_mm4/SGP/$chr.prot
end
wget --timestamping \
http://genome.imim.es/genepredictions/H.sapiens/golden_path_200307_x_mm4/SGP/chrUn.gtf -O chrUn_random.gtf
wget --timestamping \
http://genome.imim.es/genepredictions/H.sapiens/golden_path_200307_x_mm4/SGP/chrUn.prot -O chrUn_random.prot
wget --timestamping \
http://genome.imim.es/genepredictions/H.sapiens/golden_path_200307_x_mm4/SGP/readme
# Add missing .1 to protein id's
foreach f (*.prot)
perl -wpe 's/^(>chr\w+)$/$1.1/' $f > $f:r-fixed.prot
end
cd ..
# since this is a reload of this table updating the data
# from Mm3 to Mm4.
# First check what is there:
# featureBits hg16 sgpGene
# 39781330 bases of 2865248791 (1.388%) in intersection
# now drop that table, and reload
hgsql -e "drop table sgpGene;" hg16
# This used to be done with -exon=CDS but it will do the same
# thing _AND_ add stop codons when done with -gtf, so do this
# with -gtf
ldHgGene -gtf hg16 sgpGene download/*.gtf
# Read 42880 transcripts in 322086 lines in 39 files
# 42880 groups 39 seqs 1 sources 3 feature types
# 42880 gene predictions
hgsql -e "drop table sgpPep;" hg16
hgPepPred hg16 generic sgpPep download/*-fixed.prot
# featureBits hg16 sgpGene
# 39698249 bases of 2865248791 (1.386%) in intersection
# featureBits hg15 sgpGene
# 40395614 bases of 2866466359 (1.409%) in intersection
# SGP GENE PREDICTIONS - Mm3 (DONE - 2003-09-14 - Hiram - to be verified)
mkdir -p /cluster/data/hg16/bed/sgp/download
cd /cluster/data/hg16/bed/sgp/download
foreach f (/cluster/data/hg16/?{,?}/chr?{,?}{,_random}.fa)
set chr = $f:t:r
wget http://genome.imim.es/genepredictions/H.sapiens/golden_path_200307/SGP/$chr.gtf
wget http://genome.imim.es/genepredictions/H.sapiens/golden_path_200307/SGP/$chr.prot
end
wget http://genome.imim.es/genepredictions/H.sapiens/golden_path_200307/SGP/chrUn.gtf -O chrUn_random.gtf
wget http://genome.imim.es/genepredictions/H.sapiens/golden_path_200307/SGP/chrUn.prot -O chrUn_random.prot
# Add missing .1 to protein id's
foreach f (*.prot)
perl -wpe 's/^(>chr\w+)$/$1.1/' $f > $f:r-fixed.prot
end
cd ..
ldHgGene hg16 sgpGene download/*.gtf -exon=CDS
# Read 43109 transcripts in 323911 lines in 39 files
# 43109 groups 39 seqs 1 sources 3 feature types
# 43109 gene predictions
hgPepPred hg16 generic sgpPep download/*-fixed.prot
# featureBits hg16 sgpGene
# 39781330 bases of 2865248791 (1.388%) in intersection
# featureBits hg15 sgpGene
# 40395614 bases of 2866466359 (1.409%) in intersection
# SGP GENES (UPDATE 1/18/2006) sgpPep table dropped, replaced by hgc generated protein seq in browser
# LOAD NCI60 (DONE: Fan 10/20/2003)
# ssh hgwdev
cd /projects/cc/hg/mapplots/data/NCI60/dross_arrays_nci60/
mkdir hg16
cd hg16
findStanAlignments hg16 ../BC2.txt.ns ../../image/cumulative_plates.011204.list.human hg16.image.psl >& hg16.image.log
cp ../experimentOrder.txt ./
sed -e 's/ / \.\.\//g' < experimentOrder.txt > epo.txt
egrep -v unknown hg16.image.psl > hg16.image.good.psl
stanToBedAndExpRecs hg16.image.good.psl hg16.nci60.exp hg16.nci60.bed `cat epo.txt`
hgsql hg16 < ../../scripts/nci60.sql
echo "load data local infile 'hg16.nci60.bed' into table nci60" | hgsql hg16
mkdir /cluster/store4/gs.17/build34/bed/nci60
mv hg16.nci60.bed /cluster/store4/gs.17/build34/bed/nci60
rm *.psl
# LOAD AFFYRATIO [GNF in progress jk Sept 19, 2003]
# LOAD AFFYRATIO U95Av2 sequences [DONE hartera Feb 2, 2004]
# Used consensus/exemplar sequences instead of target sequences
# LOAD AFFYRATIO [in progress, Feb 4, 2004]
# changed pslReps parameters as minAli = 0.97 was too stringent
# Set up cluster job to align consensus/exemplars to hg16
ssh kkr1u00
cd /cluster/data/hg16/bed
rm -rf affyGnf.2004-02-04/
mkdir affyGnf.2004-02-04
cd affyGnf.2004-02-04/
mkdir -p /iscratch/i/affy
cp /projects/compbio/data/microarray/affyGnf/sequences/HG-U95/HG-U95Av2_all.fa /iscratch/i/affy
iSync
ssh kk
cd /cluster/data/hg16/bed/affyGnf.2004-02-04
ls -1 /iscratch/i/affy/HG-U95Av2_all.fa > affy.lst
ls -1 /scratch/hg/gs.17/build34/trfFa/ > allctg.lst
echo '#LOOP\n/cluster/bin/i386/blat -fine -mask=lower -minIdentity=95 -ooc=/scratch/hg/h/11.ooc /scratch/hg/gs.17/build34/trfFa/$(path1) $(path2)
{check out line+ psl/$(root1)_$(root2).psl}\n#ENDLOOP' > template.sub gensub2 allctg.lst affy.lst template.sub para.spec mkdir psl para create para.spec # Actually do the job with usual para try/check/push/time etc. # para time 2/4/04 #Completed: 491 of 491 jobs #CPU time in finished jobs: 8344s 139.06m 2.32h 0.10d 0.000 y #IO & Wait Time: 2281s 38.02m 0.63h 0.03d 0.000 y #Average job time: 22s 0.36m 0.01h 0.00d #Longest job: 289s 4.82m 0.08h 0.00d #Submission to last job: 388s 6.47m 0.11h 0.00d # Do sort, best in genome filter, and convert to chromosome coordinates # to create affyU95.psl pslSort dirs raw.psl tmp psl # change filter parameters for these sequences. only use alignments that # cover 30% of sequence and have at least 95% identity in aligned region. # minAli = 0.97 too high. low minCover as a lot of n's in these sequences pslReps -minCover=0.3 -sizeMatters -minAli=0.95 -nearTop=0.005 raw.psl contig.psl /dev/null liftUp affyU95.psl ../../jkStuff/liftAll.lft warn contig.psl # Merge with spot data and load into database. added -chip flag to # affyPslAndAtlasToBed to allow correct parsing ssh hgwdev cd /cluster/data/hg16/bed/affyGnf.2004-02-04 /cluster/home/sugnet/bin/i386/affyPslAndAtlasToBed -chip=U95Av2 affyU95.psl /projects/compbiodata/microarray/affyGnf/human_atlas_U95_gnf.noquotes.txt affyRatio.bed affyRatio.exr >& affyPslAndAtlasToBed.log hgLoadBed -sqlTable=$HOME/src/hg/lib/affyRatio.sql hg16 affyRatio affyRatio.bed # This affyU95 load was later changed to eliminate the long names # hgLoadPsl hg16 affyU95.psl # by the following: sed -e "s/U95Av2://" affyU95.psl | sed -e "s/;//" > affyU95shortQname.psl hgLoadPsl hg16 -table=affyU95 affyU95shortQname.psl # Clean up rm -r psl tmp err affyRatio.bed affyRatio.exr bed.tab scores.tab *.debug batch.bak contig.psl raw.psl LOAD AffyUclaRatio [in progress jk Sept 19, 2003] #LOAD AffyUclaRatio and AFFY U133A and U133B sequences[DONE hartera Feb 3, 2004] # Used consensus/exemplar sequences instead of target sequences # Set up cluster job to align consensus/exemplars to hg16 ssh kkr1u00 cd /cluster/data/hg16/bed rm -rf affyUcla.2004-02-04/ mkdir affyUcla.2004-02-04 cd affyUcla.2004-02-04/ mkdir -p /iscratch/i/affy cp /projects/compbio/data/microarray/affyUcla/sequences/HG-U133AB_all.fa /iscratch/i/affy iSync ssh kk cd /cluster/data/hg16/bed/affyUcla.2004-02-04/ ls -1 /iscratch/i/affy/HG-U133AB_all.fa > affy.lst ls -1 /scratch/hg/gs.17/build34/trfFa/ > allctg.lst echo '#LOOP\n/cluster/bin/i386/blat -fine -mask=lower -minIdentity=95 -ooc=/scratch/hg/h/11.ooc /scratch/hg/gs.17/build34/trfFa/$(path1) $(path2) {check out line+ psl/$(root1)_$(root2).psl}\n#ENDLOOP' > template.sub gensub2 allctg.lst affy.lst template.sub para.spec mkdir psl para create para.spec # Actually do the job with usual para try/check/push/time etc. # on 2/4/04: #Completed: 491 of 491 jobs #CPU time in finished jobs: 23137s 385.61m 6.43h 0.27d 0.001 y #IO & Wait Time: 23057s 384.29m 6.40h 0.27d 0.001 y #Average job time: 94s 1.57m 0.03h 0.00d #Longest job: 617s 10.28m 0.17h 0.01d #Submission to last job: 747s 12.45m 0.21h 0.01d # Do sort, best in genome filter, and convert to chromosome coordinates # to create affyU133.psl. pslSort dirs raw.psl tmp psl # change filter parameters for these sequences. only use alignments that # cover 30% of sequence and have at least 95% identity in aligned region. # minAli = 0.97 too high. 
# low minCover as a lot of n's in these sequences
pslReps -minCover=0.3 -sizeMatters -minAli=0.95 -nearTop=0.005 raw.psl contig.psl /dev/null
liftUp affyU133.psl ../../jkStuff/liftAll.lft warn contig.psl
# Merge with spot data and load into database.
ssh hgwdev
cd /cluster/data/hg16/bed/affyUcla.2004-01-28/
# added to hashPsls to process shorter Affy probe set names
# assumes that names have 2 colons but when shortened to fit in the seq
# database, there is only 1.
# e.g. full name: "consensus:HG-U133A:212933_x_at;" short name: "HG-U133A:212933_x_at;"
affyUclaMergePslData -pslFile=affyU133.psl -affyFile=/projects/compbio/data/microarray/affyUcla/data/030602_ucla_normal_human_tissue_snapshot.txt -bedOut=affyUcla.bed -expRecordOut=affyUcla.expRecords -expFile=/projects/compbio/data/microarray/affyUcla/data/expNames -toDiffFile=toDiff.txt
hgLoadBed -sqlTable=$HOME/src/hg/lib/affyUcla.sql hg16 affyUcla affyUcla.bed
hgLoadPsl hg16 affyU133.psl
# Clean up
rm -r psl tmp err affyUcla.bed affyUcla.expRecords bed.tab *.debug batch.bak contig.psl raw.psl
# Add in sequence data for affyU95 and affyU133 tracks.
# Copy probe sequence to /gbdb if it isn't already
mkdir -p /gbdb/hgFixed/affyProbes
cd /gbdb/hgFixed/affyProbes
ln -s /projects/compbio/data/microarray/affyGnf/sequences/HG-U95/HG-U95Av2_all.fa .
ln -s /projects/compbio/data/microarray/affyUcla/sequences/HG-U133AB_all.fa .
# use perl -pi.bak -e 's/;/ /' to remove ";" after probe name
# in HG-U95Av2_all.fa sequence
# reload sequences with "U95Av2" prefix removed so acc matches name used
# in other dependent tables for affyU95Av2 only
hgLoadSeq -abbr=U95Av2: hg16 /gbdb/hgFixed/affyProbes/HG-U95Av2_all.fa
hgLoadSeq hg16 /gbdb/hgFixed/affyProbes/HG-U133AB_all.fa
# QA repush 2006-02-08 seq/extFile to correct mismatched ID for affyU133 alignment data (Jen)
# Load AFFYUCLANORM, extended version of affyUcla track. Hopefully
# final freeze of data set.
mkdir ~sugnet/store1/
cd hg16
mkdir affyUcla
cd affyUcla/
ssh kk
cd /cluster/store1/sugnet/hg16/affyUcla
cp /projects/compbio/data/microarray/affyUcla/sequences/HG-U133AB_all ./
ls -1 /scratch/hg/gs.17/build34/trfFa/* > allctg.lst
echo '#LOOP\n/cluster/bin/i386/blat -fine -mask=lower -minIdentity=95 -ooc=/scratch/hg/h/11.ooc $(path1) $(path2) {check out line+ psl/$(root1)_$(root2).psl}\n#ENDLOOP' > template.sub
echo "HG-U133AB_all" > affy.lst
gensub2 allctg.lst affy.lst template.sub para.spec
mkdir psl
para create para.spec
Checking input files
491 jobs written to batch
updated job database on disk
para push
# Wait until jobs run...
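# (sketch) standard parasol polling while waiting for the batch:
para check
para time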
exit
pslSort dirs hg16.affyU133AB_all.psl tmp psl
# Lots of messages
writing hg16.affyU133AB_all.psl
Cleaning up temp files
wc hg16.affyU133AB_all.psl
60962 1280141 13677509 hg16.affyU133AB_all.psl
ls /cluster/data/hg16/jkStuff/liftAll.lft
/cluster/data/hg16/jkStuff/liftAll.lft
liftUp hg16.affyU133AB_all.lifted.psl /cluster/data/hg16/jkStuff/liftAll.lft warn hg16.affyU133AB_all.psl
Got 491 lifts in /cluster/data/hg16/jkStuff/liftAll.lft
Lifting hg16.affyU133AB_all.psl
pslReps -minCover=0.5 -sizeMatters -minAli=0.97 -nearTop=0.005 hg16.affyU133AB_all.lifted.psl hg16.affyU133AB_all.lifted.pslReps.psl out.psr
Processing hg16.affyU133AB_all.lifted.psl to hg16.affyU133AB_all.lifted.pslReps.psl and out.psr
Processed 60957 alignments
affyUclaMergePslData -pslFile=hg16.affyU133AB_all.lifted.pslReps.psl -affyFile=/projects/compbio/data/microarray/affyUcla/data/feature_biomaterial_chip_logratios_formatForTrack.txt -bedOut=hg16.affyUcla.bed -expRecordOut=hg16.affyUcla.expRecords -expFile=/projects/compbio/data/microarray/affyUcla/data/expNames.sorted.txt
Reading psls from: hg16.affyU133AB_all.lifted.pslReps.psl
Outputing beds: ............................................
Freeing Memory.
Done.
addUclaAnnotations.pl hg16.affyUcla.expRecords /projects/compbio/data/microarray/affyUcla/data/normal_tissue_database_annotations2.txt > hg16.affyUcla.annotations.expRecords
# Load the databases
cp ~/jk/hg/lib/affyRatio.sql ./
sed -e 's/affyRatio/affyUclaNorm/' < affyRatio.sql > affyUclaNorm.sql
# Just use the hgLoadBed program specifying sqlFile
hgLoadBed hg16 affyUclaNorm hg16.affyUcla.bed -sqlTable=affyUclaNorm.sql
Reading hg16.affyUcla.bed
Loaded 44446 elements of size 15
Sorted
Saving bed.tab
Loading hg16
cp ~/jk/hg/lib/expRecord.sql ./
sed -e 's/expRecord/affyUclaNormExps/' < expRecord.sql > affyUclaNormExps.sql
hgFixedS -A < affyUclaNormExps.sql
echo "load data local infile 'hg16.affyUcla.annotations.expRecords' into table affyUclaNormExps" | hgFixedS -A
# Cleanup
rm HG-U133AB_all
# DO FAMILY BROWSER VERSIONS OF AFFYUCLANORMAL TRACK (In Progress -jk 3/2/2004)
# (This is suspended because GNF Gene Atlas data is available and public!)
# Create affyU133Orient table data
ssh eieio
cd /cluster/data/hg16/bed/affyUcla.2004-02-04
pslSortAcc nohead chrom temp affyU133.psl
rm -r temp
cd chrom
# This loop takes about 15 minutes
foreach i (*.psl)
  polyInfo $i /cluster/data/hg16/nib/$i:r.nib \
    /projects/compbio/data/microarray/affyUcla/sequences/HG-U133AB_all.fa \
    $i:r.polyInfo
  echo done $i
end
cat *.polyInfo > ../affyU133OrientInfo.bed
rm *.polyInfo
# Load orientation table data
ssh hgwdev
cd /cluster/data/hg16/bed/affyUcla.2004-02-04
sed 's/mrnaOrientInfo/affyU133OrientInfo/' \
  $HOME/kent/src/hg/lib/mrnaOrientInfo.sql > affyU133OrientInfo.sql
hgLoadBed hg16 affyU133OrientInfo affyU133OrientInfo.bed \
  -sqlTable=affyU133OrientInfo.sql > /dev/null
# Do clustering (this takes about 10 minutes to run)
clusterRna hg16 u133Cluster.bed /dev/null -noEst -noRefSeq -group=u133Group.tab -mRNAOrient=affyU133OrientInfo -rna=affyU133
# GNF ATLAS 2 [Done jk 3/29/2004]
# Align probes from GNF1H chip.
ssh kk
cd /cluster/data/hg16/bed
mkdir -p geneAtlas2/run/psl
cd geneAtlas2/run
mkdir -p /cluster/bluearc/geneAtlas2
cp /projects/compbio/data/microarray/geneAtlas2/human/gnf1h.fa /cluster/bluearc/geneAtlas2
ls -1 /scratch/hg/gs.17/build34/trfFa/ > genome.lst
ls -1 /cluster/bluearc/geneAtlas2/gnf1h.fa > mrna.lst
echo '#LOOP\nblat -fine -ooc=/scratch/hg/h/11.ooc /scratch/hg/gs.17/build34/trfFa/$(path1) $(path2) {check out line+ psl/$(root1)_$(root2).psl}\n#ENDLOOP' > gsub
gensub2 genome.lst mrna.lst gsub spec
para create spec
para try
para check
para push
para time
#Completed: 491 of 491 jobs
#CPU time in finished jobs: 10718s 178.63m 2.98h 0.12d 0.000 y
#IO & Wait Time: 1499s 24.99m 0.42h 0.02d 0.000 y
#Average job time: 25s 0.41m 0.01h 0.00d
#Longest job: 652s 10.87m 0.18h 0.01d
#Submission to last job: 723s 12.05m 0.20h 0.01d
# Do sort, best in genome filter, and convert to chromosome coordinates
# to create gnf1h.psl.
pslSort dirs raw.psl tmp psl
pslReps -minCover=0.3 -minAli=0.95 -nearTop=0.005 raw.psl contig.psl /dev/null
liftUp ../affyGnf1h.psl ../../../jkStuff/liftAll.lft warn contig.psl
rm -r contig.psl raw.psl psl
# Load probes and alignments from GNF1H into database.
ssh hgwdev
cd /cluster/data/hg16/bed/geneAtlas2
ln -s /projects/compbio/data/microarray/geneAtlas2/human/gnf1h.fa /gbdb/hgFixed/affyProbes
hgLoadPsl hg16 affyGnf1h.psl
hgLoadSeq hg16 /gbdb/hgFixed/affyProbes/gnf1h.fa
grep -v U133B ../affyUcla.2004-02-04/affyU133.psl | sed 's/exemplar://' \
  | sed 's/consensus://' \
  | sed 's/HG-U133A://' | sed 's/;//' > affyU133A.psl
hgMapMicroarray gnfAtlas2.bed hgFixed.gnfHumanAtlas2MedianRatio \
  affyU133A.psl /cluster/data/hg16/bed/geneAtlas2/affyGnf1h.psl
# Note that the unmapped 11000 records are from all-N sequences.
hgLoadBed hg16 gnfAtlas2 gnfAtlas2.bed
# GENE BOUNDS (RNACLUSTER) (DONE 10-05-03 Chuck)
# Create rnaCluster table (depends on {est,mrna}OrientInfo created but not checked in)
cd /cluster/store4/gs.17/build34/
# Create a list of accessions that come from RAGE libraries and need to
# be excluded. (added by Chuck Wed Nov 27 13:09:07 PST 2002)
~/kent/src/hg/geneBounds/clusterRna/generateRageAccList.csh hg16 \
  rage.libs
mkdir -p bed/rnaCluster/chrom
# Exclude accessions in the RAGE file
foreach f (?{,?}/chr*.fa)
  set c = $f:t:r
  set out = bed/rnaCluster/chrom/$c.bed
  echo clusterRna -mrnaExclude=hg16.rage.libs hg16 /dev/null $out -chrom=$c
  clusterRna -mrnaExclude=hg16.rage.libs hg16 /dev/null $out -chrom=$c
end
cd bed/rnaCluster
hgLoadBed hg16 rnaCluster chrom/*.bed > /dev/null
# MAKE UNIGENE ALIGNMENTS (DONE - 2003-10-09 - Hiram)
# Download of the latest UniGene version is now automated by a
# cron job -- see /cluster/home/angie/crontab ,
# /cluster/home/angie/unigeneVers/unigene.csh .
# If hgwdev gets rebooted, that needs to be restarted... maybe there's
# a more stable place to set up that cron job.
# substitute XXX -> the uniGene version used by SAGE, if building the
# uniGene/SAGE track; or just the latest uniGene version in
# /projects/cc/hg/sugnet/uniGene/ , if doing uniGene alignments only.
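# (hypothetical aside) assuming the uniGene.NNN directory layout above,
# the most recent version can be listed with:
# ls /projects/cc/hg/sugnet/uniGene | sort -t. -k2,2n | tail -1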
# set Version = XXX
set Version = 162    # (bash: export Version=162)
cd /projects/cc/hg/sugnet/uniGene/uniGene.$Version
gunzip Hs.seq.uniq.gz Hs.data.gz
../countSeqsInCluster.pl Hs.data counts.tab
../parseUnigene.pl Hs.seq.uniq Hs.seq.uniq.simpleHeader.fa leftoverData.tab
# Distribute UniGene sequence to /iscratch/i/ (kkstore can see /projects)
ssh kkstore
set Version = 162    # same as above
mkdir -p /iscratch/i/uniGene.$Version
cp -p \
  /projects/cc/hg/sugnet/uniGene/uniGene.$Version/Hs.seq.uniq.simpleHeader.fa \
  /iscratch/i/uniGene.$Version
ssh kkr1u00 ~kent/bin/iSync
ssh kk
set Version = 162    # same as above
mkdir -p /cluster/data/hg16/bed/uniGene.$Version
cd /cluster/data/hg16/bed/uniGene.$Version
ls -1S /scratch/hg/gs.17/build34/trfFa/*.fa > allctg.lst
ls -1S /iscratch/i/uniGene.$Version/Hs.seq.uniq.simpleHeader.fa \
  > uniGene.lst
cat << '_EOF_' > template.sub
#LOOP
/cluster/bin/i386/blat -mask=lower -minIdentity=95 -ooc=/scratch/hg/h/11.ooc $(path1) $(path2) {check out line+ psl/$(root1)_$(root2).psl}
#ENDLOOP
'_EOF_'
gensub2 allctg.lst uniGene.lst template.sub para.spec
para create para.spec
mkdir psl
para try
para check
para push
# Checking finished jobs
# Completed: 491 of 491 jobs
# CPU time in finished jobs: 39689s 661.49m 11.02h 0.46d 0.001 y
# IO & Wait Time: 38269s 637.81m 10.63h 0.44d 0.001 y
# Average job time: 159s 2.65m 0.04h 0.00d
# Longest job: 1805s 30.08m 0.50h 0.02d
# Submission to last job: 1972s 32.87m 0.55h 0.02d
#
ssh eieio
set Version = 162    # same as above
cd /cluster/data/hg16/bed/uniGene.$Version
pslSort dirs raw.psl tmp psl >& pslSort.log
liftUp -type=.psl stdout ../../jkStuff/liftAll.lft warn raw.psl \
  | pslReps -minCover=0.2 -sizeMatters -minAli=0.965 -nearTop=0.002 \
    stdin hg16.uniGene.lifted.pslReps.psl /dev/null
# use hg16.uniGene.lifted.pslReps.psl for building SAGE track (next).
# LOAD SAGE DATA (TBD)
ssh hgwdev
cd ~/kent/src/hg/sage
make
# XXX = uniGene build for which SAGE was built -- not necessarily current!
# Figure out the build number by peeking at this file: wget -O - ftp://ftp.ncbi.nih.gov/pub/sage/map/info.txt 2> /dev/null # Or, look at the contents of this directory: ls /projects/cc/hg/sugnet/uniGene # set Version = XXX set Version=162 mkdir /projects/cc/hg/sugnet/sage/sage.$Version cd /projects/cc/hg/sugnet/sage/sage.$Version ncftp ftp://ftp.ncbi.nih.gov/pub/sage mget -R map/readme.txt map/info.txt extr info map/Hs quit # That downloaded about 380 Mb of data mkdir map mv Hs map cd map/Hs/NlaIII unzip -j SAGEmap_tag_ug-rel.zip cd ../../../extr/ ../../scripts/summarizeCounts.pl expCounts.tab ./SAGE_* ../../scripts/countGenesPerTag.pl expCounts.tab allTags.count.tab ../../scripts/createArraysForTags.pl allTags.count.tab tagExpArrays.tab \ ./SAGE_* ../../scripts/countsPerExp.pl expCounts.tab expList.tab cd ../map/Hs/NlaIII/ cat << '_EOF_' > /tmp/t.pl #!/usr/local/bin/perl while (<>) { chomp($_); @p = split(/\t/, $_); print "$p[2]\t$p[3]\t$p[0]\n"; } '_EOF_' chmod +x /tmp/t.pl cat SAGEmap_tag_ug-rel | /tmp/t.pl | sort | sed -e 's/ /_/g' \ > SAGEmap_ug_tag-rel_Hs cd ../../../extr createSageSummary ../map/Hs/NlaIII/SAGEmap_ug_tag-rel_Hs \ tagExpArrays.tab sageSummary.sage # Create the uniGene alignments # /cluster/data/hg16/uniGene/hg16.uniGene.lifted.pslReps.psl # -- see "MAKE UNIGENE ALIGNMENTS" above # continuing from above, we are already in this extr directory cd /projects/cc/hg/sugnet/sage/sage.$Version/extr addAveMedScoreToPsls \ /cluster/data/hg16/bed/uniGene.$Version/hg16.uniGene.lifted.pslReps.psl \ sageSummary.sage uniGene.wscores.bed hgLoadBed hg16 uniGene_2 uniGene.wscores.bed hgsql hg16 < ~kent/src/hg/lib/sage.sql echo "load data local infile 'sageSummary.sage' into table sage" \ | hgsql hg16 cd ../info ../../scripts/parseRecords.pl ../extr/expList.tab > sageExp.tab hgsql hg16 < ~/kent/src/hg/lib/sageExp.sql echo "load data local infile 'sageExp.tab' into table sageExp" | hgsql hg16 # update ~/kent/src/hg/makeDb/trackDb/human/hg16/uniGene_2.html # with current uniGene date. # MAKING FOLDUTR TABLES (DONE - jk - 2003-10-14, REDONE jk 2004-04-07) # First set up directory structure and extract UTR sequence on hgwdev ssh hgwdev mkdir -p /cluster/data/hg16/bed/rnaStruct cd /cluster/data/hg16/bed/rnaStruct mkdir -p utr3/split utr5/split utr3/fold utr5/fold utrFa hg16 knownGene utr3 utr3/utr.fa utrFa hg16 knownGene utr5 utr5/utr.fa # Split up files and make files that define job. 
ssh kk
cd /cluster/data/hg16/bed/rnaStruct
faSplit sequence utr3/utr.fa 50000 utr3/split/s
faSplit sequence utr5/utr.fa 50000 utr5/split/s
ls -1 utr3/split > utr3/in.lst
ls -1 utr5/split > utr5/in.lst
cd utr3
cat > gsub < md5sum.txt
# load renumbered chains into database (2004-03-14 kate)
ssh hgwdev
cd /cluster/data/hg16/bed/blastz-blat.panTro1.lifted/rBestChain
foreach i (*.chain)
  set c = $i:r
  hgLoadChain hg16 ${c}_rBestChainPanTro1 $i
  echo done $c
end
# save for download (2004-05-14 kate)
ssh kksilo
cd /cluster/data/hg16/bed/blastz-blat.panTro1.lifted/rBestChain
chainMergeSort -saveId *.chain > ../rBest.chain
ssh hgwdev
cd /cluster/data/hg16/bed/blastz-blat.panTro1.lifted/rBestChain
set dir = /usr/local/apache/htdocs/goldenPath/hg16/vsPanTro1
mkdir -p $dir
cp -p ../rBest.chain $dir/human.best.chain
cd $dir
gzip *.chain
# copy README file
# load net into database (2004-03-14 kate)
ssh hgwdev
cd /cluster/data/hg16/bed/blastz-blat.panTro1.lifted
cat rBestNet/*.net | /cluster/bin/i386/netSyntenic stdin noClass.net
netClass noClass.net hg16 panTro1 human.net
netFilter -chimpSyn human.net > rBest.net
hgLoadNet -warn hg16 rBestNetPanTro1 rBest.net
# EXPERIMENT: TBA WHOLE CHROM 5 SPECIES (DONE ENOUGH 3/8/04 angie)
# Put 2-ways in /cluster/bluearc
ssh eieio
mkdir /cluster/bluearc/hg16/tba
mkdir /cluster/bluearc/hg16/tba/{hp,hg}
cp /cluster/data/hg16/bed/blastz-blat.panTro1.lifted/mafRBestNet/chr*.maf \
  /cluster/bluearc/hg16/tba/hp
# hg16-mm3 already in /cluster/bluearc/hg16/bed/blastz.mm3/mafNet300
# hg16-rn3 already in /cluster/bluearc/hg16/bed/blastz.rn3/mafNet300
cp /cluster/data/hg16/bed/blastz.galGal2.2004-02-25/mafBest/*.maf \
  /cluster/bluearc/hg16/tba/hg
ssh kolossus
mkdir /cluster/data/hg16/bed/tbaExperiment
cd /cluster/data/hg16/bed/tbaExperiment
# tba needs to run multiz, so make sure they're in $PATH:
set path = (/cluster/bin/penn $path)
# Try just one chromosome:
set chr = chr16
# tba needs filenames to correspond to its tree input, so make links to
# maf and fasta:
rm -f human.chimp.maf human.mouse.maf human.rat.maf human.chicken.maf \
  human
mafSort /cluster/bluearc/hg16/tba/hp/$chr.maf > human.chimp.maf
mafSort /cluster/bluearc/hg16/bed/blastz.mm3/mafNet300/$chr.mm3.maf > \
  human.mouse.maf
mafSort /cluster/bluearc/hg16/bed/blastz.rn3/mafNet300/$chr.rn3.maf > \
  human.rat.maf
mafSort /cluster/bluearc/hg16/tba/hg/$chr.hg.maf > human.chicken.maf
ln -s /cluster/data/hg16/?{,?}/$chr.fa human
tba "(((human chimp) (mouse rat)) chicken)" \
  human.chimp.maf human.mouse.maf human.rat.maf human.chicken.maf
# Doh -- looks like tba wants *all* pairwise inputs, and how do we
# tell which rat-chicken alignments to include for a given human chr??
# The error that tba is dying with is this:
# pair2tb.v4: alignments of human out of order around 172596-175110
# ... even though inputs are sorted...? Oh well, clean up:
rm human*
rm -r /cluster/bluearc/hg16/tba/
# CREATING KNOWNTOSUPER (which enables superFamily stuff in hgNear/hgGene)
# First see if need to update superfamily data from
# ftp server at supfam.mrc-lmb.cam.ac.uk following instructions
# in /cluster/store1/superFamily/genomes/README.ucsc.
# Then make sure that knownToEnsembl and ensGtp tables are created, then:
zcat /cluster/store1/superFamily/genomes/ass_26-Oct-2003.tab.gz | hgKnownToSuper hg16 hs stdin
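# (hypothetical aside, not from the recorded build) a quick count to
# confirm the resulting knownToSuper table populated:
# hgsql hg16 -N -e 'select count(*) from knownToSuper'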
# BLASTZ CHICKEN (done, 11/3/2003, Adam)
# (adapted from BLASTZ mouse/rat, above)
# NOTE: this first time we're using the contigs that Terry has
# installed at /cluster/bluearc/gg0 (see fa and split100
# subdirectories). When we have an assembly, things should be able to
# proceed more as with mouse and rat
ssh kk
mkdir -p /cluster/data/hg16/bed/blastz.gg0
cd /cluster/data/hg16/bed/blastz.gg0
# first it looks like we need to run TRF on the contigs (realizing
# this on second time through!)
mkdir trf
cd trf
rm -rf jobList
foreach file (/cluster/bluearc/gg0/split100/*.fa)
  set root=$file:t:r
  echo "/cluster/bin/i386/trfBig -trf=/cluster/bin/i386/trf $file /dev/null -bedAt=/cluster/data/hg16/bed/blastz.gg0/trf/${root}.bed -tempDir=/tmp" >> jobList
end
# (run jobList on cluster) -- took 2.5 min.
# add new softmasking to reflect TRF output
mkdir /cluster/bluearc/gg0/split100_with_trf
rm -rf jobList
foreach file (/cluster/bluearc/gg0/split100/*.fa)
  set root=$file:t:r
  echo "/cluster/bin/i386/maskOutFa -softAdd $file /cluster/data/hg16/bed/blastz.gg0/trf/${root}.bed /cluster/bluearc/gg0/split100_with_trf/${root}.fa" >> jobList
end
# (run jobList on cluster) -- took <1 min.
# now set up for BLASTZ (picking up with instructions above for
# mouse and rat)
cat << '_EOF_' > DEF
# chicken vs. human
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/home/angie/schwartzbin:/cluster/home/kent/bin/i386
ALIGN=blastz-run
BLASTZ=blastz
BLASTZ_H=2000
BLASTZ_Q=/cluster/data/penn/HoxD55.q
BLASTZ_ABRIDGE_REPEATS=0
# TARGET
# Human
SEQ1_DIR=/iscratch/i/gs.17/build34/bothMaskedNibs
# not used
SEQ1_RMSK=
# not used
SEQ1_FLAG=
SEQ1_SMSK=
SEQ1_IN_CONTIGS=0
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY
# Chicken
SEQ2_DIR=/cluster/bluearc/gg0/split100_with_trf
# not currently used
SEQ2_RMSK=
# not currently used
SEQ2_FLAG=
SEQ2_SMSK=
SEQ2_IN_CONTIGS=1
SEQ2_CHUNK=
SEQ2_LAP=
BASE=/cluster/store4/gs.17/build34/bed/blastz.gg0
DEF=$BASE/DEF
RAW=$BASE/raw
CDBDIR=$BASE
SEQ1_LEN=$BASE/S1.len
SEQ2_LEN=$BASE/S2.len
'_EOF_'
# << this line makes emacs coloring happy
# Save the DEF file in the current standard place
DS=`date -I`
cp DEF ~angie/hummus/DEF.gg0-hg16.$DS
# source the DEF file to establish environment for following commands
. ./DEF
# follow the next set of directions slavishly
mkdir -p $BASE/run
# give up on avoiding angie's directories
# tcl script
# creates xdir.sh and joblist run/j
~angie/hummus/make-joblist $DEF > $BASE/run/j
# xdir.sh makes a bunch of result directories in $BASE/raw/
# based on chrom name and CHUNK size
sh $BASE/xdir.sh
cd $BASE/run
# now edit j to prefix path to executable name
# NOTE: we should have a controlled version of schwartz bin executables
sed -e 's#^#/cluster/bin/penn/#' j > j2
wc -l j*
head j2
# make sure the j2 edits are OK, then use it:
mv j2 j
# para create will create the file: 'batch' for the cluster run
para create j
para try
para check
para push
# ... etc ...
#Completed: 33561 of 33561 jobs
#CPU time in finished jobs: 11426279s 190437.98m 3173.97h 132.25d 0.362 y
#IO & Wait Time: 212940s 3549.01m 59.15h 2.46d 0.007 y
#Average job time: 347s 5.78m 0.10h 0.00d
#Longest job: 4036s 67.27m 1.12h 0.05d
#Submission to last job: 16433s 273.88m 4.56h 0.19d
# post-process blastz
ssh kk
cd /cluster/data/hg16/bed/blastz.gg0
# source the DEF file again in case you are coming back to this
# (must be bash shell)
. ./DEF
# a new run directory
mkdir -p run.1
mkdir -p $BASE/lav
# create a new job list to convert out files to lav
/cluster/bin/scripts/blastz-make-out2lav $DEF $BASE \
  > run.1/jobList
cd run.1
# make sure the job list is OK
wc -l jobList
# 339 jobs
head jobList
# run on cluster
ssh kk
cd /cluster/data/hg16/bed/blastz.gg0/run.1
para create jobList
para try
para check
para push
# etc.
#Completed: 339 of 339 jobs
#CPU time in finished jobs: 8611s 143.52m 2.39h 0.10d 0.000 y
#IO & Wait Time: 106450s 1774.17m 29.57h 1.23d 0.003 y
#Average job time: 339s 5.66m 0.09h 0.00d
#Longest job: 456s 7.60m 0.13h 0.01d
#Submission to last job: 465s 7.75m 0.13h 0.01d
# convert lav files to axt
ssh kk
cd /cluster/data/hg16/bed/blastz.gg0
mkdir axtChrom
# a new run directory
mkdir run.2
cd run.2
# create custom version of blastz-chromlav2axt with -fa option,
# because nibs aren't available for chicken
cp /cluster/bin/scripts/blastz-chromlav2axt .
# (hand edit: add -fa option to call to lavToAxt)
# create template file for gensub2
# usage: blastz-chromlav2axt lav-dir axt-file seq1-dir seq2-dir
cat << '_EOF_' > gsub
#LOOP
/cluster/store4/gs.17/build34/bed/blastz.gg0/run.2/blastz-chromlav2axt /cluster/store4/gs.17/build34/bed/blastz.gg0/lav/$(root1) {check out line+ /cluster/store4/gs.17/build34/bed/blastz.gg0/axtChrom/$(root1).axt} /iscratch/i/gs.17/build34/bothMaskedNibs /cluster/bluearc/gg0/fa/chicken_with_trf.fa
#ENDLOOP
'_EOF_'
# << this line makes emacs coloring happy
ls -1S /cluster/store4/gs.17/build34/bed/blastz.gg0/lav > chrom.list
gensub2 chrom.list single gsub jobList
wc -l jobList
# 42 jobs
head jobList
para create jobList
para try
para check
para push
# ... etc ...
#Completed: 39 of 42 jobs
#Crashed: 3 jobs
#CPU time in finished jobs: 32763s 546.05m 9.10h 0.38d 0.001 y
#IO & Wait Time: 48182s 803.03m 13.38h 0.56d 0.002 y
#Average job time: 2076s 34.59m 0.58h 0.02d
#Longest job: 5291s 88.18m 1.47h 0.06d
#Submission to last job: 5291s 88.18m 1.47h 0.06d
# The crashes are three of the "randoms" (chr8, 18, 19) -- parasol
# thinks they crashed because of 0-length output files
# This run took quite a bit longer than with mouse and rat, presumably
# because of the use of the fa file
# Remove the empty axtChrom/chr*_random.axt files to avoid future
# processing errors
# translate sorted axt files into psl
ssh eieio
cd /cluster/data/hg16/bed/blastz.gg0
mkdir -p pslChrom
set tbl = "blastzGg0"
foreach f (axtChrom/chr*.axt)
  set c=$f:t:r
  echo "Processing chr $c"
  /cluster/bin/i386/axtToPsl $f S1.len S2.len pslChrom/${c}_${tbl}.psl
end
# (~5 minutes)
# Load database tables
ssh hgwdev
set tbl = "blastzGg0"
cd /cluster/data/hg16/bed/blastz.gg0/pslChrom
/cluster/bin/i386/hgLoadPsl -noTNameIx hg16 chr*_${tbl}.psl
# New entry in human/hg16/trackDb.ra
# track blastzGg0
# shortLabel Chicken Blastz
# longLabel Blastz Chicken (Gg0-contigs, 5.2x coverage)
# group compGeno
# priority 145.9
# visibility hide
# color 100,50,0
# altColor 255,240,200
# spectrum on
# type psl xeno
# MAKE BLASTZ BEST CHICKEN (finished, Adam, 11/3/03)
# Consolidate AXT files to chrom level, sort, pick best, make psl.
ssh eieio cd /cluster/data/hg16/bed/blastz.gg0/axtChrom mkdir -p /cluster/bluearc/hg16/bed/blastz.gg0/axtChrom # copy chrom axt's to bluearc, to avoid hitting fileserver too hard cp -p *.axt /cluster/bluearc/hg16/bed/blastz.gg0/axtChrom ssh kk cd /cluster/data/hg16/bed/blastz.gg0 mkdir -p axtBest pslBest mkdir run.3 cd run.3 # create script to filter files cat << '_EOF_' > doBestAxt #!/bin/csh -f # usage: doBestAxt chr axt-file best-file psl-file /cluster/bin/i386/axtBest $2 $1 $3 -minScore=300 sleep 1 /cluster/bin/i386/axtToPsl $3 /cluster/data/hg16/bed/blastz.gg0/S1.len \ /cluster/data/hg16/bed/blastz.gg0/S2.len $4 '_EOF_' # << this line makes emacs coloring happy # NOTE: in a subsequent run, we have used -minScore=6000 and added # the -matrix option to use HoxD55.q (need to add a line with gap # penalties to the bottom of the score matrix file, e.g., "O = # 400, E = 30"; see # /cluster/data/hg16/bed/blastz.gg0/run.3.2003-11-11). These new # options should be considered part of the standard procedure, at # least for now. chmod +x doBestAxt cd ../axtChrom ls -1S | sed 's/.axt$//' > ../run.3/chrom.list cd ../run.3 # create template for cluster job cat << '_EOF_' > gsub #LOOP doBestAxt $(root1) {check in line+ /cluster/bluearc/hg16/bed/blastz.gg0/axtChrom/$(root1).axt} {check out line+ /cluster/data/hg16/bed/blastz.gg0/axtBest/$(root1).axt} {check out line+ /cluster/data/hg16/bed/blastz.gg0/pslBest/$(root1)_blastzBestGg0.psl} #ENDLOOP '_EOF_' # << this line makes emacs coloring happy gensub2 chrom.list single gsub jobList wc -l jobList head jobList cd /cluster/data/hg16/bed/blastz.gg0 cd run.3 para create jobList para try para check para push #Checking finished jobs #Completed: 39 of 39 jobs #CPU time in finished jobs: 1111s 18.52m 0.31h 0.01d 0.000 y #IO & Wait Time: 7775s 129.58m 2.16h 0.09d 0.000 y #Average job time: 228s 3.80m 0.06h 0.00d #Longest job: 1375s 22.92m 0.38h 0.02d #Submission to last job: 1375s 22.92m 0.38h 0.02d # create human/chicken mafs cd /cluster/data/hg16/bed/blastz.gg0 mkdir maf foreach file (axtBest/*.axt) set root=$file:t:r echo $root /cluster/bin/i386/axtToMaf $file S1.len S2.len maf/${root}.maf.unfixed -tPrefix=hg16. -qPrefix=gg0. /cluster/bin/scripts/fixmaf.pl < maf/${root}.maf.unfixed > maf/${root}.maf end # MULTIZ HUMAN/MOUSE/RAT/CHICKEN (Finished, Adam, 11/3) # (chicken added to human/mouse/rat alignments described above [HUMOR]) ssh kk mkdir /cluster/data/hg16/bed/multiz.hg16mm3rn3gg0 cd /cluster/data/hg16/bed/multiz.hg16mm3rn3gg0 mkdir hmrc # wrapper script for multiz cat << EOF > mz #!/bin/csh /cluster/bin/penn/tbaBin/multiz \$1 \$2 - > \$3 EOF chmod +x mz # put the MAFs on bluearc ssh eieio mkdir -p /cluster/bluearc/multiz.hg16mm3rn3gg0/hmr mkdir -p /cluster/bluearc/multiz.hg16mm3rn3gg0/hc cp /cluster/data/hg16/bed/humor.2003-09-08/hmr/*.maf /cluster/bluearc/multiz.hg16mm3rn3gg0/hmr cp /cluster/data/hg16/bed/blastz.gg0/maf/*.maf /cluster/bluearc/multiz.hg16mm3rn3gg0/hc logout # back to kk # set up joblist rm -f jobList foreach file (/cluster/bluearc/multiz.hg16mm3rn3gg0/hmr/*.maf) set root=`echo $file:t:r | sed 's/\.hmr//'` echo "/cluster/data/hg16/bed/multiz.hg16mm3rn3gg0/mz /cluster/bluearc/multiz.hg16mm3rn3gg0/hc/${root}.maf $file /cluster/data/hg16/bed/multiz.hg16mm3rn3gg0/hmrc/${root}.maf" >> jobList end # (run on cluster) 41 jobs, ~10 min # FIXME: maybe should run on the common denominator of the two # sets, then copy over remaining MAFs (?) In this case, copied # chr8_random and chr18_random from hmr # clean up bluearc (these are big files!) 
rm -r /cluster/bluearc/multiz.hg16mm3rn3gg0
# setup external files for database reference
ssh hgwdev
mkdir -p /gbdb/hg16/multizMm3Rn3Gg0
ln -s /cluster/data/hg16/bed/multiz.hg16mm3rn3gg0/hmrc/*.maf /gbdb/hg16/multizMm3Rn3Gg0
# load into database
cd /gbdb/hg16/multizMm3Rn3Gg0
/cluster/bin/i386/hgLoadMaf -warn hg16 multizMm3Rn3Gg0
# add dummy entry to dbDb so that name shows up as "Chicken"
echo 'insert into dbDb (name, description, nibPath, organism, defaultPos, active, orderKey, genome, scientificName) values ("gg0", "November 2003", "", "Chicken", "", 0, 0, "Chicken", "Gallus gallus");' | hgsql -h genome-testdb hgcentraltest
# BLASTZ Mm4 (DONE - 2003-10-31 - Hiram)
ssh kk
mkdir -p /cluster/data/hg16/bed/blastz.mm4.2003-10-29
cd /cluster/data/hg16/bed
ln -s blastz.mm4.2003-10-29 blastz.mm4
cd blastz.mm4
cat << '_EOF_' > DEF
# human vs. mouse
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/home/angie/schwartzbin:/cluster/home/kent/bin/i386
ALIGN=blastz-run
BLASTZ=blastz
BLASTZ_H=2000
BLASTZ_ABRIDGE_REPEATS=1
# TARGET
# Human
SEQ1_DIR=/iscratch/i/gs.17/build34/bothMaskedNibs
# RMSK not currently used
SEQ1_RMSK=/iscratch/i/gs.17/build34/rmsk
# FLAG not currently used
SEQ1_FLAG=-primate
SEQ1_SMSK=/iscratch/i/gs.17/build34/linSpecRep.notInMouse
SEQ1_IN_CONTIGS=0
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY
# Mouse
SEQ2_DIR=/scratch/mus/mm4/softNib
# RMSK not currently used
SEQ2_RMSK=/scratch/mus/mm4/rmsk
# FLAG not currently used
SEQ2_FLAG=-rodent
SEQ2_SMSK=/scratch/mus/mm4/linSpecRep.notInHuman
SEQ2_IN_CONTIGS=0
SEQ2_CHUNK=30000000
SEQ2_LAP=0
BASE=/cluster/data/hg16/bed/blastz.mm4
DEF=$BASE/DEF
RAW=$BASE/raw
CDBDIR=$BASE
SEQ1_LEN=$BASE/S1.len
SEQ2_LEN=$BASE/S2.len
'_EOF_'
# << this line keeps emacs coloring happy
# prepare first cluster run
ssh kk
cd /cluster/data/hg16/bed/blastz.mm4
source DEF
/cluster/data/mm4/jkStuff/BlastZ_run0.sh
cd run.0
para try, check, push, check, ....
# Completed: 43390 of 43392 jobs
# Crashed: 2 jobs
# CPU time in finished jobs: 15770466s 262841.10m 4380.69h 182.53d 0.500 y
# IO & Wait Time: 626227s 10437.11m 173.95h 7.25d 0.020 y
# Average job time: 378s 6.30m 0.10h 0.00d
# Longest job: 8052s 134.20m 2.24h 0.09d
# Submission to last job: 45886s 764.77m 12.75h 0.53d
# the two crashed jobs:
# /cluster/home/angie/schwartzbin/blastz-run chr10.nib 40000001 50010000 chrX.nib 120000001 150000000 /cluster/data/hg16/bed/blastz.mm4/DEF
# blastz: Illegal character '@' in sequence file.
# /cluster/home/angie/schwartzbin/blastz-run chr18.nib 1 10010000 chr15.nib 60000001 90000000 /cluster/data/hg16/bed/blastz.mm4/DEF
# seq_read(/tmp/blastz.zstcGa/s1.fa): Input/output error
# unusual errors. Simply try them again and they work
# Second cluster run to convert the .out's to .lav's
# You do NOT want to run this on the big cluster. It brings
# the file server to its knees. Run this on the small cluster.
ssh kkr1u00
cd /cluster/data/hg16/bed/blastz.mm4
source DEF
/cluster/data/mm4/jkStuff/BlastZ_run1.sh
cd run.1
para try, check, push, etc ...
# Completed: 339 of 339 jobs
# CPU time in finished jobs: 15434s 257.23m 4.29h 0.18d 0.000 y
# IO & Wait Time: 2393s 39.89m 0.66h 0.03d 0.000 y
# Average job time: 53s 0.88m 0.01h 0.00d
# Longest job: 1128s 18.80m 0.31h 0.01d
# Submission to last job: 2561s 42.68m 0.71h 0.03d
# Third cluster run to convert lav's to axt's
source DEF
cd /cluster/data/hg16/bed/blastz.mm4
/cluster/data/mm4/jkStuff/BlastZ_run2.sh
cd run.2
para try, check, push, etc ...
# Completed: 38 of 42 jobs # Crashed: 4 jobs # CPU time in finished jobs: 1826s 30.44m 0.51h 0.02d 0.000 y # IO & Wait Time: 9781s 163.01m 2.72h 0.11d 0.000 y # Average job time: 305s 5.09m 0.08h 0.00d # Longest job: 1489s 24.82m 0.41h 0.02d # Submission to last job: 5125s 85.42m 1.42h 0.06d # FAILED: chr1, chr19, chr19_random, chr5 # try these on kolossus ssh kolossus cd /cluster/data/hg16/bed/blastz.mm4/run.2 /cluster/data/mm4/jkStuff/x86_64-chromlav2axt \ /cluster/data/hg16/bed/blastz.mm4/lav/chr1 \ /cluster/data/hg16/bed/blastz.mm4/axtChrom/chr1.axt \ /cluster/data/hg16/nib /cluster/data/mm4/nib /cluster/data/mm4/jkStuff/x86_64-chromlav2axt \ /cluster/data/hg16/bed/blastz.mm4/lav/chr19 \ /cluster/data/hg16/bed/blastz.mm4/axtChrom/chr19.axt \ /cluster/data/hg16/nib /cluster/data/mm4/nib /cluster/data/mm4/jkStuff/x86_64-chromlav2axt \ /cluster/data/hg16/bed/blastz.mm4/lav/chr19_random \ /cluster/data/hg16/bed/blastz.mm4/axtChrom/chr19_random.axt \ /cluster/data/hg16/nib /cluster/data/mm4/nib /cluster/data/mm4/jkStuff/x86_64-chromlav2axt \ /cluster/data/hg16/bed/blastz.mm4/lav/chr5 \ /cluster/data/hg16/bed/blastz.mm4/axtChrom/chr5.axt \ /cluster/data/hg16/nib /cluster/data/mm4/nib # about 26 minutes total time for those four # chr19_random.axt is still empty, remove it to avoid errors later # translate sorted axt files into psl ssh eieio cd /cluster/data/hg16/bed/blastz.mm4 mkdir -p pslChrom set tbl = "blastzMm4" foreach f (axtChrom/chr*.axt) set c=$f:t:r echo "Processing chr $c" /cluster/bin/i386/axtToPsl $f S1.len S2.len pslChrom/${c}_${tbl}.psl end # That takes about 30 minutes # Load database tables ssh hgwdev cd /cluster/data/hg16/bed/blastz.mm4/pslChrom /cluster/bin/i386/hgLoadPsl -noTNameIx hg16 chr*_blastzMm4.psl # this is a 55 minute job # featureBits on blastzMm3 or 4 will not work on hgwdev, runs out of # memory. But if you reset your ~/.hg.conf to use the read-only # user and contact the hgwdev host, and build featureBits as a # x86_64 binary, you can run it on kolossus: # featureBits hg16 blastzMm3 # 1050190071 bases of 2865248791 (36.653%) in intersection # featureBits hg16 blastzMm4 # 1056761609 bases of 2865248791 (36.882%) in intersection # CHAIN Mm4 BLASTZ (DONE - 2003-11-03 - Hiram) # The axtChain is best run on the small kluster, or the kk9 kluster # in this case, it was run on the kk kluster ssh kkr1u00 mkdir -p /cluster/data/hg16/bed/blastz.mm4/axtChain/run1 cd /cluster/data/hg16/bed/blastz.mm4/axtChain/run1 mkdir out chain ls -1S /cluster/data/hg16/bed/blastz.mm4/axtChrom/*.axt > input.lst cat << '_EOF_' > gsub #LOOP doChain {check in exists $(path1)} {check out line+ chain/$(root1).chain} {check out line+ out/$(root1).out} #ENDLOOP '_EOF_' # << this line makes emacs coloring happy cat << '_EOF_' > doChain #!/bin/csh axtFilter -notQ_random $1 | axtChain stdin \ /iscratch/i/gs.17/build34/bothMaskedNibs \ /iscratch/i/mm4/softNib $2 > $3 '_EOF_' # << this line makes emacs coloring happy chmod a+x doChain # 41 jobs gensub2 input.lst single gsub jobList para create jobList para try para push # ... etc ... 
# Completed: 41 of 41 jobs # CPU time in finished jobs: 24547s 409.12m 6.82h 0.28d 0.001 y # IO & Wait Time: 3955s 65.91m 1.10h 0.05d 0.000 y # Average job time: 695s 11.59m 0.19h 0.01d # Longest job: 7336s 122.27m 2.04h 0.08d # Submission to last job: 8251s 137.52m 2.29h 0.10d # now on the file server, sort chains ssh eieio cd /cluster/data/hg16/bed/blastz.mm4/axtChain time chainMergeSort run1/chain/*.chain > all.chain # real 10m5.525s # user 8m9.350s # sys 0m48.450s time chainSplit chain all.chain # real 10m23.201s # user 7m51.930s # sys 0m53.910s # these steps take ~20 minutes # optionally: rm run1/chain/*.chain # Load chains into database # next machine ssh hgwdev cd /cluster/data/hg16/bed/blastz.mm4/axtChain/chain foreach i (*.chain) set c = $i:r hgLoadChain hg16 ${c}_chainMm4 $i echo done $c end # NET Mm4 (DONE - 2003-11-03 - Hiram) ssh eieio cd /cluster/data/hg16/bed/blastz.mm4/axtChain mkdir preNet cd chain foreach i (*.chain) echo preNetting $i /cluster/bin/i386/chainPreNet $i /cluster/data/hg16/chrom.sizes \ /cluster/data/mm4/chrom.sizes ../preNet/$i end # real 11m58.018s # user 4m10.390s # sys 2m10.780s cd .. mkdir n1 cd preNet foreach i (*.chain) set n = $i:r.net echo primary netting $i /cluster/bin/i386/chainNet $i -minSpace=1 /cluster/data/hg16/chrom.sizes \ /cluster/data/mm4/chrom.sizes ../n1/$n /dev/null end cd .. cat n1/*.net | /cluster/bin/i386/netSyntenic stdin hNoClass.net # memory usage 2505211904, utime 15891 s/100, stime 3245 ssh hgwdev cd /cluster/data/hg16/bed/blastz.mm4/axtChain time netClass hNoClass.net hg16 mm4 mouse.net \ -tNewR=/cluster/bluearc/scratch/hg/gs.17/build34/linSpecRep.notInMouse \ -qNewR=/cluster/bluearc/scratch/mus/mm4/linSpecRep.notInHuman # real 14m2.042s # user 10m6.450s # sys 1m46.950s # If things look good do ssh eieio cd /cluster/data/hg16/bed/blastz.mm4/axtChain rm -r n1 hNoClass.net # Make a 'syntenic' subset of these with netFilter -syn mouse.net > mouseSyn.net # real 9m44.445s # user 6m42.660s # sys 1m10.100s # Load the nets into database ssh hgwdev cd /cluster/data/hg16/bed/blastz.mm4/axtChain netFilter -minGap=10 mouse.net | hgLoadNet hg16 netMm4 stdin netFilter -minGap=10 mouseSyn.net | hgLoadNet hg16 syntenyNetMm4 stdin # real 12m53.070s # user 6m6.540s # sys 0m50.580s # check results # featureBits hg16 netMm4 # 2823565051 bases of 2865248791 (98.545%) in intersection # featureBits hg16 netMm3 # 2834484276 bases of 2865248791 (98.926%) in intersection # featureBits hg16 syntenyNetMm3 # 2804467412 bases of 2865248791 (97.879%) in intersection # featureBits hg16 syntenyNetMm4 # 2786960572 bases of 2865248791 (97.268%) in intersection # Add entries for net and chain to mouse/hg16 trackDb # make net ssh eieio cd /cluster/data/hg16/bed/blastz.mm4/axtChain mkdir mouseNet time netSplit mouse.net mouseNet # real 10m44.479s # user 6m43.680s # sys 1m20.860s mkdir ../axtNet foreach n (mouseNet/chr*.net) set c=$n:t:r echo "netToAxt: $c.net -> $c.axt" rm -f ../axtNet/$c.axt netToAxt mouseNet/$c.net chain/$c.chain \ /cluster/data/hg16/nib \ /cluster/data/mm4/nib \ ../axtNet/$c.axt echo "Complete: $c.net -> axtNet/$c.axt" end ssh hgwdev mkdir -p /cluster/data/hg16/bed/blastz.mm4/axtBest cd /cluster/data/hg16/bed/blastz.mm4/axtBest ln -s ../axtNet/chr*.axt . 
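# (hypothetical aside, not from the recorded build) empty per-chromosome
# axt files have caused downstream errors elsewhere in this build; a
# quick scan before copying to the download area:
# find ../axtNet -name 'chr*.axt' -size 0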
# copy net axt's to download area
ssh hgwdev
cd /cluster/data/hg16/bed/blastz.mm4/axtNet
mkdir -p /usr/local/apache/htdocs/goldenPath/hg16/vsMm4/axtNet
cp -p *.axt /usr/local/apache/htdocs/goldenPath/hg16/vsMm4/axtNet
cd /usr/local/apache/htdocs/goldenPath/hg16/vsMm4/axtNet
gzip *.axt
# add README.txt file to dir, if needed
# Convert those axt files to psl
ssh eieio
cd /cluster/data/hg16/bed/blastz.mm4
mkdir pslBest
foreach a (axtBest/chr*.axt)
  set c=$a:t:r
  echo "processing $c.axt -> ${c}_blastzBestMm4.psl"
  /cluster/bin/i386/axtToPsl axtBest/${c}.axt \
    S1.len S2.len pslBest/${c}_blastzBestMm4.psl
  echo "Done: ${c}_blastzBestMm4.psl"
end
# Load tables
ssh hgwdev
cd /cluster/data/hg16/bed/blastz.mm4/pslBest
time /cluster/bin/i386/hgLoadPsl -noTNameIx hg16 chr*_blastzBestMm4.psl
# real 10m47.853s
# user 2m48.700s
# sys 0m24.250s
# check results
# featureBits hg16 blastzBestMm4
# 996722004 bases of 2865248791 (34.787%) in intersection
# featureBits hg16 blastzBestMm3
# 1007362800 bases of 2865248791 (35.158%) in intersection
# Make /gbdb links and add them to the axtInfo table:
mkdir -p /gbdb/hg16/axtBestMm4
cd /gbdb/hg16/axtBestMm4
ln -s /cluster/data/hg16/bed/blastz.mm4/axtNet/chr*.axt .
cd /cluster/data/hg16/bed/blastz.mm4/axtNet
rm -f axtInfoInserts.sql
foreach f (/gbdb/hg16/axtBestMm4/chr*.axt)
  set chr=$f:t:r
  echo "INSERT INTO axtInfo (species, alignment, chrom, fileName) \
    VALUES ('mm4','Blastz Best in Genome','$chr','$f');" \
    >> axtInfoInserts.sql
end
hgsql hg16 < ~/kent/src/hg/lib/axtInfo.sql
# table axtInfo may already exist, ignore create error.
hgsql hg16 < axtInfoInserts.sql
# MAKING THE AXTTIGHT FROM AXTBEST (DONE - 2003-11-04 - Hiram)
# After creating axtBest alignments above, use subsetAxt to get axtTight:
ssh eieio
cd /cluster/data/hg16/bed/blastz.mm4/axtNet
mkdir ../axtTight
tcsh
foreach i (*.axt)
  echo subsetAxt $i ../axtTight/$i
  subsetAxt $i ../axtTight/$i \
    ~kent/src/hg/mouseStuff/subsetAxt/coding.mat 3400
end
# translate to psl
cd ../axtTight
mkdir ../pslTight
foreach i (*.axt)
  set c = $i:r
  axtToPsl $i ../S1.len ../S2.len ../pslTight/${c}_blastzTightMm4.psl
  echo "Done: $i"
end
# Load tables into database
ssh hgwdev
cd /cluster/data/hg16/bed/blastz.mm4/pslTight
hgLoadPsl -noTNameIx hg16 chr*_blastzTightMm4.psl
# check results
# featureBits hg16 blastzTightMm4
# 162641577 bases of 2865248791 (5.676%) in intersection
# featureBits hg16 blastzTightMm3
# 164148288 bases of 2865248791 (5.729%) in intersection
# copy the axt's to download area
cd /cluster/data/hg16/bed/blastz.mm4/axtTight
mkdir -p /usr/local/apache/htdocs/goldenPath/hg16/vsMm4/axtTight
cp -p *.axt /usr/local/apache/htdocs/goldenPath/hg16/vsMm4/axtTight
cd /usr/local/apache/htdocs/goldenPath/hg16/vsMm4/axtTight
gzip *.axt
# add README.txt file to dir, if needed
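# (hypothetical aside) a coarse size comparison of the tight subset
# against the net alignments it was derived from:
# du -sh /cluster/data/hg16/bed/blastz.mm4/axtNet \
#   /cluster/data/hg16/bed/blastz.mm4/axtTight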
# RUNNING AXTBEST (DONE 12/2/03 angie)
# Penn State complained of a loss in coverage when using axtNet instead
# of axtBest. So run axtBest for them, and axtToMaf in prep for multiz.
ssh eieio
cd /cluster/data/hg16/bed/blastz.mm4.2003-10-29
# I removed links from axtBest/* to axtNet/*
foreach f (axtChrom/chr*.axt)
  set chr=$f:t:r
  echo axtBesting $chr
  axtBest $f $chr axtBest/$chr.axt -minScore=300
end
# As usual, ran out of mem on chr19, so use kolossus & 2 passes:
ssh kolossus
cd /cluster/data/hg16/bed/blastz.mm4.2003-10-29
set chr = chr19
foreach d (lav/$chr/*.lav)
  set smallout=$d.axt
  lavToAxt $d /cluster/data/hg16/nib /cluster/data/mm4/nib stdout \
    | axtSort stdin $smallout
  axtBest $smallout $chr $smallout:r.axtBest
end
cat `ls -1 lav/$chr/*.axtBest | sort -g` \
  > lav/$chr/$chr.axtBestPieces
axtBest lav/$chr/$chr.axtBestPieces $chr axtBest/$chr.axt
rm lav/$chr/*.axt*
# MAKE MAF FROM AXTBEST FOR PENN STATE (DONE 12/2/03 angie)
ssh eieio
cd /cluster/data/hg16/bed/blastz.mm4.2003-10-29
mkdir mafBest
foreach f (axtBest/chr*.axt)
  set maf = mafBest/$f:t:r.hm.maf
  echo translating $f to $maf
  axtToMaf $f \
    /cluster/data/hg16/chrom.sizes /cluster/data/mm4/chrom.sizes \
    $maf -tPrefix=hg16. -qPrefix=mm4.
end
# MAKING MOUSE MM4 SYNTENY (DONE 2003-11-05 - Hiram)
ssh hgwdev
mkdir -p /cluster/data/hg16/bed/syntenyMm4
cd /cluster/data/hg16/bed/syntenyMm4
# updating the scripts in use here from
# /cluster/data/hg16/bed/syntenyMm3
cp -p /cluster/data/hg16/bed/syntenyMm3/*.pl .
# fix the syntenicBest script to not try and work on empty
# results from its queries. Also, set the db and table name
# in the script itself so the arguments are not needed
./syntenicBest.pl
# on the order of 3 to 4 hours to complete syntenicBest
# almost no time, or only a few minutes at most for any of
# the rest
../syntenyMm3/smooth.pl
../syntenyMm3/joinsmallgaps.pl
# set db and table name in fillgap.pl
./fillgap.pl
../syntenyMm3/synteny2bed.pl
hgLoadBed hg16 syntenyMm4 ucsc100k.bed
# featureBits hg16 syntenyMm3
# 2651945520 bases of 2865248791 (92.556%) in intersection
# featureBits hg16 syntenyMm4
# 2560252977 bases of 2865248791 (89.355%) in intersection
# hgTracks.c needed to be updated to recognize syntenyMm4 so it
# would color properly.
# TIGR GENE INDEX (DONE 2004-05-20 Fan)
mkdir -p /cluster/data/hg16/bed/tigr
cd /cluster/data/hg16/bed/tigr
wget ftp://ftp.tigr.org/pub/data/tgi/Homo_sapiens/TGI_track_HumanGenome_hg16_05-2004.tgz
tar xvzf TGI*.tgz
foreach f (*cattle*)
  set f1 = `echo $f | sed -e 's/cattle/cow/g'`
  mv $f $f1
end
foreach o (mouse cow human pig rat)
  echo $o
  setenv O $o
  foreach f (chr*_$o*s)
    tail +2 $f | perl -wpe 's/THC/TC/; s/(TH?C\d+)/$ENV{O}_$1/;' > $f.gff
  end
end
ssh hgwdev
cd /cluster/data/hg16/bed/tigr
hgsql hg16 -e "drop table tigrGeneIndex"
hgsql hg16 < ~/kent/src/hg/lib/tigrGeneIndex.sql
foreach f (*.gff)
  echo Processing $f ...
  /cluster/home/fanhsu/bin/i386/ldHgGene -oldTable -exon=TC hg16 tigrGeneIndex $f
  hgsql hg16 -e "select count(*) from tigrGeneIndex"
end
# Total of 354491 entries created in tigrGeneIndex table.
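# (hypothetical aside, not from the recorded build) after the
# cdsStart/cdsEnd updates below, this should report 0:
# hgsql hg16 -N -e 'select count(*) from tigrGeneIndex where cdsStart != txStart'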
hgsql hg16 -e "update tigrGeneIndex set cdsStart = txStart;"
hgsql hg16 -e "update tigrGeneIndex set cdsEnd = txEnd;"
checkTableCoords hg16 tigrGeneIndex
gzip *.gff *TCs
# LOAD VEGA GENES AND PSEUDOGENES (DONE 2003-11-11 braney)
#####
##### WARNING: vega procedure changed, use process later in file
#####
mkdir ~/hg16/bed/vega
cd ~/hg16/bed/vega
wget "http://www.sanger.ac.uk/Users/keenan/vega_homo_sapiens_core_4_0.gtf.gz"
gunzip vega_homo_sapiens_core_4_0.gtf.gz
# Load genes and Immunoglobulin/Pseudogenes into 2 separate tracks
awk '$2 != "Pseudogene" && $2 != "Ig_Pseudogene_Segment" && $2 != "Ig_Segment" {print "chr"$0}' \
  vega_homo_sapiens_core_4_0.gtf > vega_fixed.gtf
awk '$2 == "Pseudogene" || $2 == "Ig_Pseudogene_Segment" || $2 == "Ig_Segment" {print "chr"$0}' \
  vega_homo_sapiens_core_4_0.gtf > vega_pseudo.gtf
ldHgGene hg16 vegaGene vega_fixed.gtf -gtf
ldHgGene hg16 vegaPseudoGene vega_pseudo.gtf -gtf
wget "http://www.sanger.ac.uk/Users/keenan/vega_pep_dump_ncbi34.fa.gz"
hgPepPred hg16 generic vegaPep vega_pep_dump_ncbi34.fa
vegaBuildInfo vega_homo_sapiens_core_4_0.gtf vegaInfo.tab
hgsql hg16 < ~/kent/src/hg/lib/vegaInfo.sql
echo "load data local infile 'vegaInfo.tab' into table vegaInfo" | hgsql hg16
# Set cdsStart and cdsEnd to 0 if method is Novel_Transcript
foreach ntname (`echo 'select name from vegaGene,vegaInfo \
  where vegaGene.name = vegaInfo.transcriptId AND \
  vegaInfo.method = "Novel_Transcript"' \
  | hgsql -N hg16`)
  echo "update vegaGene set cdsStart = 0 where name = '$ntname'" \
    | hgsql hg16
  echo "update vegaGene set cdsEnd = 0 where name = '$ntname'" \
    | hgsql hg16
end
# LOAD FIRSTEF TRACK (DONE 2003-07-31 braney)
# Create firstEF track from Zhang lab at CSHL
# contacts
# Gengxin Chen
# Ivo Grosse
# Michael Zhang
mkdir /cluster/data/hg16/bed/firstEF
cd /cluster/data/hg16/bed/firstEF
# Got firstEF.txt from Gengxin 7/30/03
hgLoadBed hg16 firstEF firstEF.txt
# Load chicken sequence loaded & processed by booch & acs (2003-11-4 kate)
hgLoadSeq hg16 /gbdb/gg0/chicken.fa
# 73234 sequences
# LOAD ENSEMBL GENES (DONE 2003-11-07 angie)
mkdir /cluster/data/hg16/bed/ensembl
cd /cluster/data/hg16/bed/ensembl
# Get the ensembl protein data from
# http://www.ensembl.org/Homo_sapiens/martview
# Follow this sequence through the pages:
# Page 1) Make sure that the Homo_sapiens choice is selected. Hit next.
# Page 2) Uncheck the "Limit to" box in the region choice. Then hit next.
# Page 3) Choose the "Structures" box.
# Page 4) Choose GTF as the output. Choose gzip compression. Hit export.
# Save as ensemblGene.gtf.gz
# Ensembl handles random chromosomes differently than us, so we
# strip this data. Fortunately it just loses a couple of genes.
# Add "chr" to front of each line in the gene data gtf file to make
# it compatible with our software.
# Finally, get rid of the ".1" or ".2" after the name
gunzip -c ensemblGene.gtf.gz \
  | grep -v ^6_DR51 \
  | grep -v ^DR51 \
  | grep -v _NT_ \
  | perl -wpe 's/^([0-9]|X|Y|Un)/chr$1/ \
    || die "Line $. doesnt start with human chrom:\n$_"' \
  | sed -e 's/\..\"/\"/g' \
  > ensGene.gtf
ssh hgwdev
/cluster/bin/i386/ldHgGene hg16 ensGene \
  /cluster/data/hg16/bed/ensembl/ensGene.gtf
# ensGtp associates geneId/transcriptId/proteinId for hgPepPred and
# hgKnownToSuper. Use ensMart to create it as above, except:
# Page 3) Choose the "Features" box. In "Ensembl Attributes", check
# Ensembl Gene ID, Ensembl Transcript ID, Ensembl Peptide ID.
# Choose Text, tab-separated as the output format. Result name ensGtp.
# Save file as ensGtp.txt.gz
gunzip ensGtp.txt.gz
hgsql hg16 < ~/kent/src/hg/lib/ensGtp.sql
echo "load data local infile 'ensGtp.txt' into table ensGtp" | hgsql -N hg16
# Load Ensembl peptides:
# Get them from ensembl as above in the gene section except for
# Page 3) Choose the "Sequences" box.
# Page 4) Transcripts/Proteins. Peptide. Format = FASTA.
# Save file as ensemblPep.fa.gz
gunzip ensemblPep.fa.gz
hgPepPred hg16 ensembl ensemblPep.fa
# LOAD GENOMIC DUPES (DONE - 2003-11-11 - Hiram)
o - Load genomic dupes
ssh hgwdev
mkdir /cluster/data/hg16/bed/genomicDups
cd /cluster/data/hg16/bed/genomicDups
# pick up Build34GenomicDups.gz from
# http://humanparalogy.cwru.edu/build34/files_for_ucsc/build34_ucsc.htm
# it has a user and password login. you can use this wget command
# with the user/password:
wget --http-user=X --http-passwd=X \
  "http://humanparalogy.cwru.edu/build34/files_for_ucsc/Build34GenomicDups.gz"
gunzip *.gz
# awk -f filter.awk oo33_dups_for_kent > genomicDups.bed
hgsql hg16 < ~/kent/src/hg/lib/genomicDups.sql
hgLoadBed hg16 -oldTable genomicDups Build34GenomicDups
# load of genomicDups did not go as planned:
# 57702 record(s), 0 row(s) skipped, 57702 warning(s) loading bed.tab
# There was an error in this data delivery. To fixup:
hgsql -e \
  'update genomicDups set name = concat(otherChrom,":",otherStart);' \
  hg16
# LOAD CHIMP NET (2003-11-20 kate)
# NOTE: Net preparation doc'ed in makePt0.doc
ssh hgwdev
cd /cluster/data/pt0/bed/blastz.hg16/axtChain
netFilter -minGap=10 chimp.net | hgLoadNet hg16 netPt0 stdin
netFilter -minGap=10 chimpSyn.net | hgLoadNet hg16 syntenyNetPt0 stdin
# CHIMP BEST CHAINS, IN CHROMOSOME COORDINATES (2004-02-25 kate)
# NOTE: start with scaffold-based human-reference reciprocal best chains
# doc'ed in makePt0.doc, then lift using scaffold lift file in panTro1
# NOTE NOTE NOTE: Angie redid this with chain renumbering
ssh kksilo
mkdir -p /cluster/data/hg16/bed/blastz-blat.panTro1
cd /cluster/data/hg16/bed/blastz-blat.panTro1
liftUp -chainQ best.chain \
  /cluster/data/panTro1/jkStuff/scaffolds.lft \
  warn /cluster/data/pt0/bed/blastz-blatHg16/human.best.chain
chainSplit bestChain best.chain
# Load chains into database
ssh hgwdev
cd /cluster/data/hg16/bed/blastz-blat.panTro1/bestChain
foreach i (*.chain)
  set c = $i:r
  echo loading $c
  hgLoadChain hg16 ${c}_bestChainPanTro1 $i
end
# CHIMP ALL CHAINS, IN CHROMOSOME COORDINATES (2004-02-25 kate)
ssh kksilo
cd /cluster/data/hg16/bed/blastz-blat.panTro1
liftUp -chainQ all.chain \
  /cluster/data/panTro1/jkStuff/scaffolds.lft \
  warn /cluster/data/pt0/bed/blastz-blatHg16/all.chain
chainSplit chain all.chain
ssh hgwdev
cd /cluster/data/hg16/bed/blastz-blat.panTro1/chain
foreach i (*.chain)
  set c = $i:r
  hgLoadChain hg16 ${c}_chainPanTro1 $i
  echo done $c
end
# CHIMP RECIPROCAL BEST NET, IN CHROMOSOME COORDS (kate)
# Redo the netting on chrom-based chain files
ssh kolossus
cd /cluster/data/hg16/bed/blastz-blat.panTro1
~/bin/x86_64/chainNet all.chain -minSpace=10 \
  /cluster/data/hg16/chrom.sizes /cluster/data/panTro1/chrom.sizes \
  human.net chimp.net
ssh kksilo
cd /cluster/data/hg16/bed/blastz-blat.panTro1
chainSwap all.chain all.swap.chain
~/bin/i386/netChainSubset chimp.net all.swap.chain stdout | \
  chainSort stdin chimpNet.chain
# UPDATE WOODY BINARIES (see PHYLOHMM CONSERVATION entries below)
# done, acs, 2003-11-19
ssh hgwdev
cd /cluster/data/woody
# better place? don't have permission in /cluster/install
cvs update -dP
cd src
make
# make sure Makefile has INSTALLDIR = /cluster/bin/woody
make install
# CFTR PHYLOHMM CONSERVATION
# done, acs, 2003-11-19 (currently using 9-way alignment)
# NOTE: essentially the same procedure applies for any Zoo or ENCODE
# target, as long as a suitable tree topology is available for the
# species in question (when distant species are included, e.g.,
# chicken and fish, the branch-length estimation procedure may need to
# be adapted slightly -- details to come)
ssh hgwdev
# (update woody binaries, if necessary -- see above)
# make sure /cluster/bin/penn/tbaBin and /cluster/bin/woody in path
mkdir -p /cluster/data/nisc/targets/cftr/phyloHMMcons
cd /cluster/data/nisc/targets/cftr/phyloHMMcons
# extract sufficient stats for phylog. analysis from MAF file
CFTR_START=115365025    # boundaries of CFTR region in hg16 coords
CFTR_END=117242450      # (these don't have to be perfect)
maf_project /cluster/data/nisc/targets/cftr/tba9way.maf /cluster/data/nisc/targets/cftr/tba9Mammal/human > cftr9_humanref.maf
msa_view cftr9_humanref.maf -i MAF -o SS -s ${CFTR_START} -e ${CFTR_END} -r 1 -O hg16,chimp,baboon,mm3,rn3,cow,pig,cat,dog > cftr9.ss
head cftr9.ss
#NSEQS = 9
#LENGTH = 2063003
#TUPLE_SIZE = 1
#NTUPLES = 57302
#NAMES = hg16,chimp,baboon,mm3,rn3,cow,pig,cat,dog
#ALPHABET = ACGTN
#IDX_OFFSET = 115365024
#NCATS = -1
#
#0 C-------- 26480
# fit a phylogenetic model to the data, with rate variation
echo "((((1,2),3),(4,5)),((6,7),(8,9)))" > cftr9.nh
# (indexes refer to sequences in the order of the NAMES line in
# the *.ss file)
fit_tree_model -m cftr9.ss -i SS -t cftr9.nh -s REV -o cftr9_rev -E -l fit.log -k 5 -a 4.8 -T -p MED
# (takes about 5 min. Watch log file for convergence --
# single lines correspond to outer maximization algorithm,
# interleaved sets of lines correspond to inner maximization
# algorithms [see http://www.cse.ucsc.edu/~acs/Siepel-03-0304.pdf
# for background])
# Note: k=5 is adequate for a good estimate of the alpha
# parameter, even though we'll use k=10 in the next step. The -a
# argument just provides a reasonable starting value for alpha, to
# speed convergence
# (check estimated branch lengths to be sure they make sense)
cat cftr9_rev.nh
#((((hg16:0.005601,chimp:0.005707):0.019356,baboon:0.034458):0.080743,(mm3:0.072487,rn3:0.079445):0.287368):0.035643,((cow:0.107791,pig:0.102431):0.040419,(cat:0.074444,dog:0.104476):0.053251):0.035643);
# (small deviations from one data set to the next are normal)
# you can also do "draw_tree cftr9_rev.nh > cftr9_rev.ps" to get a
# simple postscript rendering of the tree.
Zero- or # near-zero-length branches usually indicate a problem, e.g., # incorrect topology # (also check cftr9_rev.mod; look in particular at ALPHA) cat cftr9_rev.mod #ALPHABET: A C G T #ORDER: 0 #SUBST_MOD: REV #NRATECATS: 5 #ALPHA: 4.778715 #TRAINING_LNL: -6471907.615171 #BACKGROUND: 0.304536 0.191156 0.191907 0.312401 #RATE_MAT: # -0.848833 0.150792 0.552489 0.145552 # 0.240232 -1.259134 0.166198 0.852704 # 0.876738 0.165547 -1.285792 0.243507 # 0.141887 0.521764 0.149586 -0.813238 #TREE: ((((1:0.005601,2:0.005707):0.019356,3:0.034458):0.080743,(4:0.072487,5:0.079445):0.287368):0.035643,((6:0.107791,7:0.102431):0.040419,(8:0.074444,9:0.104476):0.053251):0.035643); # now compute the posterior probabilities of interest, according to # a phylo-HMM label -m cftr9.ss -d cftr9_rev.mod -i SS -o cftr9 -k 10 -L 0.9 -A -p 0 -j 1 -x -s chr7 # (takes 12 min) # (check postprob file) wc cftr9.postprob #1752168 3504336 31539024 cftr9.postprob head cftr9.postprob #115370785 0.0664 #115370786 0.0583 #115370787 0.0448 #115370788 0.0271 #115370789 0.0217 #115370790 0.0232 #115370791 0.0331 #115370792 0.0396 #115370793 0.0417 #115370794 0.0557 # load as a (Hiramesque) wiggle track cd /cluster/data/nisc/targets/cftr/phyloHMMcons zcat cftr9.postprob.gz | wigAsciiToBinary -chrom=chr7 -binsize=1024 \ -dataSpan=1 -wibFile=chr7_phyloHMMcons_CFTR -name=cftr9 stdin rm -r /gbdb/hg16/wib/chr7_phyloHMMcons_CFTR.wib ln -s \ /cluster/data/nisc/targets/cftr/phyloHMMcons/chr7_phyloHMMcons_CFTR.wib \ /gbdb/hg16/wib/chr7_phyloHMMcons_CFTR.wib hgLoadWiggle hg16 chr7_phyloHMMcons_CFTR chr7_phyloHMMcons_CFTR.wig chmod 664 chr7_phyloHMMcons_CFTR.wib chmod 775 . # add trackDb.ra entry, e.g., #track phyloHMMcons_CFTR #shortLabel phyloHMMcons CFTR #longLabel phylo-HMM-based conservation, CFTR (post. prob. of slowest of 10 rates) #group compGeno #priority 150 #visibility hide #color 175,150,128 #altColor 255,128,0 #type wig 0.0 1.0 #autoScale Off # adapt HTML for details page, if necessary (e.g., copy an existing # phyloHMMcons*.html page to phyloHMMcons_CFTR.html, edit to # reflect data set, do "make update", don't forget to cvs add and # commit) # cleanup rm cftr9.ss cftr9_humanref.maf # easy to regenerate gzip cftr9.postprob # CFTR PHYLOHMM CONSERVATION, 25-way alignment # done, acs, 2003-11-21 # This can be done exactly as above for the 9-way alignment, except # that the tree estimation procedure has to be adjusted to circumvent # the problem that the distant species align only in conserved regions # (so that a tree estimated from the whole data set will have # disproportionally short branches to and among these species). The # procedure I've used is semi-manual and somewhat ad hoc, but I'll # record the main steps here for completeness. I'll only cover the # tree estimation procedure (running 'label' and loading the track is # the same as before) . ssh hgwdev mkdir /cluster/data/nisc/targets/cftr/phyloHMMcons25 cd /cluster/data/nisc/targets/cftr/phyloHMMcons25 # extract sufficient statistics for two data sets: all sites for # mammals and sites in 3rd codon positions for all species. 
# I'm not including platypus with the mammals (it's technically a
# mammal, but a monotreme, and quite distant) because it seems to
# align mostly in conserved regions
maf_project /cluster/data/nisc/targets/cftr/25way/tba.maf /cluster/data/nisc/targets/cftr/25way/human > cftr25_humanref.maf
setenv CFTR_START 115365025
setenv CFTR_END 117242450
setenv SPEC_ORDER hg16,chimp,orangutan,baboon,macaque,vervet,lemur,rabbit,rn3,mm3,cow,pig,horse,cat,dog,ajbat,cpbat,hedgehog,opossum,dunnart,platypus,chicken,zfish,tetra,fr1
msa_view cftr25_humanref.maf -i MAF -o SS -s $CFTR_START -e $CFTR_END -r 1 -O $SPEC_ORDER > cftr25.ss
# whole data set, ordered suff stats -- use this for 'label'
msa_view cftr25.ss -i SS -o SS -z -l 21,22,23,24,25 -x > cftr20.ss
# exclude non-mammals (plus platypus)
/bin/echo -e 'NCATS = 3\ncds 1-3' > cats.cm
# category map for cds sites
/cluster/home/acs/woody/scripts/refFlat2gff.pl -S -P -A hg16 -w 'chrom="chr7" and cdsStart > 115365025 and cdsEnd < 117242450' | sed 's/chr7/hg16/' | egrep -v 'NM_152829|NM_001233|NM_018412' > cftr.gff
# gets refseq annotations for this region as a gff; the egrep
# explicitly removes some duplicate entries (should have a
# better way of doing this); the sed changes the seq name so
# that msa_view recognizes it's the same as the name in the
# alignment
msa_view cftr25_humanref.maf -i MAF -o SS -z -c cats.cm -g cftr.gff -O $SPEC_ORDER > cftr25.3.ss
# now fit a tree model to each data set
echo "((((((((((1,2),3),((4,5),6)),7),(8,(9,10))),((((11,12),(13,(14,15))),(16,17)),18)),(19,20)),21),22),(23,(24,25)))" > cftr25.nh
fit_tree_model -m cftr25.3.ss -C 3 -i SS -t cftr25.nh -s REV -o cftr25 -E -l cftr25.3.log -T -p MED -k 5 -a 1.8
# (this next one may take an hour or two -- run it on a fast
# workstation or be sure to nice if on hgwdev; you can speed it up
# by giving it a good starting *.mod file based on the above [-M option])
echo "(((((((1,2),3),((4,5),6)),7),(8,(9,10))),((((11,12),(13,(14,15))),(16,17)),18)),(19,20))" > cftr20.nh
fit_tree_model -m cftr20.ss -i SS -t cftr20.nh -s REV -o cftr20 -E -l cftr20.log -T -p MED -k 5 -a 4
cp cftr20.mod cftr25_hybrid.mod
# Now edit cftr25_hybrid.mod by hand. Copy the tail end of the
# TREE line from cftr25.3.mod, corresponding to all nodes and
# branches outside of the clade for the non-monotreme mammals, and
# append it to the TREE line in cftr25_hybrid.mod (adjusting
# parens as necessary). Then multiply each one of these new
# branch lengths by a factor of 1.2 (computed as the sum of all
# branch lengths in cftr25.mod divided by the sum of the
# corresponding branch lengths in cftr25.3.mod); a mechanical way
# to do the scaling is sketched just after this section. The
# resulting tree is well supported within the (non-monotreme
# mammals) and includes a reasonable approximation of the
# non-mammal branch lengths. Proceed with 'label' using
# cftr25_hybrid.mod.
# cleanup
rm cftr25_humanref.maf cftr*.ss    # easy to regenerate
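# (hypothetical sketch for the hand-edit above) if the copied tail of
# the TREE line is first saved by itself in, say, tail.nh, the 1.2x
# scaling of its branch lengths can be done mechanically:
# perl -pe 's/:([0-9.]+)/sprintf(":%.6f", 1.2*$1)/ge' tail.nh
# then paste the scaled tail into cftr25_hybrid.mod by hand.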
# HMR PHYLOHMM CONSERVATION
# (started, acs, 2003-11-11, finished 11-19)
ssh hgwdev
# (update woody binaries, if necessary -- see above)
# (also, make sure /cluster/bin/woody in path)
mkdir /cluster/data/hg16/bed/phyloHMMcons.hg16mm3rn3.2003-11-11
cd /cluster/data/hg16/bed/phyloHMMcons.hg16mm3rn3.2003-11-11
# estimate a phylog. model using the entire genome-wide alignments
# first extract sufficient statistics by chromosome
ssh eieio
cd /cluster/data/hg16/bed/phyloHMMcons.hg16mm3rn3.2003-11-11
foreach file (/cluster/data/hg16/bed/humor/hmr/*.maf)
  set prefix = $file:t:r
  msa_view -i MAF $file -o SS -z -O hg16,mm3,rn3 > $prefix.ss
end
logout
# NOTE: may be worth doing the above as a small cluster job instead
# (put the mafs on bluearc -- end up doing this below anyway)
# now combine suff stats across chromosomes
# (back on hgwdev)
ls chr*.ss > files
msa_view -i SS -o SS -A hg16,mm3,rn3 '*files' > all.ss
# estimate the model (very fast, now that suff stats are avail)
echo "(1,(2,3));" > tree.nh
fit_tree_model -i SS -m all.ss -t tree.nh -s REV -k 10 -o rev_dg
cat rev_dg.mod
#ALPHABET: A C G T
#ORDER: 0
#SUBST_MOD: REV
#NRATECATS: 10
#ALPHA: 4.428803
#TRAINING_LNL: -448054115.568696
#BACKGROUND: 0.286083 0.213573 0.213691 0.286652
#RATE_MAT:
#  -0.891523   0.166770   0.574850   0.149902
#   0.223389  -1.146311   0.153784   0.769137
#   0.769591   0.153699  -1.147159   0.223869
#   0.149605   0.573055   0.166888  -0.889548
#TREE: (1:0.192598,(2:0.076303,3:0.083043):0.192598);
# now, break up the genome-wide MAFs into pieces; it's worth doing
# this as a little cluster job
ssh eieio
mkdir -p /cluster/bluearc/hg16/bed/humor
cp /cluster/data/hg16/bed/humor/hmr/*.maf /cluster/data/hg16/?{,?}/chr*.fa /cluster/bluearc/hg16/bed/humor
logout
ssh kk
cd /cluster/data/hg16/bed/phyloHMMcons.hg16mm3rn3.2003-11-11
cat << 'EOF' > doSplit
#!/bin/sh
WOODY=/cluster/bin/woody
FA_SRC=/cluster/bluearc/hg16/bed/humor
WINDOWS=/cluster/data/hg16/bed/phyloHMMcons.hg16mm3rn3.2003-11-11/WINDOWS
maf=$1
prefix=`echo $maf | awk -F\/ '{print $NF}' | awk -F\. '{print $1}'`
mkdir -p /scratch/msa_split
${WOODY}/msa_split $maf -i MAF -M ${FA_SRC}/$prefix.fa -O hg16,mm3,rn3 -w 1000000,0 -r /scratch/msa_split/$prefix -o SS -I 1000 -d 1 -B 5000
cd /scratch/msa_split
for file in ${prefix}.*.ss ; do gzip -c $file > ${WINDOWS}/$file.gz ; done
rm -f /scratch/msa_split/${prefix}.*.ss
EOF
chmod +x doSplit
mkdir -p WINDOWS
rm -f WINDOWS/* jobs.lst
foreach file (/cluster/bluearc/hg16/bed/humor/*.maf)
  echo "doSplit $file" >> jobs.lst
end
para create jobs.lst
# etc ... (run cluster job)
# now setup and run the cluster job to compute the conservation scores
# NOTE: the TMP dir should be set to something other than /scratch,
# as it is not shared between cluster nodes?
cat << 'EOF' > doPostProbs
#!/bin/sh
WOODY=/cluster/bin/woody
TMP=/scratch/phyloHMMcons
file=$1
root=`echo $file | awk -F\/ '{print $NF}' | sed 's/\.ss\.gz//'`
chrom=`echo $root | awk -F\. '{print $1}'`
mkdir -p $TMP
zcat $file | $WOODY/label -m - -d rev_dg.mod -i SS -o $TMP/$root -k 10 -L 0.9 -A -p 0 -j 1 -s $chrom -x
mkdir -p POSTPROBS/$chrom
gzip -c $TMP/$root.postprob > POSTPROBS/$chrom/$root.postprob.gz
rm $TMP/$root.postprob
EOF
chmod +x doPostProbs
mkdir -p POSTPROBS
rm -f jobs2.lst
foreach file (WINDOWS/chr*.ss.gz)
  echo "doPostProbs $file" >> jobs2.lst
end
para create jobs2.lst
# etc ... (run cluster job)
logout
# finally, make track (in the phyloHMMcons.hg16mm3rn3.2003-11-11 dir)
ssh eieio
cd /cluster/data/hg16/bed/phyloHMMcons.hg16mm3rn3.2003-11-11
mkdir wibLimits
mkdir wib
foreach dir (POSTPROBS/*)
  set chrom = $dir:t
  echo $chrom
  zcat `ls POSTPROBS/$chrom/*postprob.gz | sort -t\. -k2,2n` | \
    wigAsciiToBinary -chrom=$chrom -binsize=1024 \
    -dataSpan=1 -wibFile=wib/${chrom}_phyloHMMcons -name=hmr \
    stdin > wibLimits/${chrom}
end
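# (hypothetical aside) wigAsciiToBinary's stdout was captured per chrom
# in wibLimits/; eyeball one before loading:
# head wibLimits/chr1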
# finally, make track (in the phyloHMMcons.hg16mm3rn3.2003-11-11 dir)
ssh eieio
cd /cluster/data/hg16/bed/phyloHMMcons.hg16mm3rn3.2003-11-11
mkdir wibLimits
mkdir wib
foreach dir (POSTPROBS/*)
    set chrom = $dir:t
    echo $chrom
    zcat `ls POSTPROBS/$chrom/*postprob.gz | sort -t\. -k2,2n` | \
        wigAsciiToBinary -chrom=$chrom -binsize=1024 \
            -dataSpan=1 -wibFile=wib/${chrom}_phyloHMMcons -name=hmr \
            stdin > wibLimits/${chrom}
end
ssh hgwdev
cd /cluster/data/hg16/bed/phyloHMMcons.hg16mm3rn3.2003-11-11
hgLoadWiggle hg16 phyloHMMcons_HMR wib/*_phyloHMMcons.wig
ln -s `pwd`/wib/chr*_phyloHMMcons.wib /gbdb/hg16/wib
chmod 775 . wib
chmod 664 wib/*.wib
# add entry to trackDb.ra
#track phyloHMMcons_HMR
#shortLabel phyloHMMcons HMR
#longLabel phylo-HMM-based conservation, human-mouse-rat (post. prob. of slowest of 10 rates)
#group compGeno
#priority 150
#visibility hide
#color 175,150,128
#altColor 255,128,0
#type wig 0.0 1.0
#autoScale Off
# cleanup (only when you're pretty sure you're done!)
rm -r chr*.ss WINDOWS wiggle.tab para.results batch*

# CHICKEN BLAT (translated)
# (done, acs, 2003-11-19)
# (using repeat- and TRF-masked files already created -- see
# CHICKEN BLASTZ, above)
ssh kk
# set up main dir
cd /cluster/data/hg16/bed
mkdir blat.gg0.2003-11-19
ln -s blat.gg0.2003-11-19 blat.gg0
cd blat.gg0
# warning: I'm writing this up in a rush -- watch for errors!
# set up cluster job
cat << 'EOF' > make-joblist.pl
#!/usr/bin/perl
# script to create a job list for translated blat of human
# vs. another species; assumes directory of fa files for the xeno
# species.  Output directories are set up as a side effect.
# USAGE: make-joblist.pl <nib-dir> <xeno-fa-dir> <chrom-lengths> <psl-out-dir>
$SIZE=10000000;                 # partitioning params for human
$OVERLAP=10000;
# read lengths of chromosomes
open(LENF, $ARGV[2]);
while (<LENF>) { ($chr, $l) = split(/\s+/); $length{$chr} = $l;}
close(LENF);
@falist = <$ARGV[1]/*.fa>;
foreach $nib (<$ARGV[0]/*.nib>) {
    $nib =~ /.*(chr.*)\.nib/ || die();
    $chr = $1;
    $l = $length{$chr};
    for ($start = 1; $start <= $l; $start += $SIZE) {
        $end = $start + $SIZE + $OVERLAP - 1;
        if ($end > $l) { $end = $l; }
        $dir = sprintf("%s/%s/%d_%d", $ARGV[3], $chr, $start, $end);
        foreach $fa (@falist) {
            $fa =~ /.*\/([^\/]+)\.fa/ || die();
            $name = $1;
            printf "/cluster/bin/i386/blat -mask=lower -qMask=lower -q=dnax -t=dnax %s:%d-%d %s {check out line+ %s/%s_%d_%d_%s.psl}\n", $nib, $start, $end, $fa, $dir, $chr, $start, $end, $name;
        }
        `mkdir -p $dir`;        # set up output directories
    }
}
EOF
# NOTE: there's a slight error above with indexing.  Next time use
# something like:
#    for ($start = 0; $start < $l; $start += $SIZE) {
#        $end = $start + $SIZE + $OVERLAP;
#        if ($end >= $l) { $end = $l; }
# The "make-lift.pl" script below should be changed also to be
# consistent (should be enough to change exactly the same lines)
chmod +x make-joblist.pl
cp /cluster/data/hg16/bed/blastz.gg0/S1.len .
# just borrow existing lens
mkdir -p run
./make-joblist.pl /iscratch/i/gs.17/build34/bothMaskedNibs /cluster/bluearc/gg0/split100_with_trf S1.len /cluster/data/hg16/bed/blat.gg0/psl > run/jobs.lst
# make sure directory structure is created under psl
cd run
para create jobs.lst ; para try ; para check ; para push ; # etc...
#33561 jobs in batch
#0 jobs (including everybody's) in Parasol queue.
#Checking finished jobs
#Completed: 33561 of 33561 jobs
#CPU time in finished jobs: 14432527s 240542.12m 4009.04h 167.04d 0.458 y
#IO & Wait Time: 147210s 2453.50m 40.89h 1.70d 0.005 y
#Average job time: 434s 7.24m 0.12h 0.01d
#Longest job: 14117s 235.28m 3.92h 0.16d
#Submission to last job: 31483s 524.72m 8.75h 0.36d
# post process psl files
cd ..   # back to main blat.gg0 dir
cat << 'EOF' > make-lift.pl
#!/usr/bin/perl
# create a lift spec to map psl files for windows to chromosome coords
# USAGE: make-lift.pl <nib-dir> <len-file>
$SIZE=10000000;
$OVERLAP=10000;
open(LENF, $ARGV[1]);
while (<LENF>) { ($chr, $l) = split(/\s+/); $length{$chr} = $l;}
close(LENF);
foreach $nib (<$ARGV[0]/*.nib>) {
    $nib =~ /.*(chr.*)\.nib/ || die();
    $chr = $1;
    $l = $length{$chr};
    for ($start = 1; $start <= $l; $start += $SIZE) {
        $end = $start + $SIZE + $OVERLAP - 1;
        if ($end > $l) { $end = $l; }
        printf "%d\t%s:%d-%d\t%d\t%s\t%d\n", $start, $chr, $start, $end, $end-$start, $chr, $l;
    }
}
EOF
chmod +x make-lift.pl
./make-lift.pl /iscratch/i/gs.17/build34/bothMaskedNibs S1.len > psl.lft
mkdir -p pslChrom
foreach dir ( psl/* )
    set chrom = $dir:t
    echo $chrom
    /cluster/bin/i386/pslCat -dir $dir/* | /cluster/bin/i386/liftUp pslChrom/${chrom}_blatGg0.psl psl.lft warn stdin
end
# Load database tables
ssh hgwdev
cd /cluster/data/hg16/bed/blat.gg0/pslChrom
/cluster/bin/i386/hgLoadPsl -noTNameIx hg16 chr*.psl
# New entry in human/hg16/trackDb.ra
# track blatGg0
# shortLabel Chicken Blat
# longLabel Chicken Translated Blat (Gg0-contigs, 5.2x coverage)
# group compGeno
# priority 145.95
# visibility hide
# color 100,50,0
# altColor 255,240,200
# spectrum on
# type psl xeno
# look at coverage
featureBits hg16 blatGg0 knownGene:CDS
#18205137 bases of 2865248791 (0.635%) in intersection
featureBits hg16 knownGene:CDS
#31268809 bases of 2865248791 (1.091%) in intersection

# RELOAD ENSEMBL GENES WITH VERSION 34a (DONE 2003/12/16 markd)
# save current tables, just in case.
rename table ensGene to ensGene_old;
rename table ensGtp to ensGtp_old;
rename table ensPep to ensPep_old;
mkdir /cluster/data/hg16/bed/ensembl34a
cd /cluster/data/hg16/bed/ensembl34a
# Get the ensembl protein data from
# http://www.ensembl.org/Homo_sapiens/martview
# Follow this sequence through the pages:
# Page 1) Make sure that the Homo_sapiens choice is selected.  Hit next.
# Page 2) Uncheck the "Limit to" box in the region choice.  Then hit next.
# Page 3) Choose the "Structures" box.
# Page 4) Choose GTF as the output.  choose gzip compression.  hit export.
# Save as ensemblGene.gtf.gz
# Ensembl handles random chromosomes differently than us, so we
# strip this data.  Fortunately it just loses a couple of genes.
# Add "chr" to front of each line in the gene data gtf file to make
# it compatible with our software.
# Finally, get rid of the ".1" or ".2" after the name
zcat ensemblGene.gtf.gz \
    | grep -v ^6_DR51 \
    | grep -v ^DR51 \
    | grep -v _NT_ \
    | perl -wpe 's/^([0-9]|X|Y|Un)/chr$1/
        || die "Line $. doesnt start with human chrom:\n$_"' \
    | sed -e 's/\..\"/\"/g' \
    > ensGene.gtf
ssh hgwdev
/cluster/bin/i386/ldHgGene hg16 ensGene \
    /cluster/data/hg16/bed/ensembl34a/ensGene.gtf
# ensGtp associates geneId/transcriptId/proteinId for hgPepPred and
# hgKnownToSuper.  Use ensMart to create it as above, except:
# Page 3) Choose the "Features" box.  In "Ensembl Attributes", check
# Ensembl Gene ID, Ensembl Transcript ID, Ensembl Peptide ID.
# Choose Text, tab-separated as the output format.  Result name ensGtp.
# Save file as ensGtp.txt.gz
gunzip ensGtp.txt.gz
hgsql hg16 < ~/kent/src/hg/lib/ensGtp.sql
echo "load data local infile 'ensGtp.txt' into table ensGtp" | hgsql -N hg16
gzip ensGtp.txt
# Load Ensembl peptides:
# Get them from ensembl as above in the gene section except for
# Page 3) Choose the "Sequences" box.
# Page 4) Transcripts/Proteins.  Peptide.  Format = FASTA.
# Save file as ensemblPep.fa.gz zcat ensemblPep.fa.gz | hgPepPred hg16 ensembl stdin # compare size of old and new tables as a sanity check drop table ensGene_old; drop table ensGtp_old; drop table ensPep_old; # Create knownToEnsembl column and knownToSuperfamily column hgMapToGene hg16 ensGene knownGene knownToEnsembl zcat /cluster/store1/superFamily/genomes/ass_26-Oct-2003.tab.gz | hgKnownToSuper hg16 hs stdin # LOAD ECgene tables ((redone with existing data) braney, 2004-01-30) cd /cluster/data/hg16/bed rm -f ECgene mkdir ECgene.2003-12-18 ln -s ECgene.2003-12-18 ECgene cd ECgene wget "http://genome.ewha.ac.kr/ECgene/download/ECgene_hg16_v1.1_25oct2003_genes.txt.gz" wget "http://genome.ewha.ac.kr/ECgene/download/ECgene_hg16_v1.1_25oct2003_genepep.txt.gz" gunzip *.gz ldHgGene -predTab hg16 ECgene ECgene_hg16_v1.1_25oct2003_genes.txt hgPepPred hg16 tab ECgenePep ECgene_hg16_v1.1_25oct2003_genepep.txt rm genePred.tab gzip * # QA NOTE: [ASZ, 2007-10-01] mytouch to ECGenePep table 200401301000.00 # contents were fine. passed -keys rule. # MULTIZ HUMAN/MOUSE/RAT/CHIMP (kpollard, 12/16/03) # chimp added to human/mouse/rat (HUMOR) alignment described above # for now, human referenced and no new BLASTZ runs ssh kk #fix order in human/chimp BLASTZ MAF files #use Kate's new files in humanBestAxt.2 cd /cluster/data/pt0/bed/blastz-blatHg16 mkdir humanBestAxt.ord mkdir maf.ord foreach file (humanBestAxt.2/*.axt) set root=$file:t:r echo $root /cluster/bin/i386/axtSort $file humanBestAxt.ord/${root}.axt /cluster/bin/i386/axtToMaf humanBestAxt.ord/${root}.axt ../blastz.hg16/S1.len /cluster/data/pt0/scaffold.sizes maf.ord/${root}.maf.unfixed -tPrefix=hg16. -qPrefix=pt0. /cluster/bin/scripts/fixmaf.pl < maf.ord/${root}.maf.unfixed > maf.ord/${root}.maf end #test on chr11 with HMR ssh eieio mkdir -p /cluster/bluearc/multiz.hg16mm3rn3gg0pt0/hp.ord cp /cluster/data/pt0/bed/blastz-blatHg16/maf.ord/chr11.maf /cluster/bluearc/multiz.hg16mm3rn3gg0pt0/hp.ord logout # back to kk /cluster/data/hg16/bed/multiz.hg16mm3rn3gg0pt0/mz /cluster/bluearc/multiz.hg16mm3rn3gg0pt0/hp.ord/chr11.maf /cluster/bluearc/multiz.hg16mm3rn3gg0pt0/hmr/chr11.hmr.maf /cluster/data/hg16/bed/multiz.hg16mm3rn3gg0pt0/hmrp/chr11.ord.maf #looks good, go ahead with HMRP multiz mkdir -p /cluster/data/hg16/bed/multiz.hg16mm3rn3pt0 cd /cluster/data/hg16/bed/multiz.hg16mm3rn3pt0 mkdir hmrp # wrapper script for multiz cat << EOF > mz #!/bin/csh /cluster/bin/penn/tbaBin/multiz \$1 \$2 - > \$3 EOF chmod +x mz ssh eieio # clean up bluearc rm -r /cluster/bluearc/multiz.hg16mm3rn3gg0pt0 # move MAFS to bluearc mkdir -p /cluster/bluearc/multiz.hg16mm3rn3pt0/hmr mkdir -p /cluster/bluearc/multiz.hg16mm3rn3pt0/hp cp /cluster/data/hg16/bed/humor/hmr/*.maf /cluster/bluearc/multiz.hg16mm3rn3pt0/hmr cp /cluster/data/pt0/bed/blastz-blatHg16/maf.ord/*.maf /cluster/bluearc/multiz.hg16mm3rn3pt0/hp logout # set up joblist (common denominator set: no chr19_random in hmr) foreach file (/cluster/bluearc/multiz.hg16mm3rn3pt0/hmr/*.maf) set root=`echo $file:t:r | sed 's/\.hmr//'` echo "/cluster/data/hg16/bed/multiz.hg16mm3rn3pt0/mz /cluster/bluearc/multiz.hg16mm3rn3pt0/hp/${root}.maf $file /cluster/data/hg16/bed/multiz.hg16mm3rn3pt0/hmrp/${root}.maf" >> jobList end #run MULTIZ chmod +x jobList para create jobList #submit 10 jobs para try #keep an eye on them para check para finished para running #once these are done, submit rest para push para check para time #ran on cluster: 41 jobs, longest 42 min #copy over chr19_random.maf from human/chimp cp 
/cluster/bluearc/multiz.hg16mm3rn3pt0/hp/chr19_random.maf /cluster/data/hg16/bed/multiz.hg16mm3rn3pt0/hmrp/chr19_random.maf
# clean up bluearc
ssh eieio
rm -r /cluster/bluearc/multiz.hg16mm3rn3pt0
logout
# setup external files for database reference
ssh hgwdev
mkdir -p /gbdb/hg16/multizMm3Rn3Pt0
ln -s /cluster/data/hg16/bed/multiz.hg16mm3rn3pt0/hmrp/*.maf /gbdb/hg16/multizMm3Rn3Pt0
# load into database
cd /cluster/data/hg16/bed/multiz.hg16mm3rn3pt0/hmrp
/cluster/bin/i386/hgLoadMaf -warn hg16 multizMm3Rn3Pt0
# 5385226 mafs in 42 files
# 0-2594 warnings/file
# NOTE: only added track to hgwdev-kpollard (for now).

# LIFTOVER RNAGENE FROM HG15 (DONE CIRCA 12/27/03 schattner)
# Replaced below by new RNAGENES (2004-03-09)
cd /cluster/data/hg16/bed/bedOver
mkdir rnaGene
cd rnaGene
hgsql -N -e 'select * from rnaGene' hg15 > rnaGeneHg15.bed
liftOver rnaGeneHg15.bed ../over.chain rnaGeneLiftGene.bed \
    rnaGeneLiftGeneMiss.bed
hgLoadBed -noBin -sqlTable=$HOME/kent/src/hg/lib/rnaGene.sql hg16 rnaGene \
    /cluster/data/hg16/bed/bedOver/rnaGene/rnaGeneLiftGene.bed

# LOAD RNAGENES (DONE - 2004-03-09 - Hiram)
# http://www.genetics.wustl.edu/eddy
# Sean Eddy, eddy@genetics.wustl.edu
# Dept. of Genetics, Washington University School of Medicine
ssh hgwdev
mkdir -p /cluster/data/hg16/bed/rnaGene
cd /cluster/data/hg16/bed/rnaGene
mkdir rnaGenePrevious
# save previous rnaGene track for reference
hgsqldump -T rnaGenePrevious hg16 rnaGene
wget --timestamping \
    ftp://ftp.genetics.wustl.edu/pub/eddy/annotation/human-hg16/*
grep -v "^#" ncrna-hg16-mito.gff | sed -e "s/^NT_999999/chrM/" > mito.gff
grep -v "^#" ncrna-hg16-chrom.gff > chrom.gff
cat chrom.gff mito.gff > all.gff
hgsql -e 'drop table rnaGene;' hg16
hgsql hg16 < ~/kent/src/hg/lib/rnaGene.sql
hgRnaGenes hg16 all.gff

# rpMm3Rn3 3-way Regulatory Potential Score track (DONE - 2004-01-14 - Hiram)
# Data from: James Taylor james@bx.psu.edu
# Track description from: Francesca Chiaromonte chiaro@stat.psu.edu
ssh eieio
# Right now we are out of space on this /cluster/store4 filesystem,
# so send the data to the bluearc
mkdir /cluster/bluearc/hg16/bed/regPotential3X
ln -s /cluster/bluearc/hg16/bed/regPotential3X \
    /cluster/data/hg16/bed/regPotential3X
cd /cluster/data/hg16/bed/regPotential3X
mkdir data
cd data
foreach c (1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 X Y)
    wget --timestamping \
"http://www.bx.psu.edu/~james/rp/hg16_mm3_rn3/chr${c}.hmr.maf.gz_rpscores.txt.truncated.bz2"
    wget --timestamping \
"http://www.bx.psu.edu/~james/rp/hg16_mm3_rn3/chr${c}.hmr.maf.gz_rpscores.txt.bz2"
end
# The truncated files were a test.  They want to see the raw data.
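# quick spot check before the conversion below (chr22 picked
# arbitrarily, just an eyeball test): each line should be a
# <position> <score> pair at 5-base spacing, and if sort -cn
# complains about disorder it confirms why the sort -n is needed:
bzcat /cluster/data/hg16/bed/regPotential3X/data/chr22.hmr.maf.gz_rpscores.txt.bz2 | head -3
bzcat /cluster/data/hg16/bed/regPotential3X/data/chr22.hmr.maf.gz_rpscores.txt.bz2 | sort -cn |& head -1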
ssh eieio
cd /cluster/data/hg16/bed/regPotential3X
mkdir wigRawData
foreach c (1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 X Y)
    bzcat data/chr${c}.hmr.maf.gz_rpscores.txt.bz2 | sort -n | \
        wigAsciiToBinary -chrom=chr${c} -dataSpan=5 \
        -verbose -wibFile=wigRawData/chr${c}_rpMm3Rn3_Data \
        -name=${c} stdin > chr${c}.out
    echo chr${c} done
end
ssh hgwdev
cd /cluster/data/hg16/bed/regPotential3X/wigRawData
hgLoadWiggle hg16 regPotential3X chr*_rpMm3Rn3_Data.wig
ln -s `pwd`/chr*_rpMm3Rn3_Data.wib /gbdb/hg16/wib

# rpMm4 2-way Regulatory Potential Score track (DONE - 2004-01-14 - Hiram)
# Data from: James Taylor james@bx.psu.edu
# Track description from: Francesca Chiaromonte chiaro@stat.psu.edu
ssh eieio
# Right now we are out of space on this /cluster/store4 filesystem,
# so send the data to the bluearc
mkdir /cluster/bluearc/hg16/bed/regPotential2X
ln -s /cluster/bluearc/hg16/bed/regPotential2X \
    /cluster/data/hg16/bed/regPotential2X
cd /cluster/data/hg16/bed/regPotential2X
mkdir data
cd data
foreach c (1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 M X Y)
    wget --timestamping \
        "http://www.bx.psu.edu/~james/rp/hg16_mm4/chr${c}.axt_rpscores.txt.truncated"
    wget --timestamping \
        "http://www.bx.psu.edu/~james/rp/hg16_mm4/chr${c}.axt_rpscores.txt"
end
gzip *.truncated *.txt
# I'll bet you could gzip the .wig files too and zcat them
# into hgLoadWiggle ?
# The truncated files were a test.  It turns out they want the full
# scores displayed.
# The data is for every 5 bases.  It doesn't appear to be in order,
# so sort it on the way into wigAsciiToBinary
ssh eieio
cd /cluster/data/hg16/bed/regPotential2X
mkdir wigFiles
foreach c (1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 M X Y)
    zcat data/chr${c}.axt_rpscores.txt.gz | sort -n | \
        wigAsciiToBinary -chrom=chr${c} -dataSpan=5 \
        -wibFile=wigFiles/chr${c}_rpMm4 stdin > chr${c}.limits
    echo chr${c} done
end
# To load the data
# (some day in the future the above wigAsciiToBinary function
# will be folded into hgLoadWiggle and thus one command)
ssh hgwdev
cd /cluster/data/hg16/bed/regPotential2X/wigFiles
hgLoadWiggle hg16 regPotential2X chr*_rpMm4.wig
ln -s `pwd`/chr*_rpMm4.wib /gbdb/hg16/wib
# an optional data load to check a display problem
mkdir wigTrunc
foreach c (1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 M X Y)
    zcat data/chr${c}.axt_rpscores.txt.truncated.gz | sort -n | \
        wigAsciiToBinary -chrom=chr${c} -dataSpan=5 \
        -wibFile=wigTrunc/chr${c}_rpMm4t stdin > chr${c}t.limits
end
ssh hgwdev
cd /cluster/data/hg16/bed/regPotential2X/wigTrunc
hgLoadWiggle hg16 regPotential2XTrunc chr*_rpMm4t.wig
ln -s `pwd`/chr*_rpMm4t.wib /gbdb/hg16/wib
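# sanity check on the loaded tables (not part of the original
# process): the 3-way and 2-way tracks cover the same genome at the
# same 5-base span, so their row counts should be of the same order:
hgsql hg16 -e 'select count(*) from regPotential3X'
hgsql hg16 -e 'select count(*) from regPotential2X'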
# CREATE chimpSimpleDiff TRACK AND TABLE
# Convert chimp quality scores from uncompressed contig to compressed
# supercontig format.  This will take half an hour or so.
cd /cluster/data/pt0
zcat contigs.quals.gz | qaToQac stdin stdout | \
    chimpSuperQuals assembly.agp stdin scaffolds.qac
# Make single base pair high quality differences into a bed file
# and load into database
cd /cluster/data/hg16/bed
mkdir chimpSimpleDiff
cd chimpSimpleDiff
chimpHiQualDiffs /cluster/data/pt0/bed/blastz-blatHg16/axtBest \
    /cluster/data/pt0/scaffolds.qac chimpSimpleDiff.bed
sed 's/simpleNucDiff/chimpSimpleDiff/' ~/kent/src/hg/lib/simpleNucDiff.sql > \
    chimpSimpleDiff.sql
hgLoadBed -sqlTable=chimpSimpleDiff.sql hg16 chimpSimpleDiff chimpSimpleDiff.bed

### chimpFixedDiff -- panTro1 (Daryl, July 8, 2005)
# Convert chimp quality scores from uncompressed to compressed
# chromosome format.  This took 22 minutes on crow.
cd /cluster/data/panTro1
cat */chr*.qa | qaToQac stdin chrom.qac
# Make single base pair high quality differences into a bed file
# and load into database
cd /cluster/data/hg16/bed
mkdir chimpFixedDiff
cd chimpFixedDiff
sed 's/simpleNucDiff/chimpFixedDiff/' ~/kent/src/hg/lib/simpleNucDiff.sql > chimpFixedDiffs.sql
# chimpHiQualDiffs was changed to allow different quality
# parameters as command line options
## FIRST ATTEMPT:
set axtDir = cluster/data/hg16/bed/blastz-blat.panTro1.lifted/axtRBestNet
## time chimpFixedDiffs /$axtDir /cluster/data/panTro1/chrom.qac chimpFixedDiffs.bed >& chimpFixedDiffs.log
# This crashed twice at the same place, but ran successfully when
# each chromosome was run separately.
mkdir chroms; cd chroms
ls -1 $axtDir | grep chr | grep axt | sed 's/.axt//' | xargs mkdir
rmdir chr*random
foreach f (chr*)
    echo -n $f " "
    ln -s /$axtDir/$f.axt $f/$f.axt
    time nice chimpFixedDiffs $f /cluster/data/panTro1/chrom.qac $f.chimpFixedDiffs.bed >>& cfd.log
end
cat chr*bed > ../chimpFixedDiffs.bed
## The load (sort) ran out of memory on hgwdev, so I sorted the
## file first on kolossus (3 minutes) and then loaded it on hgwdev
ssh kolossus
hgLoadBed -strict -sqlTable=chimpFixedDiffs.sql -noLoad hg16 chimpFixedDiff chimpFixedDiffs.bed
exit
## hgwdev (37 minutes)
hgLoadBed -hasBin -noSort -sqlTable=chimpFixedDiffs.sql hg16 chimpFixedDiff bed.tab
# TODO: need to filter out polymorphic sites (SNPs)

## LS-SNP links [load data only] (Daryl Thomas; November 3, 2005)
# Data from Rachel Karchin in the Andrej Sali lab at UCSF
# /cluster/data/hg16/bed/lssnp
hgsql hg16 < ${HOME}/kent/src/hg/lib/lsSnpFunction.sql
hgsql hg16 < ${HOME}/kent/src/hg/lib/lsSnpStructure.sql
mysql> load data local infile "snp-human2-function-predictions.txt" into table lsSnpFunction;
Query OK, 7689 rows affected (0.52 sec)
mysql> load data local infile "snp-human2-structure-predictions.txt" into table lsSnpStructure;
Query OK, 28144 rows affected (2.39 sec)

# gc5Base wiggle TRACK (DONE - 2004-03-12 - Hiram)
# reloaded wib files 2005-05-17 to place them in /gbdb/hg16/wib/gc5Base
# a demonstration wiggle track.  Perform a gc count with a 5 base
# window.  Also compute a "zoomed" view for display efficiency.
mkdir /cluster/data/hg16/bed/gc5Base
cd /cluster/data/hg16/bed/gc5Base
# in the script below, the 'grep -w GC' selects the lines of
# output from hgGcPercent that are real data and not just some
# information from hgGcPercent.  The awk computes the number
# of bases that hgGcPercent claimed it measured, which is not
# necessarily always 5 if it ran into gaps, and then the division
# by 10.0 scales down the numbers from hgGcPercent to the range
# [0-100].  Two columns come out of the awk print statement:
# <position> and <value>, which are fed into wigAsciiToBinary through
# the pipe.  It is set at a dataSpan of 5 because each value
# represents the measurement over five bases beginning with
# <position>.  The result files end up in ./wigData5.
cat << '_EOF_' > runGcPercent.sh
#!/bin/sh
mkdir -p wigData5
mkdir -p dataLimits5
for n in ../../nib/*.nib
do
    c=`basename ${n} | sed -e "s/.nib//"`
    C=`echo $c | sed -e "s/chr//"`
    echo -n "working on ${c} - ${C} ... "
    hgGcPercent -chr=${c} -doGaps \
        -file=stdout -win=5 hg16 ../../nib | grep -w GC | \
    awk '{printf "%d\t%.1f\n", $2+1, $5/10.0 }' | \
    wigAsciiToBinary \
        -dataSpan=5 -chrom=${c} -wibFile=wigData5/gc5Base_${C} \
        -name=${C} stdin 2> dataLimits5/${c}
    echo "done"
done
'_EOF_'
chmod +x runGcPercent.sh
# This is going to take perhaps two hours to run.  It is a lot of
# data.  Make sure you do it on the fileserver:
ssh eieio
cd /cluster/data/hg16/bed/gc5Base
./runGcPercent.sh
# load the .wig files back on hgwdev:
ssh hgwdev
cd /cluster/data/hg16/bed/gc5Base
hgLoadWiggle -pathPrefix=/gbdb/hg16/wib/gc5Base hg16 gc5Base wigData5/*.wig
# and symlink the .wib files into /gbdb
mkdir /gbdb/hg16/wib/gc5Base
ln -s `pwd`/wigData5/*.wib /gbdb/hg16/wib/gc5Base
# to speed up display for whole chromosome views, compute a "zoomed"
# view and load that on top of the existing table.  The savings
# comes from the number of data table rows the browser needs to load
# for a full chromosome view.  Without the zoomed view there are
# over 43,000 data rows for chrom 1.  With the zoomed view there are
# only 222 rows needed for the display.  If your original data was
# at 1 value per base the savings would be even greater.
# Pretty much the same data calculation
# situation as above, although this time note the use of the
# 'wigZoom -dataSpan=1000 stdin' in the pipeline.  This will average
# together the data points coming out of the awk print statement over
# a span of 1000 bases.  Thus each <value> coming out of wigZoom
# will represent the measurement of GC in the next 1000 bases.  Note
# the use of -dataSpan=1000 on the wigAsciiToBinary to account for
# this type of data.  You want your dataSpan here to be an exact
# multiple of your original dataSpan (5*200=1000) and on the order
# of at least 1000, doesn't need to go too high.  For data that is
# originally at 1 base per value, a convenient span is: -dataSpan=1024
# A new set of result files ends up in ./wigData5_1K/*.wi[gb]
cat << '_EOF_' > runZoom.sh
#!/bin/sh
mkdir -p wigData5_1K
mkdir -p dataLimits5_1K
for n in ../../nib/*.nib
do
    c=`basename ${n} | sed -e "s/.nib//"`
    C=`echo $c | sed -e "s/chr//"`
    echo -n "working on ${c} - ${C} ... "
    hgGcPercent -chr=${c} -doGaps \
        -file=stdout -win=5 hg16 ../../nib | grep -w GC | \
    awk '{printf "%d\t%.1f\n", $2+1, $5/10.0}' | \
    wigZoom -dataSpan=1000 stdin | wigAsciiToBinary \
        -dataSpan=1000 -chrom=${c} -wibFile=wigData5_1K/gc5Base_${C}_1K \
        -name=${C} stdin 2> dataLimits5_1K/${c}
    echo "done"
done
'_EOF_'
chmod +x runZoom.sh
# This is going to take even longer than above, certainly do this
# on the fileserver
ssh eieio
time ./runZoom.sh
real    232m3.265s
user    302m37.050s
sys     16m13.770s
# Then load these .wig files into the same database as above
ssh hgwdev
hgLoadWiggle -pathPrefix=/gbdb/hg16/wib/gc5Base -oldTable hg16 gc5Base \
    wigData5_1K/*.wig
# and symlink these .wib files into /gbdb
mkdir -p /gbdb/hg16/wib/gc5Base
ln -s `pwd`/wigData5_1K/*.wib /gbdb/hg16/wib/gc5Base

# KNOWN GENES TRACK (STARTED - 2004-01-15 - with Gene Sorter complete
# 2004-02-17 Hiram)
# you will probably need to make the programs in kent/src/hg/protein
cd ~/kent/src/hg/protein
make
# The scripts run below will check for programs and let you know
# which ones are missing
# obtain new SwissProt database (should be done about once a month)
# the swiss prot data is currently living on store5, first step is
# on the fileserver.  This script was used once as it was created,
# it may need to be verified and improved as it is used again.  See
# comments at the top of the script.
ssh eieio
cd /cluster/data/swissprot
~/kent/src/hg/protein/mkSwissProtDB.sh
# that obtains the data and unpacks it, second step is on hgwdev
# to create the database
ssh hgwdev
cd /cluster/data/swissprot
~/kent/src/hg/protein/mkSwissProtDB.sh
# Now the proteins database can be created from that.
# Must be on hgwdev.
# Again, a script that has been used once upon creation, see
# comments in it.  For example currently it is assumed these two
# scripts have been run on the same day.  In this case 031112
ssh hgwdev
cd /cluster/data/proteins
~/kent/src/hg/protein/mkProteinsDB.sh
# with those two databases existing, ready for the actual known genes
# track build.  Must be on hgwdev since it is all mostly database
# operations.  The {Date} argument is the date stamp created by the
# above two scripts.  Something of the form YYMMDD, e.g.: 031112
# Again, a script that has been used only once at creation, see
# comments at top of script.
ssh hgwdev
mkdir /cluster/data/hg16/bed/knownGenes
cd /cluster/data/hg16/bed/knownGenes
DateStamp=040115
~/kent/src/hg/protein/KGprocess.sh ${DateStamp}
# that runs to a point where it prepares data and jobList for a
# cluster run.  Continue with a cluster run on kk
ssh kk
cd /cluster/data/hg16/bed/knownGenes/kgBestMrna
para create jobList
para try
para check
para push
# this is a quick cluster job.  Less than five minutes.  e.g.:
# Completed: 43580 of 43580 jobs
# CPU time in finished jobs: 114636s 1910.60m 31.84h 1.33d 0.004 y
# IO & Wait Time: 111889s 1864.82m 31.08h 1.30d 0.004 y
# Average job time: 5s 0.09m 0.00h 0.00d
# Longest job: 9s 0.15m 0.00h 0.00d
# Submission to last job: 282s 4.70m 0.08h 0.00d
# Continuing back on hgwdev, run the same script again
ssh hgwdev
cd /cluster/data/hg16/bed/knownGenes
DateStamp=031112
~/kent/src/hg/protein/KGprocess.sh ${DateStamp}
# that should run to completion and the known genes track is ready
# Add the proteins link into gdbPdb.hgcentral:
hgsql -e 'INSERT INTO gdbPdb (genomeDb, proteomeDb) \
    VALUES ("hg16","proteins040115");' \
    -h genome-testdb hgcentraltest

# CREATE FULL TEXT INDEX FOR KNOWN GENES (DONE 1/19/2006 JK)
# This depends on the go and uniProt databases as well as
# the kgAlias and kgProAlias tables.  The hgKgGetText takes
# about 5 minutes when the database is not too busy.  The rest
# is real quick.
ssh hgwdev
cd /cluster/data/hg16/bed/knownGenes.2004-01-29
mkdir index
cd index
hgKgGetText hg16 knownGene.text
ixIxx knownGene.text knownGene.ix knownGene.ixx
ln -s /cluster/data/hg16/bed/knownGenes.2004-01-29/index/knownGene.ix /gbdb/hg16/knownGene.ix
ln -s /cluster/data/hg16/bed/knownGenes.2004-01-29/index/knownGene.ixx /gbdb/hg16/knownGene.ixx

# VEGA GENES UPDATE from 2004/01/15 below (2004-02-04 - Hiram)
mv ~/hg16/bed/vega ~/hg16/bed/vega.badcds
mkdir /cluster/data/hg16/bed/vegaUpdate
cd /cluster/data/hg16/bed/vegaUpdate
wget --timestamping ftp://ftp.sanger.ac.uk/pub/searle/*.gtf.gz
# Load genes and Immunoglobulin/Pseudogenes into 2 separate tracks.
Just # omit snoRNAs, as there are so few of them zcat *.gtf.gz | awk '!(/small nucleolar RNA/ || $2 == "Pseudogene" || $2 == "Ig_Pseudogene_Segment" || $2 == "Ig_Segment") {print "chr"$0}' > vegaGene.gtf zcat *.gtf.gz | awk '($2 == "Pseudogene" || $2 == "Ig_Pseudogene_Segment" || $2 == "Ig_Segment") {print "chr"$0}' > vegaPseudoGene.gtf ldHgGene -gtf hg16 vegaGeneUpdate vegaGene.gtf ldHgGene -gtf hg16 vegaPseudoGeneUpdate vegaPseudoGene.gtf wget http://www.sanger.ac.uk/Users/keenan/vega_pep_dump_ncbi34.fa.gz hgPepPred hg16 generic vegaPep vega_pep_dump_ncbi34.fa vegaBuildInfo vega_homo_sapiens_core_4_0.gtf vegaInfo.tab hgsql hg16 < ~/kent/src/hg/lib/vegaInfo.sql hgsql -e "load data local infile 'vegaInfo.tab' into table vegaInfo" hg16 # LOAD VEGA GENES AND PSEUDOGENES (reloaded 2004/01/15 markd) # reloaded due to bug in creating bogus CDS mv ~/hg16/bed/vega ~/hg16/bed/vega.badcds mkdir ~/hg16/bed/vega cd ~/hg16/bed/vega wget http://www.sanger.ac.uk/Users/keenan/vega_homo_sapiens_ncbi34.gtf.gz # Load genes and Immunoglobulin/Pseudogenes into 2 separate tracks. Just # omit snoRNAs, as there are so few of them zcat vega_homo_sapiens_ncbi34.gtf.gz | awk '!(/small nucleolar RNA/ || $2 == "Pseudogene" || $2 == "Ig_Pseudogene_Segment" || $2 == "Ig_Segment") {print "chr"$0}' > vegaGene.gtf zcat vega_homo_sapiens_ncbi34.gtf.gz | awk '($2 == "Pseudogene" || $2 == "Ig_Pseudogene_Segment" || $2 == "Ig_Segment") {print "chr"$0}' > vegaPseudoGene.gtf ldHgGene -gtf hg16 vegaGene vegaGene.gtf ldHgGene -gtf hg16 vegaPseudoGene vegaPseudoGene.gtf wget http://www.sanger.ac.uk/Users/keenan/vega_pep_dump_ncbi34.fa.gz hgPepPred hg16 generic vegaPep vega_pep_dump_ncbi34.fa vegaBuildInfo vega_homo_sapiens_core_4_0.gtf vegaInfo.tab hgsql hg16 < ~/kent/src/hg/lib/vegaInfo.sql hgsql -e "load data local infile 'vegaInfo.tab' into table vegaInfo" hg16 # KNOWN GENES UPDATE (DONE - 2004-01-29 - Hiram) # RELOADED THE cgapBiocDesc AND cgapAlias TABLES TO REMOVE REPLICATED ROWS # (DONE, 2005-07-26, hartera) # RELOADED cgapAlias AGAIN AS TOO MANY ROWS REMOVED BEFORE (hartera, 2005-10-06) # update swissProt and proteins databases # You want to run these two scripts on the same day to keep the # the date stamp consistent. In this case the data stamp is 040115 ssh eieio cd /cluster/data/swissprot ~kent/src/hg/protein/mkSwissProtDB.sh # that obtains the data and unpacks it, second step is on hgwdev # to create the database ssh hgwdev cd /cluster/data/swissprot ~/kent/src/hg/protein/mkSwissProtDB.sh # Now the proteins database can be created from that. Must be on # hgwdev ssh hgwdev cd /cluster/data/proteins ~/kent/src/hg/protein/mkProteinsDb.sh 040115 # prepare all the tables in a temporary database, then move # into Hg16. Leave a link in hg16/bed so it can be found mkdir /cluster/data/kgDB/bed/hg16 ln -s /cluster/data/kgDB/bed/hg16 \ /cluster/data/hg16/bed/knownGenes.2004-01-29 cd /cluster/data/kgDB/bed/hg16 ~/kent/src/hg/protein/KGprocess.sh kgDB hg16 040115 # That runs to a point that prepares a cluster job, continuing on kk ssh kk cd /cluster/data/kgDB/bed/hg16/kgBestMrna para create jobList para try para push ... etc ... 
# on a busy cluster, takes almost an hour: # Completed: 46583 of 46583 jobs # CPU time in finished jobs: 127351s 2122.51m 35.38h 1.47d 0.004 y # IO & Wait Time: 119182s 1986.37m 33.11h 1.38d 0.004 y # Average job time: 5s 0.09m 0.00h 0.00d # Longest job: 14s 0.23m 0.00h 0.00d # Submission to last job: 3513s 58.55m 0.98h 0.04d # Continuing back on hgwdev, run the same script again ssh hgwdev cd /cluster/data/kgDB/bed/hg16 ~/kent/src/hg/protein/KGprocess.sh kgDB hg16 040115 # should continue to completion, all tables are in kgDB and can be # moved if they check out to be similar to existing tables in hg16 # You can verify table sizes with the script: ~kent/src/hg/protein/checkTbls.pl kgDB ~kent/src/hg/protein/checkTbls.pl hg16 kg # should have similar row counts in each of these outputs # This rename can be done more simply with the 'rename' command # instead of the 'alter table' used here. cat << '_EOF_' > renameTables.sh #!/bin/sh SOURCE=kgDB TARGET=hg16 for T in cgapAlias cgapBiocDesc cgapBiocPathway dupSpMrna \ keggMapDesc keggPathway kgAlias kgProtAlias kgXref \ knownGene knownGeneLink knownGeneMrna knownGenePep mrnaRefseq spMrna do hgsql -e "drop table ${T};" ${TARGET} hgsql -e "alter table ${SOURCE}.${T} rename ${TARGET}.${T}" mysql echo "done $T" done '_EOF_' # << this line keeps emacs coloring happy chmod +x renameTables.sh ./renameTables.sh # RELOAD THE cgapBiocDesc AND cgapAlias TABLES (hartera, 2005-07-26) # Reload the cgapBiocDesc and cgapAlias tables as they have replicated # rows. Need to sort and unique the file before loading into the database. cd /cluster/data/kgDB/bed/hg16 sort -u cgapBIOCARTAdesc.tab > cgapBIOCARTAdescSorted.tab # for cgapAlias, the number of rows in the table is different to the # tab file here so dump the table first. # RELOAD cgapAlias AGAIN AS sort -nu REMOVES MORE ROWS THAN sort -u # OR sort -n | uniq. #USE sort -n then uniq TO SORT ON THE IDs AND THEN UNIQ(hartera, 2005-10-06) # hgsql -N -e 'select * from cgapAlias;' hg16 > cgapAliasDump.txt # above command used to get alias file from hg16 before sorting sort -n cgapAliasDump.txt | uniq > cgapAliasDumpSorted.tab hgsql hg16 -e "drop table cgapBiocDesc" hgsql hg16 -e "drop table cgapAlias" hgsql hg16 < ~/kent/src/hg/lib/cgapBiocDesc.sql hgsql hg16 < ~/kent/src/hg/lib/cgapAlias.sql hgsql hg16 -e 'load data local infile "cgapBIOCARTAdescSorted.tab" \ into table cgapBiocDesc' hgsql hg16 -e 'load data local infile "cgapAliasDumpSorted.tab" \ into table cgapAlias' # the following extra process will be included in the next version # of KGprocess.sh to create the kgProtMap table: mkdir /cluster/data/kgDB/bed/hg16/kgProtMap cd /cluster/data/kgDB/bed/hg16/kgProtMap awk '{print ">" $1;print $2}' ../refMrna.tab > kgMrna.fa /scratch/blast/formatdb -i kgMrna.fa -p F echo "`date` creating kgPep.fa" hgsql -N -e 'select spID,seq from kgXref,knownGenePep where kgID=name' ${DB} \ | awk '{print ">" $1;print $2}' >kgPep.fa rm -fr kgPep rm -f jobList mkdir kgPep faSplit sequence kgPep.fa 5000 kgPep/kgPep for f in kgPep/*.fa do echo ./kgProtBlast.csh $f >> jobList done awk '{printf "%s\t%s\n", $3,$2}' ../kgXref.tab > kgProtMrna.pairs # run a cluster job ssh kk9 cd /cluster/data/kgDB/bed/hg16/kgProtMap para create jobList para try para push ... 
etc # Completed: 4949 of 4949 jobs # CPU time in finished jobs: 1061454s 17690.90m 294.85h 12.29d 0.034 y # IO & Wait Time: 13400s 223.33m 3.72h 0.16d 0.000 y # Average job time: 217s 3.62m 0.06h 0.00d # Longest job: 996s 16.60m 0.28h 0.01d # Submission to last job: 12152s 202.53m 3.38h 0.14d # back to hgwdev ssh hgwdev cd /cluster/data/kgDB/bed/hg16/kgProtMap find ./psl.tmp -name '*.psl.gz' | xargs zcat | \ pslReps -nohead stdin psl.tmp/kgProtMrna.psl /dev/null cd psl.tmp (pslMap kgProtMrna.psl ../../tight_mrna.psl stdout | \ sort -k 14,14 -k 16,16n -k 17,17n > kgProtMap.psl) > kgProtMap.out 2>&1 # this table data is ready to load, verify it by comparison with # existing kgProtMap data, then load: hgLoadPsl hg16 kgProtMap.psl # MAKE LINEAGE-SPECIFIC REPEATS FOR CHICKEN (DONE 2/18/04 angie) # In an email 2/13/04, Arian said we could treat all human repeats as # lineage-specific for human-chicken blastz. Scripts expect *.out.spec # filenames, so set that up: ssh kkr1u00 cd /cluster/data/hg16 mkdir /iscratch/i/gs.17/build34/linSpecRep.Chicken foreach f (/scratch/hg/gs.17/build34/rmsk/chr*.fa.out) cp -p $f /iscratch/i/gs.17/build34/linSpecRep.Chicken/$f:t:r:r.out.spec end iSync # Use these the next time we run human-chicken blastz. # BLASTZ CHICKEN (GALGAL2) (DONE 2/26/04 angie) ssh kk # space is awful tight on store4 -- use store7. mkdir -p /cluster/store7/hg16/bed/blastz.galGal2.2004-02-25 ln -s /cluster/store7/hg16/bed/blastz.galGal2.2004-02-25 \ /cluster/data/hg16/bed/ cd /cluster/data/hg16/bed/blastz.galGal2.2004-02-25 # Set L=10000 (higher threshold on blastz's outer loop) and abridge # repeats. cat << '_EOF_' > DEF # human vs. chicken export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/i386:/cluster/home/angie/schwartzbin ALIGN=blastz-run BLASTZ=blastz # Specific settings for chicken (per Webb email to Brian Raney) BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=10000 BLASTZ_K=2200 BLASTZ_Q=/cluster/data/blastz/HoxD55.q BLASTZ_ABRIDGE_REPEATS=1 # TARGET: Human SEQ1_DIR=/iscratch/i/gs.17/build34/bothMaskedNibs SEQ1_RMSK= SEQ1_FLAG= SEQ1_SMSK=/iscratch/i/gs.17/build34/linSpecRep.Chicken SEQ1_IN_CONTIGS=0 SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY: Chicken SEQ2_DIR=/iscratch/i/galGal2/nib SEQ2_RMSK= SEQ2_FLAG= SEQ2_SMSK=/iscratch/i/galGal2/linSpecRep SEQ2_IN_CONTIGS=0 SEQ2_CHUNK=10000000 SEQ2_LAP=0 BASE=/cluster/store7/hg16/bed/blastz.galGal2.2004-02-25 DEF=$BASE/DEF RAW=$BASE/raw CDBDIR=$BASE SEQ1_LEN=$BASE/S1.len SEQ2_LEN=$BASE/S2.len '_EOF_' # << this line keeps emacs coloring happy # first cluster run: raw blastz alignments ssh kk bash # if a csh/tcsh user cd /cluster/data/hg16/bed/blastz.galGal2.2004-02-25 source DEF mkdir $RAW run.0 /cluster/home/angie/hummus/make-joblist $DEF > $BASE/run.0/j sh ./xdir.sh cd run.0 sed -e 's@^blastz-run@/cluster/bin/penn/blastz-run@' j > jobList para create jobList para try, check, push, check, .... #Completed: 51189 of 51189 jobs #Average job time: 477s 7.95m 0.13h 0.01d #Longest job: 2318s 38.63m 0.64h 0.03d #Submission to last job: 29598s 493.30m 8.22h 0.34d # second cluster run: lift raw alignments -> lav dir ssh kki bash # if a csh/tcsh user cd /cluster/data/hg16/bed/blastz.galGal2.2004-02-25 source DEF mkdir run.1 lav /cluster/bin/scripts/blastz-make-out2lav $DEF $BASE > $BASE/run.1/jobList cd run.1 wc -l jobList para create jobList para try, check, push, etc ... 
#Completed: 339 of 339 jobs #Average job time: 6s 0.11m 0.00h 0.00d #Longest job: 21s 0.35m 0.01h 0.00d #Submission to last job: 150s 2.50m 0.04h 0.00d # third run: lav -> axt ssh kki cd /cluster/data/hg16/bed/blastz.galGal2.2004-02-25 mkdir axtChrom pslChrom run.2 cd run.2 cat << '_EOF_' > do.csh #!/bin/csh -ef cd $1 set chr = $1:t cat `ls -1 *.lav | sort -g` \ | $HOME/bin/x86_64/lavToAxt stdin \ /iscratch/i/gs.17/build34/bothMaskedNibs /iscratch/i/galGal2/nib stdout \ | $HOME/bin/x86_64/axtSort stdin ../../axtChrom/$chr.axt $HOME/bin/x86_64/axtToPsl ../../axtChrom/$chr.axt ../../S1.len ../../S2.len \ ../../pslChrom/$chr.psl '_EOF_' # << this line keeps emacs coloring happy chmod a+x do.csh cp /dev/null jobList foreach d (../lav/chr*) echo "do.csh $d" >> jobList end para create jobList para try, check, push, check #Completed: 42 of 42 jobs #Average job time: 38s 0.64m 0.01h 0.00d #Longest job: 147s 2.45m 0.04h 0.00d #Submission to last job: 147s 2.45m 0.04h 0.00d # RUN AXTBEST AND GENERATE MAF FOR MULTIZ (DONE 2/26/04 angie) ssh kolossus cd /cluster/data/hg16/bed/blastz.galGal2.2004-02-25 mkdir axtBest pslBest foreach chrdir (lav/chr*) set chr=$chrdir:t echo axtBesting $chr axtBest axtChrom/$chr.axt $chr axtBest/$chr.axt -minScore=300 axtToPsl axtBest/$chr.axt S1.len S2.len pslBest/$chr.psl end mkdir mafBest foreach f (axtBest/chr*.axt) set maf = mafBest/$f:t:r.hg.maf axtToMaf $f \ /cluster/data/hg16/chrom.sizes /cluster/data/galGal2/chrom.sizes \ $maf -tPrefix=hg16. -qPrefix=galGal2. end ssh hgwdev cd /cluster/data/hg16/bed/blastz.galGal2.2004-02-25 cat pslBest/chr*.psl | hgLoadPsl -table=blastzBestGalGal2 hg16 stdin # CHAIN CHICKEN BLASTZ (DONE 2/26/04 angie) # Run axtChain on little cluster ssh kki cd /cluster/data/hg16/bed/blastz.galGal2.2004-02-25 mkdir -p axtChain/run1 cd axtChain/run1 mkdir out chain ls -1S /cluster/data/hg16/bed/blastz.galGal2.2004-02-25/axtChrom/*.axt \ > input.lst cat << '_EOF_' > gsub #LOOP doChain {check in exists $(path1)} {check out line+ chain/$(root1).chain} {check out line+ out/$(root1).out} #ENDLOOP '_EOF_' # << this line makes emacs coloring happy # Make our own linear gap file with reduced gap penalties, # in hopes of getting longer chains: cat << '_EOF_' > ../../chickenHumanTuned.gap tablesize 11 smallSize 111 position 1 2 3 11 111 2111 12111 32111 72111 152111 252111 qGap 325 360 400 450 600 1100 3600 7600 15600 31600 56600 tGap 325 360 400 450 600 1100 3600 7600 15600 31600 56600 bothGap 625 660 700 750 900 1400 4000 8000 16000 32000 57000 '_EOF_' # << this line makes emacs coloring happy cat << '_EOF_' > doChain #!/bin/csh axtFilter -notQ=chrUn $1 \ | axtChain -scoreScheme=/cluster/data/blastz/HoxD55.q \ -linearGap=../../chickenHumanTuned.gap \ -minScore=5000 stdin \ /iscratch/i/gs.17/build34/bothMaskedNibs \ /iscratch/i/galGal2/nib $2 > $3 '_EOF_' # << this line makes emacs coloring happy chmod a+x doChain gensub2 input.lst single gsub jobList para create jobList para try, check, push, check... 
# axtChrom/chr1{8,9}_random.axt are empty, so the {out line +} checks # failed: #Completed: 40 of 42 jobs #Crashed: 2 jobs #Average job time: 28s 0.46m 0.01h 0.00d #Longest job: 76s 1.27m 0.02h 0.00d #Submission to last job: 92s 1.53m 0.03h 0.00d # now on the cluster server, sort chains ssh kksilo cd /cluster/data/hg16/bed/blastz.galGal2.2004-02-25/axtChain chainMergeSort run1/chain/*.chain > all.chain chainSplit chain all.chain rm run1/chain/*.chain # Load chains into database ssh hgwdev cd /cluster/data/hg16/bed/blastz.galGal2.2004-02-25/axtChain/chain foreach i (*.chain) set c = $i:r echo loading $c hgLoadChain hg16 ${c}_chainGalGal2 $i end # RESCORE CHICKEN BLASTZ (DONE 3/1/04 angie) # Webb noticed low scores in latest runs with repeats abridged -- # PSU's restore_rpts program rescored alignments with default matrix # instead of BLASTZ_Q matrix. Rescore them here so the chainer sees # the higher scores: ssh kolossus cd /cluster/data/hg16/bed/blastz.galGal2.2004-02-25 mkdir axtChrom.rescore foreach f (axtChrom/chr*.axt) axtRescore -scoreScheme=/cluster/data/blastz/HoxD55.q \ $f axtChrom.rescore/$f:t end mv axtChrom axtChrom.orig mv axtChrom.rescore axtChrom # NET HUMAN BLASTZ (DONE 2/26/04 angie) ssh kksilo cd /cluster/data/hg16/bed/blastz.galGal2.2004-02-25/axtChain chainPreNet all.chain ../S1.len ../S2.len stdout \ | chainNet stdin -minSpace=1 ../S1.len ../S2.len stdout /dev/null \ | netSyntenic stdin noClass.net # Add classification info using db tables: ssh hgwdev cd /cluster/data/hg16/bed/blastz.galGal2.2004-02-25/axtChain netClass noClass.net hg16 galGal2 human.net # Make a 'syntenic' subset: ssh kksilo cd /cluster/data/hg16/bed/blastz.galGal2.2004-02-25/axtChain rm noClass.net # Make a 'syntenic' subset of these with netFilter -syn human.net > humanSyn.net # Load the nets into database ssh hgwdev cd /cluster/data/hg16/bed/blastz.galGal2.2004-02-25/axtChain netFilter -minGap=10 human.net | hgLoadNet hg16 netGalGal2 stdin netFilter -minGap=10 humanSyn.net | hgLoadNet hg16 netSyntenyGalGal2 stdin # Add entries for chainGalGal2, netGalGal2, syntenyGalGal2 to # human/hg16 trackDb # MAKE VSGALGAL2 DOWNLOADABLES (DONE 3/1/04 angie) ssh kksilo cd /cluster/data/hg16/bed/blastz.galGal2.2004-02-25 # Webb asked for axtChrom/chr22.axt... since axtChrom is rel. small # this time, just put it all out there. zip /cluster/data/hg16/zip/GGaxtChrom.zip axtChrom/chr*.axt cd /cluster/data/hg16/bed/blastz.galGal2.2004-02-25/axtChain cp all.chain chicken.chain zip /cluster/data/hg16/zip/chicken.chain.zip chicken.chain rm chicken.chain cp human.net chicken.net zip /cluster/data/hg16/zip/chicken.net.zip chicken.net rm chicken.net cp humanSyn.net chickenSyn.net zip /cluster/data/hg16/zip/chickenSyn.net.zip chickenSyn.net rm chickenSyn.net ssh hgwdev mkdir /usr/local/apache/htdocs/goldenPath/hg16/vsGalGal2 cd /usr/local/apache/htdocs/goldenPath/hg16/vsGalGal2 mv /cluster/data/hg16/zip/GGaxtChrom.zip axtChrom.zip mv /cluster/data/hg16/zip/chicken*.zip . md5sum *.zip > md5sum.txt # Copy over & edit README.txt w/pointers to chain, net formats. 
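# quick integrity pass on the zips before announcing the downloads
# (not part of the original process, just reassurance):
cd /usr/local/apache/htdocs/goldenPath/hg16/vsGalGal2
foreach z (*.zip)
    unzip -t $z > /dev/null && echo "$z OK"
end
md5sum -c md5sum.txt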
# MULTIZ HUMAN/MOUSE/RAT/GALGAL2 (DONE 3/8/04 angie) # (galGal2 added to human/mouse/rat alignments described above [HUMOR]) # put the MAFs on bluearc ssh eieio mkdir -p /cluster/bluearc/multiz.hg16mm3rn3galGal2/hmr mkdir -p /cluster/bluearc/multiz.hg16mm3rn3galGal2/hg cp /cluster/data/hg16/bed/humor.2003-09-08/hmr/*.maf \ /cluster/bluearc/multiz.hg16mm3rn3galGal2/hmr cp /cluster/data/hg16/bed/blastz.galGal2.2004-02-25/mafBest/*.maf \ /cluster/bluearc/multiz.hg16mm3rn3galGal2/hg ssh kki mkdir /cluster/data/hg16/bed/multiz.hg16mm3rn3galGal2 cd /cluster/data/hg16/bed/multiz.hg16mm3rn3galGal2 mkdir hmrg # Wrapper script required because of stdout redirect: cat << '_EOF_' > doMultiz #!/bin/csh /cluster/bin/penn/multiz $1 $2 - > $3 '_EOF_' # << this line makes emacs coloring happy chmod a+x doMultiz rm -f jobList foreach file (/cluster/bluearc/multiz.hg16mm3rn3galGal2/hmr/*.maf) set root=$file:t:r:r echo "doMultiz /cluster/bluearc/multiz.hg16mm3rn3galGal2/hg/${root}.hg.maf $file /cluster/data/hg16/bed/multiz.hg16mm3rn3galGal2/hmrg/${root}.maf" >> jobList end para create jobList para try, check, push, check #Completed: 41 of 41 jobs #Average job time: 88s 1.47m 0.02h 0.00d #Longest job: 276s 4.60m 0.08h 0.00d #Submission to last job: 278s 4.63m 0.08h 0.00d # clean up bluearc (these are big files!) rm -r /cluster/bluearc/multiz.hg16mm3rn3galGal2 # setup external files for database reference ssh hgwdev mkdir -p /gbdb/hg16/multizMm3Rn3GalGal2 ln -s /cluster/data/hg16/bed/multiz.hg16mm3rn3galGal2/hmrg/*.maf \ /gbdb/hg16/multizMm3Rn3GalGal2 # load into database /cluster/bin/i386/hgLoadMaf -warn hg16 multizMm3Rn3GalGal2 # LOAD SOFTBERRY GENES (DONE - 2004-02-10 - Hiram) mkdir -p /cluster/data/hg16/bed/softberry cd /cluster/data/hg16/bed/softberry set file = Soft_fgenesh_jul03.tar.gz wget --timestamping ftp://www.softberry.com/pub/SC_HUM_JUL03/$file tar xzvf $file ldHgGene hg16 softberryGene fgenesh_jul03/chr*.gff hgPepPred hg16 softberry fgenesh_jul03/*.protein hgSoftberryHom hg16 fgenesh_jul03/*.protein # CHIMP (panTro1) ALIGNMENTS (2004-02-12 kate) # lift scaffold-based reciprocal best chains to chrom coordinates ssh eieio mkdir -p bed/blastz-blat.panTro1 cd bed/blastz-blat.panTro1 cp /cluster/data/pt0/bed/blastz-blatHg16/human.best.chain \ best.scaffolds.chain cp /cluster/data/panTro1/jkStuff/scaffolds.lft scaffolds.lft ~kate/bin/i386/liftUp -chainQ best.chain scaffolds.lft \ warn best.scaffolds.chain #Make a track from Tarjei's chimp deletions file (2/12/04, kpollard) # 80-12000 bp indels in human/chimp alignments #make .bed files from Tarjei's .fa files cd /cluster/data/panTro1/bed/indels /cluster/bin/i386/faSimplify indels.human.fa , , temp.fa /cluster/bin/i386/faSize detailed=on temp.fa > human.start.txt /cluster/bin/i386/faSimplify indels.human.fa ">" , temp.fa /cluster/bin/i386/faSize detailed=on temp.fa > human.chr.txt R #Commands in R chr<-read.table("human.chr.txt") #read in chromosome and size start<-read.table("human.start.txt") #read in start and size both<-cbind(chr,start) #concatinate: chrN size start size sum(both[,2]!=both[,4]) #check that the size columns are identical #0 both[,4]<-both[,2]+both[,3] #add start and size to get stop both<-both[,c(1,3,4,2)] #reorder columns to get chrN start stop size both[,4]<-paste("CD",1:length(both[,4]),"_",both[,4],sep="") #make name like CDN_size write(t(both),"indels.human.bed",ncol=4) #write bed file q() #quit #delimit with tabs cat indels.human.bed | gawk '{print $1"\t"$2"\t"$3"\t"$4}' > indels.human.tab.bed #load track into browser mkdir -p 
/gbdb/hg16/hg_insert
ln -s /cluster/data/panTro1/bed/indels/indels.human.tab.bed /gbdb/hg16/hg_insert
cd /cluster/data/panTro1/bed/indels
/cluster/bin/i386/hgLoadBed hg16 hg_insert indels.human.tab.bed
# change name to chimpDels
hgsql hg16
rename table hg_insert to chimpDels;
exit
# add description file chimpDels.html
# to ~/kent/src/hg/makeDb/trackDb/human/hg16
# add a track entry to trackDb.ra
# in ~/kent/src/hg/makeDb/trackDb/human/hg16

# FAMILY BROWSER UPDATE (DONE - 2004-02-17 - Hiram)
# to be done after knownGene tables are complete from known gene
# process.
#
# Cluster together various alt-splicing isoforms.
# Creates the knownIsoforms and knownCanonical tables
ssh hgwdev
mkdir /cluster/data/hg16/bed/famBro.2004-02-17
ln -s /cluster/data/hg16/bed/famBro.2004-02-17 /cluster/data/hg16/bed/famBro
cd /cluster/data/hg16/bed/famBro
hgClusterGenes hg16 knownGene knownIsoforms knownCanonical
# Extract peptides from knownGenes into fasta file
# and create a blast database out of them.
mkdir /cluster/data/hg16/bed/famBro/blastp
cd /cluster/data/hg16/bed/famBro/blastp
pepPredToFa hg16 knownGenePep known.faa
# You may need to build this binary in src/hg/near/pepPredToFa
/scratch/blast/formatdb -i known.faa -t known -n known
# This command is in /projects/compbio/bin/$MACH/formatdb
# Copy over database to bluearc
rm -fr /cluster/bluearc/hg16/blastp
mkdir -p /cluster/bluearc/hg16/blastp
cp -p /cluster/data/hg16/bed/famBro/blastp/known.* /cluster/bluearc/hg16/blastp
# Load up cluster/bluearc with blastp and related files
# if necessary
if (! -e /cluster/bluearc/blast/blastall) then
    mkdir -p /cluster/bluearc/blast
    cp /projects/compbio/bin/i686/blastall /cluster/bluearc/blast
    mkdir -p /cluster/bluearc/blast/data
    cp /projects/compbio/bin/i686/data/* /cluster/bluearc/blast/data
endif
# Split up fasta file into bite sized chunks for cluster
cd /cluster/data/hg16/bed/famBro/blastp
mkdir split
faSplit sequence known.faa 8000 split/kg
# Make parasol run directory (this would not work on kk, use kk9 instead)
# Need to check the difference between the blast in /scratch/blast
# and this /cluster/bluearc/blast
ssh kk9
mkdir /cluster/data/hg16/bed/famBro/blastp/self
cd /cluster/data/hg16/bed/famBro/blastp/self
mkdir run
cd run
mkdir out
# Make blast script
cat << '_EOF_' > blastSome
#!/bin/sh
BLASTMAT=/cluster/bluearc/blast/data
export BLASTMAT
/cluster/bluearc/blast/blastall -p blastp \
    -d /cluster/bluearc/hg16/blastp/known -i $1 -o $2 \
    -e 0.01 -m 8 -b 1000
'_EOF_'
chmod a+x blastSome
# Make gensub2 file
cat << '_EOF_' > gsub
#LOOP
blastSome {check in line+ $(path1)} {check out line out/$(root1).tab}
#ENDLOOP
'_EOF_'
# Create parasol batch
# 'ls ../../split/*.fa' is too much, hence the echo
echo ../../split/*.fa | wordLine stdin > split.lst
gensub2 split.lst single gsub jobList
para create jobList
para try
# Wait a couple of minutes, and do a para check, if all is good
# then do a para push
# This should finish in ~15 minutes if the cluster is free.
# Completed: 7748 of 7748 jobs
# CPU time in finished jobs: 73213s 1220.22m 20.34h 0.85d 0.002 y
# IO & Wait Time: 20054s 334.23m 5.57h 0.23d 0.001 y
# Average job time: 12s 0.20m 0.00h 0.00d
# Longest job: 118s 1.97m 0.03h 0.00d
# Submission to last job: 1117s 18.62m 0.31h 0.01d
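# before loading, glance for empty blast outputs -- the {check out
# line ...} spec in gsub means para check should already have caught
# any, so this is just reassurance (run from the self/run dir):
find out -name '*.tab' -size 0 | wc -l
# expect 0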
# Load into database.  This takes about an hour.
ssh hgwdev
cd /cluster/data/hg16/bed/famBro/blastp/self/run/out
hgLoadBlastTab hg16 knownBlastTab *.tab
# Scanning through 7748 files
# Loading database with 11376875 rows
cd /cluster/data/hg16/bed/famBro
# Create table that maps between known genes and RefSeq
hgMapToGene hg16 refGene knownGene knownToRefSeq
# may need to build this command in src/hg/near/hgMapToGene
# row count changed from 32674 to 35416
# Create table that maps between known genes and LocusLink
hgsql --skip-column-names -e "select mrnaAcc,locusLinkId from refLink" hg16 \
    > refToLl.txt
hgMapToGene hg16 refGene knownGene knownToLocusLink -lookup=refToLl.txt
# row count went from 32845 to 35146
# Create table that maps between known genes and Pfam domains
hgMapViaSwissProt hg16 knownGene name proteinID Pfam knownToPfam
# row count went from 31201 to 32225
# JK Fixed bug that let multiple identical columns happen in knownToPfam
# on April 15, 2004.  Row count now 30467
# Create table to map between known genes and GNF Atlas2
# expression data.
hgMapToGene hg16 gnfAtlas2 knownGene knownToGnfAtlas2 '-type=bed 12'
# Create expression distance table - takes about an hour
# (Regenerated April 16, 2004 in response to knownToGnfAtlas2 update)
hgExpDistance hg16 hgFixed.gnfHumanAtlas2MedianRatio \
    hgFixed.gnfHumanAtlas2MedianExps gnfAtlas2Distance \
    -lookup=knownToGnfAtlas2
# Create a table that maps between known genes and
# the nice affy expression data.
hgMapToGene "-type=bed 12" hg16 affyUcla knownGene knownToU133
# row count went from 34148 to 36818
# Create expression distance table.  This will take about an hour.
cd ~/kent/src/hg/near/hgExpDistance
time hgExpDistance hg16 affyUcla affyUclaExp knownExpDistance \
    -weights=affyUcla.weight -lookup=knownToU133
# 42 genes, 42 weights, 26.500000 total wieght
# Got 36818 unique elements in affyUcla
# Made knownExpDistance.tab
# Loaded knownExpDistance
# Made query index
# real 80m50.113s
# user 62m33.290s
# sys 2m15.200s
# This command should be done elsewhere, /tmp or something like that
# It makes a temporary .tab file of almost 1 Gb
# row count went from 34148000 to 36818000
# Create table that maps between known genes and
# the GNF data.
hgMapToGene hg16 affyU95 knownGene knownToU95
cd /tmp
# hgFixed.gnfHumanU95Exps argument is unused, no need to exist
hgExpDistance hg16 hgFixed.gnfHumanU95MedianRatio hgFixed.gnfHumanU95Exps gnfU95Distance -lookup=knownToU95
# row count went from 11718000 to 17330000
# original makeNear.doc had this as:
# hgExpDistance hg16 affyGnfU95 affyGnfU95Exps knownGnfDistance -lookup=knownToU95
# Make sure that GO database is up to date.  See README in /cluster/store1/geneOntology.
# I update this GO database very carefully, checking that all
# structures in it remain the same from release to release and
# backing up the current go DB in a backup database.  In this case
# the backup is go040107 - when it was loaded for Mm4, and the new
# go database is based on data from Dec 17th 2003 and Feb 2004 according
# to the time stamp on the fetched data.  This build was done in
# /cluster/store1/geneOntology/20040217
cd /cluster/data/hg16/bed/famBro
# Create knownToEnsembl column
hgMapToGene hg16 ensGene knownGene knownToEnsembl
# table row count went from previous version: 36068 to 38251
# Make knownToCdsSnp column.  This is a little complicated by
# having to merge data from the snpTsc and the snpNih tracks.
hgMapToGene hg16 snpTsc knownGene knownToCdsSnp -createOnly -all -cds hgMapToGene hg16 snpTsc knownGene snp1 -noLoad -all -cds hgMapToGene hg16 snpNih knownGene snp2 -noLoad -all -cds sort snp1.tab snp2.tab > knownToCdsSnp.tab rm snp1.tab snp2.tab hgsql \ -e 'load data local infile "knownToCdsSnp.tab" into table knownToCdsSnp;' \ hg16 # row count went from 87273 to 106199 # Make C. elegans ortholog column using blastp on wormpep. # First make C. elegans protein database and copy it to cluster/bluearc # if it doesn't exist already # This is already done, see makeMm3.doc for procedure # the directory: /cluster/bluearc/ce1/blastp should have data # Create the ceBlastTab (the blastall binary only works on kk9 for now ...) ssh kk9 mkdir /cluster/data/hg16/bed/famBro/blastp/ce1 cd /cluster/data/hg16/bed/famBro/blastp/ce1 mkdir run cd run mkdir out # Make blast script cat << '_EOF_' > blastSome #!/bin/sh BLASTMAT=/cluster/bluearc/blast/data /cluster/bluearc/blast/blastall \ -p blastp -d /cluster/bluearc/ce1/blastp/wormPep \ -i $1 -o $2 -e 0.01 -m 8 -b 1 '_EOF_' chmod a+x blastSome # Make gensub2 file cat << '_EOF_' > gsub #LOOP blastSome {check in line+ $(path1)} {check out line out/$(root1).tab} #ENDLOOP '_EOF_' # Create parasol batch echo ../../split/*.fa | wordLine stdin > split.lst gensub2 split.lst single gsub jobList para create jobList para try # Wait a couple of minutes, and do a para check, if all is good # then do a para push # This should finish in ~10 minutes if the cluster is free. # Here's the para time results # Completed: 7748 of 7748 jobs # CPU time in finished jobs: 28869s 481.16m 8.02h 0.33d 0.001 y # IO & Wait Time: 20454s 340.89m 5.68h 0.24d 0.001 y # Average job time: 6s 0.11m 0.00h 0.00d # Longest job: 52s 0.87m 0.01h 0.00d # Submission to last job: 584s 9.73m 0.16h 0.01d # Load into database. ssh hgwdev cd /cluster/data/hg16/bed/famBro/blastp/ce1/run/out hgLoadBlastTab hg16 ceBlastTab -maxPer=1 *.tab # row count went from 25599 to 26958 # Make mouse ortholog column using blastp on mouse known genes. # First make mouse protein database and copy it to cluster/bluearc # if it doesn't exist already # This already exists. 
See makeMm4.doc for procedure # the directory: /cluster/bluearc/mm4/blastp should have data # Make parasol run directory ssh kk9 mkdir /cluster/data/hg16/bed/famBro/blastp/mm4 cd /cluster/data/hg16/bed/famBro/blastp/mm4 mkdir run cd run mkdir out # Make blast script cat << '_EOF_' > blastSome #!/bin/sh BLASTMAT=/cluster/bluearc/blast/data /cluster/bluearc/blast/blastall \ -p blastp -d /cluster/bluearc/mm4/blastp/known \ -i $1 -o $2 -e 0.001 -m 8 -b 1 '_EOF_' chmod a+x blastSome # Make gensub2 file cat << '_EOF_' > gsub #LOOP blastSome {check in line+ $(path1)} {check out line out/$(root1).tab} #ENDLOOP '_EOF_' # Create parasol batch # (wordLine wouldn't run on kk9: # wordLine: /lib/i686/libc.so.6: version `GLIBC_2.3' not found # run this echo statement on hgwdev # this echo trick is used because otherwise the command line is # too long and you can not do a simple ls echo ../../split/*.fa | wordLine stdin > split.lst gensub2 split.lst single gsub jobList para create jobList para try # Wait a couple of minutes, and do a para check, if all is good # then do a para push # takes about 15 minutes: # Completed: 7748 of 7748 jobs # CPU time in finished jobs: 54179s 902.98m 15.05h 0.63d 0.002 y # IO & Wait Time: 20428s 340.47m 5.67h 0.24d 0.001 y # Average job time: 10s 0.16m 0.00h 0.00d # Longest job: 76s 1.27m 0.02h 0.00d # Submission to last job: 2031s 33.85m 0.56h 0.02d # Load into database. ssh hgwdev cd /cluster/data/hg16/bed/famBro/blastp/mm4/run/out hgLoadBlastTab hg16 mmBlastTab -maxPer=1 *.tab # Scanning through 7748 files # Loading database with 35611 rows # row count went from 33191 to 35611 # REFSEQ HOMOLOGS (DONE 6/18/04 angie) # Translate mmBlastTab's knownGene acc's into RefSeq where possible, # since our users frequently ask for help in determining homologs for # human/mouse RefSeq accs... ssh hgwdev hgsql hg16 -e \ 'create table mmRefSeqHomolog \ select hg16.knownToRefSeq.value as name, \ mm3.knownToRefSeq.value as homolog, \ mmBlastTab.identity, mmBlastTab.aliLength, mmBlastTab.mismatch, \ mmBlastTab.gapOpen, mmBlastTab.qStart, mmBlastTab.qEnd, \ mmBlastTab.tStart, mmBlastTab.tEnd, mmBlastTab.eValue , \ mmBlastTab.bitScore \ from mmBlastTab, hg16.knownToRefSeq, mm3.knownToRefSeq \ where hg16.knownToRefSeq.name = mmBlastTab.query and \ mm3.knownToRefSeq.name = mmBlastTab.target;' # Make Danio rerio (zebrafish) ortholog column using blastp on Ensembl. 
# First make protein database and copy it to cluster/bluearc
# if it doesn't exist already
# This is already done, see makeMm3.doc for procedure
# the directory: /cluster/bluearc/dr1/blastp should have data
# Make parasol run directory
ssh kk9
mkdir /cluster/data/hg16/bed/famBro/blastp/dr1
cd /cluster/data/hg16/bed/famBro/blastp/dr1
mkdir run
cd run
mkdir out
# Make blast script
cat << '_EOF_' > blastSome
#!/bin/sh
BLASTMAT=/cluster/bluearc/blast/data
/cluster/bluearc/blast/blastall \
    -p blastp -d /cluster/bluearc/dr1/blastp/ensembl \
    -i $1 -o $2 -e 0.005 -m 8 -b 1
'_EOF_'
chmod a+x blastSome
# Make gensub2 file
cat << '_EOF_' > gsub
#LOOP
blastSome {check in line+ $(path1)} {check out line out/$(root1).tab}
#ENDLOOP
'_EOF_'
# Create parasol batch
echo ../../split/*.fa | wordLine stdin > split.lst
gensub2 split.lst single gsub jobList
para create jobList
para try
# Wait a couple of minutes, and do a para check, if all is good
# then do a para push
# Completed: 7748 of 7748 jobs
# CPU time in finished jobs: 40575s 676.24m 11.27h 0.47d 0.001 y
# IO & Wait Time: 19781s 329.69m 5.49h 0.23d 0.001 y
# Average job time: 8s 0.13m 0.00h 0.00d
# Longest job: 95s 1.58m 0.03h 0.00d
# Submission to last job: 2036s 33.93m 0.57h 0.02d
# Load into database.
ssh hgwdev
cd /cluster/data/hg16/bed/famBro/blastp/dr1/run/out
hgLoadBlastTab hg16 drBlastTab -maxPer=1 *.tab
# Scanning through 7748 files
# Loading database with 32204 rows
# row count went from 30339 to 32204

# Make Saccharomyces cerevisiae (yeast) ortholog column using blastp on RefSeq.
# First make protein database and copy it to cluster/bluearc
# if it doesn't exist already
# This is already done, see makeMm3.doc for procedure
# the directory: /cluster/bluearc/sc1/blastp should have data
# Make parasol run directory
ssh kk9
mkdir /cluster/data/hg16/bed/famBro/blastp/sc1
cd /cluster/data/hg16/bed/famBro/blastp/sc1
mkdir run
cd run
mkdir out
# Make blast script
cat << '_EOF_' > blastSome
#!/bin/sh
BLASTMAT=/cluster/bluearc/blast/data
/cluster/bluearc/blast/blastall \
    -p blastp -d /cluster/bluearc/sc1/blastp/sgd \
    -i $1 -o $2 -e 0.01 -m 8 -b 1
'_EOF_'
chmod a+x blastSome
# Make gensub2 file
cat << '_EOF_' > gsub
#LOOP
blastSome {check in line+ $(path1)} {check out line out/$(root1).tab}
#ENDLOOP
'_EOF_'
# Create parasol batch
echo ../../split/*.fa | wordLine stdin > split.lst
gensub2 split.lst single gsub jobList
para create jobList
para try
# Wait a couple of minutes, and do a para check, if all is good
# then do a para push
# Completed: 7748 of 7748 jobs
# CPU time in finished jobs: 8577s 142.96m 2.38h 0.10d 0.000 y
# IO & Wait Time: 19756s 329.26m 5.49h 0.23d 0.001 y
# Average job time: 4s 0.06m 0.00h 0.00d
# Longest job: 15s 0.25m 0.00h 0.00d
# Submission to last job: 1172s 19.53m 0.33h 0.01d
# Load into database.
ssh hgwdev
cd /cluster/data/hg16/bed/famBro/blastp/sc1/run/out
hgLoadBlastTab hg16 scBlastTab -maxPer=1 *.tab
# row count went from 17089 to 17886

# Make Drosophila melanogaster ortholog column using blastp on FlyBase.
# Make Drosophila melanogaster ortholog column using blastp on FlyBase.
# First make SwissProt protein database and copy it to /cluster/bluearc
# if it doesn't exist already
# This is already done, see makeMm3.doc for procedure
# the directory: /cluster/bluearc/dm1/blastp should have data
# Make parasol run directory
ssh kk9
mkdir /cluster/data/hg16/bed/famBro/blastp/dm1
cd /cluster/data/hg16/bed/famBro/blastp/dm1
mkdir run
cd run
mkdir out
# Make blast script
cat << '_EOF_' > blastSome
#!/bin/sh
BLASTMAT=/cluster/bluearc/blast/data /cluster/bluearc/blast/blastall \
    -p blastp -d /cluster/bluearc/dm1/blastp/flyBase \
    -i $1 -o $2 -e 0.01 -m 8 -b 1
'_EOF_'
chmod a+x blastSome
# Make gensub2 file
cat << '_EOF_' > gsub
#LOOP
blastSome {check in line+ $(path1)} {check out line out/$(root1).tab}
#ENDLOOP
'_EOF_'
# Create parasol batch
echo ../../split/*.fa | wordLine stdin > split.lst
gensub2 split.lst single gsub jobList
para create jobList
para try
# Wait a couple of minutes and do a para check; if all is good,
# then do a para push
# Completed: 7748 of 7748 jobs
# CPU time in finished jobs: 33371s 556.18m 9.27h 0.39d 0.001 y
# IO & Wait Time: 19546s 325.77m 5.43h 0.23d 0.001 y
# Average job time: 7s 0.11m 0.00h 0.00d
# Longest job: 53s 0.88m 0.01h 0.00d
# Submission to last job: 1657s 27.62m 0.46h 0.02d
# Load into database.
ssh hgwdev
cd /cluster/data/hg16/bed/famBro/blastp/dm1/run/out
hgLoadBlastTab hg16 dmBlastTab -maxPer=1 *.tab
# Scanning through 7748 files
# Loading database with 28645 rows
# row count went from 27173 to 28645

# LOAD SNPS (Done. Daryl Thomas; February 18, 2004)
# SNP processing has been condensed into a single script,
# which makes snpNih, snpTsc, and snpMap
# ${HOME}/kent/src/hg/snp/locations/processSnpLocations.csh
# snpBuild = 119
# Run from directory $oo/bed/snp/build$snpBuild/snpMap
mkdir -p $oo/bed/snp/build$snpBuild/snpMap
cd $oo/bed/snp/build$snpBuild/snpMap
processSnpLocations.csh hg16 human 34_2 119 >& log &
# check data:
#   wc -l snpTsc.bed; hgsql hg16 -e "select count(*) from snpTsc"
#   wc -l snpNih.bed; hgsql hg16 -e "select count(*) from snpNih"
#   wc -l snpMap.bed; hgsql hg16 -e "select count(*) from snpMap"
#   hgsql hg16 -e "select * from snpNih limit 5; desc snpNih; show indexes from snpNih"
#   hgsql hg16 -e "select * from snpTsc limit 5; desc snpTsc; show indexes from snpTsc"
#   hgsql hg16 -e "select * from snpMap limit 5; desc snpMap; show indexes from snpMap"
# remove temp files
#   rm human* *bed.gz

# LOAD SNP DETAILS (Done. Daryl Thomas; February 18, 2004)
# SNP processing has been condensed into a single script,
# which makes dbSnpRsHg
# ${HOME}/kent/src/hg/snp/details/processSnpDetails.csh
# snpBuild = 119
# Run from directory $oo/bed/snp/build$snpBuild/details
mkdir -p $oo/bed/snp/build$snpBuild/details/Done
mkdir -p $oo/bed/snp/build$snpBuild/details/Observed
cd $oo/bed/snp/build$snpBuild/details
processSnpDetails.csh hg16 human 119 >& log &
# (for each chunk, the script does the equivalent of:
#    load data local infile "$fileBase.out" into table $database.$table
#    gzip $fileBase.out )
# check data:
#   hgsql hgFixed -e "select count(*) from dbSnpRsHg"
#   hgsql hgFixed -e "select * from dbSnpRsHg limit 5; desc dbSnpRsHg; show indexes from dbSnpRsHg"
# remove temp files
#   rm dbSnpRs*

# LOAD SNPS ( Daryl Thomas; February ??, 2005)
set db = hg16
set org = human
set build = 122
set dir = /cluster/bluearc/snp/$db/build$build
# ssh to some quiet machine with fast access to the bluearc
# it takes ~4.5 hours to download the data
# (build 124 went directly to /cluster/bluearc/... from eieio)
# Check to make sure the chrMT file is included
mkdir -p $dir $dir/ds_ch.xml $dir/det $dir/str $dir/loc $dir/seq
cd $dir
ln -s /cluster/data/$db/jkStuff/liftAll.lft .
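# Context note (added): liftAll.lft maps NCBI contig coordinates to
# chromosome coordinates.  The parse jobs below emit *.contig files in
# contig space; liftUp applies this lift after the cluster run to produce
# the chromosome-space bed that gets loaded.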
screen
ftp ftp.ncbi.nih.gov
cd snp/$org/XML
prompt
mget ds_ch*.xml.gz
exit # screen
exit # machine
# TODO: check chromStart for each locType
cp -f ${HOME}/kent/src/hg/snp/parseDbSnpXML /cluster/bin/scripts
chmod 775 /cluster/bin/scripts/parseDbSnpXML
ssh kk
touch jobList
foreach file ( /cluster/bluearc/snp/$db/build$build/ds_ch*.xml.gz )
    set out = $file:t:r
    echo /cluster/bin/scripts/parseDbSnpXML $file /cluster/bluearc/snp/$db/build$build $out.contig >> jobList
end
# para create jobList; para push; para check ...
# CPU time in finished jobs: 28235s 470.58m 7.84h 0.33d 0.001 y
# IO & Wait Time: 1986s 33.10m 0.55h 0.02d 0.000 y
# Average job time: 1119s 18.65m 0.31h 0.01d
# Longest job: 2339s 38.98m 0.65h 0.03d
exit # kk
mv $dir /cluster/data/$db/bed/snp/build$build
set dir = /cluster/data/$db/bed/snp/build$build
cd $dir
ssh eieio # or wherever data is local
# concatenate the details files to make it easier to lift (and load)
time zcat det/ds_ch*.xml.contig.det.gz > $db.build$build.contig.bed
# 16.120u 13.070s 1:35.26 30.6% 0+0k 0+0io 86pf+0w (hgwdev)
time gzip $db.build$build.contig.bed
# 102.307u 5.524s 1:48.97 98.9% 0+0k 0+0io 1pf+0w (eieio/store5)
# some of the NT contigs are not in the liftSpec - this is expected, as snps that map to
# alternate assemblies (Celera) are in the original files, but we disregard their mappings.
time liftUp $db.build$build.bed liftAll.lft warn $db.build$build.contig.bed.gz
# 190.473u 18.873s 3:52.33 90.1% 0+0k 0+0io 1pf+0w (eieio/store5)
time gzip $db.build$build.bed
# 107.476u 5.286s 1:54.25 98.6% 0+0k 0+0io 0pf+0w
ssh hgwdev # or wherever database is located
# hgLoadBed is the important step - check to make sure there are no warnings
time hgLoadBed $db snp $db.build$build.bed.gz -sqlTable=${HOME}/kent/src/hg/lib/snp.sql
# Loaded 8722437 elements of size 16
# 206.170u 48.370s 35:59.52 11.7% 0+0k 0+0io 82994pf+0w
# basic snp table is now loaded, but exception column needs to be updated
# ~ 3 hours wall clock time from here to end
# run queries from snpException.query against snp table
mkdir -p /usr/local/apache/htdocs/qa/test-results/snpException/build$build
cd /usr/local/apache/htdocs/qa/test-results/snpException/build$build
time snpException $db 0 ${db}snpException > ${db}snpException.log
chmod o+rx .
chmod o+r *
# 24.590u 34.150s 41:04.48 2.3% 0+0k 0+0io 191pf+0w
# check alignment of flanking sequences
time snpValid $db /cluster/data/$db/bed/snp/build$build/seq > ${db}snpValid.log
# 4688.790u 172.770s 1:28:45.62 91.2% 0+0k 0+0io 23000pf+0w
# 5205.860u 216.570s 1:55:10.27 78.4% 0+0k 0+0io 72408pf+0w (hgwdev)
### NOTE: the pseudoautosomal snps are reported in the chrX files
### only, which causes problems for snpValid when checking the
### chrY snp mappings.  I got around this by confirming that all
### of the 'missing flank' errors (#23) were in pseudoautosomal
### regions and ignoring them.  I manually truncated the
### hg17snpException.23.bed file before continuing with the next
### step.  This could/should be fixed in the next iteration.
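# Added example (file name pattern follows the snpException output above;
# illustrative): each per-exception count used in the updates below is the
# line count of the matching output file minus its two header lines, e.g.
tail +3 ${db}snpException.21.bed | wc -l
# should agree with the num value set for exceptionId 21 below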
# update snpExceptions table to match the number of exceptions found in the snpValid results # these numbers come from counting the numbers of lines in the output files without headers mysql> update snpExceptions set num=60797 where exceptionId=21; mysql> update snpExceptions set num=5657 where exceptionId=22; mysql> update snpExceptions set num=284098 where exceptionId=23; mysql> update snpExceptions set num=173 where exceptionId=24; # create list of statements to update the snp table and run them time tail +3 ${db}snpException.* | awk '/rs/ {printf "%s\t%d\t%d\n",$4,$2,$5}' | sort -k1,2n > exceptionList.txt # ~10 seconds time updateExceptionList.pl < exceptionList.txt > updateExceptionList.sql # 36.270u 1.980s 0:38.27 99.9% 0+0k 0+0io 337pf+0w time hgsql $db < updateExceptionList.sql # 18.130u 26.680s 58:39.97 1.2% 0+0k 0+0io 413pf+0w build122 (had to optimize table during run) # 8.420u 10.370s 11:58.44 2.6% 0+0k 0+0io 413pf+0w build123 (this is mostly a mysql process) # 6.550u 9.370s 14:34.17 1.8% 0+0k 0+0io 413pf+0w build124 # > wc -l build12*/updateExceptionList.sql # 1110994 build122/updateExceptionList.sql # 387166 build123/updateExceptionList.sql # 383759 build124/updateExceptionList.sql # Add Affy SNPs from new submission #!/bin/csh -fe set db = hg16 cd /cluster/data/$db/bed/snp/affy/latest touch affy.txt affy.bed Affy.bed bed.tab rm -f affy*.txt affy*.bed Affy.bed* bed.tab # datafile was provided by Valmeekam, Venu [Venu_Valmeekam@affymetrix.com] tar xfz affyhg16maps.tgz wc -l affy*txt awk '$1 !~ /^chrom/ {printf("%s\t%d\t%d\t%s\t0\t%s\t%s\tunknown\tsnp\tunknown\t0\t0\tunknown\texact\tAffy10K\t0\n", $1,$2,$3,$4,$6,$7);}' < affy10K.txt > affy10K.bed awk '$1 !~ /^chrom/ {printf("%s\t%d\t%d\t%s\t0\t%s\t%s\tunknown\tsnp\tunknown\t0\t0\tunknown\texact\tAffy10Kv2\t0\n", $1,$2,$3,$4,$6,$7);}' < affy10Kv2.txt > affy10Kv2.bed awk '$1 !~ /^chrom/ {printf("%s\t%d\t%d\t%s\t0\t%s\t%s\tunknown\tsnp\tunknown\t0\t0\tunknown\texact\tAffy50K_HindIII\t0\n",$1,$2,$3,$4,$6,$7);}' < affy50K_HindIII.txt > affy50K_HindIII.bed awk '$1 !~ /^chrom/ {printf("%s\t%d\t%d\t%s\t0\t%s\t%s\tunknown\tsnp\tunknown\t0\t0\tunknown\texact\tAffy50K_XbaI\t0\n", $1,$2,$3,$4,$6,$7);}' < affy50K_XbaI.txt > affy50K_XbaI.bed # this is a temporary kluge to fix some bad input data. cat affy*.bed | sed 's/_par//' > Affy.bed # the source enum for 'dbSnp' is 2; all of the affy* values are higher. hgsql $db -e "delete from snp where source > 2 " hgLoadBed $db snp Affy.bed -oldTable -tab rm -f affy*.txt affy*.bed bed.tab gzip Affy.bed #mysql> select source, count(*) from snp group by source; #+-----------------+----------+ #| source | count(*) | #+-----------------+----------+ #| dbSnp | 8722437 | #| Affy10K | 11464 | #| Affy10Kv2 | 10128 | #| Affy50K_HindIII | 56965 | #| Affy50K_XbaI | 58646 | #+-----------------+----------+ #5 rows in set (52.96 sec) # March 7, 2005: fixed pseudoautosomal snps: #affy10Kv2.txt:chrX_par 1920780 1920781 SNP_A-1606360 0 ? C/T #affy10Kv2.txt:chrX_par 2047561 2047562 SNP_A-1510197 0 ? G/T #affy10Kv2.txt:chrX_par 2047486 2047487 SNP_A-1510243 0 ? A/G #affy10Kv2.txt:chrX_par 2060858 2060859 SNP_A-1606356 0 ? A/G #affy10Kv2.txt:chrX_par 2163964 2163965 SNP_A-1606329 0 ? 
C/T delete from snp where chrom = 'chrY' and name in ('SNP_A-1606360','SNP_A-1510197','SNP_A-1510243','SNP_A-1606356','SNP_A-1606329'); update snp set chrom = 'chrX' where name in ('SNP_A-1606360','SNP_A-1510197','SNP_A-1510243','SNP_A-1606356','SNP_A-1606329'); insert into snp select bin, 'chrY' as chrom, chromStart, chromEnd, name, score, strand, observed, molType, class, valid, avHet, avHetSE, func, locType, source, exception from snp where name in ('SNP_A-1606360','SNP_A-1510197','SNP_A-1510243','SNP_A-1606356','SNP_A-1606329'); select chrom, count(*) from snp where name in ('SNP_A-1606360','SNP_A-1510197','SNP_A-1510243','SNP_A-1606356','SNP_A-1606329') group by chrom; ### hapmapRecombRate (Daryl; September 19, 2005) # updated coordinates (Daryl; December 8, 2005) mkdir -p /cluster/data/hg16/bed/hapmap/recombination/HapMap_PhaseI/20051115 cd /cluster/data/hg16/bed/hapmap/recombination/HapMap_PhaseI/20051115 wget -N http://www.stats.ox.ac.uk/~cfreeman/HapMap_Phase1/genetic_map_HapMap_Phase1_UCSC.tar.gz tar xvfz genetic_map_HapMap_Phase1_UCSC.tar.gz tail --lines=+2 -q Gen_map_chr*_COMBINED_UCSC.txt | sed 's/_non_par//;s/_par1//;s/_par2//' | awk '{printf "%s\t%d\t%d\t%0.3f\n",$1,$2,$3,$4}' >! hg16.hapmapRecombRate.bed liftOver hg16.hapmapRecombRate.bed /cluster/data/hg16/bed/liftOver/hg16ToHg17.over.chain.gz hg17.hapmapRecombRate.bed hg16ToHg17.unmapped hgLoadBed -bedGraph=4 hg16 hapmapRecombRate hg16.hapmapRecombRate.bed hgLoadBed -bedGraph=4 hg17 hapmapRecombRate hg17.hapmapRecombRate.bed rm -f bed.tab Gen_map_chr*.txt ### hapmapRecombHotspot (Daryl; September 19, 2005; chr X data update October 21, 2005) wget -N http://www.stats.ox.ac.uk/~mcvean/HapMap/hotspots/Genomewidehots16a.txt wget -N http://www.stats.ox.ac.uk/~mcvean/HapMap/hotspots/chrX_non_par_hotspots.txt wget -N http://www.stats.ox.ac.uk/~mcvean/HapMap/hotspots/chrX_par1_hotspots.txt # this takes about 3 seconds to run rm -f hg*.hapmapRecombHotspots.bed tail +2 Genomewidehots16a.txt | awk -F " " '{printf "chr%s\t%d\t%d\n",$1, $3-1, $4}' > hg16.hapmapRecombHotspots.bed tail +2 chrX_non_par_hotspots.txt | sed s/_non_par// | awk -F " " '{printf "chr%s\t%d\t%d\n",$1, $3-1, $4}' >> hg16.hapmapRecombHotspots.bed tail +2 chrX_par1_hotspots.txt | sed s/_par1// | awk -F " " '{printf "chr%s\t%d\t%d\n",$1, $3-1, $4}' >> hg16.hapmapRecombHotspots.bed liftOver hg16.hapmapRecombHotspots.bed /cluster/data/hg16/bed/bedOver/hg16ToHg17.over.chain hg17.hapmapRecombHotspots.bed hg16ToHg17.unmapped hgLoadBed hg16 hapmapRecombHotspots hg16.hapmapRecombHotspots.bed hgLoadBed hg17 hapmapRecombHotspots hg17.hapmapRecombHotspots.bed rm -f bed.tab ### encodeRecombHotspot (Daryl; December 8, 2005) mkdir -p /cluster/data/hg16/bed/hapmap/recombination/ENCODE_16c.1/hotspots cd /cluster/data/hg16/bed/hapmap/recombination/ENCODE_16c.1/hotspots wget -N http://www.stats.ox.ac.uk/~cfreeman/ENCODE_16c.1/Hotspots16c1.txt wget -N http://www.stats.ox.ac.uk/~cfreeman/ENCODE_16c.1/Readme_rates_hotspots.txt tail +2 Hotspots16c1.txt | sed 's/ENm010\.7p15\.2/chr7/;s/ENm013\.7q21\.13/chr7/;s/ENm014\.7q31\.33/chr7/;s/ENr112\.2p16\.3/chr2/;s/ENr113\.4q26/chr4/;s/ENr123\.12q12/chr12/;s/ENr131\.2q37\.1/chr2/;s/ENr213\.18q12\.1/chr18/;s/ENr232\.9q34\.11/chr9/;s/ENr321\.8q24\.11/chr8/' | awk '{printf "%s\t%d\t%d\n", $1, $3, $4}' > hg16.encodeRecombHotspot.bed liftOver hg16.encodeRecombHotspot.bed /cluster/data/hg16/bed/bedOver/hg16ToHg17.over.chain hg17.encodeRecombHotspot.bed hg16ToHg17.unmapped hgLoadBed hg16 encodeRecombHotspot hg16.encodeRecombHotspot.bed hgLoadBed hg17 
encodeRecombHotspot hg17.encodeRecombHotspot.bed
rm -f bed.tab *bed *unmapped

### Perlegen Recombination Rates and Hotspots (Daryl; December 9, 2005)
# Home page: http://www.stats.ox.ac.uk/mathgen/Recombination.html
mkdir -p /cluster/data/hg16/bed/hapmap/recombination/Perlegen
cd /cluster/data/hg16/bed/hapmap/recombination/Perlegen
wget -nv -N http://www.stats.ox.ac.uk/mathgen/oxstats_map/README.txt
mkdir -p /cluster/data/hg16/bed/hapmap/recombination/Perlegen/hotspots
cd /cluster/data/hg16/bed/hapmap/recombination/Perlegen/hotspots
wget -nv -N http://www.stats.ox.ac.uk/mathgen/oxstats_map/README.txt
wget -nv -N http://www.stats.ox.ac.uk/mathgen/oxstats_map/hotspots.zip
unzip hotspots.zip
tail +2 hotspots.txt | grep -v 1.51000 | awk '{printf "chr%s\t%d\t%d\n",$1,$3-1,$4}' > hg16.perlegenRecombHotspots.bed
tail +2 coldspots.txt | grep -v "-" | awk '{printf "chr%s\t%d\t%d\n",$1,$3-1,$4}' > hg16.perlegenRecombColdspots.bed
liftOver hg16.perlegenRecombHotspots.bed /cluster/data/hg16/bed/bedOver/hg16ToHg17.over.chain hg17.perlegenRecombHotspots.bed hg16ToHg17.hots.unmapped
liftOver hg16.perlegenRecombColdspots.bed /cluster/data/hg16/bed/bedOver/hg16ToHg17.over.chain hg17.perlegenRecombColdspots.bed hg16ToHg17.cold.unmapped
hgLoadBed hg16 perlegenRecombHotspots hg16.perlegenRecombHotspots.bed
hgLoadBed hg17 perlegenRecombHotspots hg17.perlegenRecombHotspots.bed
hgLoadBed hg16 perlegenRecombColdspots hg16.perlegenRecombColdspots.bed
hgLoadBed hg17 perlegenRecombColdspots hg17.perlegenRecombColdspots.bed
rm -f bed.tab hg1*ed *spots*txt
mkdir -p /cluster/data/hg16/bed/hapmap/recombination/Perlegen/rates
cd /cluster/data/hg16/bed/hapmap/recombination/Perlegen/rates
cp ../makeBed.pl .
chmod ug+x makeBed.pl
wget -nv -N http://www.stats.ox.ac.uk/mathgen/oxstats_map/README.txt
wget -nv -N http://www.stats.ox.ac.uk/mathgen/oxstats_map/recombination_rates.zip
unzip recombination_rates.zip
rm -f hg16.perlegenRecombRate.bed
time ./makeBed.pl > hg16.perlegenRecombRate.bed
cut -f1 hg16.perlegenRecombRate.bed | sort -u
wc -l hg16.perlegenRecombRate.bed
liftOver hg16.perlegenRecombRate.bed /cluster/data/hg16/bed/bedOver/hg16ToHg17.over.chain hg17.perlegenRecombRate.bed hg16ToHg17.rates.unmapped
hgLoadBed hg16 perlegenRecombRate hg16.perlegenRecombRate.bed
hgLoadBed hg17 perlegenRecombRate hg17.perlegenRecombRate.bed
rm -f bed.tab chr*_rates.txt hg1*ed

# HapMap Linkage Disequilibrium (Daryl; January 2006)
mkdir -p /cluster/data/hg16/bed/hapmap/ld_data/2005-10/data
cd /cluster/data/hg16/bed/hapmap/ld_data/2005-10/data
screen
ftp www.hapmap.org
cd ld_data/2005-10
prompt
mget ld_chr*.txt.gz
# look for consistency in max LD distance
set out = maxDist.txt
rm -f $out
touch $out
foreach f (ld_*.txt.gz)
    echo -n "$f " >> $out
    zcat $f | awk '{if ($2-$1>max) max=$2-$1} END {print max}' >> $out
end
# most should be 249999
grep -v 249999 maxDist.txt
# look for consistency in line counts
# ssh eieio; screen
set out = wcList.txt
rm -f $out
touch $out
# this takes about 2 hours to run completely on eieio (local disk)
foreach f (*.txt.gz)
    echo -n $f:r:r " " | sed 's/ld_//;s/chr//;s/_/\t/' >> $out
    zcat $f | cut -f1 -d " " | uniq | wc -l >> $out
end
# plot the sizes from wcList.txt by population (lines)
# with chrom on the X axis and size on the Y axis.
# look for anomalies
mkdir ../bed
cd ../bed
# from the raw LD values, compute colors and encode
cat << 'EOF' > makeLdBed.pl
#!/usr/bin/perl -W
sub min ($$)
{
    my $a = shift @_;
    my $b = shift @_;
    if ($a<$b) {return $a;}
    return $b;
}
sub encodeDprime($)
{
    my $val = shift @_;
    if ( ($val > 1) || ($val < -1) ) { die "Dprime value ($val) is out of range [-1,1]";}
    elsif ($val>=0) { $ret = ord('a') + $val*9;}
    else            { $ret = ord('A') - $val*9;}
    return chr($ret);
}
sub encodeRsquared($)
{
    my $val = shift @_;
    if ( ($val > 1) || ($val < 0) ) { die "R^2 value ($val) is out of range [0,1]";}
    return encodeDprime($val);
}
sub encodeLod($$)
{
    my $lod    = shift @_;
    my $dPrime = shift @_;
    $ret = ord('a');
    if ($lod>=2) # high LOD
    {
        if (abs($dPrime)<0.5) { $ret = ord('y'); } # high LOD, low D' -> pink
        else { $ret += min((int($lod-abs($dPrime)-1.5)), 9) ;}
    }
    elsif (abs($dPrime)>0.99) { $ret = ord('z'); } # high D', low LOD -> blue
    return chr($ret);
}
$inDir  = shift||"data";
$outDir = shift||"bed";
$foo = "";
$bar = "";
@rest = ();
@pops = ("CEU", "CHB", "JPT", "YRI");
foreach $pop (@pops)
{
    opendir(DIR, $inDir) || die "can't open $inDir";
    @hmFiles = grep {/^ld_/ && /_${pop}.txt.gz$/} readdir(DIR); #ld_chr22_CEU.txt.gz
    closedir(DIR);
    printf "\nPOP:\t$pop\t$#hmFiles\n";
    foreach $hmFile (sort @hmFiles)
    {
        ($foo, $chrom, $bar) = split /_/, $hmFile;
        $chrom =~ s/chrx/chrX/;
        $chrom =~ s/chry/chrY/;
        $outfile = "$outDir/${pop}_${chrom}.bed";
        if ((-e $outfile)||(-e "$outfile.gz")) { next; }
        $tmpFile = "/tmp/${pop}_${chrom}.bed";
        printf("$inDir/$hmFile => $outfile.gz\t" . `date`);
        open(OUT, "> $tmpFile" ) || die "can't open $tmpFile";
        open(IN, "zcat $inDir/$hmFile | " ) || die "can't open $inDir/$hmFile";
        $line = <IN>;
        chomp($line);
        ($chromStart, $chromEnd, $pop, $name, $marker2, $dprime, $rsquared, $lod, @rest) = split / /, $line;
        $ldCount = 1;
        # seed the encoded lists with the first marker's values
        $dprimeList   = encodeDprime($dprime);
        $rsquaredList = encodeRsquared($rsquared);
        $lodList      = encodeLod($lod, $dprime);
        while (<IN>)
        {
            chomp();
            ($chromStartNew, $chromEndNew, $pop, $nameNew, $marker2, $dprime, $rsquared, $lod, @rest) = split / /;
            if ($chromStart ne $chromStartNew)
            {
                $chromStart--;
                printf(OUT "$chrom\t$chromStart\t$chromEnd\t$name\t$ldCount\t$dprimeList\t$rsquaredList\t$lodList\n");
                $chromStart   = $chromStartNew;
                $chromEnd     = $chromEndNew;
                $name         = $nameNew;
                $ldCount      = 1;
                $dprimeList   = encodeDprime($dprime);
                $rsquaredList = encodeRsquared($rsquared);
                $lodList      = encodeLod($lod, $dprime);
            }
            elsif ($chromEndNew-$chromStartNew<250000)
            {
                $chromEnd = $chromEndNew;
                $ldCount++;
                $dprimeList   .= encodeDprime($dprime);
                $rsquaredList .= encodeRsquared($rsquared);
                $lodList      .= encodeLod($lod, $dprime);
            }
        }
        close(IN);
        $chromStart--;
        printf(OUT "$chrom\t$chromStart\t$chromEnd\t$name\t$ldCount\t$dprimeList\t$rsquaredList\t$lodList\n");
        close(OUT);
        system("gzip $tmpFile");
        system("mv $tmpFile.gz $outDir");
    }
}
EOF
# chmod ug+x ./makeLdBed.pl
ssh eieio
screen
time ./makeLdBed.pl
# look for consistency in line counts
# ssh eieio
set out = wcList.txt
rm -f $out
touch $out
foreach f (*.bed.gz)
    echo -n $f:r:r " " | sed 's/chr//g;s/_/\t/g' >> $out
    zcat $f | wc -l >> $out
end
# plot the sizes from wcList.txt by population (lines)
# with chrom on the X axis and size on the Y axis.
# look for anomalies
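# Added note on the encoding (derived from makeLdBed.pl above): each value
# is packed into a single character so a chromosome's scores fit in three
# strings per row.  D' in [0,1] maps to 'a'..'j', D' in [-1,0) to 'A'..'J',
# and r^2 reuses the 'a'..'j' scale; for LOD, 'y' marks high LOD with low
# D' and 'z' marks high D' with low LOD.  A quick decode of one character
# (illustrative):
perl -e '$o = ord(shift); print $o >= ord("a") ? ($o-ord("a"))/9 : -($o-ord("A"))/9, "\n"' j
# prints 1, i.e. "j" encodes D' = 1.0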
# load data
sed 's/hapmapLd/hapmapLdCeu/' ${HOME}/kent/src/hg/lib/hapmapLd.sql | hgsql hg16
sed 's/hapmapLd/hapmapLdChb/' ${HOME}/kent/src/hg/lib/hapmapLd.sql | hgsql hg16
sed 's/hapmapLd/hapmapLdJpt/' ${HOME}/kent/src/hg/lib/hapmapLd.sql | hgsql hg16
sed 's/hapmapLd/hapmapLdYri/' ${HOME}/kent/src/hg/lib/hapmapLd.sql | hgsql hg16
# The length of each of the three value vectors (rsquared, dprime,
# and lod) is the same and is stored in the score field.
# 30-40 minutes
foreach c (1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 X)
    echo
    echo -n loading CEU chr${c}
    zcat CEU_chr${c}.bed.gz | wc -l
    hgLoadBed -noSort -oldTable -strict hg16 hapmapLdCeu CEU_chr${c}.bed.gz
    echo
    echo -n loading CHB chr${c}
    zcat CHB_chr${c}.bed.gz | wc -l
    hgLoadBed -noSort -oldTable -strict hg16 hapmapLdChb CHB_chr${c}.bed.gz
    echo
    echo -n loading JPT chr${c}
    zcat JPT_chr${c}.bed.gz | wc -l
    hgLoadBed -noSort -oldTable -strict hg16 hapmapLdJpt JPT_chr${c}.bed.gz
    echo
    echo -n loading YRI chr${c}
    zcat YRI_chr${c}.bed.gz | wc -l
    hgLoadBed -noSort -oldTable -strict hg16 hapmapLdYri YRI_chr${c}.bed.gz
end
rm -f bed.tab

# Tajima's D (DONE -- 2005-06-04 -- Daryl)
# Data from Chris Carlson in Debbie Nickerson's lab
# Chris Carlson [csc47uwashingtonedu]
set db=hg16
set dir=/cluster/data/$db/bed/tajdpoly/latest
cd $dir
set chain = "/gbdb/hg17/liftOver/hg17ToHg16.over.chain"
foreach p (AD ED XD)
    # lift SNP tracks
    set f = $p.SNP.track
    set in = /cluster/data/hg17/bed/tajdpoly/latest/$f.bed4
    set out = /cluster/data/hg16/bed/tajdpoly/latest/$f.$db
    liftOver $in $chain $out.bed4 $out.unmapped
    # lift tajd tracks
    set f = $p.tajd.track
    set in = /cluster/data/hg17/bed/tajdpoly/latest/$f.bedGraph
    set out = /cluster/data/hg16/bed/tajdpoly/latest/$f.$db
    liftOver $in $chain $out.bedGraph $out.unmapped
    # load SNP tracks
    set f = $p.SNP.track.hg16
    echo `date` $f "=>" $f.bed4
    hgLoadBed $db tajdSnp$p $f.bed4
    head -3 $f*
    hgsql -e "select * from tajdSnp$p limit 3" $db
    # load tajd tracks
    set f = $p.tajd.track.$db
    echo `date` $f "=>" $f.bedGraph
    hgLoadBed -bedGraph=4 $db tajd$p $f.bedGraph
    head -3 $f*
    hgsql -e "select * from tajd$p limit 3" $db
end
# deleting elements that overlap with gaps -- tajd files have overlaps
# due to the windowing scheme (snps are not found in gaps)
rm -f delete.sql
touch delete.sql
set where="where t.chrom=g.chrom and (t.chromStart between g.chromStart and g.chromEnd or t.chromEnd between g.chromStart and g.chromEnd)"
foreach p (AD ED XD SnpAD SnpED SnpXD)
    foreach c (1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 X Y)
	echo "select 'tajd$p' as pop, t.chrom, t.chromStart from tajd${p} t, chr${c}_gap g $where " | \
	    hgsql $db | grep -v pop | \
	    awk '{printf "delete from %s where chrom=\"%s\" and chromStart=%d;\n", $1, $2, $3}' >> delete.sql
    end
end
hgsql $db < delete.sql
# cleanup elements that didn't get deleted properly
## cleanup.pl
#!/usr/bin/perl -W
$pop=shift;
while (<>)
{
    if (/^(chr..?)\s+(\d+)/)
    {
	print "delete from tajd$pop where chrom='$1' and chromStart<$2 and chromEnd>$2;\n";
    }
}
##
foreach p (AD ED XD)
    featureBits $db tajd$p gap -bed=$p.inGaps.bed
    cleanup.pl $p < $p.inGaps.bed | hgsql $db
    featureBits $db tajd$p gap -bed=$p.inGaps.bed
    ## should be empty now
end

# JAX ORTHOLOG (WORKING hiram 2004-02-20)
# Add Jackson labs info
cd /cluster/data/hg16/bed
mkdir jaxOrtholog
cd jaxOrtholog
wget --timestamping ftp://ftp.informatics.jax.org/pub/reports/HMD_Human4.rpt
# save a little space
gzip HMD_Human4.rpt
# this is a tricky one to parse.  This .rpt file is plain text, no
# tabs, with the data in fixed text columns.  We need to convert
# this.  Beware of table changes; you may need to rework this each
# time they change the data.  Here is an example data line:
# MGI:1918914 71664 0610006F02Rik 10 syntenic D3 196410 MGC173 01 12q13.13
# The fixed character columns picked out below (per the substr() calls) are:
#   mgiId       = columns 1-31
#   mouseSym    = columns 63-88
#   mouseChr    = columns 89-101
#   mouseCm     = columns 102-110
#   mouseBand   = columns 111-121
#   humanSymbol = columns 153-178
#   humanBand   = columns 179 to end of line
# This awk script picks out the correct columns, removes spaces,
# picks the first of possibly several human band designations,
# and decides if a mouse band has been specified
cat << '_EOF_' > jaxToUCSC.awk
/^MGI:/ {
    LAST=NF
    PREV=LAST-1
    humanSymbol = substr($0,153,26)
    gsub(" ","",humanSymbol)
    Band = substr($0,179)
    gsub(" *$","",Band)
    gsub("^ *","",Band)
    mgiId = substr($0,1,31)
    gsub(" ","",mgiId)
    mouseSym = substr($0,63,26)
    gsub(" ","",mouseSym)
    mouseChr = substr($0,89,13)
    gsub(" ","",mouseChr)
    mouseCm = substr($0,102,9)
    gsub(" ","",mouseCm)
    mouseBand = substr($0,111,11)
    gsub(" ","",mouseBand)
    if (length(mouseBand) < 1) { mouseBand = "N/A" }
    printf "%s\t%s\t%s\t%s\t%s\t%s\t%s\n", humanSymbol,Band,
	mgiId,mouseSym,mouseChr,mouseCm,mouseBand
}
'_EOF_'
# << this line makes emacs coloring happy
# then using that script to fix it:
zcat HMD_Human4.rpt.gz | awk -f jaxToUCSC.awk > jaxOrtholog.tab
# Drop (just in case), create and load the table:
hgsql -e 'drop table jaxOrtholog;' hg16
hgsql hg16 < ~/kent/src/hg/lib/jaxOrtholog.sql
hgsql -e \
    'load data local infile "jaxOrtholog.tab" into table jaxOrtholog;' hg16
# save a little space
gzip jaxOrtholog.tab

# LOAD ACEMBLY (DONE - 2004-03-30 - Hiram)
mkdir -p /cluster/data/hg16/bed/acembly
cd /cluster/data/hg16/bed/acembly
# Data is obtained from:
# Danielle et Jean Thierry-Mieg mieg@ncbi.nlm.nih.gov
wget --timestamping ftp://ftp.ncbi.nih.gov/repository/acedb/ncbi_34.human.genes/acembly.ncbi_34.genes.proteins.fasta.tar.gz
wget --timestamping ftp://ftp.ncbi.nih.gov/repository/acedb/ncbi_34.human.genes/acembly.ncbi_34.genes.gff.tar.gz
wget --timestamping ftp://ftp.ncbi.nih.gov/repository/acedb/ncbi_34.human.genes/acembly.ncbi_34.mrnas.fasta.tar.gz
wget --timestamping ftp://ftp.ncbi.nih.gov/repository/acedb/ncbi_34.human.genes/acembly.ncbi_34.genes.pfamhits.tar.gz
tar xvzf acembly.ncbi_34.genes.gff.tar.gz
tar xvzf acembly.ncbi_34.genes.proteins.fasta.tar.gz
cd acembly.ncbi_34.genes.gff
# chrom 6.gff is broken, it has a bogus number in the first column
# where a 6 should be.  Fix-up until I hear from the authors:
mv x1.acemblygenes.6.gff x1.acemblygenes.6.gff.broken
sed -e "s/^28212469/6/" x1.acemblygenes.6.gff.broken > x1.acemblygenes.6.gff
# There are a number of start and end coordinates that are
# in reversed order.  Until I hear from the authors, I have
# switched those coords:
cat << '_EOF_' > fixupReversedBlocks
#!/bin/sh
for i in x1*.gff
do
    echo -n "$i working ..."
awk -F"\t" ' { if ($4 > $5) { printf "%s\t%s\t%s\t%s\t%s", $1, $2, $3, $5, $4 for ( i = 6; i <= NF; ++i ) { printf "\t%s", $i } printf "\n" } else print } ' $i > $i.fixed echo " done" done '_EOF_' # << this line makes emacs coloring happy chmod +x fixupReversedBlocks ./fixupReversedBlocks # Save just the floating-contig features to different files for lifting # and lift up the floating-contig features to chr*_random coords: # NOTE: file prefix (x1) has been added since build 31 foreach f (x1.acemblygenes.*.gff.fixed) set c=$f:r:e set c=$f:r:r:e egrep '^[a-zA-Z0-9]+\|NT_[0-9][0-9][0-9][0-9][0-9][0-9]' $f | \ perl -wpe 's/^(\w+)\|(\w+)/$1\/$2/' > ctg-chr${c}_random.gff if (-e ../../../$c/lift/random.lft) then liftUp chr${c}_random.gff ../../../$c/lift/random.lft warn \ ctg-chr${c}_random.gff endif grep -v ^$c\| $f | grep -v ^Hs | perl -wpe 's/^/chr/;' | \ grep -v "^chr//" > chr$c.gff echo "done $c" end # that last grep strips out _random or floating contig lines from the # normal chrom gff, and add the "chr" prefix # Three of them end up empty, check for this and remove them # if necessary rm -f chr19_random.gff chr18_random.gff chrUn.gff # There was one error in a coordinate on chr17_random: # chr17_random acembly stop_codon -2 0 . + 1 gene_id M17S2; transcript_id M17S2.cDec03; # This line was removed (shows up as first line) from # chr17_random.gff before the database load #- Load into database: cd .. ldHgGene -gtf hg16 acembly acembly.ncbi_34.genes.gff/chr*.gff hgPepPred hg16 generic acemblyPep \ acembly.ncbi_34.genes.proteins.fasta/*.fasta # check that the track is OK checkTableCoords hg16 acembly # should display no errors # MAKE HUMAN-CHIMP OVER.CHAIN FOR LIFTOVER (DONE 3/2/04 angie) ssh kolossus mkdir /cluster/data/hg16/bed/bedOver/hg16toPt0 cd /cluster/data/hg16/bed/bedOver/hg16toPt0 # use the combined blastz-blat best human chain, but assign unique IDs # so that netChainSubset doesn't die: chainSort /cluster/data/pt0/bed/blastz-blatHg16/human.best.2.chain stdout \ | chainMergeSort stdin \ | chainSplit chain stdin # re-net with the new IDs: mkdir net foreach f (chain/*.chain) echo chaining $f chainNet $f /cluster/data/hg16/chrom.sizes \ /cluster/data/pt0/scaffold.sizes net/$f:t:r.net /dev/null end # Now get a single-cov subset as usual: mkdir subset foreach f (chain/*.chain) echo subsetting net/$f:t:r.net, $f to subset/$f:t netChainSubset net/$f:t:r.net $f subset/$f:t end cat subset/*.chain > /cluster/data/hg16/bed/bedOver/hg16Topt0.chain # make it available: ssh hgwdev cd /usr/local/apache/htdocs/goldenPath/hg16/liftOver/ zip -j hg16Topt0.zip /cluster/data/hg16/bed/bedOver/hg16Topt0.chain # update README.txt # lift scaffold-based over.chain to chrom-based (2004-07-09 kate) ssh kksilo cd /cluster/data/hg16/bed/bedOver liftUp -chainQ hg16TopanTro1.chain /cluster/data/panTro1/jkStuff/scaffolds.lft warn hg16Topt0.chain # NOTE: these chains appear to be broken up -- try using all chains, # instead of reciprocal best ssh kolossus cd /cluster/data/hg16/bed/blastz-blat.panTro1 netChainSubset human.net all.chain over.chain # load just for ENCODE dev hgLoadChain hg16 liftOverPanTro1Chain over.chain # TODO: delete table ssh kolossus cd /cluster/data/hg16/bed/blastz-blat.panTro1 chainSwap \ /cluster/data/panTro1/bed/blastz-blatHg16.pt0.swap/all.newId.chain \ all.newId.swp.chain chainSplit chain.newId all.newId.swp.chain mkdir preNet cd chain.newId cat > preNet.csh << 'EOF' foreach i (*.chain) echo pre-netting $i chainSort $i stdout | \ chainPreNet stdin /cluster/data/hg16/chrom.sizes \ 
	/cluster/data/panTro1/chrom.sizes ../preNet/$i
end
'EOF'
csh preNet.csh >&! preNet.log &
tail -100f preNet.log
cd ..
# << for emacs
mkdir n1
cd preNet
cat > net.csh << 'EOF'
foreach i (*.chain)
    set n = $i:r.net
    echo netting $i
    chainNet $i -minSpace=1 /cluster/data/hg16/chrom.sizes \
	/cluster/data/panTro1/chrom.sizes ../n1/$n /dev/null
end
'EOF'
csh net.csh >&! net.log &
tail -100f net.log
cd ..
cat n1/*.net | /cluster/bin/i386/netSyntenic stdin hNoClass.net
# GOT HERE
ssh hgwdev
cd /cluster/data/hg16/bed/blastz-blat.panTro1
netClass hNoClass.net hg16 panTro1 chimp.newId.net
# chain files from the net
ssh kolossus
cd /cluster/data/hg16/bed/blastz-blat.panTro1
netChainSubset chimp.newId.net all.newId.swp.chain over.newId.chain
cp over.newId.chain \
    /cluster/data/hg16/bed/liftOver/hg16ToPanTro1.newId.over.chain
mv hg16TopanTro1.chain hg16Topantro1.chain.old
cd /cluster/data/hg16/bed/liftOver
ln -s hg16ToPanTro1.newId.over.chain hg16TopanTro1.chain
ssh hgwdev
cd /cluster/data/hg16/bed/blastz-blat.panTro1
hgLoadChain hg16 liftOverPanTro1NewIdChain over.newId.chain

# MAKE HUMAN-CHICKEN OVER.CHAIN FOR LIFTOVER (DONE 3/2/04 angie)
ssh kolossus
mkdir /cluster/data/hg16/bed/bedOver/hg16TogalGal2
cd /cluster/data/hg16/bed/bedOver/hg16TogalGal2
set chainDir = /cluster/data/hg16/bed/blastz.galGal2.2004-02-25/axtChain
netSplit $chainDir/human.net net
mkdir subset
foreach f ($chainDir/chain/*.chain)
    echo subsetting $f:t:r
    netChainSubset net/$f:t:r.net $f subset/$f:t
end
cat subset/*.chain > /cluster/data/hg16/bed/bedOver/hg16TogalGal2.chain

# HUMAN/MOUSE/RAT/CHICKEN (HMRG) PHYLOHMM CONSERVATION (IN PROGRESS 2004-03-08 kate)
# Set path
set path = ($path /cluster/bin/woody)
# Obtain phylogenetic model (hmrc_rev_dg.mod)
# from Adam (hand-tuned, instead of fit_model)
# then, create New Hampshire tree for data (.nh file)
cat hmrc_rev_dg.mod
#ALPHABET: A C G T
#ORDER: 0
#SUBST_MOD: REV
#NRATECATS: 10
#ALPHA: 4.4
#BACKGROUND: 0.286083 0.213573 0.213691 0.286652
#RATE_MAT:
#-0.891523 0.166770 0.574850 0.149902
#0.223389 -1.146311 0.153784 0.769137
#0.769591 0.153699 -1.147159 0.223869
#0.149605 0.573055 0.166888 -0.889548
#TREE: ((1:0.192598,(2:0.076303,3:0.083043):0.192598):0.47,4:0.47);
/cluster/data/woody/scripts/extract-tree.pl human,mouse,rat,chicken \
    hmrc_rev_dg.mod
#((human:0.192598,(mouse:0.076303,rat:0.083043):0.192598):0.47,chicken:0.47);
ssh eieio
set path = ($path /cluster/bin/woody)
cd /cluster/data/hg16/bed/multiz.hg16mm3rn3galGal2
cd phyloHMM
# now, break up the genome-wide MAFs into pieces; it's worth doing
# this as a little cluster job
# NOTE: using the hg16 chr fasta files stashed on bluearc for hg16 humor run
# NOTE: next time add "check out" lines to assure files are created
ssh eieio
mkdir -p /cluster/bluearc/hg16/bed/multiz.hg16mm3rn3galGal2
cp /cluster/data/hg16/bed/multiz.hg16mm3rn3galGal2/hmrg/*.maf /cluster/bluearc/hg16/bed/multiz.hg16mm3rn3galGal2
cd /cluster/data/hg16/bed/multiz.hg16mm3rn3galGal2/phyloHMM
cat << 'EOF' > doSplit
#!/bin/sh
WOODY=/cluster/bin/woody
FA_SRC=/cluster/bluearc/hg16/bed/humor
WINDOWS=/cluster/data/hg16/bed/multiz.hg16mm3rn3galGal2/phyloHMM/WINDOWS
maf=$1
c=`basename $maf .maf`
echo $c
mkdir -p /scratch/msa_split
${WOODY}/msa_split $maf -i MAF -M ${FA_SRC}/$c.fa -O hg16,mm3,rn3,galGal2 -w 1000000,0 -r /scratch/msa_split/$c -o SS -I 1000 -d 1 -B 5000
echo "Copying..."
cd /scratch/msa_split
for file in $c.*.ss ; do gzip -c $file > ${WINDOWS}/$file.gz ; done
rm -f /scratch/msa_split/$c.*.ss
echo "Done copying"
'EOF'
chmod +x doSplit
mkdir -p WINDOWS
rm -f WINDOWS/* jobs.lst
foreach file (/cluster/bluearc/hg16/bed/multiz.hg16mm3rn3galGal2/*.maf)
    echo "doSplit $file" >> jobs.lst
end
ssh kk
cd /cluster/data/hg16/bed/multiz.hg16mm3rn3galGal2/phyloHMM
para create jobs.lst
# para try, para push, etc.
# now setup and run the cluster job to compute the conservation scores
# NOTE: need to use gensub2, check out+ facilities to check for
# failures.  Will want to chunk msa_split output (above) into chr dirs
# to make the gensub template reasonable.
cat << 'EOF' > doPostProbs
#!/bin/sh
WOODY=/cluster/bin/woody
TMP=/tmp/phyloHMMcons
file=$1
root=`basename $file .ss.gz`
chrom=`echo $root | awk -F\. '{print $1}'`
echo $chrom
mkdir -p $TMP
zcat $file | $WOODY/label -m - -d hmrc_rev_dg.mod -i SS -o $TMP/$root -k 10 -L 0.9 -A -p 0 -j 1 -s $chrom -x
mkdir -p POSTPROBS/$chrom
gzip -c $TMP/$root.postprob > POSTPROBS/$chrom/$root.postprob.gz
rm $TMP/$root.postprob
'EOF'
chmod +x doPostProbs
mkdir -p POSTPROBS
rm -f jobs2.lst
foreach file (WINDOWS/chr*.ss.gz)
    echo "doPostProbs $file" >> jobs2.lst
end
wc -l jobs2.lst
para create jobs2.lst
# etc ... (run cluster job)
# Create wiggle (.wib) files and load into database
ssh eieio
cd /cluster/data/hg16/bed/multiz.hg16mm3rn3galGal2/phyloHMM
mkdir wibLimits
mkdir wib
cat > makeWig.csh << 'EOF'
foreach dir (POSTPROBS/*)
    set chrom = $dir:t
    echo $chrom
    zcat `ls POSTPROBS/$chrom/*postprob.gz | sort -t\. -k2,2n` | \
	wigAsciiToBinary -chrom=$chrom \
	-dataSpan=1 -wibFile=wib/${chrom}_hmrg_phyloHMM -name=hmrg \
	stdin > wibLimits/${chrom}
end
'EOF'
csh makeWig.csh >&! makeWig.log &
ssh hgwdev
cd /cluster/data/hg16/bed/multiz.hg16mm3rn3galGal2/phyloHMM
hgLoadWiggle hg16 multizMm3Rn3GalGal2_phyloHMM_wig wib/*_hmrg_phyloHMM.wig
ln -s `pwd`/wib/chr*_hmrg_phyloHMM.wib /gbdb/hg16/wib
chmod 775 . wib
chmod 664 wib/*.wib
# Add zoom records to table to speed display of large regions (>600Kbp)
# NOTE: this doesn't work -- the rows were dropped
ssh eieio
cd /cluster/data/hg16/bed/multiz.hg16mm3rn3galGal2/phyloHMM
mkdir -p wib1K wibLimits1K
cat > wigZoom1K.csh << 'EOF'
foreach dir (POSTPROBS/*)
    set chrom = $dir:t
    echo $chrom
    zcat `ls POSTPROBS/$chrom/*postprob.gz | sort -t\. -k2,2n` | \
	wigZoom stdin | wigAsciiToBinary -chrom=$chrom \
	-dataSpan=1024 -wibFile=wib1K/${chrom}_hmrg_phyloHMM_1K \
	-name=hmrg stdin > wibLimits1K/${chrom}
end
'EOF'
csh wigZoom1K.csh >&! wigZoom1K.log &
tail -100f wigZoom1K.log
ssh hgwdev
cd /cluster/data/hg16/bed/multiz.hg16mm3rn3galGal2/phyloHMM/wib1K
hgLoadWiggle -oldTable hg16 multizMm3Rn3GalGal2_phyloHMM_wig *.wig
# create symlinks for .wib files
ln -s `pwd`/*.wib /gbdb/hg16/wib
# NOTE: this doesn't work -- the rows were dropped
# setup external files for database reference
# reuse mafs loaded in the maf track (just symlink the /gbdb dir before
# loading)
ssh hgwdev
ln -s /gbdb/hg16/multizMm3Rn3GalGal2 /gbdb/hg16/multizMm3Rn3GalGal2_phyloHMM
# load into database
cd /cluster/data/hg16/bed/multiz.hg16mm3rn3galGal2/phyloHMM
/cluster/bin/i386/hgLoadMaf -warn hg16 multizMm3Rn3GalGal2_phyloHMM
# create trackDb entry
# track multizMm3Rn3GalGal2_phyloHMM
# type wigMaf 0.0 1.0
# wiggle multizMm3Rn3GalGal2_phyloHMM_wig
# etc.
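# Quick check that the wiggle load succeeded (added; table name per the
# hgLoadWiggle call above):
hgsql hg16 -e 'select count(*) from multizMm3Rn3GalGal2_phyloHMM_wig'
# and confirm the /gbdb symlinks resolve (any chrom will do):
ls -lL /gbdb/hg16/wib/chr1_hmrg_phyloHMM.wib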
# Load pairwise mafs
ssh hgwdev
cd /gbdb/hg16
mkdir -p mouse_hmrg rat_hmrg chicken_hmrg
foreach f (/cluster/data/hg16/bed/humor/maf/*.mm3.maf)
    ln -s $f /gbdb/hg16/mouse_hmrg
end
cd /tmp
hgLoadMaf -warn hg16 mouse_hmrg
foreach f (/cluster/data/hg16/bed/humor/maf/*.rn3.maf)
    ln -s $f /gbdb/hg16/rat_hmrg
end
cd /tmp
hgLoadMaf -warn hg16 rat_hmrg
foreach f (/cluster/data/hg16/bed/blastz.galGal2.2004-02-25/mafBest/*.maf)
    ln -s $f /gbdb/hg16/chicken_hmrg
end
cd /tmp
hgLoadMaf -warn hg16 chicken_hmrg
# copy files to download area
set dir = /usr/local/apache/htdocs/goldenPath/hg16/multizMm3Rn3GalGal2
mkdir $dir
ln -s $dir multiz
cp -p /gbdb/hg16/multizMm3Rn3GalGal2_phyloHMM/*.maf $dir
cd $dir
gzip *
# As the 5-way alignment is imminent, this wasn't completed
# edit downloads page to add links
# add pairwise mafs to downloads page
mkdir $dir/{rn3,mm3}
cd /cluster/data/hg16/bed/humor/maf
cp *.mm3.maf $dir/mm3
cp *.rn3.maf $dir/rn3
gzip $dir/mm3/*
gzip $dir/rn3/*
# also add human/chicken maf's
# Create upstream files
ssh hgwdev
echo hg16 mm3 rn3 galGal2 > org.txt
foreach i (1000 2000 5000)
    featureBits hg16 refGene:upstream:$i -fa=/dev/null -bed=up.bad
    awk -F '\t' '{printf("%s\t%s\t%s\t%s\t%s\t%s\n", $1, $2, $3, substr($4, 0, 9), 0, $5)}' up.bad > up.bed
    rm up.bad
    mafFrags hg16 multizMm3Rn3GalGal2 up.bed upstream$i.maf -orgs=org.txt
    rm up.bed
end

# miRNA track (DONE - 2004-05-04 - Hiram)
# data from: Sam Griffiths-Jones
# and Michel.Weber@ibcg.biotoul.fr
# notify them if this assembly updates to renew this track
ssh hgwdev
mkdir /cluster/data/rn3/bed/miRNA
cd /cluster/data/rn3/bed/miRNA
wget --timestamping \
hgLoadBed rn3 miRNA rn3.bed
# entry in trackDb/trackDb.ra already there

# miRNA track (UPDATED - 2004-05-04 - Hiram)
# (first version done 2004-03-02)
# data from: Sam Griffiths-Jones
# and Michel.Weber@ibcg.biotoul.fr
# notify them if this assembly updates to renew this track
cd /cluster/data/hg16/bed
mv miRNA miRNA.2004_03_02
mkdir miRNA
cd miRNA
wget --timestamping \
    "ftp://ftp.sanger.ac.uk/pub/databases/Rfam/miRNA/genomes/hsa_ncbi34.*"
grep -v "^track " hsa_ncbi34.bed | sed -e "s/ /\t/g" > hg16.bed
# check existing track for comparison after update load
# featureBits hg16 miRNA
# 15385 bases of 2865248791 (0.001%) in intersection
hgLoadBed hg16 miRNA hg16.bed
# featureBits hg16 miRNA
# 16923 bases of 2865248791 (0.001%) in intersection
# added an entry to trackDb/trackDb.ra: (good for Mm4 and Ce1 too)
track miRNA
shortLabel miRNA
longLabel MicroRNAs from the miRNA Registry
group genes
priority 63
visibility hide
useScore 1
color 255,64,64
type bed 8
url http://www.sanger.ac.uk/cgi-bin/Rfam/mirna/mirna_entry.pl?id=$$
# Note the useScore item.  This colors plus strand items in black
# and minus strand items in gray.  A rarely used option.
# This same track is in Rn3, Mm4 and Ce2 too.  Added
#   findBedPos(query, hgp, "miRNA");
# to lib/hgFind.c to allow searching for these items.
#5-WAY MULTIZ & PHYLO-HMM HUMAN/CHIMP/MOUSE/RAT/CHICKEN (3/19/04, kpollard)
# UPDATE WOODY BINARIES
ssh hgwdev
cd /cluster/data/woody
cvs update -dP
cd src
make
# make sure Makefile has INSTALLDIR = /cluster/bin/woody
make install

#MULTIZ to add chimp, then chicken to HUMOR (see above)
ssh kk
set fiveDir = /cluster/data/hg16/bed/multiz.hg16mm3rn3panTro1galGal2
mkdir -p $fiveDir/hmrp
mkdir -p $fiveDir/hmrpg
cd $fiveDir
#wrapper script for multiz
cat << EOF > mz
#!/bin/csh
/cluster/bin/penn/tbaBin/multiz \$1 \$2 - > \$3
EOF
chmod +x mz

#CHIMP
# put the MAFs on bluearc
ssh eieio
set clustDir = /cluster/bluearc/multiz.hg16mm3rn3panTro1galGal2
mkdir -p $clustDir/hp
mkdir -p $clustDir/hmr
cp /cluster/data/hg16/bed/humor.2003-09-08/hmr/*.maf $clustDir/hmr
cp /cluster/data/hg16/bed/blastz-blat.panTro1.lifted/mafRBestNet/*.maf $clustDir/hp
logout
# back to kk
#set up joblist (common denominator set: no chr19_random in hmr)
set clustDir = /cluster/bluearc/multiz.hg16mm3rn3panTro1galGal2
cd $fiveDir
rm -f jobList
foreach file ($clustDir/hmr/*.maf)
    set root=`echo $file:t:r | sed 's/\.hmr//'`
    echo "mz $clustDir/hp/${root}.maf $file $fiveDir/hmrp/${root}.maf" >> jobList
end
#run on kk
chmod +x jobList
para create jobList
#para try, para check, para push, etc.
#add chr19_random from hp to hmrp
cp /cluster/data/hg16/bed/blastz-blat.panTro1.lifted/mafRBestNet/chr19_random.maf $fiveDir/hmrp
#clean up bluearc
ssh eieio
set clustDir = /cluster/bluearc/multiz.hg16mm3rn3panTro1galGal2
rm -r $clustDir/hp
rm -r $clustDir/hmr

#CHICKEN
# put the MAFs on bluearc
ssh eieio
set fiveDir = /cluster/data/hg16/bed/multiz.hg16mm3rn3panTro1galGal2
set clustDir = /cluster/bluearc/multiz.hg16mm3rn3panTro1galGal2
mkdir -p $clustDir/hmrp
mkdir -p $clustDir/hg
cp $fiveDir/hmrp/*.maf $clustDir/hmrp
cp /cluster/data/hg16/bed/blastz.galGal2.2004-02-25/mafBest/*.maf $clustDir/hg
logout
# back to kk
logout
#move to kki
#set up job list 2
ssh kki
set fiveDir = /cluster/data/hg16/bed/multiz.hg16mm3rn3panTro1galGal2
set clustDir = /cluster/bluearc/multiz.hg16mm3rn3panTro1galGal2
cd $fiveDir
rm -f jobList.2
foreach file ($clustDir/hg/*.maf)
    set root=`echo $file:t:r | sed 's/\.hg//'`
    echo "mz $file $clustDir/hmrp/${root}.maf $fiveDir/hmrpg/${root}.maf" >> jobList.2
end
#run on kki
chmod +x jobList.2
para create jobList.2
#para try, para check, para push, etc.
# clean up bluearc
ssh eieio
rm -r /cluster/bluearc/multiz.hg16mm3rn3panTro1galGal2
logout

#PHYLO-HMM CONSERVATION
#Set path
set path = ($path /cluster/bin/woody)
#Create "sufficient statistics" (SS) file from maf
ssh eieio
cd /cluster/data/hg16/bed/multiz.hg16mm3rn3panTro1galGal2
mkdir phyloHMM
cd phyloHMM
# create script to run msa_view.
cat > makeSS.csh << 'EOF'
set path = ($path /cluster/bin/woody)
cd /cluster/data/hg16/bed/multiz.hg16mm3rn3panTro1galGal2/hmrpg
foreach f (chr*.maf)
    set c = $f:r
    echo "$c"
    msa_view $f -i MAF -o SS -s 1 -r 1 -O hg16,mm3,rn3,panTro1,galGal2 > \
	../phyloHMM/$c.ss
end
'EOF'
csh makeSS.csh >&! makeSS.log &
tail -100f makeSS.log
head phyloHMM/chr1.ss
head phyloHMM/chrY.ss
#model hpmrc_rev_dg.mod (from Adam)
set path = ($path /cluster/bin/woody)
cat hpmrc_rev_dg.mod
#ALPHABET: A C G T
#ORDER: 0
#SUBST_MOD: REV
#NRATECATS: 10
#ALPHA: 4.4
#BACKGROUND: 0.286083 0.213573 0.213691 0.286652
#RATE_MAT:
# -0.891523 0.166770 0.574850 0.149902
#  0.223389 -1.146311 0.153784 0.769137
#  0.769591 0.153699 -1.147159 0.223869
#  0.149605 0.573055 0.166888 -0.889548
#TREE: (((1:0.0056,2:0.0057):0.1043,(3:0.076303,4:0.083043):0.2753):0.47,5:0.47);
/cluster/data/woody/scripts/extract-tree.pl human,chimp,mouse,rat,chicken \
    hpmrc_rev_dg.mod
#(((human:0.0056,chimp:0.0057):0.1043,(mouse:0.076303,rat:0.083043):0.2753):0.47,chicken:0.47);
#order is human-chimp-mouse-rat-chicken, so fix maf order in next step
#break up the genome-wide MAFs into pieces
# NOTE: using the hg16 chr fasta files stashed on bluearc for hg16 humor
mkdir -p /cluster/bluearc/hg16/bed/multiz.hg16mm3rn3panTro1galGal2
cp /cluster/data/hg16/bed/multiz.hg16mm3rn3panTro1galGal2/hmrpg/*.maf /cluster/bluearc/hg16/bed/multiz.hg16mm3rn3panTro1galGal2
cd /cluster/data/hg16/bed/multiz.hg16mm3rn3panTro1galGal2/phyloHMM
cat << 'EOF' > doSplit
#!/bin/sh
WOODY=/cluster/bin/woody
FA_SRC=/cluster/bluearc/hg16/bed/humor
WINDOWS=/cluster/data/hg16/bed/multiz.hg16mm3rn3panTro1galGal2/phyloHMM/WINDOWS
maf=$1
c=`basename $maf .maf`
echo $c
mkdir -p /scratch/msa_split
${WOODY}/msa_split $maf -i MAF -M ${FA_SRC}/$c.fa -O hg16,panTro1,mm3,rn3,galGal2 -w 1000000,0 -r /scratch/msa_split/$c -o SS -I 1000 -d 1 -B 5000
echo "Copying..."
cd /scratch/msa_split
for file in $c.*.ss ; do gzip -c $file > ${WINDOWS}/$file.gz ; done
rm -f /scratch/msa_split/$c.*.ss
echo "Done copying"
'EOF'
chmod +x doSplit
mkdir -p WINDOWS
rm -f WINDOWS/* jobs.lst
foreach file (/cluster/bluearc/hg16/bed/multiz.hg16mm3rn3panTro1galGal2/*.maf)
    echo "doSplit $file" >> jobs.lst
end
#run on kki
ssh kki
cd /cluster/data/hg16/bed/multiz.hg16mm3rn3panTro1galGal2/phyloHMM
para create jobs.lst
# para try, para check, para push, etc.
logout
#compute the conservation scores
# NOTE: need to use gensub2, check out+ facilities to check for
# failures.  Will want to chunk msa_split output (above) into chr dirs
# to make the gensub template reasonable.
ssh kk
cd /cluster/data/hg16/bed/multiz.hg16mm3rn3panTro1galGal2/phyloHMM
cat << 'EOF' > doPostProbs
#!/bin/sh
WOODY=/cluster/bin/woody
TMP=/tmp/phyloHMMcons
file=$1
root=`basename $file .ss.gz`
chrom=`echo $root | awk -F\. '{print $1}'`
echo $chrom
mkdir -p $TMP
zcat $file | $WOODY/label -m - -d hpmrc_rev_dg.mod -i SS -o $TMP/$root -k 10 -L 0.9 -A -p 0 -j 1 -s $chrom -x
mkdir -p POSTPROBS/$chrom
gzip -c $TMP/$root.postprob > POSTPROBS/$chrom/$root.postprob.gz
rm $TMP/$root.postprob
'EOF'
# << this line makes emacs coloring happy
chmod +x doPostProbs
mkdir -p POSTPROBS
rm -f jobs2.lst
foreach file (WINDOWS/chr*.ss.gz)
    echo "doPostProbs $file" >> jobs2.lst
end
wc -l jobs2.lst
para create jobs2.lst
#para try, para check, para push, etc.
#1 problem: chr19_random crashed - due to no alignments in HMR.  Leave out.
# Create wiggle (.wib) file and load into database
ssh eieio
cd /cluster/data/hg16/bed/multiz.hg16mm3rn3panTro1galGal2/phyloHMM
mkdir wibLimits
mkdir wib
cat > makeWig.csh << 'EOF'
foreach dir (POSTPROBS/*)
    set chrom = $dir:t
    echo $chrom
    zcat `ls POSTPROBS/$chrom/*postprob.gz | sort -t\. -k2,2n` | \
	wigAsciiToBinary -chrom=$chrom \
	-dataSpan=1 -wibFile=wib/${chrom}_hpmrg_phyloHMM -name=hpmrg \
	stdin > wibLimits/${chrom}
end
'EOF'
# << this line makes emacs coloring happy
csh makeWig.csh >&! makeWig.log &
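# Note (added for context): wigAsciiToBinary writes a pair of files per
# chromosome under wib/ -- the packed data values (.wib) plus the row
# index (.wig) that hgLoadWiggle loads below -- and the per-chrom data
# range printed on stdout is captured in wibLimits/<chrom>.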
#load tables
ssh hgwdev
cd /cluster/data/hg16/bed/multiz.hg16mm3rn3panTro1galGal2/phyloHMM
hgLoadWiggle hg16 mzPt1Mm3Rn3Gg2_pHMM_wig wib/*_hpmrg_phyloHMM.wig
ln -s `pwd`/wib/chr*_hpmrg_phyloHMM.wib /gbdb/hg16/wib
chmod 775 . wib
chmod 664 wib/*.wib
cd /cluster/data/hg16/bed/multiz.hg16mm3rn3panTro1galGal2/hmrpg
mkdir -p /gbdb/hg16/mzPt1Mm3Rn3Gg2_pHMM
ln -s /cluster/data/hg16/bed/multiz.hg16mm3rn3panTro1galGal2/hmrpg/*.maf /gbdb/hg16/mzPt1Mm3Rn3Gg2_pHMM
hgLoadMaf hg16 -warn mzPt1Mm3Rn3Gg2_pHMM
cd /cluster/data/hg16/bed/blastz-blat.panTro1.lifted/mafRBestNet/
mkdir -p /gbdb/hg16/chimp_hmrg
ln -s /cluster/data/hg16/bed/blastz-blat.panTro1.lifted/mafRBestNet/*.maf /gbdb/hg16/chimp_hmrg
hgLoadMaf hg16 -warn chimp_hmrg
#cleanup bluearc
ssh eieio
rm -r /cluster/bluearc/hg16/bed/multiz.hg16mm3rn3panTro1galGal2
logout
#Add description file: mzPt1Mm3Rn3Gg2_pHMM.html
#Add track to trackDb.ra: mzPt1Mm3Rn3Gg2_pHMM
#Copy files to download area
cd /gbdb/hg16
set dir = /usr/local/apache/htdocs/goldenPath/hg16/mzPt1Mm3Rn3Gg2
mkdir $dir
ln -s $dir multiz
cp -p /gbdb/hg16/mzPt1Mm3Rn3Gg2_pHMM/*.maf $dir
cd $dir
gzip *
# edit downloads page to add links
# add pairwise mafs to downloads page
mkdir $dir/{rn3,mm3,pt1,gg2}
cd /cluster/data/hg16/bed/humor/maf
cp *.mm3.maf $dir/mm3
cp *.rn3.maf $dir/rn3
gzip $dir/mm3/*
gzip $dir/rn3/*
cd /cluster/data/hg16/bed/blastz.galGal2.2004-02-25/mafBest
cp *.maf $dir/gg2
gzip $dir/gg2/*
cd /cluster/data/hg16/bed/blastz-blat.panTro1.lifted/mafRBestNet/
cp *.maf $dir/pt1
gzip $dir/pt1/*

# EXONIPHY HMR
# (started, acs, 2004-03-23)
# (redone 2004-07-01, with new version of software; have revised
#  docs accordingly)
# Warning: some commands here require bash shell
ssh hgwdev
# (make sure /cluster/bin/phast is in path)
mkdir /cluster/store6/exoniphy.hg16mm3rn3.2004-03-23
cd /cluster/data/hg16/bed
ln -s /cluster/store6/exoniphy.hg16mm3rn3.2004-03-23
ln -s exoniphy.hg16mm3rn3.2004-03-23 exoniphy.hg16mm3rn3
# first, break up the genome-wide MAFs into pieces; it's worth doing
# this as a little cluster job
ssh eieio
mkdir -p /cluster/bluearc/hg16/bed/humor
cp /cluster/data/hg16/bed/humor/hmr/*.maf /cluster/data/hg16/?{,?}/chr*.fa /cluster/bluearc/hg16/bed/humor
logout
ssh kk
cd /cluster/data/hg16/bed/exoniphy.hg16mm3rn3
cat << '_EOF_' > doSplit
#!/bin/sh
PHAST=/cluster/bin/phast
FA_SRC=/cluster/bluearc/hg16/bed/humor
WINDOWS=/cluster/data/hg16/bed/exoniphy.hg16mm3rn3/WINDOWS
maf=$1
prefix=`basename $maf .hmr.maf`
chr=`echo $prefix | sed 's/chr//g ; s/_random//g'`
mkdir -p /scratch/msa_split
${PHAST}/msa_split $maf --in-format MAF --refseq ${FA_SRC}/$prefix.fa --order hg16,mm3,rn3 --windows 50000,2000 --out-root /scratch/msa_split/$prefix --out-format SS --min-informative 1000 --between-blocks 1000 --tuple-size 3
mkdir -p ${WINDOWS}/$chr
cd /scratch/msa_split
for file in `ls | egrep -w ${prefix}` ; do gzip -c $file > ${WINDOWS}/$chr/$file.gz ; rm $file ; done
_EOF_
# << this line makes emacs coloring happy
chmod +x doSplit
mkdir -p WINDOWS
rm -rf WINDOWS/* jobs.lst
for file in /cluster/bluearc/hg16/bed/humor/*.maf ; do echo "doSplit $file" >> jobs.lst ; done
para create jobs.lst
# etc ... (run cluster job)
# now set up cluster job for exoniphy.
cat << '_EOF_' > doExoniphy #!/bin/bash zcat $1 | /cluster/bin/phast/exoniphy - ${*:3} > $2 _EOF_ # << this line makes emacs coloring happy chmod +x doExoniphy rm -f jobs.lst for dir in WINDOWS/* ; do chrNo=`basename $dir` mkdir -p OUTPUT/$chrNo for file in $dir/* ; do base=`basename $file .ss.gz` chrStr=`echo $base | awk -F\. '{print $1}'` echo "doExoniphy $file OUTPUT/$chrNo/$base.gff --seqname $chrStr --idpref $base --score --indels --quiet " >> jobs.lst done done #[acs@kk exoniphy.hg16mm3rn3]$ wc jobs.lst # 59175 591750 7179445 jobs.lst para create jobs.lst # etc... (run cluster job) #Completed: 59175 of 59175 jobs #CPU time in finished jobs: 49361849s 822697.48m 13711.62h 571.32d 1.565 y #IO & Wait Time: 258451s 4307.52m 71.79h 2.99d 0.008 y #Average job time: 839s 13.98m 0.23h 0.01d #Longest job: 1868s 31.13m 0.52h 0.02d #Submission to last job: 75584s 1259.73m 21.00h 0.87d # create track logout ssh hgwdev cd /cluster/data/hg16/bed/exoniphy.hg16mm3rn3 for dir in OUTPUT/* ; do chrNo=`basename $dir` echo $chrNo find $dir -name "*.gff" | grep -v random > files if [ -s files ] ; then cat `cat files` | refeature - --unique --sort --include-only CDS,start_codon,stop_codon > chr$chrNo.gff ; fi find $dir -name "*.gff" | grep random > files if [ -s files ] ; then cat `cat files` | refeature - --unique --sort --include-only CDS,start_codon,stop_codon > chr${chrNo}_random.gff ; fi done ldHgGene -gtf -frame hg16 exoniphy chr*.gff #track exoniphy #shortLabel Exoniphy #longLabel Exoniphy: Conserved Exon Predictions (Human/Mouse/Rat) #group genes #priority 50.9 #visibility hide #color 173,17,162 #type genePred # # Load tfbsCons track DONE 2004-03-31 braney # set humordir=/gbdb/hg16/humorMm3Rn3 set transfacdir=/projects/compbio/data/transfac set outdir=hg16_tfbsCons ssh hgwdev mkdir /cluster/data/hg16/bed/tfbsCons cd /cluster/data/hg16/bed/tfbsCons # Get tfbsConsUtils.tar.gz from Matt Weirauch with Perl scripts weirauch@soe.ucsc.edu set tarfile=/cluster/data/hg15/bed/tfbsCons/tfbsConsUtils.tar.gz tar zxf $tarfile # the following takes days (says Matt) nice getTfbsConsData.pl `pwd` $humordir $transfacdir ./IDS.txt $outdir -over & cd $outdir rm chr*.bed hgLoadBed -noSort hg16 tfbsCons -sqlTable=$HOME/kent/src/hg/lib/tfbsCons.sql tfbsCons.bed -tab # Get mapping of ID's from Matt so we can link into the TRANSFAC database set idmap=/cluster/data/hg16/bed/tfbsCons/tfbsConsMap hgsql hg16 < ~/kent/src/hg/lib/tfbsConsMap.sql echo "load data local infile '$idmap' into table tfbsConsMap;" | hgsql hg16 # PREP FOR LIFTOVER CHAINS TO HG16 (2004-04-12 kate) # split into 3K chunks ssh eieio set tempDir = /cluster/bluearc/hg/gs.17/build34/liftOver cd $tempDir mkdir lift cat > split.csh << 'EOF' set scratch = /iscratch/i/gs.17/build34/liftOver/split mkdir -p $scratch foreach i (1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 X Y M) echo chr$i faSplit -lift=lift/chr$i.lft size /cluster/data/hg16/$i/chr$i.fa -oneFile 3000 $scratch/chr$i end 'EOF' csh split.csh >&! 
split.log &
tail -100f split.log
/cluster/bin/iSync

# ECORES FROM GENOSCOPE [DONE, hartera, 2004-03-31]
# download data from http://www.genoscope.cns.fr/externe/tetraodon/Data3/ecores
# ecotigHF - ecores on Human, genome conserved with Fugu, Fr1
# ecotigHT - ecores on Human, genome conserved with Tetraodon (March 2004)
ssh hgwdev
mkdir /cluster/data/hg16/bed/ecores/
# add parse_ecotig.pl to this directory
# FUGU
mkdir /cluster/data/hg16/bed/ecores/fr1
cd /cluster/data/hg16/bed/ecores/fr1/
# download data for ecotigHF to this directory
# parse ecotig files to produce a bed format file
perl ../parse_ecotig.pl < ecotigHF > ecotigHF.bed
# change from upper to lower case for "CHR"
perl -pi.bak -e 's/CHR/chr/g' ecotigHF.bed
hgLoadBed -tab hg16 ecoresFr1 ecotigHF.bed
# clean up
rm *.bak
# TETRAODON
mkdir /cluster/data/hg16/bed/ecores/tetraodon
cd /cluster/data/hg16/bed/ecores/tetraodon/
# download data for ecotigHT to this directory
# parse ecotig files to produce a bed format file
perl ../parse_ecotig.pl < ecotigHT > ecotigHT.bed
# change from upper to lower case for "CHR"
perl -pi.bak -e 's/CHR/chr/g' ecotigHT.bed
hgLoadBed -tab hg16 ecoresTetraodon ecotigHT.bed
# clean up
rm *.bak
# add entries in kent/src/hg/makeDb/trackDb/human/hg16/trackDb.ra
# add html for details pages to this directory:
# ecoresFr1.html and ecoresTetraodon.html

# VNTR MICROSATELLITE REPEATS FROM GEROME BREEN (DONE 4/28/04 angie)
ssh hgwdev
mkdir /cluster/data/hg16/bed/vntr
cd /cluster/data/hg16/bed/vntr
# saved email attachment from Gerome Breen
# as HumJuly2003microsats_finished_for_angieH.txt
# Replace 1-based start coords with 0-based, tweak n/a distance values:
tail +2 HumJuly2003microsats_finished_for_angieH.txt \
    | perl -wpe 's/(first|last) in chromosome\/sequence/-1/i' \
    | awk '{printf "%s\t%d\t%d\t%s\t%s\t%d\t%s\t%s\t%s\t%s\n", $1, $2-1, $3, $4, $5, $6, $7, $8, $9, $10;}' \
    > vntr.bed
hgLoadBed -tab -sqlTable=$HOME/kent/src/hg/lib/vntr.sql hg16 \
    vntr vntr.bed

# WEBB'S PUTATIVE NON-EXONIC CONSERVED REGIONS (DONE 4/6/04 angie)
ssh hgwdev
mkdir /cluster/data/hg16/bed/webbNonExonic
cd /cluster/data/hg16/bed/webbNonExonic
wget http://bio.cse.psu.edu/~webb/nonexonic.tar.gz
tar xvzf nonexonic.tar.gz
# Score should really be scaled from 5k..276k --> 200-1000
cat chr* \
    | awk '{printf "%s\t%d\t%d\t%s:%d-%d\t%d\t%c\n", $2, $3-1, $4, $5, $6, $7, $9, $8;}' \
    > webbNonExonic.bed
hgLoadBed hg16 webbNonExonic webbNonExonic.bed

# phylo HMM data quantile calculation
ssh eieio
cat << '_EOF_' > /tmp/allpHMMdata.sh
#!/bin/sh
# there is only an empty file in chr13_random; it causes all
# files following it on the xargs zcat line to be missed.
# Eliminate it from the processing.
find ./POSTPROBS -type f | grep -v chr13_random | sort -t\. -k2,2n | \
    xargs zcat | awk '{print $2}' > /tmp/pHMM.data
'_EOF_'
chmod +x /tmp/allpHMMdata.sh
cd /cluster/data/hg16/bed/phyloHMMcons.hg16mm3rn3.2003-11-11
time /tmp/allpHMMdata.sh

# Create top 5 % set of data for phyloHMMcons.hg16mm3rn3.2003-11-11
# (DONE - 2004-05-15 - Hiram)
cd /cluster/data/hg16/bed/phyloHMMcons.hg16mm3rn3.2003-11-11
cat << '_EOF_' > top5.sh
#!/bin/sh
#
# Do not work on chr13_random, it has no data
# this for loop should have been:
#	ls POSTPROBS/chr* | sort -t\. -k2,2n | while read i
# to get the data in properly sorted order.  With this as is,
# we will need to sort the coords later to make any wiggle
# track out of this data
#
mkdir top5_data
for i in POSTPROBS/chr*
do
    c=${i/POSTPROBS\//}
    echo $i $c
    if [ "$c" != "chr13_random" ]; then
	if [ ! -f top5_data/$c.ascii.gz ]; then
	    find ${i} -type f | sort -t\. -k2,2n | while read FN
	    do
		zcat ${FN}
	    done | awk '{if ($2 > 0.450) print}' > top5_data/$c.ascii
	    rm -f top5_data/$c.ascii.gz
	    gzip top5_data/$c.ascii &
	else
	    ls -og top5_data/$c.ascii.gz
	fi
    fi
done
'_EOF_'
chmod +x top5.sh
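# Added note (assumption, not stated in the original): the 0.450 cutoff in
# top5.sh is the posterior-probability threshold that passes roughly the
# top 5% of the genome-wide values gathered by allpHMMdata.sh; it would
# need to be re-derived (e.g. from the histogram below) if the data changed.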
# running this script takes several hours, make sure you do it
# on the file server
ssh eieio
cd /cluster/data/hg16/bed/phyloHMMcons.hg16mm3rn3.2003-11-11
# Then, to make the histogram data:
cd /cluster/data/hg16/bed/phyloHMMcons.hg16mm3rn3.2003-11-11/top5_data
cat << '_EOF_' > mkHisto.sh
#!/bin/sh
for f in chr*.ascii.gz
do
    zcat $f
done | textHistogram -real -col=2 -binSize=0.001 -maxBinCount=1000 stdin
'_EOF_'
chmod +x mkHisto.sh
./mkHisto.sh > histoGram.data

# BLASTZ FUGU (FR1) (DONE 4/19/04 angie)
ssh kk
# space is awful tight on store4 -- use store7.
mkdir -p /cluster/store7/hg16/bed/blastz.fr1.2004-04-19
ln -s /cluster/store7/hg16/bed/blastz.fr1.2004-04-19 \
    /cluster/data/hg16/bed/
cd /cluster/data/hg16/bed/blastz.fr1.2004-04-19
# Set L=6000 (more relaxed than chicken) and abridge repeats.
# Treat all repeats as lineage-specific (reuse linSpecRep.Chicken).
cat << '_EOF_' > DEF
# human vs. fugu
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/i386:/cluster/home/angie/schwartzbin
ALIGN=blastz-run
BLASTZ=blastz
# Reuse parameters from human-chicken.
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
BLASTZ_ABRIDGE_REPEATS=1
# TARGET: Human
SEQ1_DIR=/iscratch/i/gs.17/build34/bothMaskedNibs
SEQ1_RMSK=
SEQ1_FLAG=
SEQ1_SMSK=/iscratch/i/gs.17/build34/linSpecRep.Chicken
SEQ1_IN_CONTIGS=0
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Fugu
SEQ2_DIR=/iscratch/i/fr1/nib
SEQ2_RMSK=
SEQ2_FLAG=
SEQ2_SMSK=/iscratch/i/fr1/linSpecRep
SEQ2_IN_CONTIGS=0
SEQ2_CHUNK=10000000
SEQ2_LAP=0
BASE=/cluster/store7/hg16/bed/blastz.fr1.2004-04-19
DEF=$BASE/DEF
RAW=$BASE/raw
CDBDIR=$BASE
SEQ1_LEN=$BASE/S1.len
SEQ2_LEN=$BASE/S2.len
'_EOF_'
# << this line keeps emacs coloring happy
bash # if a csh/tcsh user
source DEF
mkdir $RAW run.0
/cluster/home/angie/hummus/make-joblist $DEF > $BASE/run.0/j
sh ./xdir.sh
cd run.0
sed -e 's@^blastz-run@/cluster/bin/penn/blastz-run@' j > jobList
para create jobList
para try, check, push, check, ....
#Completed: 11865 of 11865 jobs
#Average job time: 414s 6.90m 0.11h 0.00d
#Longest job: 709s 11.82m 0.20h 0.01d
#Submission to last job: 5678s 94.63m 1.58h 0.07d

# second cluster run: lift raw alignments -> lav dir
ssh kki
cd /cluster/data/hg16/bed/blastz.fr1.2004-04-19
bash # if a csh/tcsh user
source DEF
mkdir run.1 lav
/cluster/bin/scripts/blastz-make-out2lav $DEF $BASE > $BASE/run.1/jobList
cd run.1
wc -l jobList
para create jobList
para try, check, push, etc ...
#Completed: 339 of 339 jobs #Average job time: 4s 0.07m 0.00h 0.00d #Longest job: 19s 0.32m 0.01h 0.00d #Submission to last job: 91s 1.52m 0.03h 0.00d # third run: lav -> axt ssh kki cd /cluster/data/hg16/bed/blastz.fr1.2004-04-19 mkdir axtChrom pslChrom run.2 cd run.2 cat << '_EOF_' > do.csh #!/bin/csh -ef cd $1 set chr = $1:t cat `ls -1 *.lav | sort -g` \ | $HOME/bin/x86_64/lavToAxt stdin \ /iscratch/i/gs.17/build34/bothMaskedNibs /iscratch/i/fr1/nib stdout \ | $HOME/bin/x86_64/axtSort stdin ../../axtChrom/$chr.axt $HOME/bin/x86_64/axtToPsl ../../axtChrom/$chr.axt ../../S1.len ../../S2.len \ ../../pslChrom/$chr.psl '_EOF_' # << this line keeps emacs coloring happy chmod a+x do.csh cp /dev/null jobList foreach d (../lav/chr*) echo "do.csh $d" >> jobList end para create jobList para try, check, push, check #Completed: 42 of 42 jobs #Average job time: 16s 0.26m 0.00h 0.00d #Longest job: 75s 1.25m 0.02h 0.00d #Submission to last job: 80s 1.33m 0.02h 0.00d # CHAIN FUGU BLASTZ (REDONE 10/1/04 angie) # NOTE: originally done 4/19, but with a buggy axtChain. # axtChain dir moved aside to axtChain.orig before rebuilding. # Run axtChain on little cluster ssh kki cd /cluster/data/hg16/bed/blastz.fr1.2004-04-19 mkdir -p axtChain/run1 cd axtChain/run1 mkdir out chain # Check size>0 for .axt files (empty inputs cause out line+ check to fail): cp /dev/null input.lst foreach f (`ls -1S /cluster/data/hg16/bed/blastz.fr1.2004-04-19/axtChrom/*.axt`) if (-s $f) then echo $f >> input.lst endif end cat << '_EOF_' > gsub #LOOP doChain {check in exists $(path1)} {check out line+ chain/$(root1).chain} {check out line+ out/$(root1).out} #ENDLOOP '_EOF_' # << this line makes emacs coloring happy # Reuse gap penalties from chicken run. cat << '_EOF_' > doChain #!/bin/csh axtChain -scoreScheme=/cluster/data/blastz/HoxD55.q \ -linearGap=/cluster/data/blastz/chickenHumanTuned.gap \ -minScore=5000 $1 \ /iscratch/i/gs.17/build34/bothMaskedNibs \ /iscratch/i/fr1/nib $2 > $3 '_EOF_' # << this line makes emacs coloring happy chmod a+x doChain gensub2 input.lst single gsub jobList para create jobList para try, check, push, check... #Completed: 41 of 41 jobs #Average job time: 26s 0.44m 0.01h 0.00d #Longest job: 121s 2.02m 0.03h 0.00d #Submission to last job: 121s 2.02m 0.03h 0.00d # now on the cluster server, sort chains ssh kksilo cd /cluster/data/hg16/bed/blastz.fr1.2004-04-19/axtChain chainMergeSort run1/chain/*.chain > all.chain chainSplit chain all.chain rm run1/chain/*.chain # Load chains into database ssh hgwdev cd /cluster/data/hg16/bed/blastz.fr1.2004-04-19/axtChain/chain foreach i (*.chain) set c = $i:r echo loading $c hgLoadChain hg16 ${c}_chainFr1 $i end # NET FUGU BLASTZ (REDONE 10/1/04 angie) # NOTE: originally done 4/19, but with results of a buggy axtChain. 
ssh kksilo cd /cluster/data/hg16/bed/blastz.fr1.2004-04-19/axtChain chainPreNet all.chain ../S1.len ../S2.len stdout \ | chainNet stdin -minSpace=1 ../S1.len ../S2.len stdout /dev/null \ | netSyntenic stdin noClass.net # Add classification info using db tables: ssh hgwdev cd /cluster/data/hg16/bed/blastz.fr1.2004-04-19/axtChain netClass noClass.net hg16 fr1 fugu.net # Make a 'syntenic' subset with netFilter: ssh kksilo cd /cluster/data/hg16/bed/blastz.fr1.2004-04-19/axtChain rm noClass.net netFilter -syn fugu.net > fuguSyn.net # Load the nets into database ssh hgwdev cd /cluster/data/hg16/bed/blastz.fr1.2004-04-19/axtChain netFilter -minGap=10 fugu.net | hgLoadNet hg16 netFr1 stdin netFilter -minGap=10 fuguSyn.net | hgLoadNet hg16 netSyntenyFr1 stdin # LIFTOVER CHAIN TO FUGU FR1 (DONE 2004-09-28 kate) ssh kolossus cd /cluster/data/hg16/bed/blastz.fr1/axtChain time netChainSubset fugu.net all.chain \ /cluster/data/hg16/bed/liftOver/hg16ToFr1.chain # RUN AXTBEST (DONE 4/20/04 angie) # Webb asked for axtBest too... ssh kolossus cd /cluster/data/hg16/bed/blastz.fr1.2004-04-19 mkdir axtBest foreach f (axtChrom/*.axt) set chr=$f:t:r echo axtBesting $chr axtBest $f $chr axtBest/$chr.axt -minScore=300 end # H-INVITATIONAL GENE ANNOTATION DATABASE (2004-04-29 kate) # https://www.jbirc.aist.go.jp/hinv/top.html # Create knownGene table to reference HINV gene ID's # for link on knownGenes details page # Also, create an HINV gene track, just to look at # (probably won't publish, as these are just mRNA alignments # already visible on browser). # download CDNA file (release 1.0) ssh kksilo mkdir /cluster/data/hinv cd /cluster/data/hinv wget http://www.jbirc.aist.go.jp/hinv/download/alldata/flatfile/FCDNA.gz gunzip FCDNA.gz mv FCDNA FCDNA.1.0 # set up assembly work area ssh eieio cd /cluster/data/hg16 mkdir -p bed/hinv cd bed/hinv # extract H-INV ID's and Genbank accessions of mRNAs awk '/CDNA_ACCESSION-NO:/ {print $2}' < /cluster/data/hinv/FCDNA.1.0 \ > accessions.txt awk '/CDNA_H-INVITATIONAL-ID:/ {print $2}' < /cluster/data/hinv/FCDNA.1.0 \ > ids.txt paste accessions.txt ids.txt > queries.txt # create PSL file from alignments for these mRNA's, extracted from the # table of all aligned mRNA's hgsql hg16 -s -e "SELECT * FROM all_mrna" | cut -f 2- > all_mrna.tab pslReps /dev/null stdout /dev/null | cat - all_mrna.tab > all_mrna.psl # using pslReps to generate the PSL file header pslSelect -queryPairs=queries.txt all_mrna.psl hinv_mrna.psl # load track of mrna alignments ssh hgwdev cd /cluster/data/hg16/bed/hinv hgLoadPsl hg16 -table=HInvGeneMrna hinv_mrna.psl # also make a gene track using the genomic exon coordinates for build34 # in the FCDNA file. NOTE: not all of the genes have these ssh kksilo cd /cluster/data/hg16/bed/hinv /cluster/data/hinv/hinvToGff.pl < /cluster/data/hinv/FCDNA.1.0 > hinv.gff ssh hgwdev cd /cluster/data/hg16/bed/hinv ldHgGene hg16 HInvGene hinv.gff # Read 40140 transcripts # TrackDb for this # track HInvGene # shortLabel H-INV Gene # longLabel H-Invitational Genes # group genes # priority 37 # visibility hide # color 0,100,180 # type genePred . 
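# (Optional sanity check, not part of the original build: the number of
# distinct mRNAs loaded into HInvGeneMrna should be close to the number
# of pairs in queries.txt; some accessions may simply have no alignment
# in all_mrna.)
hgsql hg16 -N -e 'select count(distinct qName) from HInvGeneMrna'
wc -l /cluster/data/hg16/bed/hinv/queries.txt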
# also make a table with various useful items for each gene ssh hgwdev hgsql hg16 < ~/kent/src/hg/lib/HInv.sql cd /cluster/data/hg16/bed/hinv /cluster/data/hinv/hinvToTable.pl < /cluster/data/hinv/FCDNA.1.0 > HInv.tab echo 'load data local infile "HInv.tab" into table HInv' | hgsql hg16 # create table for knownGenes detail page ssh hgwdev cd /cluster/data/hg16/bed/hinv hgMapToGene hg16 HInvGeneMrna knownGene knownToHInv # GENERATE GALGAL2 MAF FOR MULTIZ FROM NET (DONE 5/10/04 angie) ssh kksilo cd /cluster/data/hg16/bed/blastz.galGal2.2004-02-25/axtChain netSplit human.net net ssh kolossus cd /cluster/data/hg16/bed/blastz.galGal2.2004-02-25 mkdir axtNet foreach f (axtChain/net/*) set chr = $f:t:r netToAxt $f axtChain/chain/$chr.chain /cluster/data/hg16/nib \ /cluster/data/galGal2/nib stdout \ | axtSort stdin axtNet/$chr.axt end mkdir mafNet foreach f (axtNet/chr*.axt) set maf = mafNet/$f:t:r.hg.maf axtToMaf $f \ /cluster/data/hg16/chrom.sizes /cluster/data/galGal2/chrom.sizes \ $maf -tPrefix=hg16. -qPrefix=galGal2. end # MULTIZ HUMAN/MOUSE/RAT/GALGAL2 WITH NET MAF FOR ALL (DONE 5/10/04 angie) # (galGal2 net maf added to human/mouse/rat alignments described above [HUMOR]) # put the MAFs on bluearc ssh eieio mkdir -p /cluster/bluearc/multiz.hg16mm3rn3galGal2.allNet/hmr mkdir -p /cluster/bluearc/multiz.hg16mm3rn3galGal2.allNet/hg cp /cluster/data/hg16/bed/humor.2003-09-08/hmr/*.maf \ /cluster/bluearc/multiz.hg16mm3rn3galGal2.allNet/hmr cp /cluster/data/hg16/bed/blastz.galGal2.2004-02-25/mafNet/*.maf \ /cluster/bluearc/multiz.hg16mm3rn3galGal2.allNet/hg ssh kki mkdir /cluster/data/hg16/bed/multiz.hg16mm3rn3galGal2.allNet cd /cluster/data/hg16/bed/multiz.hg16mm3rn3galGal2.allNet mkdir hmrg # Wrapper script required because of stdout redirect: cat << '_EOF_' > doMultiz #!/bin/csh /cluster/bin/penn/multiz $1 $2 - > $3 '_EOF_' # << this line makes emacs coloring happy chmod a+x doMultiz rm -f jobList foreach file (/cluster/bluearc/multiz.hg16mm3rn3galGal2.allNet/hmr/*.maf) set root=$file:t:r:r echo "doMultiz /cluster/bluearc/multiz.hg16mm3rn3galGal2.allNet/hg/${root}.hg.maf $file /cluster/data/hg16/bed/multiz.hg16mm3rn3galGal2.allNet/hmrg/${root}.maf" >> jobList end para create jobList para try, check, push, check #Completed: 40 of 41 jobs #Crashed: 1 jobs #Average job time: 84s 1.40m 0.02h 0.00d #Longest job: 267s 4.45m 0.07h 0.00d #Submission to last job: 290s 4.83m 0.08h 0.00d # The crash was due to empty hg/chr18_random.hg.maf -- OK. # clean up bluearc (these are big files!) rm -r /cluster/bluearc/multiz.hg16mm3rn3galGal2.allNet # put this out there for Glenn Tesler (not a browser track!) ssh eieio cd /cluster/data/hg16/bed/multiz.hg16mm3rn3galGal2.allNet/hmrg gzip * ssh hgwdev mkdir /usr/local/apache/htdocs/angie/hg16.multizMm3Rn3GalGal2.allNet foreach f (/cluster/data/hg16/bed/multiz.hg16mm3rn3galGal2.allNet/hmrg/*) ln -s $f /usr/local/apache/htdocs/angie/hg16.multizMm3Rn3GalGal2.allNet end # EPONINE TSS PREDICTION (DONE 5/21/04 angie) # Eponine runs fine on 2.5Mb contig, but barfs on much larger contig; # chop up sequence at gaps into ~2.5Mb chunks for cluster run. 
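# (Aside, not from the original run: faSplit's "gap" mode cuts at gaps of
# minGapSize or larger, so the chunks for a contig should add back up to
# roughly the contig's size.  A spot check on one hypothetical contig,
# NT_004321, could look like:
#   faSize /cluster/bluearc/hg16/chunks/NT_004321*.fa
#   faSize ?{,?}/NT_004321/NT_004321.fa
# and the per-contig .lft file written below maps chunk coordinates back
# to contig coordinates for liftUp.)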
ssh eieio mkdir /cluster/bluearc/hg16/chunks cd /cluster/data/hg16 # Note: faSplit seems to ignore the ".chunk_" suffix below: foreach f (?{,?}/NT_*/NT_??????.fa) set ctg = $f:t:r faSplit -minGapSize=10 -lift=/cluster/bluearc/hg16/chunks/$ctg.lft \ gap $f 2500000 /cluster/bluearc/hg16/chunks/$ctg.chunk_ end mkdir /cluster/data/hg16/bed/eponine cd /cluster/data/hg16/bed/eponine wget http://www.sanger.ac.uk/Software/analysis/eponine/eponine-scan.jar cat << '_EOF_' > doEpo #!/bin/csh set path=(/usr/java/j2re1.4.1_01/bin $path) java -jar ./eponine-scan.jar -threshold 0.999 -seq $1 > $2 '_EOF_' # << this line makes emacs coloring happy chmod a+x doEpo cp /dev/null jobList foreach f (/cluster/bluearc/hg16/chunks/NT*.fa) echo "./doEpo {check in line+ $f} {check out exists out/$f:t:r.gff}" \ >> jobList end mkdir out ssh kk9 cd /cluster/data/hg16/bed/eponine para create jobList para try, check, push, check, ... #Completed: 1588 of 1588 jobs #Average job time: 208s 3.47m 0.06h 0.00d #Longest job: 447s 7.45m 0.12h 0.01d #Submission to last job: 3591s 59.85m 1.00h 0.04d # lift chunks -> contigs mkdir contigs/ foreach l (/cluster/bluearc/hg16/chunks/*.lft) set ctg = $l:t:r liftUp contigs/$ctg.gff $l warn out/${ctg}*.gff end # lift contigs -> chrom liftUp eponine.gff ../../jkStuff/liftAll.lft warn contigs/NT_*.gff # Translate to bed 4 + float-score -- it would be a shame to lose # those scores in genePred or bed 5 (int score) awk 'BEGIN {i=0;} \ {printf "%s\t%d\t%d\t%s.%d\t%s\t%s\n", $1, $4-1, $5, $1, i, $6, $7; \ i = i + 1;}' \ eponine.gff > eponine.bed # load up ssh hgwdev cd /cluster/data/hg16/bed/eponine sed -e 's/bed6FloatScore/eponine/g' \ $HOME/kent/src/hg/lib/bed6FloatScore.sql > eponine.sql hgLoadBed hg16 eponine eponine.bed -tab -sqlTable=eponine.sql # RELOAD ENSEMBL GENES WITH VERSION 34d (DONE 2004/05/20 baertsch) # save current tables, just in case. rename table ensGene to ensGene_old; rename table ensGtp to ensGtp_old; rename table ensPep to ensPep_old; mkdir /cluster/data/hg16/bed/ensembl34d cd /cluster/data/hg16/bed/ensembl34d # Get the ensembl gene data from # http://www.ensembl.org/Homo_sapiens/martview # Follow this sequence through the pages: # Page 1) Make sure that the Homo_sapiens choice is selected. Hit next. # Page 2) Uncheck the "Limit to" box in the region choice. Then hit next. # Page 3) Choose the "Structures" box. # Page 4) Choose GTF as the output, choose gzip compression, hit export. # Save as ensbuild34d.gff.gz # Ensembl handles random chromosomes differently than us, so we # strip this data. Fortunately it just loses a couple of genes. # Add "chr" to front of each line in the gene data gtf file to make # it compatible with our software. # Finally, get rid of the ".1" or ".2" after the name zcat ensbuild34d.gff.gz \ | grep -v ^6_DR51 \ | grep -v ^DR51 \ | grep -v _NT_ \ | perl -wpe 's/^([0-9]|X|Y|Un)/chr$1/ \ || die "Line $. doesnt start with human chrom:\n$_"' \ | sed -e 's/\..\"/\"/g' \ > ensGene.gtf ssh hgwdev /cluster/bin/i386/ldHgGene -gtf -genePredExt hg16 ensGene \ /cluster/data/hg16/bed/ensembl34d/ensGene.gtf # ensGtp associates geneId/transcriptId/proteinId for hgPepPred and # hgKnownToSuper. Use ensMart to create it as above, except: # Page 3) Choose the "Features" box. In "Ensembl Attributes", check # Ensembl Gene ID, Ensembl Transcript ID, Ensembl Peptide ID. # Choose Text, tab-separated as the output format. Result name ensGtp. 
# Save file as ensGtp.txt.gz gunzip ensGtp.txt.gz hgsql hg16 < ~/kent/src/hg/lib/ensGtp.sql echo "load data local infile 'ensGtp.txt' into table ensGtp" | hgsql -N hg16 gzip ensGtp.txt # Load Ensembl peptides: # Get them from ensembl as above in the gene section except for # Page 3) Choose the "Sequences" box. # Page 4) Transcripts/Proteins. Peptide. Format = FASTA. # Save file as ensemblPep.fa.gz zcat ensemblPep.fa.gz | hgPepPred hg16 ensembl stdin # compare size of old and new tables as a sanity check drop table ensGene_old; drop table ensGtp_old; drop table ensPep_old; # Create knownToEnsembl column hgMapToGene hg16 ensGene knownGene knownToEnsembl #### BUILD Ensembl cross-reference table, ensemblXref3 (DONE - 2004-05-24 - Fan) # Get the ensembl gene/protein cross-reference data from # http://www.ensembl.org/Homo_sapiens/martview # Follow this sequence through the pages: # Page 1) Make sure that the Homo_sapiens choice is selected. Hit next. # Page 2) Uncheck the "Limit to" box in the region choice. Then hit next. # Page 3) Choose the "Features" box, select gene, transcript, protein, SPTrEMBL ID, SWISSPROT ID, and SWISSPROT AC # Page 4) Choose "Text, tab separated", choose gzip compression, hit export. # Save as ensXref.txt sed ensXref.txt -e 's/\./\t/g' > ensemblXref3.tab hgsql hg16 -e "drop table ensemblXref3" hgsql hg16 < ~/src/hg/lib/ensemblXref3.sql hgsql hg16 -e 'load data local infile "ensemblXref3.tab" into table ensemblXref3 ignore 1 lines' #### REBUILD SUPERFAMILY RELATED TABLES (DONE - 2004-05-21 - Fan) # Download Superfamily data files and build the Superfamily DB # from supfam.mrc-lmb.cam.ac.uk mkdir /cluster/store1/superFamily/040516 cd /cluster/store1/superFamily/040516 # ftp over the following two files: ass_16-May-2004.tab.gz supfam_16-May-2004.sql.gz # This may take about an hour. hgsql hg16 -e "create database superfam040516" hgsql superfam040516 < supfam_16-May-2004.sql # Make sure to add an index on id of the des table of superfam040516. hgsql superfam040516 < ~/src/hg/lib/sfAssign.sql hgsql superfam040516 -e 'load data local infile "ass_16-May-2004.tab" into table superfam040516.sfAssign;' # Build or rebuild Superfamily track and create sf tables needed for PB hgsql hg16 < ~/src/hg/lib/sfAssign.sql cd /cluster/store1/superFamily/040516 hgsql hg16 -e 'load data local infile "ass_16-May-2004.tab" into table hg16.sfAssign;' # If hg16.sfDes already exists, drop it. hgsql superfam040516 -e "select * from des" >sfDes.tab hgsql hg16 < ~/src/hg/lib/sfDes.sql hgsql hg16 -e 'load data local infile "sfDes.tab" into table hg16.sfDes ignore 1 lines;' # If hg16.superfamily already exists, drop it. hgSuperfam hg16 > sf.log # It is normal that many proteins do not have corresponding Superfamily entries. # If hg16.sfDescription exists, drop it. hgsql hg16 < ~/src/hg/lib/sfDescription.sql hgsql hg16 -e 'LOAD DATA local INFILE "sfDescription.tab" into table hg16.sfDescription;' # Finally, load the superfamily table. 
hgLoadBed hg16 superfamily superfamily.tab -tab # Create knownToSuperfamily table cat /cluster/store1/superFamily/040516/ass_16-May-2004.tab \ | hgKnownToSuper hg16 hs stdin # creates 32542 rows in knownToSuper # seq table acc field is too small; up the max to match new hgLoadSeq # schema (2004/05/22 markd) alter table seq modify column `acc` varchar(128) NOT NULL default ''; #### Blat knownGene proteins to determine exons (braney 2004-06-02) ssh kk mkdir -p /cluster/data/hg16/bed/blat.hg16KG.2004-05-27 cd /cluster/data/hg16/bed rm blat.hg16KG ln -s blat.hg16KG.2004-05-27 blat.hg16KG cd blat.hg16KG pepPredToFa hg16 knownGenePep known.fa hgPepPred hg16 generic blastKGPep00 known.fa cat << '_EOF_' > blatSome #!/bin/csh -fe /cluster/bin/i386/blat -t=dnax -q=prot -out=pslx $1 $2 $3 '_EOF_' ls -1S /scratch/hg/gs.17/build34/bothMaskedNibs/*.nib > human.lst mkdir kgfa cd kgfa faSplit sequence ../known.fa 300 kg ls -1S kgfa/*.fa > kg.lst cat << '_EOF_' > blatGsub #LOOP blatSome $(path1) {check in line $(path2)} {check out line psl/$(root1)/$(root2).psl} #ENDLOOP '_EOF_' gensub2 human.lst kg.lst blatGsub blatSpec mkdir psl cd psl foreach i (`cat ../human.lst`) mkdir `basename $i .nib` end para create blatSpec para push # Completed: 12222 of 12222 jobs # CPU time in finished jobs: 23286365s 388106.09m 6468.43h 269.52d 0.738 y # IO & Wait Time: 710342s 11839.03m 197.32h 8.22d 0.023 y # Average job time: 1963s 32.72m 0.55h 0.02d # Longest job: 106239s 1770.65m 29.51h 1.23d # Submission to last job: 106248s 1770.80m 29.51h 1.23d pslSort dirs raw.psl /tmp psl/* pslReps -nohead -minCover=0.9 -minAli=0.9 raw.psl cooked.psl /dev/null sort -rn cooked.psl | pslUniq stdin hg16KG.psl pslxToFa hg16KG.psl hg16KG_ex.fa -liftTarget=genome.lft -liftQuery=protein.lft kgName hg16 hg16KG.psl blastKGRef00 ssh hgwdev cd /cluster/data/hg16/bed/blat.hg16KG hgsql hg16 < ~/kent/src/hg/lib/blastRef.sql echo "rename table blastRef to blastKGRef00" | hgsql hg16 echo "load data local infile 'blastKGRef00' into table blastKGRef00" | hgsql hg16 ### RUN BLASTZ VS. MACACA MULATTA # get sequence from trace repository cd /cluster/bluearc/macaca for i in 01 02 03 04 05 06 07 08 09 10 11 12 13 14 ; do echo $i ; wget ftp://ftp.ncbi.nih.gov/pub/TraceDB/macaca_mulatta/fasta.macaca_mulatta.0$i.gz ; done # distribute contigs to bluearc and /iscratch/i for cluster run # split the sequence into ~10Mb chunks (about 13k reads per file) ssh kksilo mkdir -p /cluster/bluearc/macaca/split for i in 001 002 003 004 005 006 007 008 009 010 011 012 013 014 ; do faSplit about macacca_mulatta.$i.fa 10000000 split/$i/mac ; done find split -name \*.fa > mac.lst hgsql hg16 -N < chromLen.sql > S1.len # flatten directory structure for Angie's scripts for i in `ls` ; do cd /iscratch/i/macaca/$i ; for j in `ls` ; do mv $j ../$i.$j ; done ; done ssh kkr1u00 mkdir -p /iscratch/i/macaca/ df /iscratch/i cp /cluster/bluearc/macaca/split/* /iscratch/i/macaca /cluster/bin/scripts/iSync # make DEF file for blastz ssh kksilo cd /cluster/bluearc/macaca # NOTE: need schwartzbin below for utils still not in penn bin cat << '_EOF_' > DEF # human vs. 
macaca mulatta export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/home/angie/schwartzbin:/cluster/home/kent/bin/i386 ALIGN=blastz-run BLASTZ=blastz BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_T=2 BLASTZ_K=4500 BLASTZ_Q=/cluster/data/blastz/human_mulatta.q BLASTZ_ABRIDGE_REPEATS=0 SEQ1_DIR=/scratch/hg/gs.17/build34/bothMaskedNibs/ SEQ1_RMSK=/scratch/hg/gs.17/build34/rmsk/ SEQ1_SMSK= SEQ1_FLAG=-primate SEQ1_IN_CONTIGS=0 SEQ1_CHUNK=10000000 SEQ1_LAP=10000 SEQ2_DIR=/iscratch/i/macaca/ SEQ2_RMSK= SEQ2_SMSK= SEQ2_FLAG=-primate SEQ2_IN_CONTIGS=1 SEQ2_CHUNK=30000000 SEQ2_LAP=0 BASE=/cluster/bluearc/macaca DEF=$BASE/DEF RAW=$BASE/raw CDBDIR=$BASE SEQ1_LEN=$BASE/S1.len SEQ2_LEN=$BASE/S2.len DEBUG=0 '_EOF_' # << this line makes emacs coloring happy # Save the DEF file in the current standard place cp DEF ~angie/hummus/DEF.hg16-rm0.`date -I` ssh kk cd /cluster/bluearc/macaca # source the DEF file to establish environment for following commands bash source ./DEF cp /cluster/data/mm4/jkStuff/BlastZ_run0.sh . ./BlastZ_run0.sh cd run.1 para try para check para push # Second cluster run to convert the .out's to .lav's cp /cluster/data/mm4/jkStuff/BlastZ_run1.sh . ssh kk cd /cluster/data/pt0/bed/blastz.hg16 bash source DEF ./BlastZ_run1.sh cd run.2 para try para check para push # Prepare third cluster run script to convert lav's to axt's cd /cluster/bluearc/macaca/ cat << '_EOF_' > ../../jkStuff/BlastZ_run2.sh #!/bin/sh # prepare third cluster run for blastz processing # NOTE: should run this on iservers (4G), # with chr19 and chr1 on kolossus (8G) M=`uname -n` if [ "$M" != "kk" ]; then echo "ERROR: you are on machine: '$M'" echo -e "\tthis script expects machine kk" exit 255 fi source DEF mkdir axtChrom mkdir run.2 cd run.2 # usage: blastz-contiglav2axt lav-dir axt-file seq1-dir seq2-file echo '#LOOP' > gsub echo '/cluster/bin/scripts/blastz-contiglav2axt '${BASE}'/lav/$(root1) {check out line+ '${BASE}'/axtChrom/$(root1).axt} '${SEQ1_DIR}' /cluster/bluearc/macaca/split/'${path2} >> gsub echo '#ENDLOOP' >> gsub ls -1S ${BASE}/lav > chrom.list gensub2 chrom.list ../mac.lst gsub jobList wc -l jobList echo "running 'para create'" para create jobList echo "Ready for cluster run. para try, check, push, etc ..." '_EOF_' chmod +x ../../jkStuff/BlastZ_run2.sh # Third cluster run to convert lav's to axt's source DEF ../../jkStuff/BlastZ_run2.sh cd run.2 para try, check, push, etc ... # NOTE: ran this on kolossus and mini-cluster # 30 min. to 2 hrs. 
per chrom # Wrapper script required because of stdout redirect: cd /cluster/bluearc/macaca cat << '_EOF_' > doMultiz #!/bin/csh /cluster/bin/penn/multiz $1 $2 - > $3 '_EOF_' # << this line makes emacs coloring happy chmod a+x doMultiz rm -f jobList foreach file (/cluster/data/pt0/bed/blastz-blatHg16.2003-11-24/maf/*.maf) set root=$file:t:r:r echo "doMultiz /cluster/data/pt0/bed/blastz-blatHg16.2003-11-24/maf/${root}.maf $file /cluster/bluearc/macaca/blastz.hg16/${root}.maf" >> jobList end para create jobList para try, check, push, check # seq table acc field is too small; up the max to match new hgLoadSeq # schema (2004/05/22 markd) alter table seq modify column `acc` varchar(128) NOT NULL default ''; #### Blat knownGene proteins to determine exons (braney 2004-06-02) ssh kk mkdir blat.hg16KG.2004-05-27 rm blat.hg16KG ln -s blat.hg16KG.2004-05-27 blat.hg16KG pepPredToFa hg16 knownGenePep known.fa grep ">" known.fa | sed "s/>//" > kgName.lst kgName hg16 kgName.lst kg.mapNames cat << '_EOF_' > blatSome #!/bin/csh -fe /cluster/bin/i386/blat -t=dnax -q=prot -out=pslx $1 $2 $3 '_EOF_' ls -1S /scratch/hg/gs.17/build34/bothMaskedNibs/*.nib > human.lst mkdir kgfa cd kgfa faSplit sequence ../known.fa 300 kg ls -1S kgfa/*.fa > kg.lst cat << '_EOF_' > blatGsub #LOOP blatSome $(path1) {check in line $(path2)} {check out line psl/$(root1)/$(root2).psl} #ENDLOOP '_EOF_' gensub2 human.lst kg.lst blatGsub blatSpec mkdir psl cd psl foreach i (`cat ../human.lst`) mkdir `basename $i .nib` end para create blatSpec para push # Completed: 12222 of 12222 jobs # CPU time in finished jobs: 23286365s 388106.09m 6468.43h 269.52d 0.738 y # IO & Wait Time: 710342s 11839.03m 197.32h 8.22d 0.023 y # Average job time: 1963s 32.72m 0.55h 0.02d # Longest job: 106239s 1770.65m 29.51h 1.23d # Submission to last job: 106248s 1770.80m 29.51h 1.23d pslSort dirs raw.psl /tmp psl/* pslReps -nohead -minCover=0.9 -minAli=0.9 raw.psl cooked.psl /dev/null pslxToFa uniq.psl uniq_ex.fa -liftTarget=genome.lft -liftQuery=protein.lft # LIFTOVER CHAINS TO HG17 (DONE 2004-07-14 kate) # run alignment # NOTE: split hg17 to /iscratch/i is doc'ed in makeHg17.doc ssh kk cd /cluster/data/hg16 makeLoChain-align hg16 /scratch/hg/gs.17/build34/bothMaskedNibs \ hg17 /iscratch/i/hg17/liftOver/split # Created parasol job in bed/blat.hg17.2004-07-14/run cd bed rm blat.hg17 ln -s blat.hg17.2004-07-14 blat.hg17 cd blat.hg17/run para try para check para push # lift results # the lift directory was defined in makeHg17.doc when split was performed # this expects data in bed/blat.hg17, so symlink must be there # use kolossus for speed ssh kolossus cd /cluster/data/hg16/bed/blat.hg17 makeLoChain-lift hg16 hg17 /cluster/data/hg17/bed/liftOver/liftSplit \ >&! 
lift.log & tail -100f lift.log # 25 minutes # chain alignments ssh kk makeLoChain-chain hg16 /cluster/data/hg16/nib hg17 /cluster/data/hg17/nib # Created parasol job in /cluster/data/hg16/bed/blat.hg17/chainRun cd /cluster/data/hg16/bed/blat.hg17/chainRun para try # 46 jobs para check para push # make alignment net ssh kolossus makeLoChain-net hg16 hg17 # load into database and copy to download directory ssh hgwdev makeLoChain-load hg16 hg17 cp /cluster/data/hg16/bed/blat.hg17/over.chain \ /cluster/data/hg16/bed/liftOver/hg16ToHg17.chain # Finished loading hg16ToHg17.over.chain # Now, add download link for /usr/local/apache/htdocs/goldenPath/hg16/liftOver/hg16ToHg17.over.chain.gz # LIFTOVER CHAIN FROM HG17 TO HG16 (IN PROGRESS 2005-01-03 kate) ssh kolossus cd /cluster/data/hg16/bed/blat.hg17 mkdir net.hg17 cd chain chainMergeSort chainNet stdin /cluster/data/hg16/chrom.sizes \ /cluster/data/hg17/chrom.sizes \ /dev/null ../net.hg17 time chainSwap netChainSubset net.hg17 # ENCODE Regions (kate) # NOTE: these instructions are not yet complete (scripts and datafiles # are currently in ~kate/encode) mkRegionsBed.pl build34_regions.txt > encodeRegionsHg16.bed hgLoadBed hg16 encodeRegions encodeRegionsHg16.bed -noBin mkdir -p /cluster/data/hg16/bed/encodeRegions cp encodeRegionsHg16.bed /cluster/data/hg16/bed/encodeRegions/encodeRegions.bed # Create hgFixed table for name+description hgsql -D hgFixed < ${HOME}/kent/src/hg/lib/encodeRegionInfo.sql sed -e 's/^/INSERT INTO encodeRegionInfo (name, descr) VALUES (\"/' \ -e 's/|/\",\"/' \ -e 's/$/\");/' < regionInfo.txt | hgsql -D hgFixed # create frameset for region display make # create sequence downloads set dir = /usr/local/apache/htdocs/ENCODE/sequences rm sizes.txt foreach b (hg12 hg13 hg15 hg16) encodeSequence.pl regions.$b.txt /cluster/data/$b/nib > $b.fa cp $b.fa $dir faCount $b.fa | awk '{print $1, $2}' > $dir/${b}_count.txt echo $b >> sizes.txt faSize $b.fa >> sizes.txt echo "" >> sizes.txt end cp sizes.txt $dir cd $dir md5sum *.fa > md5sum.txt # QA checkEncodeRegions.pl regions.hg12.txt /cluster/data/hg12/nib > hg12.check cp sizes.txt $dir # etc. csh printRegionDiffs.csh > regionDiffs.out # UN-ANNOTATED (EXCEPT FOR CROSS-SPECIES) REGIONS (DONE 6/8/04 angie) # Anton Nekrutenko asked for this... easy to do with featureBits! # NOTE: excluding mRNAs this time because of the controversial # just-submitted-to-GenBank intronic BV* "mRNA" seqs. ssh hgwdev mkdir /cluster/data/hg16/bed/unAnnotated cd /cluster/data/hg16/bed/unAnnotated nice featureBits hg16 -minSize=12 \ \!gap \ \!knownGene \!refGene \!mgcGenes \ \!vegaGene \!vegaPseudoGene \!ensGene \!acembly \!ECgene \ \!geneid \!genscan \!twinscan \!slamMouse \!sgpGene \!softberryGene \ \!rnaGene \!superfamily \ \!est \!xenoMrna \!HInvGene \!tigrGeneIndex \ \!uniGene_2 \ \!cpgIsland \!rmsk \!simpleRepeat \ -bed=unAnnotated.bed #905732944 bases of 2865248791 (31.611%) in intersection hgLoadBed hg16 unAnnotated unAnnotated.bed # not much of a drop in coverage with the -minSize: nice featureBits hg16 unAnnotated #903585585 bases of 2865248791 (31.536%) in intersection # ANDY LAW CPGISLANDS (DONE 6/15/04 angie) # See notes about this in makeGalGal2.doc. 
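# (Aside, not part of the original build: once cpgIslandGgfAndy.bed is
# created below, the Takai/Jones thresholds quoted later in this section
# (length > 500bp, GC >= 55%, obs/exp CpG >= 0.65) can be roughly applied
# with awk.  With whitespace splitting, the "CpG: <n>" name occupies two
# fields, so %GC lands in field 10 and obs/exp in field 11:
#   awk '($3 - $2) > 500 && $10 >= 55 && $11 >= 0.65' cpgIslandGgfAndy.bed \
#     > takaiJones.bed
# takaiJones.bed is just a hypothetical output name.)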
ssh eieio mkdir /cluster/data/hg16/bed/cpgIslandGgfAndy cd /cluster/data/hg16/bed/cpgIslandGgfAndy cp /dev/null cpgIslandAndy.bed cp /dev/null cpgIslandGgfAndy.bed foreach f (../../?{,?}/chr*.fa) set chr = $f:t:r echo preproc $chr /cluster/home/angie/bin/$MACHTYPE/preProcGgfAndy $f > $chr.preproc echo running original on $chr awk '{print $1 "\t" $2 "\t" ($3 + $4) "\t" $5;}' $chr.preproc \ | /cluster/home/angie/andy-cpg-island.pl \ | perl -wpe '$i=0 if (not defined $i); \ chomp; ($s,$e) = split("\t"); $s--; \ $_ = "'$chr'\t$s\t$e\tcpg$i\n"; $i++' \ >> cpgIslandAndy.bed echo running modified on $chr /cluster/home/angie/ggf-andy-cpg-island.pl $chr.preproc \ | perl -wpe 'chomp; ($s,$e,$cpg,$n,$c,$g,$oE) = split("\t"); $s--; \ $gc = $c + $g; $pCpG = (100.0 * 2 * $cpg / $n); \ $pGc = (100.0 * $gc / $n); \ $_ = "'$chr'\t$s\t$e\tCpG: $cpg\t$n\t$cpg\t$gc\t" . \ "$pCpG\t$pGc\t$oE\n";' \ >> cpgIslandGgfAndy.bed end # load into database: ssh hgwdev cd /cluster/data/hg16/bed/cpgIslandGgfAndy # this one is a bed 4: hgLoadBed hg16 cpgIAndy -tab -noBin cpgIslandAndy.bed # this one is a cpgIslandExt but with a different table name: sed -e 's/cpgIslandExt/cpgIslandGgfAndy/g' \ $HOME/kent/src/hg/lib/cpgIslandExt.sql > cpgIslandGgfAndy.sql hgLoadBed hg16 cpgIslandGgfAndy -tab -noBin \ -sqlTable=cpgIslandGgfAndy.sql cpgIslandGgfAndy.bed # WOW, even masking out repeat bases from the results, there's a huge # increase in reported islands!! featureBits hg16 cpgIsland #21077002 bases of 2865248791 (0.736%) in intersection featureBits hg16 cpgIslandGgfAndy #135249416 bases of 2865248791 (4.720%) in intersection featureBits hg16 cpgIslandGgfAndy \!rmsk #68714633 bases of 2865248791 (2.398%) in intersection wc -l ../cpgIsland/cpgIsland.bed *bed # 27596 ../cpgIsland/cpgIsland.bed # 376478 cpgIslandAndy.bed # 260761 cpgIslandGgfAndy.bed # http://www.pnas.org/cgi/content/full/99/6/3740 # Takai D Jones PA # Comprehensive analysis of CpG islands in human chromosomes 21 and 22 # # Regions of DNA of greater than 500 bp with a G+C equal to or # greater than 55% and observed CpG/expected CpG of 0.65 were more # likely to be associated with the 5' regions of genes and this # definition excluded most Alu-repetitive elements. # # Also, our description reduced the number of CpG islands located # on these chromosomes from 14,062 to 1,101, which is more # consistent with the expected number of genes (750) located on # these two chromosomes. # # To exclude "mathematical CpG islands" (for example, a 300-bp # sequence containing one G, 150 Cs, and only one CpG, which would # meet the criteria of a CpG island), we added one more condition: # that there are at least seven CpGs in these 200 bp. This number # was selected on the basis that there would be 200/16 (i.e., # 12.5) CpGs in a random DNA fragment containing no suppression of # CpG. Because Gardiner-Garden and Frommer's criterion (1) of # ObsCpG/ExpCpG of 0.6 would accommodate (0.6 × 12.5) CpGs (i.e., # 7.5), we selected seven CpGs as being a reasonable cutoff for # the initial analysis. # egrep -w '^chr2[12]' ../cpgIsland/cpgIsland.bed | wc -l # 1033 egrep -w '^chr2[12]' cpgIslandAndy.bed | wc -l # 16462 # Hmm, how did I find fewer with looser params?? Better run Takai and # Jones's script on chr21 and chr22 for comparison... 
egrep -w '^chr2[12]' cpgIslandGgfAndy.bed |wc -l # 10680 # OK, I just have to try again with masked sequence: ssh eieio cd /cluster/data/hg16/bed/cpgIslandGgfAndy cp /dev/null cpgIslandMaskedAndy.bed cp /dev/null cpgIslandMaskedGgfAndy.bed foreach f (../../?{,?}/chr*.fa.masked.gz) set chr = $f:t:r:r:r echo preproc $chr zcat $f \ | /cluster/home/angie/bin/$MACHTYPE/preProcGgfAndy stdin \ > $chr.masked.preproc echo running original on $chr awk '{print $1 "\t" $2 "\t" ($3 + $4) "\t" $5;}' $chr.masked.preproc \ | /cluster/home/angie/andy-cpg-island.pl \ | perl -wpe '$i=0 if (not defined $i); \ chomp; ($s,$e) = split("\t"); $s--; \ $_ = "'$chr'\t$s\t$e\tcpg$i\n"; $i++' \ >> cpgIslandMaskedAndy.bed echo running modified on $chr /cluster/home/angie/ggf-andy-cpg-island.pl $chr.masked.preproc \ | perl -wpe 'chomp; ($s,$e,$cpg,$n,$c,$g,$oE) = split("\t"); $s--; \ $gc = $c + $g; $pCpG = (100.0 * 2 * $cpg / $n); \ $pGc = (100.0 * $gc / $n); \ $_ = "'$chr'\t$s\t$e\tCpG: $cpg\t$n\t$cpg\t$gc\t" . \ "$pCpG\t$pGc\t$oE\n";' \ >> cpgIslandMaskedGgfAndy.bed end ssh hgwdev cd /cluster/data/hg16/bed/cpgIslandGgfAndy hgLoadBed hg16 cpgIAndyMasked -tab -noBin cpgIslandMaskedAndy.bed # this one is a cpgIslandExt but with a different table name: sed -e 's/cpgIslandExt/cpgIslandGgfAndyMasked/g' \ $HOME/kent/src/hg/lib/cpgIslandExt.sql > cpgIslandMaskedGgfAndy.sql hgLoadBed hg16 cpgIslandGgfAndyMasked -tab -noBin \ -sqlTable=cpgIslandMaskedGgfAndy.sql cpgIslandMaskedGgfAndy.bed featureBits hg16 cpgIAndyMasked #93307698 bases of 2865248791 (3.257%) in intersection featureBits hg16 cpgIslandGgfAndyMasked #56180461 bases of 2865248791 (1.961%) in intersection wc -l *ed # 376478 cpgIslandAndy.bed # 260761 cpgIslandGgfAndy.bed # 125851 cpgIslandMaskedAndy.bed # 80350 cpgIslandMaskedGgfAndy.bed # 6/28/04 -- masking simpleRepeats, and even repeats other than Alu's, # might not be the right thing to do (?). Give it a try with less-masked # sequence. ssh eieio cd /cluster/data/hg16/bed/cpgIslandGgfAndy cp /dev/null cpgIslandGgfAndyOnlyRM.bed cp /dev/null cpgIslandGgfAndyOnlyRMAlu.bed foreach f (../../?{,?}/chr*.fa) set chr = $f:t:r echo preproc, ggf-andy $chr onlyRM zcat $f.out.gz > /tmp/tmp.fa.out maskOutFa $f /tmp/tmp.fa.out stdout \ | /cluster/home/angie/bin/$MACHTYPE/preProcGgfAndy stdin \ | /cluster/home/angie/ggf-andy-cpg-island.pl \ | perl -wpe 'chomp; ($s,$e,$cpg,$n,$c,$g,$oE) = split("\t"); $s--; \ $gc = $c + $g; $pCpG = (100.0 * 2 * $cpg / $n); \ $pGc = (100.0 * $gc / $n); \ $_ = "'$chr'\t$s\t$e\tCpG: $cpg\t$n\t$cpg\t$gc\t" . \ "$pCpG\t$pGc\t$oE\n";' \ >> cpgIslandGgfAndyOnlyRM.bed echo preproc, ggf-andy $chr onlyRMAlu head -3 /tmp/tmp.fa.out > /tmp/tmp2.fa.out awk '$11 == "SINE/Alu" {print;}' /tmp/tmp.fa.out >> /tmp/tmp2.fa.out maskOutFa $f /tmp/tmp2.fa.out stdout \ | /cluster/home/angie/bin/$MACHTYPE/preProcGgfAndy stdin \ | /cluster/home/angie/ggf-andy-cpg-island.pl \ | perl -wpe 'chomp; ($s,$e,$cpg,$n,$c,$g,$oE) = split("\t"); $s--; \ $gc = $c + $g; $pCpG = (100.0 * 2 * $cpg / $n); \ $pGc = (100.0 * $gc / $n); \ $_ = "'$chr'\t$s\t$e\tCpG: $cpg\t$n\t$cpg\t$gc\t" . 
\ "$pCpG\t$pGc\t$oE\n";' \ >> cpgIslandGgfAndyOnlyRMAlu.bed end # 80314 cpgIslandGgfAndyOnlyRM.bed # 110598 cpgIslandGgfAndyOnlyRMAlu.bed ssh hgwdev cd /cluster/data/hg16/bed/cpgIslandGgfAndy sed -e 's/cpgIslandExt/cpgIslandGgfAndyOnlyRM/g' \ $HOME/kent/src/hg/lib/cpgIslandExt.sql > /tmp/c.sql hgLoadBed hg16 cpgIslandGgfAndyOnlyRM -tab -noBin -sqlTable=/tmp/c.sql \ cpgIslandGgfAndyOnlyRM.bed sed -e 's/cpgIslandExt/cpgIslandGgfAndyOnlyRMAlu/g' \ $HOME/kent/src/hg/lib/cpgIslandExt.sql > /tmp/c.sql hgLoadBed hg16 cpgIslandGgfAndyOnlyRMAlu -tab -noBin -sqlTable=/tmp/c.sql \ cpgIslandGgfAndyOnlyRMAlu.bed featureBits hg16 cpgIslandGgfAndyOnlyRM #56275308 bases of 2865248791 (1.964%) in intersection featureBits hg16 cpgIslandGgfAndyOnlyRMAlu #78743130 bases of 2865248791 (2.748%) in intersection #### mrnaBlastz track - all mrnas aligned using blastz Robert 2/20/2004 mkdir /cluster/data/hg16/bed/mrnaBlastz cd /cluster/data/hg16/bed/mrnaBlastz /cluster/data/genbank/bin/i386/gbGetSeqs -gbRoot=/cluster/data/genbank genbank mrna mrna.fa -db=hg16 -native faTrimPolyA mrna.fa hg16Mrna.fa faSize hg16Mrna.fa -detailed=on > S2.len mkdir /cluster/bluearc/hg/mrnaHg16 faSplit sequence hg16Mrna.fa 100 /cluster/bluearc/hg/mrnaHg16/mrna ls -1 /cluster/bluearc/scratch/hg/mrnaHg16/ > mrna.lst hgsql hg16 < chromInfo.sql > S1.len awk '{print $1}' S1.len |grep -v random > S1.lst cd /cluster/bluearc/hg/gs.17/build34/mrnaBlastz make-joblist para create spec para push ~angie/hummus/do.out2lav DEF > j para create j para push #!/bin/tcsh set base="/cluster/bluearc/hg/gs.17/build34/mrnaBlastz" cd $base mkdir -p pslRaw foreach c (lav/*) pushd $c set chr=$c:t set out=$base/pslRaw/$chr.psl echo "Translating $chr lav to $out" cat `ls -1 *.lav | sort -g` \ | lavToPsl stdin stdout \ | sed -e 's@scratch/hg/gs.17/build34/bothMaskedNibs//@@' | sed -e 's/\.nib:[0-9]*-[0-9]*//' > $out popd end for i in `ls pslRaw/` ; do echo sortIt.sh pslRaw/$i pslSort/$i >> spec.sort ; done para create spec.sort - sorts pslRaw to pslSort for i in `awk '{print $1}' S1.len` ; do echo pslFilterDups pslSort/$i.psl pslFilter/$i.psl >> spec.dup ; done para create spec.dup - filters pslSort to pslFilter using pslFilterDups for i in `awk '{print $1}' S1.len` ; do echo axtChain -linearGap=linearGap.txt -psl pslFilter/$i.psl /scratch/hg/gs.17/build34/bothMaskedNibs/ -faQ /cluster/data/hg16/bed/mrnaBlastz/hg16Mrna.fa chain/$i.chain >> spec.chain ; done para create spec.chain - chains pslFilter to chain mkdir chainFilter for i in `awk '{print $1}' S1.len` ; do echo doFilter ../chain/$i.chain ../chainFilter/$i.chain >> spec.filter ; done spec.filter - filters chain to chainFilter using doFilter mkdir -p preNet cd chainFilter foreach i ( *.chain) chainPreNet $i ../S1.len ../S2.len ../preNet/$i end ls /cluster/data/hg16/nib/*.nib > S1.lst for i in `awk '{print $1}' S1.len`; do chainToPsl ../preNet/$i.chain ../S1.len ../S2.len ../S1.lst /cluster/data/hg16/bed/mrnaBlastz/hg16Mrna.fa ../psl/$i.psl >> spec.chain2psl.new ; echo $i done chainToPsl ; done ssh kk9-10 para create spec.chain2psl.new for i in `awk '{print $1}' S1.len`; do hgLoadPsl -noTNameIx hg16 -table=${i}_mrnaBlastz psl/$i.psl ; echo $i done ; done ## end of blastz Mrna track #### BUILD RETROGENE TRACK ( done Robert 6/15/2004) cp /cluster/data/genbank/data/aligned/genbank.137.0/hg16/full/mrna.native.rawPsl.gz . 
gunzip mrna.native.rawPsl.gz awk '{OFS="\t";print $1,$2,$3,$4,$5,$6,$7,$8,$9,substr($10,1,index($10,".")-1),$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23}' mrna.native.rawPsl > mrnaBlat.psl hgLoadPsl hg16 mrnaBlat.psl hgsql hg16 -N -B < refGene.sql > refGene.tab cd /cluster/bluearc/hg/gs.17/build34/mrnaBlastz/ netToBed /cluster/data/hg16/bed/blastz.mm3/axtChain/mouseSynNet.net mouseSyn.bed ssh eieio pslCat -nohead -check all_mrna.psl /cluster/bluearc/hg/gs.17/build34/mrnaBlastz/psl/*.psl |awk '{print $0, $1*3-$2}' | sort -k 10,10 -k 22nr -T /tmp | awk '{OFS="\t"; print $1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21}' > blatBlastz.psl awk '{OFS="\t"; print $1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21}' /scratch/blatBlastz.psl > /scratch/x.psl hgsql hg16 < mrna.sql | grep -v matches | awk '{OFS="\t"; print $2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22}' > all_mrna.psl tawk '$12 > 1 && $12 < 9999999{x=$11;$11=$12;$12=x;print $0}' /cluster/data/kgDB/bed/hg16/kgBestMrna/sortedKnownGene.tab > sortedKnownGene.tab ssh kkr1u00 cd /cluster/data/hg16/bed/pseudo cp refGene.tab /iscratch/i/hg/gs.17/build34/pseudo cp /cluster/data/hg16/bed/simpleRepeat.bed /iscratch/i/hg/gs.17/build34/pseudo cp mrnaHg16.fa /iscratch/i/hg/gs.17/build34/pseudo cp mouseSyn.bed /iscratch/i/hg/gs.17/build34/pseudo cp sortedKnownGene.tab /iscratch/i/hg/gs.17/build34/pseudo pslSplit nohead -chunkSize=121 /iscratch/i/hg/gs.17/build34/pseudo blatBlastz.psl cd /iscratch/i/hg/gs.17/build34/pseudo iSync ssh kk cd /cluster/data/hg16/bed/pseudo para create spec.kk para push #post process and load track ./buildSort.sh ### PHASTCONS HUMAN/CHIMP/MOUSE/RAT/CHICKEN (6/20/04, acs) # this is an addendum to Katie's '5-WAY MULTIZ & PHYLO-HMM' (see above) # just redoing the 'label' step with the new 'phastCons' program # picking up where it says "compute the conservation scores" ssh hgwdev cd /cluster/data/hg16/bed/multiz.hg16mm3rn3panTro1galGal2/phyloHMM # set up wrapper for phastCons cat << '_EOF_' > doPhastCons #!/bin/sh PHAST=/cluster/bin/phast TMP=/tmp/phastCons file=$1 root=`basename $file .ss.gz` chrom=`echo $root | awk -F\. '{print $1}'` mkdir -p $TMP PREDICTIONS/$chrom PHASTCONS/$chrom zcat $file | $PHAST/phastCons - hpmrc_rev_dg.mod --nrates 20 --transitions 0.018,0.002 --viterbi PREDICTIONS/$chrom/$root.bed --score --seqname $chrom --quiet > ${TMP}/$root.pp gzip -c $TMP/$root.pp > PHASTCONS/$chrom/$root.pp.gz rm $TMP/$root.pp '_EOF_' chmod u+x doPhastCons # the --transitions arguments are approximate maximum likelihood # estimates obtained by running the program *without* --transitions # (causes estimation by EM) on five randomly selected 1M bp # windows. All estimates were in the same ballpark (took a rough average) # set up cluster job ssh eieio cd /cluster/data/hg16/bed/multiz.hg16mm3rn3panTro1galGal2/phyloHMM cp WINDOWS/*.ss.gz /cluster/bluearc/hg16/bed/hg16mm3rn3panTro1galGal2-SS/ logout rm -f jobs.lst for file in /cluster/bluearc/hg16/bed/hg16mm3rn3panTro1galGal2-SS/*.ss.gz ; do echo doPhastCons $file >> jobs.lst ; done ssh kk cd /cluster/data/hg16/bed/multiz.hg16mm3rn3panTro1galGal2/phyloHMM para create ; para try ; para push ... etc. # now create tracks mkdir -p PHASTCONS/wib for dir in PHASTCONS/chr* ; do \ echo $dir ;\ chr=`basename $dir` ;\ zcat `ls $dir/*.pp.gz | sort -t\. 
-k2,2n` | \ wigAsciiToBinary -chrom=$chr \ -wibFile=PHASTCONS/wib/${chr}_phastCons stdin ;\ done hgLoadWiggle hg16 phastCons PHASTCONS/wib/chr*_phastCons.wig mkdir -p /gbdb/hg16/wib rm -f /gbdb/hg16/wib/chr*phastCons.wib ln -s `pwd`/PHASTCONS/wib/*.wib /gbdb/hg16/wib chmod 775 . PHASTCONS PHASTCONS/wib chmod 664 PHASTCONS/wib/*.wib # tweak scores and names of predictions cat PREDICTIONS/*/*.bed | sed 's/id //' | \ awk '{printf "%s\t%s\t%s\tlod=%d\t%d\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n", \ $1, $2, $3, $5, 147.49 * log($5) - 240.34, $6, $7, $8, $9, \ $10, $11, $12}' > all.bed hgLoadBed hg16 phastConsElements all.bed # Scores are transformed as follows, for a reasonable-looking # "spectrum". Let x_max be the maximum score (here # x_max = 4490) and let x_med be the median score (here x_med = # 39). The scores are transformed via the function f(x) = a * # log x + b, s.t. f(x_med) = 300 and f(x_max) = 1000. Solving # for a and b, you get b = (300 log x_max - 1000 log x_med) / # (log x_max - log x_med), a = (1000 - b) / log x_max. Here a = # 147.49, b = -240.34 #track phastCons #shortLabel phastCons #longLabel phastCons Conservation Score, Human/Chimp/Mouse/Rat/Chicken #group compGeno #priority 103 #visibility hide #color 0,10,100 #maxHeightPixels 40 #type wig 0.0 1.0 #autoScale off #track phastConsElements #shortLabel phastConsElements #longLabel phastCons Conserved Elements, Human/Chimp/Mouse/Rat/Chicken #group compGeno #priority 104 #visibility hide #spectrum on #color 0,60,120 #altColor 200,220,255 #exonArrows off #type bed 12 . # Ensembl 34d GENE PREDICTIONS (2004-07-13 baertsch) ## reloaded ensGene to add frame info, no change to data /cluster/bin/i386/ldHgGene -gtf -genePredExt hg16 ensGene \ /cluster/data/hg16/bed/ensembl34d/ensGene.gtf # TWINSCAN 1.3 GENE PREDICTIONS (2004-07-13 baertsch) ## reloaded twinscan to add frame info, no change to data ldHgGene hg16 twinscan chr_gtf/chr*.gtf -gtf -genePredExt #### AFFYTRANSFRAG AND AFFYTRANSCRIPTION TRACKS - (2004-07-21 sugnet) # tracks covering about 1/3 of genome with probes # every 5bp and hybridized to RNA from SK-N-AS cell line. # Lifted from genome version hg15. # affyTransfrag track: lift transfrags to hg16 cd /cluster/store6/weber/affy/transfrags/transfragsLabeled/ mkdir hg16 cd hg16 liftOver ../SK_phase2_tfgs_final.biggerThan50bp.tab /cluster/store4/gs.17/build34/bed/bedOver/33to34.chain \ SK_phase2_tfgs_final.hg16.bed SK_phase2_tfgs_final.err.bed # check to make sure that most lifted... wc *.bed # 12 49 346 SK_phase2_tfgs_final.err.bed # 170749 853745 6936780 SK_phase2_tfgs_final.hg16.bed # 170761 853794 6937126 total hgLoadBed hg16 affyTransfrags SK_phase2_tfgs_final.hg16.bed # Reading SK_phase2_tfgs_final.hg16.bed # Loaded 170749 elements of size 5 # Sorted # Creating table definition for # Saving bed.tab # Loading hg16 # affyTranscription track: cd /cluster/store6/weber/affy/graph/hg15/gz gunzip *.gz mkdir hg16 cd hg16 ln -s ../*.signal ./ # remapGraphs.pl just makes a quick bed file for each signal file with 1bp spans # and then lifts via liftOver to new genome. remapGraphs.pl -liftChain /cluster/store4/gs.17/build34/bed/bedOver/33to34.chain \ -oldGenome hg15 -newGenome hg16 *.signal # Lifting chr13.hg16.signal. # Lifting chr13.sk.signal. # Lifting chr14.sk.signal. # Lifting chr19.sk.signal. # Lifting chr20.sk.signal. # Lifting chr21.sk.signal. # Lifting chr22.hg16.signal. # Lifting chr22.sk.signal. # Lifting chr6.sk.signal. # Lifting chr7.sk.signal. # Lifting chrX.sk.signal. # Lifting chrY.sk.signal. 
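# (Sketch of what remapGraphs.pl is doing, not the actual script: each
# signal file becomes 1bp-span bed items carrying the probe value, gets
# run through liftOver, and is written back out as wiggle ascii.
# Assuming a two-column "position value" signal format -- an assumption,
# the real format may differ -- the lift step per file is roughly:
#   awk -v c=chrN '{printf "%s\t%d\t%d\t%s\n", c, $1-1, $1, $2}' \
#     chrN.sk.signal > chrN.hg15.bed
#   liftOver chrN.hg15.bed \
#     /cluster/store4/gs.17/build34/bed/bedOver/33to34.chain \
#     chrN.hg16.bed chrN.unmapped
# with the chrN.* names being hypothetical.)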
# runWiggles.sh just calls wigAsciiToBinary for each signal file. cat ../runWiggles.sh | sed -e 's/hg15/hg16/g' | sed -e 's/sk/hg16/g' > runWiggles.sh ./runWiggles.sh hgLoadWiggle -pathPrefix=/gbdb/hg16/wib/affyTranscription hg16 affyTranscription *.wig # Connected to database hg16 for track affyTranscription # Creating table definition with 13 columns in hg16.affyTranscription # Saving wiggle.tab # Loading hg16 cp *.wib /cluster/data/hg16/bed/affyTranscription/wib/ cd /gbdb/hg16/wib/affyTranscription/ ln -s /cluster/data/hg16/bed/affyTranscription/wib/*.wib ./ cd /cluster/data/hg16/bed/affyTranscription/wib chmod 664 *.wib cd /cluster/store6/weber/affy/graph/hg15/gz/hg16 rm *.wib *.wig *.bed gzip *hg16.signal & # EXTRACT LINEAGE-SPECIFIC REPEATS FOR DOG (DONE 2004/08/11 markd) cd /cluster/bluearc/scratch/hg/gs.17/build34/rmsk # Run Arian's DateRepsinRMoutput.pl to add extra columns telling # whether repeats in -query are also expected in -comp species. # Even though we already have the human-mouse linSpecReps, # extractLinSpecReps requires two columns of DateRepsinRMoutput.pl # additions. So add mouse, then ignore it. # Dog in extra column 1, Mouse in extra column 2 foreach outfl ( *.out ) echo "$outfl" /cluster/bluearc/RepeatMasker/DateRepsinRMoutput.pl \ ${outfl} -query human -comp dog -comp mouse end # Now extract dog (extra column 1), ignore mouse. cd /cluster/bluearc/scratch/hg/gs.17/build34 mkdir linSpecRep.notInDog foreach f (rmsk/*.out_dog_mus) set base = $f:t:r:r echo $base.out.spec /cluster/bin/scripts/extractLinSpecReps 1 $f > \ linSpecRep.notInDog/$base.out.spec end # Clean up. rm /cluster/bluearc/scratch/hg/gs.17/build34/rmsk/*.out_dog_mus # copy to iservers ssh kkr1u00 cp -r /cluster/bluearc/scratch/hg/gs.17/build34/linSpecRep.notInDog /iserver/kkr1u00/i/gs.17/build34/ iSync # BLASTZ DOG (CANFAM1) (DONE 2004/08/12 markd) ssh kk # store4 low on disk space; symlink to store7 mkdir -p /cluster/store7/hg16/bed/blastz.canFam1.2004-08-10 ln -s /cluster/store7/hg16/bed/blastz.canFam1.2004-08-10 /cluster/data/hg16/bed cd /cluster/data/hg16/bed/blastz.canFam1.2004-08-10 # Use default (Human-Mouse) settings for starters. cat << '_EOF_' > DEF # human vs. dog export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/i386:/cluster/home/angie/schwartzbin ALIGN=blastz-run BLASTZ=blastz # Default BLASTZ_H=2000 BLASTZ_ABRIDGE_REPEATS=1 # TARGET: Human SEQ1_DIR=/scratch/hg/gs.17/build34/bothMaskedNibs SEQ1_RMSK= SEQ1_FLAG= SEQ1_SMSK=/iscratch/i/gs.17/build34/linSpecRep.notInDog SEQ1_IN_CONTIGS=0 SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY: Dog SEQ2_DIR=/scratch/hg/canFam1/nib SEQ2_RMSK= SEQ2_FLAG= SEQ2_SMSK=/scratch/hg/canFam1/linSpecRep.notInHuman SEQ2_IN_CONTIGS=0 SEQ2_CHUNK=10000000 SEQ2_LAP=0 BASE=/cluster/data/hg16/bed/blastz.canFam1.2004-08-10 DEF=$BASE/DEF RAW=$BASE/raw CDBDIR=$BASE SEQ1_LEN=$BASE/S1.len SEQ2_LEN=$BASE/S2.len '_EOF_' # << this line keeps emacs coloring happy # first cluster run: raw blastz alignments bash # if a csh/tcsh user cd /cluster/data/hg16/bed/blastz.canFam1.2004-08-10 source DEF mkdir -p $RAW run.0 /cluster/home/angie/hummus/make-joblist $DEF > $BASE/run.0/j 2>log sh ./xdir.sh cd run.0 sed -e 's@^blastz-run@/cluster/bin/penn/blastz-run@' j > jobList # edit jobList to do chr19 first; hg17 run notes indicated # this might save around 4 hours para create jobList para try, check, push, check, .... 
#Completed: 93225 of 93225 jobs #CPU time in finished jobs: 18459718s 307661.97m 5127.70h 213.65d #IO & Wait Time: 429193s 7153.21m 119.22h 4.97d #Average job time: 203s 3.38m 0.06h 0.00d #Longest job: 18951s 315.85m 5.26h 0.22d #Submission to last job: 58889s 981.48m 16.36h 0.68d # second cluster run: lift raw alignments -> lav dir ssh kki bash # if a csh/tcsh user cd /cluster/data/hg16/bed/blastz.canFam1.2004-08-10 source DEF mkdir run.1 lav /cluster/bin/scripts/blastz-make-out2lav $DEF $BASE > $BASE/run.1/jobList cd run.1 wc -l jobList para create jobList para try, check, push, etc ... #Completed: 339 of 339 jobs #CPU time in finished jobs: 3771s 62.85m 1.05h 0.04d 0.000 y #IO & Wait Time: 6671s 111.18m 1.85h 0.08d 0.000 y #Average job time: 31s 0.51m 0.01h 0.00d #Longest job: 334s 5.57m 0.09h 0.00d #Submission to last job: 1464s 24.40m 0.41h 0.02d # third run: lav -> axt ssh kki cd /cluster/data/hg16/bed/blastz.canFam1.2004-08-10 mkdir axtChrom pslChrom run.2 cd run.2 cat << '_EOF_' > do.csh #!/bin/csh -ef cd $1 set chr = $1:t cat `ls -1 *.lav | sort -g` \ | /cluster/bin/x86_64/lavToAxt stdin \ /iscratch/i/gs.17/build34/bothMaskedNibs /iscratch/i/canFam1/nib stdout \ | /cluster/bin/x86_64/axtSort stdin ../../axtChrom/$chr.axt /cluster/bin/x86_64/axtToPsl ../../axtChrom/$chr.axt ../../S1.len ../../S2.len \ ../../pslChrom/$chr.psl '_EOF_' # << this line keeps emacs coloring happy chmod a+x do.csh cp /dev/null jobList foreach d (../lav/chr*) echo "do.csh $d" >> jobList end para create jobList para try, check, push, check #Completed: 42 of 42 jobs #CPU time in finished jobs: 1297s 21.62m 0.36h 0.02d 0.000 y #IO & Wait Time: 15428s 257.13m 4.29h 0.18d 0.000 y #Average job time: 398s 6.64m 0.11h 0.00d #Longest job: 1714s 28.57m 0.48h 0.02d #Submission to last job: 1723s 28.72m 0.48h 0.02d # axtChrom/chr19_random.axt is empty, probably ok # CHAIN DOG BLASTZ (DONE) # Run axtChain on little cluster ssh kki cd /cluster/data/hg16/bed/blastz.canFam1.2004-08-10 mkdir -p axtChain/run1 cd axtChain/run1 mkdir out chain ls -1S /cluster/data/hg16/bed/blastz.canFam1.2004-08-10/axtChrom/*.axt \ > input.lst cat << '_EOF_' > gsub #LOOP doChain {check in line+ $(path1)} {check out line+ chain/$(root1).chain} {check out exists out/$(root1).out} #ENDLOOP '_EOF_' # << this line makes emacs coloring happy cat << '_EOF_' > doChain #!/bin/csh axtChain $1 \ /iscratch/i/gs.17/build34/bothMaskedNibs \ /iscratch/i/canFam1/nib $2 > $3 '_EOF_' # << this line makes emacs coloring happy chmod a+x doChain gensub2 input.lst single gsub jobList # edit to remove chr19_random para create jobList para try, check, push, check... #Completed: 41 of 41 jobs #CPU time in finished jobs: 8233s 137.22m 2.29h 0.10d 0.000 y #IO & Wait Time: 11718s 195.29m 3.25h 0.14d 0.000 y #Average job time: 487s 8.11m 0.14h 0.01d #Longest job: 4623s 77.05m 1.28h 0.05d #Submission to last job: 4971s 82.85m 1.38h 0.06d # now on the cluster server, sort chains ssh kksilo cd /cluster/data/hg16/bed/blastz.canFam1.2004-08-10/axtChain chainMergeSort run1/chain/*.chain > all.chain chainSplit chain all.chain rm run1/chain/*.chain # hg17 said: # Lots of chaff with scores in the 3000's. Many very-high-scoring # chains. So filter the chain down somewhat... # didn't bother rechecking, just filtered. 
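# (Optional, not part of the original run: to eyeball the chain score
# distribution before settling on a -minScore cutoff, pull the score
# field from the chain headers, which look like "chain <score> <tName> ...":
awk '/^chain/ {print $2}' all.chain | sort -n \
    | awk '{a[NR]=$1} END {print "n="NR, "min="a[1], "median="a[int((NR+1)/2)], "max="a[NR]}'
# )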
mv all.chain all.chain.unfiltered chainFilter -minScore=5000 all.chain.unfiltered > all.chain rm chain/* chainSplit chain all.chain gzip all.chain.unfiltered # Load chains into database ssh hgwdev cd /cluster/data/hg16/bed/blastz.canFam1.2004-08-10/axtChain/chain foreach i (*.chain) set c = $i:r hgLoadChain hg16 ${c}_chainCanFam1 $i end # Coverage is significantly higher than for mouse: featureBits hg16 -chrom=chr1 chainCanFam1Link # 123343602 bases of 221562941 (55.670%) in intersection # NET DOG BLASTZ (DONE 2004/08/15) ssh kolossus cd /cluster/data/hg16/bed/blastz.canFam1.2004-08-10/axtChain chainPreNet all.chain ../S1.len ../S2.len stdout \ | chainNet stdin -minSpace=1 ../S1.len ../S2.len stdout /dev/null \ | netSyntenic stdin noClass.net # Add classification info using db tables: ssh hgwdev cd /cluster/data/hg16/bed/blastz.canFam1.2004-08-10/axtChain netClass noClass.net hg16 canFam1 dog.net # Make a 'syntenic' subset with netFilter: ssh kksilo cd /cluster/data/hg16/bed/blastz.canFam1.2004-08-10/axtChain rm noClass.net netFilter -syn dog.net > dogSyn.net # Load the nets into database ssh hgwdev cd /cluster/data/hg16/bed/blastz.canFam1.2004-08-10/axtChain netFilter -minGap=10 dog.net | hgLoadNet hg16 netCanFam1 stdin netFilter -minGap=10 dogSyn.net | hgLoadNet hg16 syntenyNetCanFam1 stdin # Add entries for chainCanFam1, netCanFam1 to human/hg16 trackDb # LIFTOVER CHAIN TO DOG CANFAM1 (DONE 2004-09-16 kate) ssh kolossus cd /cluster/data/hg16/bed/blastz.canFam1/axtChain time netChainSubset dog.net all.chain \ /cluster/data/hg16/bed/liftOver/hg16ToCanFam1.chain # LOAD ENSEMBL ESTS (DONE 2004-09-07 braney) cd /cluster/data/hg16/bed mkdir ensEst cd ensEst # Get the ensembl EST data from http://www.ensembl.org/ # Go to the Martview link # Choose Homo sapiens as the organism # Follow this sequence through the pages: # Page 1) Choose the Ensembl ESTs choice. Hit next. # Page 2) Uncheck the "Limit to" box in the region choice. Then hit next. # Page 3) Choose the "Structures" box. # Page 4) Choose GTF as the output, choose gzip compression and then hit Export. # Name file ensEst.gff.gz # Ensembl handles random chromosomes differently than us. They give the # contig name. We can lift these up to our chrN_random chromosomes gunzip ensEst.gff.gz sed "/^[0-9XY]*\t/d" ensEst.gff | sed "s/^.*_NT/NT/" > random.gff liftUp -type=".gff" liftRandom.gff /cluster/data/hg16/jkStuff/liftAll.lft warn random.gff sed "/_NT_/d" ensEst.gff | sed "s/^/chr/" > unrandom.gff cat liftRandom.gff unrandom.gff > fixed.gff ldHgGene hg16 ensESTGene fixed.gff # Get the ensembl protein data from http://www.ensembl.org/ # Go to the Martview link # Choose Homo sapiens as the organism # Follow this sequence through the pages: # Page 1) Choose the Ensembl ESTs choice. Hit next. # Page 2) Uncheck the "Limit to" box in the region choice. Then hit next. # Page 3) Choose the "Sequences" box. # Page 4) Choose Transcripts/Proteins and Gene sequence Only as the output, # choose text/fasta and gzip compression and then hit export. Name it ensEstPep.fasta.gz gunzip ensEstPep.fasta.gz sed "s/|.*//" ensEstPep.fasta > fixedPep.fa hgPepPred hg16 generic ensESTPep fixedPep.fa # ensGtp associates geneId/transcriptId/proteinId for name searches # Use ensMart to create it as above, except: # Page 3) Choose the "Features" box. In "Ensembl Attributes", check # Ensembl Gene ID, Ensembl Transcript ID, Ensembl Peptide ID. # Choose Text, tab-separated as the output format. 
# Save file as ensGtp.tsv.gz gunzip ensGtp.tsv.gz sed "s/ensGtp/ensESTGtp/" ~/kent/src/hg/lib/ensGtp.sql | hgsql hg16 echo "load data local infile 'ensGtp.tsv' into table ensESTGtp ignore 1 lines" | hgsql hg16 # QA Note - table ensGtp was updated on 2004-08-18 to remove a header line that was included in the actual table data. This was never pushed out to the rr. Table fix (push) done on 2006-01-31 (Jen). Original push on 2004-06. No other pushQ entries exist for table change on 2004-08. # BLASTZ MOUSE MM5 (DONE 2004-09-10 kate) ssh kk # use store7 (lots of space) mkdir -p /cluster/store7/hg16/bed/blastz.mm5.2004-09-10 ln -s /cluster/store7/hg16/bed/blastz.mm5.2004-09-10 \ /cluster/data/hg16/bed cd /cluster/data/hg16/bed ln -s blastz.mm5.2004-09-10 blastz.mm5 cd blastz.mm5 cat << '_EOF_' > DEF # human vs. mouse export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/home/angie/schwartzbin:/cluster/home/kent/bin/i386 ALIGN=blastz-run BLASTZ=blastz BLASTZ_H=2000 BLASTZ_ABRIDGE_REPEATS=1 # TARGET # Human SEQ1_DIR=/iscratch/i/gs.17/build34/bothMaskedNibs # not used SEQ1_RMSK= # not used SEQ1_FLAG= SEQ1_SMSK=/iscratch/i/gs.17/build34/linSpecRep.notInMouse SEQ1_IN_CONTIGS=0 SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY # Mouse SEQ2_DIR=/scratch/mus/mm5/softNib # RMSK not currently used SEQ2_RMSK=/scratch/mus/mm5/rmsk # FLAG not currently used SEQ2_FLAG=-rodent SEQ2_SMSK=/scratch/mus/mm5/linSpecRep.notInHuman SEQ2_IN_CONTIGS=0 SEQ2_CHUNK=30000000 SEQ2_LAP=0 BASE=/cluster/data/hg16/bed/blastz.mm5.2004-09-10 DEF=$BASE/DEF RAW=$BASE/raw CDBDIR=$BASE SEQ1_LEN=$BASE/S1.len SEQ2_LEN=$BASE/S2.len '_EOF_' # << this line keeps emacs coloring happy # first cluster run: blastz alignments ssh kk bash # if a csh/tcsh user cd /cluster/data/hg16/bed/blastz.mm5.2004-09-10 source DEF mkdir $RAW run.0 /cluster/home/angie/hummus/make-joblist $DEF > $BASE/run.0/j sh ./xdir.sh cd run.0 sed -e 's@^blastz-run@/cluster/bin/penn/blastz-run@' j > jobList para create jobList # 44060 jobs para try, check, push, check, .... # Average job time: 382s 6.37m 0.11h 0.00d # Longest job: 4510s 75.17m 1.25h 0.05d # Submission to last job: 26324s 438.73m 7.31h 0.30d # second cluster run: lift raw alignments -> lav dir ssh kki bash # if a csh/tcsh user cd /cluster/data/hg16/bed/blastz.mm5.2004-09-10 source DEF mkdir run.1 lav /cluster/bin/scripts/blastz-make-out2lav $DEF $BASE > $BASE/run.1/jobList cd run.1 wc -l jobList para create jobList # 339 jobs para try, check, push, etc ... 
# Average job time: 16s 0.27m 0.00h 0.00d # Longest job: 112s 1.87m 0.03h 0.00d # Submission to last job: 401s 6.68m 0.11h 0.00d # convert lav files to axt ssh kki cd /cluster/data/hg16/bed/blastz.mm5.2004-09-10 mkdir axtChrom pslChrom # a new run directory mkdir run.2 cd run.2 cat << '_EOF_' > do.csh #!/bin/csh cd $1 set chr = $1:t cat `ls -1 *.lav | sort -g` \ | /cluster/bin/x86_64/lavToAxt -dropSelf stdin \ /iscratch/i/gs.17/build34/bothMaskedNibs /iscratch/i/mus/mm5/softNib stdout \ | /cluster/bin/x86_64/axtSort stdin ../../axtChrom/$chr.axt /cluster/bin/x86_64/axtToPsl ../../axtChrom/$chr.axt ../../S1.len ../../S2.len \ ../../pslChrom/$chr.psl '_EOF_' # << this line makes emacs coloring happy chmod a+x do.csh cat << '_EOF_' > gsub #LOOP ./do.csh {check in exists $(path1)} {check out line+ /cluster/data/hg16/bed/blastz.mm5.2004-09-10/axtChrom/$(root1).axt} {check out line+ /cluster/data/hg16/bed/blastz.mm5.2004-09-10/pslChrom/$(root1).psl} #ENDLOOP '_EOF_' # << this line makes emacs coloring happy ls -1Sd ../lav/chr* > chrom.list gensub2 chrom.list single gsub jobList wc -l jobList # 42 jobs head jobList para create jobList para try, check, push, check,... # Load database tables ssh hgwdev cd /cluster/data/hg16/bed/blastz.mm5/pslChrom foreach f (*.psl) set c = $f:r hgLoadPsl -noTNameIx hg16 -table=${c}_blastzMm5 $f end # takes 30-60 min # CHAIN MOUSE MM5 BLASTZ (DONE 2004-09-15 kate) # Run axtChain on little cluster ssh kki cd /cluster/data/hg16/bed/blastz.mm5.2004-09-10 mkdir -p axtChain/run1 cd axtChain/run1 mkdir out chain ls -1S /cluster/data/hg16/bed/blastz.mm5.2004-09-10/axtChrom/*.axt \ > input.lst cat << '_EOF_' > gsub #LOOP doChain {check in line+ $(path1)} {check out line+ chain/$(root1).chain} {check out exists out/$(root1).out} #ENDLOOP '_EOF_' # << this line makes emacs coloring happy cat << '_EOF_' > doChain #!/bin/csh axtChain $1 \ /iscratch/i/gs.17/build34/bothMaskedNibs \ /iscratch/i/mus/mm5/softNib $2 > $3 '_EOF_' # << this line makes emacs coloring happy chmod a+x doChain gensub2 input.lst single gsub jobList # edit to remove chr19_random para create jobList # 41 jobs para try, check, push, check... 
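# (Optional sanity check, not part of the original run: from axtChain/run1,
# confirm no job left an empty chain file before merging; this should
# print nothing:
find chain -name '*.chain' -size 0
# )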
# now on the cluster server, sort chains
ssh kksilo
cd /cluster/data/hg16/bed/blastz.mm5.2004-09-10/axtChain
time chainMergeSort run1/chain/*.chain > all.chain
# 5 min -- 230.070u 58.980s 5:07.13 94.1%  0+0k 0+0io 117pf+0w
time chainSplit chain all.chain
# 5 min -- 208.490u 56.360s 4:48.81 91.7%  0+0k 0+0io 125pf+0w
rm run1/chain/*.chain

# Load chains into database
ssh hgwdev
cd /cluster/data/hg16/bed/blastz.mm5.2004-09-10/axtChain/chain
foreach i (*.chain)
    set c = $i:r
    echo $c
    hgLoadChain hg16 ${c}_chainMm5 $i
end

# compare with previous mouse, and with this assembly on later human
featureBits hg16 -chrom=chr1 chainMm5
featureBits hg17 -chrom=chr1 chainMm5
featureBits hg16 -chrom=chr1 chainMm3
featureBits hg16 -chrom=chr1 chainMm5Link
# 83288228 bases of 221562941 (37.591%) in intersection
featureBits hg17 -chrom=chr1 chainMm5Link
# 83773012 bases of 222827847 (37.595%) in intersection
featureBits hg16 -chrom=chr1 chainMm3Link
# 82665800 bases of 221562941 (37.310%) in intersection

# NET MOUSE MM5 BLASTZ (DONE 2004-09-16 kate)
ssh kolossus
cd /cluster/data/hg16/bed/blastz.mm5.2004-09-10/axtChain
chainPreNet all.chain ../S1.len ../S2.len stdout \
| chainNet stdin -minSpace=1 ../S1.len ../S2.len stdout /dev/null \
| netSyntenic stdin noClass.net
# < 10 minutes

# Add classification info using db tables:
ssh hgwdev
cd /cluster/data/hg16/bed/blastz.mm5.2004-09-10/axtChain
time netClass noClass.net hg16 mm5 human.net
# 15 minutes

# Make a 'syntenic' subset:
ssh kksilo
cd /cluster/data/hg16/bed/blastz.mm5.2004-09-10/axtChain
rm noClass.net
# Make a 'syntenic' subset of these with
netFilter -syn human.net > humanSyn.net

# Load the nets into database
ssh hgwdev
cd /cluster/data/hg16/bed/blastz.mm5.2004-09-10/axtChain
netFilter -minGap=10 human.net | hgLoadNet hg16 netMm5 stdin
netFilter -minGap=10 humanSyn.net | hgLoadNet hg16 netSyntenyMm5 stdin
# GOT HERE
# Add entries for chainMm5, netMm5, netSyntenyMm5 to human/hg16 trackDb

# LIFTOVER CHAIN TO MOUSE MM5 (DONE 2004-09-16 kate)
ssh kolossus
cd /cluster/data/hg16/bed/blastz.mm5/axtChain
time netChainSubset human.net all.chain \
    /cluster/data/hg16/bed/liftOver/hg16ToMm5.chain
# 7 mins.

# TIGHT FOR MOUSE MM5 (TBD kate)
# BEST FOR MOUSE MM5 (TBD kate)
# SYNTENIC NET FOR MOUSE MM5 (TBD kate)
# DOWNLOADS FOR MOUSE MM5 (TBD kate)
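# sanity check (a sketch, not in the original log): the new liftOver chain
# can be spot-checked by lifting a single hg16 interval to mm5; test.bed
# here is a hypothetical one-line BED file.
printf "chr1\t1000000\t1001000\n" > test.bed
liftOver test.bed /cluster/data/hg16/bed/liftOver/hg16ToMm5.chain \
    test.mm5.bed test.unmapped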
# BLASTZ FOR ZEBRAFISH DANRER1 (WORKING 2004-09-29 kate)
# Treat all repeats as lineage-specific
ssh kkr1u00
mkdir /iscratch/i/gs.17/build34/linSpecRep.notInZebrafish
foreach f (/iscratch/i/gs.17/build34/rmsk/chr*.fa.out)
    cp -p $f \
      /iscratch/i/gs.17/build34/linSpecRep.notInZebrafish/$f:t:r:r.out.spec
end
iSync
ssh kk
# use store7 (lots of space)
mkdir -p /cluster/store7/hg16/bed/blastz.danRer1.2004-09-29
ln -s /cluster/store7/hg16/bed/blastz.danRer1.2004-09-29 \
    /cluster/data/hg16/bed
cd /cluster/data/hg16/bed
ln -s blastz.danRer1.2004-09-29 blastz.danRer1
cd blastz.danRer1
cat << '_EOF_' > DEF
# human vs zebrafish (danRer1)
# params for zebrafish -- L=6000 (threshold for gapped alignments)
# (same params as used for Fugu)
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/i386:/cluster/home/angie/schwartzbin

ALIGN=blastz-run
BLASTZ=blastz
# Reuse parameters from hg16-fr1.
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
BLASTZ_ABRIDGE_REPEATS=1

# Target: Human
SEQ1_DIR=/iscratch/i/gs.17/build34/bothMaskedNibs
SEQ1_RMSK=
SEQ1_FLAG=
SEQ1_SMSK=/iscratch/i/gs.17/build34/linSpecRep.notInZebrafish
SEQ1_IN_CONTIGS=0
SEQ1_CHUNK=10000000
SEQ1_LAP=10000

# Query: Zebrafish (danRer1)
SEQ2_DIR=/iscratch/i/danRer1/nib/
SEQ2_RMSK=
SEQ2_FLAG=
SEQ2_SMSK=/iscratch/i/danRer1/linSpecRep.notInHuman
SEQ2_IN_CONTIGS=0
SEQ2_CHUNK=10000000
SEQ2_LAP=0

BASE=/cluster/data/hg16/bed/blastz.danRer1.2004-09-29

DEF=$BASE/DEF
RAW=$BASE/raw
CDBDIR=$BASE
SEQ1_LEN=$BASE/S1.len
SEQ2_LEN=$BASE/S2.len
'_EOF_'
# << this line keeps emacs coloring happy

# Save the DEF file in the current standard place
cp DEF ~angie/hummus/DEF.hg16-danRer1.2004-09-29

# prepare first cluster run
ssh kk
bash    # if a csh/tcsh user
cd /cluster/data/hg16/bed/blastz.danRer1
source DEF
mkdir $RAW run.0
/cluster/home/angie/hummus/make-joblist $DEF > $BASE/run.0/j
sh ./xdir.sh
cd run.0
sed -e 's@^blastz-run@/cluster/bin/penn/blastz-run@' j > jobList
para create jobList
# 57630 jobs
para try, check, push, check, ....
# Average job time:    477s     7.95m   0.13h   0.01d
# Longest job:       12147s   202.45m   3.37h   0.14d

# second cluster run: lift raw alignments -> lav dir
ssh kki
cd /cluster/data/hg16/bed/blastz.danRer1
bash    # if a csh/tcsh user
source DEF
mkdir run.1 lav
/cluster/bin/scripts/blastz-make-out2lav $DEF $BASE > $BASE/run.1/jobList
cd run.1
wc -l jobList
para create jobList
# 339 jobs
para try
para check
para push
# GOT HERE

# third run: lav -> axt
ssh kki
cd /cluster/data/hg16/bed/blastz.danRer1
mkdir axtChrom run.2
cd run.2
cat << '_EOF_' > do.csh
#!/bin/csh -ef
cd $1
cat `ls -1 *.lav | sort -g` \
| lavToAxt stdin /iscratch/i/gs.17/build34/bothMaskedNibs \
    /iscratch/i/danRer1/nib stdout \
| axtSort stdin $2
'_EOF_'
# << this line makes emacs coloring happy
chmod a+x do.csh
cat << '_EOF_' > gsub
#LOOP
./do.csh {check in exists $(path1)} {check out line+ /cluster/data/hg16/bed/blastz.danRer1/axtChrom/$(root1).axt}
#ENDLOOP
'_EOF_'
# << this line makes emacs coloring happy
ls -1Sd ../lav/chr* > chrom.list
gensub2 chrom.list single gsub jobList
wc -l jobList
head jobList
para create jobList
# 42 jobs
# GOT HERE
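# sanity check (a sketch, not in the original log; this run was left at
# "GOT HERE"): once the 42 lav->axt jobs complete, the number of
# per-chromosome axt files should match the job count:
ls -1 ../axtChrom/chr*.axt | wc -l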
# CHAIN TETRAODON (tetNig1) BLASTZ (DONE, 2004-08-26, hartera)
# Make chains with rescored blastz
# Run axtChain on little cluster
ssh kki
cd /cluster/data/hg17/bed/blastz.tetNig1
mkdir -p axtChain/run1
cd axtChain/run1
mkdir out chain
ls -1S /cluster/data/hg17/bed/blastz.tetNig1/axtChrom/*.axt \
    > input.lst
cat << '_EOF_' > gsub
#LOOP
doChain {check in exists $(path1)} {check out line+ chain/$(root1).chain} {check out line+ out/$(root1).out}
#ENDLOOP
'_EOF_'
# << this line makes emacs coloring happy
# Make our own linear gap file with reduced gap penalties,
# in hopes of getting longer chains - works well for species at
# chicken-human distance or greater
cat << '_EOF_' > ../../chickenHumanTuned.gap
tablesize	11
smallSize	111
position	1	2	3	11	111	2111	12111	32111	72111	152111	252111
qGap	325	360	400	450	600	1100	3600	7600	15600	31600	56600
tGap	325	360	400	450	600	1100	3600	7600	15600	31600	56600
bothGap	625	660	700	750	900	1400	4000	8000	16000	32000	57000
'_EOF_'
# << this line makes emacs coloring happy
cat << '_EOF_' > doChain
#!/bin/csh
axtChain -linearGap=../../chickenHumanTuned.gap $1 \
    /iscratch/i/gs.18/build35/bothMaskedNibs \
    /iscratch/i/tetNig1/nib $2 >& $3
'_EOF_'
# << this line makes emacs coloring happy
chmod a+x doChain
gensub2 input.lst single gsub jobList
para create jobList
# 29 jobs
para try, check, push, check,...

# SEGMENTAL DUPLICATIONS (DONE 10/21/04 angie)
ssh hgwdev
mkdir /cluster/data/hg16/bed/genomicSuperDups
cd /cluster/data/hg16/bed/genomicSuperDups
wget http://humanparalogy.gs.washington.edu/segDupDb.tar
# This tar file contains files for both hg16 and hg17.  A note
# from Xinwei She about the contents:
#Build34 contains 4 tables: 3 of them are already in the genome browser source code:
#genomicSuperDups, celereCoverage and celeraDupPositive. A new table, vanillaTrack,
#which display the Celera assembly overlay in the public assembly build34, is added.
#There trackDb entries can be founded in the file trackDb.add.
#
#Build35 contains only 2 tables: genomicSuperDups and celeraDupPositive.
tar xvf segDupDb.tar
cd bd34
# use tail +2 to skip past the header line:
zcat celeraCoverage.tab.gz | tail +2 \
| hgLoadBed -tab hg16 celeraCoverage stdin
zcat celeraDupPositive.tab.gz | tail +2 \
| hgLoadBed -tab -sqlTable=$HOME/kent/src/hg/lib/celeraDupPositive.sql \
    hg16 celeraDupPositive stdin
zcat genomicSuperDups.tab.gz | tail +2 \
| hgLoadBed -tab -sqlTable=$HOME/kent/src/hg/lib/genomicSuperDups.sql \
    hg16 genomicSuperDups stdin
# Change the name of "vanillaTrack" to celeraOverlay:
zcat vanillaTrack.mysqldump.gz | sed -e 's/vanillaTrack/celeraOverlay/g' \
| hgsql hg16
# It needs a new index, and it needs a bin field, so dump out its
# contents and load them back in using hgLoadBed and an edited
# SQL definition:
hgsql hg16 -N -e 'select * from celeraOverlay' > celeraOverlay.bed
# Make a ~/kent/src/hg/lib/celeraOverlay.as and run autoSql.
# Add bin and indices to celeraOverlay.sql, and reload with hgLoadBed:
hgLoadBed -tab -sqlTable=$HOME/kent/src/hg/lib/celeraOverlay.sql \
    hg16 celeraOverlay celeraOverlay.bed
# clean up
rm celeraOverlay.bed bed.tab

# YALE PSEUDOGENES (started Robert Baertsch, finished JK 2/21/05)
ssh hgwdev
cd /cluster/data/hg16/bed
mkdir pseudoYale
cd pseudoYale
# Place file obtained from Mark Gerstein at Yale in pseudoYale.gtf
ldHgGene hg16 pseudoYale pseudoYale.gtf
# Note - I'm guessing how this goes.  Robert left no record. -jk
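# sanity check (a sketch, not in the original log): since no record was
# left for this load, a quick coverage and row count gives some reassurance
# that the table is populated:
featureBits hg16 pseudoYale
hgsql hg16 -N -e 'select count(*) from pseudoYale'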
## refresh vega tracks with vega build30 (done 5/4/04 Robert)
## download vega mysql tables
cd /cluster/store8/ensembl
mkdir vega30_35c
cd vega30_35c
ln -s /cluster/store8/ensembl/vega30_35c /cluster/data/hg17/bed/vega30
for i in `cat tables` ; do wget -N ftp://ftp.ensembl.org/pub/human-30.35c/data/mysql/homo_sapiens_vega_30_35c/$i.gz ; done
wget -N ftp://ftp.ensembl.org/pub/human-30.35c/data/mysql/homo_sapiens_vega_30_35c/homo_sapiens_vega_30_35c_mysql40_compatible.sql.gz
gunzip *.gz
## create mysql database
mysql
    create database vega30;
    use vega30
    source homo_sapiens_vega_30_35c_mysql40_compatible.sql
    source dropMt.sql
    source load.sql
    exit
hgsql vega30 -N -B < vegaGene.sql > vegaGene.tab
awk -f vegaGene.awk < vegaGene.tab > vegaGene.gp
ldHgGene hg17 vegaGene -predTab vegaGene.gp -gtf -genePredExt
hgsql vega30 -N -B < vegaPseudo.sql > vegaPseudo.tab
awk -f vegaPseudo.awk < vegaPseudo.tab > vegaPseudo.gp
ldHgGene hg17 vegaPseudoGene -predTab vegaPseudo.gp -gtf -genePredExt
# load processed pseudogenes
grep Processed vegaPseudo.tab > vegaProcPseudo.tab
awk -f vegaPseudo.awk < vegaProcPseudo.tab > vegaProcPseudo.gp
ldHgGene hg17 vegaProcessedPseudo -predTab vegaProcPseudo.gp -gtf -genePredExt
# load vegaInfo
hgsql vega30 -N -B < vegaGeneInfo.sql > vegaInfo.tab
hgsql vega30 -N -B < vegaPseudoInfo.sql >> vegaInfo.tab
hgsql hg17 -N -B < /cluster/home/baertsch/kent/src/hg/lib/vegaInfo.sql
echo 'load data local infile "vegaInfo.tab" into table vegaInfo' | hgsql hg17 -N -B
# lift down to hg16
liftOver vegaGene.gp /gbdb/hg17/liftOver/hg17ToHg16.over.chain \
    vegaGeneHg16.gp unMapped.gp -genePred
liftOver vegaPseudo.gp /gbdb/hg17/liftOver/hg17ToHg16.over.chain \
    vegaPseudoGeneHg16.gp unMappedPseudo.gp -genePred
ldHgGene hg16 vegaGene -predTab vegaGeneHg16.gp -gtf
ldHgGene hg16 vegaPseudoGene -predTab vegaPseudoGeneHg16.gp -gtf
echo 'truncate table vegaInfo' | hgsql hg16 -N -B
echo 'load data local infile "vegaInfo.tab" into table vegaInfo' | hgsql hg16 -N -B
# QA note - table vegaPep dropped during this update.  Not dropped from rr
# at the time of the initial push, creating a -times error in joinerCheck.
# Table vegaPep dropped from hgwbeta and rr/mgc on 2006-01-31.
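# sanity check (a sketch, not in the original log): compare row counts of
# the lifted hg16 tables against their hg17 sources; hg16 should be smaller
# by roughly the number of genes that failed to lift (see unMapped*.gp):
hgsql hg17 -N -e 'select count(*) from vegaGene'
hgsql hg16 -N -e 'select count(*) from vegaGene'
wc -l unMapped.gp unMappedPseudo.gp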
#########################################################################
# MOUSE NET/CHAINS MM6 - Info contained in makeMm6.doc (200503 Hiram)

##########################################################################
# CNPs from University of Washington (Done, Heather and Daryl, June/July 2005)
# data from http://humanparalogy.gs.washington.edu/structuralvariation
ssh hgwdev
cd /cluster/data/hg16/bed
mkdir cnp
cd cnp

# Sharp data
cp dupArray.txt cnpSharp.bed.orig
# change CNP type to match Iafrate data (with permission from Andy)
sed -e "s/dup/Gain/" cnpSharp.bed.orig > cnpSharp.bed.2
sed -e "s/del/Loss/" cnpSharp.bed.2 > cnpSharp.bed.3
sed -e "s/Both Loss and Gain/Gain and Loss/" cnpSharp.bed.3 > cnpSharp.bed
hgLoadBed hg16 cnpSharp -tab -sqlTable=cnpSharp.sql cnpSharp.bed
# Loaded 160 elements of size 14
# note: 11 names with special characters: CTD-2183E4*, RP11-111A4?,
# RP11-325E8#, RP11-1000I9*, RP11-159F11*, RP11-177L24*, RP11-136P13*,
# RP11-1151C19*, RP11-1008M3*, RP11-379N11?, CTD-3185D7#
# no apparent problems with these
hgsql hgFixed < cnpSharpCutoff.sql
echo 'load data local infile "sampleCUTOFF.txt" into table cnpSharpCutoff' | hgsql hgFixed
hgsql hg16 < cnpSharpSamples.sql
echo 'load data local infile "andyArraySample.txt" into table cnpSharpSamples' | hgsql hg16
hgsql hg16 < cnpSharpSampleCount.sql
hgsql hg16 < sampleCount.sql

# fosmid discordants
# don't need the id column
cp fosmidDiscordant.txt fosmidDiscordant.bed
hgLoadBed hg16 fosmidDiscordantPrelim -tab -sqlTable=fosmidDiscordantPrelim.sql fosmidDiscordant.bed
hgsql hg16 < fosmidDiscordant.sql
echo 'insert into fosmidDiscordant select bin, chrom, chromStart, chromEnd, name from fosmidDiscordantPrelim' | hgsql hg16
echo 'drop table fosmidDiscordantPrelim' | hgsql hg16

# Iafrate data
cp Iafrate.txt cnpIafrate.bed
hgLoadBed hg16 cnpIafrate -tab -sqlTable=cnpIafrate.sql cnpIafrate.bed

# Sebat data
cp Sebat.txt cnpSebat.bed
hgLoadBed hg16 cnpSebat -tab -sqlTable=cnpSebat.sql cnpSebat.bed

# deletions added May 2006
# From mccarroll@molbio.mgh.harvard.edu
genId.pl < mcCarrolldels.txt > mcCarrolldels.bed
hgLoadBed hg16 -noBin -tab delMccarroll mcCarrolldels.bed
# Hinds data via Andy Sharp
sort -n hindsDels.txt > hindsDels.sort
genId.pl < hindsDels.sort > hindsDels.bed
hgLoadBed hg16 -noBin -tab delHinds hindsDels.bed
# From conrad@uchicago.edu
conrad.pl < conradDels.txt > conradDels.bed
hgLoadBed hg16 -noBin -tab delConrad conradDels.bed
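# (a sketch, not in the original log: the three Sharp-data substitutions
# above could be done in a single sed pass; per-line, the expressions apply
# in the same order, so the result is identical)
sed -e "s/dup/Gain/" -e "s/del/Loss/" -e "s/Both Loss and Gain/Gain and Loss/" \
    cnpSharp.bed.orig > cnpSharp.bed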
##########################################################################
# sno/miRNA track from Michel Weber (DONE - 2005-06-16 - Hiram)
# received the data file UCSC_snotrack_hg16.txt via email
ssh hgwdev
cd /cluster/data/hg16/bed/wgRna
# As a quick first pass at classification, take a look at the
# items in the hg17.wgRna table and use those as a guide
hgsql -N -e "select * from wgRna;" hg17 > hg17.wgRna.txt
awk '{print $5,$10}' hg17.wgRna.txt > name.type.hg17
# combine this new sno data with the existing miRNA data
hgsql -N -e "select * from miRNA;" hg16 > hg16.miRNA.txt
cat << '_EOF_' > addTypes.pl
#!/usr/bin/env perl

use warnings;
use strict;

my %types;      # key is name, value is the type

open (FH, "name.type.hg17") or die "Can not open name.type.hg17";
while (my $line = <FH>) {
    chomp $line;
    my ($name, $type) = split('\s+',$line);
    $types{$name} = $type;
}
close (FH);

open (FH,"grep ^chr UCSC_snotrack_hg16.txt | sort -k1,1 -k2,2n|") or
    die "can not open UCSC_snotrack_hg16.txt";
while (my $line = <FH>) {
    chomp $line;
    my $type="unknown";
    my ($chrom, $start, $end, $name, $score, $strand) = split('\s+',$line);
    if (exists($types{$name})) {
        $type = $types{$name};
    } else {
        if ($name =~ m/^HBII/) { $type = "CDBox"; }
    }
    print "$chrom\t$start\t$end\t$name\t$score\t$strand\t0\t0\t$type\n";
}
close (FH);
'_EOF_'
# happy emacs
chmod +x addTypes.pl
awk '{print $2"\t"$3"\t"$4"\t"$5"\t"$6"\t"$7"\t"$8"\t"$9"\t""miRna"}' \
    hg16.miRNA.txt > hg16.wgRna.tab
./addTypes.pl >> hg16.wgRna.tab
hgLoadBed -sqlTable=$HOME/kent/src/hg/lib/wgRna.sql hg16 wgRna \
    hg16.wgRna.tab
# this leaves 16 items classified as unknown; request sent to
# Michel Weber for proper classification

################################################################################
# Build hg17Kg table for KG II for hg16, using hg17 KG data (DONE 2005-07-11 Fan).
ssh hgwdev
cd /cluster/data/mm6/bed
mkdir hg17Kg
cd hg17Kg
hgsql hg16 -N -e "select * from all_mrna" | cut -f 2-22 > all_mrna.psl
hgsql hg16 -N -e \
    'select gbCdnaInfo.acc,cds.name from gbCdnaInfo,cds,all_mrna where all_mrna.qName=gbCdnaInfo.acc and gbCdnaInfo.cds=cds.id' \
    | sort -u > all_mrna.cds
bash
mrnaToGene -cdsFile=all_mrna.cds all_mrna.psl all_mrna.gp 2>all_mrna.err > all_mrna.log
exit
hgsql hg16 -e 'drop table mrnaGp'
hgsql hg16 < ~/src/hg/lib/mrnaGp.sql
hgsql hg16 -e 'load data local infile "all_mrna.gp" into table mrnaGp'
hgsql hg16 -N -e \
    'select mrnaGp.* from mrnaGp,hg17.knownGene where mrnaGp.name = knownGene.name and mrnaGp.chrom=knownGene.chrom' \
    | sort -u > mrnaGp2.tab
hgsql hg16 -e 'drop table mrnaGp2'
hgsql hg16 < ~/src/hg/lib/mrnaGp2.sql
hgsql hg16 -e 'load data local infile "mrnaGp2.tab" into table mrnaGp2'
# Create hg16Kg table in hg17 to get around the fact that we can not do a
# join between MySQL DBs
hgsql hg17 -e 'drop table hg16Kg'
hgsql hg17 < ~/src/hg/lib/hg16Kg.sql
hgsql hg16 -N -e 'select * from knownGene' > hg16Kg.tab
hgsql hg17 -e 'load data local infile "hg16Kg.tab" into table hg16Kg'
hgsql hg17 -N -e \
    'select hg16Kg.* from hg16Kg, knownGene where hg16Kg.name=knownGene.name and knownGene.name not like "NM_%" and hg16Kg.chrom=knownGene.chrom' \
    > j
cut -f 1-10 j > j1
# j1 are mRNA records that came through the old KG process.
# j2 are RefSeq records based on hg17 KG.
# mrnaGp2 are mRNA records based on hg17 KG non-RefSeq entries and GenBank
# CDS data (which is incomplete).
hgsql hg16 -N -e \
    'select refGene.* from refGene, hg17.knownGene where hg17.knownGene.name=refGene.name' > j2
cat j1 j2 mrnaGp2.tab | sort -u > j.tab
~/kent/src/hg/protein/sortKg.pl j.tab > hg17Kg.tab
wc hg17Kg.tab
hgsql hg16 -e "delete from hg17Kg"
hgsql hg16 -e 'load data local infile "hg17Kg.tab" into table hg17Kg'
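# sanity check (a sketch, not in the original log): after the load, the
# table's row count should match the sorted/uniqued input file:
wc -l hg17Kg.tab
hgsql hg16 -N -e 'select count(*) from hg17Kg'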
####################################################################
# Make mouse ortholog column using blastp on mm6 known genes. (DONE 7/12/05, Fan).
# First make mouse protein database and copy it to /cluster/panasas
# if it doesn't exist already.
# This already exists.  See makeMm6.doc for the procedure.

# Make parasol run directory
ssh kk
mkdir -p /cluster/data/hg16/bed/blastp/mm6
cd /cluster/data/hg16/bed/blastp/mm6
mkdir run
cd run
mkdir out

# Make blast script
cat << '_EOF_' > blastSome
#!/bin/sh
BLASTMAT=/cluster/bluearc/blast229/data /cluster/bluearc/blast229/blastall \
    -p blastp -d /cluster/panasas/home/store/mm6/blastp/known \
    -i $1 -o $2 -e 0.001 -m 8 -b 1
'_EOF_'
# << keep emacs happy
chmod a+x blastSome

# Make gensub2 file
cat << '_EOF_' > gsub
#LOOP
blastSome {check in line+ $(path1)} {check out line out/$(root1).tab}
#ENDLOOP
'_EOF_'
# << keep emacs happy

# Create parasol batch
# this echo trick is used because otherwise the command line is
# too long and you can not do a simple ls
echo ../../split/*.fa | wordLine stdin > split.lst
gensub2 split.lst single gsub jobList
para create jobList
para try
para check
para push
... etc ...
Completed: 5812 of 5812 jobs
CPU time in finished jobs:      96031s    1600.52m    26.68h    1.11d  0.003 y
IO & Wait Time:                 15641s     260.68m     4.34h    0.18d  0.000 y
Average job time:                  19s       0.32m     0.01h    0.00d
Longest running job:                0s       0.00m     0.00h    0.00d
Longest finished job:             168s       2.80m     0.05h    0.00d
Submission to last job:           766s      12.77m     0.21h    0.01d

# Load into database.
ssh hgwdev
cd /cluster/data/hg16/bed/blastp/mm6/run/out
hgLoadBlastTab hg16 mmBlastTab -maxPer=1 *.tab
# Scanning through 5812 files
# Loading database with 35707 rows
# Update otherOrg.ra under hg/hgGene/hgGeneData/Human/hg16 to use mm6
# instead of mm4.

##########################################################################
# EVOFOLD - RNA secondary structure predictions lifted from hg17 (Jakob Skou Pedersen)
# Jakob Skou Pedersen, July 12, 2005
ssh -C hgwdev
mkdir -p /cluster/data/hg16/bed/evofold
cd /cluster/data/hg16/bed/evofold
# lifting folds from hg17 to hg16
echo "select chrom, chromStart, chromEnd, name, score, strand, size, secStr, conf from evofold;" \
    | hgsql hg17 | sed -e 1d > foldsHg17.bed
liftOver -minMatch=1.0 foldsHg17.bed \
    /cluster/data/hg17/bed/liftOver/hg17ToHg16.over.chain tmp.bed unmapped.bed
# remove elements which are the wrong size after lifting
awk '$3-$2 == $7' tmp.bed > foldsHg16.bed
hgLoadBed -notItemRgb -sqlTable=/cluster/home/jsp/prog/kent/src/hg/lib/evofold.sql \
    hg16 evofold foldsHg16.bed
# clean up
rm foldsHg17.bed unmapped.bed tmp.bed
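# sanity check (a sketch, not in the original log): the loaded table should
# match the size-filtered bed file kept above:
wc -l foldsHg16.bed
hgsql hg16 -N -e 'select count(*) from evofold'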
# Tajima's D (DONE -- 2005-09-20 -- Daryl)
# Data from Chris Carlson in Debbie Nickerson's lab
# Chris Carlson [csc47uwashingtonedu]
# lifted down from hg17.  See makeHg17.doc for details.

# AFFYHUEX1 track (sugnet Wed Oct 5 12:18:18 PDT 2005)
mkdir hg16
cd hg16
pwd
# /cluster/store1/sugnet/affymetrixHumanAllExon/hg16
mkdir gff beds annot
cd gff
# download gff design files
cp ../../hg17/gff/parseGff.pl .
# parse gff script...
#!/usr/bin/perl -w

if(scalar(@ARGV) == 0) {
    print STDERR "parseGff.pl - Parse out Affymetrix's gff annotation probesets
for the human all exon design.
usage:
   parseGff.pl file1.design.gff file2.design.gff ... fileN.design.gff
";
    exit(1);
}

sub splitField($) {
    my $l = shift(@_);
    my @w = split / /, $l;
    return $w[1];
}

while($file = shift(@ARGV)) {
    if(!($file =~ /(.+)\.gff/)) { die "$file doesn't have .gff suffix\n"; }
    $prefix = $1;
    print STDERR "Doing file $file.\n";
    open(IN, $file) or die "Can't open $file to read.";
    open(BED, ">../beds/$prefix.pset.bed") or die "Can't open ../beds/$prefix.pset.bed to write.";
    open(ANNOT, ">../annot/$prefix.tab") or die "Can't open ../annot/$prefix.tab to write.";
    while($line = <IN>) {
        # Only want the probeset records.
        if($line =~ /\tprobeset\t/) {
            $score = 0;
            $cds = 0;
            $bounded = 0;
            chomp($line);
            # pop off any Microsoft line endings.
            $line =~ s/\r$//;
            @words = split /\t/, $line;
            # This makes the evidence be comma separated.
            $words[8] =~ s/\" \"/,/g;
            # This gets rid of pesky quotes.
            $words[8] =~ s/\"//g;
            # Set the score based on the annotation type
            if($words[8] =~ /full/) { $score = 200; }
            elsif($words[8] =~ /extended/) { $score = 500; }
            elsif($words[8] =~ /core/) { $score = 900; }
            if($words[8] =~ /bounded/) { $score -= 200; }
            if($words[8] =~ /cds/) { $score += 100; }
            if($score <= 0) { $score = 100; }
            # Print out the annotation fields.
            @fields = split /; /,$words[8];
            $id = splitField($fields[1]);
            $f = shift(@fields);
            $f = splitField($f);
            print ANNOT "$f";
            while($f = shift(@fields)) {
                if($f =~ /^bounded/) { $bounded = 1; }
                if($f =~ /^cds/) { $cds = 1; }
                if(!($f =~ /^bounded/ || $f =~ /^cds/)) {
                    $f = splitField($f);
                    print ANNOT "\t$f";
                }
            }
            print ANNOT "\t$bounded\t$cds";
            print ANNOT "\n";
            print BED "$words[0]\t$words[3]\t$words[4]\t$id\t$score\t$words[6]\n";
        }
    }
    close(IN);
    close(BED);
    close(ANNOT);
}

./parseGff.pl *.gff
cat beds/*.bed > affyHuEx1.bed
hgLoadBed hg16 affyHuEx1 affyHuEx1.bed -strict
cat annot/*.tab > affyHuEx1.annot.tab
cp ../hg17/affyHuEx1Annot.sql ./
# Contents of affyHuEx1Annot.sql file
CREATE TABLE affyHuEx1Annot (
    numIndependentProbes smallint not null,
    probesetId int(11) not null,
    exonClustId int(11) not null,
    numNonOverlapProbes smallint not null,
    probeCount smallint not null,
    transcriptClustId int(11) not null,
    probesetType smallint not null,
    numXHybeProbe smallint not null,
    psrId int(11) not null,
    level varchar(10) not null,
    evidence varchar(255) not null,
    bounded smallint not null,
    cds smallint not null,
    PRIMARY KEY (probesetId)
);
hg16S -A < affyHuEx1Annot.sql
echo "load data local infile 'affyHuEx1.annot.tab' into table affyHuEx1Annot;" | hg16S -A
# end AFFYHUEX1 track
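# sanity check (a sketch, not in the original log): the score column encodes
# the annotation level (full/extended/core, adjusted for bounded/cds), so
# the distribution of scores in the loaded bed is a quick consistency check:
awk '{print $5}' affyHuEx1.bed | sort -n | uniq -c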
##########################################################################
# NHGRI DNASE I HYPERSENSITIVE SITES (2005-10-05 kate)
# Submitted by Greg Crawford via web site,
# http://research.nhgri.nih.gov/DNaseHS/May2005/
# In addition, a file containing the 'randoms' was FTP'ed by Greg
# NOTE: bad chr8_random entry removed, as per G. Crawford
# Same display as ENCODE track by Angie...
# Jim asked to add scores for grayscale-coloring:
# clusters of 2 drawn in 50%, clusters of 3 drawn in 75%,
# and clusters of 4 or more drawn in 100% black.
mkdir -p /cluster/data/hg16/bed/nhgri/lab
cd /cluster/data/hg16/bed/nhgri/lab
foreach c (`cut -f 1 /cluster/data/hg16/chrom.sizes`)
    echo $c
    wget -nd http://research.nhgri.nih.gov/DNaseHS/May2005/clusters/$c.LynxClusters.bed
end
cd ..
# special handling for ID's on chrM (they are preceded by 'M_')
ls lab/chr*.bed lab/randoms.txt \
| grep -v chrM | xargs cat | grep '^chr' \
| perl -wpe 'if (/500bp_(\d+)_(\d+)/) { \
      $id = $1 . "_" . $2; \
      $score = ($2 >= 4) ? 1000 : $2 * 250; \
      s/500bp.+/$id\t$score/; } else { die "parse"; }' > hs.bed
cat lab/chrM*.bed | grep '^chr' \
| perl -wpe 'if (/500bp_(M_\d+)_(\d+)/) { \
      $id = $1 . "_" . $2; \
      $score = ($2 >= 4) ? 1000 : $2 * 250; \
      s/500bp.*/$id\t$score/; } else { die "parse"; }' >> hs.bed
hgLoadBed hg16 nhgriDnaseHs hs.bed
# Loaded 14224 elements of size 5
checkTableCoords hg16 nhgriDnaseHs

# MYTOUCH FIX - jen - 2006-01-24
sudo mytouch hg16 superfamily 0407141100.00
sudo mytouch hg16 acemblyPep 0406151200.00
sudo mytouch hg16 twinscanPep 0407141200.00
sudo mytouch hg16 ensPep 0407141100.00
sudo mytouch hg16 knownToEnsembl 0407141100.00
sudo mytouch hg16 sfDescription 0407141100.00
sudo mytouch hg16 ensEstGtp 0409081800.00
sudo mytouch hg16 ensEstPep 0409081800.00

##########################################################################
# AFFY HUEX1 OFF-BY-ONE FIX (Andy 2006-12-14)
ssh hgwdev
cd /cluster/data/hg16/bed/affyHumanExon
liftOver /cluster/data/hg17/bed/affyHumanExon/affyHuEx1.fixed.bed \
    /gbdb/hg17/liftOver/hg17ToHg16.over.chain.gz affyHuEx1.fixed.bed affyHuEx1.unmapped
awk 'BEGIN{OFS="\t"}{print $4,$3-$2}' affyHuEx1.fixed.bed | sort -k2,2nr | head
#2325773        204918
#2402134        204802
#3645108        60419
#2366900        52086
#3016074        9552
#3641787        8061
#2321649        8054
# So there's 4 of them with problems this time:
egrep -v "\b(2325773|2402134|3645108|2366900)\b" affyHuEx1.fixed.bed > alreadyok.bed
egrep "\b(2325773|2402134|3645108|2366900)\b" affyHuEx1.fixed.bed \
    /cluster/data/hg17/bed/affyHumanExon/affyHuEx1.fixed.bed > good.hg17.bed
bedToFa /cluster/data/hg17/hg17.2bit good.hg17.bed good.hg17.fa
gfClient blat6 17785 /cluster/data/hg16/nib good.hg17.fa bad.hg16.psl
tail +6 bad.hg16.psl | awk '$11==$13{print}' > good.hg16.psl
pslToBed good.hg16.psl good.hg16.bed
# Scores were lost in the transformations.  Put em back in.
egrep "\b(2325773|2402134|3645108|2366900)\b" affyHuEx1.fixed.bed
#chr1    24924744        25129662        2325773 500     +
#chr1    24924872        25129674        2402134 900     -
#chr1    168139941       168192027       2366900 1000    +
#chr16   2600606 2661025 3645108 200     +
awk 'BEGIN{OFS="\t"} $4=="2325773"{score="500";} $4=="2402134"{score="900";} $4=="2366900"{score="1000";} $4=="3645108"{score="200";} {print $1,$2,$3,$4,score,$6}' good.hg16.bed > good.bed
cat alreadyok.bed good.bed > affyHuEx1.fixed.bed
bedSort affyHuEx1.fixed.bed tmp.bed
rm good.* bad.* alreadyok.bed
hgLoadBed hg16 affyHuEx1 affyHuEx1.fixed.bed
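# sanity check (a sketch, not in the original log): re-run the size listing
# from above on the corrected file; the four oversized probesets should no
# longer appear at the top of the list:
awk 'BEGIN{OFS="\t"}{print $4,$3-$2}' affyHuEx1.fixed.bed | sort -k2,2nr | head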
##########################################################################
# GenBank gbMiscDiff table (markd 2007-01-10)
# Supports 'NCBI Clone Validation' section of mgcGenes details page
# genbank release 157.0 now contains misc_diff fields for MGC clones;
# reloading the mRNAs results in the gbMiscDiff table being created.
./bin/gbDbLoadStep -reload -srcDb=genbank -type=mrna hg16

################################################
# SPLIT EXPRESSION & REGULATION GROUPS (2008-09-09 kate)
echo "insert into grp (name, label, priority) values ('expression', 'Expression', 4.5)" \
    | hgsql hg16
echo "update grp set label='Regulation' where name='regulation'" | hgsql hg16

################################################
# AUTOMATE UPSTREAM FILE CREATION (2008-10-15 markd)
echo hg16 panTro1 mm3 rn3 galGal2 \
    > /hive/data/genomes/hg16/bed/multiz.hg16mm3rn3panTro1galGal2/species.lst
# update genbank.conf:
    hg16.upstreamGeneTbl = refGene
    hg16.upstreamMaf = mzPt1Mm3Rn3Gg2_pHMM /hive/data/genomes/hg16/bed/multiz.hg16mm3rn3panTro1galGal2/species.lst

#############################################################################
# MAKE PCR TARGET FOR UCSC GENES (REGENERATED BUT NOT RELOADED 4/2/10 angie)
# Originally done 11/4/08 (with file from not quite the right UCSC Genes
# build directory, resulting in some missing genes in kgTargetAli)
ssh hgwdev
mkdir /cluster/data/hg16/bed/mrnaPcr
cd /cluster/data/hg16/bed/mrnaPcr
# First, get consistent FA and PSL for UCSC Genes.
hgsql hg16 -NBe 'select * from knownGene' | cut -f 1-10 > knownGene.gp
genePredToBed knownGene.gp \
    > ucscGenes.bed
hgsql hg16 -NBe 'select kgId,geneSymbol from kgXref' \
| perl -wpe 's/^(\S+)\t(\S+)/$1\t${1}__$2/ || die;' \
    > idSub.txt
subColumn 4 ucscGenes.bed idSub.txt ucscGenesIdSubbed.bed
sequenceForBed -keepName -db=hg16 -bedIn=ucscGenesIdSubbed.bed \
    -fastaOut=stdout \
| faToTwoBit -ignoreDups stdin kgTargetSeq.2bit
cut -f 1-10 knownGene.gp \
| genePredToFakePsl hg16 stdin kgTargetAli.psl /dev/null
# NOT DONE -- not worth QA effort:
# Load up the UCSC Genes target PSL table and put 2bit in /gbdb:
hgLoadPsl hg16 -table=kgTargetAliMar10 kgTargetAli.psl
mkdir /gbdb/hg16/targetDb
ln -s /cluster/data/hg16/bed/mrnaPcr/kgTargetSeq.2bit /gbdb/hg16/targetDb/kgTargetSeqMar10.2bit
# Ask cluster-admin to start an untranslated, -stepSize=5 gfServer on
# /gbdb/hg16/targetDb/kgTargetSeq.2bit .
ssh hgwdev
# Add records to hgcentraltest blatServers and targetDb:
hgsql hgcentraltest -e \
    'INSERT into blatServers values ("hg16KgMar10", "blat13", 17795, 0, 1);'
hgsql hgcentraltest -e \
    'INSERT into targetDb values("hg16KgMar10", "UCSC Genes", \
        "hg16", "kgTargetAliMar10", "", "", \
        "/gbdb/hg16/targetDb/kgTargetSeqMar10.2bit", 1, now(), "");'

#############################################################################
# LIFTOVER TO Hg19 (DONE - 2009-04-24 - Hiram )
mkdir /hive/data/genomes/hg16/bed/blat.hg19.2009-04-24
cd /hive/data/genomes/hg16/bed/blat.hg19.2009-04-24
# -debug run to create run dir, preview scripts...
doSameSpeciesLiftOver.pl -buildDir=`pwd` -debug hg16 hg19
# Real run:
time nice -n +19 \
    $HOME/kent/src/hg/utils/automation/doSameSpeciesLiftOver.pl \
    -buildDir=`pwd` -verbose=2 \
    -bigClusterHub=swarm -dbHost=hgwdev -workhorse=hgwdev \
    hg16 hg19 > do.log 2>&1 &
# real    93m11.093s
#############################################################################
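# sanity check (a sketch, not in the original log): doSameSpeciesLiftOver.pl
# conventionally leaves the finished chain in the assembly's liftOver
# directory (path assumed here); spot-lift one interval to confirm it works.
# test.bed is a hypothetical one-line BED file.
ls -l /hive/data/genomes/hg16/bed/liftOver/hg16ToHg19.over.chain.gz
printf "chr1\t1000000\t1001000\n" > test.bed
liftOver test.bed /hive/data/genomes/hg16/bed/liftOver/hg16ToHg19.over.chain.gz \
    test.hg19.bed test.unmapped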