# for emacs: -*- mode: sh; -*-

# Danio rerio (zebrafish) from Sanger, version Zv3 (released 11/27/03)
# Project website:
#   http://www.sanger.ac.uk/Projects/D_rerio/
# Assembly notes:
#   http://www.sanger.ac.uk/Projects/D_rerio/Zv3_assembly_information.shtml

# DOWNLOAD SEQUENCE (DONE, 2004-05-17, kate)
    ssh kksilo
    mkdir /cluster/store7/danRer1
    ln -s /cluster/store7/danRer1 /cluster/data
    cd /cluster/data/danRer1
    wget ftp://ftp.ensembl.org/pub/assembly/zebrafish/Zv3release/README
    wget ftp://ftp.ensembl.org/pub/assembly/zebrafish/Zv3release/Zv3.contigs.agp
    wget ftp://ftp.ensembl.org/pub/assembly/zebrafish/Zv3release/Zv3.supercontigs.agp
    wget ftp://ftp.ensembl.org/pub/assembly/zebrafish/Zv3release/Zv3.supercontigs.fa
    wget ftp://ftp.ensembl.org/pub/assembly/zebrafish/Zv3release/Zv3.supercontigs.fa.tag

# DOWNLOAD MITOCHONDRION GENOME SEQUENCE (DONE, 2004-05-24, hartera)
# CREATE chrM.agp (DONE, 2004-05-27, hartera)
# Add "chr" prefix to chrM.agp (DONE, 2004-07-06, hartera)
# Error reported by a user for danRer2, also a problem in danRer1:
# chrM.agp was space-delimited rather than tab-delimited, so correct this;
# NCBI defines the AGP format as tab-delimited. (DONE, 2005-04-25, hartera)
    ssh kksilo
    mkdir /cluster/data/danRer1/M
    cd /cluster/data/danRer1/M
    # Go to http://www.ncbi.nih.gov/ and search Nucleotide for
    # "Danio mitochondrion genome". That shows the gi number:
    # 8576324 for the accession, AC024175
    # Use that number in the Entrez linking interface to get the fasta:
    wget -O chrM.fa \
      'http://www.ncbi.nlm.nih.gov/entrez/query.fcgi?cmd=Text&db=Nucleotide&uid=8576324&dopt=FASTA'
    # Edit chrM.fa: make sure the header line says it is the
    # Danio rerio mitochondrion complete genome, and then replace the
    # header line with just ">chrM".

    # Make a "pseudo-contig" for processing chrM too:
    mkdir ./chrM_1
    sed -e 's/chrM/chrM_1/' ./chrM.fa > ./chrM_1/chrM_1.fa
    mkdir ./lift
    echo "chrM_1/chrM_1.fa.out" > ./lift/oOut.lst
    echo "chrM_1" > ./lift/ordered.lst
    echo "0 M/chrM_1 16596 chrM 16596" > ./lift/ordered.lft

    # hgGoldGapGl and other programs require a .agp file, so create one
    # for chrM:
    cat << '_EOF_' > ./chrM.agp
M 1 16596 1 F AC024175.3 1 16596 +
'_EOF_'

    # Add "chr" prefix to M in chrM.agp (2004-07-06)
    perl -pi.bak -e 's/M/chrM/' ./M/chrM.agp
    # check the file, then remove the backup
    rm ./M/chrM.agp.bak

    # chrM.agp is space-delimited instead of tab-delimited,
    # so correct this (hartera, 2005-04-25)
    cd /cluster/data/danRer1/M
    # Edit chrM.agp and change the field delimiters from spaces to tabs.
    # Add the new file to the zipped agp files and get it pushed to the
    # downloads area - see the "MAKE DOWNLOADABLE SEQUENCE FILES" section
    # of this make doc.

# SPLIT AGP FILES BY CHROMOSOME (DONE, 2004-05-26, hartera)
    ssh kksilo
    cd /cluster/data/danRer1
    # There are 2 .agp files: one for supercontigs, and one for contigs
    # showing how they map onto supercontigs.
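    # Given the space-vs-tab problem seen with chrM.agp above, it is
    # worth sanity-checking the downloaded AGP files before splitting
    # them. A minimal sketch (assumes the standard NCBI AGP layout:
    # 9 tab-separated fields for fragment lines, 8 for gap lines,
    # contiguous coords; FNR/FILENAME are POSIX awk):
    cat << '_EOF_' > checkAgp.awk
$5 == "N" && NF != 8 {print FILENAME": bad gap line "FNR; next}
$5 != "N" && NF != 9 {print FILENAME": bad fragment line "FNR; next}
$1 == prevObj && $2 != prevEnd + 1 {print FILENAME": not contiguous at "FNR}
{prevObj = $1; prevEnd = $3}
'_EOF_'
    awk -F'\t' -f checkAgp.awk Zv3.contigs.agp Zv3.supercontigs.agp
    # no output means both files look clean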
    # Split up the agp files into one per chrom:
    foreach c ( 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 \
                21 22 23 24 25 )
      mkdir $c
      perl -we "while(<>){if (/^$c\t/) {print;}}" \
        ./Zv3.contigs.agp > $c/chr$c.contigs.agp
      perl -we "while(<>){if (/^$c\t/) {print;}}" \
        ./Zv3.supercontigs.agp > $c/chr$c.supercontigs.agp
    end
    # From the agp files, get supercontigs and contigs for
    # ctg* as chrUn, NA* as chrNA and Finished* as chrFinished:
    foreach t ( ctg NA Finished )
      if ($t == "ctg") then
        set c = "Un"
      else
        set c = $t
      endif
      mkdir $c
      perl -we "while(<>){if (/^$t/) {print;} }" \
        ./Zv3.contigs.agp >> $c/chr$c.contigs.agp
      perl -we "while(<>){if (/^$t/) {print;} }" \
        ./Zv3.supercontigs.agp >> $c/chr$c.supercontigs.agp
    end

# BUILD CHROM-LEVEL SEQUENCE (DONE, 2004-05-27, hartera)
    ssh kksilo
    cd /cluster/data/danRer1
    # Sequence is already in upper case, so no need to change it
    foreach c ( 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 \
                21 22 23 24 25 )
      echo "Processing ${c}"
      $HOME/bin/i386/agpToFa -simpleMultiMixed $c/chr$c.supercontigs.agp $c \
        $c/chr$c.fa ./Zv3.supercontigs.fa
      echo "${c} - DONE"
    end
    # Need to change the number for each chromosome in the .agp and .fa
    # files to read "chrN" - this should have been done before processing
    # the original sequence and .agp files:
    foreach c ( 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 \
                21 22 23 24 25 )
      echo "Processing ${c}"
      perl -pi -e 's/^>([0-9]+)/>chr$1/' $c/*.fa
      perl -pi -e 's/^([0-9]+)/chr$1/' $c/*.agp
      echo "${c} - DONE"
    end

# CREATE chrUn, chrNA AND chrFinished FASTA, AGP, GAP AND LIFT FILES FROM
# ctg*, NA* AND Finished* SUPERCONTIGS (DONE, 2004-05-27, hartera)
# ADD CORRECT FRAGMENT TYPE TO .agp FILES (DONE, 2004-07-07, hartera)
    ssh kksilo
    cd /cluster/data/danRer1
    foreach c ( Finished NA Un )
      awk '{print $1;}' $c/chr$c.supercontigs.agp > $c/chr$c.supercontigs.lst
      $HOME/bin/i386/faSomeRecords /cluster/data/danRer1/Zv3.supercontigs.fa \
        $c/chr$c.supercontigs.lst $c/chr$c.fa
    end
    # Check the FASTA files, then generate AGP and lift files
    # from the chromosome fastas:
    foreach c ( Finished NA Un )
      $HOME/bin/i386/scaffoldFaToAgp $c/chr$c.fa
      mv $c/chr$c.fa $c/chr$c.supercontigs.fa
      perl -pi -e "s/chrUn/chr$c/" $c/chr$c.*
      $HOME/bin/i386/agpToFa -simpleMultiMixed $c/chr$c.agp \
        chr$c $c/chr$c.fa ./Zv3.supercontigs.fa
    end
    # chrFinished: scaffold gap size is 1000, total scaffolds: 209
    #              chrom size is 40167097
    # chrNA:       scaffold gap size is 1000, total scaffolds: 54798
    #              chrom size is 390413307
    # chrUn:       scaffold gap size is 1000, total scaffolds: 367113659 -> 1842 scaffolds
    #              chrom size is 367113659

    # Add the correct fragment type to the .agp files (2004-07-07):
    # "W" for all supercontigs on chrUn and chrNA, "F" on chrFinished
    ssh kksilo
    cd /cluster/data/danRer1
    foreach c (Un NA Finished)
      if ($c == "Un" || $c == "NA") then
        set f = "W"
      else
        set f = "F"
      endif
      perl -pi.bak -e "s/D/$f/;" $c/chr${c}.agp
    end
    # check the .agp files, then remove the backup files
    foreach c (Un NA Finished)
      rm ./$c/chr${c}.agp.bak
    end
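    # A quick way to confirm the substitution behaved as intended: each
    # chrom's agp should now carry a single fragment type (sketch):
    foreach c (Un NA Finished)
      echo chr$c
      awk '$5 != "N" {print $5}' $c/chr${c}.agp | sort | uniq -c
    end
    # expect only W for chrUn and chrNA, only F for chrFinished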
# CHECK CHROM AND VIRTUAL CHROM SEQUENCES (DONE, 2004-05-27, hartera)
    # Check that the size of each chromosome .fa file is equal to the
    # last coord of the corresponding .agp:
    ssh hgwdev
    cd /cluster/data/danRer1
    foreach c ( Finished NA Un )
      foreach f ( $c/chr$c.agp )
        set agpLen = `tail -1 $f | awk '{print $3;}'`
        set g = $f:r
        set faLen = `faSize $g.fa | awk '{print $1;}'`
        if ($agpLen == $faLen) then
          echo "   OK: $f length = $g length = $faLen"
        else
          echo "ERROR: $f length = $agpLen, but $g length = $faLen"
        endif
      end
    end
    foreach c ( 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 \
                21 22 23 24 25 )
      foreach f ( $c/chr$c.supercontigs.agp )
        set agpLen = `tail -1 $f | awk '{print $3;}'`
        set h = $f:r
        set g = $h:r
        echo "Getting size of $g.fa"
        set faLen = `faSize $g.fa | awk '{print $1;}'`
        if ($agpLen == $faLen) then
          echo "   OK: $f length = $g length = $faLen"
        else
          echo "ERROR: $f length = $agpLen, but $g length = $faLen"
        endif
      end
    end
    # All are OK, so all the fasta files are the correct size

# BREAK UP SEQUENCE INTO 5MB CHUNKS AT CONTIGS/GAPS FOR CLUSTER RUNS
# (DONE, 2004-05-27, hartera)
    ssh kksilo
    cd /cluster/data/danRer1
    foreach c ( 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 \
                21 22 23 24 25 Finished NA Un )
      foreach agp ($c/chr$c.{,contigs.}agp)
        if (-e $agp) then
          set fa = $c/chr$c.fa
          echo splitting $agp and $fa
          cp -p $agp $agp.bak
          cp -p $fa $fa.bak
          splitFaIntoContigs $agp $fa . -nSize=5000000
        endif
      end
    end

    # Create the list of chromosomes (DONE, 2004-05-27, hartera)
    ssh hgwdev
    cd /cluster/data/danRer1
    foreach f (*/*.agp)
      set chr = `echo $f:h | sed -e 's/^chr//'`
      echo $chr >> chrom
    end
    sort -n chrom | uniq > chrom.lst
    rm chrom

# MAKE JKSTUFF AND BED DIRECTORIES (DONE, 2004-05-27, hartera)
    # This used to hold scripts -- better to keep them inline here.
    # Now it should just hold lift file(s) and temporary scripts made by
    # copy-paste from this file.
    mkdir /cluster/data/danRer1/jkStuff
    # This is where most tracks will be built:
    mkdir /cluster/data/danRer1/bed

# CREATING DATABASE (DONE, 2004-05-27, hartera)
    # Create the database.
    # next machine
    ssh hgwdev
    echo 'create database danRer1' | hgsql ''
    # if you need to delete that database:  !!! WILL DELETE EVERYTHING !!!
    echo 'drop database danRer1' | hgsql danRer1
    # Use df to make sure there is at least 5 gig free on /var/lib/mysql:
    df -h /var/lib/mysql
    # Before loading data:
    # Filesystem            Size  Used Avail Use% Mounted on
    # /dev/sdc1             1.8T  303G  1.4T  19% /var/lib/mysql

# CREATING GRP TABLE FOR TRACK GROUPING (DONE, 2004-05-27, hartera)
    # next machine
    ssh hgwdev
    # the following command copies all the data from the table
    # grp in the database galGal2 to our new database danRer1:
    echo "create table grp (PRIMARY KEY(NAME)) select * from galGal2.grp" \
      | hgsql danRer1
    # if you need to delete that table:  !!! WILL DELETE ALL grp data !!!
    echo 'drop table grp;' | hgsql danRer1

# REPEAT MASKING - Run RepeatMasker on chroms (DONE, 2004-05-28, hartera)
    #- Split contigs into 500kb chunks, at gaps if possible:
    ssh kksilo
    cd /cluster/data/danRer1
    foreach c (`cat chrom.lst`)
      foreach d ($c/chr${c}*_?{,?})
        cd $d
        echo "splitting $d"
        set contig = $d:t
        ~/bin/i386/faSplit gap $contig.fa 500000 ${contig}_ \
          -lift=$contig.lft -minGapSize=100
        cd ../..
      end
    end
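    # A small consistency check on the faSplit lift files (sketch;
    # assumes the usual five-column lift format: offset, oldName,
    # oldSize, newName, newSize -- the newSize field should be the same
    # on every line of a given contig's lift file):
    foreach d (*/chr*_?{,?})
      set ctg = $d:t
      set n = `awk '{print $5}' $d/$ctg.lft | sort -u | wc -l`
      if ($n != 1) echo "inconsistent sizes in $d/$ctg.lft"
    end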
    #- Make the run directory and job list:
    cd /cluster/data/danRer1
    cat << '_EOF_' > jkStuff/RMZebrafish
#!/bin/csh -fe

cd $1
pushd .
/bin/mkdir -p /tmp/danRer1/$2
/bin/cp $2 /tmp/danRer1/$2/
cd /tmp/danRer1/$2
/cluster/bluearc/RepeatMasker/RepeatMasker -ali -s -spec danio $2
popd
/bin/cp /tmp/danRer1/$2/$2.out ./
if (-e /tmp/danRer1/$2/$2.align) /bin/cp /tmp/danRer1/$2/$2.align ./
if (-e /tmp/danRer1/$2/$2.tbl) /bin/cp /tmp/danRer1/$2/$2.tbl ./
if (-e /tmp/danRer1/$2/$2.cat) /bin/cp /tmp/danRer1/$2/$2.cat ./
/bin/rm -fr /tmp/danRer1/$2/*
/bin/rmdir --ignore-fail-on-non-empty /tmp/danRer1/$2
/bin/rmdir --ignore-fail-on-non-empty /tmp/danRer1
'_EOF_'
    # << this line makes emacs coloring happy
    chmod +x jkStuff/RMZebrafish
    mkdir RMRun
    cp /dev/null RMRun/RMJobs
    foreach c (`cat chrom.lst`)
      foreach d ($c/chr${c}_?{,?})
        set ctg = $d:t
        foreach f ( $d/${ctg}_?{,?}.fa )
          set f = $f:t
          echo /cluster/data/danRer1/jkStuff/RMZebrafish \
            /cluster/data/danRer1/$d $f \
            '{'check out line+ /cluster/data/danRer1/$d/$f.out'}' \
            >> RMRun/RMJobs
        end
      end
    end

    #- Do the run
    ssh kk
    cd /cluster/data/danRer1/RMRun
    para create RMJobs
    para try, para check, para check, para push, para check,...
    # para time
    # Completed: 3623 of 3623 jobs
    # CPU time in finished jobs:  6514173s  108569.55m  1809.49h  75.40d  0.207 y
    # IO & Wait Time:               34466s     574.43m     9.57h   0.40d  0.001 y
    # Average job time:              1808s      30.13m     0.50h   0.02d
    # Longest job:                   2539s      42.32m     0.71h   0.03d
    # Submission to last job:       23551s     392.52m     6.54h   0.27d

    #- Lift up the 500KB chunk .out's to the 5MB ("pseudo-contig") level
    ssh kksilo
    cd /cluster/data/danRer1
    foreach d (*/chr*_?{,?})
      set contig = $d:t
      echo $contig
      liftUp $d/$contig.fa.out $d/$contig.lft warn $d/${contig}_*.fa.out \
        > /dev/null
    end

    #- Lift pseudo-contigs to chromosome level
    foreach c (`cat chrom.lst`)
      echo lifting $c
      cd $c
      if (-e lift/ordered.lft && ! -z lift/ordered.lft) then
        liftUp chr$c.fa.out lift/ordered.lft warn `cat lift/oOut.lst` \
          > /dev/null
      endif
      cd ..
    end

    #- Load the .out files into the database with:
    ssh hgwdev
    cd /cluster/data/danRer1
    hgLoadOut danRer1 */chr*.fa.out

# MAKE LIFTALL.LFT (DONE, 2004-05-28, hartera)
    ssh kksilo
    cd /cluster/data/danRer1
    cat */lift/ordered.lft > jkStuff/liftAll.lft

# SIMPLE REPEAT [TRF] TRACK (DONE, 2004-06-07, hartera)
    # TRF runs pretty quickly now... it takes a few hours total runtime,
    # so instead of binrsyncing and para-running, just do this on the
    # local fileserver:
    ssh kksilo
    mkdir -p /cluster/data/danRer1/bed/simpleRepeat
    cd /cluster/data/danRer1/bed/simpleRepeat
    mkdir trf
    cp /dev/null jobs.csh
    foreach d (/cluster/data/danRer1/*/chr*_?{,?})
      set ctg = $d:t
      foreach f ($d/${ctg}.fa)
        set fout = $f:t:r.bed
        echo $fout
        echo "/cluster/bin/i386/trfBig -trf=/cluster/bin/i386/trf $f /dev/null -bedAt=trf/$fout -tempDir=/tmp" \
          >> jobs.csh
      end
    end
    chmod a+x jobs.csh
    csh -ef jobs.csh >&! jobs.log &
    # check on this with:
    tail -f jobs.log
    wc -l jobs.csh
    ls -1 trf | wc -l
    endsInLf trf/*
    # trf crashes on chr10_5, so remove it from the job list for now and
    # run the rest of the jobs as jobs1.csh.
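    # jobs1.csh is the original job list minus the crashing chunk;
    # something like this reproduces it (a sketch, not the recorded
    # command):
    grep -v chr10_5 jobs.csh > jobs1.csh
    chmod a+x jobs1.csh
    csh -ef jobs1.csh >&! jobs1.log &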
    # Contacted the authors of trf - they are going to send a new binary.
    # In the meantime, they ran chr10_5.fa through the fixed version of
    # trf. Need to change the filenames from chr10_5.fa.* to chr10_5.tf.*
    mkdir /cluster/data/danRer1/bed/simpleRepeat/test
    cd /cluster/data/danRer1/bed/simpleRepeat/test
    mkdir trf
    # Download and unzip the results.zip file in this directory
    cat << '_EOF_' > changeFileName.pl
#!/usr/bin/perl -w
use strict;
while (<>) {
    chomp;
    my $a = $_;
    if ($a =~ /^chr10_5/) {
        print "cp $a ";
        $a =~ s/chr10_5.fa/chr10_5.tf/;
        print "$a\n";
    }
}
'_EOF_'
    # change the filenames and copy the renamed files to the /tmp/
    # directory (newFiles is a list of cp commands, so run it)
    ls > fileList
    perl changeFileName.pl < fileList > newFiles
    sh newFiles
    cp chr10_5.tf* /tmp/
    # Edit ~/kent/src/hg/trfBig/trfBig.c to remove the calls to
    # trfSysCall so trf itself is not run but its output files are
    # post-processed; make and run trfBig:
    /cluster/home/hartera/bin/i386/trfBig -trf=/cluster/bin/i386/trf \
      /cluster/data/danRer1/10/chr10_5/chr10_5.fa /dev/null \
      -bedAt=trf/chr10_5.bed -tempDir=/tmp
    # copy the bed output to the directory of bed files for the other
    # sequences
    cp /cluster/data/danRer1/bed/simpleRepeat/test/trf/chr10_5.bed \
      /cluster/data/danRer1/bed/simpleRepeat/trf
    cd /cluster/data/danRer1/bed/simpleRepeat
    liftUp simpleRepeat.bed /cluster/data/danRer1/jkStuff/liftAll.lft warn \
      trf/*.bed

    # Load into the database:
    ssh hgwdev
    hgLoadBed danRer1 simpleRepeat \
      /cluster/data/danRer1/bed/simpleRepeat/simpleRepeat.bed \
      -sqlTable=$HOME/src/hg/lib/simpleRepeat.sql
    # Loaded 582635 elements of size 16

# PROCESS SIMPLE REPEATS INTO MASK (DONE, 2004-06-07, hartera)
    # After the simpleRepeat track has been built, make a filtered
    # version of the trf output: keep trf's with period <= 12:
    ssh kksilo
    cd /cluster/data/danRer1/bed/simpleRepeat
    mkdir -p trfMask
    foreach f (trf/chr*.bed)
      awk '{if ($5 <= 12) print;}' $f > trfMask/$f:t
    end
    # Lift up the filtered trf output to chrom coords as well:
    cd /cluster/data/danRer1
    mkdir bed/simpleRepeat/trfMaskChrom
    foreach c (`cat chrom.lst`)
      if (-e $c/lift/ordered.lst) then
        perl -wpe 's@(\S+)@bed/simpleRepeat/trfMask/$1.bed@' \
          $c/lift/ordered.lst > $c/lift/oTrf.lst
        liftUp bed/simpleRepeat/trfMaskChrom/chr$c.bed \
          jkStuff/liftAll.lft warn `cat $c/lift/oTrf.lst`
      endif
      if (-e $c/lift/random.lst) then
        perl -wpe 's@(\S+)@bed/simpleRepeat/trfMask/$1.bed@' \
          $c/lift/random.lst > $c/lift/rTrf.lst
        liftUp bed/simpleRepeat/trfMaskChrom/chr${c}_random.bed \
          jkStuff/liftAll.lft warn `cat $c/lift/rTrf.lst`
      endif
    end

# MASK SEQUENCE WITH REPEATMASKER AND SIMPLE REPEAT/TRF
# (DONE, 2004-06-07, hartera)
    ssh kksilo
    cd /cluster/data/danRer1
    # Soft-mask (lower-case) the contig and chr .fa's,
    # then make hard-masked versions from the soft-masked:
    set trfCtg=bed/simpleRepeat/trfMask
    set trfChr=bed/simpleRepeat/trfMaskChrom
    foreach f (*/chr*.fa)
      echo "repeat- and trf-masking $f"
      maskOutFa -soft $f $f.out $f
      set chr = $f:t:r
      maskOutFa -softAdd $f $trfChr/$chr.bed $f
      echo "hard-masking $f"
      maskOutFa $f hard $f.masked
    end
    # This warning is extremely rare -- if it indicates a problem, it's
    # only with the repeat annotation and doesn't affect the masking:
    # WARNING: negative rEnd: -54 chr5:3971603-3971634 (TCTG)n
    # WARNING: negative rEnd: -31 chrNA:145096456-145096484 (TTTG)n
    # WARNING: negative rEnd: -29 chrNA:206929524-206929579 (TCCA)n
    foreach c (`cat chrom.lst`)
      echo "repeat- and trf-masking contigs of chr$c"
      foreach d ($c/chr*_?{,?})
        set ctg=$d:t
        set f=$d/$ctg.fa
        maskOutFa -soft $f $f.out $f
        maskOutFa -softAdd $f $trfCtg/$ctg.bed $f
        maskOutFa $f hard $f.masked
      end
    end
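    # To spot-check that masking took effect, faSize's upper/lower
    # counts are handy (sketch; chr1 chosen arbitrarily):
    faSize 1/chr1.fa
    # the soft-masked fa should report a sizable "lower" count, and the
    # hard-masked copy should have no lower-case bases at all:
    faSize 1/chr1.fa.masked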
    # The same warnings appear here too, at contig level:
    # WARNING: negative rEnd: -31 chrNA_29:4620077-4620105 (TTTG)n
    # WARNING: negative rEnd: -29 chrNA_42:1194625-1194680 (TCCA)n
    # WARNING: negative rEnd: -54 chr5_1:3971603-3971634 (TCTG)n
    # sent these to Arian

    # Build nib files, using the soft masking in the fa:
    mkdir nib
    foreach f (*/chr*.fa)
      faToNib -softMask $f nib/$f:t:r.nib
    end

# STORING O+O SEQUENCE AND ASSEMBLY INFORMATION (DONE, 2004-06-08, hartera)
    # Make symbolic links from /gbdb/danRer1/nib to the real nibs:
    ssh hgwdev
    cd /cluster/data/danRer1
    mkdir -p /gbdb/danRer1/nib
    foreach f (/cluster/data/danRer1/nib/chr*.nib)
      ln -s $f /gbdb/danRer1/nib
    end
    # Load /gbdb/danRer1/nib paths into the database and save size info;
    # hgNibSeq creates the chromInfo table:
    hgNibSeq -preMadeNib danRer1 /gbdb/danRer1/nib */chr*.fa
    echo "select chrom,size from chromInfo" | hgsql -N danRer1 > chrom.sizes
    # take a look at chrom.sizes, should be 29 lines
    wc chrom.sizes
    # Make one big 2bit file as well, and make a link to it in
    # /gbdb/danRer1/nib because hgBlat looks there:
    faToTwoBit */chr*.fa danRer1.2bit
    ln -s /cluster/data/danRer1/danRer1.2bit /gbdb/danRer1/nib/

# MAKE GOLD AND GAP TRACKS (DONE, 2004-06-08, hartera)
# REMAKE GOLD AND GAP TRACKS (DONE, 2004-07-07, hartera)
    ssh hgwdev
    cd /cluster/data/danRer1
    # The gold and gap tracks are created from the chrN.agp files, and
    # for chroms 1-25 those are currently the contigs agps. Use
    # chrN.supercontigs.agp instead, to create an assembly scaffolds
    # track as for panTro1: move chrN.agp to a "contigs" dir and rename
    # chrN.supercontigs.agp to chrN.agp.
    foreach c (1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 \
               21 22 23 24 25)
      mkdir ./$c/contigs
      cp ./$c/chr${c}.agp ./$c/contigs/
      mv ./$c/chr${c}.contigs.agp ./$c/contigs/
      mv ./$c/chr${c}.supercontigs.agp ./$c/chr${c}.agp
    end
    # check the track, then delete the chrN_supercontigs gold and gap
    # tables:
    foreach c (1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 \
               21 22 23 24 25)
      hgsql -e "drop table chr${c}_supercontigs_gold;" danRer1
      hgsql -e "drop table chr${c}_supercontigs_gap;" danRer1
    end
    hgGoldGapGl -noGl -chromLst=chrom.lst danRer1 /cluster/data/danRer1 .
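    # As a sanity check, gold plus gap should exactly tile each
    # chromosome (sketch, on one chrom):
    featureBits -chrom=chr1 danRer1 gold
    featureBits -chrom=chr1 danRer1 gap
    # the two base totals should sum to the chr1 size in chromInfo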
2003", \ "/gbdb/danRer1/nib", "Zebrafish", "chr1:1000-4000", 1, \ 41, "Zebrafish", "Danio Rerio", \ "/gbdb/danRer1/html/description.html", 0, 0, \ "Sanger Centre, Danio rerio Sequencing Project Zv3");' \ | hgsql -h genome-testdb hgcentraltest echo 'insert into defaultDb (genome, name) \ values ("Zebrafish", "danRer1");' \ | hgsql -h genome-testdb hgcentraltest # Move Zebrafish so it is between Chicken and Fugu in the organism list echo 'update dbDb set orderKey = 38 where name = "danRer1";' \ | hgsql -h genome-testdb hgcentraltest # Change default position to show "tiggy-winkle hedgehog" gene # involved in signaling in development # (2004-07-16, hartera) echo 'update dbDb set defaultPos = "chr2:16,330,443-16,335,196" \ where name = "danRer1";' | hgsql -h genome-testdb hgcentraltest # error in scientific name: uppercase beginning of species name so correct # (2004-08-19, hartera) echo 'update dbDb set scientificName = "Danio rerio" where \ name = "danRer1";' | hgsql -h genome-testdb hgcentraltest # set active = 0 for danRer1 in dbDb on hgcentralbeta and hgcentral # (see section on ARCHIVING danRer1) (2006-05-02, hartera) # MAKE DESCRIPTION/SAMPLE POSITION HTML PAGE (DONE, 2004-06-30, hartera) ssh hgwdev mkdir /cluster/data/danRer1/html cd /cluster/data/danRer1/html # make a symbolic link from /gbdb/danRer1/html to /cluster/data/danRer1/html ln -s /cluster/data/danRer1/html /gbdb/danRer1/html # Add a description page for zebrafish cd /cluster/data/danRer1/html cp /cluster/data/fr1/html/*.html . # Edit this for zebrafish # create a description.html page here mkdir -p ~/kent/src/hg/makeDb/trackDb/zebrafish/danRer1 cd ~/kent/src/hg/makeDb/trackDb/ cvs add zebrafish cvs commit zebrafish cd zebrafish cvs add danRer1 cvs commit danRer1 # Add description page here too cp /cluster/data/danRer1/html/description.html \ $HOME/kent/src/hg/makeDb/trackDb/zebrafish/danRer1/ chmod a+r \ $HOME/kent/src/hg/makeDb/trackDb/zebrafish/danRer1/description.html cd $HOME/kent/src/hg/makeDb/trackDb/zebrafish/danRer1/ # Check it in and copy (ideally using "make alpha" in trackDb) to # /gbdb/danRer1/html cvs add description.html cvs commit description.html # PUT MASKED SEQUENCE OUT FOR CLUSTER RUNS (DONE, 2004-06-09, hartera) ssh kkr1u00 # Chrom-level mixed nibs that have been repeat- and trf-masked: rm -rf /iscratch/i/danRer1/nib mkdir -p /iscratch/i/danRer1/nib cp -p /cluster/data/danRer1/nib/chr*.nib /iscratch/i/danRer1/nib # Pseudo-contig fa that have been repeat- and trf-masked: rm -rf /iscratch/i/danRer1/trfFa mkdir /iscratch/i/danRer1/trfFa foreach d (/cluster/data/danRer1/*/chr*_?{,?}) cp $d/$d:t.fa /iscratch/i/danRer1/trfFa end rm -rf /iscratch/i/danRer1/rmsk mkdir -p /iscratch/i/danRer1/rmsk cp -p /cluster/data/danRer1/*/chr*.fa.out /iscratch/i/danRer1/rmsk cp -p /cluster/data/danRer1/danRer1.2bit /iscratch/i/danRer1/ iSync # CREATE gc5Base wiggle TRACK (DONE, 2004-06-09, hartera) ssh kki mkdir /cluster/data/danRer1/bed/gc5Base cd /cluster/data/danRer1/bed/gc5Base # in the script below, the 'grep -w GC' selects the lines of # output from hgGcPercent that are real data and not just some # information from hgGcPercent. The awk computes the number # of bases that hgGcPercent claimed it measured, which is not # necessarily always 5 if it ran into gaps, and then the division # by 10.0 scales down the numbers from hgGcPercent to the range # [0-100]. Two columns come out of the awk print statement: # and which are fed into wigAsciiToBinary through # the pipe. 
# CREATE gc5Base wiggle TRACK (DONE, 2004-06-09, hartera)
    ssh kki
    mkdir /cluster/data/danRer1/bed/gc5Base
    cd /cluster/data/danRer1/bed/gc5Base
    # In the script below, the 'grep -w GC' selects the lines of output
    # from hgGcPercent that are real data and not just informational
    # messages. The awk computes the number of bases that hgGcPercent
    # claimed it measured, which is not necessarily always 5 if it ran
    # into gaps, and the division by 10.0 scales the hgGcPercent numbers
    # down to the range [0-100]. Two columns come out of the awk print
    # statement, <position> and <value>, which are fed into
    # wigAsciiToBinary through the pipe. The dataSpan is set at 5
    # because each value represents the measurement over five bases
    # beginning with <position>. The result files end up in ./wigData5.
    # A new script is used (from makeHg17.doc) which gets around the
    # problem that wigAsciiToBinary was calculating chromEnd to be
    # beyond the real chromosome end.
    mkdir wigData5 dataLimits5
    cat << '_EOF_' > kkRun.sh
#!/bin/sh
NIB=$1
chr=${NIB/.nib/}
chrom=${chr#chr}
hgGcPercent -chr=${chr} -doGaps -file=stdout -win=5 danRer1 \
    /iscratch/i/danRer1/nib | \
    grep -w GC | \
    awk '{if (($3-$2) >= 5) {printf "%d\t%.1f\n", $2+1, $5/10.0} }' | \
    wigAsciiToBinary -dataSpan=5 -chrom=${chr} \
        -wibFile=wigData5/gc5Base_${chrom} \
        -name=${chrom} stdin 2> dataLimits5/${chr}
'_EOF_'
    # << this line makes emacs coloring happy
    chmod +x kkRun.sh
    ls /iscratch/i/danRer1/nib > nibList
    cat << '_EOF_' > gsub
#LOOP
./kkRun.sh $(path1)
#ENDLOOP
'_EOF_'
    # << this line makes emacs coloring happy
    gensub2 nibList single gsub jobList
    para create jobList
    para try, check, push ... etc
    # para time
    # Completed: 29 of 29 jobs
    # CPU time in finished jobs:  2450s  40.84m  0.68h  0.03d  0.000 y
    # IO & Wait Time:                0s   0.00m  0.00h  0.00d  0.000 y
    # Average job time:             80s   1.33m  0.02h  0.00d
    # Longest job:                 506s   8.43m  0.14h  0.01d
    # Submission to last job:      650s  10.83m  0.18h  0.01d

    # load the .wig files back on hgwdev:
    ssh hgwdev
    cd /cluster/data/danRer1/bed/gc5Base
    hgLoadWiggle -pathPrefix=/gbdb/danRer1/wib/gc5Base \
      danRer1 gc5Base wigData5/*.wig
    # and symlink the .wib files into /gbdb
    mkdir /gbdb/danRer1/wib/gc5Base
    ln -s `pwd`/wigData5/*.wib /gbdb/danRer1/wib/gc5Base

    # And then the zoomed data view:
    ssh kki
    cd /cluster/data/danRer1/bed/gc5Base
    mkdir wigData5_1K dataLimits5_1K
    cat << '_EOF_' > kkRunZoom.sh
#!/bin/sh
NIB=$1
chr=${NIB/.nib/}
chrom=${chr#chr}
hgGcPercent -chr=${chr} -doGaps -file=stdout -win=5 danRer1 \
    /iscratch/i/danRer1/nib | \
    grep -w GC | \
    awk '{if (($3-$2) >= 5) {printf "%d\t%.1f\n", $2+1, $5/10.0} }' | \
    wigZoom -dataSpan=1000 stdin | wigAsciiToBinary -dataSpan=1000 \
        -chrom=${chr} -wibFile=wigData5_1K/gc5Base_${chrom}_1K \
        -name=${chrom} stdin 2> dataLimits5_1K/${chr}
'_EOF_'
    # << this line makes emacs coloring happy
    chmod +x kkRunZoom.sh
    cat << '_EOF_' > gsubZoom
#LOOP
./kkRunZoom.sh $(path1)
#ENDLOOP
'_EOF_'
    # << this line makes emacs coloring happy
    gensub2 nibList single gsubZoom jobListZoom
    para create jobListZoom
    para try, check, push, ... etc.
    # para time
    # Completed: 29 of 29 jobs
    # CPU time in finished jobs:  2433s  40.55m  0.68h  0.03d  0.000 y
    # IO & Wait Time:                0s   0.00m  0.00h  0.00d  0.000 y
    # Average job time:             81s   1.36m  0.02h  0.00d
    # Longest job:                 522s   8.70m  0.14h  0.01d
    # Submission to last job:     2503s  41.72m  0.70h  0.03d

    # Then load these .wig files into the same table as above:
    ssh hgwdev
    cd /cluster/data/danRer1/bed/gc5Base
    hgLoadWiggle -pathPrefix=/gbdb/danRer1/wib/gc5Base \
      -oldTable danRer1 gc5Base wigData5_1K/*.wig
    # and symlink these .wib files into /gbdb
    mkdir -p /gbdb/danRer1/wib/gc5Base
    ln -s `pwd`/wigData5_1K/*.wib /gbdb/danRer1/wib/gc5Base
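    # For a feel for what the pipeline sees, one can preview a few raw
    # hgGcPercent windows on a single chrom (sketch):
    hgGcPercent -chr=chr1 -doGaps -file=stdout -win=5 danRer1 \
      /iscratch/i/danRer1/nib | grep -w GC | head -3
    # each data line is <chrom> <start> <end> GC <per-mille GC>; the awk
    # step above turns that into a 1-based <position> and <value>/10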
# BLASTZ FOR HG17 (DONE, 2004-06-16, hartera)
    ssh kkr1u00
    # blastz requires lineage-specific repeats;
    # treat all repeats as lineage-specific:
    mkdir /iscratch/i/danRer1/linSpecRep.notInHuman
    foreach f (/iscratch/i/danRer1/rmsk/chr*.fa.out)
      cp -p $f /iscratch/i/danRer1/linSpecRep.notInHuman/$f:t:r:r.out.spec
    end
    mkdir /iscratch/i/gs.18/build35/linSpecRep.notInZebrafish
    foreach f (/iscratch/i/gs.18/build35/rmsk/chr*.fa.out)
      cp -p $f \
        /iscratch/i/gs.18/build35/linSpecRep.notInZebrafish/$f:t:r:r.out.spec
    end
    iSync

    ssh kk
    mkdir -p /cluster/data/danRer1/bed/blastz.hg17.2004-06-08
    ln -s /cluster/data/danRer1/bed/blastz.hg17.2004-06-08 \
      /cluster/data/danRer1/bed/blastz.hg17
    cd /cluster/data/danRer1/bed/blastz.hg17
    # Set L=6000 and abridge repeats - these are the same parameters
    # used for hg16 and Fugu.
    cat << '_EOF_' > DEF
# zebrafish vs human (hg17)
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/i386:/cluster/home/angie/schwartzbin

ALIGN=blastz-run
BLASTZ=blastz
# Reuse parameters from hg16-fr1.
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
BLASTZ_ABRIDGE_REPEATS=1

# TARGET: Zebrafish
SEQ1_DIR=/iscratch/i/danRer1/nib/
SEQ1_RMSK=
SEQ1_FLAG=
SEQ1_SMSK=/iscratch/i/danRer1/linSpecRep.notInHuman
SEQ1_IN_CONTIGS=0
SEQ1_CHUNK=10000000
SEQ1_LAP=10000

# QUERY: Human (hg17)
SEQ2_DIR=/iscratch/i/gs.18/build35/bothMaskedNibs
SEQ2_RMSK=
SEQ2_FLAG=
SEQ2_SMSK=/iscratch/i/gs.18/build35/linSpecRep.notInZebrafish
SEQ2_IN_CONTIGS=0
SEQ2_CHUNK=10000000
SEQ2_LAP=0

BASE=/cluster/data/danRer1/bed/blastz.hg17

DEF=$BASE/DEF
RAW=$BASE/raw
CDBDIR=$BASE
SEQ1_LEN=$BASE/S1.len
SEQ2_LEN=$BASE/S2.len
'_EOF_'
    # << this line keeps emacs coloring happy
    # Save the DEF file in the current standard place
    cp DEF ~angie/hummus/DEF.danRer1-hg17.2004-06-08
    # Need the shell scripts from mm4 to do the cluster runs:
    cp /cluster/data/mm4/jkStuff/BlastZ*.sh /cluster/data/danRer1/jkStuff/
    # Edit BlastZ_run0.sh: on line 22, replace
    # /cluster/home/angie/schwartzbin/ with /cluster/bin/penn/ --
    # the directory for the latest version of blastz-run.

    # prepare the first cluster run:
    ssh kk
    cd /cluster/data/danRer1/bed/blastz.hg17
    bash	# if a csh/tcsh user
    . ./DEF
    /cluster/data/danRer1/jkStuff/BlastZ_run0.sh
    cd run.0
    # check that the batch looks ok, then
    para try, check, push, check, ....
    # para time
    # Completed: 57970 of 57970 jobs
    # CPU time in finished jobs:  20265671s  337761.18m  5629.35h  234.56d  0.643 y
    # IO & Wait Time:               570125s    9502.09m   158.37h    6.60d  0.018 y
    # Average job time:                359s       5.99m     0.10h    0.00d
    # Longest job:                   10482s     174.70m     2.91h    0.12d
    # Submission to last job:        73680s    1228.00m    20.47h    0.85d
    # Took about 17 hours to run. Output is 6.3 Gigabytes - rather
    # large. Try processing it and check in the browser before adjusting
    # the parameters.
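    # (The job count is just target chunks x query chunks: ~1.46 Gb of
    # danRer1 cut per-chrom into 10 Mb chunks gives 170 pieces, and hg17
    # at ~3.1 Gb gives 341; 170 * 341 = 57970 blastz jobs. Those 170
    # target chunks are also why run.1 below has 170 jobs.)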
    # Second cluster run to convert the .out's to .lav's:
    cd /cluster/data/danRer1/bed/blastz.hg17
    bash	# if a csh/tcsh user
    . ./DEF
    /cluster/data/danRer1/jkStuff/BlastZ_run1.sh
    cd run.1
    para try, check, push, etc ...
    # para time
    # Completed: 170 of 170 jobs
    # CPU time in finished jobs:   16047s   267.45m   4.46h  0.19d  0.001 y
    # IO & Wait Time:             146162s  2436.03m  40.60h  1.69d  0.005 y
    # Average job time:              954s    15.90m   0.27h  0.01d
    # Longest job:                  2179s    36.32m   0.61h  0.03d
    # Submission to last job:       2451s    40.85m   0.68h  0.03d

    # Third cluster run to convert the lav's to axt's:
    ssh kk
    cd /cluster/data/danRer1/bed/blastz.hg17
    mkdir axtChrom
    # a new run directory
    mkdir run.2
    cd run.2
    cat << '_EOF_' > do.csh
#!/bin/csh
cd $1
cat `ls -1 *.lav | sort -g` \
  | lavToAxt stdin /iscratch/i/danRer1/nib \
    /iscratch/i/gs.18/build35/bothMaskedNibs stdout \
  | axtSort stdin $2
'_EOF_'
    # << this line makes emacs coloring happy
    chmod a+x do.csh
    cat << '_EOF_' > gsub
#LOOP
./do.csh {check in exists $(path1)} {check out line+ /cluster/data/danRer1/bed/blastz.hg17/axtChrom/$(root1).axt}
#ENDLOOP
'_EOF_'
    # << this line makes emacs coloring happy
    \ls -1Sd ../lav/chr* > chrom.list
    gensub2 chrom.list single gsub jobList
    wc -l jobList
    head jobList
    para create jobList
    para try, check, push, check,...
    # para time
    # Completed: 27 of 29 jobs
    # Crashed: 2 jobs
    # CPU time in finished jobs:   2063s   34.38m  0.57h  0.02d  0.000 y
    # IO & Wait Time:             19245s  320.75m  5.35h  0.22d  0.001 y
    # Average job time:             789s   13.15m  0.22h  0.01d
    # Longest job:                 9103s  151.72m  2.53h  0.11d
    # Submission to last job:     14332s  238.87m  3.98h  0.17d
    # chrNA and chrUn are the crashed jobs. These are very large and ran
    # out of memory. Re-run them separately on kolossus:
    ssh kk
    # put the nibs on bluearc
    mkdir -p /cluster/bluearc/danRer1/nib
    mkdir -p /cluster/bluearc/hg17/bothMaskedNibs
    cp /iscratch/i/danRer1/nib/* /cluster/bluearc/danRer1/nib
    cp /iscratch/i/gs.18/build35/bothMaskedNibs/* \
      /cluster/bluearc/hg17/bothMaskedNibs
    cat << '_EOF_' > do2.csh
#!/bin/csh
cd $1
cat `ls -1 *.lav | sort -g` \
  | lavToAxt stdin /cluster/bluearc/danRer1/nib \
    /cluster/bluearc/hg17/bothMaskedNibs stdout \
  | axtSort stdin $2
'_EOF_'
    ssh kolossus
    cd /cluster/data/danRer1/bed/blastz.hg17/run.2
    ./do2.csh ../lav/chrNA \
      /cluster/data/danRer1/bed/blastz.hg17/axtChrom/chrNA.axt
    ./do2.csh ../lav/chrUn \
      /cluster/data/danRer1/bed/blastz.hg17/axtChrom/chrUn.axt

    # translate the sorted axt files into psl:
    ssh kolossus
    cd /cluster/data/danRer1/bed/blastz.hg17
    mkdir -p pslChrom
    set tbl = "blastzHg17"
    foreach f (axtChrom/chr*.axt)
      set c=$f:t:r
      echo "Processing chr $c"
      /cluster/bin/i386/axtToPsl $f S1.len S2.len pslChrom/${c}_${tbl}.psl
    end

    # Load the database tables:
    ssh hgwdev
    cd /cluster/data/danRer1/bed/blastz.hg17/pslChrom
    foreach f (./*.psl)
      /cluster/bin/i386/hgLoadPsl -noTNameIx danRer1 $f
      echo "$f Done"
    end

    # Tuning the blastz parameters: the parameters above were used, but
    # it was thought that there were probably too many pileups, so the
    # blastz was repeated with L=10000 (as for hg16-galGal2) and with an
    # intermediate value of L=8000. L is the score threshold for gapped
    # alignments. Looking at alignments in the browser, L=10000 and
    # L=8000 were too stringent: alignments to coding regions were being
    # lost. L=6000 (as above) seemed to be the best choice; low-scoring
    # alignments could then be filtered at the chaining step instead to
    # reduce pileups. e.g. using the browser with the non-zebrafish
    # mRNAs track turned on:
    # chr1:24,547-28,249 lost an exon with everything but the L=6000
    # blastz; chr2:1,124,174-1,135,282 is worse, with an entire gene
    # being lost except with the L=6000 blastz.
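    # featureBits gives a quick number to back up the browser checks; a
    # sketch of the comparison (the L=8000 and L=10000 runs were only
    # examined in the browser, so the second table name here is
    # hypothetical):
    featureBits -chrom=chr1 danRer1 refGene:cds blastzHg17
    # featureBits -chrom=chr1 danRer1 refGene:cds blastzHg17L8000
    # a drop in the refGene:cds intersection means lost coding exons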
# RESCORE HG17 BLASTZ (DONE, 2004-06-21, hartera)
# USE HG17 BLASTZ WITH L=6000 SO AS NOT TO MISS TOO MUCH CDS ALIGNING
    # Low scores can occur with repeats abridged and using the HoxD55.q
    # matrix: PSU's restore_rpts program rescored alignments with the
    # default matrix instead of the BLASTZ_Q matrix. Rescore them here
    # so the chainer sees the higher scores:
    ssh kolossus
    cd /cluster/data/danRer1/bed/blastz.hg17
    mkdir axtChrom.rescore
    foreach f (axtChrom/chr*.axt)
      axtRescore -scoreScheme=/cluster/data/blastz/HoxD55.q \
        $f axtChrom.rescore/$f:t
    end
    mv axtChrom axtChrom.orig
    mv axtChrom.rescore axtChrom
    # psl files and blastz tables will be the same regardless of score,
    # so no need to reload

# CHAIN HG17 BLASTZ (DONE, 2004-06-24, hartera)
    # Redo the chains with the rescored blastz Hg17.
    # Run axtChain on the little cluster:
    ssh kki
    cd /cluster/data/danRer1/bed/blastz.hg17
    mv axtChain axtChain.orig
    mkdir -p axtChain/run1
    cd axtChain/run1
    mkdir out chain
    # create 2 input lists, as chrNA and chrUn need to be processed
    # separately:
    ls -1S /cluster/data/danRer1/bed/blastz.hg17/axtChrom/*.axt \
      > input.lst
    grep "chrNA" input.lst > inputNAandUn.lst
    grep "chrUn" input.lst >> inputNAandUn.lst
    # remove chrNA and chrUn from input.lst
    cat << '_EOF_' > gsub
#LOOP
doChain {check in exists $(path1)} {check out line+ chain/$(root1).chain} {check out line+ out/$(root1).out}
#ENDLOOP
'_EOF_'
    # << this line makes emacs coloring happy
    # Reuse the gap penalties from the hg16 vs chicken run (the fields
    # below are tab-separated):
    cat << '_EOF_' > ../../chickenHumanTuned.gap
tablesize	11
smallSize	111
position	1	2	3	11	111	2111	12111	32111	72111	152111	252111
qGap	325	360	400	450	600	1100	3600	7600	15600	31600	56600
bothGap	625	660	700	750	900	1400	4000	8000	16000	32000	57000
'_EOF_'
    # << this line makes emacs coloring happy
    cat << '_EOF_' > doChain
#!/bin/csh
axtFilter $1 \
  | axtChain -scoreScheme=/cluster/data/blastz/HoxD55.q \
      -linearGap=../../chickenHumanTuned.gap \
      -minScore=10000 stdin \
      /iscratch/i/danRer1/nib \
      /iscratch/i/gs.18/build35/bothMaskedNibs $2 >& $3
'_EOF_'
    # << this line makes emacs coloring happy
    chmod a+x doChain
    gensub2 input.lst single gsub jobList
    para create jobList
    para try, check, push, check...
    # para time
    # Completed: 27 of 27 jobs
    # CPU time in finished jobs:  4572s  76.21m  1.27h  0.05d  0.000 y
    # IO & Wait Time:             1252s  20.86m  0.35h  0.01d  0.000 y
    # Average job time:            216s   3.60m  0.06h  0.00d
    # Longest job:                1092s  18.20m  0.30h  0.01d
    # Submission to last job:     3242s  54.03m  0.90h  0.04d

    cat << '_EOF_' > gsub2
#LOOP
doChain2 {check in exists $(path1)} {check out line+ chain/$(root1).chain} {check out line+ out/$(root1).out}
#ENDLOOP
'_EOF_'
    # << this line makes emacs coloring happy
    cat << '_EOF_' > doChain2
#!/bin/csh
axtFilter $1 \
  | axtChain -scoreScheme=/cluster/data/blastz/HoxD55.q \
      -linearGap=../../chickenHumanTuned.gap \
      -minScore=15000 stdin \
      /iscratch/i/danRer1/nib \
      /iscratch/i/gs.18/build35/bothMaskedNibs $2 >& $3
'_EOF_'
    # << this line makes emacs coloring happy
    gensub2 inputNAandUn.lst single gsub2 jobList2
    para create jobList2
    para try, check, push, check...
    # para time
    # crashed (out of memory), so try again on kolossus:
    ssh kolossus
    cd /cluster/data/danRer1/bed/blastz.hg17/axtChain/run1
    # use the sequences on bluearc for kolossus
    cat << '_EOF_' > doChain2
#!/bin/csh
axtFilter $1 \
  | axtChain -scoreScheme=/cluster/data/blastz/HoxD55.q \
      -linearGap=../../chickenHumanTuned.gap \
      -minScore=15000 stdin \
      /cluster/bluearc/danRer1/nib \
      /cluster/bluearc/hg17/bothMaskedNibs $2 >& $3
'_EOF_'
    chmod +x doChain2
    doChain2 /cluster/data/danRer1/bed/blastz.hg17/axtChrom/chrNA.axt \
      chain/chrNA.chain out/chrNA.out
    doChain2 /cluster/data/danRer1/bed/blastz.hg17/axtChrom/chrUn.axt \
      chain/chrUn.chain out/chrUn.out

    # now on the cluster server, sort the chains
    ssh kksilo
    cd /cluster/data/danRer1/bed/blastz.hg17/axtChain
    chainMergeSort run1/chain/*.chain > all.chain
    chainSplit chain all.chain
    # take a look at the score distributions; try also with a larger
    # bin size:
    foreach f (chain/*.chain)
      grep chain $f | awk '{print $2;}' | sort -nr > /tmp/score.$f:t:r
      echo $f:t:r >> hist.out
      textHistogram -binSize=10000 /tmp/score.$f:t:r >> hist.out
      echo "" >> hist.out
    end

    # Load the chains into the database
    # next machine
    ssh hgwdev
    cd /cluster/data/danRer1/bed/blastz.hg17/axtChain/chain
    foreach i (*.chain)
      set c = $i:r
      hgLoadChain danRer1 ${c}_chainHg17 $i
      echo done $c
    end
    # The original chains (minScore=5000 on all chroms) were loaded as
    # chainHg17NoFilt for comparison:
    # featureBits -chrom=chr1 danRer1 chainHg17
    #   12359874 bases of 40488791 (30.527%) in intersection
    # featureBits -chrom=chr1 danRer1 refGene:cds chainHg17
    #   147375 bases of 40488791 (0.364%) in intersection
    # featureBits -chrom=chr1 danRer1 chainHg17NoFilt
    #   13518074 bases of 40488791 (33.387%) in intersection
    # featureBits -chrom=chr1 danRer1 refGene:cds chainHg17NoFilt
    #   149120 bases of 40488791 (0.368%) in intersection
    # featureBits -chrom=chrNA danRer1 chainHg17
    #   79678867 bases of 335615307 (23.741%) in intersection
    # featureBits -chrom=chrNA danRer1 refGene:cds chainHg17
    #   556727 bases of 335615307 (0.166%) in intersection
    # featureBits -chrom=chrNA danRer1 chainHg17NoFilt
    #   100579306 bases of 335615307 (29.969%) in intersection
    # featureBits -chrom=chrNA danRer1 refGene:cds chainHg17NoFilt
    #   622781 bases of 335615307 (0.186%) in intersection
    # featureBits -chrom=chrNA danRer1 refGene:cds
    #   741426 bases of 335615307 (0.221%) in intersection
    # Using minScore=10000 for all chroms but 15000 for chrUn and chrNA
    # reduces low-scoring chains without compromising CDS region
    # coverage too much.

# NET HG17 BLASTZ (DONE, 2004-07-07, hartera)
    ssh kksilo
    cd /cluster/data/danRer1/bed/blastz.hg17/axtChain
    mkdir preNet
    cd chain
    foreach i (*.chain)
      echo preNetting $i
      /cluster/bin/i386/chainPreNet $i ../../S1.len ../../S2.len \
        ../preNet/$i
    end
    cd ..
    mkdir n1
    cd preNet
    foreach i (*.chain)
      set n = $i:r.net
      echo primary netting $i
      /cluster/bin/i386/chainNet $i -minSpace=1 ../../S1.len ../../S2.len \
        ../n1/$n /dev/null
    end
    cd ..
    cat n1/*.net | /cluster/bin/i386/netSyntenic stdin noClass.net
    # memory usage 111640576, utime 625 s/100, stime 120

    # Add classification info using db tables:
    cd /cluster/data/danRer1/bed/blastz.hg17/axtChain
    # netClass looks for ancient repeats in one of the databases.
    # hg17 has this table - hand-curated by Arian - but it is for
    # human-rodent comparisons, so do not use it here; use the -noAr
    # option.
    mkdir -p /cluster/bluearc/danRer1/linSpecRep.notInHuman
    mkdir -p /cluster/bluearc/hg17/linSpecRep.notInZebrafish
    cp /iscratch/i/gs.18/build35/linSpecRep.notInZebrafish/* \
      /cluster/bluearc/hg17/linSpecRep.notInZebrafish
    cp /iscratch/i/danRer1/linSpecRep.notInHuman/* \
      /cluster/bluearc/danRer1/linSpecRep.notInHuman
    ssh hgwdev
    cd /cluster/data/danRer1/bed/blastz.hg17/axtChain
    time netClass noClass.net danRer1 hg17 humanhg17.net \
      -tNewR=/cluster/bluearc/danRer1/linSpecRep.notInHuman \
      -qNewR=/cluster/bluearc/hg17/linSpecRep.notInZebrafish -noAr
    # 85.530u 52.020s 5:15.86 43.5%   0+0k 0+0io 2072pf+0w

    ssh hgwdev
    cd /cluster/data/danRer1/bed/blastz.hg17/axtChain
    netFilter -minGap=10 humanhg17.net | hgLoadNet danRer1 netHg17 stdin

# MAKE 10.OOC, 11.OOC FILES FOR BLAT (DONE, 2004-06-09, hartera)
    # Use -repMatch=460 (based on size -- for human we use 1024, and
    # the zebrafish genome is ~45% of the size of the human genome).
    ssh kkr1u00
    mkdir /cluster/data/danRer1/bed/ooc
    cd /cluster/data/danRer1/bed/ooc
    mkdir -p /cluster/bluearc/danRer1
    ls -1 /cluster/data/danRer1/nib/chr*.nib > nib.lst
    blat nib.lst /dev/null /dev/null -tileSize=11 \
      -makeOoc=/cluster/bluearc/danRer1/11.ooc -repMatch=460
    # Wrote 44155 overused 11-mers to /cluster/bluearc/danRer1/11.ooc
    # For 10.ooc, repMatch = 4096 for human, so use 1840
    blat nib.lst /dev/null /dev/null -tileSize=10 \
      -makeOoc=/cluster/bluearc/danRer1/10.ooc -repMatch=1840
    # Wrote 10767 overused 10-mers to /cluster/bluearc/danRer1/10.ooc
    # keep copies of the ooc files in this directory, and copy them to
    # iscratch:
    cp /cluster/bluearc/danRer1/*.ooc .
    cp -p /cluster/bluearc/danRer1/*.ooc /iscratch/i/danRer1/
    iSync
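    # (The repMatch values are just the human defaults scaled by the
    # relative genome size: 1024 * 0.45 ~= 460 for 11-mers, and
    # 4096 * 0.45 ~= 1840 for 10-mers.)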
# AUTO UPDATE GENBANK MRNA AND EST RUN (DONE, 2004-07-01, hartera)
    ssh eieio
    cd /cluster/data/genbank
    # This is a new organism; edit the etc/genbank.conf file and add:
# danRer1 (zebrafish)
danRer1.genome = /iscratch/i/danRer1/nib/chr*.nib
danRer1.lift = /cluster/data/danRer1/jkStuff/liftAll.lft
danRer1.downloadDir = danRer1
    # The default includes native GenBank mRNAs and ESTs, GenBank xeno
    # mRNAs but no xeno ESTs, and native RefSeq mRNAs but not xeno
    # RefSeq.
    cvs commit -m "Added danRer1" etc/genbank.conf
    # Edit $HOME/kent/src/hg/makeDb/genbank/src/align/gbBlat
    # to add /iscratch/i/danRer1/11.ooc:
    #   Add line: DANRER_OOC=/iscratch/i/danRer1/11.ooc
    #   Add line: danRer*) oocOpt="-ooc=${DANRER_OOC}" ;;
    cvs diff src/align/gbBlat
    make
    cvs commit -m "Added 11.ooc for danRer1." src/align/gbBlat
    # Edit $HOME/kent/src/hg/makeDb/genbank/src/lib/gbGenome.c to add
    # the new genome:
    #   Add: static char *danRerNames[] = {"Danio rerio", NULL};
    #   Add: {"danRer", danRerNames, NULL},
    #        to static struct dbToSpecies dbToSpeciesMap[]
    cvs commit -m "Added zebrafish, danRer1." lib/gbGenome.c
    # Install to /cluster/data/genbank:
    cd $HOME/kent/src/hg/makeDb/genbank
    make install-server

    ssh eieio
    cd /cluster/data/genbank
    # This is an -initial run, RefSeq mRNA only:
    nice bin/gbAlignStep -srcDb=refseq -type=mrna -verbose=1 -initial danRer1
    # Load the results for RefSeq mRNAs:
    ssh hgwdev
    cd /cluster/data/genbank
    nice bin/gbDbLoadStep -verbose=1 -drop -initialLoad danRer1
    # To start the next run, the results need to be moved out of the way:
    cd /cluster/data/genbank/work
    mv initial.danRer1 initial.danRer1.refseq.mrna

    # -initial run for GenBank mRNAs:
    ssh eieio
    cd /cluster/data/genbank
    nice bin/gbAlignStep -srcDb=genbank -type=mrna -verbose=1 -initial danRer1
    # Load the results for GenBank mRNAs:
    ssh hgwdev
    cd /cluster/data/genbank
    nice bin/gbDbLoadStep -verbose=1 danRer1
    cd /cluster/data/genbank/work
    mv initial.danRer1 initial.danRer1.genbank.mrna

    # -initial run for ESTs:
    ssh eieio
    cd /cluster/data/genbank
    nice bin/gbAlignStep -srcDb=genbank -type=est -verbose=1 -initial danRer1
    # Load the results for GenBank ESTs:
    ssh hgwdev
    cd /cluster/data/genbank
    nice bin/gbDbLoadStep -verbose=1 danRer1
    cd /cluster/data/genbank/work
    mv initial.danRer1 initial.danRer1.genbank.est

# XENOPUS BLASTZ/CHAIN/NET (DONE 9/24/04 jk)
    # see makeXenTro1.doc and search for zb.danRer1
    # The results of this are also symlinked under danRer1/bed

# ENSEMBL GENES (DONE, 2004-06-22, hartera)
    mkdir /cluster/data/danRer1/bed/ensembl
    cd /cluster/data/danRer1/bed/ensembl
    # Get the Ensembl protein data from
    # http://www.ensembl.org/Danio_rerio/martview
    # Follow this sequence through the pages:
    # Page 1) Make sure that the Danio_rerio choice is selected. Hit next.
    # Page 2) Uncheck the "Limit to" box in the region choice. Then hit next.
    # Page 3) Choose the "Structures" box.
    # Page 4) Choose GTF as the output and gzip compression. Hit export.
    # Save as ensemblGene.gtf.gz

    # Need to get lift files for each chrom from supercontig to chrom
    # level:
    ssh kksilo
    mkdir -p /cluster/data/danRer1/bed/ensembl/liftSupertoChrom
    cd /cluster/data/danRer1/bed/ensembl/liftSupertoChrom
    foreach c (NA Un Finished)
      set b = "/cluster/data/danRer1"
      awk '{print $6;}' $b/$c/chr$c.supercontigs.agp > chr$c.supercontigs.lst
      $HOME/bin/i386/faSomeRecords $b/Zv3.supercontigs.fa \
        ./chr$c.supercontigs.lst ./chr$c.fa
      echo $c " done"
    end
    # check the FASTA files, then generate AGP and lift files from the
    # chromosome fastas:
    foreach c (NA Un Finished)
      $HOME/bin/i386/scaffoldFaToAgp ./chr$c.fa
      sed -e "s/chrUn/${c}/;" chr$c.agp > chr$c.superToChrom.agp
      sed -e "s/chrUn/${c}/;" chr$c.lft > chr$c.superToChrom.lft
    end
    cat *.superToChrom.lft > liftUnSuperToChrom.lft

    # get the chrUn, Finished and NA records:
    cd /cluster/data/danRer1/bed/ensembl
    awk '$1 ~ /NA/ || $1 ~ /ctg/ || $1 ~ /Finished/' ensemblGene.gtf \
      > ensemblGenechrUns.gtf
    awk '$1 !~ /NA/ && $1 !~ /ctg/ && $1 !~ /Finished/' ensemblGene.gtf \
      > ensemblGenechroms.gtf
    liftUp -type=.gtf ensemblGenechrUns.lifted \
      ./liftSupertoChrom/liftUnSuperToChrom.lft warn ensemblGenechrUns.gtf
    # Got 113698 lifts in ./liftSupertoChrom/liftUnSuperToChrom.lft
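    # A quick lift sanity check (sketch): lifting rewrites coordinates
    # but should not drop lines, and afterwards every chrom name should
    # be one of the three virtual chroms:
    wc -l ensemblGenechrUns.gtf ensemblGenechrUns.lifted
    awk '$1 !~ /^(NA|Un|Finished)$/' ensemblGenechrUns.lifted | head
    # expect identical line counts and no output from the awk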
Add "chr" to front of each line cp ensemblGenechroms.gtf all.gtf cat ensemblGenechrUns.lifted >> all.gtf sed -e "s/^/chr/" all.gtf > ensGene.gtf # load into database ssh hgwdev cd /cluster/data/danRer2/bed/ensembl /cluster/bin/i386/ldHgGene danRer1 ensGene \ /cluster/data/danRer1/bed/ensembl/ensGene.gtf # Read 30783 transcripts in 554794 lines in 1 files # 30783 groups 28 seqs 1 sources 4 feature types # 30783 gene predictions # ensGtp associates geneId/transcriptId/proteinId for hgPepPred and # hgKnownToSuper. Use ensMart to create it as above, except: # Page 3) Choose the "Features" box. In "Ensembl Attributes", check # Ensembl Gene ID, Ensembl Transcript ID, Ensembl Peptide ID. # Choose Text, tab-separated as the output format. Result name ensGtp. # Save file as ensGtp.txt.gz gunzip ensGtp.txt.gz hgsql danRer1 < ~/kent/src/hg/lib/ensGtp.sql # remove header line from ensGtp.txt echo "load data local infile 'ensGtp.txt' into table ensGtp" \ | hgsql -N danRer1 # Get the ensembl peptide sequences from # http://www.ensembl.org/Danio_rerio/martview # Choose Danio Rerio as the organism # Follow this sequence through the pages: # Page 1) Choose the Ensembl Genes choice. Hit next. # Page 2) Uncheck the "Limit to" box in the region choice. Then hit next. # Page 3) Choose the "Sequences" tab. # Page 4) Choose Transcripts/Proteins and peptide Only as the output, # choose text/fasta and gzip compression, # name the file ensGeneDanRer1.pep.gz and then hit export. gunzip ensGeneDanRer1.pep.gz # delete * at end of some proteins cat ensGeneDanRer1.pep | sed 's/\*$//' > ensembl.pep ~matt/bin/fixPep.pl ensembl.pep fixPep_ensembl.pep hgPepPred danRer1 generic ensPep fixPep_ensembl.pep # BLASTZ FUGU (Fr1) (DONE, 2004-06-13, hartera) ssh kkr1u00 # blastz requires lineage-specific repeats but there are none # available between these two fish species to make empty files mkdir -p /iscratch/i/danRer1/linSpecRep.notInDanRer mkdir -p /iscratch/i/fugu/linSpecRep.notInFugu cd /cluster/data/danRer1 # for zebrafish foreach c (`cat chrom.lst`) cp /dev/null /iscratch/i/danRer1/linSpecRep.notInDanRer/chr${c}.out.spec end # for Fugu cp /dev/null /iscratch/i/fugu/linSpecRep.notInFugu/chrUn.out.spec iSync ssh kk mkdir -p /cluster/data/danRer1/bed/blastz.fr1.2004-06-13 ln -s /cluster/data/danRer1/bed/blastz.fr1.2004-06-13 \ /cluster/data/danRer1/bed/blastz.fr1 cd /cluster/data/danRer1/bed/blastz.fr1 cat << '_EOF_' > DEF # zebrafish (danRer1) vs. 
# BLASTZ FUGU (Fr1) (DONE, 2004-06-13, hartera)
    ssh kkr1u00
    # blastz requires lineage-specific repeats, but there are none
    # available between these two fish species, so make empty files:
    mkdir -p /iscratch/i/danRer1/linSpecRep.notInDanRer
    mkdir -p /iscratch/i/fugu/linSpecRep.notInFugu
    cd /cluster/data/danRer1
    # for zebrafish
    foreach c (`cat chrom.lst`)
      cp /dev/null /iscratch/i/danRer1/linSpecRep.notInDanRer/chr${c}.out.spec
    end
    # for Fugu
    cp /dev/null /iscratch/i/fugu/linSpecRep.notInFugu/chrUn.out.spec
    iSync

    ssh kk
    mkdir -p /cluster/data/danRer1/bed/blastz.fr1.2004-06-13
    ln -s /cluster/data/danRer1/bed/blastz.fr1.2004-06-13 \
      /cluster/data/danRer1/bed/blastz.fr1
    cd /cluster/data/danRer1/bed/blastz.fr1
    cat << '_EOF_' > DEF
# zebrafish (danRer1) vs. Fugu (fr1)
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/i386:/cluster/home/angie/schwartzbin

ALIGN=blastz-run
BLASTZ=blastz
BLASTZ_H=2000
#BLASTZ_ABRIDGE_REPEATS=1 if SMSK is specified
BLASTZ_ABRIDGE_REPEATS=0

# TARGET - zebrafish (danRer1)
SEQ1_DIR=/iscratch/i/danRer1/nib
SEQ1_RMSK=
# lineage-specific repeats
# we don't have that information for these species so these files are empty
SEQ1_SMSK=/iscratch/i/danRer1/linSpecRep.notInDanRer
SEQ1_FLAG=
SEQ1_IN_CONTIGS=0
# 10 MB chunk for target
SEQ1_CHUNK=10000000
SEQ1_LAP=10000

# QUERY - Fugu (fr1)
# soft-masked chrom nib
SEQ2_DIR=/cluster/bluearc/fugu/fr1/chromNib
SEQ2_RMSK=
SEQ2_SMSK=/iscratch/i/fugu/linSpecRep.notInFugu
SEQ2_FLAG=
SEQ2_IN_CONTIGS=0
# 10 Mbase for query
SEQ2_CHUNK=10000000
SEQ2_LAP=0

BASE=/cluster/data/danRer1/bed/blastz.fr1

DEF=$BASE/DEF
RAW=$BASE/raw
CDBDIR=$BASE
SEQ1_LEN=$BASE/S1.len
SEQ2_LEN=$BASE/S2.len
#DEBUG=1
'_EOF_'
    # << this line keeps emacs coloring happy
    # save the DEF file in the current standard place
    chmod +x DEF
    cp DEF ~angie/hummus/DEF.danRer1-fr1.2004-06-13

    # set up the cluster run: source the DEF file
    bash
    . ./DEF
    /cluster/data/danRer1/jkStuff/BlastZ_run0.sh
    cd run.0
    # check that the batch looks ok, then
    para try, check, push, check, ....
    # para time
    # Completed: 5950 of 5950 jobs
    # CPU time in finished jobs:  1590351s  26505.86m  441.76h  18.41d  0.050 y
    # IO & Wait Time:               27786s    463.09m    7.72h   0.32d  0.001 y
    # Average job time:               272s      4.53m    0.08h   0.00d
    # Longest job:                    601s     10.02m    0.17h   0.01d
    # Submission to last job:        4298s     71.63m    1.19h   0.05d

    # Second cluster run to convert the .out's to .lav's:
    cd /cluster/data/danRer1/bed/blastz.fr1
    bash	# if a csh/tcsh user
    . ./DEF
    /cluster/data/danRer1/jkStuff/BlastZ_run1.sh
    cd run.1
    para try, check, push, etc ...
    # para time
    # Completed: 170 of 170 jobs
    # CPU time in finished jobs:   1722s   28.69m  0.48h  0.02d  0.000 y
    # IO & Wait Time:             16055s  267.59m  4.46h  0.19d  0.001 y
    # Average job time:             105s    1.74m  0.03h  0.00d
    # Longest job:                  152s    2.53m  0.04h  0.00d
    # Submission to last job:       310s    5.17m  0.09h  0.00d

    # Third cluster run to convert the lav's to axt's:
    ssh kk
    cd /cluster/data/danRer1/bed/blastz.fr1
    mkdir axtChrom
    # a new run directory
    mkdir run.2
    cd run.2
    cat << '_EOF_' > do.csh
#!/bin/csh
cd $1
cat `ls -1 *.lav | sort -g` \
  | lavToAxt stdin /iscratch/i/danRer1/nib \
    /cluster/bluearc/fugu/fr1/chromNib stdout \
  | axtSort stdin $2
'_EOF_'
    # << this line makes emacs coloring happy
    chmod a+x do.csh
    cat << '_EOF_' > gsub
#LOOP
./do.csh {check in exists $(path1)} {check out line+ /cluster/data/danRer1/bed/blastz.fr1/axtChrom/$(root1).axt}
#ENDLOOP
'_EOF_'
    # << this line makes emacs coloring happy
    \ls -1Sd ../lav/chr* > chrom.list
    gensub2 chrom.list single gsub jobList
    wc -l jobList
    head jobList
    para create jobList
    para try, check, push, check,...
    # para time
    # Completed: 29 of 29 jobs
    # CPU time in finished jobs:   600s  10.00m  0.17h  0.01d  0.000 y
    # IO & Wait Time:             2782s  46.37m  0.77h  0.03d  0.000 y
    # Average job time:            117s   1.94m  0.03h  0.00d
    # Longest job:                 401s   6.68m  0.11h  0.00d
    # Submission to last job:      966s  16.10m  0.27h  0.01d

    # translate the sorted axt files into psl:
    ssh kolossus
    cd /cluster/data/danRer1/bed/blastz.fr1
    mkdir -p pslChrom
    set tbl = "blastzFr1"
    foreach f (axtChrom/chr*.axt)
      set c=$f:t:r
      echo "Processing chr $c"
      /cluster/bin/i386/axtToPsl $f S1.len S2.len pslChrom/${c}_${tbl}.psl
    end
    # Load the database tables:
    ssh hgwdev
    cd /cluster/data/danRer1/bed/blastz.fr1/pslChrom
    foreach f (./*.psl)
      /cluster/bin/i386/hgLoadPsl danRer1 $f
      echo "$f Done"
    end

# CHAIN FUGU (Fr1) BLASTZ (DONE, 2004-06-24, hartera)
    # Create chains using -minScore=5000 for axtChain on the chroms, and
    # -minScore=10000 for the messy chroms, chrUn and chrNA.
    # Run axtChain on the little cluster:
    ssh kki
    cd /cluster/data/danRer1/bed/blastz.fr1
    mkdir -p axtChain2/run1
    cd axtChain2/run1
    mkdir out chain
    # create 2 input lists, as chrNA and chrUn need to be processed
    # separately:
    ls -1S /cluster/data/danRer1/bed/blastz.fr1/axtChrom/*.axt \
      > input.lst
    grep "chrNA" input.lst > inputNAandUn.lst
    grep "chrUn" input.lst >> inputNAandUn.lst
    # remove chrNA and chrUn from input.lst
    cat << '_EOF_' > gsub
#LOOP
doChain {check in exists $(path1)} {check out line+ chain/$(root1).chain} {check out line+ out/$(root1).out}
#ENDLOOP
'_EOF_'
    # << this line makes emacs coloring happy
    cat << '_EOF_' > doChain
#!/bin/csh
axtChain -minScore=5000 $1 \
  /iscratch/i/danRer1/nib \
  /cluster/bluearc/fugu/fr1/chromNib $2 >& $3
'_EOF_'
    # << this line makes emacs coloring happy
    chmod a+x doChain
    gensub2 input.lst single gsub jobList
    para create jobList
    para try, check, push, check...
    # para time
    # Completed: 27 of 27 jobs
    # CPU time in finished jobs:  356s   5.94m  0.10h  0.00d  0.000 y
    # IO & Wait Time:             128s   2.13m  0.04h  0.00d  0.000 y
    # Average job time:            18s   0.30m  0.00h  0.00d
    # Longest job:                 28s   0.47m  0.01h  0.00d
    # Submission to last job:     830s  13.83m  0.23h  0.01d

    cat << '_EOF_' > gsub2
#LOOP
doChain2 {check in exists $(path1)} {check out line+ chain/$(root1).chain} {check out line+ out/$(root1).out}
#ENDLOOP
'_EOF_'
    # << this line makes emacs coloring happy
    cat << '_EOF_' > doChain2
#!/bin/csh
axtChain -minScore=10000 $1 \
  /iscratch/i/danRer1/nib \
  /cluster/bluearc/fugu/fr1/chromNib $2 >& $3
'_EOF_'
    # << this line makes emacs coloring happy
    chmod +x doChain2
    gensub2 inputNAandUn.lst single gsub2 jobList2
    para create jobList2
    para try, check, ...
    # para time
    # CPU time in finished jobs:  175s  2.92m  0.05h  0.00d  0.000 y
    # IO & Wait Time:              64s  1.06m  0.02h  0.00d  0.000 y
    # Average job time:           120s  1.99m  0.03h  0.00d
    # Longest job:                120s  2.00m  0.03h  0.00d
    # Submission to last job:     120s  2.00m  0.03h  0.00d

    # now on the cluster server, sort the chains
    ssh kksilo
    cd /cluster/data/danRer1/bed/blastz.fr1/axtChain2
    chainMergeSort run1/chain/*.chain > all.chain
    chainSplit chain all.chain

    # Load the chains into the database
    # next machine
    ssh hgwdev
    cd /cluster/data/danRer1/bed/blastz.fr1/
    # replace the original axtChain, which was made with the default
    # minScore (=1000):
    mv axtChain axtChain.orig
    mv axtChain2 axtChain
    # remove the old chain tables for Filt and Filt2 from the database
    cd /cluster/data/danRer1/bed/blastz.fr1/axtChain/chain
    foreach i (*.chain)
      set c = $i:r
      hgLoadChain danRer1 ${c}_chainFr1 $i
      echo done $c
    end
    # chain tables for fr1:
    # chainFr1:      default minScore = 1000
    # chainFr1Filt1: minScore = 10000 for all chroms, except 15000 for
    #                Un and NA
    # chainFr1Filt2: minScore = 5000 for all chroms, except 10000 for
    #                Un and NA (chainFr1Filt2 is now chainFr1)
    # coverage is calculated as
    #   size(refGene:CDS & Fr1) / size(refGene:CDS)
    # and enrichment as
    #   (size(refGene:CDS & Fr1) / size(Fr1)) /
    #       (size(refGene:CDS) / genome size)
    # Chains:
    # featureBits -chrom=chr1 danRer1 chainFr1Link
    #   2170390 bases of 40488791 (5.360%) in intersection
    # featureBits -chrom=chr1 danRer1 chainFr1FiltLink
    #   1505825 bases of 40488791 (3.719%) in intersection
    # featureBits -chrom=chr1 danRer1 chainFr1Filt2Link
    #   1897128 bases of 40488791 (4.686%) in intersection
    # featureBits -chrom=chr1 danRer1 refGene:cds chainFr1Link
    #   149280 bases of 40488791 (0.369%) in intersection
    # featureBits -chrom=chr1 danRer1 refGene:cds chainFr1FiltLink
    #   135943 bases of 40488791 (0.336%) in intersection
    # featureBits -chrom=chr1 danRer1 refGene:cds chainFr1Filt2Link
    #   145585 bases of 40488791 (0.360%) in intersection
    # featureBits -chrom=chr1 danRer1 refGene:cds
    #   165468 bases of 40488791 (0.409%) in intersection
    # featureBits danRer1 refGene:cds
    #   4852787 bases of 1459132082 (0.333%) in intersection
    #
    #                       Coverage   Enrichment
    # chainFr1Link:           90.2%      17.0X
    # chainFr1FiltLink:       82.2%      22.1X
    # chainFr1Filt2Link:      88.0%      18.8X
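    # Worked through for chainFr1Link on chr1 with the featureBits
    # numbers above (small differences are rounding):
    #   coverage   = 149280 / 165468 = 90.2%
    #   enrichment = (149280 / 2170390) / (165468 / 40488791)
    #              = 0.0688 / 0.0041 ~= 17X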
# NET FUGU (Fr1) BLASTZ (DONE, 2004-06-25, hartera)
    ssh kksilo
    cd /cluster/data/danRer1/bed/blastz.fr1/axtChain
    mkdir preNet
    cd chain
    foreach i (*.chain)
      echo preNetting $i
      /cluster/bin/i386/chainPreNet $i ../../S1.len ../../S2.len \
        ../preNet/$i
    end
    cd ..
    mkdir n1
    cd preNet
    foreach i (*.chain)
      set n = $i:r.net
      echo primary netting $i
      /cluster/bin/i386/chainNet $i -minSpace=1 ../../S1.len ../../S2.len \
        ../n1/$n /dev/null
    end
    cd ..
    cat n1/*.net | /cluster/bin/i386/netSyntenic stdin noClass.net
    # memory usage 97132544, utime 737 s/100, stime 115

    # Add classification info using db tables:
    ssh hgwdev
    cd /cluster/data/danRer1/bed/blastz.fr1/axtChain
    # use the -noAr option - don't look for ancient repeats
    netClass -noAr noClass.net danRer1 fr1 fr1.net

    # Load the nets into the database:
    ssh hgwdev
    cd /cluster/data/danRer1/bed/blastz.fr1/axtChain
    netFilter -minGap=10 fr1.net | hgLoadNet danRer1 netFr1 stdin

# EXTRACT AXT'S AND MAF'S FROM FUGU NET (DONE, 2004-06-25, hartera)
    ssh eieio
    cd /cluster/data/danRer1/bed/blastz.fr1/axtChain
    netSplit fr1.net fr1Net
    mkdir -p ../axtNet
    # make sorted axts from the net
    cat > axtNet.csh << 'EOF'
foreach f (fr1Net/chr*.net)
  set c = $f:t:r
  echo "axtNet on $c"
  netToAxt fr1Net/$c.net chain/$c.chain \
    /cluster/data/danRer1/nib /cluster/data/fr1/nib stdout \
    | axtSort stdin ../axtNet/$c.axt
  echo "Complete: $c.net -> $c.axt"
end
'EOF'
    csh axtNet.csh >&! axtNet.log &
    tail -100f axtNet.log

    ssh eieio
    cd /cluster/data/danRer1/bed/blastz.fr1
    cd axtNet
    mkdir ../mafNet
    cat > makeMaf.csh << 'EOF'
foreach f (chr*.axt)
  set maf = $f:t:r.fr1.maf
  echo translating $f to $maf
  axtToMaf $f \
    /cluster/data/danRer1/chrom.sizes /cluster/data/fr1/chrom.sizes \
    ../mafNet/$maf -tPrefix=danRer1. -qPrefix=fr1.
end
'EOF'
    csh makeMaf.csh >&! makeMaf.log &
    tail -100f makeMaf.log

# AFFYMETRIX ZEBRAFISH GENOME ARRAY CHIP (DONE, 2004-07-01, hartera)
    mkdir /projects/compbio/data/microarray/affyZebrafish
    cd /projects/compbio/data/microarray/affyZebrafish
    # Download the Zebrafish genome array consensus sequences from
    # Affymetrix:
    # http://www.affymetrix.com/support/technical/byproduct.affx?product=zebrafish
    unzip Zebrafish_consensus.zip
    sed -e 's/consensus://' Zebrafish_consensus \
      | sed -e 's/;/ /' > Zebrafish_consensus.fa
    # copy the sequences to a directory visible on kkr1u00
    cp /projects/compbio/data/microarray/affyZebrafish/Zebrafish_consensus.fa \
      /cluster/bluearc/affy/
    # Set up a cluster job to align the Zebrafish consensus sequences to
    # danRer1:
    ssh kkr1u00
    cd /cluster/data/danRer1/bed
    mkdir affyZebrafish.2004-06-30
    ln -s /cluster/data/danRer1/bed/affyZebrafish.2004-06-30 \
      /cluster/data/danRer1/bed/affyZebrafish
    cd affyZebrafish
    mkdir -p /iscratch/i/affy
    cp /cluster/bluearc/affy/Zebrafish_consensus.fa /iscratch/i/affy
    iSync
    ssh kk
    cd /cluster/data/danRer1/bed/affyZebrafish
    ls -1 /iscratch/i/affy/Zebrafish_consensus.fa > affy.lst
    ls -1 /iscratch/i/danRer1/trfFa/ > genome.lst
    echo '#LOOP\n/cluster/bin/i386/blat -fine -mask=lower -minIdentity=95 -ooc=/iscratch/i/danRer1/11.ooc /iscratch/i/danRer1/trfFa/$(path1) $(path2) {check out line+ psl/$(root1)_$(root2).psl}\n#ENDLOOP' > template.sub
    gensub2 genome.lst affy.lst template.sub para.spec
    mkdir psl
    para create para.spec
    para try, check, push ... etc.
    # para time
    # Completed: 310 of 310 jobs
    # CPU time in finished jobs:  14931s  248.85m  4.15h  0.17d  0.000 y
    # IO & Wait Time:              1028s   17.13m  0.29h  0.01d  0.000 y
    # Average job time:              51s    0.86m  0.01h  0.00d
    # Longest job:                   96s    1.60m  0.03h  0.00d
    # Submission to last job:       574s    9.57m  0.16h  0.01d
# do not use minCover since a lot of sequence is in Un, NA and Finished,
# so genes may be split up and it is good to see all alignments
pslReps -minAli=0.95 -nearTop=0.005 raw.psl contig.psl /dev/null
liftUp affyZebrafish.psl ../../jkStuff/liftAll.lft warn contig.psl
# shorten names in psl file
sed -e 's/Zebrafish://' affyZebrafish.psl > affyZebrafish.psl.bak
mv affyZebrafish.psl.bak affyZebrafish.psl
pslCheck affyZebrafish.psl
# psl is good
# load track into database
ssh hgwdev
cd /cluster/data/danRer1/bed/affyZebrafish
hgLoadPsl danRer1 affyZebrafish.psl
# Add consensus sequences for Zebrafish chip
# Copy sequences to gbdb if they are not there already
mkdir -p /gbdb/hgFixed/affyProbes
ln -s \
    /projects/compbio/data/microarray/affyZebrafish/Zebrafish_consensus.fa \
    /gbdb/hgFixed/affyProbes
hgLoadSeq -abbr=Zebrafish: danRer1 \
    /gbdb/hgFixed/affyProbes/Zebrafish_consensus.fa
# Clean up
rm batch.bak contig.psl raw.psl
# add entry to trackDb.ra in ~kent/src/hg/makeDb/trackDb/zebrafish/danRer1

# ADD CONTIGS TRACK, ctgPos2 (DONE, 2004-07-02, hartera)
# make ctgPos2 (contig name, size, chrom, chromStart, chromEnd) from lift
ssh kksilo
cd /cluster/data/danRer1/bed
mkdir ctgPos2
cd ctgPos2
# make a .as file for ctgPos2, which is an extended ctgPos with
# a field for clone type. The field order must match the column order
# of the ctgPos2.tab file built below: name, size, chrom, chromStart,
# chromEnd, type.
cat << '_EOF_' > $HOME/kent/src/hg/lib/ctgPos2.as
table ctgPos2
"Where a contig is inside of a chromosome including contig type information."
    (
    string name;        "Name of contig"
    uint size;          "Size of contig"
    string chrom;       "Chromosome name"
    uint chromStart;    "Start in chromosome"
    uint chromEnd;      "End in chromosome"
    char[1] type;       "(W)GS contig, (F)inished, (P)redraft, (D)raft, (O)ther"
    )
'_EOF_'
cd $HOME/kent/src/hg/lib/
autoSql ctgPos2.as ctgPos2
mv ctgPos2.h $HOME/kent/src/hg/inc
# do make to check it works and commit the .as, .sql, .c and .h files to CVS
foreach c (1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 \
           21 22 23 24 25)
    awk 'BEGIN {OFS="\t"} \
        {if ($5 != "N") print $6, $3-$2+1, $1, $2-1, $3, $5}' \
        /cluster/data/danRer1/$c/contigs/chr${c}.agp >> ctgPos2.tab
end
# need to print with chrom, chromStart, chromEnd first for liftUp to work
foreach c (ctg NA Finished)
    awk 'BEGIN {OFS="\t"} \
        {if (($1 ~ /^'"${c}"'[0-9]+$/) && $5 != "N" ) \
        print $1, $2-1, $3, $5, $6, $3-$2+1}' \
        /cluster/data/danRer1/Zv3.contigs.agp >> chrUns.tmp
end
# Use liftUp to change co-ordinates to those of virtual chroms
# make lift file from all 3 virtual chroms
foreach c (NA Un Finished)
    cat /cluster/data/danRer1/$c/chr${c}.lft >> chrAllUns.lft
end
liftUp -type=.bed chrUns.lifted chrAllUns.lft warn chrUns.tmp
# once we have lifted co-ordinates, parse them back into table format
awk 'BEGIN {OFS="\t"} {print $5, $6, $1, $2, $3, $4}' \
    chrUns.lifted > chrUns.tab
# add this to ctgPos2.tab
cat chrUns.tab >> ctgPos2.tab
ssh hgwdev
cd /cluster/data/danRer1/bed/ctgPos2
echo "drop table ctgPos2" | hgsql danRer1
hgsql danRer1 < ~/kent/src/hg/lib/ctgPos2.sql
echo "load data local infile 'ctgPos2.tab' into table ctgPos2" \
    | hgsql danRer1

# WZ EST ALIGNMENTS (DONE, 2004-07-09, hartera)
# WZ ESTs are compiled ESTs from WashU. They were provided by
# Anthony DiBiase from Leonard Zon's lab at the Children's Hospital, Harvard
# Contact: adibiase@enders.tch.harvard.edu
# http://zon.tchlab.org
ssh hgwdev
mkdir -p /cluster/data/danRer1/bed/ZonLab/wzESTs
cd /cluster/data/danRer1/bed/ZonLab/wzESTs
# put WZ ESTs in this directory as wzcontigs.txt -
# obtained from the Zon Lab, these are unmasked.
# There are 42857 ESTs in this file.
# Translate to upper case
tr '[a-z]' '[A-Z]' < wzcontigs.txt > wzcontigs.fa
ssh kkr1u00
mkdir -p /iscratch/i/danRer1/wzESTs
cd /cluster/data/danRer1/bed/ZonLab/wzESTs
faSplit sequence wzcontigs.fa 20 /iscratch/i/danRer1/wzESTs/wzcontigs
iSync
ssh kk
cd /cluster/data/danRer1/bed/ZonLab/wzESTs
mkdir psl
ls -1 /iscratch/i/danRer1/wzESTs/*.fa > est.lst
ls -1S /iscratch/i/danRer1/trfFa/*.fa > genome.lst
# Redo without masking - masking is not used for native ESTs
cat << '_EOF_' > gsub
#LOOP
/cluster/bin/i386/blat {check in line+ $(path1)} {check in line+ $(path2)} -ooc={check in exists /iscratch/i/danRer1/11.ooc} {check out line+ psl/$(root1)_$(root2).psl}
#ENDLOOP
'_EOF_'
gensub2 genome.lst est.lst gsub spec
para create spec
para try
para check
# para time
# Completed: 6200 of 6200 jobs
# CPU time in finished jobs: 50309s 838.48m 13.97h 0.58d 0.002 y
# IO & Wait Time: 19334s 322.23m 5.37h 0.22d 0.001 y
# Average job time: 11s 0.19m 0.00h 0.00d
# Submission to last job: 326s 5.43m 0.09h 0.00d
# Do sort, best in genome filter, and convert to chromosome coordinates
# to create wz_ests.psl
pslSort dirs raw.psl tmp psl
# only use alignments that have at least
# 96% identity in the aligned region. use the parameters used by the
# automatic GenBank update for native ESTs
pslReps -minAli=0.96 -nearTop=0.005 raw.psl contig.psl /dev/null
liftUp wz_ests.psl /cluster/data/danRer1/jkStuff/liftAll.lft warn contig.psl
# Load EST alignments into database.
ssh hgwdev
cd /cluster/data/danRer1/bed/ZonLab/wzESTs
hgLoadPsl danRer1 wz_ests.psl
# Add WZ EST sequences
# Copy sequences to gbdb if they are not there already
mkdir -p /gbdb/danRer1/wzESTs
ln -s \
    /cluster/data/danRer1/bed/ZonLab/wzESTs/wzcontigs.fa \
    /gbdb/danRer1/wzESTs
hgLoadSeq danRer1 /gbdb/danRer1/wzESTs/wzcontigs.fa

# BACENDS (DONE, 2004-07-28, hartera)
# Added bacEndPairsBad table (DONE, 2004-08-02, hartera)
# EXTENDED bacCloneXRef TABLE TO INCLUDE MORE ALIASES, UNISTS ID AND CONTIG
# (DONE, 2005-04-25, hartera)
# RELOADED bacCloneXRef TABLE AFTER CORRECTING PROGRAM TO USE NULL
# IN ALL PLACES WHERE THERE IS NO UniSTS ID (DONE, 2005-04-27, hartera)
# ADDED SANGER STS NAME FIELD (sangerName) TO bacCloneAlias TABLE AND
# RELOADED DATA INCLUDING THIS NEW FIELD, ADDED SEARCHES FOR BAC CLONES
# ALIASES AND ASSOCIATED STS ALIASES (DONE, 2005-05-17, hartera)
# ADDED MORE SEARCHES, CORRECTED BUGS IN CREATING bacCloneXRef TABLE,
# ADDED MORE INDICES FOR bacCloneXRef AND REMOVED BAC INTERNAL NAMES
# FROM bacCloneAlias TABLE, MADE CORRECTED FILE OF SANGER AND UNISTS IDS
# AND RELOADED DATA FOR THESE TABLES (DONE, 2005-05-25, hartera)
# REMOVED INCORRECT ROW FROM bacCloneXRef TABLE (DONE, 2006-04-20, hartera)
# Provided by Anthony DiBiase, Yi Zhou and Leonard Zon at the Boston
# Children's Hospital. Collaborated with them on this track.
# Anthony DiBiase: adibiase@enders.tch.harvard.edu
# Yi Zhou: yzhou@enders.tch.harvard.edu
# BAC clone end sequences are from Robert Geisler's lab,
# Max Planck Institute for Developmental Biology, Tuebingen, Germany
ssh kksilo
cd /cluster/data/danRer1/bed/ZonLab
mkdir /cluster/data/danRer1/bed/ZonLab/bacends
# copy the BAC clone end sequence file to this directory
cp cloneEnd.txt ./bacends/bacends.fa
cd bacends
faSize bacends.fa
# 486978153 bases (39070196 N's 447907957 real) in 594614 sequences in
# 1 files Total size: mean 819.0 sd 230.3 min 0 (zKp108-H09.za)
# max 5403 (zC259G13.zb) median 796
# N count: mean 65.7 sd 154.7
mkdir /iscratch/i/danRer1/bacends
faSplit sequence bacends.fa 20 /iscratch/i/danRer1/bacends/bacends
ls -1S /iscratch/i/danRer1/bacends/*.fa > bacends.lst
ls -1S /iscratch/i/danRer1/trfFa/*.fa > genome.lst
cat << '_EOF_' > template
#LOOP
/cluster/bin/i386/blat $(path1) $(path2) -tileSize=10 -ooc=/iscratch/i/danRer1/10.ooc {check out line+ psl/$(root1)_$(root2).psl}
#ENDLOOP
'_EOF_'
# << this line keeps emacs coloring happy
mkdir psl
gensub2 genome.lst bacends.lst template jobList
ssh kkr1u00
# iSync bacends to kilokluster
iSync
ssh kk
cd /cluster/data/danRer1/bed/ZonLab/bacends
para create jobList
para try, check, push, check, ...
# para time
# Completed: 6200 of 6200 jobs
# CPU time in finished jobs: 4833892s 80564.87m 1342.75h 55.95d 0.153y
# IO & Wait Time: 31979s 532.98m 8.88h 0.37d 0.001y
# Average job time: 785s 13.08m 0.22h 0.01d
# Longest job: 2300s 38.33m 0.64h 0.03d
# Submission to last job: 14904s 248.40m 4.14h 0.17d
# back on kksilo, filter and lift results:
ssh kksilo
cd /cluster/data/danRer1/bed/ZonLab/bacends
pslCheck ./psl/*.psl >& pslCheck.log
# No problems reported by pslCheck
pslSort dirs raw.psl temp psl
# tried minCover=0.6 but only 55% of sequences were aligned;
# with minCover=0.4, 77% of sequences are aligned.
# No minCover aligns 90% of sequences, but with a large number
# of extra alignments - noise
pslReps -nearTop=0.02 -minCover=0.40 -minAli=0.85 -noIntrons raw.psl \
    bacEnds.psl /dev/null
liftUp bacEnds.lifted.psl /cluster/data/danRer1/jkStuff/liftAll.lft \
    warn bacEnds.psl
wc -l bacEnds.lifted.psl
# 3717153 bacEnds.lifted.psl
rm -r temp
# Need to add accession information and BAC end sequence pairs
mkdir -p /cluster/data/danRer1/bed/ZonLab/bacends/bacends.1
cd /cluster/data/danRer1/bed/ZonLab/bacends/bacends.1
grep ">" ../bacends.fa > bacEnds
# remove '>' at beginning of each line
perl -pi.bak -e 's/>//' bacEnds
# remove ^M char from end of lines
perl stripEndChar.pl < bacEnds > bacEnds.names
rm bacEnds bacEnds.bak
# Kerstin Jekosch at the Sanger Centre: kj2@sanger.ac.uk
# provided accession data for BAC end clones - zfish_accs.txt -
# and for BAC end pairs - BACEnd_accessions.txt - put these in bacends.1 dir
# write and use a perl script to split BAC ends into pairs and singles:
# bacEndPairs.txt and bacEndSingles.txt
# For pslPairs, the reverse primer (T7 or .z*) should be in the first column
# and the forward primer (SP6 or .y*) should be in the second column.
# There are several reads for some sequences and these have similar names.
# Reads for the same sequence should be in a comma separated list.
# Sequence read names can be translated to external clone names as found
# in NCBI's clone registry using the name without the suffix and the
# translation of prefixes as follows, after removal of dashes
# and extra zeros in the name:
# library          external prefix    internal prefix
# CHORI-211        CH211-             zC
# DanioKey         DKEY-              zK
# DanioKey Pilot   DKEYP-             zKp
# RZPD-71          RP71-              bZ
# BUSM1 (PAC)      BUSM1-             dZ
# e.g. zC001-A03.za becomes CH211-1A03
perl getBacEndInfo.pl bacEnds.names BACEnd_accessions.txt > bacs.log
# script not working so fix and zip again (2005-04-29, hartera)
gzip getBacEndInfo.pl
# 180280 bacEndPairs.txt - 571985 sequences
#  16020 bacEndSingles.txt - 22629 sequences
#  47048 bacEndAccs.aliases
# bacEndAccs.aliases contains sequence read names and their
# Genbank accessions.
# First process the BAC end alignments
mkdir -p /cluster/data/danRer1/bed/ZonLab/bacends/pairs
cd /cluster/data/danRer1/bed/ZonLab/bacends/pairs
set bacDir = /cluster/data/danRer1/bed/ZonLab/bacends/bacends.1
# these BAC clones vary in size from around 2800 bp to 626,000 bp
~/bin/i386/pslPairs -tInsert=10000 -minId=0.91 -noBin -min=2000 \
    -max=650000 -slopval=10000 -hardMax=800000 -slop -short -long -orphan \
    -mismatch -verbose ../bacEnds.lifted.psl $bacDir/bacEndPairs.txt \
    all_bacends bacEnds
wc -l bacEnds.*
#    495 bacEnds.long
#  13046 bacEnds.mismatch
# 200235 bacEnds.orphan
#  72820 bacEnds.pairs
#      0 bacEnds.short
#    139 bacEnds.slop
# create header required by "rdb" tools
echo 'chr\tstart\tend\tclone\tscore\tstrand\tall\tfeatures\tstarts\tsizes'\
    > ../header
echo '10\t10N\t10N\t10\t10N\t10\t10\t10N\t10\t10' >> ../header
# make pairs bed file
cat ../header bacEnds.pairs | row score ge 300 | sorttbl chr start \
    | headchg -del > bacEndPairs.bed
# also need to process bacEndSingles.txt into a database table.
# For singles in bacEndSingles.txt, create a dummy file where they
# are given zJA11B12T7 as a dummy sequence pair. If the single is a forward
# sequence, put the dummy sequence in the second column; if the single is
# a reverse sequence, put it in the first column. Use a perl script
# (formatSingles.pl, run below) to do this; a sketch of the idea follows.
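# The actual formatSingles.pl is not reproduced in this doc. Purely as an
# illustration of the reformatting just described, an awk sketch might look
# like this - it assumes each line of bacEndSingles.txt is
# "<comma-separated reads><tab><clone name>"; the real script and its
# input layout may differ:
awk 'BEGIN {FS="\t"; OFS="\t"} { \
    if ($1 ~ /\.y/) print $1, "zJA11B12T7", $2; \
    else            print "zJA11B12T7", $1, $2; \
}' bacEndSingles.txt > bacEndSingles.sketch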
cd /cluster/data/danRer1/bed/ZonLab/bacends
set bacDir = /cluster/data/danRer1/bed/ZonLab/bacends/bacends.1
mkdir singles
cd singles
perl formatSingles.pl $bacDir/bacEndSingles.txt > \
    $bacDir/bacEndSingles.format
# then run pslPairs on this formatted sequence
~/bin/i386/pslPairs -tInsert=10000 -minId=0.91 -noBin -min=2000 \
    -max=650000 -slopval=10000 -hardMax=800000 -slop -short -long -orphan \
    -mismatch -verbose ../bacEnds.lifted.psl $bacDir/bacEndSingles.format \
    all_bacends bacEnds
wc -l bacEnds*
#     0 bacEnds.long
#     0 bacEnds.mismatch
# 14126 bacEnds.orphan
#     0 bacEnds.pairs
#     0 bacEnds.short
#     0 bacEnds.slop
# there are 14126 orphans here and 200235 from the pair analysis,
# so a total of 214361 orphans
cat bacEnds.orphan ../bacEnds.orphan > bacEnds.singles
wc -l bacEnds.singles
# 214361 bacEnds.singles
# make singles bed file
cat ../header bacEnds.singles | row score ge 300 | sorttbl chr start \
    | headchg -del > bacEndSingles.bed
cp bacEndSingles.bed ../pairs
cd ../pairs
# all slop, short, long, mismatch and orphan pairs go into bacEndPairsBad
cat ../header bacEnds.slop bacEnds.short bacEnds.long bacEnds.mismatch \
    bacEnds.orphan | row score ge 300 | sorttbl chr start \
    | headchg -del > bacEndPairsBad.bed
# add bacEndSingles.bed to bacEnds.load.psl - must not add pair orphans
# twice, so create a bed file, bacEndPairsBadNoOrphans.bed, without orphans
cat ../header bacEnds.slop bacEnds.short bacEnds.long bacEnds.mismatch \
    | row score ge 300 | sorttbl chr start \
    | headchg -del > bacEndPairsBadNoOrphans.bed
extractPslLoad -noBin ../bacEnds.lifted.psl bacEndPairs.bed \
    bacEndPairsBadNoOrphans.bed bacEndSingles.bed | \
    sorttbl tname tstart | headchg -del > bacEnds.load.psl
# load BAC end sequences into the seq table so alignments may be viewed
mkdir -p /gbdb/danRer1/bacends
ln -s /cluster/data/danRer1/bed/ZonLab/bacends/bacends.fa \
    /gbdb/danRer1/bacends/BACends.fa
hgLoadSeq danRer1 /gbdb/danRer1/bacends/BACends.fa
# when loaded into danRer1, Heather found there were rows where
# the alignments were the same but the lfNames were different. This is
# due to the presence of multiple reads for the same BAC end sequence.
# Sometimes they are slightly different lengths so the alignments are
# a little different. It would be good to consolidate all of these and
# just pick the best alignment. However, here only the identical rows
# were merged into one, with a list of all the lfNames with that alignment.
# load all alignments into a temporary database
ssh hgwdev
echo "create database bacs_rah;" | hgsql danRer1
cd /cluster/data/danRer1/bed/ZonLab/bacends/pairs
hgLoadBed bacs_rah bacEndPairs bacEndPairs.bed \
    -sqlTable=$HOME/kent/src/hg/lib/bacEndPairs.sql
# Loaded 72272 elements of size 11
# create a bacEndSingles table like bacEndPairs
cp $HOME/kent/src/hg/lib/bacEndPairs.sql ../singles/bacEndSingles.sql
# edit to give correct table name
hgLoadBed bacs_rah bacEndSingles bacEndSingles.bed \
    -sqlTable=../singles/bacEndSingles.sql
# Loaded 203683 elements of size 11
# load all_bacends later
# note - this track isn't pushed to RR, just used for assembly QA
hgLoadBed bacs_rah bacEndPairsBad bacEndPairsBad.bed \
    -sqlTable=$HOME/kent/src/hg/lib/bacEndPairsBad.sql
# Loaded 203709 elements of size 11
# Need to consolidate similar rows for bacEndPairs and bacEndSingles - same
# name, different lfNames and same alignments (see the sketch below).
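# The consolidation below was done with removeReplicates.pl (script not
# included in this doc). As an illustration only, the core idea - collapse
# rows identical except for lfNames, joining the name lists - could be
# sketched in awk as follows; it assumes the 11-field bed used here with
# lfNames in the last column, and the real script also reconciled the
# lfCount/lfStarts/lfSizes fields (bedWithReps/bedNoReps are hypothetical
# file names):
sort bedWithReps.bed | awk 'BEGIN {FS="\t"; OFS="\t"} \
{ \
    key = $1 FS $2 FS $3 FS $4 FS $5 FS $6 FS $7 FS $8 FS $9 FS $10; \
    if (key == prev) { names = names "," $11 } \
    else { if (prev != "") print prev, names; prev = key; names = $11 } \
} \
END { if (prev != "") print prev, names }' > bedNoReps.bed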
mkdir -p /cluster/data/danRer1/bed/ZonLab/bacends/duplicates cd /cluster/data/danRer1/bed/ZonLab/bacends/duplicates hgsql -N -e "select chrom, chromStart, chromEnd, name, strand from \ bacEndPairs order by name, chrom, chromStart;" bacs_rah > pairs.txt sort pairs.txt | uniq -c > pairs.txt.uniq wc -l pairs* # 72272 pairs.txt # 53992 pairs.txt.uniq # for replicate rows, find all the unique lfNames and put these # in one row with the relevant lfStarts, lfSizes and correct lfCount perl removeReplicates.pl pairs.txt.uniq bacEndPairs > pairsNoReps.bed # repeat for singles hgsql -N -e "select chrom, chromStart, chromEnd, name, strand from \ bacEndSingles order by name, chrom, chromStart;" bacs_rah > singles.txt sort singles.txt | uniq -c > singles.txt.uniq wc -l singles* # 203683 singles.txt # 179170 singles.txt.uniq perl removeReplicates.pl singles.txt.uniq bacEndSingles \ > singlesNoReps.bed hgsql -N -e "select chrom, chromStart, chromEnd, name, strand from \ bacEndPairsBad order by name, chrom, chromStart;" bacs_rah \ > badPairs.txt sort badPairs.txt | uniq -c > badPairs.txt.uniq wc -l badPairs* # 203709 badPairs.txt # 178304 badPairs.txt.uniq perl removeReplicates.pl badPairs.txt.uniq bacEndPairsBad \ > badPairsNoReps.bed # reload sequences into tables in danRer1 # bed files are already sorted by chrom and chromStart # so no need to re-sort ssh hgwdev cd /cluster/data/danRer1/bed/ZonLab/bacends/duplicates echo "drop table bacEndPairs;" | hgsql danRer1 echo "drop table bacEndPairsBad;" | hgsql danRer1 echo "drop table bacEndSingles;" | hgsql danRer1 echo "drop table all_bacends;" | hgsql danRer1 hgLoadBed danRer1 bacEndPairs pairsNoReps.bed \ -sqlTable=$HOME/kent/src/hg/lib/bacEndPairs.sql # Loaded 53992 elements of size 11 hgLoadBed danRer1 bacEndPairsBad badPairsNoReps.bed \ -sqlTable=$HOME/kent/src/hg/lib/bacEndPairsBad.sql # Loaded 178304 elements of size 11 # edit to give correct table name hgLoadBed danRer1 bacEndSingles singlesNoReps.bed \ -sqlTable=../singles/bacEndSingles.sql # Loaded 179170 elements of size 11 cd /cluster/data/danRer1/bed/ZonLab/bacends/pairs hgLoadPsl danRer1 -table=all_bacends bacEnds.load.psl # 2835212 record(s), 0 row(s) skipped, 706 warning(s) loading psl.tab # after merging rows with the same BAC name, the scoring is now # wrong in bacEndPairs, bacEndSingles and bacEndBadPairs. # Scores should be 1000 if there is 1 row for that name, else # 1500/number of rows for that sequence name - calculated by pslPairs. # Correct the scores. 
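# The rescoring rule is simple: 1000 for a name with one row, otherwise
# 1500 divided by the number of rows for that name. The fix below uses
# correctScores.pl (not included in this doc); as an illustration only,
# the same rule in awk - assuming the table dump has the name in column 4
# and the score in column 5, and that the .hits file holds "count name"
# lines from uniq -c:
awk 'NR == FNR {count[$2] = $1; next} \
    {OFS="\t"; $5 = (count[$4] == 1) ? 1000 : int(1500 / count[$4]); print}' \
    pairs.hits bacEndPairs.out > bacEndPairsGoodScores.sketch.bed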
ssh hgwdev
mkdir -p /cluster/data/danRer1/bed/ZonLab/bacends/scores
cd /cluster/data/danRer1/bed/ZonLab/bacends/scores
hgsql -N -e "select name from bacEndPairs;" danRer1 \
    | sort | uniq -c > pairs.hits
# download bacEndPairs table
hgsql -N -e "select * from bacEndPairs;" danRer1 > bacEndPairs.out
# use a perl script to change the scores from this file and write a new file
perl correctScores.pl bacEndPairs.out pairs.hits \
    > bacEndPairsGoodScores.bed
# same for singles
hgsql -N -e "select name from bacEndSingles;" danRer1 \
    | sort | uniq -c > singles.hits
# download bacEndSingles table
hgsql -N -e "select * from bacEndSingles;" danRer1 > bacEndSingles.out
perl correctScores.pl bacEndSingles.out singles.hits \
    > bacEndSinglesGoodScores.bed
# add bacEndPairsBad - (hartera, 2004-08-02)
# this was reloaded temporarily into bacs_rah
hgsql -N -e "select name from bacEndPairsBad;" bacs_rah \
    | sort | uniq -c > badPairs.hits
hgsql -N -e "select * from bacEndPairsBad;" bacs_rah > bacEndPairsBad.out
perl correctScores.pl bacEndPairsBad.out badPairs.hits \
    > bacEndPairsBadGoodScores.bed
# check scores are correct now
# reload BAC ends database tables
hgsql -e "drop table bacEndPairs;" danRer1
hgsql -e "drop table bacEndSingles;" danRer1
hgsql -e "drop table bacEndPairsBad;" danRer1
hgLoadBed danRer1 bacEndPairs bacEndPairsGoodScores.bed \
    -sqlTable=$HOME/kent/src/hg/lib/bacEndPairs.sql
# Loaded 53992 elements of size 11
hgLoadBed danRer1 bacEndSingles bacEndSinglesGoodScores.bed \
    -sqlTable=../singles/bacEndSingles.sql
# Loaded 179170 elements of size 11
# 179170 record(s), 0 row(s) skipped, 23 warning(s) loading bed.tab
# featureBits danRer1 bacEndPairs gap
# 18271004 bases of 1459132082 (1.252%) in intersection
# featureBits danRer1 bacEndSingles gap
# 2718421 bases of 1459132082 (0.186%) in intersection
# featureBits danRer1 bacEndSingles bacEndPairs
# 158155304 bases of 1459132082 (10.839%) in intersection
# loaded bacEndPairsBad table (hartera, 2004-08-02)
hgLoadBed danRer1 bacEndPairsBad bacEndPairsBadGoodScores.bed \
    -sqlTable=$HOME/kent/src/hg/lib/bacEndPairsBad.sql
# Loaded 178304 elements of size 11
# 178304 record(s), 0 row(s) skipped, 23 warning(s) loading bed.tab
# From getBacEndInfo.pl, created a bacEndAccs.aliases file.
# This has the format: alias, identification number, accession (trueName),
# where trueName is the accession provided by Sanger in
# BACEnd_accessions.txt and the aliases are the read names for sequences.
# Create a bacEndAlias table.
cat << '_EOF_' > $HOME/kent/src/hg/lib/bacEndAlias.as
table bacEndAlias
"BAC ends aliases and associated identification numbers"
    (
    string alias;    "BAC end read name"
    uint identNo;    "Identification number of BAC End"
    string acc;      "GenBank accession for the BAC End"
    )
'_EOF_'
# << this line makes emacs coloring happy
cd $HOME/kent/src/hg/lib
autoSql bacEndAlias.as bacEndAlias
cp bacEndAlias.h ../inc
# edit bacEndAlias.sql to add INDEX(identNo) to the indices
# add and commit to cvs bacEndAlias.as, .sql, .c and .h
ssh hgwdev
cd /cluster/data/danRer1/bed/ZonLab/bacends/bacends.1
echo "drop table bacEndAlias" | hgsql danRer1
hgsql danRer1 < $HOME/kent/src/hg/lib/bacEndAlias.sql
echo "load data local infile 'bacEndAccs.aliases' into table bacEndAlias"\
    | hgsql danRer1
# Add a xref table to give external clone registry names, internal names
# and Genbank accessions - this table is expanded later, see further down
# in the documentation.
cat << '_EOF_' > $HOME/kent/src/hg/lib/bacCloneXRef.as
table bacCloneXRef
"BAC clones external names, internal sequence names and Genbank accessions"
    (
    string extName;    "External name for BAC clone"
    string intName;    "Internal sequencing name for BAC clone"
    string acc;        "Genbank accession for the BAC End"
    )
'_EOF_'
# << this line makes emacs coloring happy
cd $HOME/kent/src/hg/lib
autoSql bacCloneXRef.as bacCloneXRef
cp bacCloneXRef.h ../inc
# edit bacCloneXRef.sql to add INDEX(extName) to the indices
# add and commit to cvs bacCloneXRef.as, .sql, .c and .h
cd /cluster/data/danRer1/bed/ZonLab/bacends
# use zfish_accs.txt provided by the Sanger Centre to create a list
# of BAC clone names, internal names and Genbank accessions
perl getBACInfo.pl zfish_accs.txt > bacCloneXRef.txt
# there are 35 clones with different prefixes to those in our track;
# these are in BACerror.log and are not loaded into the XRef table
echo "drop table bacCloneXRef" | hgsql danRer1
hgsql danRer1 < $HOME/kent/src/hg/lib/bacCloneXRef.sql
echo "load data local infile 'bacCloneXRef.txt' into table bacCloneXRef"\
    | hgsql danRer1
mv bacCloneXRef.txt ./bacends.1
# edit doLinkedFeaturesSeries in hgc.c to use this information
# so that links to clone and clone end accessions can be made on the
# details pages.

# ALIGNED BAC END SEQUENCES TO CHECK THAT ONES WITH SIMILAR NAMES
# ARE READS FROM THE SAME SEQUENCES.
# Also, the sequences ending in .z or .Z and .y or .Y were of unknown
# direction. Sanger thought that z may be forward, but aligning the
# sequences suggested that z is actually reverse, given that SP6
# is a forward sequence and T7 is reverse.
# Anthony DiBiase wrote a script, countns.pl, to filter out sequences with
# too many Ns; the threshold is 80% Ns
cat bacends.fa | perl countns.pl > bacends.percentNs80
grep '>' bacends.percentNs80 | sed -e 's/>//' > bacends.remove
perl removeNames.pl ./bacends.1/bacEnds.names bacends.remove \
    > bacends.noseqswithlotsNs
# get these good sequences with < 80% Ns
$HOME/bin/i386/faSomeRecords bacends.fa \
    bacends.noseqswithlotsNs bacends_good.fa
# blast
ssh kkr1u00
mkdir -p /iscratch/i/danRer1/bacends/clusterBlastdb
cp /cluster/data/danRer1/bed/ZonLab/bacends/bacends_good.fa \
    /iscratch/i/danRer1/bacends/clusterBlastdb
cd /iscratch/i/danRer1/bacends/clusterBlastdb
# do all against all blast
/scratch/blast/formatdb -i bacends_good.fa -t bacendsgood \
    -n bacendsgood -p F
mkdir -p /iscratch/i/danRer1/bacends/clusterBlast
faSplit sequence /cluster/data/danRer1/bed/ZonLab/bacends/bacends_good.fa \
    500 /iscratch/i/danRer1/bacends/clusterBlast/bacendsgood
iSync
# do kilokluster run
ssh kk
cd /cluster/data/danRer1/bed/ZonLab/bacends
mkdir clusterBlast
cd clusterBlast
mkdir blastout
ls -1S /iscratch/i/danRer1/bacends/clusterBlast/*.fa > bacs.lst
ls -1S /iscratch/i/danRer1/bacends/clusterBlastdb/*.nsq \
    | sed "s/.nsq//" > bacsBlastdb.lst
# Make blast script
cat > blastSome << end
#!/bin/csh
setenv BLASTMAT /scratch/blast/data
/scratch/blast/blastall -p blastn -d \$1 -i \$2 -o \$3 -e 0.0
end
# << this line keeps emacs coloring happy
chmod a+x blastSome
# Make gensub2 file
cat << '_EOF_' > blastGsub
#LOOP
blastSome {check in line+ $(path1)} {check in line+ $(path2)} {check out line blastout/$(root2)_$(root1).out}
#ENDLOOP
'_EOF_'
gensub2 bacsBlastdb.lst bacs.lst blastGsub blastSpec
para create blastSpec
para try, check, push, ...
# convert the blast output to psl
mkdir psl
foreach f (blastout/*.out)
    /cluster/bin/i386/blastToPsl $f psl/$f:t:r.psl >>& blastToPsl.log
end
pslSort dirs raw.psl tmp psl
mv raw.psl clusterBacEnds.psl
awk '{print $10, $14, $9;}' clusterBacEnds.psl > clusterBacEnds.out
# look at blast results for e-value threshold of 0

# ADD ALIASES FOR BAC ENDS (DONE, 2005-04-25, hartera)
# 2005-04-25 (hartera) add more aliases for BAC ends and the UniSTS ID;
# add this to the bacCloneXRef table.
# Changes to bacAlias table and alias searches (DONE, 2005-05-17, hartera) # download the files of aliases from Sanger. URL for ftp site from # Sean Humphray (sjh@sanger.ac.uk) ssh kksilo mkdir -p /cluster/data/danRer1/bed/ZonLab/bacends/cloneandStsAliases cd /cluster/data/danRer1/bed/ZonLab/bacends/cloneandStsAliases wget --timestamp \ ftp://ftp.sanger.ac.uk/pub/human/zebrafish/ZFIN/README wget --timestamp \ ftp://ftp.sanger.ac.uk/pub/human/zebrafish/ZFIN/clonemarkers.02.12.04.txt wget --timestamp \ ftp://ftp.sanger.ac.uk/pub/human/zebrafish/ZFIN/ctgnames.02.12.04.txt wget --timestamp \ ftp://ftp.sanger.ac.uk/pub/human/zebrafish/ZFIN/markers.02.12.04.txt ssh hgwdev cd /cluster/data/danRer1/bed/ZonLab/bacends/cloneandStsAliases hgsql -N -e 'select name from bacEndPairs;' danRer1 > bacEnds.names hgsql -N -e 'select name from bacEndSingles;' danRer1 >> bacEnds.names hgsql -N -e 'select name from bacEndPairsBad;' danRer1 >> bacEnds.names sort bacEnds.names | uniq > bacEnds.names.uniq # 172990 unique names # 594614 bacEnd sequences to start with # some of the bacEnd sequences are replicates so count lfNames from table hgsql -N -e 'select lfNames from bacEndPairs;' danRer1 > bacEnds.lfNames hgsql -N -e 'select lfNames from bacEndSingles;' danRer1 >> bacEnds.lfNames hgsql -N -e 'select lfNames from bacEndPairsBad;' danRer1 >> bacEnds.lfNames # need to remove "," and put one name per line perl -pi.bak -e 's/,/\n/g' bacEnds.lfNames sort bacEnds.lfNames | uniq > bacEnds.lfNames.uniq # 337751 bacEnds.lfNames.uniq # pslCheck has 2736 bad alignments # from psl file awk '{print $10;}' ../bacEnds.psl > bacEndsPsl.names # remove first few lines with no names sort bacEndsPsl.names | uniq > bacEndsPsl.names.uniq # 457560 bacEndsPsl.names.uniq # about 77% align - this is after filtering # Add an alias table for BAC clones # Replaced the name field with a Sanger Name field so that aliases for # that STS may be printed out in a table on the BAC Ends tracks # description pages (2005-05-04, hartera) cat << '_EOF_' > $HOME/kent/src/hg/lib/bacCloneAlias.as table bacCloneAlias "BAC clone associated STS aliases and Sanger STS names" ( string alias; "STS aliases associated with BAC clones" string sangerName; "Sanger STS name" ) '_EOF_' # << this line makes emacs coloring happy cd $HOME/kent/src/hg/lib autoSql bacCloneAlias.as bacCloneAlias cp bacCloneAlias.h ../inc # edit bacCloneAlias.sql to add INDEX(alias) to indices # edit bacCloneAlias.sql to add INDEX(sangerName) to indices # remove primary key, can not make a compound primary key by # adding sangerName column since not all the BAC clones have this # and it is not unique, more than one BAC clone can have the same # Sanger STS name. 
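# Once the bacCloneXRef table (defined next) is loaded, this one-to-many
# relationship can be checked directly; e.g. (illustration only) to list
# Sanger STS names shared by several clones:
hgsql -N danRer1 -e \
    'select sangerName, count(*) as n from bacCloneXRef where sangerName is not null group by sangerName having n > 1 order by n desc limit 10;'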
# Add a xref table to give external clone registry names, internal names,
# sanger name, the relationship between STS and BAC clone (method of
# finding the STS), UniSTS ID, chromosome(s) to which the BAC clone is
# mapped by BLAT, Genbank accession and STS primer sequences
cat << '_EOF_' > $HOME/kent/src/hg/lib/bacCloneXRef.as
table bacCloneXRef
"BAC clones and corresponding STS information"
    (
    string name;         "External name for BAC clone"
    string intName;      "Internal Sanger FPC name"
    string chroms;       "Chromosome(s) to which one or both BAC ends are mapped by BLAT"
    string genbank;      "Genbank accession for the BAC Clone"
    string sangerName;   "Sanger STS name"
    uint relationship;   "Relationship type - method of finding STS"
    string uniStsId;     "UniSTS ID(s) for STS"
    string leftPrimer;   "STS 5' primer sequence"
    string rightPrimer;  "STS 3' primer sequence"
    )
'_EOF_'
# << this line makes emacs coloring happy
cd $HOME/kent/src/hg/lib
autoSql bacCloneXRef.as bacCloneXRef
cp bacCloneXRef.h ../inc
# edit bacCloneXRef.sql to add INDEX(name) and remove the primary key.
# Add indices: INDEX(intName), INDEX(sangerName) for searches (2005-05-20)
# edit so that all fields except the relationship field can be null
# add and commit to cvs bacCloneXRef.as, .sql, .c and .h
ssh kksilo
cd /cluster/data/danRer1/bed/ZonLab/bacends/cloneandStsAliases
wc -l *.02.12.04.txt
#  28305 clonemarkers.02.12.04.txt
# 167441 ctgnames.02.12.04.txt
#  12008 markers.02.12.04.txt
cp ../getBACInfo.pl getBACInfo2.pl
ssh hgwdev
cd /cluster/data/danRer1/bed/ZonLab/bacends/cloneandStsAliases
# create a list of the BAC clones in the tables with the chromosome each
# aligns to
hgsql -N -e 'select name, chrom from bacEndPairs;' danRer1 \
    > bacClones.namesandchrom
hgsql -N -e 'select name, chrom from bacEndSingles;' danRer1 \
    >> bacClones.namesandchrom
sort bacClones.namesandchrom | uniq > bacClones.namesandchrom.uniq
# use zfish_accs.txt provided by the Sanger Centre to create a list
# of BAC clone names, internal names and Genbank accessions
# extend getBACInfo.pl to add the extended information (getBACInfo2.pl)
perl getBACInfo2.pl zfish_accs.txt > bacCloneXRef.txt
# write a program to create the bacCloneAlias.tab and bacCloneXRef.tab
# files to load into the tables - zfishBacEndsandSts.c
# in $HOME/kent/src/hg/zfishBacEndsandSts
ssh kksilo
# use ssh kkstore01 to replace kksilo (2005-05-04)
cd /cluster/data/danRer1/bed/ZonLab/bacends/cloneandStsAliases
# get a list of UniSTS IDs, using the aliases to search the alias file;
# print Sanger name, alias and UniSTS ID. use print_markers2.pl, a
# modified version of a program written by Tony DiBiase (Boston
# Children's Hospital)
# create find_markers3.pl, as the previous version was finding too
# many UniSTS IDs for some markers since it used grep (2005-05-23, hartera)
cat << '_EOF_' > find_markers3.pl
# example:
# perl find_markers3.pl UniSTS.aliases markers.02.12.04.txt
use strict;
my $verbose = 0;
my ($a, $b, $f, $m, $s, $t, $aliases, @alias, @rest);
my $aliasFile = $ARGV[0];
my $markersFile = $ARGV[1];
open(ALIAS, $aliasFile) || die "Can not open $aliasFile\n";
open(MARKERS, $markersFile) || die "Can not open $markersFile\n";
# store aliases from aliasFile, keyed by alias
my ($id, $al, @alsArray, %aliasHash);
while (<ALIAS>) {
    chomp;
    ($id, $al) = split /\t/;
    @alsArray = split(/;/, $al);
    foreach my $as (@alsArray) {
        push (@{$aliasHash{$as}}, $id);
    }
}
close ALIAS;
while (<MARKERS>) {
    my @idArray;
    ($f, $t, $m, $idArray[0]) = (0, 0, 0, 0);
    my @ids;
    chomp;
    ($a, $b, $aliases, @rest) = split /\|/;
    if ($verbose > 3) {
        printf "aliases $aliases \n";
    }
    @alias = split /;/, $aliases;
    ALIAS: foreach $s (@alias) {
        # only look up aliases that are not purely numeric
        if ($s =~ /[\D]+/) {
            if ($verbose > 5) {
                printf "this $s \n";
            }
            if (exists($aliasHash{$s})) {
                @idArray = @{$aliasHash{$s}};
            }
            if ($idArray[0]) {
                $f = 1;
                $t = $s;
                @ids = @idArray;
                if ($verbose) {
                    printf "this $s found $m \n";
                }
                last ALIAS;
            }
        }
    }
    if ($f) {
        my @sNames = split(/;/, $b);
        foreach my $sn (@sNames) {
            foreach my $i (@ids) {
                printf "$sn\t$i\n";
            }
        }
    }
}
close MARKERS;
'_EOF_'
chmod +x find_markers3.pl
perl find_markers3.pl /cluster/data/ncbi/UniSTS.2005-04-12/UniSTS.aliases \
    markers.02.12.04.txt > sangerandUniSTSId.txt
wc -l sangerandUniSTSId.txt
# No need to reformat this for zfishBacClonesandSts
# FPC contig information (i.e. the FPC contig number) from the ctgnames
# file is not included in the tables, as these are dynamic and constantly
# changing with the assembly.
# made code changes so that zfishBacClonesandSts always uses NULL
# in the bacCloneXRef table if there is no UniSTS ID, and then
# reloaded the bacCloneXRef table with the resulting tab file
# (hartera, 2005-04-27)
# edit zfishBacClonesandSts to have the aliases and Sanger STS name
# in the bacCloneXRef.tab output file and re-run the program.
# this means that aliases may be listed by sangerName on the
# details page, and searches for aliases can still be made using
# a join to the XRef table to get the external name to find the
# position in the relevant bacEnds table.
# (hartera, 2005-05-12)
# Added another argument so that the output directory must be specified,
# and fixed a bug since not all the BAC clone names for each Sanger STS
# name were being stored. Changed zfishBacClonesandSts.c so that the
# internal (Sanger) BAC clone names are not added to the alias table.
# These can be found in the bacCloneXRef table: there are only
# 12987 internal BAC clone names that have Sanger names, and over
# 200,000 more internal names in the bacCloneXRef table. Re-ran the
# program and re-loaded the tables.
# Also use the updated version of the accessions file for BAC clones.
# (hartera, 2005-05-20)
# Received a new version of the zfish_accs.txt file:
# zfish_accs050605.txt (from Mario Caccamo at Sanger); move this to
# /cluster/data/danRer2/bed/ZonLab/bacends/cloneandStsAliases
# Created a merged file of accessions from this file and zfish_accs.txt -
# see makeDanRer1.doc; the new version is zfish_accsMerged.txt
# Re-ran zfishBacClonesandSts after increasing NUMALIASES, used for setting
# an array size, as not all the aliases were being stored. Also, the
# Sanger name and UniSTS ID file was re-created, as it had incorrect
# UniSTS IDs for some Sanger names (as many as 200), and a bug was fixed
# where the correct extName was not being added to the extNameHash in one
# case. (hartera, 2005-05-24)
set dir=/cluster/data/danRer2/bed/ZonLab/bacends/cloneandStsAliases
cp $dir/zfish_accsMerged.txt .
mkdir -p /cluster/bluearc/danRer1/bacEnds/out
nice $HOME/bin/i386/zfishBacClonesandSts ctgnames.02.12.04.txt \
    clonemarkers.02.12.04.txt markers.02.12.04.txt \
    bacClones.namesandchrom.uniq zfish_accsMerged.txt \
    sangerandUniSTSId.txt /cluster/bluearc/danRer1/bacEnds/out \
    > zfishBacs.out
# output is in /cluster/bluearc/danRer1/bacEnds/out so copy it over
# sort alias file by sangerName
sort -k2 /cluster/bluearc/danRer1/bacEnds/out/bacAlias.tab \
    > bacAlias.sort.tab
cp /cluster/bluearc/danRer1/bacEnds/out/bacXRef.tab .
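# After the tables are (re)loaded below, an STS alias can be resolved to
# BAC clone positions with a join along these lines (illustration only;
# "Z1396" is a made-up alias):
hgsql -N danRer1 -e \
    'select p.chrom, p.chromStart, p.chromEnd, x.name from bacCloneAlias a, bacCloneXRef x, bacEndPairs p where a.alias = "Z1396" and x.sangerName = a.sangerName and p.name = x.name;'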
wc -l *.tab
#  55836 bacAlias.sort.tab
# 234056 bacXRef.tab
ssh hgwdev
cd /cluster/data/danRer1/bed/ZonLab/bacends/cloneandStsAliases
# Remove the bacCloneAlias table and add a new table with a sangerName
# field instead of the external name field.
# Re-load the data including the Sanger STS name for the associated STS
# markers. (2005-05-05, hartera)
# Re-load the data after the bug fix, using the new BAC clone accessions,
# without adding internal (Sanger) BAC clone names to the alias table, and
# using indices for name, intName and sangerName for searches.
# (2005-05-20, hartera)
# Reload the tables after using the new version of zfishBacClonesandSts
# with the corrected mapping of Sanger names to UniSTS IDs and the fix for
# the case where an incorrect extName was being stored for an internal
# name. (hartera, 2005-05-24)
hgsql danRer1 -e 'drop table bacCloneXRef'
hgsql danRer1 -e 'drop table bacCloneAlias'
hgsql danRer1 < $HOME/kent/src/hg/lib/bacCloneAlias.sql
hgsql danRer1 < $HOME/kent/src/hg/lib/bacCloneXRef.sql
nice hgsql danRer1 -e \
    'load data local infile "bacAlias.sort.tab" into table bacCloneAlias'
nice hgsql danRer1 -e \
    'load data local infile "bacXRef.tab" into table bacCloneXRef'
# checked the data in the tables to verify that everything was loaded
# correctly from the files; errors found were corrected - see the TEST
# section below (DONE, 2005-05-24, hartera)
# Remove incorrect row from bacCloneXRef table (hartera, 2006-04-19)
# There is one row where the name is "CH211-155M11__" and the
# intName is "zC155M11__", which is incorrect.
# There is a correct row for intName zC155M11 and zC155M11__, and
# CH211-155M11__ is not in the bacEnd{Singles,Pairs,PairsBad} tables and
# zC155M11__ is not an alias in bacEndAlias.
hgsql -e 'delete from bacCloneXRef where name = "CH211-155M11__";' danRer1
cd $HOME/kent/src/hg/makeDb/trackDb/zebrafish/danRer1
# edit trackDb.ra to add searches for the alias table
# added dontCheckXrefQueryFormat so that a table join can now be performed
# (2005-05-17, hartera)
# corrected termRegex for some bacCloneXRef searches so that they work
# correctly (bacPairsSangerSts and bacSinglesSangerSts)
# (2006-04-19, hartera)
# searchName bacPairsAliases
# searchTable bacEndPairs
# searchMethod exact
# searchType bed
# searchBoth 1
# xrefTable bacCloneAlias
# dontCheckXrefQueryFormat .
# xrefQuery select bacCloneXRef.name,alias from %s as bca, bacCloneXRef where alias like '%s' and bacCloneXRef.sangerName = bca.sangerName
# searchPriority 9

# searchName bacSinglesAliases
# searchTable bacEndSingles
# searchMethod exact
# searchType bed
# searchBoth 1
# xrefTable bacCloneAlias
# dontCheckXrefQueryFormat .
# xrefQuery select bacCloneXRef.name,alias from %s as bca, bacCloneXRef where alias like '%s' and bacCloneXRef.sangerName = bca.sangerName
# searchPriority 9

# add new searches to search the added sangerName field of the
# bacCloneXRef table (2005-05-17, hartera)
# searchName bacPairsSangerSts
# searchTable bacEndPairs
# searchMethod exact
# searchType bed
# xrefTable bacCloneXRef
# xrefQuery select name, sangerName from %s where sangerName like '%s'
# termRegex (et|st)ID[0-9]+\.[0-9]+
# searchPriority 10

# searchName bacSinglesSangerSts
# searchTable bacEndSingles
# searchMethod exact
# searchType bed
# xrefTable bacCloneXRef
# xrefQuery select name, sangerName from %s where sangerName like '%s'
# termRegex (et|st)ID[0-9]+\.[0-9]+
# searchPriority 9
# commit trackDb.ra to CVS
# make code changes to $HOME/kent/src/hg/hgc/hgc.c to display some of the
# alias and STS information from the bacCloneXRef and bacCloneAlias tables
# on the BAC ends tracks details pages. Update hgc.c to show the aliases
# associated with each Sanger STS name listed for a BAC clone rather than
# just a list of all aliases associated with that BAC clone
# (2005-05-17, hartera).
# add searches for the internal (Sanger) BAC clone name (2005-05-20, hartera):
# searchName bacPairsIntName
# searchTable bacEndPairs
# searchMethod exact
# searchType bed
# xrefTable bacCloneXRef
# xrefQuery select name,intName from %s where intName like '%s'
# termRegex (zC|zK|bZ|dZ|bY)[[:alnum:]]+
# searchPriority 8.5

# searchName bacSinglesIntName
# searchTable bacEndSingles
# searchMethod exact
# searchType bed
# xrefTable bacCloneXRef
# xrefQuery select name,intName from %s where intName like '%s'
# termRegex (zC|zK|bZ|dZ|bY)[[:alnum:]]+
# searchPriority 8.5

# BACENDS: TESTING FOR bacCloneAlias AND bacCloneXRef TABLES
# (DONE, 2005-05-24, hartera)
# ADDED TEST TO CHECK SANGER NAMES IN ALIAS AND XREF TABLES
# (DONE, 2005-05-26, hartera)
# The following tests were carried out to check that all the data
# in the bacCloneAlias and bacCloneXRef tables is correct.
ssh hgwdev
mkdir -p \
    /cluster/data/danRer1/bed/ZonLab/bacends/cloneandStsAliases/testTables
cd /cluster/data/danRer1/bed/ZonLab/bacends/cloneandStsAliases/testTables
# Check that the correct aliases are associated with their Sanger STS names
awk 'BEGIN {FS="|"} {OFS="\t"} {print $2, $3;}' \
    ../markers.02.12.04.txt > sNameandaliases
# write a script to get one Sanger name and one alias on each line
chmod +x getSangerAndAlias.pl
perl getSangerAndAlias.pl < sNameandaliases > sNameandaliases.format
sort sNameandaliases.format | uniq > sNameandaliases.sort
hgsql -N -e 'select sangerName, alias from bacCloneAlias;' danRer1 \
    | sort | uniq > alias.db.sort
wc -l alias.db.sort
# 55836 alias.db.sort
diff sNameandaliases.sort alias.db.sort
# some differences
awk '{print $1;}' sNameandaliases.sort | sort | uniq -c | sort -nr \
    > sNameandaliases.sNameCount
# the most is 216:
# 216 etID22623.3
# 154 stID49848.11
#  61 etID49717.20
#  58 etID35364.24
#  39 etID45086.3
# extend NUMALIASES from 50 to 250 in zfishBacClonesandSts.c - not all the
# aliases for some Sanger names were being stored.
# re-run the program, reload the tables and test again - now ok.
# Check that the Sanger STS names correspond in the bacCloneAlias and
# bacCloneXRef tables (2005-05-26, hartera)
# get Sanger names from the alias table
hgsql -N -e 'select sangerName from bacCloneAlias;' danRer1 \
    | sort | uniq > sName.alias.sort
wc -l sName.alias.sort
# 14940 sName.alias.sort
# get Sanger names from the xRef table
hgsql -N -e 'select sangerName from bacCloneXRef where sangerName \
    is not null;' danRer1 | sort | uniq > sName.xRef.sort
wc -l sName.xRef.sort
# 15153 sName.xRef.sort
comm -23 sName.alias.sort sName.xRef.sort
# nothing is unique to the alias table, so all Sanger names in the alias
# table are also in the xRef table
comm -13 sName.alias.sort sName.xRef.sort > sNamexRefNotAlias
wc -l sNamexRefNotAlias
# 213 sNamexRefNotAlias - 213 sangerNames in the xRef table that are not
# in the alias table
# get Sanger names from the clonemarkers file
awk 'BEGIN {FS="|"}{print $2}' ../clonemarkers.02.12.04.txt | sort | uniq \
    > clonemarkers.sNames.sort
# get Sanger names from the markers file
awk 'BEGIN {FS="|"}{print $2}' ../markers.02.12.04.txt > markers.sNames
# remove semi-colons and sort
sed -e 's/;/\n/g' markers.sNames | sort | uniq > markers.sNames.sort
# sanger names unique to the markers file
comm -13 clonemarkers.sNames.sort markers.sNames.sort
# there are none
comm -23 clonemarkers.sNames.sort markers.sNames.sort \
    > sNames.clonemarkersOnly
wc -l sNames.clonemarkersOnly
# 213 sNames.clonemarkersOnly
diff sNames.clonemarkersOnly sNamexRefNotAlias
# no difference, so all the extra Sanger names in the xRef table are
# from the clonemarkers file; these have no aliases in the markers file,
# so they are not in the alias table, and this is all ok.
# Check that Sanger STS names and primers are associated correctly
# get sanger names and primers from the markers file
awk 'BEGIN {FS="|"} {OFS="\t"} {print $2, $4, $5;}' \
    ../markers.02.12.04.txt > sNameandPrimers
# write a script to reformat and write one Sanger name per line
chmod +x getSangerandPrimers.pl
perl getSangerandPrimers.pl < sNameandPrimers > sNameandPrimers.format
sort sNameandPrimers.format > sNameandPrimers.format.sort
wc -l sNameandPrim*
# 12008 sNameandPrimers
# 14940 sNameandPrimers.format
# 14940 sNameandPrimers.format.sort
# get Sanger names and primers from the database
hgsql -N -e \
    'select sangerName, leftPrimer, rightPrimer from bacCloneXRef \
    where sangerName is not null and leftPrimer is not null and \
    rightPrimer is not null;' danRer1 | sort | uniq \
    > sNamesandprimers.fromdb.sort
wc -l sNamesandprimers.fromdb.sort
# 14940 sNamesandprimers.fromdb.sort
diff sNamesandprimers.fromdb.sort sNameandPrimers.format.sort
# No difference so ok.
# Check that UniSTS IDs and Sanger names are associated correctly
# get Sanger names and UniSTS IDs from the database
hgsql -N -e 'select sangerName, uniStsId from bacCloneXRef where \
    uniStsId is not null;' danRer1 | sort | uniq > sNameUniSTS.fromdb.sort
wc -l sNameUniSTS.fromdb.sort
# 5456 sNameUniSTS.fromdb.sort
# Need to reformat sNameUniSTS.fromdb.sort
chmod +x formatUniSts.pl
perl formatUniSts.pl < sNameUniSTS.fromdb.sort | sort \
    > sNameUniSTS.fromdb.format.sort
# sort the file of Sanger names and UniSTS IDs generated by find_markers2.pl
sort ../sangerandUniSTSId.format > sangerandUniSTSId.format.sort
diff sangerandUniSTSId.format.sort sNameUniSTS.fromdb.format.sort \
    > sangerandUniSTSIdvsdb
# some differences were found in the original file.
# See how many UniSTS IDs there are for each Sanger name
awk '{print $1}' ../sangerandUniSTSId.format | sort | uniq -c | sort -nr \
    > sangerandUniSTSId.count
# 200 etID309730.9
#  84 etID138038.17
#  11 etID315128.1
#  11 etID315081.10
#  10 etID309656.22
#   9 etID22068.24
#   8 etID309667.22
# this is not right: the UniSTS IDs found for etID309730.9 are not correct.
# need to change the program that finds the UniSTS IDs for the Sanger names.
# rewrote find_markers3.pl and recreated this file as sangerandUniSTSId.txt
awk '{print $1}' ../sangerandUniSTSId.txt | sort | uniq -c | sort -nr \
    > sangerandUniSTSId.count2
# the most is now 3 UniSTS IDs:
# 3 etID9056.23
# 3 etID9042.2
# 3 etID8627.2
# 3 etID8281.9
# 3 etID11096.5
# 2 etID9986.14
# 2 etID9968.17
# re-check as above after reloading the data in the database - all ok now
sort ../sangerandUniSTSId.txt > sangerandUniSTSId.txt.sort
diff sangerandUniSTSId.txt.sort sNameUniSTS.fromdb.format.sort \
    > sangerandUniSTSIdvsdb
# Check that the chrom mappings and external BAC clone names are correct
# get extNames and the chroms they map to from the database
hgsql -N -e 'select name, chroms from bacCloneXRef where \
    chroms is not null;' danRer1 | sort | uniq \
    > nameandchromsfromdb.sort
# reformat nameandchromsfromdb.sort
perl formatUniSts.pl < nameandchromsfromdb.sort | sort \
    > nameandchromsfromdb.format.sort
# compare extNames and chroms from the db to those in the data file
diff ../bacClones.namesandchrom.uniq nameandchromsfromdb.format.sort
# no difference - all ok
# Check Genbank accessions and internal BAC clone names
hgsql -N -e 'select intName,genbank from bacCloneXRef where \
    genbank is not null;' danRer1 | sort | uniq \
    > intNamesandAccs.fromdb.sort
# this should be a subset of zfish_accsMerged.txt - not all BAC clones
# listed here appear in either our BAC ends tracks or the markers files.
sort ../zfish_accsMerged.txt > zfish_accsMerged.sort
comm -23 intNamesandAccs.fromdb.sort zfish_accsMerged.sort
# zC212E7 BX324109
# this accession is for zC212E4
comm -13 intNamesandAccs.fromdb.sort zfish_accsMerged.sort > onlyinzfishAccs
wc -l onlyinzfishAccs
# 88 onlyinzfishAccs
# zC212E7
# extNameHash had no entry for zC212E4, and when queried with zC212E7
# it returned CH211-212E4 as the extName, which is incorrect.
# fixed the bug in zfishBacClonesandSts.c, re-ran the program and reloaded
# the tables. re-checked as above: now only 87 are in onlyinzfishAccs, and
# all the internal names in the database are found in zfish_accsMerged.txt
# get all intNames that have NULL as the accession and check that none of
# these are in the zfish_accsMerged.txt file
hgsql -N -e 'select intName from bacCloneXRef where genbank is null;' \
    danRer1 | sort | uniq > intNamesNoAcc.fromdb.sort
awk '{print $1;}' zfish_accsMerged.sort | sort > intNames.withAccs.sort
comm -12 intNamesNoAcc.fromdb.sort intNames.withAccs.sort \
    > indbNoAccsandAccs.out
# no names are common to both, so all accessions from
# zfish_accsMerged.txt are in the database for the internal names stored,
# where accessions are available.
# Test that Sanger STS names, internal names and external names are all
# correct.
# Test that Sanger STS names and internal BAC clone names are associated
# correctly
# get internal names and Sanger names from the data file
awk 'BEGIN {FS="|"} {OFS="\t"} {print $1,$2}' ../clonemarkers.02.12.04.txt \
    | sort | uniq > intNameandSanger.sort
# get internal names and Sanger names from the database
hgsql -N -e 'select intName, sangerName from bacCloneXRef \
    where sangerName is not null;' danRer1 \
    | sort | uniq > intNameandSanger.fromdb.sort
diff intNameandSanger.sort intNameandSanger.fromdb.sort
# no difference so ok
# Check the BAC clone internal name and relationship fields
# get internal names and relationships from the data file
awk 'BEGIN {FS="|"} {OFS="\t"} {print $1,$3}' ../clonemarkers.02.12.04.txt \
    | sort | uniq > intNameandRelation.sort
# get internal names and relationships from the database
hgsql -N -e 'select intName, relationship from bacCloneXRef \
    where relationship != 0;' danRer1 \
    | sort | uniq > intNameandrelation.fromdb.sort
# differences unique to the database
comm -13 intNameandRelation.sort intNameandrelation.fromdb.sort \
    > intNameRelation.indbonly
# differences unique to the data file
comm -23 intNameandRelation.sort intNameandrelation.fromdb.sort \
    > intNameRelation.incloneMarkersonly
awk '{print $1}' intNameRelation.indbonly > intNameRelation.indbonly.names
awk '{print $1}' intNameRelation.incloneMarkersonly \
    > intNameRelation.incloneMarkersonly.names
diff intNameRelation.indbonly.names intNameRelation.incloneMarkersonly.names
# there is no difference in the internal names: the names match, and the
# only place these should differ is the second column, which should all
# be 3 in the data from the database only. This is because all the
# relationship entries that were blank in the clonemarkers file were
# changed to 3 when entered into the database.
awk '{print $2}' intNameRelation.indbonly | sort | uniq
# 3 - correct, so all ok
# all the differences should be that those that are blank in clonemarkers
# are 3 in the database
# check that those that have 0 in the database bacCloneXRef relationship
# field are not in the list from clonemarkers
# select the internal names with 0 in the relationship field from the db
hgsql -N -e 'select intName from bacCloneXRef where relationship = 0;' \
    danRer1 | sort | uniq > intNameNoRelation.fromdb.sort
# get all the internal names from the data file
awk 'BEGIN {FS="|"} {print $1}' ../clonemarkers.02.12.04.txt \
    | sort | uniq > intNamefromCloneMarkers.sort
comm -12 intNameNoRelation.fromdb.sort intNamefromCloneMarkers.sort
# nothing in common between these two files, as expected, so there are
# no internal names in the db with 0 in the relationship field that
# appear in the clonemarkers file.
# Check all BAC clone internal names and external names from the # ctgnames file are in the database # get intName and extName from ctgnames file awk 'BEGIN {FS="|"} {OFS="\t"} {print $2,$3}' ../ctgnames.02.12.04.txt \ | sort | uniq > intNameandextNamefromCtgNames.sort # get intName and extName from database hgsql -N -e 'select intName,name from bacCloneXRef;' danRer1 \ | sort | uniq > intNameandextName.fromdb.sort wc -l intNameandextName* # 218738 intNameandextName.fromdb.sort # 167441 intNameandextNamefromCtgNames.sort comm -12 intNameandextName.fromdb.sort intNameandextNamefromCtgNames.sort \ > intandextindbAndCtgNames wc -l intandextindbAndCtgNames # 167441 intandextindbAndCtgNames # there are 167441 name pairs common between the file and the database # and this is the same number of name pairs as in the data file diff intandextindbAndCtgNames intNameandextNamefromCtgNames.sort # no difference between those name pairs from the data file and those that # are common between the data file and the database so all internal and # external names from ctgNames file are in the database # get the list of extra ones from db comm -23 intNameandextName.fromdb.sort intNameandextNamefromCtgNames.sort \ > intandextNamesindbNotinCtgNames wc -l intandextNamesindbNotinCtgNames # 51297 intandextNamesindbNotinCtgNames # get list of internal names from the clonemarkers file awk 'BEGIN {FS="|"} {print $1}' ../clonemarkers.02.12.04.txt | sort | uniq \ > clonemarkers.intName.sort wc -l clonemarkers.intName.sort # 12987 clonemarkers.intName.sort # compare these intNames to those from the database not in the ctgnames file comm -12 clonemarkers.intName.sort intandextNamesindbNotinCtgNames # none of these clone markers internal names are in this list so they # must all be in the ctgnames file too. These extra internal names will be # translations of external names found in the list of mappings of BAC clones # to chroms. # Check that all the BAC clone external names from the list of chromosome # mappings and from the ctgnames file are in the database. 
# get all extNames from bacClones.namesandchrom.uniq and from ctgnames
awk '{print $1}' ../bacClones.namesandchrom.uniq > \
    extNames.ctgnamesandbacClones
awk 'BEGIN {FS="|"} {print $3;}' ../ctgnames.02.12.04.txt \
    >> extNames.ctgnamesandbacClones
wc -l extNames.ctgnamesandbacClones
# 386727 extNames.ctgnamesandbacClones
sort extNames.ctgnamesandbacClones | uniq \
    > extNames.ctgnamesandbacClones.sort
wc -l extNames.ctgnamesandbacClones.sort
# 218738 extNames.ctgnamesandbacClones.sort
# get extNames from the database
hgsql -N -e 'select name from bacCloneXRef;' danRer1 | sort | uniq \
    > extNames.fromdb.sort
wc -l extNames.fromdb.sort
# 218738 extNames.fromdb.sort
# find the extNames common to the data files and the database
comm -12 extNames.fromdb.sort extNames.ctgnamesandbacClones.sort \
    > extNames.fromdbandfiles
wc -l extNames.fromdbandfiles
# 218738 extNames.fromdbandfiles
diff extNames.fromdb.sort extNames.fromdbandfiles
# no difference, all extNames from the files are in the db
# Check that all BAC clone internal names from the ctgnames and clonemarkers
# files are in the database
# get internal names from the ctgnames and clonemarkers files
awk 'BEGIN {FS="|"} {print $2;}' ../ctgnames.02.12.04.txt \
    > intNames.ctgnamesandclonemarkers
awk 'BEGIN {FS="|"} {print $1;}' ../clonemarkers.02.12.04.txt \
    >> intNames.ctgnamesandclonemarkers
wc -l intNames.ctgnamesandclonemarkers
# 195746 intNames.ctgnamesandclonemarkers
sort intNames.ctgnamesandclonemarkers | uniq \
    > intNames.ctgnamesandclonemarkers.sort
wc -l intNames.ctgnamesandclonemarkers.sort
# 167441 intNames.ctgnamesandclonemarkers.sort
# get internal names from the database
hgsql -N -e 'select intName from bacCloneXRef;' danRer1 | sort | uniq \
    > intNames.fromdb.sort
wc -l intNames.fromdb.sort
# 218738 intNames.fromdb.sort
# some of these intNames are derived from the corresponding extNames;
# all of the intNames from the files should be in the db
comm -12 intNames.fromdb.sort intNames.ctgnamesandclonemarkers.sort \
    > intNames.fromdbandfiles
wc -l intNames.fromdbandfiles
# 167441 intNames.fromdbandfiles
diff intNames.fromdbandfiles intNames.ctgnamesandclonemarkers.sort
# no difference, all intNames from the files are in the db
# Check that all translations between BAC clone external and internal
# names are correct.
# write a script to get the prefixes from internal and external names
chmod +x getNamePrefixes.pl
hgsql -N -e 'select name, intName from bacCloneXRef;' danRer1 \
    | sort | uniq > extandintNames.fromdb.sort
perl getNamePrefixes.pl < extandintNames.fromdb.sort \
    > extandintNames.prefixes
sort extandintNames.prefixes | uniq > extandintNames.prefixes.uniq
# these all look good:
# BUSM1  dZ
# CH211  zC
# CH211  zc
# CH73   zH
# CT7    bP
# DKEY   zK
# DKEY   zk
# DKEYP  zKp
# RP71   bZ
# XX     bY
# zk is an internal name prefix for the external name prefix DKEY-. There
# is only one example where this is used (DKEY-81G7); it is in the
# ctgnames file and in the bacCloneXRef table, so that is ok.
# All data looks good in these tables now.
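# As an illustration of the internal-to-external translation checked above
# (not the code actually used - that logic lives in the perl scripts and
# zfishBacClonesandSts.c): drop the read suffix and dashes, strip the zeros
# immediately after the prefix, and map the prefix:
echo 'zC001-A03.za' \
    | sed -e 's/\..*$//' -e 's/-//g' \
          -e 's/^zKp0*/DKEYP-/' -e 's/^zK0*/DKEY-/' -e 's/^zC0*/CH211-/' \
          -e 's/^bZ0*/RP71-/' -e 's/^dZ0*/BUSM1-/'
# CH211-1A03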
# MAKE HGCENTRALTEST BLATSERVERS ENTRY FOR DANRER1 # (DONE, 2004-07-13, hartera) ssh hgwdev # DNA port is "0", trans prot port is "1" echo 'insert into blatServers values("danRer1", "blat8", "17779", "0", "0"); \ insert into blatServers values("danRer1", "blat8", "17778", "1", "0");' \ | hgsql -h genome-testdb hgcentraltest # if you need to delete those entries echo 'delete from blatServers where db="danRer1";' \ | hgsql -h genome-testdb hgcentraltest # to check the entries: echo 'select * from blatServers where db="danRer1";' \ | hgsql -h genome-testdb hgcentraltest # Remove Blat servers entries for danRer1 from blatServers table in # hgcentralbeta and hgcentral (see section on ARCHIVING danRer1) # (2006-05-02, hartera) # MAKE DOWNLOADABLE SEQUENCE FILES (DONE, 2004-07-20, hartera) # CORRECTION MADE TO chrM.agp FILE - MADE TAB DELIMITED INSTEAD OF # SPACE DELIMITED SO REPLACE OLD AGP FILE IN DOWNLOADS # (DONE, 2005-04-25, hartera) # UPDATED README FILES WITH NEW SEQUENCE FTP FOR SANGER (2005-08-04,hartera) ssh kksilo cd /cluster/data/danRer1 #- Build the .zip files cat << '_EOF_' > jkStuff/zipAll.csh rm -rf zip mkdir zip # chrom AGP's zip -j zip/chromAgp.zip [0-9A-Z]*/chr*.agp # chrom RepeatMasker out files zip -j zip/chromOut.zip */chr*.fa.out # soft masked chrom fasta zip -j zip/chromFa.zip */chr*.fa # hard masked chrom fasta zip -j zip/chromFaMasked.zip */chr*.fa.masked # chrom TRF output files cd bed/simpleRepeat zip ../../zip/chromTrf.zip trfMask/chr*.bed cd ../.. # get GenBank native mRNAs cd /cluster/data/genbank ./bin/i386/gbGetSeqs -db=danRer1 -native GenBank mrna \ /cluster/data/danRer1/zip/mrna.fa # get GenBank xeno mRNAs ./bin/i386/gbGetSeqs -db=danRer1 -xeno GenBank mrna \ /cluster/data/danRer1/zip/xenoMrna.fa # get native RefSeq mRNAs ./bin/i386/gbGetSeqs -db=danRer1 -native refseq mrna \ /cluster/data/danRer1/zip/refMrna.fa # get native GenBank ESTs ./bin/i386/gbGetSeqs -db=danRer1 -native GenBank est \ /cluster/data/danRer1/zip/est.fa cd /cluster/data/danRer1/zip # zip GenBank native and xeno mRNAs, native ESTs and RefSeq mRNAs zip -j mrna.zip mrna.fa zip -j xenoMrna.zip xenoMrna.fa zip -j refMrna.zip refMrna.fa zip -j est.zip est.fa '_EOF_' # << this line makes emacs coloring happy csh ./jkStuff/zipAll.csh |& tee ./jkStuff/zipAll.log cd zip #- Look at zipAll.log to make sure all file lists look reasonable. # Make upstream files and Copy the .zip files to # hgwdev:/usr/local/apache/... 
ssh hgwdev cd /cluster/data/danRer1/zip # make upstream files for zebrafish RefSeq featureBits danRer1 refGene:upstream:1000 -fa=upstream1000.fa zip upstream1000.zip upstream1000.fa featureBits danRer1 refGene:upstream:2000 -fa=upstream2000.fa zip upstream2000.zip upstream2000.fa #- Check zip file integrity: foreach f (*.zip) unzip -t $f > $f.test tail -1 $f.test end wc -l *.zip.test set gp = /usr/local/apache/htdocs/goldenPath/danRer1 mkdir -p $gp/bigZips cp -p *.zip $gp/bigZips mkdir -p $gp/chromosomes foreach f (../*/chr*.fa) zip -j $gp/chromosomes/$f:t.zip $f end cd $gp/bigZips md5sum *.zip > md5sum.txt cd $gp/chromosomes md5sum *.zip > md5sum.txt # Take a look at bigZips/* and chromosomes/*, update their README.txt's # Remake just the chromAgp.zip with correct chrM.agp (2005-04-25,hartera) ssh kksilo cd /cluster/data/danRer1 rm /cluster/data/danRer1/zip/chromAgp.zip # chrom AGP's zip -j zip/chromAgp.zip [0-9A-Z]*/chr*.agp ssh hgwdev cd /cluster/data/danRer1/zip #- Check zip file integrity: foreach f (chromAgp.zip) unzip -t $f > $f.test tail -1 $f.test end wc -l chromAgp.zip.test # looks good set gp = /usr/local/apache/htdocs/goldenPath/danRer1 rm $gp/bigZips/chromAgp.zip rm $gp/bigZips/md5sum.txt cp -p chromAgp.zip $gp/bigZips # go to directory with zip files and remake the md5sum.txt file cd $gp/bigZips md5sum *.zip > md5sum.txt # updated bigZips and chromosomes README.txt files with new ftp site # for the assembly sequence at Sanger (2005-08-04, hartera): # now ftp://ftp.ensembl.org/pub/assembly/zebrafish/Zv3release/ # MAKE VSHG17 DOWNLOADABLES (DONE, 2004-07-20, hartera) ssh hgwdev cd /cluster/data/danRer1/bed/blastz.hg17/axtChrom set gp = /usr/local/apache/htdocs/goldenPath/danRer1 mkdir -p $gp/vsHg17/axtChrom cp -p *.axt $gp/vsHg17/axtChrom cd $gp/vsHg17/axtChrom gzip *.axt md5sum *.gz > md5sum.txt cd /cluster/data/danRer1/bed/blastz.hg17/axtChain zip -j /cluster/data/danRer1/zip/hg17.chain.zip hg17.chain rm hg17.chain cp humanhg17.net hg17.net zip -j /cluster/data/danRer1/zip/hg17.net.zip hg17.net rm hg17.net cd $gp/vsHg17 mv /cluster/data/danRer1/zip/hg17*.zip . md5sum *.zip > md5sum.txt # Copy over & edit README.txt w/pointers to chain, net formats. # MAKE VSFR1 DOWNLOADABLES (DONE, 2004-07-26, hartera) ssh hgwdev cd /cluster/data/danRer1/bed/blastz.fr1/axtChrom set gp = /usr/local/apache/htdocs/goldenPath/danRer1 mkdir -p $gp/vsFr1/axtChrom cp -p *.axt $gp/vsFr1/axtChrom cd $gp/vsFr1/axtChrom gzip *.axt md5sum *.gz > md5sum.txt # add the axtNet *.axt in blastz.fr1/axtNet cd /cluster/data/danRer1/bed/blastz.fr1/axtNet set gp = /usr/local/apache/htdocs/goldenPath/danRer1 mkdir -p $gp/vsFr1/axtNet cp -p *.axt $gp/vsFr1/axtNet cd $gp/vsFr1/axtNet gzip *.axt md5sum *.gz > md5sum.txt cd /cluster/data/danRer1/bed/blastz.fr1/axtChain cp all.chain fr1.chain zip -j /cluster/data/danRer1/zip/fr1.chain.zip fr1.chain rm fr1.chain zip -j /cluster/data/danRer1/zip/fr1.net.zip fr1.net cd $gp/vsFr1 mv /cluster/data/danRer1/zip/fr1*.zip . md5sum *.zip > md5sum.txt # Copy over & edit README.txt w/pointers to chain, net formats. 
# TBLASTN HG16 KNOWN GENES TRACK (DONE, 2004-07-23, hartera)
# Collaboration with Anthony DiBiase and Yi Zhou of Leonard Zon's lab at
# the Children's Hospital, Boston - part of the Zebrafish Genome Initiative
# Protein set created by Anthony DiBiase: adibiase@enders.tch.harvard.edu
# tblastn of proteins from hg16 knownGenePep table as query
# against a database of repeatmasked Zv3 zebrafish assembly
# Originally received tblastn output in html format and a
# blastHg16.psl file which has an extra accession column at
# the beginning - this was created from the html
# Use perl script to move accession to the end of each row
ssh hgwdev
# Create a new .as for an extended psl table with a
# query ID column for the accession
cat << '_EOF_' > $HOME/kent/src/hg/lib/pslWQueryID.as
table pslWQueryID
"Summary info about a patSpace alignment with a query ID addition"
    (
    uint matches; "Number of bases that match that aren't repeats"
    uint misMatches; "Number of bases that don't match"
    uint repMatches; "Number of bases that match but are part of repeats"
    uint nCount; "Number of 'N' bases"
    uint qNumInsert; "Number of inserts in query"
    uint qBaseInsert; "Number of bases inserted in query"
    uint tNumInsert; "Number of inserts in target"
    uint tBaseInsert; "Number of bases inserted in target"
    char[2] strand; "+ or - for query strand. For mouse second +/- for genomic strand"
    string qName; "Query sequence name"
    uint qSize; "Query sequence size"
    uint qStart; "Alignment start position in query"
    uint qEnd; "Alignment end position in query"
    string tName; "Target sequence name"
    uint tSize; "Target sequence size"
    uint tStart; "Alignment start position in target"
    uint tEnd; "Alignment end position in target"
    uint blockCount; "Number of blocks in alignment"
    uint[blockCount] blockSizes; "Size of each block"
    uint[blockCount] qStarts; "Start of each block in query."
    uint[blockCount] tStarts; "Start of each block in target."
    string queryID; "query ID field"
    )
'_EOF_'
cd $HOME/kent/src/hg/lib/
autoSql pslWQueryID.as pslWQueryID
mv pslWQueryID.h ../inc
# edit pslWQueryID.sql to have:
# index(tName(32),tStart,tEnd) instead of PRIMARY KEY (matches)
# commit to CVS the .as, .sql, .h and .c files
ssh hgwdev
cd /cluster/data/danRer1/bed/ZonLab/blast
echo "drop table pslWQueryID" | hgsql danRer1
hgsql danRer1 < $HOME/kent/src/hg/lib/pslWQueryID.sql
# rename table to blastHg16KG
echo "alter table pslWQueryID rename blastHg16KG" | hgsql danRer1
# process data to add to the table
mkdir -p /cluster/data/danRer1/bed/ZonLab/blast
cd /cluster/data/danRer1/bed/ZonLab/blast
# put blastHg16.psl into this directory
perl formatPsl.pl < blastHg16.psl > blastHg16.format.tab
# use pslCheck to check the file after removing accessions
pslCheck blastHg16NoAcc.psl
# there are many errors: the psl file was not correctly formed.
# Easiest to re-run tblastn, get regular output, and use blastToPsl
# to get a correctly formatted psl file.
mkdir -p /cluster/data/danRer1/bed/ZonLab/tblastn
cd tblastn
# copy hg16_kg_proteinseq.fa here. This FASTA file was provided by
# Anthony DiBiase and it contains 40115 predicted protein sequences from
# hg16 known genes - from the knownGenePep table. There are 42026
# peptide sequences in hg16 knownGenePep. The FASTA sequences were
# obtained by merging annotation data from the kgXref table with the
# sequence and name found in the knownGenePep table.
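# For reference, a peptide FASTA of roughly this shape can be pulled
# straight from the hg16 tables. A minimal sketch, not the command
# Anthony used (his file also folded in kgXref annotation, which is
# omitted here):
#   hgsql -N -e "select name, seq from knownGenePep;" hg16 \
#       | awk '{print ">"$1"\n"$2}' > kgPep.fa
# The same select/awk pattern is used to load the knownGenePep
# sequences further below.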
# split known genes proteins into smaller files
ssh kkr1u00
mkdir -p /cluster/data/danRer1/bed/ZonLab/tblastn
cd tblastn
# copy hg16_kg_proteinseq.fa here
mkdir /iscratch/i/danRer1/hg16kgPep
faSplit sequence hg16_kg_proteinseq.fa 20 \
    /iscratch/i/danRer1/hg16kgPep/hg16kg
cd /iscratch/i/danRer1/
mkdir blastdb
cd blastdb
# make formatted database for each
foreach c (/iscratch/i/danRer1/trfFa/*.fa)
    set h = $c:t
    /scratch/blast/formatdb -i $c -t $h -n $h -p F
end
iSync
ssh kk
cd /cluster/data/danRer1/bed/ZonLab/tblastn
mkdir blastout
ls -1S /iscratch/i/danRer1/hg16kgPep/*.fa > hg16kg.lst
ls -1S /iscratch/i/danRer1/blastdb/*.nsq | sed "s/.nsq//" > zv3.lst
# Make blast script
cat > blastSome << end
#!/bin/csh
setenv BLASTMAT /scratch/blast/data
/scratch/blast/blastall -p tblastn -d \$1 -i \$2 -o \$3 -e 0.01
end
# << this line keeps emacs coloring happy
chmod a+x blastSome
# Make gensub2 file
cat << '_EOF_' > blastGsub
#LOOP
blastSome $(path1) {check in line+ $(path2)} {check out line blastout/$(root2)_$(root1).out}
#ENDLOOP
'_EOF_'
gensub2 zv3.lst hg16kg.lst blastGsub blastSpec
para create blastSpec
para try, check, push ... etc.
# use perl script to shorten the query names in the blast output
cat << '_EOF_' > shortenQueryName.pl
#!/usr/bin/perl -w
use strict;
my $found = "false"; # flag set when query line found
open(QUERY, ">>queries.log") || die "Can not create queries.log: $!";
while (<>) {
    if (/^(Query=\s?[A-Z]{1,2}[0-9]{4,})\|\w+/) {
        my $query = $1;
        print QUERY;   # log the original query line
        $found = "true";
        print "$query\n";
    }
    elsif ($found eq "true") {
        if (/^\s+\([0-9]+\s?letters\)/) {
            print;
        }
        else {
        }
        $found = "false";
    }
    else {
        print;
    }
}
'_EOF_'
foreach c (blastout/*.out)
    set t = $c:t
    set s = $t:r
    echo "Processing $c"
    perl shortenQueryName.pl < $c > blastout/${s}.format
end
foreach b (blastout/*.format)
    set t = $b:t
    set s = $t:r
    echo "Creating ${s}.psl"
    /cluster/bin/i386/blastToPsl $b psl/${s}.psl -scores=scores/${s}.scores \
        >>& blastToPsl.log
end
# need to filter so that only results with e-value <= 1e-100 are kept
# edit blastToPsl.c to add an e-value option so only results with
# e-values less than or equal to that value are kept for filtering
mkdir filteredBlast
cd filteredBlast
mkdir psl scores
foreach b (../blastout/*.format)
    set t = $b:t
    set s = $t:r
    echo "Creating ${s}.psl"
    $HOME/bin/i386/blastToPsl $b psl/${s}.psl -scores=scores/${s}.scores \
        >>& blastToPsl.log
end
mkdir -p /cluster/data/danRer1/bed/ZonLab/blast
pslSort dirs raw.psl tmp psl
liftUp -type=".psl" tblastnHg16KG.psl \
    /cluster/data/danRer1/jkStuff/liftAll.lft warn raw.psl
# add protein IDs to the qName
$HOME/bin/i386/protDat tblastnHg16KG.psl \
    /cluster/data/hg16/bed/blat.hg16KG/hg16KG.psl \
    /cluster/data/hg16/bed/blat.hg16KG/kg.mapNames tblastnHg16KGPep.psl
pslCheck tblastnHg16KGPep.psl
# psl is good
# load psl into database
hgLoadPsl danRer1 tblastnHg16KGPep.psl
# there are 173355 rows in the table
# need to do further filtering
# rename as tblastnHg16KGEval
hgsql -e "alter table tblastnHg16KGPep rename tblastnHg16KGEval;" danRer1
mv tblastnHg16KGPep.psl tblastnHg16KGEval.psl
foreach s (scores/*.scores)
    cat $s >> all.scores
end
sort -k2 all.scores > all.scores.sort
# filter by taking just the top 5 hits: for each protein query, take all
# the alignments associated with its top 5 hits to the whole zebrafish
# genome, using the scores directory created by blastToPsl after
# filtering at an e-value threshold of 1e-100.
# Heather wrote a Java program and used database tables to do this.
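# The same top-5 selection can also be sketched with sort/awk alone, with
# no database tables. Assumptions: all.scores has the columns of the
# blast table created below (qName in $2, tName in $5, score in $9) and,
# matching the ascending ORDER BY used in that approach, lower scores
# rank better:
#   sort -k2,2 -k9,9g all.scores \
#       | awk '$2 != q {q = $2; n = 0; split("", top)} \
#              !($5 in top) && n < 5 {top[$5] = 1; n++} \
#              $5 in top {print $2 "\t" $5}' \
#       | sort -u > qNametName.top5
# For each query this keeps the first 5 distinct targets seen in score
# order, then emits every (qName, tName) pair for those targets.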
# ScoreFilter.java is in ~kent/java/src/edu/ucsc/genome/qa/filter/
ssh hgwdev
cd /cluster/data/danRer1/bed/ZonLab/tblastn/filteredBlast/top5
hgsql -e "create database qa;" danRer1
cat << end > blast.sql
CREATE TABLE blast (
    strand char(2) not null,
    qName varchar(255) not null,
    qStart int unsigned not null,
    qEnd int unsigned not null,
    tName varchar(255) not null,
    tStart int unsigned not null,
    tEnd int unsigned not null,
    tSize int unsigned not null,
    score double not null
);
end
hgsql qa < blast.sql
echo "load data local infile 'all.scores' into table blast" \
    | hgsql qa
cat << end > blast2.sql
CREATE TABLE blast2 (
    strand char(2) not null,
    qName varchar(255) not null,
    qStart int unsigned not null,
    qEnd int unsigned not null,
    tName varchar(255) not null,
    tStart int unsigned not null,
    tEnd int unsigned not null,
    tSize int unsigned not null,
    score double not null
);
end
hgsql qa < blast2.sql
# load data into blast2 and sort
echo "insert into blast2 select * from blast order by qName, score, tName" \
    | hgsql qa
# create index
echo "create index qName on blast2(qName)" | hgsql qa
cat << end > blast3.sql
CREATE TABLE blast3 (
    strand char(2) not null,
    qName varchar(255) not null,
    qStart int unsigned not null,
    qEnd int unsigned not null,
    tName varchar(255) not null,
    tStart int unsigned not null,
    tEnd int unsigned not null,
    tSize int unsigned not null,
    score double not null
);
end
hgsql qa < blast3.sql
mkdir data
echo "select distinct(qName) from blast" | hgsql qa -N > data/idlist
wc -l data/idlist
# 1987 qNames
# then run the ScoreFilter program - this reads from blast2 and
# writes to blast3 and takes the list of qNames as input
source javaenv
javac ScoreFilter.java
java ScoreFilter qNames.blast.out
# Test case
echo "select score, tName from blast2 where qName = 'AB002324' limit 12;" \
    | hgsql qa
echo "select * from blast3 where qName = 'AB002324';" | hgsql qa
# get list of qNames and tNames from blast3 to query psl
echo "select qName, tName from blast3;" | hgsql qa -N \
    > data/qNametName.blast3.out
wc -l data/qNametName.blast3.out
# 5201 data/qNametName.blast3.out
# create psl table in qa
cat << end > pslBlast.sql
CREATE TABLE pslBlast (
    matches int unsigned not null,
    misMatches int unsigned not null,
    repMatches int unsigned not null,
    nCount int unsigned not null,
    qNumInsert int unsigned not null,
    qBaseInsert int not null,
    tNumInsert int unsigned not null,
    tBaseInsert int not null,
    strand char(2) not null,
    qName varchar(255) not null,
    qSize int unsigned not null,
    qStart int unsigned not null,
    qEnd int unsigned not null,
    tName varchar(255) not null,
    tSize int unsigned not null,
    tStart int unsigned not null,
    tEnd int unsigned not null,
    blockCount int unsigned not null,
    blockSizes longblob not null,
    qStarts longblob not null,
    tStarts longblob not null,
    index(tName(32),tStart,tEnd)
);
end
hgsql qa < pslBlast.sql
# cat together psl files
foreach s (../psl/*.psl)
    cat $s >> all.blast.psl
end
wc -l all.blast.psl
# 181405 all.blast.psl
# load psl data into table
echo "load data local infile 'all.blast.psl' into table pslBlast" \
    | hgsql qa
# check all data loaded, correct number of rows
# select all lines with qName and tName in data/qNametName.blast3.out
# from this pslBlast table
cat << 'end' > getSelectedPsl.pl
#!/usr/bin/perl -w
use strict;
while (<>) {
    chomp;
    my @names = split(/\t/);
    my ($qName, $tName) = @names;
    print STDERR "Querying pslBlast for query: $qName and target: $tName ...\n";
    system "hgsql -N -e 'select * from pslBlast where qName = \"$qName\" and tName = \"$tName\";' qa";
}
end
chmod +x getSelectedPsl.pl
perl getSelectedPsl.pl < data/qNametName.blast3.out > blastTop5.psl
# Need to remove long names
cat << 'end' > shortenNames.pl
#!/usr/bin/perl -w
use strict;
while (<>) {
    my $line = $_;
    chomp;
    my @row = split(/\t/);
    # qName is the 10th field in a psl file
    if ($row[9] =~ /\|/) {
        my @accs = split(/\|/, $row[9]);
        my $newAcc = $accs[0];
        foreach (my $i = 0; $i <= $#row; $i++) {
            if ($i == 9) {
                print "$newAcc\t";
            }
            elsif ($i == $#row) {
                print "$row[$i]\n";
            }
            else {
                print "$row[$i]\t";
            }
        }
    }
    else {
        print $line;
    }
}
end
chmod +x shortenNames.pl
perl shortenNames.pl < blastTop5.psl > blastTop5Format.psl
wc -l blastTop*
# 29091 blastTop5.psl
# 29091 blastTop5Format.psl
mkdir psl
mv blastTop5Format.psl ./psl
pslSort dirs blastTop5Sort.psl tmp psl
# lift up to chrom co-ordinates
liftUp -type=".psl" tblastnHg16KG.psl \
    /cluster/data/danRer1/jkStuff/liftAll.lft warn blastTop5Sort.psl
# add protein IDs to the qName
$HOME/bin/i386/protDat tblastnHg16KG.psl \
    /cluster/data/hg16/bed/blat.hg16KG/hg16KG.psl \
    /cluster/data/hg16/bed/blat.hg16KG/kg.mapNames tblastnHg16KGPep.psl
pslCheck tblastnHg16KGPep.psl
# psl is good
# load psl into database
hgLoadPsl danRer1 tblastnHg16KGPep.psl
# previous table of alignments filtered on 1e-100 only is tblastnHg16KGEval
# compare this to current tblastnHg16KGPep table with featureBits
# featureBits danRer1 tblastnHg16KGEval
# 2783971 bases of 1459132082 (0.191%) in intersection
# featureBits danRer1 tblastnHg16KGPep
# 2397991 bases of 1459132082 (0.164%) in intersection
# featureBits danRer1 refGene:cds tblastnHg16KGEval
# 246994 bases of 1459132082 (0.017%) in intersection
# featureBits danRer1 refGene:cds tblastnHg16KGPep
# 250847 bases of 1459132082 (0.017%) in intersection
# featureBits danRer1 tblastnHg16KGPepTop3
# 2223873 bases of 1459132082 (0.152%) in intersection
# featureBits danRer1 refGene:cds tblastnHg16KGPepTop3
# 236309 bases of 1459132082 (0.016%) in intersection
# load sequences into table. hg16_kg_proteinseq.fa has 2833 sequences
# less than in knownGenePep for hg16 - load all knownGenePep sequences
ssh hgwdev
cd /cluster/data/danRer1/bed/ZonLab/tblastn
echo "select * from knownGenePep;" | hgsql -N hg16 > knownGenePepHg16.out
# make FASTA file and add a "p" to accessions so hgc.c does not get
# confused with the Genbank sequences of the same name
awk '{ print ">"$1"p\n"$2}' knownGenePepHg16.out > knownGenePepHg16.fa
grep '>' knownGenePepHg16.fa | wc
# 42026 sequences
# Copy sequences to gbdb if they are not already there
mkdir -p /gbdb/hgFixed/tblastnKGHg16
ln -s /cluster/data/danRer1/bed/ZonLab/tblastn/knownGenePepHg16.fa \
    /gbdb/hgFixed/tblastnKGHg16
# load sequences
hgLoadSeq danRer1 /gbdb/hgFixed/tblastnKGHg16/knownGenePepHg16.fa
# try taking top 3 instead of top 5
ssh hgwdev
mkdir -p /cluster/data/danRer1/bed/ZonLab/tblastn/filteredBlast/top3
cd /cluster/data/danRer1/bed/ZonLab/tblastn/filteredBlast/top3
mkdir data
cp ../top5/data/idlist data/
cat << '_EOF_' > getTop3.csh
#!/bin/tcsh
set db = "qa"
set table = "blast3"
set acc = $1
hgsql -N -e 'select qName, tName from '$table' where qName = "'$acc'" order by score limit 3;' $db >> data/qNametName.blast3.out
'_EOF_'
# << this line makes emacs coloring happy
chmod +x getTop3.csh
cat << '_EOF_' > gsub
#LOOP
./getTop3.csh $(path1)
#ENDLOOP
'_EOF_'
# << this line makes emacs coloring happy
gensub2 data/idlist single gsub jobList.csh
chmod +x jobList.csh
jobList.csh
perl ../top5/getSelectedPsl.pl < data/qNametName.blast3.out > blastTop3.psl
perl ../top5/shortenNames.pl < blastTop3.psl > blastTop3Format.psl
wc -l blastTop*
# 19823 blastTop3.psl
# 19823 blastTop3Format.psl
mkdir psl
mv blastTop3Format.psl ./psl
pslSort dirs blastTop3Sort.psl tmp psl
# lift up to chrom co-ordinates
liftUp -type=".psl" tblastnHg16KGTop3.psl \
    /cluster/data/danRer1/jkStuff/liftAll.lft warn blastTop3Sort.psl
# add protein IDs to the qName
$HOME/bin/i386/protDat tblastnHg16KGTop3.psl \
    /cluster/data/hg16/bed/blat.hg16KG/hg16KG.psl \
    /cluster/data/hg16/bed/blat.hg16KG/kg.mapNames \
    tblastnHg16KGPepTop3.psl
pslCheck tblastnHg16KGPepTop3.psl
# psl is good
# load psl into database
hgLoadPsl danRer1 tblastnHg16KGPepTop3.psl
# previous table of alignments filtered on 1e-100 only is tblastnHg16KGEval
# featureBits danRer1 tblastnHg16KGPepTop3
# 2223873 bases of 1459132082 (0.152%) in intersection
# Decided to release the Top 5 hits with tblastnHg16KGPep.
# This could be improved upon. Brian Raney's method below is better at
# finding exons and reduces noise in the alignments.
# At a later date, only one of these tracks should be displayed.
# small changes -- sort by chrom, chromStart, chromEnd; add tName index
mkdir /cluster/data/danRer1/bed/ZonLab/tblastn/aug12
cd /cluster/data/danRer1/bed/ZonLab/tblastn/aug12
hgsqldump danRer1 tblastnHg16KGPep --no-data > tblastnHg16KGPepSorted.sql
# edit the table name in tblastnHg16KGPepSorted.sql to be
# tblastnHg16KGPepSorted; the indices are also not needed in
# tblastnHg16KGPepSorted.sql
hgsql danRer1 < tblastnHg16KGPepSorted.sql
hgsql danRer1
insert into tblastnHg16KGPepSorted
    select * from tblastnHg16KGPep order by tName, tStart, tEnd;
rename table tblastnHg16KGPep to tblastnHg16KGPepNotSorted;
rename table tblastnHg16KGPepSorted to tblastnHg16KGPep;
# would prefer dual indices, but convention is single
create index bin on tblastnHg16KGPep(bin);
create index tStart on tblastnHg16KGPep(tStart);
create index qName on tblastnHg16KGPep(qName);
create index tEnd on tblastnHg16KGPep(tEnd);
create index tName on tblastnHg16KGPep(tName);

# Human Proteins - tBLASTn of hg16 knownGene (2004-07-27, braney)
# blastHg16KG track.
ssh kksilo
bash
mkdir -p /cluster/data/danRer1/bed/tblastn.hg16KG
cd /cluster/data/danRer1/bed/tblastn.hg16KG
ls -1S /iscratch/i/danRer1/blastdb/*.nsq | sed "s/\.nsq//" > fish.lst
mkdir kgfas
split -l 68 /cluster/data/hg16/bed/blat.hg16KG.2004-05-27/hg16KG.psl kgfas/kg
cd kgfas
for i in *
do
    pslxToFa $i $i.fa
    rm $i
done
ls -1S kgfas/*.fa > kg.lst
# use bluearc to hold our temp output
mkdir -p /cluster/bluearc/danRer1/bed/tblastn.hg16KG/blastOut
ln -s /cluster/bluearc/danRer1/bed/tblastn.hg16KG/blastOut .
for i in `cat kg.lst`
do
    mkdir blastOut/$i
done
cat << '_EOF_' > blastGsub
#LOOP
blastSome $(path1) {check in line $(path2)} {check out exists blastOut/$(root2)/q.$(root1).psl } {check out exists blastOut/$(root2)/q.$(root1).bscore} {check out exists blastOut/$(root2)/t.$(root1).psl } {check out exists blastOut/$(root2)/t.$(root1).bscore}
#ENDLOOP
'_EOF_'
gensub2 fish.lst kg.lst blastGsub blastSpec
ssh kk
cd /cluster/data/danRer1/bed/tblastn.hg16KG
para create blastSpec
para push
cd blastOut
for i in kg??
do
    cat $i/q.*.psl > q.$i.psl
    cat $i/q.*.bscore > q.$i.bscore
    cat $i/t.*.psl > t.$i.psl
    cat $i/t.*.bscore > t.$i.bscore
    simpleChain -prot -maxGap=27000 -outPsl q.$i.psl c.$i.psl
    sort -rn c.$i.psl | pslUniq stdin stdout \
        | awk "((\$13 - \$12) / \$11) > 0.6 {print}" > u.$i.psl
    awk "((\$1 / \$11) ) > 0.60 { print }" c.$i.psl > m60.$i.psl
    echo $i
done
cat u.*.psl m60* | sort -T /tmp -k 14,14 -k 16,16n -k 17,17n | uniq \
    > preblastHg16KG.psl
blatDir=/cluster/data/hg16/bed/blat.hg16KG.2004-05-27
protDat preblastHg16KG.psl $blatDir/hg16KG.psl $blatDir/kg.mapNames \
    blastHg16KG.psl
ssh hgwdev
cd /cluster/data/danRer1/bed/tblastn.hg16KG/blastOut
hgLoadPsl danRer1 blastHg16KG.psl
#end Human Proteins

# RH MAP TRACK (DONE, 2004-07-26, hartera)
# RE-DO RH MAP TRACK WITH MORE STRINGENT POST-BLAT FILTERING
# (DONE, 2005-05-14, hartera)
# Data from Leonard Zon's lab at the Children's Hospital, Boston
# Radiation hybrid (RH) map data consists of 8707 genomic sequences:
# (1) 2835 BAC clone ends (Robert Geisler's lab, Max Planck Inst.,
#     Tuebingen, Germany)
# (2) 2015 NCBI ESTs (submitted from around the planet)
# (3) 171 RH shotgun sequences (Max Planck Inst. & Children's Hosp.
#     Boston)
# (4) 3686 WZ compiled ESTs from WashU
#
# Repeatmasked Zv3 supercontigs BLASTn 8707 RH map genomic sequences
ssh kkr1u00
mkdir -p /cluster/data/danRer1/bed/ZonLab/rhmap
cd /cluster/data/danRer1/bed/ZonLab/rhmap
mkdir -p /cluster/data/danRer1/bed/ZonLab/rhmap/seq
cd /cluster/data/danRer1/bed/ZonLab/rhmap/seq
ls -1S /iscratch/i/danRer1/trfFa/*.fa > genome.lst
# Put the unmasked sequence data in rhDevSeq dir
unzip rhSeqAll.zip
foreach f (rhSeqAll/*/*.txt)
    mv $f .
end
rm -r rhSeqAll
foreach s (*.txt)
    tr '[a-z]' '[A-Z]' < $s >> rhcontigs.fa
end
mkdir -p /iscratch/i/danRer1/rhmap
cd /cluster/data/danRer1/bed/ZonLab/rhmap/seq
# fasplit sequence rhcontigs.fa 20 /iscratch/i/danRer1/rhmap
# remove ^M at end of each line
perl stripEndChar.pl < rhcontigs.fa > rhcontigsNoEndChar.fa
# also remove long headers and just keep first and second accession
grep '>' rhcontigsNoEndChar.fa > rhcontigsFa.headers
cat << '_EOF_' > formatFastaHeader.pl
#!/usr/bin/perl -w
use strict;
while(<>) {
    if (/>/) {
        my @f = split(/\|/);
        print "$f[0].$f[1]\n";
    }
    elsif ($_ eq "") {
    }
    else {
        print;
    }
}
'_EOF_'
chmod +x formatFastaHeader.pl
perl formatFastaHeader.pl < rhcontigsNoEndChar.fa > rhcontigsFormat.fa
mv rhcontigsFormat.fa rhcontigs.fa
rm rhcontigsNoEndChar.fa
grep '>' rhcontigs.fa | wc -l
# 8690 sequences (~15500 for Affy zebrafish) so do not split
cp rhcontigs.fa /iscratch/i/danRer1/rhmap
iSync
ssh kk
cd /cluster/data/danRer1/bed/ZonLab/rhmap
mkdir psl
ls -1S /iscratch/i/danRer1/rhmap/rhcontigs.fa > rhmap.lst
ls -1S /iscratch/i/danRer1/trfFa/*.fa > genome.lst
# try same parameters as for bacEnds
cat << '_EOF_' > gsub
#LOOP
/cluster/bin/i386/blat {check in line+ $(path1)} {check in line+ $(path2)} -tileSize=10 -ooc=/iscratch/i/danRer1/10.ooc {check out line+ psl/$(root1)_$(root2).psl}
#ENDLOOP
'_EOF_'
# << this line keeps emacs coloring happy
gensub2 genome.lst rhmap.lst gsub spec
para create spec
para try
para check
para push, try ... etc.
# para time
# Completed: 310 of 310 jobs
# CPU time in finished jobs:       7256s     120.93m     2.02h    0.08d  0.000 y
# IO & Wait Time:                   987s      16.45m     0.27h    0.01d  0.000 y
# Average job time:                  27s       0.44m     0.01h    0.00d
# Longest job:                       38s       0.63m     0.01h    0.00d
# Submission to last job:           196s       3.27m     0.05h    0.00d
# Make & check the psl table
# Do sort, best in genome filter, and convert to chromosome coordinates
# to create rhmap.psl
pslSort dirs raw.psl tmp psl
# only use alignments that have at least 80% identity in aligned region;
# these are the parameters used for STS markers.
# try more stringent filtering for these: minAli=0.96 and minCover=0.40
# as suggested by Jim (DONE, 2005-05-14, hartera)
# pslReps -nearTop=0.0001 -minAli=0.8 -noIntrons raw.psl \
#     contig.psl /dev/null
# Processed 646182 alignments
mv rhMap.psl rhMap.psl.old
mv contig.psl contig.psl.old
pslReps -nearTop=0.0001 -minAli=0.96 -minCover=0.40 raw.psl \
    contig.psl /dev/null
# Processed 646182 alignments
liftUp rhMap.psl /cluster/data/danRer1/jkStuff/liftAll.lft warn contig.psl
pslCheck rhMap.psl
# Summary of different alignment filters:
# nearTop   minAli   minCover   -noIntrons   #alignments   #aligned
# 0.0001    0.8      -          yes          16738         8380
# -         0.96     0.40       no           8711          7421
# 0.0001    0.96     0.40       no           7866          7421
# 0.002     0.96     0.40       no           7924          7421
# 0.0001    0.96     0.40       yes          7877          7421
# the best parameters seem to be nearTop=0.0001, minAli=0.96 and
# minCover=0.40.
# previously, the sequence with the most alignments had 1987. Now the
# sequence with the most alignments has 7, 26 have 3-7 alignments and
# 7014 have 1.
# For the previous parameters, there were > 900 sequences with many
# alignments as well as 7382 with 1 alignment.
# Load sequence alignments into database.
# Reload rhMap alignments - more stringent filtering (hartera, 2005-05-14)
ssh hgwdev
cd /cluster/data/danRer1/bed/ZonLab/rhmap
echo "drop table rhMap;" | hgsql danRer1
hgLoadPsl danRer1 rhMap.psl
hgsql -N -e "select qName from rhMap;" danRer1 | sort | uniq | wc
# 8380 out of 8690 sequences aligned (96%)
# after more stringent filtering, 7421 out of 8690 sequences aligned (85%)
# Add RH Map sequences
# Copy sequences to gbdb if they are not there already
mkdir -p /gbdb/danRer1/rhmap
ln -s \
    /cluster/data/danRer1/bed/ZonLab/rhmap/seq/rhcontigs.fa \
    /gbdb/danRer1/rhmap
cd /cluster/data/danRer1/bed/ZonLab/rhmap/seq
hgLoadSeq danRer1 /gbdb/danRer1/rhmap/rhcontigs.fa

# BLASTZ SWAP FOR mm5 vs danRer1 BLASTZ TO CREATE danRer1 vs mm5 BLASTZ
# USE RESCORED ALIGNMENTS (see makeMm5.doc)
# CONVERT AXTs TO PSL AND LOAD INTO DATABASE
# (DONE, 2004-08-05, hartera)
ssh kolossus
mkdir /cluster/data/danRer1/bed/blastz.mm5.swap
cd /cluster/data/danRer1/bed/blastz.mm5.swap
# use rescored axtChrom from blastzDanRer1 on mm5
set aliDir = /cluster/data/mm5/bed/blastz.danRer1
cp $aliDir/S1.len S2.len
cp $aliDir/S2.len S1.len
mkdir unsorted axtChrom
cat $aliDir/axtChrom/chr*.axt \
    | axtSwap stdin $aliDir/S1.len $aliDir/S2.len stdout \
    | axtSplitByTarget stdin unsorted
# Sort the shuffled .axt files.
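# (axtSplitByTarget writes alignments in input order, so each per-chrom
# file comes out shuffled; axtSort below re-sorts by target position,
# which the psl conversion and chaining steps expect.)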
foreach f (unsorted/*.axt)
    echo sorting $f:t:r
    axtSort $f axtChrom/$f:t
end
du -sh $aliDir/axtChrom unsorted axtChrom
# 3.2G  /cluster/data/mm5/bed/blastz.danRer1/axtChrom
# 3.2G  unsorted
# 3.2G  axtChrom
rm -r unsorted
# translate sorted axt files into psl
ssh kolossus
cd /cluster/data/danRer1/bed/blastz.mm5.swap
mkdir -p pslChrom
set tbl = "blastzMm5"
foreach f (axtChrom/chr*.axt)
    set c=$f:t:r
    echo "Processing chr $c"
    /cluster/bin/i386/axtToPsl $f S1.len S2.len pslChrom/${c}_${tbl}.psl
end
# Load database tables
ssh hgwdev
cd /cluster/data/danRer1/bed/blastz.mm5.swap/pslChrom
foreach f (./*.psl)
    /cluster/bin/i386/hgLoadPsl danRer1 $f
    echo "$f Done"
end

# CHAIN MOUSE (mm5) BLASTZ (DONE, 2004-08-05, hartera)
# Run axtChain on little cluster
ssh kki
cd /cluster/data/danRer1/bed/blastz.mm5.swap
mkdir -p axtChain/run1
cd axtChain/run1
mkdir out chain
ls -1S /cluster/data/danRer1/bed/blastz.mm5.swap/axtChrom/*.axt \
    > input.lst
cat << '_EOF_' > gsub
#LOOP
doChain {check in exists $(path1)} {check out line+ chain/$(root1).chain} {check out line+ out/$(root1).out}
#ENDLOOP
'_EOF_'
# << this line makes emacs coloring happy
# Reuse gap penalties from hg16 vs chicken run.
# NOTE: each ^V below marks a literal tab (typed as ctrl-v tab);
# the gap file is tab-delimited.
cat << '_EOF_' > ../../chickenHumanTuned.gap
tablesize^V 11
smallSize^V 111
position^V 1^V 2^V 3^V 11^V 111^V 2111^V 12111^V 32111^V 72111^V 152111^V 252111
qGap^V 325^V 360^V 400^V 450^V 600^V 1100^V 3600^V 7600^V 15600^V 31600^V 56600
tGap^V 325^V 360^V 400^V 450^V 600^V 1100^V 3600^V 7600^V 15600^V 31600^V 56600
bothGap^V 625^V 660^V 700^V 750^V 900^V 1400^V 4000^V 8000^V 16000^V 32000^V 57000
'_EOF_'
# << this line makes emacs coloring happy
# create chains with only default filtering on score - minScore = 1000
cat << '_EOF_' > doChain
#!/bin/csh
axtChain -scoreScheme=/cluster/data/blastz/HoxD55.q \
    -linearGap=../../chickenHumanTuned.gap $1 \
    /iscratch/i/danRer1/nib \
    /cluster/bluearc/scratch/mus/mm5/softNib $2 >& $3
'_EOF_'
chmod a+x doChain
gensub2 input.lst single gsub jobList
para create jobList
para try, check, push, check...
# para time # Completed: 29 of 29 jobs # CPU time in finished jobs: 2987s 49.78m 0.83h 0.03d 0.000 y # IO & Wait Time: 848s 14.14m 0.24h 0.01d 0.000 y # Average job time: 132s 2.20m 0.04h 0.00d # Longest job: 426s 7.10m 0.12h 0.00d # Submission to last job: 710s 11.83m 0.20h 0.01d # now on the cluster server, sort chains ssh kksilo cd /cluster/data/danRer1/bed/blastz.mm5.swap/axtChain chainMergeSort run1/chain/*.chain > all.chain chainSplit chain all.chain rm run1/chain/*.chain # take a look at score distr's foreach f (chain/*.chain) grep chain $f | awk '{print $2;}' | sort -nr > /tmp/score.$f:t:r echo $f:t:r >> hist5000.out textHistogram -binSize=5000 /tmp/score.$f:t:r >> hist5000.out echo "" end ssh hgwdev cd /cluster/data/danRer1/bed/blastz.mm5.swap/axtChain/chain hgLoadChain danRer1 chr1_chainMm5 chr1.chain # featureBits -chrom=chr1 danRer1 refGene:cds chainMm5Link -enrichment # refGene:cds 0.449%, chainMm5Link 9.488%, both 0.364%, cover 81.22%, # enrich 8.56x # featureBits -chrom=chr1 danRer1 refGene:cds chainHg17Link -enrichment # refGene:cds 0.449%, chainHg17Link 3.924%, both 0.358%, cover 79.76%, # enrich 20.33x # Try filtering for minScore = 5000 ssh kksilo cd /cluster/data/danRer1/bed/blastz.mm5.swap/axtChain mv all.chain all.chain.unfiltered chainFilter -minScore=5000 all.chain.unfiltered > all.chain rm -r chain chainSplit chain all.chain ssh hgwdev cd /cluster/data/danRer1/bed/blastz.mm5.swap/axtChain/chain hgLoadChain danRer1 chr1_chainMm5Filt5k chr1.chain # featureBits -chrom=chr1 danRer1 refGene:cds chainMm5Filt5kLink -enrichment # refGene:cds 0.449%, chainMm5Filt5kLink 7.948%, both 0.362%, cover 80.64%, # enrich 10.15x # featureBits -chrom=chr1 danRer1 refGene:cds chainMm5Filt10kLink -enrichment # refGene:cds 0.449%, chainMm5Filt10kLink 6.363%, both 0.357%, cover 79.66%, # enrich 12.52x # rows: chr1_chainMm5Link 303196 # chr1_chainMm5Filt5kLink 231508 # chr1_chainMm5Filt10kLink 147336 # filter chr1-25, M and Finished with minScore=5000 # and minScore=10000 for chrNA and chrUn which have a lot more # low scoring hits. # featureBits -chrom=chrNA danRer1 refGene:cds chainMm5Link -enrichment ssh kksilo cd /cluster/data/danRer1/bed/blastz.mm5.swap/axtChain/chain rm chrNA.chain chrUn.chain # replace with those filtered on minScore=10000 cp ../filt10k/chain/chrUn.chain . cp ../filt10k/chain/chrNA.chain . cd .. rm -r filt10k gzip all.chain.unfiltered hgsql -e "drop table chr1_chainMm5Filt5k" danRer1 hgsql -e "drop table chr1_chainMm5Filt5kLink" danRer1 hgsql -e "drop table chr1_chainMm5Filt10k" danRer1 hgsql -e "drop table chr1_chainMm5Filt10kLink" danRer1 # load database tables ssh hgwdev cd /cluster/data/danRer1/bed/blastz.mm5.swap/axtChain/chain foreach i (*.chain) set c = $i:r hgLoadChain danRer1 ${c}_chainMm5 $i echo done $c end # NET MOUSE (mm5) BLASTZ (DONE, 2004-08-05, hartera) ssh kksilo cd /cluster/data/danRer1/bed/blastz.mm5.swap/axtChain mkdir preNet cd chain foreach i (*.chain) echo preNetting $i /cluster/bin/i386/chainPreNet $i ../../S1.len ../../S2.len \ ../preNet/$i end cd .. mkdir n1 cd preNet foreach i (*.chain) set n = $i:r.net echo primary netting $i /cluster/bin/i386/chainNet $i -minSpace=1 ../../S1.len ../../S2.len \ ../n1/$n /dev/null end cd .. 
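# (In this chain -> net step: chainPreNet drops chains that have no
# chance of making it into the net, chainNet assigns the best chains
# hierarchically to target coordinates (the query-side net goes to
# /dev/null here), and netSyntenic below adds synteny statistics to
# the combined net.)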
cat n1/*.net | /cluster/bin/i386/netSyntenic stdin noClass.net # memory usage 205414400, utime 5596 s/100, stime 237 # Add classification info using db tables: cd /cluster/data/mm5/bed/blastz.danRer1/axtChain # netClass looks for ancient repeats in one of the databases # hg17 has this table - hand-curated by Arian but this is for # human-rodent comparisons so do not use here, use -noAr option mkdir -p /cluster/bluearc/mm5/linSpecRep.notInZebrafish mkdir -p /cluster/bluearc/danRer1/linSpecRep.notInMouse cp /iscratch/i/mm5/linSpecRep.notInZebrafish/* \ /cluster/bluearc/mm5/linSpecRep.notInZebrafish cp /iscratch/i/danRer1/linSpecRep.notInMouse/* \ /cluster/bluearc/danRer1/linSpecRep.notInMouse ssh hgwdev cd /cluster/data/danRer1/bed/blastz.mm5.swap/axtChain time netClass noClass.net danRer1 mm5 mm5.net \ -tNewR=/cluster/bluearc/danRer1/linSpecRep.notInMouse \ -qNewR=/cluster/bluearc/mm5/linSpecRep.notInZebrafish -noAr # 112.390u 57.180s 4:17.17 65.9% 0+0k 0+0io 4565pf+0w netFilter -minGap=10 mm5.net | hgLoadNet danRer1 netMm5 stdin # featureBits danRer1 refGene:cds netMm5 -enrichment # refGene:cds 0.380%, netMm5 36.622%, both 0.331%, cover 87.02%, enrich 2.38x # CLEANUP BACENDS (DONE, 2004-08-10, hartera) ssh kksilo cd /cluster/data/danRer1/bed/ZonLab/bacends/clusterBlast/ nice rm batch batch.bak out *.lst blastToPsl.log para.results \ clusterBacEnds.psl clusterBacEnds.out cd zeroEval nice rm blastToPsl.log nice rm clusterBacEnds.psl nice zip cluster.zip * cd /cluster/data/danRer1/bed/ZonLab/bacends nice rm -r err nice rm bacends.remove bacends.percentNs80 bacends.noseqswithlotsNs \ bacends_good.fa batch batch.bak bed.tab BACerror.log psl.tab raw.psl \ seq.tab para.results nice rm -r psl nice gzip bacends.1/* duplicates/* pairs/* singles/* scores/* & # BLASTZ MM5 CLEANUP (DONE, 2004-08-10, hartera) ssh eieio cd /cluster/data/danRer1/bed/blastz.mm5.swap nice rm axtChain/run1/chain/* & nice rm -fr axtChain/n1 axtChain/noClass.net & nice gzip axtChrom/* pslChrom/* axtChain/all.chain axtChain/*.net & # unzipped all.chain.gz and mm5.net.gz to make VSMM5 downloadables # and then zip up again (hartera, 2004-09-10) # MAKE VSMM5 DOWNLOADABLES (DONE, 2004-09-10, hartera) ssh kksilo # zip chains and nets cd /cluster/data/danRer1/bed/blastz.mm5.swap/axtChain gunzip all.chain.gz cp all.chain mm5.chain zip -j /cluster/data/danRer1/zip/mm5.chain.zip mm5.chain rm mm5.chain gunzip mm5.net.gz zip -j /cluster/data/danRer1/zip/mm5.net.zip mm5.net ssh hgwdev # copy chains and nets to downloads area set gp = /usr/local/apache/htdocs/goldenPath/danRer1 mkdir -p $gp/vsMm5 cd $gp/vsMm5 mv /cluster/data/danRer1/zip/mm5*.zip . md5sum *.zip > md5sum.txt # move axt files to downloads area and zip cd /cluster/data/danRer1/bed/blastz.mm5.swap/axtChrom mkdir -p $gp/vsMm5/axtChrom nice cp -p *.axt $gp/vsMm5/axtChrom cd $gp/vsMm5/axtChrom nice gzip *.axt md5sum *.gz > md5sum.txt # Copy over & edit README.txt w/pointers to chain, net formats. 
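# (Reading the featureBits -enrichment lines in the net section above:
# cover is both/primary and enrich is cover/secondary; e.g. for netMm5,
# 0.331%/0.380% gives the 87.02% cover, and 87.02%/36.622% gives the
# 2.38x enrichment.)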
# BLASTZ HG17 CLEANUP (DONE, 2004-08-10, hartera)
ssh eieio
cd /cluster/data/danRer1/bed/blastz.hg17
nice rm -rf raw &
nice rm -rf axtChrom.orig &
nice rm -rf axtChain.orig &
nice rm -rf lav &
nice rm axtChain/run1/chain/* &
nice rm -fr axtChain/n1 axtChain/noClass.net &
nice gzip axtChrom/* pslChrom/* axtChain/all.chain axtChain/*.net &

# BLASTZ FR1 CLEANUP (DONE, 2004-08-10, hartera)
ssh eieio
cd /cluster/data/danRer1/bed/blastz.fr1
nice rm -rf raw &
nice rm -rf axtChain.orig &
nice rm -rf lav &
nice rm axtChain/run1/chain/* &
nice rm -fr axtChain/n1 axtChain/noClass.net &
nice gzip axtChrom/* pslChrom/* axtChain/all.chain axtChain/*.net &

# REPEAT LIBRARY TEST (DONE, 2004-10-20, hartera)
# Get new repeat library
ssh kksilo
cd /cluster/data/danRer1
wget --timestamp \
    http://www.genetics.wustl.edu/fish_lab/repeats/Dr.repeats.020501
# copy to libraries directory for RepeatMasker
cp Dr.repeats.020501 /cluster/bluearc/RepeatMasker/Libraries/danioRW.lib
# temporarily move danio.lib to danioRM.lib.old
# add danioRW.lib to danio.lib
# for now just mask 0.5Mb so use chr1_1_01.fa - unmasked
mkdir newRepeats
# faSize chr1_1_01.fa
# 495381 bases (62485 N's 432896 real) in 1 sequences in 1 files
cp ./1/chr1_1/chr1_1_01.fa ./newRepeats
# RepeatMasker script with input directory and name of .fa file
ssh kolossus
cd /cluster/data/danRer1/newRepeats
nice /cluster/data/danRer1/jkStuff/RMZebrafish \
    /cluster/data/danRer1/newRepeats chr1_1_01.fa
# can also use -lib option to use a custom library
cat << '_EOF_' > jkStuff/RMandRWZebrafish
#!/bin/csh -fe
cd $1
pushd .
/bin/mkdir -p /tmp/danRer1/$2
/bin/cp $2 /tmp/danRer1/$2/
cd /tmp/danRer1/$2
/cluster/bluearc/RepeatMasker/RepeatMasker -ali -s -lib danioRMandRW.lib $2
popd
/bin/cp /tmp/danRer1/$2/$2.out ./
if (-e /tmp/danRer1/$2/$2.align) /bin/cp /tmp/danRer1/$2/$2.align ./
if (-e /tmp/danRer1/$2/$2.tbl) /bin/cp /tmp/danRer1/$2/$2.tbl ./
if (-e /tmp/danRer1/$2/$2.cat) /bin/cp /tmp/danRer1/$2/$2.cat ./
/bin/rm -fr /tmp/danRer1/$2/*
/bin/rmdir --ignore-fail-on-non-empty /tmp/danRer1/$2
/bin/rmdir --ignore-fail-on-non-empty /tmp/danRer1
'_EOF_'
# << this line makes emacs coloring happy
chmod +x jkStuff/RMandRWZebrafish
cp /dev/null newRepeats/test/RMJobs
echo /cluster/data/danRer1/jkStuff/RMandRWZebrafish \
    /cluster/data/danRer1/newRepeats/test chr1_1_01.fa \
    '{'check out line+ /cluster/data/danRer1/newRepeats/test/chr1_1_01.fa.out'}' \
    >> newRepeats/test/RMJobs
#- Do the run
ssh kk
cd /cluster/data/danRer1/newRepeats/test
para create RMJobs
para try
# para time
# Completed: 1 of 1 jobs
# as with previous output there are some print errors at the end of the
# run, but try using it for masking anyway
# already have simpleRepeats for chr1_1_01.fa
ssh kksilo
cd /cluster/data/danRer1/newRepeats/test
# lift up to contig level
set d = /cluster/data/danRer1/1/chr1_1
liftUp chr1_1.fa.out $d/chr1_1.lft warn chr1_1_01.fa.out > /dev/null
#- Lift pseudo-contigs to chromosome level
set d = /cluster/data/danRer1/1/
if (-e $d/lift/ordered.lft && ! -z $d/lift/ordered.lft) then
    liftUp chr1.fa.out $d/lift/ordered.lft warn chr1_1.fa.out \
        > /dev/null
endif
# the new repeats "Drxxxx" do not have a class type so add this in,
# otherwise maskOutFa can not process this.
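# (A RepeatMasker .out row normally has a repeat class/family column
# right after the repeat name; the awk below keeps fields 1-10, which
# end with the Dr* repeat name, inserts "DrRW" as the missing class,
# then prints the remaining fields so every row has the same column
# count for maskOutFa.)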
awk 'BEGIN {OFS="\t";} {if ($10 ~ /^Dr[0-9]+$/ ){ print $1, $2, $3, $4, $5, $6, $7, $8, $9, $10, "DrRW", $11, $12, $13, $14;} else {print;}}' chr1.fa.out > chr1.format.out
mv chr1.fa.out chr1.fa.out.orig
mv chr1.format.out chr1.fa.out
set trfChr=/cluster/data/danRer1/bed/simpleRepeat/trfMaskChrom
foreach f (./chr*.fa)
    echo "repeat- and trf-masking $f"
    maskOutFa -soft $f $f.out $f
    set chr = $f:t:r
    maskOutFa -softAdd $f $trfChr/chr1.bed $f
end
# blastz alignment of 0.5Mb region much improved - check with laj.
# removes many repeats

# MAKE UNIGENE ALIGNMENTS (DONE, 2004-11-01, hartera)
ssh kksilo
mkdir /cluster/data/danRer1/bed/unigene
cd /cluster/data/danRer1/bed/unigene
# Download zebrafish UniGene and run Chuck's preproc scripts:
wget ftp://ftp.ncbi.nih.gov/repository/UniGene/Dr.info
wget ftp://ftp.ncbi.nih.gov/repository/UniGene/Dr.seq.uniq.gz
wget ftp://ftp.ncbi.nih.gov/repository/UniGene/Dr.data.gz
gunzip *.gz
# Chuck's script expects human (Hs) -- use Dr for zebrafish:
sed -e 's/Hs/Dr/g' /projects/cc/hg/sugnet/uniGene/countSeqsInCluster.pl \
    > ./countSeqsInCluster.pl
chmod a+x ./countSeqsInCluster.pl
perl ./countSeqsInCluster.pl Dr.data counts.tab
perl /projects/cc/hg/sugnet/uniGene/parseUnigene.pl Dr.seq.uniq \
    Dr.seq.uniq.simpleHeader.fa leftoverData.tab
# Put on iscratch for cluster run
ssh kkr1u00
mkdir /iscratch/i/danRer1/unigene
cp -p /cluster/data/danRer1/bed/unigene/Dr.seq.uniq.simpleHeader.fa \
    /iscratch/i/danRer1/unigene/
iSync
# align on big cluster
ssh kk
cd /cluster/data/danRer1/bed/unigene
mkdir psl
ls -1S /iscratch/i/danRer1/trfFa/*.fa > genome.lst
ls -1S /iscratch/i/danRer1/unigene/*.fa > uniGene.lst
cat << '_EOF_' > template.sub
#LOOP
/cluster/bin/i386/blat -mask=lower -minIdentity=95 -ooc=/iscratch/i/danRer1/11.ooc {check in line+ $(path1)} {check in line+ $(path2)} {check out line+ psl/$(root1)_$(root2).psl}
#ENDLOOP
'_EOF_'
# << this line makes emacs coloring happy
gensub2 genome.lst uniGene.lst template.sub jobList
para create jobList
para try, check, push, check
# para time
# Completed: 310 of 310 jobs
# CPU time in finished jobs:      12627s     210.44m     3.51h    0.15d  0.000 y
# IO & Wait Time:                  2304s      38.41m     0.64h    0.03d  0.000 y
# Average job time:                  48s       0.80m     0.01h    0.00d
# Longest job:                       89s       1.48m     0.02h    0.00d
# Submission to last job:           699s      11.65m     0.19h    0.01d
# postprocess
ssh kksilo
cd /cluster/data/danRer1/bed/unigene
pslSort dirs raw.psl tmp psl >& pslSort.log
liftUp -type=.psl stdout ../../jkStuff/liftAll.lft warn raw.psl \
    | pslReps -minCover=0.2 -minAli=0.965 -nearTop=0.002 \
      stdin uniGene.lifted.pslReps.psl /dev/null
rm raw.psl
gzip Dr.seq.uniq Dr.data
# load into db
ssh hgwdev
cd /cluster/data/danRer1/bed/unigene
mkdir /gbdb/danRer1/unigene
ln -s /cluster/data/danRer1/bed/unigene/Dr.seq.uniq.simpleHeader.fa \
    /gbdb/danRer1/unigene/
hgLoadSeq danRer1 /gbdb/danRer1/unigene/Dr.seq.uniq.simpleHeader.fa
hgLoadPsl -fastLoad -table=uniGene_dr danRer1 uniGene.lifted.pslReps.psl

# STS Markers (in progress, 2004-08-09, hartera)
# download new files (2005-04-12, hartera)
# Check NCBI for latest update for STS markers in dbSTS.
# Our latest version is in store5 for space
ssh eieio
mkdir -p /cluster/store5/sts.2005-04-12
ln -s /cluster/store5/sts.2005-04-12 /cluster/data/ncbi
cd /cluster/data/ncbi/sts.2005-04-12
# The dbSTS.sts does not have as many ids as in the version
# on the NCBI ftp site so get new version.
# update from NCBI
wget ftp://ftp.ncbi.nih.gov/repository/dbSTS/dbSTS.sts
wget ftp://ftp.ncbi.nih.gov/repository/dbSTS/dbSTS.aliases
wget ftp://ftp.ncbi.nih.gov/blast/db/FASTA/sts.gz
gunzip sts.gz
mv sts dbSTS.fa
# checked against sts.9: a number of new Danio rerio markers were added
mkdir -p /cluster/store5/UniSTS.2005-04-12
ln -s /cluster/store5/UniSTS.2005-04-12 /cluster/data/ncbi
cd /cluster/data/ncbi/UniSTS.2005-04-12
# sts and aliases files were last updated on 04/12/05 by NCBI
wget --timestamp ftp://ftp.ncbi.nih.gov/repository/UniSTS/UniSTS.sts
wget --timestamp ftp://ftp.ncbi.nih.gov/repository/UniSTS/UniSTS.aliases
# these mapping panels files were last updated 03/16/05 by NCBI
wget --timestamp -r -l1 \
    ftp://ftp.ncbi.nih.gov/repository/UniSTS/UniSTS_MapReports/Danio_rerio/
mkdir danRer
mv ./ftp.ncbi.nih.gov/repository/UniSTS/UniSTS_MapReports/Danio_rerio/* \
    ./danRer
rm -r ftp.ncbi.nih.gov
# No difference between UniSTS and sts files so move dbSTS.fa to UniSTS dir
# convert dbSTS.fa file to easier reading format, and get accessions
cd /cluster/data/ncbi/UniSTS.2005-04-12
mv /cluster/data/ncbi/sts.2005-04-12/dbSTS.fa .
perl /cluster/bin/scripts/convertGbFaFile dbSTS.fa > dbSTS.convert.fa
grep ">" dbSTS.convert.fa | cut -f 2 -d ">" > dbSTS.acc
ssh kksilo
mkdir -p /cluster/store8/danRer1/STSmarkers
ln -s /cluster/store8/danRer1/STSmarkers \
    /cluster/data/danRer1/bed/STSmarkers
mkdir /cluster/data/danRer1/bed/STSmarkers/downloads
cd /cluster/data/danRer1/bed/STSmarkers/downloads
# use UniSTSParse.pl to get alias names
cp /cluster/data/mm5/bed/STSmarkers/downloads/UniSTSParse.pl .
cp /cluster/data/ncbi/UniSTS.2005-04-12/UniSTS.sts .
cp /cluster/data/ncbi/UniSTS.2005-04-12/UniSTS.aliases .
# create a UniSTS_Danio_rerio.sts by grepping for lines with "Danio"
grep "Danio" UniSTS.sts > UniSTS_Danio_rerio.sts
perl ./UniSTSParse.pl UniSTS_Danio_rerio.sts UniSTS.aliases \
    > UniSTS_Danio_rerio_alias.0
cat UniSTS_Danio_rerio_alias.0 | sort -u | sort -n \
    > UniSTS_Danio_rerio.alias
# some UniSTS markers for zebrafish have only primers but no name and
# no accession
# Download ZFIN data for all markers for the six mapping panels
wget --timestamp http://zfin.org/data_transfer/Downloads/mappings.txt
# Download ZFIN marker mappings to Genbank accessions
wget --timestamp http://zfin.org/data_transfer/Downloads/genbank.txt
# Download file of old and new names for STS markers from ZFIN
wget --timestamp http://zfin.org/data_transfer/Downloads/marker_alias.txt
grep "STS" genetic_markers.txt > zfinIDSTS.txt
# make file of mapping data
foreach f (/cluster/data/ncbi/UniSTS.2004-10-06/danRer1/*.txt)
    cat $f >> danRer1STSMaps.txt
end
mv /cluster/data/ncbi/UniSTS.2004-10-06/danRer1/danRer1STSMaps.txt \
    /cluster/data/danRer1/bed/STSmarkers/downloads
# Need to download zebrafish ZFIN IDs mapped to GenBank
cd /cluster/data/danRer1/bed/STSmarkers/downloads
wget --timestamp http://zfin.org/data_transfer/Downloads/genbank.txt
# Download ZFIN IDs and mapping panels
wget --timestamp http://zfin.org/data_transfer/Downloads/mappings.txt
# Create new tables for STS so there are fewer columns in the Info table
# Download file of old and new names for STS markers
wget --timestamp http://zfin.org/data_transfer/Downloads/marker_alias.txt
# sort and uniq mappings.txt as there are duplicate lines
sort mappings.txt | uniq > mappings.uniq.txt
# make a .as file for stsMarkers which is similar to the hg17 stsInfo2
# table
# align primer sequences
# create a file of UniSTS IDs and the primer sequences
ssh eieio
cd /cluster/data/ncbi/UniSTS.2005-04-12/danRer
cp /cluster/data/danRer1/bed/STSmarkers/downloads/UniSTS_Danio_rerio* .
awk 'BEGIN {OFS="\t"} {print $1, $2, $3}' UniSTS_Danio_rerio.sts > zfish.primers
# strip out Ns and wobbles (K, S) as isPcr can't handle them.
# there is only 1 K, no Ns and no S. K is T or G so just switch it to T;
# if it is really a G at that position it will just get a mismatch
# (UniSTS ID 204670)
sed -e 's/K/T/' zfish.primers > zfish.primers.isPcr
# only one primer with length < 10, so keep. looks like isPcr can now
# handle shorter primers
mkdir -p /cluster/bluearc/danRer1/stsMarkers/primers
cd /cluster/bluearc/danRer1/stsMarkers/primers
split -l 2000 \
    /cluster/data/ncbi/UniSTS.2005-04-12/danRer/zfish.primers.isPcr primers_
# keep getting error where
# bad char in reverse seq: aacckgagcgggtcagct
# this is the complement of one of the reverse primers, M->K
# use findChars.pl to find primers with characters other than ACTG
cat << 'EOF' > findChars.pl
#!/usr/bin/perl -w
use strict;

# report primer lines containing characters other than ACTG
my @a = glob("/cluster/bluearc/danRer1/stsMarkers/primers/primers_*");
while (defined(my $nextName = shift(@a))) {
    print "Processing $nextName ...\n";
    open(FILE, $nextName) || die "Can not open $nextName: $!\n";
    while (<FILE>) {
        my $l = $_;
        chomp $l;
        if ($l =~ /[^ACTG0-9\t]/) {
            print "$l\n";
        }
    }
    close FILE;
}
'EOF'
# << for emacs
chmod +x findChars.pl
perl findChars.pl
# output is:
# Processing /cluster/bluearc/danRer1/stsMarkers/primers/primers_al ...
# 204670  TGCTGTCGRCATTTACTGGAAC  TGCTTTATCACGTACAGCTGA
# Processing /cluster/bluearc/danRer1/stsMarkers/primers/primers_an ...
# 238222  GTATAGTAGTGTGTATGAGCATCTGG  AGCTGACCCGCTCMGGTT
# since R is A or G and M is C or A,
# change R -> A and M -> C
perl -pi.bak -e 's/R/A/' \
    /cluster/bluearc/danRer1/stsMarkers/primers/primers_al
perl -pi.bak -e 's/M/C/' \
    /cluster/bluearc/danRer1/stsMarkers/primers/primers_an
rm /cluster/bluearc/danRer1/stsMarkers/primers/*.bak
ssh kk
cd /cluster/data/danRer1/bed/STSmarkers/
mkdir primers
cd primers
mkdir run
cd run
ls -1S /iscratch/i/danRer1/trfFa/*.fa > contigs.lst
ls -1S /cluster/bluearc/danRer1/stsMarkers/primers/primers_* > primers.lst
cat > template << 'EOF'
#LOOP
/cluster/home/hartera/bin/i386/isPcr -out=psl -minPerfect=5 -maxSize=5000 -tileSize=10 -ooc=/iscratch/i/danRer1/10.ooc -stepSize=5 $(path1) $(path2) {check out line /cluster/bluearc/danRer1/stsMarkers/primers/out/$(root1)_$(root2).psl}
#ENDLOOP
'EOF'
# << for emacs
# may need to add -mask=lower option
mkdir -p /cluster/bluearc/danRer1/stsMarkers/primers/out
gensub2 contigs.lst primers.lst template jobList
para create jobList
# 4340 jobs
para try, para check, para push, para check etc.
# Completed: 4340 of 4340 jobs
# CPU time in finished jobs:       6296s     104.93m     1.75h    0.07d  0.000 y
# IO & Wait Time:                 14235s     237.26m     3.95h    0.16d  0.000 y
# Average job time:                   5s       0.08m     0.00h    0.00d
# Longest running job:                0s       0.00m     0.00h    0.00d
# Longest finished job:              12s       0.20m     0.00h    0.00d
# Submission to last job:            95s       1.58m     0.03h    0.00d
# Check NCBI for latest update for STS markers in dbSTS.
# Our latest version is in store5 for space
ssh eieio
mkdir -p /cluster/store5/sts.2004-10-06
ln -s /cluster/store5/sts.2004-10-06 /cluster/data/ncbi
cd /cluster/data/ncbi/sts.2004-10-06
# The dbSTS.sts does not have as many ids as in the version
# on the NCBI ftp site so get new version.
# update from NCBI
wget ftp://ftp.ncbi.nih.gov/repository/dbSTS/dbSTS.sts
wget ftp://ftp.ncbi.nih.gov/repository/dbSTS/dbSTS.aliases
wget ftp://ftp.ncbi.nih.gov/blast/db/FASTA/sts.gz
gunzip sts.gz
mv sts dbSTS.fa
# checked difference between these and sts.9 files - no Danio sequences
# added
mkdir -p /cluster/store5/UniSTS.2004-10-06
ln -s /cluster/store5/UniSTS.2004-10-06 /cluster/data/ncbi
cd /cluster/data/ncbi/UniSTS.2004-10-06
wget --timestamp ftp://ftp.ncbi.nih.gov/repository/UniSTS/UniSTS.sts
wget --timestamp ftp://ftp.ncbi.nih.gov/repository/UniSTS/UniSTS.aliases
wget --timestamp -r -l1 \
    ftp://ftp.ncbi.nih.gov/repository/UniSTS/UniSTS_MapReports/Danio_rerio/
mkdir danRer1
cp ./ftp.ncbi.nih.gov/repository/UniSTS/UniSTS_MapReports/Danio_rerio/* \
    ./danRer1
rm -r ftp.ncbi.nih.gov
# No difference between UniSTS and sts files so move dbSTS.fa to UniSTS dir
# convert dbSTS.fa file to easier reading format, and get accessions
cd /cluster/data/ncbi/UniSTS.2004-10-06
mv /cluster/data/ncbi/sts.2004-10-06/dbSTS.fa .
perl /cluster/bin/scripts/convertGbFaFile dbSTS.fa > dbSTS.convert.fa
grep ">" dbSTS.convert.fa | cut -f 2 -d ">" > dbSTS.acc
# NOTE: updateStsInfo creates new stsInfo2.bed, all.primers,
# all.STS.fa, stsAlias.bed files
# create stsInfo2.bed.prev and all.STS.fa.prev as empty files
cp /dev/null stsInfo2.bed.prev
cp /dev/null all.STS.fa.prev
updateStsInfo -verbose=1 -gb=dbSTS.acc stsInfo2.bed.prev all.STS.fa.prev \
    dbSTS.sts dbSTS.aliases dbSTS.convert.fa new
# this creates a valid new.primers and new.fa file
mv new.primers all_primers
mv new.fa all.STS.fa
# above does not work well without previous information to add
# DONT USE THESE FILES - in a test dir
ssh kksilo
mkdir -p /cluster/store8/danRer1/STSmarkers
ln -s /cluster/store8/danRer1/STSmarkers \
    /cluster/data/danRer1/bed/STSmarkers
mkdir /cluster/data/danRer1/bed/STSmarkers/downloads
cd /cluster/data/danRer1/bed/STSmarkers/downloads
# use UniSTSParse.pl to get alias names
cp /cluster/data/mm5/bed/STSmarkers/downloads/UniSTSParse.pl .
cp /cluster/data/ncbi/UniSTS.2004-10-06/UniSTS.sts .
cp /cluster/data/ncbi/UniSTS.2004-10-06/UniSTS.aliases .
# create a UniSTS_Danio_rerio.sts by grepping for lines with "Danio"
grep "Danio" UniSTS.sts > UniSTS_Danio_rerio.sts
perl ./UniSTSParse.pl UniSTS_Danio_rerio.sts UniSTS.aliases \
    > UniSTS_Danio_rerio_alias.0
cat UniSTS_Danio_rerio_alias.0 | sort -u | sort -n \
    > UniSTS_Danio_rerio.alias
# Download ZFIN data for all markers
wget --timestamp http://zfin.org/data_transfer/Downloads/genetic_markers.txt
grep "STS" genetic_markers.txt > zfinIDSTS.txt
# make file of mapping data
foreach f (/cluster/data/ncbi/UniSTS.2004-10-06/danRer1/*.txt)
    cat $f >> danRer1STSMaps.txt
end
mv /cluster/data/ncbi/UniSTS.2004-10-06/danRer1/danRer1STSMaps.txt \
    /cluster/data/danRer1/bed/STSmarkers/downloads
# Need to download zebrafish ZFIN IDs mapped to GenBank
cd /cluster/data/danRer1/bed/STSmarkers/downloads
wget --timestamp http://zfin.org/data_transfer/Downloads/genbank.txt
# Download ZFIN IDs and mapping panels
wget --timestamp http://zfin.org/data_transfer/Downloads/mappings.txt
# Create new tables for STS so there are fewer columns in the Info table
# Download file of old and new names for STS markers
wget --timestamp http://zfin.org/data_transfer/Downloads/marker_alias.txt
# make a .as file for stsMarkers which is similar to the hg17 stsInfo2
# table
cat << '_EOF_' > $HOME/kent/src/hg/lib/stsMarkers.as
table stsMarkers
"Constant STS marker information - Oct 2004 revision"
    (
    uint identNo; "UCSC identification number"
    string name; "Official UCSC name"
    uint gbCount; "Number of related GenBank accessions"
    string[gbCount] genbank; "Related GenBank accessions"
    string OtherDb; "Name of other database of marker IDs"
    uint otherDbCount; "Number of related other database identifiers"
    string[otherDbCount] gdb; "Related other database identifiers"
    uint nameCount; "Number of alias names"
    string[nameCount] otherNames; "Alias names"
    uint dbSTSid; "ID number in UniSTS or dbSTS"
    string leftPrimer; "5' primer sequence"
    string rightPrimer; "3' primer sequence"
    string distance; "Length of STS sequence"
    string organism; "Organism for which STS discovered"
    uint sequence; "Whether the full sequence is available (1) or not (0) for STS"
    )
'_EOF_'
cd $HOME/kent/src/hg/lib/
autoSql stsMarkers.as stsMarkers
mv stsMarkers.h $HOME/kent/src/hg/inc
cat << '_EOF_' > $HOME/kent/src/hg/lib/stsMapping.as
table stsMapping
"STS marker and its position on golden path"
    (
    string chrom; "Chromosome or 'unknown'"
    int chromStart; "Start position in chrom - negative 1 if unpositioned"
    uint chromEnd; "End position in chrom"
    string name; "Name of STS marker"
    uint score; "Score of a marker = 1000/(#placements) when placed uniquely, else 1500/(#placements) when placed in multiple locations"
    uint identNo; "Identification number of STS"
    string ctgAcc; "Contig accession number"
    string otherAcc; "Accession number of other contigs that the marker hits"
    )
'_EOF_'
cd $HOME/kent/src/hg/lib/
autoSql stsMapping.as stsMapping
mv stsMapping.h $HOME/kent/src/hg/inc
cat << '_EOF_' > $HOME/kent/src/hg/lib/stsMapPos.as
table stsMapPos
"STS marker and its position on genetic maps"
    (
    uint identNo; "Identification number of STS"
    string mapName; "Name or source of genetic map"
    string name; "Name of marker in map"
    string chrom; "Chromosome in map"
    float pos; "Position in map"
    float LOD; "LOD score in map"
    )
'_EOF_'
cd $HOME/kent/src/hg/lib/
autoSql stsMapPos.as stsMapPos
mv stsMapPos.h $HOME/kent/src/hg/inc
# stsAlias already exists in $HOME/kent/src/hg/lib/
# COMMIT THESE LATER
# do make to check it works and commit the .as, .sql, .c and .h files
# to CVS
# cluster run of zebrafish sts sequences
ssh kksilo
cd /cluster/data/danRer1/bed/STSmarkers/downloads
# use list of zebrafish UniSTS entry accessions
awk 'BEGIN {FS="\t"} {print $7;}' UniSTS_Danio_rerio.sts \
    | sed -e 's/;/\n/g' > danRer.accs
sort danRer.accs | uniq > danRer.uniq.accs
# 26894 lines; 26955 accessions; 17312 unique accessions
$HOME/bin/i386/faSomeRecords \
    /cluster/data/ncbi/UniSTS.2004-10-06/dbSTS.convert.fa \
    /cluster/data/danRer1/bed/STSmarkers/downloads/danRer.uniq.accs \
    /cluster/data/danRer1/bed/STSmarkers/downloads/danRerdbSTS.fa
# got 6045 sequences, 11267 not found
# sequences not found may all be in GenBank as ESTs; those found
# by searching were in GenBank
# Get sequences from GenBank
ssh kksilo
mkdir -p /cluster/data/danRer1/bed/STSmarkers/sequences
cd /cluster/data/danRer1/bed/STSmarkers/sequences
echo 'restrict org = "Danio rerio"' > sts.filt
gbToFaRa sts.filt danRer.fa danRer.ra /dev/null \
    /cluster/data/genbank/data/download/genbank.143.0/gbsts*.gz
# still does not have these missing records that are in dbEST
# try getting these sequences that are missing from the EST db
# make list of these missing accessions
grep '>' danRerdbSTS.fa | sed -e 's/>//' | sort > drSTS.accs.sort
# 6045 accs found here, 11266 missing; 17311 in danRer.uniq.acc2
comm -13 drSTS.accs.sort danRer.uniq.acc2 > notFound.accs
wc -l notFound.accs
# 11266
getRna hg17 notFound.accs notIndbSTSFASTA.fa > notInESTs.accs
# 11250 accs found so 16 to find still
awk '{print $1}' notInESTs.accs > accsToFind
# get these sequences from Batch Entrez at NCBI to batchseq.fa
perl /cluster/bin/scripts/convertGbFaFile batchseq.fa > batchseq.convert.fa
# cat sequences together
cat danRerdbSTS.fa notIndbSTSFASTA.fa batchseq.convert.fa \
    > allSTSMarkerSeqs.fa
# split sequences to map these to the genome
mkdir -p /cluster/bluearc/stsMarker/zebrafishMarker/split
cd /cluster/bluearc/stsMarker/zebrafishMarker/split
faSplit sequence \
    /cluster/data/danRer1/bed/STSmarkers/downloads/allSTSMarkerSeqs.fa 50 sts
# do the blat run
ssh kk
cd /cluster/data/danRer1/bed/STSmarkers
mkdir run
cd run
ls -1 /iscratch/i/danRer1/trfFa/*.fa > genome.lst
ls -1S /cluster/bluearc/stsMarker/zebrafishMarker/split/sts*.fa > sts.lst
mkdir -p /cluster/bluearc/danRer1/stsMarkers/sts/out
foreach f (`cat sts.lst`)
    set d = $f:t:r
    mkdir \
        /cluster/bluearc/danRer1/stsMarkers/sts/out/$d
end
# create alignments
cat > template << 'EOF'
#LOOP
/cluster/bin/i386/blat $(path1) $(path2) -ooc=/cluster/bluearc/danRer1/11.ooc -stepSize=5 {check out line+ /cluster/bluearc/danRer1/stsMarkers/sts/out/$(root2)/$(root1).$(root2).psl}
#ENDLOOP
'EOF'
# << for emacs
gensub2 genome.lst sts.lst template jobList
para create jobList
# 15190 jobs
para try, check, push .....
# para time
# Completed: 15190 of 15190 jobs
# CPU time in finished jobs:      61925s    1032.09m    17.20h    0.72d  0.002 y
# IO & Wait Time:                 51578s     859.63m    14.33h    0.60d  0.002 y
# Average job time:                   7s       0.12m     0.00h    0.00d
# Longest job:                       49s       0.82m     0.01h    0.00d
# Submission to last job:           187s       3.12m     0.05h    0.00d
ssh kolossus
cd /cluster/bluearc/danRer1/stsMarkers/sts
pslSort dirs raw.psl temp out/*
rm -rf temp
pslReps -nearTop=0.0001 -minCover=0.6 -minAli=0.8 -noIntrons raw.psl \
    stsMarkers.psl /dev/null
# Processed 2574993 alignments
# Lift them and get them ready to combine with primer alignments
liftUp -nohead stsMarkers.lifted.psl \
    /cluster/data/danRer1/jkStuff/liftAll.lft warn stsMarkers.psl
/cluster/bin/scripts/extractPslInfo stsMarkers.lifted.psl
# creates .initial
/cluster/bin/scripts/findAccession2 -agp stsMarkers.lifted.psl.initial \
    /cluster/data/danRer1
sort -k 4n stsMarkers.lifted.psl.initial.acc > stsMarkers.final
# determine how many were mapped
cut -f 4 stsMarkers.final | sort -n | uniq > stsMarkers.found
wc -l stsMarkers.found
# 12365 stsMarkers.found out of 17311
# retry with less stringent pslReps
pslReps -nearTop=0.0001 -minAli=0.8 -noIntrons raw.psl \
    stsMarkers2.psl /dev/null
# Processed 2574993 alignments
# Lift them and get them ready to combine with primer alignments
liftUp -nohead stsMarkers2.lifted.psl \
    /cluster/data/danRer1/jkStuff/liftAll.lft warn stsMarkers2.psl
/cluster/bin/scripts/extractPslInfo stsMarkers2.lifted.psl
# creates .initial
/cluster/bin/scripts/findAccession2 -agp stsMarkers2.lifted.psl.initial \
    /cluster/data/danRer1
sort -k 4n stsMarkers2.lifted.psl.initial.acc > stsMarkers2.final
# determine how many were mapped
cut -f 4 stsMarkers2.final | sort -n | uniq > stsMarkers2.found
wc -l stsMarkers2.found
# 16310 stsMarkers2.found out of 17311
# get mappings.txt from ZFIN
ssh kksilo
mkdir /cluster/data/danRer1/bed/STSmarkers/downloads/panels
cd /cluster/data/danRer1/bed/STSmarkers/downloads/panels
foreach m (GAT HS T51 LN54 MGH MOP)
    echo "# Map Name: ${m}" >> $m.txt
    grep $m ../mappings.txt >> $m.txt
end
cat *.txt >> zfin.maps
# 2005-06-13
# use formatZfishSts to bring all information together about STS markers
$HOME/bin/i386/formatZfishSts
# USAGE: formatZfishSts [-verbose=]
mkdir /cluster/data/danRer1/bed/STSmarkers/downloads/test
cd /cluster/data/danRer1/bed/STSmarkers/downloads/test
cat /cluster/data/ncbi/UniSTS.2005-04-12/danRer/*.txt >> UniSTSmaps.txt
awk 'BEGIN {FS="\t"} {print $2;}' UniSTSmaps.txt > UniSTSmaps.names
sort UniSTSmaps.names | uniq > UniSTSmaps.names.sort
# remove blank line at top
awk 'BEGIN {FS="\t"} {print $3;}' markers.format | sort | uniq \
    > UniSTS.names.sort
comm -12 UniSTS.names.sort UniSTSmaps.names.sort > UniSTSnamesvsmaps
comm -13 UniSTS.names.sort UniSTSmaps.names.sort > onlyinUniSTSmaps.names
awk 'BEGIN {FS="\t"} {print $4;}' markers.format | sort | uniq \
    > ZFIN.names.sort
comm -12 ZFIN.names.sort onlyinUniSTSmaps.names > ZFINnamesvsmaps
# 14350 ZFINnamesvsmaps
# 47197 onlyinUniSTSmaps.names
comm -13 ZFIN.names.sort onlyinUniSTSmaps.names > onlyinUniSTSmaps.names2
# 32847 onlyinUniSTSmaps.names2
awk 'BEGIN {FS="\t"} {print $6;}' markers.format > aliases.names
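# (On the comm bookkeeping above: comm -12 keeps names common to both
# sorted inputs and comm -13 those only in the second, so the two
# outputs partition onlyinUniSTSmaps.names cleanly:
# 14350 + 32847 = 47197.)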
    # get mappings.txt from ZFIN
    ssh kksilo
    mkdir /cluster/data/danRer1/bed/STSmarkers/downloads/panels
    cd /cluster/data/danRer1/bed/STSmarkers/downloads/panels
    foreach m (GAT HS T51 LN54 MGH MOP)
        echo "# Map Name: ${m}" >> $m.txt
        grep $m ../mappings.txt >> $m.txt
    end
    cat *.txt >> zfin.maps
    # (2005-06-13) use formatZfishSts to bring together all the information
    # about the STS markers
    $HOME/bin/i386/formatZfishSts
    # USAGE: formatZfishSts [-verbose=]
    mkdir /cluster/data/danRer1/bed/STSmarkers/downloads/test
    cd /cluster/data/danRer1/bed/STSmarkers/downloads/test
    cat /cluster/data/ncbi/UniSTS.2005-04-12/danRer/*.txt >> UniSTSmaps.txt
    awk 'BEGIN {FS="\t"} {print $2;}' UniSTSmaps.txt > UniSTSmaps.names
    sort UniSTSmaps.names | uniq > UniSTSmaps.names.sort
    # remove the blank line at the top
    awk 'BEGIN {FS="\t"} {print $3;}' markers.format | sort | uniq \
        > UniSTS.names.sort
    comm -12 UniSTS.names.sort UniSTSmaps.names.sort > UniSTSnamesvsmaps
    comm -13 UniSTS.names.sort UniSTSmaps.names.sort > onlyinUniSTSmaps.names
    awk 'BEGIN {FS="\t"} {print $4;}' markers.format | sort | uniq \
        > ZFIN.names.sort
    comm -12 ZFIN.names.sort onlyinUniSTSmaps.names > ZFINnamesvsmaps
    # 14350 ZFINnamesvsmaps
    # 47197 onlyinUniSTSmaps.names
    comm -13 ZFIN.names.sort onlyinUniSTSmaps.names > onlyinUniSTSmaps.names2
    # 32847 onlyinUniSTSmaps.names2
    awk 'BEGIN {FS="\t"} {print $6;}' markers.format > aliases.names

# ENABLE AUTO BUILD MGC GENES TRACK (DONE markd)
    - add the following lines to the danRer1 entry in
      /cluster/data/genbank/etc/genbank.conf:
        danRer1.mgcTables.default = full
        danRer1.mgcTables.mgc = all
    - wait one day for the nightly build to align and load them into the db
    - rebuild trackDb

# LIFTOVER CHAINS TO DANRER2 (DONE, 2005-07-20, hartera)
# Added path for makeLo scripts (DONE, 2005-07-28, hartera)
# CLEAN UP OF DIRECTORIES AND FILES (DONE, 2006-08-11, hartera)
    # Create liftOver chains for only chr1-25 because chrFinished, chrUn
    # and chrNA will cause too many false alignments.
    # run alignment
    # NOTE: the split of danRer2 to /iscratch/i is doc'ed in makeDanRer2.doc
    # make a nibDir with no chrUn and chrNA
    ssh kkr1u00
    mkdir /iscratch/i/danRer1/nibNoUnandNA
    cp /iscratch/i/danRer1/nib/* /iscratch/i/danRer1/nibNoUnandNA/
    cd /iscratch/i/danRer1/nibNoUnandNA/
    rm chrUn.nib chrNA.nib
    iSync
    mkdir /iscratch/i/danRer1/nibUnandNA
    cp /iscratch/i/danRer1/nib/chrUn.nib /iscratch/i/danRer1/nibUnandNA/
    cp /iscratch/i/danRer1/nib/chrNA.nib /iscratch/i/danRer1/nibUnandNA/
    iSync
    ssh kk
    cd /cluster/data/danRer1
    set dir = /cluster/bin/scripts
    # run 1-25, Finished and M for danRer1 vs 1-25 and M for danRer2
    $dir/makeLoChain-align danRer1 /iscratch/i/danRer1/nibNoUnandNA \
        danRer2 /iscratch/i/danRer2/liftOver/liftSplit/split
    # run Un and NA for danRer1 separately as these could take a long time
    $dir/makeLoChain-align danRer1 /iscratch/i/danRer1/nibUnandNA \
        danRer2 /iscratch/i/danRer2/liftOver/liftSplit/split
    cd /cluster/data/danRer1/bed/blat.danRer2.2005-05-24/run
    para try, check, push etc.
    # do this next:
    # ssh kkusr01
    # /cluster/bin/scripts/makeLoChain-lift.csh danRer1 danRer2
    # Created parasol job in bed/blat.danRer1.2005-05-18/run
    cd bed
    rm blat.danRer2
    ln -s blat.danRer2.2005-05-18 blat.danRer2
    cd blat.danRer2/run
    para try, check, push, check etc.
    # para time
    # Completed: 702 of 702 jobs
    # CPU time in finished jobs:  27720091s  462001.51m  7700.03h  320.83d  0.879 y
    # IO & Wait Time:                50264s     837.74m    13.96h    0.58d  0.002 y
    # Average job time:              39559s     659.32m    10.99h    0.46d
    # Longest running job:               0s       0.00m     0.00h    0.00d
    # Longest finished job:         156623s    2610.38m    43.51h    1.81d
    # Submission to last job:       541963s    9032.72m   150.55h    6.27d
    ssh kk
    cd /cluster/data/danRer1
    set dir = /cluster/bin/scripts
    $dir/makeLoChain-align danRer1 /iscratch/i/danRer1/nib \
        danRer2 /iscratch/i/danRer2/liftOver/liftSplit/split/UnandNA
    # Created parasol job in bed/blat.danRer1.2005-05-31/run
    cd bed/blat.danRer2.2005-05-31/run
    para try, check, push, check etc.
    # Lift the results
    # the lift directory was defined in makeHg17.doc when the split was
    # performed; this expects data in bed/blat.hg17, so a symlink must
    # be there
    # use kolossus for speed
    ssh kolossus
    cd /cluster/data/danRer1/bed
    set dir = /cluster/bin/scripts
    # for the script to work, the directory must have today's date
    mv blat.danRer2.2005-05-18 blat.danRer2.2005-07-19
    cd /cluster/data/danRer1/bed/blat.danRer2.2005-07-19
    $dir/makeLoChain-lift danRer1 danRer2 \
        /iscratch/i/danRer2/liftOver/liftSplit/lift >&! lift.log &
    tail -100f lift.log
    cd ..
    mv blat.danRer2.2005-07-19 blat.danRer2.2005-05-18
    # do this manually as the script stops after chr1 - the same problem
    # was seen with mm5->mm6
    # when all the runs are finished, combine all the results into the raw
    # dir in blat.danRer2 and do the lift
    ssh kk
    cd /cluster/data/danRer1/bed/blat.danRer2/raw
    bash    # if not working in bash already
    for nib in `ls /cluster/data/danRer2/nib`; do
        chrom=${nib%.nib}
        echo $chrom
        liftUp -pslQ ../psl/${chrom}.psl \
            /iscratch/i/danRer2/liftOver/liftSplit/lift/${chrom}.lft \
            warn chr*_${chrom}.psl
        echo done $chrom
    done
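    # (illustration, not from the original run) for one chromosome the loop
    # above expands to:
    #   liftUp -pslQ ../psl/chr1.psl \
    #       /iscratch/i/danRer2/liftOver/liftSplit/lift/chr1.lft \
    #       warn chr*_chr1.psl
    # -pslQ lifts the query side of each psl (the split danRer2 pieces) back
    # to whole-chromosome danRer2 coordinates, leaving the danRer1 target
    # side untouched.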
    rm ../psl/chrUn.psl ../psl/chrNA.psl
    # Un and NA are empty psls as these were not run
    ssh kk9
    set dir = /cluster/bin/scripts
    cd /cluster/data/danRer1/bed
    ln -s blat.danRer2.2005-05-18 blat.danRer2.2005-07-19
    $dir/makeLoChain-chain danRer1 /cluster/data/danRer1/nib danRer2 \
        /cluster/data/danRer2/nib
    cd /cluster/data/danRer1/bed/blat.danRer2.2005-07-19/chainRun
    para try, check, push, check etc.
    # para time
    # Completed: 26 of 26 jobs
    # CPU time in finished jobs:       2501s      41.68m     0.69h    0.03d  0.000 y
    # IO & Wait Time:                  1773s      29.55m     0.49h    0.02d  0.000 y
    # Average job time:                 164s       2.74m     0.05h    0.00d
    # Longest running job:                0s       0.00m     0.00h    0.00d
    # Longest finished job:             210s       3.50m     0.06h    0.00d
    # Submission to last job:           504s       8.40m     0.14h    0.01d
    # make the alignment net; needs access to kkusr01
    ssh kolossus
    cd /cluster/data/danRer1/bed/blat.danRer2
    # remove chrFinished - this also has Ns between contigs and could
    # cause false alignments due to alignments bridging contigs
    rm /cluster/data/danRer1/bed/blat.danRer2/chainRaw/chrFinished.chain
    # run the programs as in /cluster/bin/scripts/makeLoChain-net
    # the script aborts if it is not run on the fileserver for this
    # directory, and there is no local login to kkusr01 at the moment
    # sort chains
    rm -fr chain
    mkdir chain
    chainMergeSort chainRaw/*.chain | chainSplit chain stdin
    rm ./chain/chrFinished.chain
    # create the net
    rm -fr net
    mkdir net
    cd chain
    foreach i (*.chain)
        echo $i:r
        chainNet $i /cluster/data/danRer1/chrom.sizes \
            /cluster/data/danRer2/chrom.sizes ../net/$i:r.net /dev/null
        echo done $i
    end
    rm -fr ../over
    mkdir ../over
    foreach i (*.chain)
        echo $i:r
        netChainSubset ../net/$i:r.net $i ../over/$i
    end
    cat ../over/*.chain > ../danRer1ToDanRer2.over.chain
    mkdir -p /cluster/data/danRer1/bed/bedOver
    cp ../danRer1ToDanRer2.over.chain /cluster/data/danRer1/bed/bedOver/

    # load into the database and copy to the download directory
    ssh hgwdev
    # makeLoChain-load does not make the database entry so do this manually
    # save to the download area
    mkdir -p /usr/local/apache/htdocs/goldenPath/danRer1/liftOver
    cd /usr/local/apache/htdocs/goldenPath/danRer1
    gzip -c /cluster/data/danRer1/bed/bedOver/danRer1ToDanRer2.over.chain \
        > liftOver/danRer1ToDanRer2.over.chain.gz
    # load into the database
    mkdir -p /gbdb/danRer1/liftOver
    rm -f /gbdb/danRer1/liftOver/danRer1ToDanRer2.over.chain
    ln -s /cluster/data/danRer1/bed/bedOver/danRer1ToDanRer2.over.chain \
        /gbdb/danRer1/liftOver
    # need to add an entry to hgcentraltest.liftOverChain
    echo 'insert into liftOverChain values("danRer1", "danRer2", "/gbdb/danRer1/liftOver/danRer1ToDanRer2.over.chain", "0.95", "0", "0", "N", "1", "N");' \
        | hgsql hgcentraltest
    # check the entry in the db
    echo 'select * from liftOverChain where fromDb = "danRer1";' \
        | hgsql hgcentraltest
    # add an entry to the pushQ in the liftOver subqueue - liftOverQueue
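    # To sanity-check the loaded chain, a few danRer1 regions can be lifted
    # with the liftOver command-line tool (a minimal sketch; test.bed is a
    # hypothetical file of danRer1 BED regions):
    #   liftOver test.bed \
    #       /cluster/data/danRer1/bed/bedOver/danRer1ToDanRer2.over.chain \
    #       test.danRer2.bed test.unmapped.bed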
    # clean up: remove the old directories for the blat to danRer2
    # (hartera, 2006-08-11)
    ssh kkstore03
    cd /cluster/data/danRer1/bed
    rm -r blat.danRer2.2005-05-24
    rm -r blat.danRer2.2005-05-31
    cd blat.danRer2.2005-05-18
    rm -r chainRaw chainRun raw run
    rm lift.log
    gzip chain/*.chain
    gzip net/*.net
    gzip psl/*.psl
    gzip over/*.chain
    gzip danRer1ToDanRer2.over.chain

#########################################################################
# ARCHIVING danRer1 (in progress, 2006-05-03, hartera)
    ssh hgwdev
    # remove the Blat servers for danRer1 on hgwbeta and the RR
    hgsql -h hgwbeta -e 'delete from blatServers where db = "danRer1";' \
        hgcentralbeta
    hgsql -h genome-centdb -e 'delete from blatServers where db = "danRer1";' \
        hgcentral
    # set active = 0 for danRer1 on hgwbeta and the RR
    hgsql -h hgwbeta -e 'update dbDb set active = 0 where name = "danRer1";' \
        hgcentralbeta
    hgsql -h genome-centdb -e \
        'update dbDb set active = 0 where name = "danRer1";' hgcentral
#########################################################################
## Reorder Fish organisms (DONE - 2006-12-22 - Hiram)
    hgsql -h genome-testdb hgcentraltest \
        -e "update dbDb set orderKey = 453 where name = 'danRer1';"
##########################################################################
# GenBank gbMiscDiff table (markd 2007-01-10)
# Supports the `NCBI Clone Validation' section of the mgcGenes details page
    # genbank release 157.0 now contains misc_diff fields for MGC clones
    # reloading the mRNAs results in the gbMiscDiff table being created
    ./bin/gbDbLoadStep -reload -srcDb=genbank -type=mrna danRer1
##########################################################################
# CLEANUP (DONE, 2007-03-27, hartera)
    ssh kkstore04
    cd /cluster/data/danRer1/bed
    # quality scores were not used as there is no way to relate them to
    # the chroms
    du -sh quality
    # 9.7G    quality/
    nice rm -r quality
    mv /cluster/store8/danRer1/bed/blastzSelf.2004-10-06 \
        /cluster/data/danRer1/bed/
    # then re-make the symbolic link to this directory
    rm blastzSelf
    ln -s /cluster/data/danRer1/bed/blastzSelf.2004-10-06 \
        /cluster/data/danRer1/bed/blastzSelf
    # remove the older directory
    du -sh blastzSelf.2004-08-09
    # 21M     blastzSelf.2004-08-09
    rm -r blastzSelf.2004-08-09
    # the STS markers never made it into a track - messy data
    du -sh STSmarkers
    # 1.1G    STSmarkers/
    cd /cluster/store8/danRer1
    rm -r STSmarkers
    cd /cluster/data/danRer1/bed/
    rm STSmarkers
    du -sh ZonLab
    # 112G    ZonLab/
    cd /cluster/data/danRer1/bed/ZonLab
    mv rhSeqAll/rhSeqAll.zip .
    rm -r rhSeqAll
    # du -sh tblastn
    # 103G    tblastn
    cd tblastn
    nice rm -r tmp psl
    rm kgpephg16table.notinfile hg16kgproseqs* *.tab
    # remove the *.out files of blast output as we have the *.format files,
    # and gzip the *.format files
    foreach f (./blastout/*.out)
        nice rm -r $f
    end
    foreach f (./blastout/*.format)
        nice gzip $f
    end
    # BACENDS CLEANUP
    cd /cluster/data/danRer1/bed/bacends
    rm out log jobList *.lst bed.tab pslCheck.log pairsNoReps.bed template
    rm bacEndsPsl*
    # bacEnds.lifted.psl is just the lifted version of bacEnds.psl
    rm bacEnds.psl
    nice gzip bacEnds.lifted.psl
    rm singles/bacEnds*
    rm pairs/bacEnds* pairs/psl.tab.gz
    rm duplicates/bed.tab.gz duplicates/test.gz
    cd scores
    rm bed.tab.gz error.log.gz bacEndPairs.out* singles* pairs* badPairs.hits*
    cd ../cloneandStsAliases
    rm bacClones* bacEnds.* bacPrs.* *.sort markers* sanger* xref.* *.tab
    rm -r old test1 test2 testTables testUniSTS tmp1
    cd /cluster/data/danRer1/bed/
    du -sh ZonLab
    # 44G     ZonLab/
    # removed 59 GB
    # move the presentations out to another directory
    ssh hgwdev
    mv /cluster/data/danRer1/bed/ZonLab/presn/* \
        /cse/staff/hartera/presentations/
    rm -r /cluster/data/danRer1/bed/ZonLab/presn
##########################################################################
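# (addendum, a sketch - not part of the original cleanup) to audit what
# still occupies space under the bed directory after these removals:
#   du -sk /cluster/data/danRer1/bed/* | sort -n | tail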