# for emacs: -*- mode: sh; -*-

# Danio Rerio (zebrafish) from Sanger, version Zv5 (released 5/20/05)
# Project website:
#   http://www.sanger.ac.uk/Projects/D_rerio/
# Assembly notes:
#   http://www.sanger.ac.uk/Projects/D_rerio/Zv5_assembly_information.shtml

# DOWNLOAD SEQUENCE (DONE, 2005-06-06, hartera)
# MOVE DANRER3 DIRECTORY AND CONTENTS TO STORE11 AS STORE9 IS FULL
# (DONE, 2005-07-22, hartera)
    ssh kkstore01
    mkdir /cluster/store9/danRer3
    ln -s /cluster/store9/danRer3 /cluster/data
    cd /cluster/data/danRer3
    wget --timestamp \
        ftp://ftp.ensembl.org/pub/assembly/zebrafish/Zv5release/README
    wget --timestamp \
        ftp://ftp.ensembl.org/pub/assembly/zebrafish/Zv5release/Zv5.stats
    wget --timestamp \
        ftp://ftp.ensembl.org/pub/assembly/zebrafish/Zv5release/Zv5.chunks.agp
    wget --timestamp \
        ftp://ftp.ensembl.org/pub/assembly/zebrafish/Zv5release/Zv5.scaffolds.agp
    wget --timestamp \
        ftp://ftp.ensembl.org/pub/assembly/zebrafish/Zv5release/Zv5.fa

    # 2005-07-22 MOVE danRer3
    # store9 is 100% full, move danRer3 to store11 which is 10% full
    ssh kkstore02
    cd /cluster/store9
    nohup nice mv danRer3 /cluster/store11 &
    # make link to /cluster/data/danRer3
    ln -s /cluster/store11/danRer3 /cluster/data

# DOWNLOAD MITOCHONDRION GENOME SEQUENCE (DONE, 2005-06-13, hartera)
    ssh kkstore01
    mkdir -p /cluster/data/danRer3/M
    cd /cluster/data/danRer3/M
    # go to http://www.ncbi.nih.gov/ and search Nucleotide for
    # "Danio mitochondrion genome". That shows the gi number:
    # 8576324 for the accession, AC024175
    # Use that number in the entrez linking interface to get fasta:
    wget -O chrM.fa \
        'http://www.ncbi.nlm.nih.gov/entrez/query.fcgi?cmd=Text&db=Nucleotide&uid=8576324&dopt=FASTA'
    # Edit chrM.fa: make sure the header line says it is the
    # Danio rerio mitochondrion complete genome, and then replace the
    # header line with just ">chrM".
    perl -pi.bak -e 's/>.+/>chrM/' chrM.fa
    rm *.bak
    # Make a "pseudo-contig" for processing chrM too:
    mkdir ./chrM_1
    sed -e 's/chrM/chrM_1/' ./chrM.fa > ./chrM_1/chrM_1.fa
    mkdir ./lift
    echo "chrM_1/chrM_1.fa.out" > ./lift/oOut.lst
    echo "chrM_1" > ./lift/ordered.lst
    echo "0 M/chrM_1 16596 chrM 16596" > ./lift/ordered.lft
    # make sure this is tab delimited
    # create a .agp file for chrM as hgGoldGapGl and other
    # programs require a .agp file so create chrM.agp
cat << '_EOF_' > ./chrM.agp
chrM	1	16596	1	F	AC024175.3	1	16596	+
'_EOF_'
    # Create a chrM.chunks.agp
    mkdir -p /cluster/data/danRer3/M/agps
    cd /cluster/data/danRer3/M/agps
    awk 'BEGIN {OFS="\t"} \
        {print $1, $2, $3, $4, $5, $6, $7, $8, $1, $7, $8}' ../chrM.agp \
        > chrM.chunks.agp
    # make sure that all these above files are tab delimited

# Create list of chromosomes (DONE, 2005-06-08, hartera)
    ssh kkstore01
    cd /cluster/data/danRer3
    awk '{if ($1 !~ /Zv5/) print $1;}' Zv5.scaffolds.agp \
        | sort -n | uniq > chrom.lst
    cp chrom.lst chrom1to25.lst
    # add chrM
    echo "M" >> chrom.lst
    # add chrUn
    echo "Un" >> chrom.lst
    # add NA
    echo "NA" >> chrom.lst

# MAKE JKSTUFF AND BED DIRECTORIES (DONE, 2005-06-09, hartera)
    ssh kkstore01
    cd /cluster/data/danRer3
    # This used to hold scripts -- better to keep them inline here.
    # Now it should just hold lift file(s) and
    # temporary scripts made by copy-paste from this file.
    mkdir /cluster/data/danRer3/jkStuff
    # This is where most tracks will be built:
    mkdir /cluster/data/danRer3/bed
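    # Sanity-check sketch (a convenience check, not part of the original
    # build log): confirm the hand-made chrM agp files are tab-delimited
    # with the expected field counts (9 for the scaffold-style agp, 11 for
    # the chunks agp), and that chrom.lst has its 28 entries (chr1-25
    # plus M, Un and NA).
    awk -F'\t' 'NF != 9' M/chrM.agp
    awk -F'\t' 'NF != 11' M/agps/chrM.chunks.agp
    # both should print nothing
    wc -l chrom.lst
    # expect: 28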
# GET ADDITIONAL ZEBRAFISH REPBASE LIBRARY FOR REPEATMASKER
# (DONE, 2005-05-10, hartera)
    # Go to http://www.girinst.org/server/RepBase/RepBase10.04.fasta
    # and download zebunc.ref containing unclassified zebrafish repeats.
    # Need username and password.
    # Copy to /cluster/bluearc/RepeatMasker/Libraries/
    ssh hgwdev
    cd /cluster/bluearc/RepeatMasker/Libraries/
    perl -pi.bak -e 's/>(Dr[0-9]+)/>$1#Unknown \@danio [S:]/' zebunc.ref
    # add to RepeatMasker library
    cat zebunc.ref >> RepeatMasker.lib
    # This is all in: /cluster/bluearc/RepeatMasker050305/Libraries

# CHECK AGP FILES AND FASTA SIZE CONSISTENCY (DONE, 2005-06-10, hartera)
    # The script, createAgpWithGaps.pl (see next section for creating
    # agps and FASTAs for chrNA and chrUn), was used to create a scaffolds
    # agp file for chrUn to test the program. The agp output was compared
    # to that from scaffoldFaToAgp, and a difference was found: the
    # scaffoldFaToAgp output used 990568 as the end co-ordinate for
    # Zv5_scaffold1475 instead of 976101 as in the output from the script,
    # so the co-ordinate numbering is different from there on. The program
    # scaffoldFaToAgp creates the agp file from the FASTA file,
    # so perhaps the sequence is a different size than stated in the agp
    # file. Get the sequence and find its size:
    ssh kkstore01
    cd /cluster/data/danRer3
    mkdir test
    cd test
    faOneRecord ../Zv5.fa Zv5_scaffold1475 > Zv5_scaffold1475.fa
    faSize Zv5_scaffold1475.fa
    # 990568 bases
    rm Zv5_scaffold1475.fa
    # reported this inconsistency to Mario Caccamo at Sanger
    # mc2@sanger.ac.uk (2005-06-09) and new scaffolds and chunks agp files
    # were sent on 2005-06-10. There was a chunk (contig) missing from the
    # chunks agp file and the scaffold therefore had the wrong end
    # co-ordinate in the agp files.
    # check all sizes of scaffold sequences against those in the agp files
    ssh kkr1u00
    cd /cluster/data/danRer3
    mkdir -p /iscratch/i/danRer3/scaffolds
    cp Zv5.fa /iscratch/i/danRer3/scaffolds/
    iSync
    ssh kk
    mkdir -p /cluster/data/danRer3/scaffolds/run
    cd /cluster/data/danRer3/scaffolds/run
    grep '>' ../Zv5.fa | sed -e 's/>//' > Zv5.scaffolds.lst
cat << '_EOF_' > getSizes.csh
#!/bin/csh -fe
set dir=/cluster/bluearc/danRer3/scaffolds
faOneRecord /iscratch/i/danRer3/scaffolds/Zv5.fa $1 > $dir/$1.fa
echo $1 >> $dir/$1.size
faSize $dir/$1.fa >> $dir/$1.size
rm $dir/$1.fa
'_EOF_'
    # << this line makes emacs coloring happy
    chmod +x getSizes.csh
cat << '_EOF_' > gsub
#LOOP
getSizes.csh $(path1)
#ENDLOOP
'_EOF_'
    # << this line makes emacs coloring happy
    gensub2 Zv5.scaffolds.lst single gsub jobList
    para create jobList
    para try,check,push,check etc...
    ssh kkstore01
    cd /cluster/bluearc/danRer3/scaffolds
    foreach f (*.size)
        cat $f >> Zv5.scaffolds.sizes
    end
    cd /cluster/data/danRer3/scaffolds
    mv /cluster/bluearc/danRer3/scaffolds/Zv5.scaffolds.sizes .
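    # A single-machine alternative (sketch, not what was run): faSize
    # -detailed emits one "name<TAB>size" line per record in one pass,
    # which avoids the per-scaffold cluster job.
    faSize -detailed ../Zv5.fa > Zv5.scaffolds.detailed.sizes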
# Check that these sizes correspond to the sizes in the scaffolds agp file
# use script compareSizes.pl
cat << '_EOF_' > compareSizes.pl
#!/usr/bin/perl -w
use strict;

my ($file, $agp);
$file = $ARGV[0];
$agp = $ARGV[1];

open(FILE, $file) || die "Can not open $file: $!\n";
open(AGP, $agp) || die "Can not open $agp: $!\n";
open(OUT, ">log.txt") || die "Can not create log.txt: $!\n";

# read the name/size pairs written by getSizes.csh
my ($l, $name, $size, %scafsHash);
while (<FILE>) {
    $l = $_;
    if ($l =~ /^(Zv5_(scaffold|NA)[0-9]+)/) {
        $name = $1;
    }
    elsif ($l =~ /^([0-9]+)\sbases/) {
        $size = $1;
        $scafsHash{$name} = $size;
    }
}
close FILE;

# compare each agp end co-ordinate to the sequence size
while (<AGP>) {
    my ($line, @fi, $scaf, $end);
    $line = $_;
    @fi = split(/\t/, $line);
    $scaf = $fi[5];
    $end = $fi[7];
    if (exists($scafsHash{$scaf})) {
        if ($scafsHash{$scaf} eq $end) {
            print OUT "$scaf - ok\n";
        }
        else {
            print OUT "$scaf - different size to sequence\n";
        }
    }
    else {
        print OUT "$scaf - does not exist in list of sizes\n";
    }
}
close AGP;
close OUT;
'_EOF_'
    # << happy emacs
    chmod +x compareSizes.pl
    perl compareSizes.pl Zv5.scaffolds.sizes ../Zv5.scaffolds.agp
    # the only lines where no ID was found in the list of scaffolds with
    # sizes were those lines for gaps.
    grep "different" log.txt
    # Zv5_scaffold1475 - different size to sequence
    # so only this scaffold is a different size in the agp to the sequence
    # need to check that sizes are consistent between agp files
    # check also new agp file for scaffolds - newAgps/Zv5.scaffolds.agp
    perl compareSizes.pl Zv5.scaffolds.sizes ../newAgps/Zv5.scaffolds.agp
    # these are all consistent with the sequence sizes
    cd /cluster/data/danRer3/newAgps/
    # print out scaffold names where the co-ordinates are not consistent
    # with the sizes given
    awk '{if ($6 ~ /^Zv5/ && (($3-$2+1) != $8)) print $6;}' Zv5.scaffolds.agp \
        > Zv5.scaffolds.coordCheck
    # this file is empty so they are ok. do the same for the chunks.agp file
    awk '{if ($6 ~ /^Zv5/ && (($3-$2+1) != $8)) print $6;}' Zv5.chunks.agp \
        > Zv5.chunks.coordCheck
    # also empty so ok.
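    # Equivalent one-pass check (sketch, not what was run): with a
    # two-column name/size file such as the faSize -detailed output above,
    # awk can join sizes directly against the agp end co-ordinates.
    awk 'NR==FNR {size[$1]=$2; next} \
         ($6 in size) && size[$6] != $8 {print $6}' \
        ../scaffolds/Zv5.scaffolds.detailed.sizes Zv5.scaffolds.agp
    # prints nothing for the fixed agp; on the original
    # ../Zv5.scaffolds.agp it flags Zv5_scaffold1475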
    # check that the difference between $7 and $8 is the
    # same as the difference between $11 and $12 fields
    awk '{if ($6 != 5000 && (($8 - $7) != ($12 - $11))) print $6;}' \
        Zv5.chunks.agp > Zv5.chunks.coordCheck2
    # these are all ok
    rm Zv5.*.coord*
cat << '_EOF_' > checkSizesInAgps.pl
#!/usr/bin/perl -w
use strict;

my ($ch, $sc, %scafsHash);
$sc = $ARGV[0];    # scaffolds agp
$ch = $ARGV[1];    # chunks or contigs agp

open(SCAFS, $sc) || die "Can not open $sc: $!\n";
open(CHUNKS, $ch) || die "Can not open $ch: $!\n";

# store the end co-ordinate for each scaffold from the scaffolds agp
while (<SCAFS>) {
    my ($l, @f, $name, $e);
    $l = $_;
    @f = split(/\t/, $l);
    if ($f[5] =~ /^Zv5/) {
        $name = $f[5];
        $e = $f[2];
        $scafsHash{$name} = $e;
    }
}
close SCAFS;

# walk the chunks agp and check the last end seen for each scaffold
my $scaf = "";
my $prev = "";
my $prevEnd = 0;
while (<CHUNKS>) {
    my ($line, @fi);
    $line = $_;
    @fi = split(/\t/, $line);
    if ($fi[5] ne "5000") {
        $scaf = $fi[9];
        if (($scaf ne $prev) && ($prev ne "")) {
            checkCoords($prev, $prevEnd);
        }
        $prev = $scaf;
        $prevEnd = $fi[2];
    }
}
# check last entry in file
checkCoords($prev, $prevEnd);
close CHUNKS;

sub checkCoords {
    my ($name, $end) = @_;
    if (exists($scafsHash{$name})) {
        if ($scafsHash{$name} != $end) {
            print "Scaffold $name is not consistent between agps\n";
        }
        else {
            print "Scaffold $name - ok\n";
        }
    }
}
'_EOF_'
    # << happy emacs
    chmod +x checkSizesInAgps.pl
    checkSizesInAgps.pl Zv5.scaffolds.agp Zv5.chunks.agp \
        > Zv5.scafsvschunks
    grep "not consistent" Zv5.scafsvschunks
    # no lines where inconsistency was reported
    wc -l Zv5.scafsvschunks
    # 16214 Zv5.scafsvschunks
    grep "Zv5" Zv5.scaffolds.agp | wc -l
    # 16214
    # so all the scaffolds were checked and were ok.
    cd /cluster/data/danRer3
    mv ./newAgps/Zv5.scaffolds.agp .
    mv ./newAgps/Zv5.chunks.agp .
    mv ./scaffolds/compareSizes.pl ./jkStuff/
    mv ./newAgps/checkSizesInAgps.pl ./jkStuff/
    rm -r newAgps

# SPLIT AGP FILES BY CHROMOSOME (DONE, 2005-06-13, hartera)
# FASTA WAS CREATED USING SCAFFOLDS AGP
    ssh kkstore01
    cd /cluster/data/danRer3
    # There are 2 .agp files: one for scaffolds (supercontigs on danRer1)
    # and one for chunks (contigs on danRer1) showing how they map on to
    # scaffolds.
    # get list of scaffolds from FASTA file and check these are in agp
    grep '>' Zv5.fa | sed -e 's/>//' | sort | uniq > Zv5FaScafs.lst
    # get list of scaffolds from agp - do not print from gap lines
    awk '{if ($7 !~ /contig/) print $6;}' Zv5.scaffolds.agp \
        | sort | uniq > Zv5AgpScafs.lst
    diff Zv5FaScafs.lst Zv5AgpScafs.lst
    # no difference so all scaffolds are in the FASTA file
    # add "chr" prefix for the agp files
    perl -pi -e 's/^([0-9]+)/chr$1/' ./*.agp
    # for chromosomes:
    foreach c (`cat chrom1to25.lst`)
        echo "Processing $c ..."
        mkdir $c
        perl -we "while(<>){if (/^chr$c\t/) {print;}}" \
            ./Zv5.chunks.agp \
            > $c/chr$c.chunks.agp
        perl -we "while(<>){if (/^chr$c\t/) {print;}}" \
            ./Zv5.scaffolds.agp \
            > $c/chr$c.scaffolds.agp
    end

# CREATE AGP FILES FOR chrNA AND chrUn (DONE, 2005-06-13, hartera)
    ssh kkstore01
    # chrNA consists of WGS contigs that could not be related to any
    # FPC contig; these scaffolds and contigs are named Zv5_NAn in the
    # first field of the agp files
    cd /cluster/data/danRer3
    mkdir ./NA
    awk '{if ($1 ~ /Zv5_NA/) print;}' Zv5.chunks.agp \
        > ./NA/NA.chunks.agp
    awk '{if ($1 ~ /Zv5_NA/) print;}' Zv5.scaffolds.agp \
        > ./NA/NA.scaffolds.agp
    # change the first field to "chrNA" then can use agpToFa to process
    perl -pi.bak -e 's/Zv5_NA[0-9]+/chrNA/' ./NA/*.agp
    # check files and remove backup files
    rm ./NA/*.bak
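    # Quick check sketch (not in the original log): after the
    # substitution, every row of the NA agps should start with "chrNA".
    awk '$1 != "chrNA"' ./NA/*.agp | head
    # expect no output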
    # then process chrUn.
    # Re-make chrUn with new agp files - this is made from scaffolds and
    # contigs where the name is Zv5_scaffoldN in the first field of the
    # agp files. These scaffolds and contigs are unmapped to chromosomes
    # in the agp file. chrUn is made up of WGS scaffolds that mapped to
    # FPC contigs, but the chromosome is unknown.
    rm -r Un
    mkdir ./Un
    awk '{if ($1 ~ /Zv5_scaffold/) print;}' Zv5.chunks.agp \
        > ./Un/Un.chunks.agp
    awk '{if ($1 ~ /Zv5_scaffold/) print;}' Zv5.scaffolds.agp \
        > ./Un/Un.scaffolds.agp
    # change the first field to "chrUn" then can use agpToFa to process
    perl -pi.bak -e 's/Zv5_scaffold[0-9]+/chrUn/' ./Un/*.agp
    # check files and remove backup files
    rm ./Un/*.bak
    # get FASTA file of sequences for NA and Un and create agp with
    # Ns between scaffolds
    # from scaffolds agp, get names of scaffolds to retrieve from the
    # FASTA file to make the NA and Un chromosomes.
    foreach c (NA Un)
        awk '{print $6;}' $c/$c.scaffolds.agp > $c/chr$c.scaffolds.lst
        $HOME/bin/i386/faSomeRecords /cluster/data/danRer3/Zv5.fa \
            $c/chr$c.scaffolds.lst $c/chr$c.fa
    end
    # check that all scaffolds in list are in FASTA file for NA and Un - ok
    # edit scaffoldFaToAgp.c so that it creates agp with 500 Ns between
    # scaffolds as contig gaps for chrNA, and compile. chrNA is already
    # large so the number of Ns is reduced to reduce the size.
    foreach c (NA Un)
        $HOME/bin/i386/scaffoldFaToAgp $c/chr$c.fa
        mv $c/chr$c.fa $c/chr$c.scaffolds.fa
    end
    # change chrUn to chrNA for NA and D to W for NA and Un
    sed -e 's/chrUn/chrNA/' ./NA/chrNA.agp | sed -e 's/D/W/' \
        > ./NA/chrNA.scaffolds.agp
    sed -e 's/D/W/' ./Un/chrUn.agp > ./Un/chrUn.scaffolds.agp
    # edit ./NA/chrNA.scaffolds.agp and ./Un/chrUn.scaffolds.agp and
    # remove the last line as this just adds an extra 500 Ns at the
    # end of the sequence.
    rm ./NA/chrNA.agp ./Un/chrUn.agp
cat << '_EOF_' > /cluster/data/danRer3/jkStuff/createAgpWithGaps.pl
#!/usr/bin/perl
use strict;
# This script takes a chunks agp and inserts Ns between scaffolds for
# the chunks (contigs) agp file. Could also insert Ns between scaffolds
# for scaffolds agp.

my ($name, $prev, $st, $end, $prevEnd, $id);
my $chrom = $ARGV[0];   # chromosome name
my $numN = $ARGV[1];    # number of Ns to be inserted
my $type = $ARGV[2];    # contigs or scaffolds

$prev = "";
$st = 1;
$prevEnd = 0;
$id = 0;

# the agp is supplied on stdin
while (<STDIN>) {
    my $l = $_;
    my @f = split(/\t/, $l);
    if ($type eq "contigs") {
        $name = $f[9];
    }
    else {
        $name = $f[5];
    }
    my $currSt = $f[1];
    my $currEnd = $f[2];
    my $size = $currEnd - $currSt;
    $id++;
    $st = $prevEnd + 1;
    $end = $st + $size;
    # insert a gap line between scaffolds
    if (($prev ne "") && ($prev ne $name)) {
        $st = $prevEnd + 1;
        $end = ($st + $numN) - 1;
        print "$chrom\t$st\t$end\t$id\tN\t$numN\tcontig\tno\n";
        $prevEnd = $end;
        $id++;
    }
    $st = $prevEnd + 1;
    $end = $st + $size;
    print "$chrom\t$st\t$end\t$id\t$f[4]\t$f[5]\t$f[6]\t$f[7]\t$f[8]";
    if ($type eq "contigs") {
        print "\t$f[9]\t$f[10]\t$f[11]";
    }
    $prevEnd = $end;
    $prev = $name;
}
'_EOF_'
    chmod +x /cluster/data/danRer3/jkStuff/createAgpWithGaps.pl
    cd /cluster/data/danRer3
    foreach c (NA Un)
        cd $c
        perl ../jkStuff/createAgpWithGaps.pl chr${c} 500 contigs \
            < ${c}.chunks.agp > chr${c}.chunks.agp
        cd ..
    end
    # check co-ordinates
    # clean up
    foreach c (NA Un)
        rm $c/${c}.scaffolds.agp $c/${c}.chunks.agp $c/chr${c}.scaffolds.fa \
            $c/${c}.scaffolds.lst
    end

# BUILD CHROM-LEVEL SEQUENCE (DONE, 2005-06-13, hartera)
    ssh kkstore01
    cd /cluster/data/danRer3
    # Sequence is already in upper case so no need to change
    foreach c (`cat chrom.lst`)
        echo "Processing ${c}"
        $HOME/bin/i386/agpToFa -simpleMultiMixed $c/chr$c.scaffolds.agp \
            chr$c $c/chr$c.fa ./Zv5.fa
        echo "${c} - DONE"
    end
    # move scaffolds agp to be chrom agp and clean up
    foreach c (`cat chrom.lst`)
        cd $c
        rm *.bak
        cp chr${c}.scaffolds.agp chr${c}.agp
        mkdir -p agps
        mv chr${c}.*.agp ./agps/
        cd ..
    end

# CHECK CHROM AND VIRTUAL CHROM SEQUENCES (DONE, 2005-06-13, hartera)
    # Check that the size of each chromosome .fa file is equal to the
    # last coord of the .agp:
    ssh hgwdev
    cd /cluster/data/danRer3
    foreach c (`cat chrom.lst`)
        foreach f ( $c/chr$c.agp )
            set agpLen = `tail -1 $f | awk '{print $3;}'`
            set h = $f:r
            set g = $h:r
            echo "Getting size of $g.fa"
            set faLen = `faSize $g.fa | awk '{print $1;}'`
            if ($agpLen == $faLen) then
                echo "  OK: $f length = $g length = $faLen"
            else
                echo "ERROR: $f length = $agpLen, but $g length = $faLen"
            endif
        end
    end
    # all are OK so the FASTA files are the expected size

# CREATING DATABASE (DONE, 2005-06-13, hartera)
    # Create the database.
    # next machine
    ssh hgwdev
    echo 'create database danRer3' | hgsql ''
    # if you need to delete that database:  !!! WILL DELETE EVERYTHING !!!
    echo 'drop database danRer3' | hgsql danRer3
    # Delete and re-create database as above (hartera, 2004-11-30)
    # Use df to make sure there is at least 10 gig free on
    df -h /var/lib/mysql
    # Before loading data:
    # Filesystem            Size  Used Avail Use% Mounted on
    # /dev/sdc1             1.8T  927G  734G  56% /var/lib/mysql

# CREATING GRP TABLE FOR TRACK GROUPING (DONE, 2005-06-13, hartera)
    # next machine
    ssh hgwdev
    # the following command copies all the data from the table
    # grp in the database danRer2 to the new database danRer3
    echo "create table grp (PRIMARY KEY(NAME)) select * from danRer2.grp" \
        | hgsql danRer3
    # if you need to delete that table: !!! WILL DELETE ALL grp data !!!
    echo 'drop table grp;' | hgsql danRer3

# BREAK UP SEQUENCE INTO 5MB CHUNKS AT CONTIGS/GAPS FOR CLUSTER RUNS
# (DONE, 2005-06-14, hartera)
    ssh kkstore01
    cd /cluster/data/danRer3
    foreach c (`cat chrom.lst`)
        foreach agp ($c/chr$c.agp)
            if (-e $agp) then
                set fa = $c/chr$c.fa
                echo splitting $agp and $fa
                cp -p $agp $agp.bak
                cp -p $fa $fa.bak
                splitFaIntoContigs $agp $fa . -nSize=5000000
            endif
        end
    end

# MAKE LIFTALL.LFT (DONE, 2005-06-14, hartera)
    ssh kkstore01
    cd /cluster/data/danRer3
    cat */lift/ordered.lft > jkStuff/liftAll.lft
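    # Lift-file sanity sketch (a convenience check, not in the original
    # log): in a lift file, column 4 is the chrom and column 5 the chrom
    # size, so each chrom should appear with exactly one size.
    awk '{print $4, $5}' jkStuff/liftAll.lft | sort -u | awk '{print $1}' \
        | uniq -d
    # expect no output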
# SIMPLE REPEAT [TRF] TRACK (DONE, 2005-06-14, hartera)
    # TRF can be run in parallel with RepeatMasker on the file server
    # since it doesn't require masked input sequence.
    # Run this on the kilokluster. Need to mask contig and chromosome
    # sequences so run trf using contig sequences.
    # First copy over contig sequences to iscratch and then iSync to
    # the cluster.
    ssh kkr1u00
    mkdir -p /iscratch/i/danRer3/contigsNoMask
    cd /cluster/data/danRer3
    foreach d (/cluster/data/danRer3/*/chr*_?{,?})
        set ctg = $d:t
        foreach f ($d/${ctg}.fa)
            echo "Copying $f ..."
            cp $f /iscratch/i/danRer3/contigsNoMask/
        end
    end
    # 288 sequence files
    /cluster/bin/iSync
    ssh kk
    mkdir -p /cluster/data/danRer3/bed/simpleRepeat
    cd /cluster/data/danRer3/bed/simpleRepeat
    mkdir trf
cat << '_EOF_' > runTrf
#!/bin/csh -fe
#
set path1 = $1
set inputFN = $1:t
set outpath = $2
set outputFN = $2:t
mkdir -p /tmp/$outputFN
cp $path1 /tmp/$outputFN
pushd .
cd /tmp/$outputFN
/cluster/bin/i386/trfBig -trf=/cluster/bin/i386/trf $inputFN /dev/null -bedAt=$outputFN -tempDir=/tmp
popd
rm -f $outpath
cp -p /tmp/$outputFN/$outputFN $outpath
rm -fr /tmp/$outputFN/*
rmdir --ignore-fail-on-non-empty /tmp/$outputFN
'_EOF_'
    # << keep emacs coloring happy
    chmod +x runTrf
cat << '_EOF_' > gsub
#LOOP
./runTrf {check in line+ $(path1)} {check out line trf/$(root1).bed}
#ENDLOOP
'_EOF_'
    # << keep emacs coloring happy
    ls -1S /iscratch/i/danRer3/contigsNoMask/chr*.fa > genome.lst
    gensub2 genome.lst single gsub jobList
    # 288 jobs
    para create jobList
    para try, check, push, check etc...
    para time
    # Completed: 288 of 288 jobs
    # CPU time in finished jobs:      70742s    1179.03m    19.65h    0.82d  0.002 y
    # IO & Wait Time:                  1263s      21.05m     0.35h    0.01d  0.000 y
    # Average job time:                 250s       4.17m     0.07h    0.00d
    # Longest running job:                0s       0.00m     0.00h    0.00d
    # Longest finished job:            6722s     112.03m     1.87h    0.08d
    # Submission to last job:         10037s     167.28m     2.79h    0.12d

    # lift up to chrom level
    liftUp simpleRepeat.bed /cluster/data/danRer3/jkStuff/liftAll.lft warn \
        trf/*.bed
    # Load into the database
    ssh hgwdev
    cd /cluster/data/danRer3/bed/simpleRepeat
    hgLoadBed danRer3 simpleRepeat simpleRepeat.bed \
        -sqlTable=$HOME/kent/src/hg/lib/simpleRepeat.sql
    # Loaded 757119 elements of size 16

# PROCESS SIMPLE REPEATS INTO MASK (DONE, 2005-06-14, hartera)
    # After the simpleRepeats track has been built, make a filtered
    # version of the trf output: keep trf's with period <= 12:
    ssh kkstore01
    cd /cluster/data/danRer3/bed/simpleRepeat
    mkdir -p trfMask
    foreach f (trf/chr*.bed)
        awk '{if ($5 <= 12) print;}' $f > trfMask/$f:t
    end
    # Lift up filtered trf output to chrom coords as well:
    cd /cluster/data/danRer3
    mkdir bed/simpleRepeat/trfMaskChrom
    foreach c (`cat chrom.lst`)
        if (-e $c/lift/ordered.lst) then
            perl -wpe 's@(\S+)@bed/simpleRepeat/trfMask/$1.bed@' \
                $c/lift/ordered.lst > $c/lift/oTrf.lst
            liftUp bed/simpleRepeat/trfMaskChrom/chr$c.bed \
                jkStuff/liftAll.lft warn `cat $c/lift/oTrf.lst`
        endif
        if (-e $c/lift/random.lst) then
            perl -wpe 's@(\S+)@bed/simpleRepeat/trfMask/$1.bed@' \
                $c/lift/random.lst > $c/lift/rTrf.lst
            liftUp bed/simpleRepeat/trfMaskChrom/chr${c}_random.bed \
                jkStuff/liftAll.lft warn `cat $c/lift/rTrf.lst`
        endif
    end
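    # Optional coverage check (sketch, not in the original log): see how
    # much sequence the simple repeat track covers, e.g. on chr1.
    ssh hgwdev
    featureBits -chrom=chr1 danRer3 simpleRepeat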
# REPEAT MASKING - Run RepeatMasker on chroms (DONE, 2005-06-15, hartera)
    # When a new library is added for this version of RepeatMasker, need
    # to check in /cluster/bluearc/RepeatMasker/Libraries for a directory
    # named by date, e.g. 20050112 here, and inside this are species
    # directories for which RepeatMasker has already been run. In this
    # directory it creates a specieslib of the danio repeats. If this
    # exists, it is used for the RepeatMasker run for that species, so if
    # new repeats are added to the library they will not get used unless
    # this is deleted and a new specieslib is created from the new
    # library on the first run for danio.
    ssh kkstore01
    rm -r /cluster/bluearc/RepeatMasker/Libraries/20050112/danio/
    cd /cluster/data/danRer3
    #- Split contigs into 500kb chunks, at gaps if possible:
    foreach c (`cat chrom.lst`)
        foreach d ($c/chr${c}*_?{,?})
            cd $d
            echo "splitting $d"
            set contig = $d:t
            ~/bin/i386/faSplit gap $contig.fa 500000 ${contig}_ \
                -lift=$contig.lft -minGapSize=100
            cd ../..
        end
    end
    # For RepeatMasking, use RepeatMasker "open-3.0" with repeat library
    # version RepBase Update 9.11, RM database version 20050112 with the
    # addition of the zebrafish unclassified repeats (zebunc.ref) - see
    # the above section on getting this additional zebrafish RepeatMasker
    # library.
    #- Make the run directory and job list:
    cd /cluster/data/danRer3
cat << '_EOF_' > jkStuff/RMZebrafish
#!/bin/csh -fe
cd $1
pushd .
/bin/mkdir -p /tmp/danRer3/$2
/bin/cp $2 /tmp/danRer3/$2/
cd /tmp/danRer3/$2
/cluster/bluearc/RepeatMasker/RepeatMasker -ali -s -species danio $2
popd
/bin/cp /tmp/danRer3/$2/$2.out ./
if (-e /tmp/danRer3/$2/$2.align) /bin/cp /tmp/danRer3/$2/$2.align ./
if (-e /tmp/danRer3/$2/$2.tbl) /bin/cp /tmp/danRer3/$2/$2.tbl ./
if (-e /tmp/danRer3/$2/$2.cat) /bin/cp /tmp/danRer3/$2/$2.cat ./
/bin/rm -fr /tmp/danRer3/$2/*
/bin/rmdir --ignore-fail-on-non-empty /tmp/danRer3/$2
/bin/rmdir --ignore-fail-on-non-empty /tmp/danRer3
'_EOF_'
    chmod +x jkStuff/RMZebrafish
    mkdir -p RMRun
    cp /dev/null RMRun/RMJobs
    foreach c (`cat chrom.lst`)
        foreach d ($c/chr${c}_?{,?})
            set ctg = $d:t
            foreach f ( $d/${ctg}_?{,?}.fa )
                set f = $f:t
                echo /cluster/data/danRer3/jkStuff/RMZebrafish \
                    /cluster/data/danRer3/$d $f \
                    '{'check out line+ /cluster/data/danRer3/$d/$f.out'}' \
                    >> RMRun/RMJobs
            end
        end
    end
    # Do the run
    ssh kk
    cd /cluster/data/danRer3/RMRun
    para create RMJobs
    para try, para check, para check, para push, para check,...
    para time
    # Completed: 4069 of 4069 jobs
    # CPU time in finished jobs:   13726314s  228771.90m  3812.87h  158.87d  0.435 y
    # IO & Wait Time:                 45762s     762.70m    12.71h    0.53d  0.001 y
    # Average job time:                3385s      56.41m     0.94h    0.04d
    # Longest running job:                0s       0.00m     0.00h    0.00d
    # Longest finished job:            4549s      75.82m     1.26h    0.05d
    # Submission to last job:         56947s     949.12m    15.82h    0.66d
    # This is slow. It should have taken about 5 hours.

    #- Lift up the 500KB chunk .out's to 5MB ("pseudo-contig") level
    ssh kkstore01
    cd /cluster/data/danRer3
    foreach d (*/chr*_?{,?})
        set contig = $d:t
        echo $contig
        liftUp $d/$contig.fa.out $d/$contig.lft warn $d/${contig}_*.fa.out \
            > /dev/null
    end
    #- Lift pseudo-contigs to chromosome level
    foreach c (`cat chrom.lst`)
        echo lifting $c
        cd $c
        if (-e lift/ordered.lft && ! -z lift/ordered.lft) then
            liftUp chr$c.fa.out lift/ordered.lft warn `cat lift/oOut.lst` \
                > /dev/null
        endif
        cd ..
    end

    #- Load the .out files into the database with:
    ssh hgwdev
    cd /cluster/data/danRer3
    hgLoadOut danRer3 */chr*.fa.out -verbose=2
    # bad rep range [689, 602] line 105524 of 16/chr16.fa.out
    # bad rep range [147, 146] line 124027 of 16/chr16.fa.out
    # bad rep range [280, 258] line 754 of 17/chr17.fa.out
    # bad rep range [280, 258] line 76417 of 17/chr17.fa.out
    # bad rep range [314, 311] line 99427 of 19/chr19.fa.out
    # bad rep range [367, 366] line 88398 of 23/chr23.fa.out
    # bad rep range [41, 40] line 51509 of 25/chr25.fa.out
    # bad rep range [1133, 1132] line 62610 of 9/chr9.fa.out
    # bad rep range [6133, 6132] line 122359 of NA/chrNA.fa.out
    # bad rep range [6133, 6132] line 160183 of NA/chrNA.fa.out
    # bad rep range [292, 291] line 252829 of NA/chrNA.fa.out
    # bad rep range [751, 599] line 261276 of NA/chrNA.fa.out
    # bad rep range [360, 359] line 259794 of Un/chrUn.fa.out
    # bad rep range [360, 359] line 259796 of Un/chrUn.fa.out
    # bad rep range [360, 359] line 259798 of Un/chrUn.fa.out
    # bad rep range [1, -56] line 379516 of Un/chrUn.fa.out
    # note: 16 records dropped due to repStart > repEnd
    # check coverage of repeats masked
    # featureBits -chrom=chr1 danRer1 rmsk
    # 11589712 bases of 40488791 (28.624%) in intersection
    # featureBits -chrom=chr1 danRer2 rmsk
    # 26879295 bases of 61678023 (43.580%) in intersection
    # featureBits -chrom=chr1 danRer3 rmsk
    # 25822888 bases of 55805710 (46.273%) in intersection

# MASK SEQUENCE WITH REPEATMASKER AND SIMPLE REPEAT/TRF AND BUILD NIB FILES
# (DONE, 2005-06-15, hartera)
    ssh kkstore01
    cd /cluster/data/danRer3
    # Soft-mask (lower-case) the contig and chr .fa's,
    # then make hard-masked versions from the soft-masked.
    set trfCtg=bed/simpleRepeat/trfMask
    set trfChr=bed/simpleRepeat/trfMaskChrom
    # for the chromosomes:
    foreach f (*/chr*.fa)
        echo "repeat- and trf-masking $f"
        maskOutFa -soft $f $f.out $f
        set chr = $f:t:r
        maskOutFa -softAdd $f $trfChr/$chr.bed $f
        echo "hard-masking $f"
        maskOutFa $f hard $f.masked
    end
    # This warning is extremely rare -- if it indicates a problem, it is
    # only with the repeat annotation and does not affect the masking:
    # repeat- and trf-masking Un/chrUn.fa
    # WARNING: negative rEnd: -56 chrUn:153329594-153329609 MOSAT_DR
    # for the contigs:
    foreach c (`cat chrom.lst`)
        echo "repeat- and trf-masking contigs of chr$c"
        foreach d ($c/chr*_?{,?})
            set ctg=$d:t
            set f=$d/$ctg.fa
            maskOutFa -soft $f $f.out $f
            maskOutFa -softAdd $f $trfCtg/$ctg.bed $f
            maskOutFa $f hard $f.masked
        end
    end
    # same warning here too:
    # repeat- and trf-masking contigs of chrUn
    # WARNING: negative rEnd: -56 chrUn_26:1159145-1159160 MOSAT_DR
    # check percent sequence masked
    faSize /cluster/data/danRer3/1/chr1.fa
    # 55805710 bases (1047706 N's 54758004 real 28887275 upper
    # 25870729 lower)
    # 46% is in lower case so masked
    # for danRer2:
    faSize /cluster/data/danRer2/1/chr1New.fa
    # 62208023 bases (3421437 N's 58786586 real 31874160 upper
    # 26912426 lower)
    # 43% is in lower case so masked
    # Build nib files, using the soft masking in the fa
    mkdir nib
    foreach f (*/chr*.fa)
        faToNib -softMask $f nib/$f:t:r.nib
    end
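    # Spot-check sketch (not in the original log): confirm a nib reads
    # back by extracting its first 10 bases.
    nibFrag nib/chr1.nib 0 10 + stdout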
# STORING O+O SEQUENCE AND ASSEMBLY INFORMATION (DONE, 2005-06-15, hartera)
# Added link from danRer3.2bit file to the danRer3 gbdb directory
# (2005-06-17, hartera)
    # Make symbolic links from /gbdb/danRer3/nib to the real nibs
    ssh hgwdev
    cd /cluster/data/danRer3
    mkdir -p /gbdb/danRer3/nib
    foreach f (/cluster/data/danRer3/nib/chr*.nib)
        ln -s $f /gbdb/danRer3/nib
    end
    # Load /gbdb/danRer3/nib paths into database and save size info
    # hgNibSeq creates chromInfo table
    hgNibSeq -preMadeNib danRer3 /gbdb/danRer3/nib */chr*.fa
    echo "select chrom,size from chromInfo" | hgsql -N danRer3 \
        > chrom.sizes
    # take a look at chrom.sizes, should be 28 lines
    wc chrom.sizes
    # 28      56     409 chrom.sizes
    # Make one big 2bit file as well, and make a link to it in
    # /gbdb/danRer3/nib because hgBlat looks there:
    faToTwoBit */chr*.fa danRer3.2bit
    # add link to this 2bit file from gbdb danRer3 directory (2005-06-17)
    ln -s /cluster/data/danRer3/danRer3.2bit /gbdb/danRer3/
    # also make 2 bit files for chrUn and chrNA later on - need masked seq
    # make 2 bit files for chrUn and chrNA scaffolds (2005-06-17)
    ssh kkstore01
    cd /cluster/data/danRer3
    # make scaffolds files
    foreach c (NA Un)
        cd $c
        echo "Processing $c ..."
        mkdir scafSeqs
        awk '{if ($5 != "N") print $6;}' chr${c}.agp \
            > scafSeqs/scaffolds.lst
        cd ..
    end
    cd /cluster/data/danRer3/NA/scafSeqs
cat << '_EOF_' > getSeqs.csh
#!/bin/csh -fe
set dir=/cluster/bluearc/danRer3/scaffolds
faOneRecord /iscratch/i/danRer3/scaffolds/Zv5.fa $1 > $dir/$1.fa
'_EOF_'
    # << this line makes emacs coloring happy
    chmod +x getSeqs.csh
cat << '_EOF_' > gsub
#LOOP
getSeqs.csh $(path1)
#ENDLOOP
'_EOF_'
    # << this line makes emacs coloring happy
    ssh kk
    cd /cluster/data/danRer3/NA/scafSeqs
    gensub2 scaffolds.lst single gsub jobList
    para create jobList
    para try,check,push,check etc...
    ssh kkstore01
    cd /cluster/bluearc/danRer3/scaffolds
    faToTwoBit ./chrNA/scafSeqs/*.fa danRer3ChrNA.2bit
    faToTwoBit ./chrUn/scafSeqs/*.fa danRer3ChrUn.2bit

# MAKE GOLD AND GAP TRACKS (DONE, 2005-06-15, hartera)
# Add trackDb entry and html page for gold and gap tracks
# (2005-06-16, hartera)
    ssh hgwdev
    cd /cluster/data/danRer3
    # the gold and gap tracks are created from the chrN.agp file and this
    # is the scaffolds or supercontigs agp
    hgGoldGapGl -noGl -chromLst=chrom.lst danRer3 /cluster/data/danRer3 .
    # featureBits danRer3 gold
    # 1630323462 bases of 1630323462 (100.000%) in intersection
    # featureBits danRer2 gold
    # 1560497282 bases of 1560497282 (100.000%) in intersection
    # featureBits danRer1 gold
    # 1459132082 bases of 1459132082 (100.000%) in intersection
    # featureBits danRer3 gap
    # 13709500 bases of 1630323462 (0.841%) in intersection
    # featureBits danRer2 gap
    # 28776000 bases of 1560497282 (1.844%) in intersection
    # featureBits danRer1 gap
    # 64174000 bases of 1459132082 (4.398%) in intersection
    # Add trackDb.ra entries for gold and gap tracks and also create
    # gap.html and gold.html pages.

# MAKE TRACKDB ENTRY FOR DANRER3 (DONE, 2005-06-16, hartera)
    ssh hgwdev
    # Make trackDb table so browser knows what tracks to expect:
    mkdir -p ~/kent/src/hg/makeDb/trackDb/zebrafish/danRer3
    cd ~/kent/src/hg/makeDb/trackDb/zebrafish
    cvs add danRer3
    cvs commit danRer3
    cd ~/kent/src/hg/makeDb/trackDb
    cvs up -d -P
    # Edit the makefile to add danRer3 in all the right places and do
    make update
    make alpha
    cvs commit -m "Added danRer3." makefile

# MAKE DESCRIPTION/SAMPLE POSITION HTML PAGE (DONE, 2005-06-16, hartera)
    ssh hgwdev
    mkdir /cluster/data/danRer3/html
    # make a symbolic link from /gbdb/danRer3/html to
    # /cluster/data/danRer3/html
    ln -s /cluster/data/danRer3/html /gbdb/danRer3/html
    # Add a description page for zebrafish
    cd /cluster/data/danRer3/html
    cp $HOME/kent/src/hg/makeDb/trackDb/zebrafish/danRer2/description.html .
    # Edit this for zebrafish danRer3
    # create a description.html page here
    cd ~/kent/src/hg/makeDb/trackDb/zebrafish/danRer3
    # Add description page here too
    cp /cluster/data/danRer3/html/description.html .
    cvs add description.html
    cvs commit -m "First draft of description page for danRer3." \
        description.html
    cd ~/kent/src/hg/makeDb/trackDb
    make update
    make alpha

# MAKE HGCENTRALTEST ENTRY FOR DANRER3 (DONE, 2005-06-16, hartera)
# UPDATE ENTRY TO ADD DANRER3 TO GENE SORTER (DONE, 2006-06-09, hartera)
    # Make dbDb and defaultDb entries so test browser knows about it:
    ssh hgwdev
    # Add dbDb and defaultDb entries:
    echo 'insert into dbDb (name, description, nibPath, organism, \
          defaultPos, active, orderKey, genome, scientificName, \
          htmlPath, hgNearOk, hgPbOk, sourceName) \
          values("danRer3", "May 2005", \
          "/gbdb/danRer3", "Zebrafish", "chr2:15,906,734-15,926,406", 1, \
          37, "Zebrafish", "Danio rerio", \
          "/gbdb/danRer3/html/description.html", 0, 0, \
          "Sanger Centre, Danio rerio Sequencing Project Zv5");' \
        | hgsql -h genome-testdb hgcentraltest
    # set danRer3 to be the default assembly for Zebrafish
    echo 'update defaultDb set name = "danRer3" \
          where genome = "Zebrafish";' \
        | hgsql -h genome-testdb hgcentraltest
    # Update dbDb entry for danRer3 to add it to Gene Sorter
    # (hartera, 2006-06-09)
    echo 'update dbDb set hgNearOk = 1 where name = "danRer3";' \
        | hgsql -h genome-testdb hgcentraltest

# PUT MASKED SEQUENCE OUT FOR CLUSTER RUNS AND ON BLUEARC
# (DONE, 2005-06-16, hartera)
    ssh kkr1u00
    # Chrom-level mixed nibs that have been repeat- and trf-masked:
    rm -rf /iscratch/i/danRer3/nib
    mkdir -p /iscratch/i/danRer3/nib
    cp -p /cluster/data/danRer3/nib/chr*.nib /iscratch/i/danRer3/nib
    # Pseudo-contig fa that have been repeat- and trf-masked:
    rm -rf /iscratch/i/danRer3/trfFa
    mkdir /iscratch/i/danRer3/trfFa
    foreach d (/cluster/data/danRer3/*/chr*_?{,?})
        cp -p $d/$d:t.fa /iscratch/i/danRer3/trfFa
    end
    rm -rf /iscratch/i/danRer3/rmsk
    mkdir -p /iscratch/i/danRer3/rmsk
    cp -p /cluster/data/danRer3/*/chr*.fa.out /iscratch/i/danRer3/rmsk
    cp -p /cluster/data/danRer3/danRer3.2bit /iscratch/i/danRer3/
    /cluster/bin/iSync
    # add to the bluearc
    ssh kkstore01
    mkdir -p /cluster/bluearc/danRer3/nib
    cp -p /cluster/data/danRer3/nib/chr*.nib /cluster/bluearc/danRer3/nib
    mkdir -p /cluster/bluearc/danRer3/trfFa
    foreach d (/cluster/data/danRer3/*/chr*_?{,?})
        cp -p $d/$d:t.fa /cluster/bluearc/danRer3/trfFa
    end
    cp /cluster/data/danRer3/danRer3.2bit /cluster/bluearc/danRer3/

# ADD CONTIGS TRACK (DONE, 2005-06-16, hartera)
    # make ctgPos2 (contig name, size, chrom, chromStart, chromEnd) from
    # chunks (contigs) agp files.
    ssh kkstore01
    mkdir -p /cluster/data/danRer3/bed/ctgPos2
    cd /cluster/data/danRer3/bed/ctgPos2
    # ctgPos2 .sql .as .c and .h files exist - see makeDanRer1.doc
    foreach c (`cat /cluster/data/danRer3/chrom.lst`)
        awk 'BEGIN {OFS="\t"} \
            {if ($5 != "N") print $6, $3-$2+1, $1, $2-1, $3, $5}' \
            /cluster/data/danRer3/$c/agps/chr${c}.chunks.agp >> ctgPos2.tab
    end
    ssh hgwdev
    cd /cluster/data/danRer3/bed/ctgPos2
    hgsql danRer3 < ~/kent/src/hg/lib/ctgPos2.sql
    echo "load data local infile 'ctgPos2.tab' into table ctgPos2" \
        | hgsql danRer3
    # create trackDb.ra entry and html page for ctgPos2 track.
    # Changed termRegEx for ctgPos2 in trackDb.ra so that it handles
    # contigs named "Zv5_scaffold*". (2006-04-19, hartera)
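    # Load check sketch (not in the original log): the row count in
    # ctgPos2 should match the rows written to ctgPos2.tab.
    wc -l ctgPos2.tab
    hgsql -N -e 'select count(*) from ctgPos2;' danRer3
    # the two counts should agree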
# CREATE gc5Base WIGGLE TRACK (DONE, 2005-06-16, hartera)
# FIX LINK FOR WIB FILES TO POINT TO danRer3 ON store11
# (2005-07-25, hartera)
    ssh kkstore01
    mkdir -p /cluster/data/danRer3/bed/gc5Base
    cd /cluster/data/danRer3/bed/gc5Base
    # The number of bases that hgGcPercent claimed it measured is
    # calculated, which is not necessarily always 5 if it ran into gaps,
    # and then the division by 10.0 scales down the numbers from
    # hgGcPercent to the range [0-100]. wigEncode now replaces
    # wigAsciiToBinary and the previous processing step between these two
    # programs. The result file is *.wig. Each value represents the
    # measurement over five bases beginning with <position>. wigEncode
    # also calculates the zoomed set of data.
    # Uses the 2bit file in /cluster/data/danRer3 as sequence input.
    nice hgGcPercent -wigOut -doGaps -file=stdout -win=5 danRer3 \
        /cluster/data/danRer3 | \
        wigEncode stdin gc5Base.wig gc5Base.wib
    # load the .wig file back on hgwdev:
    ssh hgwdev
    cd /cluster/data/danRer3/bed/gc5Base
    hgLoadWiggle -pathPrefix=/gbdb/danRer3/wib/gc5Base \
        danRer3 gc5Base gc5Base.wig
    # and symlink the .wib file into /gbdb
    # fix link as danRer3 is now on store11 (2005-07-25, hartera)
    rm -r /gbdb/danRer3/wib/gc5Base
    mkdir -p /gbdb/danRer3/wib/gc5Base
    ln -s `pwd`/gc5Base.wib /gbdb/danRer3/wib/gc5Base

# MAKE 10.OOC, 11.OOC FILE FOR BLAT (DONE, 2005-06-17, hartera)
    # Use -repMatch=512 (based on size -- for human we use 1024, and the
    # zebrafish genome is ~50% of the size of the human genome)
    ssh kkr1u00
    mkdir /cluster/data/danRer3/bed/ooc
    cd /cluster/data/danRer3/bed/ooc
    mkdir -p /cluster/bluearc/danRer3
    ls -1 /cluster/data/danRer3/nib/chr*.nib > nib.lst
    blat nib.lst /dev/null /dev/null -tileSize=11 \
        -makeOoc=/cluster/bluearc/danRer3/danRer3_11.ooc -repMatch=512
    # Wrote 50575 overused 11-mers to /cluster/bluearc/danRer3/11.ooc
    # For 10.ooc, repMatch = 4096 for human, so use 2048
    blat nib.lst /dev/null /dev/null -tileSize=10 \
        -makeOoc=/cluster/bluearc/danRer3/danRer3_10.ooc -repMatch=2048
    # Wrote 12574 overused 10-mers to /cluster/bluearc/danRer3/10.ooc
    # keep copies of ooc files in this directory and copy to iscratch
    cp /cluster/bluearc/danRer3/*.ooc .
    cp -p /cluster/bluearc/danRer3/*.ooc /iscratch/i/danRer3/
    /cluster/bin/iSync

# MAKE HGCENTRALTEST BLATSERVERS ENTRY FOR danRer3 (DONE, 2005-07-20, kuhn)
    # hgcentraltest is now on hgwdev
    ssh hgwdev
    # DNA port is "0", trans prot port is "1"
    echo 'insert into blatServers values("danRer3", "blat2", "17778", "1", "0"); insert into blatServers values("danRer3", "blat2", "17779", "0", "1");' \
        | hgsql hgcentraltest
    # this enables blat and isPcr; isPcr is enabled by loading the blat
    # server with tileSize=5 (ask for this when requesting blat servers
    # from cluster admin).
    # if you need to delete those entries
    echo 'delete from blatServers where db="danRer3";' \
        | hgsql hgcentraltest
    # to check the entries:
    echo 'select * from blatServers where db="danRer3";' \
        | hgsql hgcentraltest
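    # Usage sketch (not in the original log): the ooc files made above
    # are passed to blat so overused tiles are skipped; query.fa here is
    # a hypothetical placeholder.
    blat /gbdb/danRer3/danRer3.2bit query.fa output.psl \
        -ooc=/cluster/bluearc/danRer3/danRer3_11.ooc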
# AFFYMETRIX ZEBRAFISH GENOME ARRAY CHIP (DONE, 2005-07-22, hartera)
# REMAKE THIS TRACK USING chrUn AND chrNA SCAFFOLDS
# (DONE, 2005-08-19, hartera)
# UPDATED (2006-09-27) - see separate section, UPDATE AFFY ZEBRAFISH TRACK.
    # array chip sequences already downloaded for danRer1
    ssh hgwdev
    cd /projects/compbio/data/microarray/affyZebrafish
    mkdir /cluster/bluearc/affy
    cp /projects/compbio/data/microarray/affyZebrafish/Zebrafish_consensus.fa \
        /cluster/bluearc/affy/
    # Set up cluster job to align Zebrafish consensus sequences to danRer3
    ssh kkr1u00
    mkdir -p /cluster/data/danRer3/bed/affyZebrafish.2005-08-19
    ln -s /cluster/data/danRer3/bed/affyZebrafish.2005-08-19 \
        /cluster/data/danRer3/bed/affyZebrafish
    cd /cluster/data/danRer3/bed/affyZebrafish
    mkdir -p /iscratch/i/affy
    cp /cluster/bluearc/affy/Zebrafish_consensus.fa /iscratch/i/affy
    /cluster/bin/iSync
    # the kilokluster is down, so run on the pitakluster
    ssh pk
    cd /cluster/data/danRer3/bed/affyZebrafish
    ls -1 /cluster/bluearc/affy/Zebrafish_consensus.fa > affy.lst
    ls -1 /cluster/bluearc/danRer3/trfFa/chr[0-9M]*.fa > genome.lst
    # for output:
    mkdir -p /san/sanvol1/danRer3/affy/pslChrom
    echo '#LOOP\n/cluster/bin/i386/blat -fine -mask=lower -minIdentity=95 -ooc=/cluster/bluearc/danRer3/danRer3_11.ooc $(path1) $(path2) {check out line+ /san/sanvol1/danRer3/affy/pslChrom/$(root1)_$(root2).psl}\n#ENDLOOP' > template.sub
    gensub2 genome.lst affy.lst template.sub para.spec
    para create para.spec
    para try, check, push ... etc.
    # para time
    # Completed: 208 of 208 jobs
    # CPU time in finished jobs:       1355s      22.59m     0.38h    0.02d  0.000 y
    # IO & Wait Time:                  9988s     166.46m     2.77h    0.12d  0.000 y
    # Average job time:                  55s       0.91m     0.02h    0.00d
    # Longest running job:                0s       0.00m     0.00h    0.00d
    # Longest finished job:              74s       1.23m     0.02h    0.00d
    # Submission to last job:           217s       3.62m     0.06h    0.00d

    # then run the 2bit file of scaffolds
    ssh pk
    cd /cluster/data/danRer3/bed/affyZebrafish
    mkdir scaffoldsNAandUnRun
    cd scaffoldsNAandUnRun
    ls -1 /cluster/bluearc/affy/Zebrafish_consensus.fa > affy.lst
    foreach f (/cluster/bluearc/scratch/danRer3/scaffoldsSoftMask/*.fa)
        ls -1 $f >> scafs.lst
    end
    mkdir -p /san/sanvol1/danRer3/affy/pslScaffoldsNAandUn
    echo '#LOOP\n/cluster/bin/i386/blat -fine -mask=lower -minIdentity=95 -ooc=/cluster/bluearc/danRer3/danRer3_11.ooc $(path1) $(path2) {check out line+ /san/sanvol1/danRer3/affy/pslScaffoldsNAandUn/$(root1)_$(root2).psl}\n#ENDLOOP' > template2.sub
    gensub2 scafs.lst affy.lst template2.sub para.spec
    para create para.spec
    para try, check, push ... etc.
    # para time
    # Completed: 14941 of 14941 jobs
    # CPU time in finished jobs:      27574s     459.57m     7.66h    0.32d  0.001 y
    # IO & Wait Time:                 47642s     794.03m    13.23h    0.55d  0.002 y
    # Average job time:                   5s       0.08m     0.00h    0.00d
    # Longest running job:                0s       0.00m     0.00h    0.00d
    # Longest finished job:              35s       0.58m     0.01h    0.00d
    # Submission to last job:           339s       5.65m     0.09h    0.00d
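    # Optional check sketch (not in the original log): one psl per job
    # should have been written, so the file counts should match the job
    # counts above.
    ls -1 /san/sanvol1/danRer3/affy/pslChrom | wc -l
    # expect: 208
    ls -1 /san/sanvol1/danRer3/affy/pslScaffoldsNAandUn | wc -l
    # expect: 14941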
    # need to do pslSort and lift up for each separate run
    cd /cluster/data/danRer3/bed/affyZebrafish
    cd /san/sanvol1/danRer3/affy/pslScaffoldsNAandUn
    # Do sort, best in genome filter, and convert to chromosome
    # coordinates to create affyZebrafish.psl
    # only use alignments that have at least 95% identity in the aligned
    # region. do not use minCover since a lot of sequence is in Un, NA
    # and Finished, so genes may be split up and it is good to see all
    # alignments
    # first do the chr1-25 and chrM alignments
    pslSort dirs raw.psl tmp pslChrom
    pslReps -minAli=0.95 -nearTop=0.005 raw.psl contig.psl /dev/null
    # Processed 27408 alignments
    pslSort dirs rawNAandUn.psl tmp pslScaffoldsNAandUn
    pslReps -minAli=0.95 -nearTop=0.005 rawNAandUn.psl scafNAandUn.psl \
        /dev/null
    # Processed 9888 alignments
    # lift up chrom contigs to chrom level
    liftUp affyZfishChroms.psl \
        /cluster/data/danRer3/jkStuff/liftAll.lft warn contig.psl
    liftUp affyZfishScafsNAandUn.psl \
        /cluster/data/danRer3/liftSupertoChrom/liftNAandUnScaffoldsToChrom.lft \
        warn scafNAandUn.psl
    # sort and merge these files
    mkdir psl
    cp affyZfish* ./psl/
    pslSort dirs affyZebrafish.psl tmp1 psl
    # rsync these psl files
    rsync -a --progress /san/sanvol1/danRer3/affy/*.psl \
        /cluster/data/danRer3/bed/affyZebrafish/
    ssh kkstore02
    cd /cluster/data/danRer3/bed/affyZebrafish
    # shorten names in psl file
    sed -e 's/Zebrafish://' affyZebrafish.psl > affyZebrafish.psl.tmp
    mv affyZebrafish.psl.tmp affyZebrafish.psl
    pslCheck affyZebrafish.psl
    # psl is good
    # load track into database
    ssh hgwdev
    cd /cluster/data/danRer3/bed/affyZebrafish
    hgLoadPsl danRer3 affyZebrafish.psl
    # Add consensus sequences for Zebrafish chip
    # Copy sequences to gbdb if they are not there already
    mkdir -p /gbdb/hgFixed/affyProbes
    ln -s \
        /projects/compbio/data/microarray/affyZebrafish/Zebrafish_consensus.fa \
        /gbdb/hgFixed/affyProbes
    hgLoadSeq -abbr=Zebrafish: danRer3 \
        /gbdb/hgFixed/affyProbes/Zebrafish_consensus.fa
    # Clean up
    rm batch.bak contig.psl raw.psl
    # moved affyZebrafish.html description and trackDb.ra track entry and
    # search for Affy Zebrafish track to
    # ~/kent/src/hg/makeDb/trackDb/zebrafish since it is common to all
    # danRer assemblies.

# LIFT FILES FROM SCAFFOLDS TO chrUn AND chrNA (DONE, 2005-07-27, hartera)
    ssh kkstore02
    mkdir -p /cluster/data/danRer3/liftSupertoChrom
    cd /cluster/data/danRer3/liftSupertoChrom
    # lift files were already created when scaffoldFaToAgp was run for
    # chrUn.fa and chrNA.fa. These need to be edited as the last 500 Ns
    # were removed from the agp file, making the sequence 184125739 bp
    # and not 184126239 bp for chrUn; for chrNA, it is 253521007 bp
    # instead of 253521507 bp. Also need to change chrUn to chrNA in the
    # chrNA lift file.
    cp /cluster/data/danRer3/Un/tmp/chrUn.lft .
    cp /cluster/data/danRer3/NA/tmp/chrNA.lft .
    # edit to remove last lines of each file first
    # then use perl to change co-ordinates
    perl -pi.bak -e 's/184126239/184125739/' chrUn.lft
    perl -pi.bak -e 's/253521507/253521007/' chrNA.lft
    perl -pi.bak -e 's/chrUn/chrNA/' chrNA.lft
    cat *.lft >> liftNAandUnScaffoldsToChrom.lft
    # clean up
    rm *.bak
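    # Usage sketch (not from the original log; scafAnnots.bed is a
    # hypothetical input): this lift file converts scaffold co-ordinates
    # to chrUn/chrNA chrom co-ordinates.
    liftUp chromAnnots.bed liftNAandUnScaffoldsToChrom.lft warn \
        scafAnnots.bed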
# ENSEMBL GENES (DONE, 2005-07-29, hartera)
    ssh hgwdev
    mkdir -p /cluster/data/danRer3/bed/ensembl
    cd /cluster/data/danRer3/bed/ensembl
    # Get the Ensembl gene data from
    # http://www.ensembl.org/Multi/martview
    # Follow this sequence through the pages: (NOTE: this interface has
    # changed a little since danRer2)
    # Page 1) Select the Ensembl dataset (v32 here) and the
    #         Danio_rerio choice (ZFISH5 here). Hit next. 22877 entries
    #         total. Ensembl 35 now (2005-11-23) and this is the same as
    #         the version 32 downloaded as above. Ensembl 36 (Dec 2005)
    #         is the same as for 32 for Zebrafish. Ensembl 38 (April
    #         2006) Protein Coding genes is the same as for Ensembl 32.
    #         (Select Gene type as protein_coding on page 2).
    # Page 2) Then hit next.
    # Page 3) Choose the "Structures" Attribute Page from the pulldown
    #         menu at the top. Make sure that under the GENE section, the
    #         Ensembl Attributes checked include the Ensembl Gene ID and
    #         Ensembl Transcript ID. Choose GTF as the output. Choose
    #         gzip compression. Hit export. Save as ensemblGene35.gtf.gz
    # the Ensembl gene predictions are mapped to chromosomes except for
    # chrNA and chrUn. Use lift files for scaffolds to these chroms.
    # get chrUn and chrNA Ensembl records
    ssh kkstore02
    cd /cluster/data/danRer3/bed/ensembl
    gunzip ensemblGene.gtf.gz
    awk '$1 ~ /^Zv5_NA[0-9]+/ || $1 ~ /^Zv5_scaffold[0-9]+/' \
        ensemblGene.gtf > ensemblGenechrUns.gtf
    # get records for all other chroms
    awk '$1 ~ /^[0-9]+/' ensemblGene.gtf > ensemblGenechroms.gtf
    wc -l *.gtf
    # 513421 ensemblGenechroms.gtf
    # 125319 ensemblGenechrUns.gtf
    # 638740 ensemblGene.gtf
    # total lines of files made equal to original file so ok
    liftUp -type=.gtf ensemblGenechrUns.lifted \
        /cluster/data/danRer3/liftSupertoChrom/liftNAandUnScaffoldsToChrom.lft \
        warn ensemblGenechrUns.gtf
    # Got 29880 lifts in
    # /cluster/data/danRer3/liftSupertoChrom/liftNAandUnScaffoldsToChrom.lft
    sed -e "s/^/chr/" ensemblGenechroms.gtf > ensGene.gtf
    cat ensemblGenechrUns.lifted >> ensGene.gtf
    # check file sizes - ok - and some of the lifted co-ordinates
    # there were some erroneous lines with "1;" or "2;" - 8 lines total
    # Notified Ensembl and they fixed it, so downloaded the file again
    # and reloaded into the database.
    # Also remove the suffix that denotes the transcript version number.
    # This is not in the ensGtp or ensPep tables.
    perl -pi.bak -e 's/\.[0-9]+//g' ensGene.gtf
    # load into database
    ssh hgwdev
    cd /cluster/data/danRer3/bed/ensembl
    hgsql -e 'drop table ensGene;' danRer3
    /cluster/bin/i386/ldHgGene danRer3 ensGene ensGene.gtf
    # Read 32143 transcripts in 638732 lines in 1 files
    # 32143 groups 27 seqs 1 sources 4 feature types
    # 32143 gene predictions
    # ensGtp associates geneId/transcriptId/proteinId for hgPepPred and
    # hgKnownToSuper. Use ensMart to create it as above, except:
    # Page 3) Choose the "Features" box. In "Ensembl Attributes", check
    #         Ensembl Gene ID, Ensembl Transcript ID, Ensembl Peptide ID.
    #         Choose Text, tab-separated as the output format and gzip
    #         compression. Result name: ensGtp.
    gunzip ensGtp.tsv.gz
    # edit to remove the first header line from ensGtp.tsv
    hgsql danRer3 < ~/kent/src/hg/lib/ensGtp.sql
    echo "load data local infile 'ensGtp.tsv' into table ensGtp" \
        | hgsql -N danRer3
    # Get the ensembl peptide sequences from
    # http://www.ensembl.org/Multi/martview
    # Choose Danio Rerio as the organism
    # Follow this sequence through the pages:
    # Page 1) Choose the Ensembl Genes choice. Hit next.
    # Page 2) Then hit next.
    # Page 3) Choose "Sequences" from the Attributes pulldown menu at
    #         the top.
    # Page 4) Choose Peptide as type of sequence to export and select
    #         Ensembl Gene ID from Gene Attributes and
    #         Ensembl Transcript ID and Ensembl Peptide Stable ID from
    #         Transcript Attributes as the output,
    #         choose text/fasta and gzip compression,
    #         name the file ensemblPep.fa.gz and then hit export.
    gunzip ensemblPep.fa.gz
    hgPepPred danRer3 ensembl ensemblPep.fa
    # added code to hgc.c so that the link to the Ensembl Protein
    # is also displayed on the description page.
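    # Consistency sketch (not in the original log): every loaded ensGene
    # transcript should have a corresponding row in ensGtp.
    hgsql -N -e 'select count(*) from ensGene g left join ensGtp p on g.name = p.transcript where p.transcript is null;' danRer3
    # expect: 0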
# FOR MGC GENES:
    # - wait one day for the nightly build to align and load them into
    #   the db
    # - rebuild trackDb

# SPLIT UP ZEBRAFISH MASKED SEQUENCE FROM chrUn AND chrNA INTO SCAFFOLDS
# (DONE, 2005-08-04, hartera)
# ADD SOFT-MASKED SCAFFOLDS TO ISERVERS FOR CLUSTER RUNS
# (DONE, 2005-08-15, hartera) AND TO BLUEARC (DONE, 2005-08-19)
    ssh kkstore02
    cd /cluster/data/danRer3
    # for chrUn and chrNA, get masked sequence for soft and hard-masked
    foreach c (Un NA)
        cd $c
        mkdir scaffoldsSoftMask scaffoldsHardMask
        awk 'BEGIN {FS="\t"}{if ($5 != "N") \
            print "faFrag -mixed chr'${c}'.fa",$2-1, $3, $6".fa";}' \
            chr${c}.agp >> ./scaffoldsSoftMask/faFragSoftMask.csh
        awk 'BEGIN {FS="\t"}{if ($5 != "N") \
            print "faFrag -mixed chr'${c}'.fa.masked",$2-1, $3, \
                $6".fa.masked";}' \
            chr${c}.agp >> ./scaffoldsHardMask/faFragHardMask.csh
        cd ..
    end
    # change permissions and run scripts to get sequences
    foreach d (Un NA)
        chmod +x $d/scaffoldsSoftMask/faFragSoftMask.csh
        chmod +x $d/scaffoldsHardMask/faFragHardMask.csh
    end
cat << '_EOF_' > jkStuff/getMaskedScaffolds.csh
#!/bin/csh
foreach c (Un NA)
    set dir=/cluster/data/danRer3
    echo "Processing $c"
    cd $dir/$c/scaffoldsSoftMask
    cp ../chr${c}.fa .
    echo "Getting soft-masked sequences ..."
    nice ./faFragSoftMask.csh >& faFrag.log
    echo "Getting hard-masked sequences ..."
    cd $dir/$c/scaffoldsHardMask
    cp ../chr${c}.fa.masked .
    nice ./faFragHardMask.csh >& faFrag.log
end
'_EOF_'
    chmod +x jkStuff/getMaskedScaffolds.csh
    nice ./jkStuff/getMaskedScaffolds.csh &
    # check a few sequences that they are correct
    # add name of scaffold to sequence fasta and cat together
    foreach c (Un NA)
        set dir = /cluster/data/danRer3
        foreach d (scaffoldsSoftMask scaffoldsHardMask)
            cd $dir/$c/$d
            foreach f (Zv5*)
                if ($d == "scaffoldsHardMask") then
                    set b=$f:r
                    set g=$b:r
                    set sc=scaffoldMasked${c}.fa
                else
                    set g=$f:r
                    set sc=scaffold${c}.fa
                endif
                perl -pi.bak -e "s/>chr[0-9A-Za-z\-\:]+/>$g/" $f
                cat $f >> $sc
                rm *.bak
            end
            cp scaffold* $dir/$c/
        end
    end
    # check sizes of the final FASTA files with all sequences; check a
    # few sequence files to see that they are correct - ok

    # Add soft-masked scaffolds to the iservers for cluster runs
    # (2005-08-15, hartera)
    ssh kkr1u00
    mkdir -p /iscratch/i/danRer3/scaffoldsSoftMask
    cd /cluster/data/danRer3
    foreach c (NA Un)
        foreach f (/cluster/data/danRer3/$c/scaffoldsSoftMask/Zv5_*.fa)
            cp -p $f /iscratch/i/danRer3/scaffoldsSoftMask
        end
    end
    /cluster/bin/iSync
    # Add soft-masked scaffolds to the bluearc for cluster runs
    # (2005-08-19, hartera)
    ssh kkr1u00
    cd /cluster/data/danRer3/
    mkdir -p /cluster/bluearc/scratch/danRer3/scaffoldsSoftMask
    foreach c (NA Un)
        foreach f (/cluster/data/danRer3/$c/scaffoldsSoftMask/Zv5_*.fa)
            rsync -a --progress $f \
                /cluster/bluearc/scratch/danRer3/scaffoldsSoftMask/
        end
    end

# MAKE DOWNLOADABLE SEQUENCE FILES (DONE, 2005-08-05, hartera)
    ssh kkstore02
    cd /cluster/data/danRer3
    #- Build the .zip files
cat << '_EOF_' > jkStuff/gzipAll.csh
rm -rf gzip
mkdir gzip
# chrom AGP's
tar cvzf gzip/chromAgp.tar.gz [0-9A-Z]*/chr*.agp
# chrom RepeatMasker out files
tar cvzf gzip/chromOut.tar.gz */chr*.fa.out
# soft masked chrom fasta
tar cvzf gzip/chromFa.tar.gz */chr*.fa
# soft masked chrNA and chrUn scaffolds
tar cvzf gzip/scaffoldUnsFa.tar.gz NA/scaffoldNA.fa \
    Un/scaffoldUn.fa
# hard masked chrom fasta
tar cvzf gzip/chromFaMasked.tar.gz */chr*.fa.masked
# hard masked chrNA and chrUn scaffolds
tar cvzf gzip/scaffoldUnsFaMasked.tar.gz \
    NA/scaffoldMaskedNA.fa \
    Un/scaffoldMaskedUn.fa
# chrom TRF output files
cd bed/simpleRepeat
tar cvzf ../../gzip/chromTrf.tar.gz trfMaskChrom/chr*.bed
cd ../..
# get GenBank native mRNAs
cd /cluster/data/genbank
./bin/i386/gbGetSeqs -db=danRer3 -native GenBank mrna \
    /cluster/data/danRer3/gzip/mrna.fa
# get GenBank xeno mRNAs
./bin/i386/gbGetSeqs -db=danRer3 -xeno GenBank mrna \
    /cluster/data/danRer3/gzip/xenoMrna.fa
# get native RefSeq mRNAs
./bin/i386/gbGetSeqs -db=danRer3 -native refseq mrna \
    /cluster/data/danRer3/gzip/refMrna.fa
# get native GenBank ESTs
./bin/i386/gbGetSeqs -db=danRer3 -native GenBank est \
    /cluster/data/danRer3/gzip/est.fa
cd /cluster/data/danRer3/gzip
# gzip GenBank native and xeno mRNAs, native ESTs and RefSeq mRNAs
gzip mrna.fa
gzip xenoMrna.fa
gzip refMrna.fa
gzip est.fa
'_EOF_'
    # << this line makes emacs coloring happy
    chmod +x ./jkStuff/gzipAll.csh
    csh ./jkStuff/gzipAll.csh |& tee ./jkStuff/gzipAll.log
    #- Look at gzipAll.log to make sure all file lists look reasonable.
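    # Integrity-check sketch (not in the original log): make sure each
    # tarball lists cleanly before it is published.
    foreach t (gzip/*.tar.gz)
        tar tzf $t > /dev/null && echo "$t ok"
    end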
    # Make upstream files and copy the .zip files to
    # hgwdev:/usr/local/apache/...
    ssh hgwdev
    cd /cluster/data/danRer3/gzip
    # make upstream files for zebrafish RefSeq
    featureBits danRer3 refGene:upstream:1000 -fa=upstream1000.fa
    gzip upstream1000.fa
    featureBits danRer3 refGene:upstream:2000 -fa=upstream2000.fa
    gzip upstream2000.fa
    set gp = /usr/local/apache/htdocs/goldenPath/danRer3
    mkdir -p $gp/bigZips
    cp -p *.gz $gp/bigZips
    mkdir -p $gp/chromosomes
    foreach f (../*/chr*.fa)
        cp $f $gp/chromosomes
    end
    foreach c (NA Un)
        cd /cluster/data/danRer3/$c
        cp scaffold${c}.fa.gz $gp/chromosomes
    end
    cd $gp/bigZips
    md5sum *.gz > md5sum.txt
    cd $gp/chromosomes
    # gzip the chromosome and scaffold FASTAs individually
    foreach f (*.fa)
        gzip $f
    end
    md5sum *.gz > md5sum.txt
    # Take a look at bigZips/* and chromosomes/*
    # copy README.txt's from danRer2 and update

# MAKE NIB FILES AND 2BIT FILE FOR SOFT MASKED chrUn AND chrNA SCAFFOLDS
# (DONE, 2005-08-06, hartera)
# ADD chrUn AND chrNA SCAFFOLDS 2BIT FILE TO BLUEARC
# (DONE, 2005-08-19, hartera)
    ssh kkstore02
    cd /cluster/data/danRer3
    mkdir scaffoldsNAandUnNib
    # Build nib files, using the soft masking in the fa
    foreach c (NA Un)
        echo "Processing $c"
        foreach f ($c/scaffoldsSoftMask/Zv5*.fa)
            faToNib -softMask $f scaffoldsNAandUnNib/$f:t:r.nib
        end
    end
    # check correct number of nib files in directory: 14941
    # there are 14676 chrNA scaffolds and 265 chrUn scaffolds
    # copy chromosome 1-25 and chrNA and chrUn scaffold nibs to a
    # directory on iscratch and iSync for use in cluster runs
    ssh kkr1u00
    mkdir -p /iscratch/i/danRer3/chromandScafNib
    cp -p /cluster/data/danRer3/nib/chr[0-9]*.nib \
        /iscratch/i/danRer3/chromandScafNib
    foreach f (/cluster/data/danRer3/scaffoldsNAandUnNib/Zv5*.nib)
        cp -p $f /iscratch/i/danRer3/chromandScafNib
    end
    ssh kkstore02
    # make a 2 bit file of all the scaffolds for chrNA and chrUn
    # for blastz cluster runs
    cd /cluster/data/danRer3/
    cat NA/scaffoldNA.fa Un/scaffoldUn.fa > danRer3NAandUnScaffolds.fa
    grep '>' danRer3NAandUnScaffolds.fa | wc -l
    # 14941
    faToTwoBit danRer3NAandUnScaffolds.fa danRer3NAandUnScaf.2bit
    ssh kkr1u00
    mkdir -p /iscratch/i/danRer3/NAandUnScafs
    cp /cluster/data/danRer3/danRer3NAandUnScaf.2bit \
        /iscratch/i/danRer3/NAandUnScafs
    /cluster/bin/iSync
    # get sizes of scaffolds for the .len file used by blastz
    ssh kolossus
    mkdir -p /panasas/store/danRer3/NAandUnScafSizes
    cd /cluster/data/danRer3
cat << '_EOF_' > jkStuff/getNAandUnScafSizes.csh
#!/bin/csh -fe
foreach c (NA Un)
    set sizeDir=/panasas/store/danRer3/NAandUnScafSizes
    cd /cluster/data/danRer3/$c/scaffoldsSoftMask
    foreach f (Zv5*.fa)
        set g=$f:r
        faSize detailed=on $f >> $sizeDir/NAandUnScafs.sizes
    end
end
'_EOF_'
    chmod +x jkStuff/getNAandUnScafSizes.csh
    nice jkStuff/getNAandUnScafSizes.csh >& size.log &
    # took about 1 minute
    wc -l /panasas/store/danRer3/NAandUnScafSizes/NAandUnScafs.sizes
    # 14941 /panasas/store/danRer3/NAandUnScafSizes/NAandUnScafs.sizes
    # so correct number of scaffolds
    cp /panasas/store/danRer3/NAandUnScafSizes/NAandUnScafs.sizes \
        /cluster/data/danRer3
    # add 2 bit to bluearc for cluster runs (2005-08-19, hartera)
    ssh kkr1u00
    mkdir -p /cluster/bluearc/scratch/danRer3
    cp /cluster/data/danRer3/danRer3NAandUnScaf.2bit \
        /cluster/bluearc/scratch/danRer3/
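    # Check sketch (not in the original log): the 2bit should contain
    # one sequence per scaffold, matching the 14941 counted above.
    twoBitInfo /cluster/data/danRer3/danRer3NAandUnScaf.2bit stdout | wc -l
    # expect: 14941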
# BLASTZ SWAP FOR MOUSE (mm6) (DONE, 2005-08-10, hartera)
# CREATE CHAIN AND NET TRACKS, AXTNET, MAFNET AND ALIGNMENT DOWNLOADS
# REMAKE AXTNET AND COPY TO DOWNLOADS. REMAKE MAFNET
# (DONE, 2005-08-17, hartera)
# DROPPED THE CHAIN AND NET TABLES FROM HGWDEV AS THERE WERE 3 SETS OF
# MOUSE ALIGNMENTS: mm6, mm7 and mm8 (DONE, 2006-03-28, hartera)
    ssh kkr1u00
    # blastz requires lineage-specific repeats
    # Treat all repeats as lineage-specific
    # if not done already, get lineage-specific repeats
    mkdir -p /iscratch/i/mm6/linSpecRep.notInZebrafish
    foreach f (/panasas/store/mm6/rmsk/chr*.fa.out)
        cp -p $f \
            /iscratch/i/mm6/linSpecRep.notInZebrafish/$f:t:r:r.out.spec
    end
    mkdir -p /iscratch/i/danRer3/linSpecRep.notInMouse
    foreach f (/iscratch/i/danRer3/rmsk/chr*.fa.out)
        cp -p $f \
            /iscratch/i/danRer3/linSpecRep.notInMouse/$f:t:r:r.out.spec
    end
    /cluster/bin/iSync
    # NOTE: the "mouse/human/etc." lineage-specific repeat files are now
    # in /san/sanvol1/scratch/danRer3/linSpecRep.notInOthers
    # however, the files for chrNA and chrUn were missing, so I'm
    # adding them here. (2005-12-19 kate)
    ssh kkstore02
    cd /cluster/data/danRer3
    cp -p Un/chrUn.fa.out \
        /san/sanvol1/scratch/danRer3/linSpecRep.notInOthers/chrUn.out.spec
    cp -p NA/chrNA.fa.out \
        /san/sanvol1/scratch/danRer3/linSpecRep.notInOthers/chrNA.out.spec
    # do swap of mm6 vs. danRer3 chain and net alignments to
    # create danRer3 vs. mm6. see makeMm6.doc for details.
    ssh kk
    cd /cluster/data/mm6/bed/blastz.danRer3
    mkdir -p /panasas/store/danRer3vsmm6Out
    nohup nice /cluster/bin/scripts/doBlastzChainNet.pl `pwd`/DEF \
        -stop download -blastzOutRoot /panasas/store/danRer3vsmm6Out \
        -swap -chainMinScore=5000 >& doSwap.log &
    # Start: Aug 10 16:30
    # Finish: Aug 10 16:54
    # Blastz parameters are as for mm6 vs. danRer3 - see makeMm6.doc
    # BLASTZ_H=2000
    # BLASTZ_Y=3400
    # BLASTZ_L=6000
    # BLASTZ_K=2200
    # BLASTZ_Q=/cluster/data/blastz/HoxD55.q
    # BLASTZ_ABRIDGE_REPEATS=1
    # do cleanup step and specify a different file server as the panasas
    # can not be accessed from kkstore02.
    nice /cluster/bin/scripts/doBlastzChainNet.pl `pwd`/DEF \
        -continue cleanup -fileServer eieio \
        -blastzOutRoot /panasas/store/danRer3vsmm6Out \
        -swap -chainMinScore=5000 >& doSwapCleanUp.log &
    # make html files and trackDb.ra entry for chain and net tracks.
    # check README.txt for downloads.
    # featureBits -chrom=chr1 danRer3 refGene:cds chainMm6Link -enrichment
    # refGene:cds 0.688%, chainMm6Link 8.193%, both 0.543%, cover 78.94%,
    # enrich 9.64x
    # featureBits -chrom=chr1 danRer2 refGene:cds chainMm5Link -enrichment
    # refGene:cds 0.642%, chainMm5Link 4.499%, both 0.492%, cover 76.60%,
    # enrich 17.02x
    # featureBits -chrom=chr2 danRer3 refGene:cds chainMm6Link -enrichment
    # refGene:cds 0.705%, chainMm6Link 8.219%, both 0.557%, cover 79.04%,
    # enrich 9.62x
    # featureBits -chrom=chr2 danRer2 refGene:cds chainMm5Link -enrichment
    # refGene:cds 0.739%, chainMm5Link 4.539%, both 0.579%, cover 78.37%,
    # enrich 17.26x
    # looks good. although enrichment is lower than for danRer2 and mm5,
    # there are more chains in the score <10000 range for danRer3 than
    # for danRer2, but this does not make up for all the extra chains in
    # danRer3 over danRer2. Maybe there are more high scoring alignments
    # to the chrUn and chrNA chains due to the scaffolds being used for
    # the alignments. danRer3 has extra sequence compared to danRer2,
    # although danRer3 chr2 is 48.2 Mb and danRer2 chr2 is 52 Mb, so in
    # this case the chrom is smaller.
# netToAxt was processing nets incorrectly so remake these with the
# new version of netToAxt and transfer to the downloads dir.
ssh kkstore02
cd /cluster/data/danRer3/bed/blastz.mm6.swap
rm -r axtNet
# Make axtNet for download: one .axt per danRer3 seq.
# remake noClass.net
# Make nets ("noClass", i.e. without rmsk/class stats which are added
# later):
cd axtChain
chainPreNet danRer3.mm6.all.chain.gz \
    /cluster/data/mm6/bed/blastz.danRer3/S2.len \
    /cluster/data/mm6/bed/blastz.danRer3/S1.len stdout \
    | chainNet stdin -minSpace=1 \
        /cluster/data/mm6/bed/blastz.danRer3/S2.len \
        /cluster/data/mm6/bed/blastz.danRer3/S1.len stdout /dev/null \
    | netSyntenic stdin noClass.net
# create net for each chrom again
netSplit noClass.net net
# also split up chains again
mkdir chain
zcat danRer3.mm6.all.chain.gz | chainSplit chain stdin
ssh hgwdev
cd /cluster/data/danRer3/bed/blastz.mm6.swap
mkdir axtNet
foreach f (axtChain/net/*.net)
    netToAxt $f axtChain/chain/$f:t:r.chain \
        /cluster/bluearc/danRer3/nib /panasas/store/mm6/nib stdout \
        | axtSort stdin stdout \
        | gzip -c > axtNet/$f:t:r.danRer3.mm6.net.axt.gz
end
# cleanup
ssh kkstore02
cd /cluster/data/danRer3/bed/blastz.mm6.swap/axtChain
rm noClass.net
rm -r net
rm -r chain
# remake mafNet from the new axtNet
cd /cluster/data/danRer3/bed/blastz.mm6.swap
rm -r mafNet
# Make mafNet for multiz: one .maf per danRer3 seq.
mkdir mafNet
foreach f (axtNet/*.danRer3.mm6.net.axt.gz)
    axtToMaf -tPrefix=danRer3. -qPrefix=mm6. $f \
        /cluster/data/mm6/bed/blastz.danRer3/S2.len \
        /cluster/data/mm6/bed/blastz.danRer3/S1.len \
        stdout \
        | gzip -c > mafNet/$f:t:r:r:r:r:r.maf.gz
end
# copy the new axtNet files to downloads and replace old ones
ssh hgwdev
rm -r /usr/local/apache/htdocs/goldenPath/danRer3/vsMm6/axtNet
cd /usr/local/apache/htdocs/goldenPath/danRer3/vsMm6
mkdir -p /usr/local/apache/htdocs/goldenPath/danRer3/vsMm6/axtNet
ln -s /cluster/data/danRer3/bed/blastz.mm6.swap/axtNet/*.axt.gz axtNet/
# remake md5sum.txt
rm md5sum.txt
md5sum *.gz */*.gz > md5sum.txt
# Dropped mouse mm6 chain and net tables from hgwdev as there were 3 sets
# of mouse alignments for danRer3: mm6, mm7 and mm8 (hartera, 2006-03-29)
hgsql -e 'drop table netMm6;' danRer3
foreach c (`cat /cluster/data/danRer3/chrom.lst`)
    hgsql -e "drop table chr${c}_chainMm6;" danRer3
    hgsql -e "drop table chr${c}_chainMm6Link;" danRer3
end

# BLASTZ FOR FUGU (fr1) (DONE, 2005-08-18, hartera)
# CREATE CHAIN AND NET TRACKS, AXTNET, MAFNET AND ALIGNMENT DOWNLOADS
# RECREATE DOWNLOADS AS THE FUGU DOWNLOADS DIRECTORY HAS BEEN DELETED
# (DONE, 2005-11-17, hartera)
ssh kk
mkdir /cluster/data/danRer3/bed/blastz.fr1.2005-08-13
cd /cluster/data/danRer3/bed
ln -s blastz.fr1.2005-08-13 blastz.fr1
# use parameters for fr1 in makeDanRer2.doc. Using scaffolds makes this
# run slower, so it is best to have the scaffolds in the query. Use the
# HoxD55.q matrix as Fugu is quite distant from zebrafish. Blastz uses
# lineage-specific repeats, but there are none for these two species.
cat << '_EOF_' > DEF
# zebrafish (danRer3) vs. Fugu (fr1)
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/i386:/cluster/home/angie/schwartzbin

ALIGN=blastz-run
BLASTZ=blastz
BLASTZ_H=2000
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
BLASTZ_ABRIDGE_REPEATS=0

# TARGET - zebrafish (danRer3)
# soft-masked chroms, 1-25 and M
SEQ1_DIR=/iscratch/i/danRer3/chromNib
SEQ1_RMSK=
# lineage-specific repeats
# we don't have that information for these species
SEQ1_SMSK=
SEQ1_FLAG=
SEQ1_IN_CONTIGS=0
# 10 MB chunk for target
SEQ1_CHUNK=10000000
SEQ1_LAP=10000

# QUERY - Fugu (fr1)
# soft-masked scaffolds in 2bit format
SEQ2_DIR=/iscratch/i/fr1/UnScaffolds/fr1UnScaffolds.2bit
SEQ2_RMSK=
SEQ2_SMSK=
SEQ2_FLAG=
SEQ2_IN_CONTIGS=0
# 10 Mbase for query
SEQ2_CHUNK=10000000
SEQ2_LAP=0

BASE=/cluster/data/danRer3/bed/blastz.fr1
DEF=$BASE/DEF
RAW=$BASE/raw
CDBDIR=$BASE
SEQ1_LEN=$BASE/S1.len
SEQ2_LEN=$BASE/S2.len
#DEBUG=1
'_EOF_'
# << this line keeps emacs coloring happy
chmod +x DEF
cp /cluster/data/danRer3/chrom.sizes ./S1.len
# make S2.len for fr1 scaffolds
twoBitInfo /cluster/data/fr1/fr1UnScaffolds.2bit ./S2.len
wc -l *.len
#    28 S1.len
# 20379 S2.len
# make output directory
mkdir -p /cluster/bluearc/danRer3vsfr1Out
# do blastz and create chains for fr1 scaffolds on danRer3 chr1-25 and
# chrM. The chickenHumanTuned.gap scoring matrix is now used by default
# by axtChain.
nohup nice /cluster/bin/scripts/doBlastzChainNet.pl `pwd`/DEF \
    -blastzOutRoot /cluster/bluearc/danRer3vsfr1Out -chainMinScore=5000 \
    -stop chainMerge >& do.log &
# Start:  Aug 13 10:48
# Finish: Aug 13 13:35
# then run the danRer3 NA and Un scaffolds against the Fugu scaffolds
mkdir NAandUnScaffolds
cd NAandUnScaffolds
cat << '_EOF_' > DEF
# zebrafish (danRer3) vs. Fugu (fr1)
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/i386:/cluster/home/angie/schwartzbin

ALIGN=blastz-run
BLASTZ=blastz
BLASTZ_H=2000
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
BLASTZ_ABRIDGE_REPEATS=0

# TARGET - zebrafish (danRer3)
# soft-masked scaffolds for chrNA and chrUn in 2bit format
SEQ1_DIR=/iscratch/i/danRer3/NAandUnScafs/danRer3NAandUnScaf.2bit
SEQ1_RMSK=
# lineage-specific repeats
# we don't have that information for these species
SEQ1_SMSK=
SEQ1_FLAG=
SEQ1_IN_CONTIGS=0
# 10 MB chunk for target
SEQ1_CHUNK=10000000
SEQ1_LAP=10000

# QUERY - Fugu (fr1)
# soft-masked scaffolds in 2bit format
SEQ2_DIR=/iscratch/i/fr1/UnScaffolds/fr1UnScaffolds.2bit
SEQ2_RMSK=
SEQ2_SMSK=
SEQ2_FLAG=
SEQ2_IN_CONTIGS=0
# 10 Mbase for query
SEQ2_CHUNK=10000000
SEQ2_LAP=0

BASE=/cluster/data/danRer3/bed/blastz.fr1/NAandUnScaffolds
DEF=$BASE/DEF
RAW=$BASE/raw
CDBDIR=$BASE
SEQ1_LEN=$BASE/S1.len
SEQ2_LEN=$BASE/S2.len
#DEBUG=1
'_EOF_'
# << this line keeps emacs coloring happy
chmod +x DEF
twoBitInfo /cluster/data/danRer3/danRer3NAandUnScaf.2bit ./S1.len
# make S2.len for fr1 scaffolds
twoBitInfo /cluster/data/fr1/fr1UnScaffolds.2bit ./S2.len
wc -l *.len
# 14941 S1.len
# 20379 S2.len
# make output directory
mkdir -p /cluster/bluearc/danRer3vsfr1Out/NAandUnScaffolds
# do blastz and create chains for fr1 scaffolds on danRer3
# chrNA and chrUn scaffolds
nohup nice /cluster/bin/scripts/doBlastzChainNet.pl `pwd`/DEF \
    -blastzOutRoot /cluster/bluearc/danRer3vsfr1Out/NAandUnScaffolds \
    -chainMinScore=5000 -stop chainMerge >& do.log &
# Start:  Aug 13 14:05
# Finish: Aug 14 20:58
# The log file says it is finished. chainSplit was not run since SEQ1
# does not have < 100 sequences. Need to do liftUp before running
# chainSplit.
cd /cluster/data/danRer3/bed/blastz.fr1/NAandUnScaffolds/axtChain/run
# Lifting up chains:
# need to lift these chains up to chrom level: for Fugu in the chrom run,
# and for both danRer3 and Fugu in the NA and Un scaffolds run.
# first for Fugu in the danRer3 chrom run
ssh kkstore02
cd /cluster/data/danRer3/bed/blastz.fr1/axtChain
mkdir liftedChain
foreach f (chain/*.chain)
    set c=$f:t:r
    echo $c
    liftUp -chainQ liftedChain/${c}.lifted.chain \
        /cluster/data/fr1/Un/lift/ordered.lft warn $f
end
# lift up for the danRer3 scaffolds run.
ssh kkstore02
cd /cluster/data/danRer3/bed/blastz.fr1/NAandUnScaffolds/axtChain
# first lift the Fugu fr1 query; there are no split chains here as the
# target did not have < 100 sequences.
zcat danRer3.fr1.all.chain.gz \
    | liftUp -chainQ danRer3.fr1.liftedQall.chain \
        /cluster/data/fr1/Un/lift/ordered.lft warn stdin
# then liftUp target coords for danRer3
liftUp danRer3.fr1.liftedQandTall.chain \
    /cluster/data/danRer3/liftSupertoChrom/liftNAandUnScaffoldsToChrom.lft \
    warn danRer3.fr1.liftedQall.chain
# gzip lifted danRer3.fr1 chain file
gzip danRer3.fr1.liftedQandTall.chain
# merge the chains from the danRer3 chrom run and the danRer3
# NA and Un scaffolds run. Chains are sorted by score and IDs are
# made unique.
cd /cluster/data/danRer3/bed/blastz.fr1/axtChain
mv danRer3.fr1.all.chain.gz danRer3.fr1.chroms.chain.gz
set blastz=/cluster/data/danRer3/bed/blastz.fr1
# copy over lifted chains for danRer3 scaffolds vs fr1
cp $blastz/NAandUnScaffolds/axtChain/danRer3.fr1.liftedQandTall.chain.gz \
    ./liftedChain
gunzip ./liftedChain/*.gz
nice chainMergeSort liftedChain/*.chain \
    | nice gzip -c > danRer3.fr1.all.chain.gz
# then split up into chains again
mv chain chromChain
mkdir chain
nice zcat danRer3.fr1.all.chain.gz | chainSplit chain stdin
# then pick up the doBlastzChainNet.pl script at the net step.
ssh kkstore02
cd /cluster/data/danRer3/bed/blastz.fr1
cp DEF DEF.chroms
# edit the DEF file to include all the nib files for danRer3 and the
# nib file for chrUn of Fugu fr1. Since all the coords have now been
# lifted to chrom level, these are needed.
# SEQ1_DIR=/iscratch/i/danRer3/nib
# SEQ2_DIR=/cluster/bluearc/fugu/fr1/chromNib
# use kkr1u00 for computationally intensive steps as kolossus is down.
# need to create a new S2.len for the whole chrUn of Fugu
mv S2.len S2.scaffolds.len
cp /cluster/data/fr1/chrom.sizes S2.len
nohup nice /cluster/bin/scripts/doBlastzChainNet.pl `pwd`/DEF \
    -blastzOutRoot /cluster/bluearc/danRer3vsfr1Out -chainMinScore=5000 \
    -workhorse kkr1u00 -continue net >& doNet.log &
# crashed at the cleanup step when trying to access kkstore02:
# "The authenticity of host 'kkstore02 (128.114.50.155)' can't be
# established." Re-run from this step.
nohup nice /cluster/bin/scripts/doBlastzChainNet.pl `pwd`/DEF \
    -blastzOutRoot /cluster/bluearc/danRer3vsfr1Out -chainMinScore=5000 \
    -workhorse kkr1u00 -continue cleanup >& doNet2.log &
# netToAxt was processing nets incorrectly so remake these with the
# new version of netToAxt and transfer to the downloads dir.
ssh kkstore02
cd /cluster/data/danRer3/bed/blastz.fr1
rm -r axtNet
# Make axtNet for download: one .axt per danRer3 seq.
# remake noClass.net
# Make nets ("noClass", i.e. without rmsk/class stats which are added
# later):
cd axtChain
chainPreNet danRer3.fr1.all.chain.gz \
    /cluster/data/danRer3/bed/blastz.fr1/S1.len \
    /cluster/data/danRer3/bed/blastz.fr1/S2.len stdout \
    | chainNet stdin -minSpace=1 \
        /cluster/data/danRer3/bed/blastz.fr1/S1.len \
        /cluster/data/danRer3/bed/blastz.fr1/S2.len stdout /dev/null \
    | netSyntenic stdin noClass.net
# create net for each chrom again
netSplit noClass.net net
# also split up chains again
mkdir chain
zcat danRer3.fr1.all.chain.gz | chainSplit chain stdin
ssh hgwdev
cd /cluster/data/danRer3/bed/blastz.fr1
mkdir axtNet
foreach f (axtChain/net/*.net)
    netToAxt $f axtChain/chain/$f:t:r.chain \
        /cluster/bluearc/danRer3/nib /cluster/bluearc/fugu/fr1/chromNib stdout \
        | axtSort stdin stdout \
        | gzip -c > axtNet/$f:t:r.danRer3.fr1.net.axt.gz
end
# cleanup
ssh kkstore02
cd /cluster/data/danRer3/bed/blastz.fr1/axtChain
rm noClass.net
rm -r net
rm -r chain
# remake mafNet from the new axtNet
cd /cluster/data/danRer3/bed/blastz.fr1
rm -r mafNet
mkdir mafNet
foreach f (axtNet/*.danRer3.fr1.net.axt.gz)
    axtToMaf -tPrefix=danRer3. -qPrefix=fr1. $f \
        /cluster/data/danRer3/bed/blastz.fr1/S1.len \
        /cluster/data/danRer3/bed/blastz.fr1/S2.len \
        stdout \
        | gzip -c > mafNet/$f:t:r:r:r:r:r.maf.gz
end
# copy the new axtNet files to downloads and replace old ones
ssh hgwdev
rm -r /usr/local/apache/htdocs/goldenPath/danRer3/vsFr1/axtNet
cd /usr/local/apache/htdocs/goldenPath/danRer3/vsFr1
mkdir -p /usr/local/apache/htdocs/goldenPath/danRer3/vsFr1/axtNet
ln -s /cluster/data/danRer3/bed/blastz.fr1/axtNet/*.axt.gz axtNet/
# remake md5sum.txt
rm md5sum.txt
md5sum *.gz */*.gz > md5sum.txt
# Check README in the downloads section and add a note about how the
# unordered chroms were split up into scaffolds.
# Add trackDb entry for chain and net tracks to
# trackDb/zebrafish/danRer3/trackDb.ra
# Do swap to get danRer3 chains on Fugu, fr1 - see makeFr1.doc
# featureBits -chrom=chr2 danRer3 refGene:cds chainFr1Link -enrichment
# refGene:cds 0.705%, chainFr1Link 8.960%, both 0.645%, cover 91.53%,
# enrich 10.22x
# featureBits -chrom=chr2 danRer2 refGene:cds chainFr1Link -enrichment
# refGene:cds 0.739%, chainFr1Link 4.537%, both 0.620%, cover 83.90%,
# enrich 18.49x
# featureBits -chrom=chrNA danRer3 refGene:cds chainFr1Link -enrichment
# refGene:cds 0.449%, chainFr1Link 7.129%, both 0.399%, cover 88.78%,
# enrich 12.45x
# featureBits -chrom=chrNA danRer2 refGene:cds chainFr1Link -enrichment
# refGene:cds 0.499%, chainFr1Link 3.901%, both 0.409%, cover 81.90%,
# enrich 20.99x
# Run directory files are already on /cluster/data. Remake downloads
# for the Fugu alignments since these have been removed from
# the downloads directory. (hartera, 2005-11-17)
ssh hgwdev
# remake downloads using the doBlastzChainNet.pl script
cd /cluster/data/danRer3/bed/blastz.fr1
nice /cluster/bin/scripts/doBlastzChainNet.pl \
    -continue download -stop download `pwd`/DEF >& doDownload.log &
# Check README in the downloads section and add a note about how the
# unordered chroms were split up into scaffolds.

# VEGA
# get transcripts in transcripts_coords from e-mail from Mario Caccamo
# at Sanger 06/16/05.
# also README for Vega
ssh kkstore01
mkdir -p /cluster/data/danRer3/bed/vegaGene
cd /cluster/data/danRer3/bed/vegaGene

# AUTO UPDATE GENBANK MRNA AND EST AND MGC GENES RUN
# (DONE, 2005-08-22, markd)
# align with revised genbank process
cd ~kent/src/hg/makeDb/genbank
cvs update -d etc
# edit etc/genbank.conf to add danRer3; had to run on pk, due to kk
# being down.
# Set temporary locations for server files
# danRer3 (zebrafish)
# Lift file partitions unplaced sequence pseudo-chroms (disabled)
danRer3.serverGenome = /cluster/data/danRer3/danRer3.2bit
##danRer3.clusterGenome = /iscratch/i/danRer3/danRer3.2bit
##danRer3.ooc = /iscratch/i/danRer3/danRer3_11.ooc
danRer3.clusterGenome = /san/sanvol1/scratch/danRer3/danRer3.2bit
danRer3.ooc = /san/sanvol1/scratch/danRer3/danRer3_11.ooc
##danRer3.align.unplacedChroms = chrNA chrUn
##danRer3.lift = /cluster/data/danRer3/liftSupertoChrom/liftNAandUnScaffoldsToChrom.lft
danRer3.lift = no
danRer3.downloadDir = danRer3
danRer3.mgcTables.default = full
danRer3.mgcTables.mgc = all
# update /cluster/data/genbank/
make etc-update
ssh kkstore02
cd /cluster/data/genbank
nice bin/gbAlignStep -initial danRer3 &
# load database when finished
ssh hgwdev
cd /cluster/data/genbank
nice ./bin/gbDbLoadStep -drop -initialLoad danRer3 &
# enable daily alignment and update of hgwdev
cd ~kent/src/hg/makeDb/genbank
cvs update -d etc
# add danRer3 to:
#   etc/align.dbs
#   etc/hgwdev.dbs
cvs commit
make etc-update

# TIGR GENE INDEX (DONE, 2005-08-24, hartera)
# Data from Razvan Sultana (rsultana@jimmy.harvard.edu or rsultana@tigr.org)
# Includes data for chr1-25 and chrM, NOT chrNA and chrUn. Asked for these
# on scaffolds and not on the virtual chroms - harder to generate.
ssh kkstore02
mkdir -p /cluster/data/danRer3/bed/tigr
cd /cluster/data/danRer3/bed/tigr
wget --timestamping \
    ftp://ftp.tigr.org/pub/data/tgi/Danio_rerio/TGI_track_danRer3_chr1-25.tgz
tar xvzf TGI*.tgz
# this is data for just chr1-25 and chrM. Data for NA and Un are to follow.
ls chr1_*
# chr1_drosophTCs  chr1_g_gallusTCs  chr1_mouseTCs  chr1_zfishTCs
# chr1_elegansTCs  chr1_humanTCs     chr1_ratTCs
# so species are fly, chicken, mouse, zebrafish, C. elegans, human and rat
foreach f (*g_gallus*)
    set f1 = `echo $f | sed -e 's/g_gallus/chicken/g'`
    mv $f $f1
end
foreach f (*drosoph*)
    set f1 = `echo $f | sed -e 's/drosoph/Dmelano/g'`
    mv $f $f1
end
foreach o (Dmelano chicken elegans human mouse rat zfish)
    echo $o
    setenv O $o
    foreach f (chr*_$o*s)
        tail +2 $f | perl -wpe 's/THC/TC/; s/(TH?C\d+)/$ENV{O}_$1/;' > $f.gff
    end
end
ssh hgwdev
cd /cluster/data/danRer3/bed/tigr
hgsql danRer3 -e "drop table tigrGeneIndex"
nice ldHgGene -exon=TC danRer3 tigrGeneIndex *.gff
# Read 75388 transcripts in 288032 lines in 182 files
# 75388 groups 26 seqs 1 sources 1 feature types
# 75388 gene predictions
checkTableCoords danRer3 tigrGeneIndex
/cluster/bin/scripts/runGeneCheck /cluster/data/danRer3/bed/tigr
# no CDS in these gene predictions so fix this:
hgsql danRer3 -e "update tigrGeneIndex set cdsStart = txStart;"
hgsql danRer3 -e "update tigrGeneIndex set cdsEnd = txEnd;"
# compress all files
gzip chr*

# MAKE Human Proteins track (DONE 2005-09-21 braney)
ssh kkstore02
mkdir -p /cluster/data/danRer3/blastDb
cd /cluster/data/danRer3/blastDb
cut -f 1 ../chrom.sizes | sed "s/chr//" | sed "/NA/d" | sed "/Un/d" > chrom.list
for i in `cat chrom.list`; do ls -1 ../$i/*/*.fa . ; done \
    | sed -n "/.*_.*_.*_.*/p" > list
ln -s `cat list` .
for i in *.fa
do
    /projects/compbio/bin/i686/formatdb -i $i -p F
done
rm *.log *.fa list
cd ..
for i in `cat blastDb/chrom.list`; do cat $i/chr*/*.lft ; done > jkStuff/subChr.lft
rm blastDb/chrom.list
mkdir /cluster/data/danRer3/scaffoldBlastDb
cd /cluster/data/danRer3/scaffoldBlastDb
cat ../Un/scaffoldsSoftMask/*.fa ../NA/scaffoldsSoftMask/*.fa \
    | faSplit sequence stdin 500 scaf
for i in *.fa
do
    /projects/compbio/bin/i686/formatdb -i $i -p F
done
rm *.log *.fa
mkdir -p /san/sanvol1/scratch/danRer3/comboBlastDb
cd /cluster/data/danRer3/blastDb
for i in nhr nin nsq; do cp *.$i /san/sanvol1/scratch/danRer3/comboBlastDb; done
cd /cluster/data/danRer3/scaffoldBlastDb
for i in nhr nin nsq; do cp *.$i /san/sanvol1/scratch/danRer3/comboBlastDb; done
mkdir -p /cluster/data/danRer3/bed/tblastn.hg17KG
cd /cluster/data/danRer3/bed/tblastn.hg17KG
echo /san/sanvol1/scratch/danRer3/comboBlastDb/*.nsq \
    | xargs ls -S | sed "s/\.nsq//" > query.lst
# we want around 250000 jobs
calc `wc /cluster/data/hg17/bed/blat.hg17KG/hg17KG.psl | awk "{print \\\$1}"`/\(250000/`wc query.lst | awk "{print \\\$1}"`\)
# 37365/(250000/3539) = 528.938940
mkdir -p /cluster/bluearc/danRer2/bed/tblastn.hg17KG/kgfa
split -l 529 /cluster/data/hg17/bed/blat.hg17KG/hg17KG.psl \
    /cluster/bluearc/danRer2/bed/tblastn.hg17KG/kgfa/kg
ln -s /cluster/bluearc/danRer2/bed/tblastn.hg17KG/kgfa kgfa
cd kgfa
for i in *; do pslxToFa $i $i.fa; rm $i; done
cd ..
ls -1S kgfa/*.fa > kg.lst
mkdir -p /cluster/bluearc/danRer2/bed/tblastn.hg17KG/blastOut
ln -s /cluster/bluearc/danRer2/bed/tblastn.hg17KG/blastOut
for i in `cat kg.lst`; do mkdir blastOut/`basename $i .fa`; done
tcsh
cat << '_EOF_' > blastGsub
#LOOP
blastSome $(path1) {check in line $(path2)} {check out exists blastOut/$(root2)/q.$(root1).psl }
#ENDLOOP
'_EOF_'
cat << '_EOF_' > blastSome
#!/bin/sh
BLASTMAT=/iscratch/i/blast/data
export BLASTMAT
g=`basename $2`
f=/tmp/`basename $3`.$g
# try successively stricter e-values until blastall succeeds
for eVal in 0.01 0.001 0.0001 0.00001 0.000001 1E-09 1E-11
do
    if /scratch/blast/blastall -M BLOSUM80 -m 0 -F no -e $eVal -p tblastn \
        -d $1 -i $2 -o $f.8
    then
        mv $f.8 $f.1
        break;
    fi
done
if test -f $f.1
then
    if /cluster/bin/i386/blastToPsl $f.1 $f.2
    then
        liftUp -nosort -type=".psl" -nohead $f.3 ../../jkStuff/subChr.lft carry $f.2
        liftUp -nosort -type=".psl" -nohead $f.4 ../../jkStuff/liftAll.lft carry $f.3
        liftUp -nosort -type=".psl" -pslQ -nohead $3.tmp /cluster/data/hg17/bed/blat.hg17KG/protein.lft warn $f.4
        if pslCheck -prot $3.tmp
        then
            mv $3.tmp $3
            rm -f $f.1 $f.2 $f.3 $f.4
        fi
        exit 0
    fi
fi
rm -f $f.1 $f.2 $3.tmp $f.8 $f.3 $f.4
exit 1
'_EOF_'
chmod +x blastSome
gensub2 query.lst kg.lst blastGsub blastSpec
ssh kk
cd /cluster/data/danRer3/bed/tblastn.hg17KG
para create blastSpec
para push
# Completed: 203170 of 203170 jobs
# CPU time in finished jobs:   17875092s  297918.20m  4965.30h  206.89d  0.567 y
# IO & Wait Time:               4092508s   68208.46m  1136.81h   47.37d  0.130 y
# Average job time:                 108s       1.80m     0.03h    0.00d
# Longest finished job:            1778s      29.63m     0.49h    0.02d
# Submission to last job:         64970s    1082.83m    18.05h    0.75d
tcsh
cat << '_EOF_' > chainGsub
#LOOP
chainOne $(path1)
#ENDLOOP
'_EOF_'
cat << '_EOF_' > chainOne
(cd $1; cat q.*.psl | simpleChain -prot -outPsl -maxGap=75000 stdin ../c.`basename $1`.psl)
'_EOF_'
chmod +x chainOne
ls -1dS `pwd`/blastOut/kg?? > chain.lst
gensub2 chain.lst single chainGsub chainSpec
para create chainSpec
para push
# Completed: 71 of 71 jobs
# CPU time in finished jobs:      89115s    1485.25m    24.75h    1.03d  0.003 y
# IO & Wait Time:                 35631s     593.85m     9.90h    0.41d  0.001 y
# Average job time:                1757s      29.28m     0.49h    0.02d
# Longest finished job:           15587s     259.78m     4.33h    0.18d
# Submission to last job:         23380s     389.67m     6.49h    0.27d
ssh kkstore02
cd /cluster/data/danRer3/bed/tblastn.hg17KG/blastOut
for i in kg??
do
    # keep alignments covering more than 60% of the query protein
    # ($11 = qSize, $12 = qStart, $13 = qEnd)
    cat c.$i.psl | awk "(\$13 - \$12)/\$11 > 0.6 {print}" > c60.$i.psl
    sort -rn c60.$i.psl | pslUniq stdin u.$i.psl
    # and those where matches ($1) cover more than 60% of the query
    awk "((\$1 / \$11) ) > 0.60 { print }" c60.$i.psl > m60.$i.psl
    echo $i
done
liftUp -nohead -type=.psl stdout \
    /cluster/data/danRer3/liftSupertoChrom/liftNAandUnScaffoldsToChrom.lft \
    carry u.*.psl m60* \
    | sort -T /tmp -k 14,14 -k 16,16n -k 17,17n | uniq \
    > /cluster/data/danRer3/bed/tblastn.hg17KG/blastHg17KG.psl
ssh hgwdev
cd /cluster/data/danRer3/bed/tblastn.hg17KG
hgLoadPsl danRer3 blastHg17KG.psl
# 21063005 bases of 1630323462 (1.292%) in intersection
# back to kkstore02
rm -rf blastOut
# End tblastn

# BACENDS TRACK (DONE, 2005-09-28, hartera)
# Track display is very slow on large regions. Split the all_bacends table
# by chromosome (DONE, 2006-04-19, hartera)
# REDO BACENDS FOR PAIRS, SINGLES, BAD PAIRS AND ALL BACENDS TABLES
# (see separate section on REDO BACENDS, 2006-05-01 - 2006-05-08, hartera)
ssh kkstore01
# BAC ends sequence files provided by Mario Caccamo at Sanger
# mc2@sanger.ac.uk
mkdir -p /cluster/data/danRer3/bed/bacends
cd /cluster/data/danRer3/bed/bacends
wget --timestamp ftp://ftp.sanger.ac.uk/pub/mc2/zf_bacends.fa.gz
wget --timestamp ftp://ftp.sanger.ac.uk/pub/mc2/DH_bacends.fa.gz
wget --timestamp ftp://ftp.sanger.ac.uk/pub/mc2/bacend_placement.txt.gz
gunzip *.gz
# DH_bacends.fa are from the new library from a doubled haploid zebrafish.
# zf_bacends.fa are from the existing libraries used in danRer2 and danRer1.
# Several reads are present for some of the BAC ends and these have
# names like p1kaSP6 or q1kaT7 for duplicated reads and p1kSP6w or q1kT7w
# for multiple reads. In the trace repository, the most recent sequence
# is stored and the 'a' or 'w' is dropped from the name.
# for the DH_bacends.fa from the CHORI73 library, the names are
#    experiment file name       trace_name
#    ========================   ================
#    CHORI73_139g06.p1kSP6      CHORI73_139G6SP6
#    CHORI73_165b21.q1kT7       CHORI73_165B21T7
# The trace name is that stored in the trace archive, with leading zeros
# dropped, ".p1k" or ".q1k" removed, and lower case changed to upper.
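# For example, the renaming rule can be sketched as a one-liner (just an
# illustration of the mapping, not the script used later in this section):
echo 'CHORI73_139g06.p1kSP6' | tr 'a-z' 'A-Z' \
    | perl -pe 's/(CHORI73_[0-9]+[A-Z])0?([0-9]+)\.(?:P1K|Q1K)A?(SP6|T7)W?/$1$2$3/'
# CHORI73_139G6SP6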
ssh kkstore02
cd /cluster/data/danRer3/bed/bacends
# check list of prefixes in zf_bacends.fa
grep '>' zf_bacends.fa > zf.names
perl -pi.bak -e 's/>//' zf.names
perl -pi.bak -e 's/^([A-Za-z]+)[0-9]+.+/$1/' zf.names
sort -u zf.names
# bZ
# zC
# zK
# zKp
# in DH_bacends.fa, all are CHORI73_
# For DH_bacends.fa, need to clean up and change names to Trace archive
# format as above. Then choose the most recent sequence; bad sequences
# with lots of Ns should be removed at the alignment stage as they will
# not pass the Blat or pslReps criteria.
# cat zf_bacends.fa DH_bacends.fa >> Zv5Bacends.fa
# faSize Zv5Bacends.fa
# 680121953 bases (11160014 N's 668961939 real 668961939 upper 0 lower)
# in 729101 sequences in 1 files
# Total size: mean 932.8 sd 242.6 min 26 (CHORI73_189m04.p1kSP6)
# max 5717 (CHORI73_255a17.q1kT7) median 882
# N count: mean 15.3 sd 75.7
# U count: mean 917.5 sd 242.2
# L count: mean 0.0 sd 0.0
wc -l *.fa
#  6412741 DH_bacends.fa
# 14700258 Zv5Bacends.fa
#  8287517 zf_bacends.fa
grep '>' DH_bacends.fa | wc -l
# 304252
grep '>' zf_bacends.fa | wc -l
# 424849
# for DH_bacends.fa there are replicate reads. If duplicate plates
# have been made (i.e. read names like ..p1kaSP6 or ..q1kaT7) or plates
# have been sequenced multiple times (i.e. read names like ..p1kSP6w or
# ..q1kT7w), the Sanger trace repository has the most recent read and
# dropped the 'a' or 'w' from the trace name.
# some are not in the repository. They had bad quality reads with a lot
# of Ns or runs of the same base. These should be dropped in the
# alignment filtering.
# now download sequence files from the Sanger ftp site - these are the
# ones from the Sanger sequence repository
ssh kkstore02
mkdir -p /cluster/data/danRer3/bed/bacends/seqs
cd /cluster/data/danRer3/bed/bacends/seqs
# get contents of ftp directory
wget --timestamp \
    ftp://ftp.ensembl.org/pub/traces/danio_rerio/fasta/
# from index.html, grep lines with cloneEnd
grep "cloneEnd" index.html > cloneEnds
awk 'BEGIN {FS="\""} {print "wget --timestamp",$2;}' cloneEnds \
    > getCloneEnds.csh
chmod +x getCloneEnds.csh
cat getCloneEnds.csh
wget --timestamp ftp://ftp.ensembl.org:21/pub/traces/danio_rerio/fasta/mpgeb-zfish-cloneEnd-1025270298.fasta.gz
wget --timestamp ftp://ftp.ensembl.org:21/pub/traces/danio_rerio/fasta/mpgeb-zfish-cloneEnd-1025273988.fasta.gz
wget --timestamp ftp://ftp.ensembl.org:21/pub/traces/danio_rerio/fasta/mpgeb-zfish-cloneEnd-1025278580.fasta.gz
wget --timestamp ftp://ftp.ensembl.org:21/pub/traces/danio_rerio/fasta/mpgeb-zfish-cloneEnd-1035416745.fasta.gz
wget --timestamp ftp://ftp.ensembl.org:21/pub/traces/danio_rerio/fasta/mpgeb-zfish-cloneEnd-1035417824.fasta.gz
wget --timestamp ftp://ftp.ensembl.org:21/pub/traces/danio_rerio/fasta/mpgeb-zfish-cloneEnd-1040215846.fasta.gz
wget --timestamp ftp://ftp.ensembl.org:21/pub/traces/danio_rerio/fasta/mpgeb-zfish-cloneEnd-1048006071.fasta.gz
wget --timestamp ftp://ftp.ensembl.org:21/pub/traces/danio_rerio/fasta/sanger-zfish-CHORI-73-cloneEnd-1114727127.fasta.gz
wget --timestamp ftp://ftp.ensembl.org:21/pub/traces/danio_rerio/fasta/sanger-zfish-CHORI-73-cloneEnd-1115222417.fasta.gz
wget --timestamp ftp://ftp.ensembl.org:21/pub/traces/danio_rerio/fasta/sanger-zfish-CHORI-73-cloneEnd-1115226483.fasta.gz
wget --timestamp ftp://ftp.ensembl.org:21/pub/traces/danio_rerio/fasta/sanger-zfish-CHORI-73-cloneEnd-1115230498.fasta.gz
wget --timestamp ftp://ftp.ensembl.org:21/pub/traces/danio_rerio/fasta/sanger-zfish-CHORI-73-cloneEnd-1115234585.fasta.gz
wget --timestamp ftp://ftp.ensembl.org:21/pub/traces/danio_rerio/fasta/sanger-zfish-CHORI-73-cloneEnd-1115238038.fasta.gz
wget --timestamp ftp://ftp.ensembl.org:21/pub/traces/danio_rerio/fasta/sanger-zfish-CHORI-73-cloneEnd-1115240957.fasta.gz
wget --timestamp ftp://ftp.ensembl.org:21/pub/traces/danio_rerio/fasta/sanger-zfish-cloneEnd-1039514906.fasta.gz
wget --timestamp ftp://ftp.ensembl.org:21/pub/traces/danio_rerio/fasta/sanger-zfish-cloneEnd-1039603426.fasta.gz
wget --timestamp ftp://ftp.ensembl.org:21/pub/traces/danio_rerio/fasta/sanger-zfish-cloneEnd-1039604741.fasta.gz
wget --timestamp ftp://ftp.ensembl.org:21/pub/traces/danio_rerio/fasta/sanger-zfish-cloneEnd-1040231265.fasta.gz
nice getCloneEnds.csh >& bac.log &
# check log to see everything downloaded ok and then remove
rm bac.log index.html
# unzip files
gunzip *.gz
# cat together CHORI73 FASTA files
cat sanger-zfish-CHORI*.fasta > CHORI73_bacends.fa
grep '>' CHORI73_bacends.fa > CHORI73.names
perl -pi.bak -e 's/>//' CHORI73.names
sort CHORI73.names | uniq > CHORI73.names.sort
wc -l CHORI73.names.sort
# 265235 CHORI73.names.sort
cat sanger-zfish-cloneEnd*.fasta > zfish_bacends.fa
cat mpgeb-zfish-cloneEnd*.fasta > zfishmpgeb_bacends.fa
grep '>' zfish_bacends.fa | wc -l
# 164302
grep '>' zfishmpgeb_bacends.fa | wc -l
# 264633
cp CHORI73.names.sort /cluster/data/danRer3/bed/bacends/
# compared this list of sequence names to those in zf_bacends.fa and got
# more sequences in zf_bacends.fa - checked and some are in the trace
# repository and some are not.
# for CHORI_73 there are 394 extra sequences in the downloaded file
# and over 7000 in the original file sent by Mario. Just use the original
# file here, as the sequences will probably be filtered out if there
# are bad alignments. Get the list of sequences for which there are more
# than 2 ends. Some end sequences have multiple reads.
cd /cluster/data/danRer3/bed/bacends
# look at file of CHORI73_ sequences sent by Mario at Sanger:
grep '>' DH_bacends.fa > DH.names
perl -pi.bak -e 's/>//' DH.names
perl -pi.bak -e 's/(CHORI73_[0-9a-z]+)\.[a-z0-9]+.+/$1/' DH.names
sort DH.names | uniq -c | sort -nr > DH.names.counts
awk '{if ($1 > 2) print $2;}' DH.names.counts > DH.names.morethan2
# translate to upper case and remove leading zeros
cat DH.names.morethan2 | tr '[a-z]' '[A-Z]' > DH.names.morethan2.upper
# remove leading 0
perl -pi.bak -e 's/(CHORI73_[0-9]+[A-Z])0([0-9]+)/$1$2/' \
    DH.names.morethan2.upper
sort DH.names.morethan2.upper | uniq > DH.names.morethan2.upper.sort
wc -l *.sort
# 265235 CHORI73.names.sort
#   6020 DH.names.morethan2.upper.sort
comm -12 CHORI73.names.sort DH.names.morethan2.upper.sort | wc
# 5299
# so 721 are not in this list; they are probably not in the repository,
# but align these anyway.
# for those that are, use the versions in CHORI73.names
comm -12 CHORI73.names.sort DH.names.morethan2.upper.sort \
    > CHORI73.names.touse
comm -13 CHORI73.names.sort DH.names.morethan2.upper.sort \
    > DHmorethan2.DHonly
awk '{if ($1 <= 2) print $2;}' DH.names.counts > DH.names.2orless
# this is the list of sequences to get from DH_bacends.fa
# need to back-translate the list in DHmorethan2.DHonly
cat DHmorethan2.DHonly | tr '[A-Z]' '[a-z]' > DHtmp
sed -e 's/chori/CHORI/' DHtmp > DHmorethan2.DHonly.format
# need to put leading zeros back and a "." at the end to help
# pattern matching with grep.
cat << '_EOF_' > addZeros.pl
#!/usr/bin/perl -w
use strict;
my ($file);
$file = $ARGV[0];
open (FILE, $file) || die "Can not open $file: $!\n";
while (<FILE>) {
    chomp;
    my ($l, $id);
    $l = $_;
    if ($l =~ /^CHORI73_[0-9]+[a-z][0-9]{2,}/) {
        print "$l\\.\n";
    } elsif ($l =~ /^(CHORI73_[0-9]+[a-z])([0-9]{1})/) {
        $id = $1 . "0" . $2 . "\\.";
"\\."; print "$id\n"; } } close FILE; '_EOF_' chmod +x addZeros.pl perl addZeros.pl DHmorethan2.DHonly.format > DHmorethan2.DHonly.format2 wc -l DHmorethan2.DHonly* # 721 DHmorethan2.DHonly # 721 DHmorethan2.DHonly.format # 721 DHmorethan2.DHonly.format2 # need to get full sequence names grep '>' DH_bacends.fa > DHBacs.fullnames perl -pi.bak -e 's/>//' DHBacs.fullnames perl -pi.bak -e 's/(CHORI73_[0-9a-z]+\.[a-z0-9A-Z]+) bases.+/$1/' \ DHBacs.fullnames grep -f DHmorethan2.DHonly.format2 DHBacs.fullnames \ > DHmorethan2.DHonly.fullnames wc -l DHmorethan2.DHonly.fullnames # 2352 DHmorethan2.DHonly.fullnames sort DHmorethan2.DHonly.fullnames > DHmorethan2.DHonly.fullnames.sort # do for those with less than 2 sequences to get the full names cat << '_EOF_' > getFullNames.pl #!/usr/bin/perl -w use strict; my ($file, $patterns, %idsHash); $file = $ARGV[0]; $patterns = $ARGV[1]; open (FILE, $file) || die "Can not open $file: $!\n"; open (PATTERNS, $patterns) || die "Can not open $patterns: $!\n"; while () { chomp; my ($l, $pref, $dir); $l = $_; if ($l =~ /^(CHORI73_[0-9a-z]+)\./) { $pref = $1; push(@{$idsHash{$pref}}, $l); } } close FILE; while () { my ($line, @ids, $i); chomp; $line = $_; if (exists($idsHash{$line})) { @ids = @{$idsHash{$line}}; foreach $i (@ids) { print "$i\n"; } } } close PATTERNS; '_EOF_' chmod +x getFullNames.pl perl getFullNames.pl DHBacs.fullnames DH.names.2orless \ > DH.fullnames.2orless # do the same for CHORI73.names.touse to get full names awk '{print $1"SP6"}' CHORI73.names.touse > CHORI73.namesSP6.touse awk '{print $1"T7"}' CHORI73.names.touse > CHORI73.namesT7.touse cat CHORI73.namesSP6.touse CHORI73.namesT7.touse \ > CHORI73.namesSP6andT7.touse wc -l CHORI73.names* # 265235 CHORI73.names.sort # 10598 CHORI73.namesSP6andT7.touse # 5299 CHORI73.namesSP6.touse # 5299 CHORI73.namesT7.touse # 5299 CHORI73.names.touse grep '>' CHORI73_bacends.fa > CHORI73.fullnames perl -pi.bak -e 's/>//' CHORI73.fullnames grep -f CHORI73.namesSP6andT7.touse CHORI73.fullnames \ > CHORI73.fullnames.touse # so get all the sequence records together in one file ssh kkstore02 cd /cluster/data/danRer3/bed/bacends mkdir bacSeqs # get all sequences from DH_bacends.fa that have 2 or less for the clone. # This might include cases where there are duplicate reads for one end # only but these will go into the singles track anyway. faSomeRecords DH_bacends.fa DH.fullnames.2orless ./bacSeqs/DHBacs.2orless.fa # get all sequences with more than 2 sequences for that clone but # with no sequence in the new downloaded BAC ends sequence file that # has only one sequence for each BAC end. faSomeRecords DH_bacends.fa DHmorethan2.DHonly.fullnames.sort \ ./bacSeqs/DHBacs.2ormore.orig.fa # get all sequences for BAC ends where there are more than 2 read for # ends for one clone so there are replicate reads for at least one end. # use the sequence in the downloaded CHORI73 set of clone ends for these. faSomeRecords CHORI73_bacends.fa CHORI73.fullnames.touse \ ./bacSeqs/CHORI73.fromDH.morethan2.fa cd bacSeqs # translate to upper case and remove leading zeros cat DHBacs.2orless.fa | tr '[a-z]' '[A-Z]' > DHBacs.2orless.format.fa cat DHBacs.2ormore.orig.fa | tr '[a-z]' '[A-Z]' \ > DHBacs.2ormore.orig.format.fa # remove leading 0 and just use name as FASTA header # need to leave in a or w as in p1kaSP6 or q1kaT7 or p1kSP6w or q1kT7w # these will distinguish replicate reads from the same sequence and will # be removed later when the best alignment is selected. 
perl -pi.bak -e \
    's/(CHORI73_[0-9]+[A-Z]{1})0?([0-9]+)\.(P1K|Q1K)(ASP6|SP6|SP6W|AT7|T7|T7W) BASES.+/$1$2$4/' \
    DHBacs*format.fa
cat CHORI73.*.fa DHBacs*.format.fa > CHORI73BACends.fa
grep '>' CHORI73BACends.fa | wc -l
# 295722
# then combine these with the zf_bacends.fa from Sanger, which contains
# the rest of the BAC end sequences.
cat ../zf_bacends.fa CHORI73BACends.fa > Zv5BACends.fa
grep '>' Zv5BACends.fa | wc -l
# 720571
faSize Zv5BACends.fa
# 674252474 bases (10674972 N's 663577502 real 663577502 upper 0 lower)
# in 720571 sequences in 1 files
# Total size: mean 935.7 sd 239.8 min 26 (CHORI73_189M4SP6)
# max 5403 (zC259G13.zb) median 882
# N count: mean 14.8 sd 72.4
# U count: mean 920.9 sd 239.6
# L count: mean 0.0 sd 0.0
# check Zv5BACends.fa has unique sequence names
grep '>' Zv5BACends.fa | sed 's/>//' > names
sort names | uniq -c | sort -nr > names.count
# all unique names so cleanup
rm names names.count *.bak
# Now the BAC end sequences file has been made, align the sequences
# to danRer3 using Blat.
ssh pk
# problems running these on kk using input from bluearc - it slowed down
# kkstore02 with heavy load. So move everything to the san as it
# scales better than the bluearc, especially from the pk. The run
# directory is on the san also.
cd /cluster/data/danRer3/bed/bacends/bacSeqs
# first split up the bacends sequence and add to a directory on the san
mkdir -p /san/sanvol1/scratch/danRer3/bacends/Zv5bacends
# split up sequence for cluster runs
faSplit sequence Zv5BACends.fa 20 \
    /san/sanvol1/scratch/danRer3/bacends/Zv5bacends/bacends
# get all the chrom contig files onto the san
mkdir -p /san/sanvol1/scratch/danRer3/trfFaChroms
rsync -a --progress /cluster/bluearc/danRer3/trfFa/chr[0-9M]*.fa \
    /san/sanvol1/scratch/danRer3/trfFaChroms/
cd /cluster/data/danRer3/bed/bacends
mkdir -p /san/sanvol1/scratch/danRer3/bacends/chromsRun
ln -s /san/sanvol1/scratch/danRer3/bacends/chromsRun
# make a directory for output; do not have output going to the
# /cluster/data dir as it is very large.
mkdir -p /san/sanvol1/scratch/danRer3/bacends/chromsPsl
ln -s /san/sanvol1/scratch/danRer3/bacends/chromsPsl
# also copy over the 11.ooc file for danRer3 if not there already
cp -p /cluster/bluearc/danRer3/danRer3_11.ooc \
    /san/sanvol1/scratch/danRer3/
# make input file lists
cd /cluster/data/danRer3/bed/bacends/chromsRun
ls -1S /san/sanvol1/scratch/danRer3/bacends/Zv5bacends/*.fa > bacends.lst
# do blat just for chr1-25 and chrM
ls -1S /san/sanvol1/scratch/danRer3/trfFaChroms/*.fa > seqs.lst
# 64 bit blat used for pk. This version of blat recently had a bug fix
# so it should give the same result as i386 blat on kk. Use an absolute
# path for the output dir rather than the symlink as that would increase
# I/O. Use Blat parameters as for mm5 and hg17.
cat << '_EOF_' > template
#LOOP
/cluster/bin/x86_64/blat $(path1) $(path2) -ooc=/san/sanvol1/scratch/danRer3/danRer3_11.ooc {check out line+ /san/sanvol1/scratch/danRer3/bacends/chromsPsl/$(root1)_$(root2).psl}
#ENDLOOP
'_EOF_'
# << this line keeps emacs coloring happy
/cluster/bin/i386/gensub2 seqs.lst bacends.lst template jobList
/cluster/bin/i386/para create jobList
/cluster/bin/i386/para try, check, push, check, ...
# /cluster/bin/i386/para time
# Completed: 4160 of 4160 jobs
# CPU time in finished jobs:     746878s   12447.96m   207.47h    8.64d  0.024 y
# IO & Wait Time:                 11166s     186.11m     3.10h    0.13d  0.000 y
# Average job time:                 182s       3.04m     0.05h    0.00d
# Longest running job:                0s       0.00m     0.00h    0.00d
# Longest finished job:             915s      15.25m     0.25h    0.01d
# Submission to last job:          5100s      85.00m     1.42h    0.06d
# run jobs to do blat of NA and Un scaffolds vs BAC end sequences
ssh pk
# copy scaffolds to the san
mkdir -p /san/sanvol1/scratch/danRer3/scaffoldsSoftMask
foreach f (/cluster/bluearc/scratch/danRer3/scaffoldsSoftMask/Zv5_*.fa)
    rsync -a --progress $f /san/sanvol1/scratch/danRer3/scaffoldsSoftMask/
end
cd /cluster/data/danRer3/bed/bacends
mkdir -p /san/sanvol1/scratch/danRer3/bacends/scaffoldsNAandUnRun
ln -s /san/sanvol1/scratch/danRer3/bacends/scaffoldsNAandUnRun
# make a directory for output; do not have output going to the
# /cluster/data dir as it is very large.
mkdir -p /san/sanvol1/scratch/danRer3/bacends/scaffoldsNAandUnPsl
ln -s /san/sanvol1/scratch/danRer3/bacends/scaffoldsNAandUnPsl
# make input file lists
cd /cluster/data/danRer3/bed/bacends/scaffoldsNAandUnRun
ls -1S /san/sanvol1/scratch/danRer3/bacends/Zv5bacends/*.fa > bacends.lst
# do blat just for the NA and Un scaffolds
foreach f (/san/sanvol1/scratch/danRer3/scaffoldsSoftMask/Zv5_*.fa)
    echo $f >> scafs.lst
end
# 64 bit blat used for pk. This version of blat recently had a bug fix
# so it should give the same result as i386 blat on kk. Use an absolute
# path for the output dir rather than the symlink as that would increase
# I/O. Use Blat parameters as for mm5 and hg17.
cat << '_EOF_' > template
#LOOP
/cluster/bin/x86_64/blat $(path1) $(path2) -ooc=/san/sanvol1/scratch/danRer3/danRer3_11.ooc {check out line+ /san/sanvol1/scratch/danRer3/bacends/scaffoldsNAandUnPsl/$(root1)_$(root2).psl}
#ENDLOOP
'_EOF_'
# << this line keeps emacs coloring happy
/cluster/bin/i386/gensub2 scafs.lst bacends.lst template jobList
/cluster/bin/i386/para create jobList
/cluster/bin/i386/para try, check, push, check, ...
# para time
# Completed: 298820 of 298820 jobs
# CPU time in finished jobs:    1232495s   20541.58m   342.36h   14.26d  0.039 y
# IO & Wait Time:                923511s   15391.85m   256.53h   10.69d  0.029 y
# Average job time:                   7s       0.12m     0.00h    0.00d
# Longest running job:                0s       0.00m     0.00h    0.00d
# Longest finished job:            1008s      16.80m     0.28h    0.01d
# Submission to last job:         37494s     624.90m    10.41h    0.43d
ssh kolossus
cd /cluster/data/danRer3/bed/bacends
# need to sort the psl files, filter and liftUp
# first do the chr1-25 and chrM alignments
nice pslSort dirs rawChroms.psl tmp chromsPsl >& chromSort.log
# Time taken: 2 hours 42 minutes
pslReps -nearTop=0.02 -minCover=0.60 -minAli=0.85 -noIntrons \
    rawChroms.psl bacEndsChroms.psl /dev/null >& pslRepsChroms.log
# Took 19 minutes
# then lift up NA and Un scaffolds to chrom level
nice pslSort dirs rawNAandUn.psl tmp scaffoldsNAandUnPsl \
    >& scafsNAandUnSort.log
# took 1 hour 50 minutes
pslReps -nearTop=0.02 -minCover=0.60 -minAli=0.85 -noIntrons \
    rawNAandUn.psl bacNAandUnScafs.psl /dev/null >& pslRepsNAandUn.log
# took 18 minutes
# lift results:
liftUp bacEnds.liftedChroms.psl /cluster/data/danRer3/jkStuff/liftAll.lft \
    warn bacEndsChroms.psl
liftUp bacEnds.liftedNAandUn.psl \
    /cluster/data/danRer3/liftSupertoChrom/liftNAandUnScaffoldsToChrom.lft \
    warn bacNAandUnScafs.psl
# sort and merge these files
mkdir liftedPsl
mv *.lifted*.psl ./liftedPsl/
nice pslSort dirs bacEnds.psl tmp1 liftedPsl >& pslSortAll.log
# Took 4 minutes
pslCheck bacEnds.psl >& pslCheck.log
# there are 520 BAC ends with overlapping block errors - 1385 alignments
# use pslReps parameters used for mm6
pslReps -nearTop=0.01 -minCover=0.7 -minAli=0.8 -noIntrons raw.psl \
    bacEnds.psl /dev/null
# those for hg17
pslReps -nearTop=0.02 -minCover=0.60 -minAli=0.85 -noIntrons \
    raw.psl bacEnds2.psl /dev/null
# see how many align in each case
awk '{print $10;}' bacEnds.psl | sort | uniq -c \
    | sort -nr > bacEnds.qNames.sort
awk '{print $10;}' bacEnds2.psl | sort | uniq -c \
    | sort -nr > bacEnds2.qNames.sort
wc -l bacEnds*qNames.sort
# 549086 bacEnds2.qNames.sort
# 519773 bacEnds.qNames.sort
grep '>' Zv5Bacends.fa | wc -l
# 729101
# so 71% of sequences aligned in bacEnds.psl
# and 75% of sequences aligned in bacEnds2.psl
# use textHistogram to look at the number of alignments:
# bacEnds.psl has 374002 with only 1 alignment;
# bacEnds2.psl has 362364 with only 1 alignment.
# for bacEnds.psl, the most alignments for 1 sequence is 515;
# for bacEnds2.psl, the most alignments for 1 sequence is 1272.
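# e.g. the alignments-per-sequence histograms can be made along these
# lines (a sketch using the qNames count files made above):
awk '{print $1;}' bacEnds.qNames.sort | textHistogram -maxBinCount=25 stdin
awk '{print $1;}' bacEnds2.qNames.sort | textHistogram -maxBinCount=25 stdin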
# when these are split up into bacEndPairs, bacEndPairsBad and
# bacEndSingles, the number of alignments per sequence is reduced,
# so use bacEnds2.psl

# Process BAC end alignments
ssh kkstore02
mkdir -p /cluster/data/danRer3/bed/bacends/pairs
mkdir -p /cluster/data/danRer3/bed/bacends/bacends.1
# Downloaded BAC ends accessions from SRS:
# Go to http://srs.sanger.ac.uk
# Go to "Select Databanks" tab and check DBGSS
# Go to "Query Form" tab
# Select Organism as field and enter "Danio*" as search term
# Select AllText as field and enter "*Sanger*" as search term
# Select AllText as field and enter "T7|SP6" as search term
# Select a view
# Download as BACEndAccs.txt to the bacends.1 directory
cd /cluster/data/danRer3/bed/bacends/bacends.1
cp /cluster/data/danRer2/bed/ZonLab/bacends/bacends.1/getBacEndInfo.pl .
# get lists of SP6 and T7 accessions and merge the lists
awk 'BEGIN {FS="\t"}{OFS="\t"} {if ($7 ~ /SP6/) print $3"SP6",$4}' \
    BACEndAccs.txt > BACEndSP6.accs
awk 'BEGIN {FS="\t"}{OFS="\t"} {if ($7 ~ /T7/) print $3"T7",$4}' \
    BACEndAccs.txt > BACEndT7.accs
cat BACEndSP6.accs BACEndT7.accs > BACEndExtNames.accs
# change external names to internal names
cat << '_EOF_' > extToIntNames.pl
#!/usr/bin/perl -w
use strict;
my @clonePrefixes = ("CH211-", "ch211-", "DKEY-", "DKEYP-", "RP71-",
                     "BUSM1-", "CH73-", "CHORI-");
my %cloneHash = qw { CH211- zC DKEY- zK DKEYP- zKp RP71- bZ BUSM1- dZ
                     CH73- CHORI73_ };
while (<STDIN>) {
    my ($l, $c, $intPref);
    $l = $_;
    foreach $c (@clonePrefixes) {
        if ($l =~ /$c/) {
            # get internal name
            if (exists($cloneHash{$c})) {
                $intPref = $cloneHash{$c};
                $l =~ s/$c/$intPref/;
                print $l;
            }
        }
    }
}
'_EOF_'
chmod +x extToIntNames.pl
perl extToIntNames.pl < BACEndExtNames.accs > BACEnd_accessions.txt
# get BAC clone accessions from Genbank. They can be obtained from EMBL
# through SRS but it is harder to separate the BAC end accessions from
# the BAC clone accessions:
# go to http://www.ncbi.nlm.nih.gov
# 1) select "Nucleotide" as the search database.
# 2) Search string: Danio rerio[ORGN] AND clone[TITL] NOT survey[TITL]
#    Those sequences with "genomic survey" in the title appear to be
#    BAC clone end accessions. Here, we want only BAC clone accessions.
# 3) There are 628991 sequences (2005-09-19). Select File from the
#    Send To pulldown menu and name the file "BACClones.gbAccs.txt".
# create a script to parse out the clone ID and the accession:
cat << '_EOF_' > getAccsandIdsFromGb.pl
#!/usr/bin/perl -w
use strict;
my @clonePrefixes = ("CH211-", "ch211-", "DKEY-", "DKEYP-", "RP71-",
                     "BUSM1-", "CH73-", "CHORI-");
my %cloneHash = qw { CH211- zC DKEY- zK DKEYP- zKp RP71- bZ BUSM1- dZ
                     CH73- CHORI73_ };
my $found = "FALSE";
my $acc = "";
my $id = "";
while (<STDIN>) {
    my ($l, @f, $intId, $extPref, $intPref);
    $intPref = "";
    $extPref = "";
    chomp;
    $l = $_;
    if ($l =~ /^[0-9]+:\s+([A-Z]+[0-9]{3,})/) {
        $acc = "";
        $acc = $1;
        $found = "FALSE";
    } elsif ($l =~ /clone/) {
        $id = "";
        # check for clone name in this line
        foreach my $p (@clonePrefixes) {
            if ($l =~ /clone:?\s?($p[0-9-A-Za-z]+)/) {
                $id = $1;
                # translate to upper case
                $id =~ tr/a-z/A-Z/;
                $extPref = $p;
                $found = "TRUE";
            }
        }
    }
    if ($found eq "TRUE") {
        if (exists($cloneHash{$extPref})) {
            $intPref = $cloneHash{$extPref};
        }
        $intId = $id;
        # translate this to the internal ID
        $intId =~ s/$extPref/$intPref/;
        print "$intId\t$acc\t$id\n";
        $found = "FALSE";
    }
}
'_EOF_'
# chmod +x getAccsandIds.pl
# perl getAccsandIds.pl < BACClones.accs.txt > BACClonesIdsandAccs.txt
# Took 36 minutes. This file has internal BAC clone name, accession and
# external BAC clone name.
chmod +x getAccsandIdsFromGb.pl
# CHORI73_ is a new prefix; this is for the internal name of
# BAC clones from the CHORI73 doubled haploid library.
nice perl getAccsandIdsFromGb.pl < BACClones.gbAccs.txt \
    > BACClonesIdsandAccs.txt &
# Took under 3 minutes. The output file here has internal BAC clone name,
# Genbank accession and external BAC clone name.
grep '>' ../bacSeqs/Zv5BACends.fa | sed -e 's/>//' > allBacEnds.names
# modify getBacEndInfo.pl for these sequence names, so rename it as
# getBacEndInfov2.pl
# need to make the pairs file
perl getBacEndInfov2.pl allBacEnds.names BACEnd_accessions.txt \
    > bacEnds.log
# check that all the BAC end sequence names from allBacEnds.names
# appear in either bacEndPairs.txt or bacEndSingles.txt
wc -l bacEnd*
# 159319 bacEndAccs.aliases
# 333356 bacEndPairs.txt
#  19788 bacEndSingles.txt
# bacEndAccs.aliases contains sequence read names and their
# Genbank accessions.
awk 'BEGIN {OFS="\n"} {print $1, $2;}' bacEndPairs.txt \
    | sed -e 's/,/\n/g' > bacPrs.names
awk '{print $1;}' bacEndSingles.txt | sed -e 's/,/\n/g' > bacSingles.names
cat bacPrs.names bacSingles.names | sort > bacEnds.names.sort
sort allBacEnds.names > allBacEnds.names.sort
wc -l *.sort
# 720571 allBacEnds.names.sort
# 720571 bacEnds.names.sort
# so all the BAC ends from the FASTA file have been accounted for, either
# as pairs or singles.
# process BAC end alignments
cd /cluster/data/danRer3/bed/bacends/pairs
set bacDir = /cluster/data/danRer3/bed/bacends/bacends.1
# try different parameters
/cluster/bin/i386/pslPairs -tInsert=10000 -minId=0.91 -noBin -min=2000 \
    -max=650000 -slopval=10000 -hardMax=800000 -slop -short -long -orphan \
    -mismatch -verbose ../bacEnds.psl $bacDir/bacEndPairs.txt \
    all_bacends bacEnds
wc -l bacEnds.*
#    426 bacEnds.long
#  14875 bacEnds.mismatch
# 229139 bacEnds.orphan
# 164778 bacEnds.pairs
#      0 bacEnds.short
#    100 bacEnds.slop
# 409318 total
# there is less slop (190), more pairs (90967) and orphans (229139),
# and less mismatch (18083) and less long (980) than for danRer2.
# the insert size should be 100-200 kb, but since the assembly is not
# complete there are misassemblies, so the distance between pairs could
# be larger. If -max=200000 -slopval=10000 -hardMax=500000 is used, then
# there are 18377 bacEnds.long, 250243 bacEnds.orphan,
# and 131209 bacEnds.pairs, and over 3000 less just drop out
# (413243 total).
# try -max=300000 -slopval=10000 -hardMax=500000
# wc -l bacEnds.*
#   3343 bacEnds.long
#  11731 bacEnds.mismatch
# 243500 bacEnds.orphan
# 154981 bacEnds.pairs
#      0 bacEnds.short
#    509 bacEnds.slop
# 414064 total
# try -min=25000 -max=350000 -slopval=10000 -hardMax=500000 as for human
# wc -l bacEnds.*
#   1725 bacEnds.long
#  12081 bacEnds.mismatch
# 242235 bacEnds.orphan
# 156444 bacEnds.pairs
#    616 bacEnds.short
#   1017 bacEnds.slop
# 414118 total
# this would be good to use, but for direct comparison between danRer2
# and danRer3 it would be good to use the same parameters as before,
# so stick with those above:
# -min=2000 -max=650000 -slopval=10000 -hardMax=800000
# create header required by "rdb" tools
# NOTE: there are overlapping BAC clone ends for danRer3. Some of these
# are only a few kb apart (from the beginning of one to the end of the
# other), so use the stricter pslPairs parameters as for human and mouse.
ssh kkstore02
mkdir /cluster/data/danRer3/bed/bacends/pairs
cd /cluster/data/danRer3/bed/bacends/pairs
set bacDir = /cluster/data/danRer3/bed/bacends/bacends.1
/cluster/bin/x86_64/pslPairs -tInsert=10000 -minId=0.91 -noBin -min=25000 \
    -max=350000 -slopval=10000 -hardMax=500000 -slop -short -long -orphan \
    -mismatch -verbose ../bacEnds.psl \
    $bacDir/bacEndPairs.txt all_bacends bacEnds
wc -l bacEnds.*
echo 'chr\tstart\tend\tclone\tscore\tstrand\tall\tfeatures\tstarts\tsizes'\
    > ../header
echo '10\t10N\t10N\t10\t10N\t10\t10\t10N\t10\t10' >> ../header
# make the pairs bed file (row keeps rows with score >= 300, sorttbl
# sorts by chrom and start, and headchg -del removes the rdb header)
cat ../header bacEnds.pairs | row score ge 300 | sorttbl chr start \
    | headchg -del > bacEndPairs.bed
# also need to process bacEndSingles.txt into a database table.
# for singles in bacEndSingles.txt, create a dummy file where they
# are given zJA11B12T7 as a dummy sequence pair. If the single is a
# forward sequence, put the dummy sequence in the second column; if the
# single is a reverse sequence, put it in the first column. Use a perl
# script to do this.
cd /cluster/data/danRer3/bed/bacends
set bacDir = /cluster/data/danRer3/bed/bacends/bacends.1
mkdir singles
cd singles
cp /cluster/data/danRer2/bed/ZonLab/bacends/singles/formatSingles.pl .
perl formatSingles.pl $bacDir/bacEndSingles.txt > \
    $bacDir/bacEndSingles.format
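# formatSingles.pl is not reproduced here; schematically it does something
# like the following (a sketch only: it assumes column 1 of
# bacEndSingles.txt holds the read name(s), column 2 the clone name, and
# that forward reads have T7-type or ".z"-type suffixes):
awk 'BEGIN {FS=OFS="\t"} \
    {if ($1 ~ /T7$|\.z/) print $1, "zJA11B12T7", $2; \
     else print "zJA11B12T7", $1, $2;}' bacEndSingles.txt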
# then run pslPairs on this formatted file
/cluster/bin/i386/pslPairs -tInsert=10000 -minId=0.91 -noBin -min=2000 \
    -max=650000 -slopval=10000 -hardMax=800000 -slop -short -long -orphan \
    -mismatch -verbose ../bacEnds.psl $bacDir/bacEndSingles.format \
    all_bacends bacEnds
wc -l bacEnds.*
#     0 bacEnds.long
#     0 bacEnds.mismatch
# 11439 bacEnds.orphan
#     0 bacEnds.pairs
#     0 bacEnds.short
#     0 bacEnds.slop
# there are 11439 orphans here and 229139 from the pair analysis, so
# a total of 240578 orphans
cat bacEnds.orphan ../pairs/bacEnds.orphan > bacEnds.singles
wc -l bacEnds.singles
# 240578 bacEnds.singles
# make the singles bed file
cat ../header bacEnds.singles | row score ge 300 | sorttbl chr start \
    | headchg -del > bacEndSingles.bed
cp bacEndSingles.bed ../pairs
cd ../pairs
# all slop, short, long, mismatch and orphan pairs go into bacEndPairsBad;
# since orphans are already in bacEndSingles, do not add these
cat ../header bacEnds.slop bacEnds.short bacEnds.long bacEnds.mismatch \
    bacEnds.orphan | row score ge 300 | sorttbl chr start \
    | headchg -del > bacEndPairsBad.bed
# add bacEndSingles.bed to bacEnds.load.psl - must not add pair orphans
# twice, so create a bed file, bacEndPairsBadNoOrphans.bed, without orphans
cat ../header bacEnds.slop bacEnds.short bacEnds.long bacEnds.mismatch \
    | row score ge 300 | sorttbl chr start \
    | headchg -del > bacEndPairsBadNoOrphans.bed
# use extractPslLoad later to get all_bacends.psl for the database.
# There are rows where the alignments were the same but the lfNames are
# different. This is due to the presence of multiple reads for the
# same BAC end sequence. Sometimes they are slightly different lengths
# so the alignments are a little different. It would be good to
# consolidate all of these. Firstly, the identical rows were merged into
# one with a list of all the lfNames corresponding to that alignment.
ssh kkstore02
#echo "create database bacsDr3_rah;" | hgsql danRer3
cd /cluster/data/danRer3/bed/bacends/pairs
#hgLoadBed bacsDr3_rah bacEndPairs bacEndPairs.bed \
#    -sqlTable=$HOME/kent/src/hg/lib/bacEndPairs.sql -notItemRgb
# Loaded 163174 elements of size 11
# create a bacEndSingles table like bacEndPairs if not created already
# hgLoadBed bacsDr3_rah bacEndSingles bacEndSingles.bed \
#    -sqlTable=../singles/bacEndSingles.sql -notItemRgb
# Loaded 212775 elements of size 11
# NOTE - this track isn't pushed to the RR, just used for assembly QA
# Use bacEndPairsBadNoOrphans.bed as orphans are in the singles bed file
# hgLoadBed bacsDr3_rah bacEndPairsBad bacEndPairsBadNoOrphans.bed \
#    -sqlTable=$HOME/kent/src/hg/lib/bacEndPairsBad.sql -notItemRgb
# Loaded 15169 elements of size 11
# Need to consolidate similar rows for bacEndPairs and bacEndSingles - same
# name, different lfNames and same alignments.
mkdir -p /cluster/data/danRer3/bed/bacends/duplicates
cd /cluster/data/danRer3/bed/bacends/duplicates
mkdir -p /cluster/bluearc/danRer3/bacends/duplicates/overlapRun
cd /cluster/data/danRer3/bed/bacends/duplicates
ln -s /cluster/bluearc/danRer3/bacends/duplicates/overlapRun
# write a program to do this for linked feature series (lfs), which
# is the type of data structure used for BAC ends.
# Need a bed file sorted by chrom and chromStart
cd overlapRun
foreach f (bacEndPairs bacEndPairsBadNoOrphans bacEndSingles)
    sort -k1,2 /cluster/data/danRer3/bed/bacends/pairs/${f}.bed > ${f}.lfs
end
wc -l *.lfs
#  15169 bacEndPairsBadNoOrphans.lfs
# 163174 bacEndPairs.lfs
# 212775 bacEndSingles.lfs
# remove replicate rows where names match and the overlapping region
# (chromEnd - chromStart) is greater than or equal to 0.999.
ssh kolossus
cd /cluster/data/danRer3/bed/bacends/duplicates/overlapRun
foreach f (bacEndPairs bacEndPairsBadNoOrphans bacEndSingles)
    echo "Processing $f"
    nohup nice /cluster/home/hartera/bin/i386/lfsOverlap ${f}.lfs \
        ${f}.bed -name -minOverlap=1.0 -notBlocks
end
# Started: Tue Sep 27 21:51  Finished: Sep 28 06:29
ssh kkstore02
cd /cluster/data/danRer3/bed/bacends/duplicates/overlapRun
# check the numbers of lines are correct
foreach f (bacEndPairs bacEndPairsBadNoOrphans bacEndSingles)
    awk 'BEGIN {OFS="\t"} {print $1,$2,$3,$4,$5}' ${f}.lfs \
        | sort | uniq -c | sort -nr > ${f}.uniqCount
end
wc -l *
# 163116 bacEndPairs.bed
# 163174 bacEndPairs.lfs
# 163116 bacEndPairs.uniqCount
#  15163 bacEndPairsBad.bed
#  15169 bacEndPairsBad.lfs
#  15163 bacEndPairsBad.uniqCount
# 212754 bacEndSingles.bed
# 212775 bacEndSingles.lfs
# 212754 bacEndSingles.uniqCount
# the number of lines after uniqueing by coords, name and score is the
# same as that after using lfsOverlap to remove these lines, so correct.
cd /cluster/data/danRer3/bed/bacends/duplicates
mv ./overlapRun/* .
rm -r overlapRun /cluster/bluearc/danRer3/bacends/duplicates/overlapRun
# Use a perl script to choose 2 BAC ends to represent each BAC clone.
# Since there is often more than one read for each BAC end in this set,
# 2 were chosen for each BAC pair, or 1 for the singles. This was based
# on the ones that had the largest region aligned (using lfSizes).
# copy over the perl script that was used for danRer2
cp /cluster/data/danRer2/bed/ZonLab/bacends/duplicates/pickLfNames.pl \
    pickLfNamesv2.pl
# edit it so that the regular expression for matching BAC end names is
# the same as that used in ../bacends.1/getBacEndInfov2.pl
# need to sort by chrom, chromStart
foreach f (bacEndPairs bacEndPairsBadNoOrphans bacEndSingles)
    sort -k1 -k2 -k3 ${f}.bed > ${f}Sort.bed
end
# run the perl script: input bed file, pairs or singles, name of output file
perl pickLfNamesv2.pl bacEndPairsSort.bed pairs pairs2lfNames.bed
mv error.log log.pairs
# log.pairs lists the 18 cases where alignments for a BAC clone use
# a different pair of sequence reads for the ends than the previous
# alignment for the ends of that BAC clone. These were all checked and,
# in each case, the extra alignments are almost identical or overlap for
# the most part, so it does not matter if the extra alignments are
# removed.
# run the script for singles:
perl pickLfNamesv2.pl bacEndSinglesSort.bed singles singles1lfName.bed
mv error.log log.singles
# log.singles has 34 cases where alignments for a BAC clone use
# different sequence reads for either the T7 or SP6 BAC end.
# singles may include both BAC ends for a clone in the case
# where they aligned to different chromosomes or a long way apart on
# the same chromosome (orphans). Mostly, those that have a different read
# align to an almost identical or largely overlapping region.
# Some sequences appear to be different: CH211-98J20 - zC98J20.yb and
# zC98J20.ya do not align to each other. DKEYP-107B4 - zKp107B4.ya looks
# like it has low complexity sequence; this is discarded and zKp107B4.yb
# is kept. zKp107B4.za and zKp107B4.zb only align in the first ~59 bp;
# zKp107B4.zb is kept in this case. DKEYP-114B4 - zKp114B4.za: 15-61 bp
# on zKp114B4.za align to 11-58 bp on zKp114B4.zb. zKp114B4.za is kept.
# In these cases, the 2 sequences align to different regions.
perl pickLfNamesv2.pl bacEndPairsBadNoOrphansSort.bed pairs \
    badPairs2lfNames.bed
mv error.log log.badPairs
# only 3 alignments have a different pair of ends to other alignments,
# but the alignment region is almost the same in each case.
# for each of these new bed files, checks were made that there are
# only 2 BAC ends per alignment for pairs and 1 for singles.
# For each pair, there should only be 2 ends, which can appear either
# way round depending on the orientation, and there should be 1 end for
# the beginning (suffix T7, t7 or z) and one end for the end
# (suffix SP6, sp6 or y) of each BAC clone. These can appear as e.g.
# either zK7B23T7,zK7B23SP6 or zK7B23SP6,zK7B23T7 for the opposite
# orientation. For singles, there should be a single BAC end for each
# alignment and, for each BAC clone, a sequence for either or both types
# of ends may appear, e.g. zK153P14SP6 and zK153P14T7 appear in separate
# alignments.
# Finally, overlaps in BAC clone names were checked. All BAC clones
# represented in each of the pairs, badPairs and singles bed files are
# unique to that file. Between all three bed files, 300323 BAC clones
# have alignments. 512886 clone ends are aligned in these three bed files.
# NOTE: using sort and uniq on hgwdev produces tab-delimited output.
# After merging rows with the same BAC name, the scoring is now
# wrong in the bed files.
# Scores should be 1000 if there is 1 row for that name, else
# 1500/(number of rows for that sequence name) - as calculated by
# pslPairs. Correct the scores.
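# correctScores2.pl is not shown here; the scoring rule amounts to
# something like this sketch (where the hits file holds "count name"
# lines from uniq -c, as built below, and the output name is illustrative):
awk 'NR == FNR {n[$2] = $1; next} \
     {$5 = (n[$4] == 1) ? 1000 : 1500 / n[$4]; print}' \
    OFS="\t" pairs.hits pairs2lfNames.bed > pairsRescored.bed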
mkdir -p /cluster/data/danRer3/bed/bacends/scores
cd /cluster/data/danRer3/bed/bacends/scores
# copy over correctScores2.pl and checkScores.pl scripts from danRer2 and
# edit both scripts so that the hits file is split on space, not on tabs
cp /cluster/data/danRer2/bed/ZonLab/bacends/scores/correctScores2.pl .
cp /cluster/data/danRer2/bed/ZonLab/bacends/scores/checkScores.pl .
awk '{print $4}' ../duplicates/pairs2lfNames.bed \
    | sort | uniq -c > pairs.hits
perl correctScores2.pl ../duplicates/pairs2lfNames.bed pairs.hits noBin \
    > bacEndPairsGoodScores.bed
# same for singles
awk '{print $4}' ../duplicates/singles1lfName.bed \
    | sort | uniq -c > singles.hits
perl correctScores2.pl ../duplicates/singles1lfName.bed singles.hits \
    noBin > bacEndSinglesGoodScores.bed
# and for badPairs
awk '{print $4}' ../duplicates/badPairs2lfNames.bed \
    | sort | uniq -c > badPairs.hits
perl correctScores2.pl ../duplicates/badPairs2lfNames.bed badPairs.hits \
    noBin > bacEndPairsBadGoodScores.bed
# check that the scores are now correct
awk '{print $4, $5}' bacEndPairsGoodScores.bed \
    | sort | uniq -c > pairs.count
perl checkScores.pl < pairs.count
# all the BAC clones should be in good.txt and none in bad.txt
# wc -l should give same number of lines in good.txt as in pairs.hits
# repeat for other bed files
awk '{print $4, $5}' bacEndPairsBadGoodScores.bed \
    | sort | uniq -c > badPairs.count
perl checkScores.pl < badPairs.count
awk '{print $4, $5}' bacEndSinglesGoodScores.bed \
    | sort | uniq -c > singles.count
perl checkScores.pl < singles.count
# for the singles, 6 ended up in bad.txt because their scores
# were 214.285714285714, which is correct for 7 alignments. Rounding the
# score caused the discrepancy.
ssh hgwdev
cd /cluster/data/danRer3/bed/bacends/scores
# copy over table definition from danRer2
cp /cluster/data/danRer2/bed/ZonLab/bacends/singles/bacEndSingles.sql \
    ../singles/
# Now load database tables:
hgLoadBed danRer3 bacEndPairs bacEndPairsGoodScores.bed \
    -sqlTable=$HOME/kent/src/hg/lib/bacEndPairs.sql -notItemRgb
# Loaded 163098 elements of size 11
hgLoadBed danRer3 bacEndSingles bacEndSinglesGoodScores.bed \
    -sqlTable=../singles/bacEndSingles.sql -notItemRgb
# Loaded 212720 elements of size 11
# 212720 record(s), 0 row(s) skipped, 50 warning(s) loading bed.tab
# warnings are unknown, but all of the bed file loaded and the number
# of warnings is small so ignore
hgLoadBed danRer3 bacEndPairsBad bacEndPairsBadGoodScores.bed \
    -sqlTable=$HOME/kent/src/hg/lib/bacEndPairsBad.sql -notItemRgb
# Loaded 15160 elements of size 11
# load BAC end sequences into seq table so alignments may be viewed
# symlink to the Zv5BACends.fa BAC end sequences (as was done for danRer1)
mkdir -p /gbdb/danRer3/bacends
ln -s /cluster/data/danRer3/bed/bacends/bacSeqs/Zv5BACends.fa \
    /gbdb/danRer3/bacends/Zv5BACends.fa
hgLoadSeq danRer3 /gbdb/danRer3/bacends/Zv5BACends.fa
# create file for loading all_bacends table
ssh kkstore02
cd /cluster/data/danRer3/bed/bacends/scores
# for all_bacends table, just load the alignments for those sequences
# represented in the bacEndPairs, bacEndSingles and bacEndPairsBad tables
# bacEnds.load.psl is the file of alignments
# get all the names of sequences
foreach f (*GoodScores.bed)
    echo $f
    awk '{print $11;}' $f >> allBacEnds.names
end
wc -l allBacEnds.names
# 390978 allBacEnds.names
# this is the total number of lines in the *GoodScores.bed files
perl -pi.bak -e 's/,/\n/g' allBacEnds.names
sort allBacEnds.names | uniq > allBacEnds.names.uniq
wc -l allBacEnds.names.uniq
# 512886 allBacEnds.names.uniq
# get alignments for just the BAC ends that are in the database tables
# make bacEnds.load.psl
cd /cluster/data/danRer3/bed/bacends/scores
extractPslLoad -noBin ../bacEnds.psl bacEndPairsGoodScores.bed \
    bacEndPairsBadGoodScores.bed bacEndSinglesGoodScores.bed | \
    sorttbl tname tstart | headchg -del > bacEnds.load.psl
# check that alignments are present for all BAC ends in
# allBacEnds.names.uniq
awk '{print $10}' bacEnds.load.psl | sort | uniq > bacEnds.names
comm -12 bacEnds.names allBacEnds.names.uniq | wc -l
# 512886
ssh hgwdev
cd /cluster/data/danRer3/bed/bacends/scores
# load all_bacends table
hgLoadPsl danRer3 -table=all_bacends bacEnds.load.psl
# load of all_bacends did not go as planned: 7584708 record(s),
# 0 row(s) skipped, 526 warning(s) loading psl.tab
# (hartera, 2006-04-19)
# Display is very slow for BAC ends on large regions. Try splitting
# all_bacends by chromosome.
ssh hgwdev
mkdir /cluster/data/danRer3/bed/bacends/all_bacends
cd /cluster/data/danRer3/bed/bacends/all_bacends
foreach c (`cat /cluster/data/danRer3/chrom.lst`)
    echo "Processing $c ..."
    awk '{if ($14 == "'chr${c}'") print;}' \
        /cluster/data/danRer3/bed/bacends/scores/bacEnds.load.psl \
        > chr${c}.bacEnds.load.psl
end
# rename old table
hgsql -e 'alter table all_bacends rename allBacendsOld;' danRer3
# load new tables
foreach c (`cat /cluster/data/danRer3/chrom.lst`)
    hgLoadPsl danRer3 -table=chr${c}_all_bacends chr${c}.bacEnds.load.psl
end
# There are still warnings on loading, most (510) are for chrUn.
# This improves the performance a lot.
# The chrom-parsing code is confused by the double underscores in the
# chrN_all_bacends tables so change the names to chrN_allBacends
foreach c (`cat /cluster/data/danRer3/chrom.lst`)
    hgsql -e "alter table chr${c}_all_bacends rename chr${c}_allBacends;" \
        danRer3
end
# Then add the correct table name to each of the bacEnd* tables
foreach t (bacEndPairs bacEndPairsBad bacEndSingles)
    hgsql -e "update $t set pslTable = 'allBacends';" danRer3
end
# corrected termRegex for some bacCloneXRef searches in trackDb.ra so
# that they work correctly (bacPairsIntName, bacSinglesIntName,
# bacPairsSangerSts and bacSinglesSangerSts). (2006-04-19, hartera)

# CREATE BAC CLONES ALIAS AND CROSS-REFERENCE TABLES
# (bacEndAlias, bacCloneAlias and bacCloneXRef) (DONE, 2005-10-06, hartera)
# RECREATE TABLES AFTER REMAKING THE SINGLES AND PAIRS TABLES
# (see REDO BACENDS SECTION) (DONE, 2006-06-08, hartera)
# REPLICATE ROWS IN TABLES SO REMOVE AND RELOAD (DONE, 2006-08-04, hartera)
# Process data and create bacEndAlias table
ssh kkstore02
cd /cluster/data/danRer3/bed/bacends/bacends.1
# make bacEndAlias table with Genbank accessions for ends
# need to run getBacEndInfo.pl for the BAC end names in the
# BAC tables.
# in the pairs directory, there is the allBacEnds.names.uniq file
# so use this.
# Already made the bacEndAccs.aliases file with getBacEndInfov2.pl
# This has none of the BAC ends whose names end in ASP6 or AT7, as
# these are all from the CHORI73 library and they do not have BAC end
# accessions in Genbank at the moment. This contains accessions for
# all BAC ends, even those without alignments.
hgsql danRer3 < $HOME/kent/src/hg/lib/bacEndAlias.sql
echo "load data local infile 'bacEndAccs.aliases' into table \
    bacEndAlias" | hgsql danRer3
ssh kkstore02
# get the latest versions of the clonemarkers, contig names and markers
# files from Sanger
mkdir -p /cluster/data/danRer3/bed/bacends/cloneandStsAliases
cd /cluster/data/danRer3/bed/bacends/cloneandStsAliases
wget --timestamp \
    ftp://ftp.sanger.ac.uk/pub/human/zebrafish/ZFIN/README
wget --timestamp \
    ftp://ftp.sanger.ac.uk/pub/human/zebrafish/ZFIN/clonemarkers.27.07.05.txt
wget --timestamp \
    ftp://ftp.sanger.ac.uk/pub/human/zebrafish/ZFIN/ctgnames.27.07.05.txt
wget --timestamp \
    ftp://ftp.sanger.ac.uk/pub/human/zebrafish/ZFIN/markers.27.07.05.txt
wc -l *27.07.05.txt
#   29885 clonemarkers.27.07.05.txt
#  167858 ctgnames.27.07.05.txt
#   12250 markers.27.07.05.txt
# Recreate tables as bacEndPairs, bacEndSingles, bacEndPairsBad and
# chrN_allBacends tables have changed (2006-06-08, hartera)
# get list of BAC end names, lfNames
cp /cluster/data/danRer3/bed/bacends/scoresAndCoords/allBacEnds.names.uniq .
# get list of BAC clone names
foreach f (bacEndPairs bacEndPairsBad bacEndSingles)
    awk '{print $4}' \
      /cluster/data/danRer3/bed/bacends/scoresAndCoords/${f}GoodScores.bed \
      >> bacs.names
end
sort -u bacs.names > bacs.names.uniq
wc -l *.uniq
#  512321 allBacEnds.names.uniq
#  300290 bacs.names.uniq
# from psl file
awk '{print $10;}' ../bacEnds.psl > bacEndsPsl.names
# edit to remove first few lines with no names
sort bacEndsPsl.names | uniq > bacEndsPsl.names.uniq
wc -l bacEndsPsl.names.uniq
#  545920 bacEndsPsl.names.uniq
# this is all the BAC ends that originally had alignments
# Add an alias table for BAC clones
# bacCloneAlias.sql is in $HOME/kent/src/hg/lib - see makeDanRer1.doc
# Add an xref table to give external clone registry names, internal names,
# sanger name, relationship between STS and BAC clone (method of finding
# STS), UniSTS ID, chromosome(s) to which the BAC clone is mapped by BLAT,
# Genbank accession and STS primer sequences
# bacCloneXRef.sql is in $HOME/kent/src/hg/lib - see makeDanRer1.doc
set dir=/cluster/data/danRer3/bed/bacends/
awk 'BEGIN {OFS="\t"}{print $4, $1}' \
    $dir/scoresAndCoords/bacEndPairsGoodScores.bed > bacClones.namesandchrom
awk 'BEGIN {OFS="\t"}{print $4, $1}' \
    $dir/scoresAndCoords/bacEndSinglesGoodScores.bed >> bacClones.namesandchrom
sort bacClones.namesandchrom | uniq > bacClones.namesandchrom.uniq
# use a list of internal names, Genbank accessions, and BAC clone names
# use BACClonesIdsandAccs.txt.
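# Sanity check (illustrative, not part of the recorded run): the
# name/chrom list fed to zfishBacClonesandSts should be strictly two
# tab-separated columns:
awk -F'\t' 'NF != 2' bacClones.namesandchrom.uniq | wc -l
# expect 0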
# get list of UniSTS IDs using aliases to search alias file
# print Sanger name, alias and UniSTS ID, use find_markers3.pl
cat << '_EOF_' > find_markers3.pl
# example:
# perl find_markers3.pl UniSTS.aliases markers.02.12.04.txt
use strict;
my $verbose = 0;
my ($a, $b, $f, $m, $s, $t, $aliases, @alias, @rest);
my $aliasFile = $ARGV[0];
my $markersFile = $ARGV[1];
open(ALIAS, $aliasFile) || die "Can not open $aliasFile\n";
open(MARKERS, $markersFile) || die "Can not open $markersFile\n";
# store aliases from aliasFile
my ($id, $al, @alsArray, %aliasHash);
while (<ALIAS>) {
    chomp;
    ($id, $al) = split /\t/;
    @alsArray = split(/;/, $al);
    foreach my $as (@alsArray) {
        push (@{$aliasHash{$as} }, $id);
    }
}
close ALIAS;
while (<MARKERS>) {
    my @idArray;
    ($f, $t, $m, $idArray[0]) = 0;
    my @ids;
    chomp;
    ($a, $b, $aliases, @rest) = split /\|/;
    if ($verbose > 3) {
        printf "aliases $aliases \n";
    }
    @alias = split /;/, $aliases;
    ALIAS: foreach $s (@alias) {
        if ($s =~ /[\D]+/) {
            if ($verbose > 5) {
                printf "this $s \n";
            }
            if (exists($aliasHash{$s})) {
                @idArray = @{$aliasHash{$s}};
            }
            if ($idArray[0]) {
                $f = 1;
                $t = $s;
                @ids = @idArray;
                if ($verbose) {
                    printf "this $s found $m \n";
                }
                last ALIAS;
            }
        }
    }
    if ($f) {
        my @sNames = split(/;/, $b);
        foreach my $sn (@sNames) {
            foreach my $i (@ids) {
                printf "$sn\t$i\n";
            }
        }
    }
}
close MARKERS;
'_EOF_'
# << this line makes emacs coloring happy
chmod +x find_markers3.pl
perl find_markers3.pl /cluster/data/ncbi/UniSTS.2005-09-29/UniSTS.aliases \
    markers.27.07.05.txt > sangerandUniSTSId.txt
# No need to reformat this for zfishBacClonesandSts
# FPC contig information (i.e. FPC contig number) from the ctgnames file is
# not included in the tables as these are dynamic and constantly
# changing with the assembly.
# FILE OF BAC CLONE ACCESSIONS
# http://www.ncbi.nlm.nih.gov/genome/clone/DATA/clac.out
# copy over file of BAC internal names, accessions and external names
cp /cluster/data/danRer3/bed/bacends/bacends.1/BACClonesIdsandAccs.txt .
# use zfishBacClonesandSts to create tab files for loading into
# bacCloneAlias and bacCloneXRef tables
# make output directory
rm -r /cluster/bluearc/danRer3/bacEnds/out
mkdir -p /cluster/bluearc/danRer3/bacEnds/out
# edit zfishBacClonesandSts.c to add prefixes for CHORI73 library:
# CHORI73_ for internal name, CH73- for external name
# in ctgnames.27.07.05.txt and clonemarkers.27.07.05.txt
perl -pi.bak -e 's/zH([0-9]+)/CHORI73_$1/' *.27.07.05.txt
mv ctgnames.27.07.05.txt.bak ctgnames.27.07.05.orig
mv clonemarkers.27.07.05.txt.bak clonemarkers.27.07.05.txt.orig
# no change to markers file so remove .bak file
rm markers.27.07.05.txt.bak
nice $HOME/bin/x86_64/zfishBacClonesandSts ctgnames.27.07.05.txt \
    clonemarkers.27.07.05.txt markers.27.07.05.txt \
    bacClones.namesandchrom.uniq BACClonesIdsandAccs.txt \
    sangerandUniSTSId.txt ./out > ./out/zfishBacs.out &
# output is in /cluster/bluearc/danRer3/bacends/out so copy over
# sort alias tab file by sangerName
sort -k2 ./out/bacAlias.tab > bacAlias.sort.tab
cp ./out/bacXRef.tab .
wc -l *.tab
#  110961 bacAlias.sort.tab
#  540800 bacXRef.tab
ssh hgwdev
cd /cluster/data/danRer3/bed/bacends/cloneandStsAliases
hgsql -e 'drop table bacCloneAlias;' danRer3
hgsql -e 'drop table bacCloneXRef;' danRer3
hgLoadSqlTab danRer3 bacCloneAlias \
    $HOME/kent/src/hg/lib/bacCloneAlias.sql bacAlias.sort.tab
hgLoadSqlTab danRer3 bacCloneXRef \
    $HOME/kent/src/hg/lib/bacCloneXRef.sql bacXRef.tab
# edit trackDb.ra to add bacEnds tracks and searches for the bacEndPairs
# and bacEndSingles tracks as for danRer1. Copy over html from danRer2
# for the bacEndPairs and bacEndSingles tracks.
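# A quick spot-check that the loaded tables look sane (illustrative query,
# not part of the recorded run; DKEY-81G7 is one of the clones discussed
# in the tests below):
hgsql -e "select name, intName, sangerName, chroms from bacCloneXRef \
    where name = 'DKEY-81G7';" danRer3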
# Replicate rows in tables, so remove these and reload
# (hartera, 2006-08-04)
ssh hgwdev
cd /cluster/data/danRer3/bed/bacends/cloneandStsAliases
sort bacAlias.sort.tab | uniq | sort -k2 > bacAlias.sort.tab.uniq
sort bacXRef.tab | uniq > bacXRef.tab.uniq
wc -l *.tab.uniq
#   57656 bacAlias.sort.tab.uniq
#  356453 bacXRef.tab.uniq
# Drop old tables and reload:
hgsql -e 'drop table bacCloneAlias;' danRer3
hgsql -e 'drop table bacCloneXRef;' danRer3
hgLoadSqlTab danRer3 bacCloneAlias \
    $HOME/kent/src/hg/lib/bacCloneAlias.sql bacAlias.sort.tab.uniq
hgLoadSqlTab danRer3 bacCloneXRef \
    $HOME/kent/src/hg/lib/bacCloneXRef.sql bacXRef.tab.uniq

# BACENDS: TESTING OF bacCloneAlias AND bacCloneXRef TABLES
# (DONE, 2005-10-06, hartera)
# REDONE AFTER REMAKING bacCloneAlias AND bacCloneXRef TABLES - both ok.
# (DONE, 2006-06-12, hartera)
# REDONE AFTER REMAKING bacCloneAlias AND bacCloneXRef TABLES
# (DONE, 2006-08-04, hartera)
# The following tests were carried out to check that all the data
# in the bacCloneAlias and bacCloneXRef tables is correct.
ssh hgwdev
cd /cluster/data/danRer3/bed/bacends/cloneandStsAliases
cp ./testTablesNew/*.pl .
rm -r testTablesNew
mkdir -p testTablesNew
cd testTablesNew
# Check that the correct aliases are associated with their Sanger STS names
awk 'BEGIN {FS="|"} {OFS="\t"} {print $2, $3;}' \
    ../markers.27.07.05.txt > sNameandaliases
# write script to get one Sanger name and one alias on each line
# (a sketch of an equivalent appears after this check)
cp ../*.pl .
perl getSangerAndAlias.pl < sNameandaliases > sNameandaliases.format
sort sNameandaliases.format | uniq > sNameandaliases.sort
# get Sanger names and aliases from database
hgsql -N -e 'select sangerName, alias from bacCloneAlias;' danRer3 \
    | sort | uniq > alias.db.sort
wc -l alias.db.sort
#   57656 alias.db.sort
diff sNameandaliases.sort alias.db.sort
# No difference between data file and data from database so ok
# Check Sanger STS names correspond in bacCloneAlias and bacCloneXRef tables
# get Sanger names from alias table
hgsql -N -e 'select sangerName from bacCloneAlias;' danRer3 \
    | sort | uniq > sName.alias.sort
wc -l sName.alias.sort
#   15309 sName.alias.sort
# get Sanger names from xRef table
hgsql -N -e 'select sangerName from bacCloneXRef where sangerName \
    is not null;' danRer3 | sort | uniq > sName.xRef.sort
wc -l sName.xRef.sort
#   15522 sName.xRef.sort
comm -23 sName.alias.sort sName.xRef.sort
# nothing unique to the alias table, so all Sanger names in the alias
# table are also in the xRef table
comm -13 sName.alias.sort sName.xRef.sort > sNamexRefNotAlias
wc -l sNamexRefNotAlias
#   213 sNamexRefNotAlias
awk 'BEGIN {FS="|"}{print $2}' ../clonemarkers.27.07.05.txt | sort | uniq \
    > clonemarkers.sNames.sort
# get Sanger names from markers file
awk 'BEGIN {FS="|"}{print $2}' ../markers.27.07.05.txt > markers.sNames
# remove semi-colons and sort
sed -e 's/;/\n/g' markers.sNames | sort | uniq > markers.sNames.sort
# sanger names unique to markers file
comm -13 clonemarkers.sNames.sort markers.sNames.sort
# there are none
comm -23 clonemarkers.sNames.sort markers.sNames.sort \
    > sNames.clonemarkersOnly
wc -l sNames.clonemarkersOnly
#   213 sNames.clonemarkersOnly
diff sNames.clonemarkersOnly sNamexRefNotAlias
# No difference, so all the extra Sanger names in the xRef
# table are from the clonemarkers file; these have no aliases in
# the markers file, so they are not in the alias table, and this is all ok.
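# getSangerAndAlias.pl itself is not recorded in this doc. An awk sketch
# of the reformatting it performs (one Sanger name and one alias per
# line), assuming both columns can be semicolon-separated lists as in the
# markers file; the real script may differ in edge cases:
awk 'BEGIN {FS="\t"; OFS="\t"} \
    {ns = split($1, sn, ";"); na = split($2, al, ";"); \
     for (i = 1; i <= ns; i++) \
         for (j = 1; j <= na; j++) print sn[i], al[j];}' \
    sNameandaliases > sNameandaliases.format.sketch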
# Check that Sanger STS names and primers are associated correctly
cd /cluster/data/danRer3/bed/bacends/cloneandStsAliases/testTablesNew
# get sanger names and primers from markers file
awk 'BEGIN {FS="|"} {OFS="\t"} {print $2, $4, $5;}' \
    ../markers.27.07.05.txt > sNameandPrimers
# use script to reformat and write with one Sanger name per line
chmod +x getSangerandPrimers.pl
perl getSangerandPrimers.pl < sNameandPrimers > sNameandPrimers.format
sort sNameandPrimers.format > sNameandPrimers.format.sort
wc -l sNameandPrim*
#   12250 sNameandPrimers
#   15309 sNameandPrimers.format
#   15309 sNameandPrimers.format.sort
# get Sanger names and primers from database
hgsql -N -e \
    'select sangerName, leftPrimer, rightPrimer from bacCloneXRef \
     where sangerName is not null and leftPrimer is not null and \
     rightPrimer is not null;' danRer3 | sort | uniq \
    > sNamesandprimers.fromdb.sort
wc -l sNamesandprimers.fromdb.sort
#   15309 sNamesandprimers.fromdb.sort
diff sNamesandprimers.fromdb.sort sNameandPrimers.format.sort
# No difference so ok.
# Check that UniSTS IDs and Sanger STS names are associated correctly
# get Sanger names and UniSTS IDs from the database
hgsql -N -e 'select sangerName, uniStsId from bacCloneXRef where \
    uniStsId is not null;' danRer3 | sort | uniq > sNameUniSTS.fromdb.sort
wc -l sNameUniSTS.fromdb.sort
#   5634 sNameUniSTS.fromdb.sort
# Need to reformat sNameUniSTS.fromdb.sort
chmod +x formatUniSts.pl
perl formatUniSts.pl < sNameUniSTS.fromdb.sort | sort \
    > sNameUniSTS.fromdb.format.sort
# get Sanger names from data file and see how many UniSTS IDs there are
# for each name
awk '{print $1}' ../sangerandUniSTSId.txt | sort | uniq -c | sort -nr \
    > sangerandUniSTSId.count
# the most is 3
#   3 etID9786.21
#   3 etID9056.23
#   3 etID9042.2
#   3 etID8627.2
#   3 etID8281.9
#   3 etID11096.5
sort ../sangerandUniSTSId.txt > sangerandUniSTSId.txt.sort
diff sangerandUniSTSId.txt.sort sNameUniSTS.fromdb.format.sort \
    > sangerandUniSTSIdvsdb
# No difference between data from original file and that in database so ok
# Check that chrom mappings and external BAC clone names are correct
# get extNames and chroms they map to from the database
hgsql -N -e 'select name, chroms from bacCloneXRef where \
    chroms is not null;' danRer3 | sort | uniq \
    > nameandchromsfromdb.sort
# reformat nameandchromsfromdb.sort
perl formatUniSts.pl < nameandchromsfromdb.sort | sort \
    > nameandchromsfromdb.format.sort
# compare extNames and chroms from db to those in data file
cp ../bacClones.namesandchrom .
sort -u bacClones.namesandchrom > bacClones.namesandchrom.uniq
diff bacClones.namesandchrom.uniq nameandchromsfromdb.format.sort
# no difference - all ok
# Check Genbank accessions and internal BAC clone names
hgsql -N -e 'select intName,genbank from bacCloneXRef where \
    genbank is not null;' danRer3 | sort | uniq \
    > intNamesandAccs.fromdb.sort
# this should be a subset of zfish_accsMerged.txt - not all BAC clones
# listed here appear in either our BAC ends tracks or the markers files.
awk 'BEGIN {OFS="\t"} {print $1,$2}' ../BACClonesIdsandAccs.txt \
    | sort -u > BACClonesIntandAccs.sort
comm -23 intNamesandAccs.fromdb.sort BACClonesIntandAccs.sort
# there is nothing in the database that is not in BACClonesIntandAccs.sort
comm -13 intNamesandAccs.fromdb.sort BACClonesIntandAccs.sort \
    > onlyinzfishAccs
wc -l onlyinzfishAccs
#   86 onlyinzfishAccs
hgsql -N -e 'select intName from bacCloneXRef where genbank is null;' \
    danRer3 | sort | uniq > intNamesNoAcc.fromdb.sort
awk '{print $1;}' BACClonesIntandAccs.sort > intNames.withAccs.sort
comm -12 intNamesNoAcc.fromdb.sort intNames.withAccs.sort \
    > indbNoAccsandAccs.out
# none of these names are common to both, so all accessions from
# BACClonesIdsandAccs.txt are in the database for the internal names
# stored, where accessions are available.
# Test Sanger STS names, internal names and external names are all correct
# Test Sanger STS name and internal BAC clone names are associated correctly
# get internal names and Sanger names from data file
awk 'BEGIN {FS="|"} {OFS="\t"} {print $1,$2}' ../clonemarkers.27.07.05.txt \
    | sort | uniq > intNameandSanger.sort
hgsql -N -e 'select intName, sangerName from bacCloneXRef \
    where sangerName is not null;' danRer3 \
    | sort | uniq > intNameandSanger.fromdb.sort
diff intNameandSanger.sort intNameandSanger.fromdb.sort
# No difference between data from file and that from database so ok
# Check BAC clone internal name and relationship fields
# get internal names and relationships from data file
awk 'BEGIN {FS="|"} {OFS="\t"} {print $1,$3}' ../clonemarkers.27.07.05.txt \
    | sort | uniq > intNameandRelation.sort
# get internal names and relationships from database
hgsql -N -e 'select intName, relationship from bacCloneXRef \
    where relationship != 0;' danRer3 \
    | sort | uniq > intNameandrelation.fromdb.sort
# differences unique to database file
comm -13 intNameandRelation.sort intNameandrelation.fromdb.sort \
    > intNameRelation.indbonly
# differences unique to data file
comm -23 intNameandRelation.sort intNameandrelation.fromdb.sort \
    > intNameRelation.incloneMarkersonly
wc -l intNameRelation*
#   4650 intNameRelation.incloneMarkersonly
#   4650 intNameRelation.indbonly
awk '{print $1}' intNameRelation.indbonly > intNameRelation.indbonly.names
awk '{print $1}' intNameRelation.incloneMarkersonly \
    > intNameRelation.incloneMarkersonly.names
diff intNameRelation.indbonly.names intNameRelation.incloneMarkersonly.names
# there is no difference in the internal names with relationship fields;
# the only place these should differ is that the second column should all
# be 3 in the data from the database only. This is because all the
# relationship entries that were blank in the clonemarkers file were
# changed to 3 when entered into the database.
awk '{print $2}' intNameRelation.indbonly | sort | uniq
# 3 - correct so all ok
# all the differences should be that those that are blank in clonemarkers
# are 3 in the database.
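# For reference, the loading convention verified above - a blank
# relationship field in clonemarkers becomes 3 in bacCloneXRef - can be
# expressed as a one-liner (illustrative, not part of the recorded run):
awk 'BEGIN {FS="|"; OFS="\t"} {rel = ($3 == "") ? 3 : $3; print $1, rel;}' \
    ../clonemarkers.27.07.05.txt | head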
# check that those that have 0 in the database bacCloneXRef relationship
# field are not in the list from cloneMarkers
# select these internal names with 0 relationship from the database
hgsql -N -e 'select intName from bacCloneXRef where relationship = 0;' \
    danRer3 | sort | uniq > intNameNoRelation.fromdb.sort
# get all the internal names from the data file
awk 'BEGIN {FS="|"} {print $1}' ../clonemarkers.27.07.05.txt \
    | sort | uniq > intNamefromCloneMarkers.sort
comm -12 intNameNoRelation.fromdb.sort intNamefromCloneMarkers.sort
# nothing in common between these two files as expected, so there are
# no internal names in the db with 0 in the relationship field that
# appear in the clonemarkers file.
# Check all BAC clone internal names and external names from the
# ctgnames file are in the database
# get intName and extName from ctgnames file
awk 'BEGIN {FS="|"} {OFS="\t"} {print $2,$3}' ../ctgnames.27.07.05.txt \
    | sort | uniq > intNameandextNamefromCtgNames.sort
# get intName and extName from database
hgsql -N -e 'select intName,name from bacCloneXRef;' danRer3 \
    | sort | uniq > intNameandextName.fromdb.sort
wc -l intNameandextName*
#  340039 intNameandextName.fromdb.sort
#  167858 intNameandextNamefromCtgNames.sort
comm -12 intNameandextName.fromdb.sort intNameandextNamefromCtgNames.sort \
    > intandextindbAndCtgNames
wc -l intandextindbAndCtgNames
#  167858 intandextindbAndCtgNames
# there are 167858 name pairs common between the file and the database,
# and this is the same number of name pairs as in the data file
diff intandextindbAndCtgNames intNameandextNamefromCtgNames.sort
# no difference between those name pairs from the data file and those that
# are common between the data file and the database, so all internal and
# external names from the ctgNames file are in the database
# get the list of extra ones from db
comm -23 intNameandextName.fromdb.sort intNameandextNamefromCtgNames.sort \
    > intandextNamesindbNotinCtgNames
wc -l intandextNamesindbNotinCtgNames
#  172181 intandextNamesindbNotinCtgNames
# get list of internal names from the clonemarkers file
awk 'BEGIN {FS="|"} {print $1}' ../clonemarkers.27.07.05.txt | sort | uniq \
    > clonemarkers.intName.sort
wc -l clonemarkers.intName.sort
#   13471 clonemarkers.intName.sort
# compare these intNames to those from the database not in the ctgnames file
comm -12 clonemarkers.intName.sort intandextNamesindbNotinCtgNames
# none of these clonemarkers internal names are in this list, so they
# must all be in the ctgnames file too. These extra internal names will be
# translations of external names found in the list of mappings of BAC
# clones to chroms.
# Check that all the BAC clone external names from the list of chromosome
# mappings and from the ctgnames file are in the database.
# get all extNames from bacClones.namesandchrom.uniq and from ctgnames
awk '{print $1}' ../bacClones.namesandchrom.uniq > \
    extNames.ctgnamesandbacClones
awk 'BEGIN {FS="|"} {print $3;}' ../ctgnames.27.07.05.txt \
    >> extNames.ctgnamesandbacClones
wc -l extNames.ctgnamesandbacClones
#  510169 extNames.ctgnamesandbacClones
sort extNames.ctgnamesandbacClones | uniq \
    > extNames.ctgnamesandbacClones.sort
wc -l extNames.ctgnamesandbacClones.sort
#  340039 extNames.ctgnamesandbacClones.sort
# get extNames from the database
hgsql -N -e 'select name from bacCloneXRef;' danRer3 | sort | uniq \
    > extNames.fromdb.sort
wc -l extNames.fromdb.sort
#  340039 extNames.fromdb.sort
# find extNames in common from data files and database
comm -12 extNames.fromdb.sort extNames.ctgnamesandbacClones.sort \
    > extNames.fromdbandfiles
wc -l extNames.fromdbandfiles
#  340039 extNames.fromdbandfiles
diff extNames.fromdb.sort extNames.fromdbandfiles
# no difference, all extNames from files are in db
# Check that all BAC clone internal names from the ctgnames and clonemarkers
# files are in the database
# get internal names from ctgnames and clonemarkers files
awk 'BEGIN {FS="|"} {print $2;}' ../ctgnames.27.07.05.txt \
    > intNames.ctgnamesandclonemarkers
awk 'BEGIN {FS="|"} {print $1;}' ../clonemarkers.27.07.05.txt \
    >> intNames.ctgnamesandclonemarkers
wc -l intNames.ctgnamesandclonemarkers
#  197743 intNames.ctgnamesandclonemarkers
sort intNames.ctgnamesandclonemarkers | uniq \
    > intNames.ctgnamesandclonemarkers.sort
wc -l intNames.ctgnamesandclonemarkers.sort
#  167858 intNames.ctgnamesandclonemarkers.sort
# get internal names from database
hgsql -N -e 'select intName from bacCloneXRef;' danRer3 | sort | uniq \
    > intNames.fromdb.sort
wc -l intNames.fromdb.sort
#  340039 intNames.fromdb.sort
# some of these intNames are derived from the corresponding extNames
# all of the intNames from the file should be in the db
comm -12 intNames.fromdb.sort intNames.ctgnamesandclonemarkers.sort \
    > intNames.fromdbandfiles
wc -l intNames.fromdbandfiles
#  167858 intNames.fromdbandfiles
diff intNames.fromdbandfiles intNames.ctgnamesandclonemarkers.sort
# no difference, all intNames from files are in db
# Check that all translations are correct between BAC clone
# external and internal names.
# write script to get the prefixes from internal and external names
# (see the sketch after this check)
chmod +x getNamePrefixes.pl
hgsql -N -e 'select name, intName from bacCloneXRef;' danRer3 \
    | sort | uniq > extandintNames.fromdb.sort
perl getNamePrefixes.pl < extandintNames.fromdb.sort \
    > extandintNames.prefixes
sort extandintNames.prefixes | uniq > extandintNames.prefixes.uniq
# these all look good
#   BUSM1   dZ
#   CH211   zC
#   CH211   zc
#   CH73    CHORI
#   CT7     bP
#   DKEY    zK
#   DKEY    zk
#   DKEYP   zKp
#   RP71    bZ
#   XX      bY
# zk is an internal name prefix for the external name prefix DKEY-. There
# is only one example where this is used (DKEY-81G7); this is in the
# ctgnames file and is in the bacCloneXRef table, so that is ok.
# All data looks good in these tables now.
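# getNamePrefixes.pl is also not recorded here. A sketch of the
# equivalent, assuming the external prefix is the part before the "-"
# and the internal prefix is the leading run of letters (both assumptions
# based on the name formats in the table above):
awk 'BEGIN {OFS="\t"} \
    {split($1, e, "-"); \
     match($2, /^[A-Za-z]+/); \
     print e[1], substr($2, 1, RLENGTH);}' \
    extandintNames.fromdb.sort | sort -u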
# BLASTZ TETRAODON (tetNig1) (DONE, 2005-10-20, hartera)
# REMADE DOWNLOADS FOR net, all.chain AND over.chain AS THEY HAD BEEN DELETED.
# MOVE ALL THE RUN FILES AND OUTPUT FROM THE SAN RUN DIRECTORY TO A DIRECTORY
# ON /cluster/data AS THIS IS MORE PERMANENT. (DONE, 2005-11-17, hartera).
# Tetraodon is quite distant from zebrafish, more distant than human/chicken,
# so use the HoxD55.q matrix for the Blastz alignments.
# Blastz requires lineage-specific repeats, but there are none
# available between these two fish species
ssh kkstore02
mkdir -p /cluster/data/danRer3/bed/blastz.tetNig1.2005-10-11
cd /cluster/data/danRer3/bed
ln -s blastz.tetNig1.2005-10-11 blastz.tetNig1
cd /cluster/data/danRer3/bed/blastz.tetNig1
# create a 2bit file for danRer3 with all chroms (1-25 and M) and the
# scaffolds for NA and Un if it does not exist already
cd /cluster/data/danRer3
faToTwoBit [1-9]/chr*.fa [12][0-9]/chr*.fa M/chrM.fa \
    Un/scaffoldUn.fa NA/scaffoldNA.fa danRer3ChrUnNAScafs.2bit
ssh hgwdev
# move the 2bit file for danRer3 to the san if not there already
mkdir -p /san/sanvol1/scratch/danRer3/
mv /cluster/data/danRer3/danRer3ChrUnNAScafs.2bit \
    /san/sanvol1/scratch/danRer3/
# also copy over the danRer3 2bit file for all chroms and the
# lift file for NA and Un scaffolds to chrNA and chrUn.
cp /cluster/data/danRer3/danRer3.2bit /san/sanvol1/scratch/danRer3/
cp /cluster/data/danRer3/liftSupertoChrom/liftNAandUnScaffoldsToChrom.lft \
    /san/sanvol1/scratch/danRer3/
# also copy over tetraodon sequences to the san
mkdir -p /san/sanvol1/scratch/tetNig1/contigs
cp /cluster/bluearc/tetNig1/contigs/tetNig1ChrContigsRandomScafs.2bit \
    /san/sanvol1/scratch/tetNig1/contigs/
# see makeTetNig1.doc for making tetNig1ChrContigsRandomScafs.2bit
# make output and run directories
mkdir -p /san/sanvol1/scratch/danRer3/blastzTetNig1/chromsAndScafsRun
mkdir -p /san/sanvol1/scratch/danRer3/blastzTetNig1/chromsAndScafsOut
cd /cluster/data/danRer3/bed/blastz.tetNig1
ln -s /san/sanvol1/scratch/danRer3/blastzTetNig1/chromsAndScafsRun
ln -s /san/sanvol1/scratch/danRer3/blastzTetNig1/chromsAndScafsOut
# use tetraodon sequence in contigs for dynamic masking - see below
# for dynamic masking: M=50. Each time a base is hit at least 50 times,
# it is masked out.
# Blastz danRer3 chroms and scaffolds vs tetNig1 ordered chrom contigs and
# scaffolds from random chromosomes. Lift up the tetNig1 contigs to chrom
# level. Then make the chains and liftUp all the scaffolds to chrom
# level before sorting and merging chains and then netting.
# get all contigs from mapped ordered chroms and make 2bit file
# see makeTetNig1.doc
cd /cluster/data/danRer3/bed/blastz.tetNig1/chromsAndScafsRun
cat << '_EOF_' > DEF
# zebrafish (danRer3) vs. tetraodon (tetNig1)
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/cluster/home/angie/schwartzbin:/parasol/bin

ALIGN=blastz-run
BLASTZ=blastz.v7.x86_64
BLASTZ_M=50
BLASTZ_H=2500
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
#BLASTZ_ABRIDGE_REPEATS=1 if SMSK is specified
BLASTZ_ABRIDGE_REPEATS=0

# TARGET - zebrafish (danRer3) soft-masked chr1-25 and chrM and scaffolds
SEQ1_DIR=/san/sanvol1/scratch/danRer3/danRer3.2bit
SEQ1_CTGDIR=/san/sanvol1/scratch/danRer3/danRer3ChrUnNAScafs.2bit
SEQ1_LIFT=/san/sanvol1/scratch/danRer3/liftNAandUnScaffoldsToChrom.lft
SEQ1_RMSK=
# lineage-specific repeats
# we don't have that information for these species
SEQ1_SMSK=
SEQ1_FLAG=
SEQ1_LIMIT=30
SEQ1_IN_CONTIGS=0
# 0.5 Mb chunk for target with 5 kb overlap
SEQ1_CHUNK=500000
SEQ1_LAP=5000

# QUERY - Tetraodon (tetNig1)
# soft-masked 500 kb contigs for chroms, scaffolds for randoms
SEQ2_DIR=/san/sanvol1/scratch/tetNig1/contigs/tetNig1ChrContigsRandomScafs.2bit
SEQ2_RMSK=
SEQ2_SMSK=
SEQ2_FLAG=
SEQ2_IN_CONTIGS=0
SEQ2_CHUNK=1000000000
SEQ2_LAP=0

BASE=/san/sanvol1/scratch/danRer3/blastzTetNig1/chromsAndScafsRun

DEF=$BASE/DEF
RAW=$BASE/raw
CDBDIR=$BASE
SEQ1_LEN=$BASE/S1.len
SEQ1_CTGLEN=$BASE/chromsUnNAScafs.sizes
SEQ2_LEN=$BASE/S2.len
TMPDIR=/scratch/tmp
#DEBUG=1
'_EOF_'
# << this line keeps emacs coloring happy
chmod +x DEF
cp /cluster/data/danRer3/chrom.sizes ./S1.len
twoBitInfo /san/sanvol1/scratch/danRer3/danRer3ChrUnNAScafs.2bit \
    chromsUnNAScafs.sizes
twoBitInfo \
    /san/sanvol1/scratch/tetNig1/contigs/tetNig1ChrContigsRandomScafs.2bit \
    ./S2.len
nice /cluster/bin/scripts/doBlastzChainNet.pl \
    -bigClusterHub=pk -smallClusterHub=pk -workhorse=pk -stop cat \
    -blastzOutRoot /san/sanvol1/scratch/danRer3/blastzTetNig1/chromsAndScafsOut \
    `pwd`/DEF >& do.log &
# PID 32339   Start: Tue Oct 11 14:55
# use Hiram's script to kill 4 empty shell commands on Thurs Oct 13th
# /cluster/bin/scripts/findEmpty.sh -r to find
# /cluster/bin/scripts/findEmpty.sh -K to kill
# Fri Oct 14 10:41
# Checking finished jobs
# crashed: 32
# running: 20
# ranOk: 3716
# failed 4 times: 32
# total jobs in batch: 3768
# check problems:
# 141 jobs crashed on host: kkr10u19.kilokluster.ucsc.edu
# Just removed this machine with parasol remove machine, as over 9000 jobs
# crashed for the opossum run on this machine.
# run again with para push -retries=20
# By 16:00 on Fri Oct 14, all jobs finished but 2 failed 4 times, so repush
# with para push -retries=20.
# para time
# Completed: 3768 of 3768 jobs
# CPU time in finished jobs:  12465019s  207750.32m  3462.51h  144.27d  0.395 y
# IO & Wait Time:               873594s   14559.90m   242.66h   10.11d  0.028 y
# Average job time:               3540s      59.00m     0.98h     0.04d
# Longest running job:               0s       0.00m     0.00h     0.00d
# Longest finished job:          19777s     329.62m     5.49h     0.23d
# Submission to last job:       264857s    4414.28m    73.57h     3.07d
ssh pk
cd /cluster/data/danRer3/bed/blastz.tetNig1/chromsAndScafsRun/run.blastz
para time > run.time
# run doBlastzChainNet.pl to continue with the cat step, since the script
# crashed when some of the jobs failed 4 times.
ssh hgwdev
cd /cluster/data/danRer3/bed/blastz.tetNig1/chromsAndScafsRun
nice /cluster/bin/scripts/doBlastzChainNet.pl \
    -bigClusterHub=pk -smallClusterHub=pk -workhorse=pk -continue cat -stop cat \
    -blastzOutRoot /san/sanvol1/scratch/danRer3/blastzTetNig1/chromsAndScafsOut \
    `pwd`/DEF >& doCat.log &
# Took about 7 minutes.
# Now need to liftUp the contigs for tetNig1 to chrom level, but
# not the scaffolds. All the scaffolds will be lifted after the
# chaining step.
ssh kolossus
# liftUp contigs for tetraodon query:
cd /cluster/data/danRer3/bed/blastz.tetNig1/chromsAndScafsRun
mv pslParts pslPartsNotLifted
mkdir /san/sanvol1/scratch/danRer3/blastzTetNig1/chromsAndScafsRun/liftedPsl
set dir=/san/sanvol1/scratch/danRer3/blastzTetNig1/chromsAndScafsRun
# use carry for "how" as this will carry items not in liftSpec to dest
# file without translation. lift file is only for contigs not scaffolds.
# use nohead option otherwise psl header added at the top of each file.
# need to add the blastz params header
zcat ./pslPartsNotLifted/part958.lst.psl.gz | head -3 > header
# first lift to pseudo-contig level and then to chroms
foreach f (./pslPartsNotLifted/*.psl.gz)
    set g=$f:r:t
    zcat $f | liftUp -pslQ -nohead $dir/liftedPsl/${g}.lifted.psl \
        /cluster/data/tetNig1/bed/blastzSelf/contigSeqs/500kbcontigs.lft \
        carry stdin
    liftUp -pslQ -nohead $dir/liftedPsl/${g}.lifted2.psl \
        /cluster/data/tetNig1/jkStuff/liftAll.lft carry \
        $dir/liftedPsl/${g}.lifted.psl
    cat header $dir/liftedPsl/${g}.lifted2.psl > $dir/liftedPsl/${g}
    rm $dir/liftedPsl/${g}.lifted*
end
# check a couple of files and see that they have the correct number of lines
# then move the contents of this directory to pslParts
mkdir $dir/pslParts
foreach f ($dir/liftedPsl/*.psl)
    gzip $f
    mv ${f}.gz $dir/pslParts/
end
# carry on with doBlastzChainNet.pl from the chaining step
ssh hgwdev
cd /cluster/data/danRer3/bed/blastz.tetNig1/chromsAndScafsRun
cp DEF DEF.tetraContigs
# edit DEF file so that tetNig1 now has a 2bit file of the chroms and
# scaffolds for randoms in the CTGDIR, and also there is a lift file
# for the scaffolds.
cat << '_EOF_' > DEF
# zebrafish (danRer3) vs. tetraodon (tetNig1)
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/cluster/home/angie/schwartzbin:/parasol/bin

ALIGN=blastz-run
BLASTZ=blastz.v7.x86_64
BLASTZ_M=50
BLASTZ_H=2500
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
#BLASTZ_ABRIDGE_REPEATS=1 if SMSK is specified
BLASTZ_ABRIDGE_REPEATS=0

# TARGET - zebrafish (danRer3) soft-masked chr1-25 and chrM and scaffolds
SEQ1_DIR=/san/sanvol1/scratch/danRer3/danRer3.2bit
SEQ1_CTGDIR=/san/sanvol1/scratch/danRer3/danRer3ChrUnNAScafs.2bit
SEQ1_LIFT=/san/sanvol1/scratch/danRer3/liftNAandUnScaffoldsToChrom.lft
SEQ1_RMSK=
# lineage-specific repeats
# we don't have that information for these species
SEQ1_SMSK=
SEQ1_FLAG=
SEQ1_LIMIT=30
SEQ1_IN_CONTIGS=0
# 0.5 Mb chunk for target with 5 kb overlap
SEQ1_CHUNK=500000
SEQ1_LAP=5000

# QUERY - Tetraodon (tetNig1)
# soft-masked chroms, and scaffolds for randoms
SEQ2_DIR=/san/sanvol1/scratch/tetNig1/tetNig1.2bit
SEQ2_CTGDIR=/san/sanvol1/scratch/tetNig1/chromsAndScafs/tetNig1ChromsRandomScafs.2bit
SEQ2_LIFT=/san/sanvol1/scratch/tetNig1/chromsAndScafs/chromsAndScafs.lft
SEQ2_RMSK=
SEQ2_SMSK=
SEQ2_FLAG=
SEQ2_IN_CONTIGS=0
SEQ2_CHUNK=1000000000
SEQ2_LAP=0

BASE=/san/sanvol1/scratch/danRer3/blastzTetNig1/chromsAndScafsRun

DEF=$BASE/DEF
RAW=$BASE/raw
CDBDIR=$BASE
SEQ1_LEN=$BASE/S1.len
SEQ1_CTGLEN=$BASE/chromsUnNAScafs.sizes
SEQ2_LEN=$BASE/S2.len
SEQ2_CTGLEN=/san/sanvol1/scratch/tetNig1/chromsAndScafs/chromsAndScafs.sizes
TMPDIR=/scratch/tmp
#DEBUG=1
'_EOF_'
# if it does not exist already, make the file of sizes for the tetNig1
# chroms and scaffolds.
twoBitInfo \
    /san/sanvol1/scratch/tetNig1/chromsAndScafs/tetNig1ChromsRandomScafs.2bit \
    /san/sanvol1/scratch/tetNig1/chromsAndScafs/chromsAndScafs.sizes
# Also, need to change the sequence sizes file for tetNig1 to the
# chrom sizes and not the scaffolds and contigs sizes.
cp S2.len S2contigsAndScafs.len
cp /cluster/data/tetNig1/chrom.sizes S2.len
# then run doBlastzChainNet.pl script again
nice /cluster/bin/scripts/doBlastzChainNet.pl \
    -bigClusterHub=pk \
    -smallClusterHub=pk \
    -workhorse=pk \
    -fileServer=kolossus \
    -continue chainRun \
    -chainMinScore=5000 \
    `pwd`/DEF >& doChains.log &
# Start: Fri Oct 14 17:47   Finished: Oct 14 17:57
# crashed as one job failed after 4 retries; the problem is that
# part958.lst.psl.gz is not recognized as a psLayout file. It is empty
# except for parameter comment lines, so it can be ignored.
ssh pk
cd /cluster/data/danRer3/bed/blastz.tetNig1/chromsAndScafsRun/axtChain/run/
para time > run.time
ssh hgwdev
cd /cluster/data/danRer3/bed/blastz.tetNig1/chromsAndScafsRun
# crashes while doing chainMerge, so add a flag to the DEF file to indicate
# that the genomes are in scaffolds, so there is a large number of chain
# files. Changed doBlastzChainNet.pl so that if this flag is seen, then
# the chain files are concatenated, chainSort is used to sort
# the resulting chain file by score, and chainMergeSort is used to renumber
# the chain IDs so that they are unique. chainMergeSort expects chain
# files sorted by score as input.
# add this line to the DEF file: GENOME_IN_SCAFFOLDS=1
nice ./doBlastzChainNet.pl \
    -bigClusterHub=pk \
    -smallClusterHub=pk \
    -workhorse=pk \
    -fileServer=kolossus \
    -continue chainMerge \
    -chainMinScore=5000 \
    `pwd`/DEF >& doChainMergeNet.log &
# Start: Wed Oct 19 12:52   Finish: Oct 19 13:13
# Add a trackDb.ra entry for chainTetNig1 and netTetNig1 and add html
# pages. Modify track descriptions to describe the process of using
# scaffolds for danRer3 chrNA and chrUn and the fact that dynamic
# masking was used for the Blastz alignments. Edit the README for
# the downloads to add information about using scaffolds for Blastz
# for danRer3 chrNA and chrUn and for tetNig1 random unordered chroms,
# and how the tetNig1 genome was aligned as a file of contigs for chroms
# and scaffolds for randoms for the Blastz alignments, so that
# each danRer3 chunk was aligned with the whole of the tetraodon
# genome to take advantage of dynamic masking (M=50).
# Finally, run a doBlastzChainNet.pl swap for this to create danRer3
# chains and net tracks on tetNig1 - see makeTetNig1.doc.
# featureBits -chrom=chr2 danRer3 refGene:cds chainTetNig1Link -enrichment
# refGene:cds 0.746%, chainTetNig1Link 7.167%, both 0.672%, cover 90.17%,
# enrich 12.58x
# featureBits -chrom=chr2 danRer2 refGene:cds chainTetNig1Link -enrichment
# refGene:cds 0.750%, chainTetNig1Link 4.463%, both 0.621%, cover 82.84%,
# enrich 18.56x
# so better coverage for danRer3, but less enrichment than for danRer2.
# Make the download files for all.chain, over.chain and net again as these
# files have been removed. Put the files on /cluster/data rather than the
# san so that they are not moved again. (hartera, 2005-11-17)
ssh kolossus
cd /cluster/data/danRer3/bed/blastz.tetNig1/chromsAndScafsRun/axtChain
chainMergeSort ./run/chain/*.chain | nice gzip -c \
    > danRer3.tetNig1.all.chain.gz
# copy over.chain file from bedOver directory to axtChain directory
cp /cluster/data/danRer3/bed/bedOver/danRer3.tetNig1.over.chain.gz \
    /cluster/data/danRer3/bed/blastz.tetNig1/chromsAndScafsRun/axtChain/
# recreate net file
# make noClass.net
# Make nets ("noClass", i.e. without rmsk/class stats, which are added later)
chainPreNet danRer3.tetNig1.all.chain.gz ../S1.len ../S2.len \
    stdout | chainNet stdin -minSpace=1 ../S1.len ../S2.len stdout \
    /dev/null | netSyntenic stdin noClass.net
# memory usage 251383808, utime 562 s/100, stime 41
# create net file
ssh hgwdev
cd /cluster/data/danRer3/bed/blastz.tetNig1/chromsAndScafsRun/axtChain
netClass -verbose=0 -noAr noClass.net danRer3 tetNig1 danRer3.tetNig1.net
# compress net file
gzip danRer3.tetNig1.net
# Move these files to /cluster/data and remake download links as the
# san is not a permanent storage space.
mv /san/sanvol1/scratch/danRer3/blastzTetNig1/chromsAndScafsRun \
    /cluster/data/danRer3/bed/blastz.tetNig1/
# Then change the symlinks in the downloads directory to point to the files
# on /cluster/data
cd /usr/local/apache/htdocs/goldenPath/danRer3/vsTetNig1/axtNet
set runDir=/cluster/data/danRer3/bed/blastz.tetNig1/chromsAndScafsRun
rm *.gz
foreach f ($runDir/axtNet/*.axt.gz)
    ln -s $f .
end
cd ..
rm *.gz
foreach f ($runDir/axtChain/*.gz)
    ln -s $f
end
# remake the md5sum file
rm md5sum.txt
md5sum *.gz */*.gz > md5sum.txt

# Test Runs for chr2 and chrUn
cd /cluster/data/danRer3/bed/blastz.tetNig1
mkdir -p /san/sanvol1/scratch/danRer3/blastzTetNig1/chrUnand2Run
ln -s /san/sanvol1/scratch/danRer3/blastzTetNig1/chrUnand2Run
# create blastz output directory
mkdir -p /san/sanvol1/scratch/danRer3/blastzTetNig1/chrUnand2Out
ln -s /san/sanvol1/scratch/danRer3/blastzTetNig1/chrUnand2Out
mkdir /san/sanvol1/scratch/danRer3/chrUnand2
cd /san/sanvol1/scratch/danRer3/chrUnand2
cp ../nib/chr2.nib ../nib/chrUn.nib .
rsync -a --progress /cluster/bluearc/tetNig1/contigs/tetNig1Contigs.2bit \
    /san/sanvol1/scratch/tetNig1/contigs/
cd /cluster/data/danRer3/bed/blastz.tetNig1/chrUnand2Run
cat << '_EOF_' > DEF
# zebrafish (danRer3) vs. tetraodon (tetNig1)
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/cluster/home/angie/schwartzbin:/parasol/bin

ALIGN=blastz-run
BLASTZ=blastz.v7.x86_64
BLASTZ_M=50
BLASTZ_H=2500
BLASTZ_Q=/san/sanvol1/scratch/blastz/HoxD55.q
#BLASTZ_ABRIDGE_REPEATS=1 if SMSK is specified
BLASTZ_ABRIDGE_REPEATS=0

# TARGET - zebrafish (danRer3) soft-masked chr1-25 and chrM
SEQ1_DIR=/san/sanvol1/scratch/danRer3/chrUnand2
SEQ1_RMSK=
# lineage-specific repeats
# we don't have that information for these species
SEQ1_SMSK=
SEQ1_FLAG=
SEQ1_IN_CONTIGS=0
# 0.5 Mb chunk for target
SEQ1_CHUNK=500000
SEQ1_LAP=500

# QUERY - Tetraodon (tetNig1)
# soft-masked 500 kb contigs for chroms, scaffolds for randoms
SEQ2_DIR=/san/sanvol1/scratch/tetNig1/contigs/tetNig1Contigs.2bit
SEQ2_RMSK=
SEQ2_SMSK=
SEQ2_FLAG=
SEQ2_IN_CONTIGS=0
SEQ2_CHUNK=1000000000
SEQ2_LAP=0

BASE=/san/sanvol1/scratch/danRer3/blastzTetNig1/chrUnand2Run

DEF=$BASE/DEF
RAW=$BASE/raw
CDBDIR=$BASE
SEQ1_LEN=$BASE/S1.len
SEQ2_LEN=$BASE/S2.len
#DEBUG=1
'_EOF_'
# << this line keeps emacs coloring happy
chmod +x DEF
cp /cluster/data/danRer3/chrom.sizes ./S1.len
twoBitInfo \
    /san/sanvol1/scratch/tetNig1/contigs/tetNig1Contigs.2bit ./S2.len
nice /cluster/bin/scripts/doBlastzChainNet.pl \
    -bigClusterHub=pk \
    -smallClusterHub=pk \
    -workhorse=pk \
    -fileServer=kolossus \
    -stop cat \
    -blastzOutRoot /san/sanvol1/scratch/danRer3/blastzTetNig1/chrUnand2Out \
    -chainMinScore=5000 \
    `pwd`/DEF >& do.log &
# PID: 4890   Start: Thu Sep 29 14:50
# ran quickly, 30 mins
# crashed as some jobs crashed and failed after 4 retries, so
# push them again.
nice /cluster/bin/scripts/doBlastzChainNet.pl \
    -bigClusterHub=pk \
    -smallClusterHub=pk \
    -workhorse=pk \
    -fileServer=kolossus \
    -continue cat \
    -stop cat \
    -blastzOutRoot /san/sanvol1/scratch/danRer3/blastzTetNig1/chrUnand2Out \
    -chainMinScore=5000 \
    `pwd`/DEF >& doCat.log &
# Took a couple of minutes
# need to lift up the contigs to chrom level for tetNig1
# liftUp contig files for tetraodon query:
# if a file is empty, then liftUp gets stuck reading commented lines,
# so make a list of files which contain alignment data and not just
# commented lines starting with # (blastz parameters)
foreach f (./pslPartsNotLifted/*.psl.gz)
    zcat $f | awk '{if ($1 !~ /#/) print "'$f'";}' >> pslParts.lst
end
sort pslParts.lst | uniq > pslPartsNotEmpty.lst
cd /cluster/data/danRer3/bed/blastz.tetNig1/chrUnand2Run
mv pslParts pslPartsNotLifted
mkdir /san/sanvol1/scratch/danRer3/blastzTetNig1/chrUnand2Run/liftedPsl
set dir=/san/sanvol1/scratch/danRer3/blastzTetNig1/chrUnand2Run
# use carry for "how" as this will carry items not in liftSpec to dest
# file without translation. lift file is only for contigs not scaffolds.
# use nohead option otherwise psl header added at the top of each file.
# need to add the blastz params header
zcat \
    ./pslPartsNotLifted/chrUn.nib:chrUn:99500000-100000500.psl.gz \
    | head -3 > header
# first lift to pseudo-contig level and then to chroms
foreach f (`cat pslPartsNotEmpty.lst`)
    set g=$f:r:t
    zcat $f | liftUp -pslQ -nohead $dir/liftedPsl/${g}.lifted.psl \
        /cluster/data/tetNig1/bed/blastzSelf/contigSeqs/500kbcontigs.lft \
        warn stdin
    liftUp -pslQ -nohead $dir/liftedPsl/${g}.lifted2.psl \
        /cluster/data/tetNig1/jkStuff/liftAll.lft warn \
        $dir/liftedPsl/${g}.lifted.psl
    cat header $dir/liftedPsl/${g}.lifted2.psl > $dir/liftedPsl/${g}
    rm $dir/liftedPsl/${g}.lifted*
end
mv liftedPsl pslParts
# need to gzip these again
foreach f (./pslParts/*.psl)
    gzip $f
end
# then carry on with chaining for these danRer3 NA and Un scaffolds
# tetNig1.2bit has full chroms for ordered chroms
# and randoms as scaffolds
cp DEF DEF.contigs
# copy over 2bit file with chroms for tetNig1 if not
# there already.
mv S2.len S2.contigs
twoBitInfo \
    /san/sanvol1/scratch/tetNig1/tetNig1.2bit ./S2.len
nice /cluster/bin/scripts/doBlastzChainNet.pl \
    -bigClusterHub=pk \
    -smallClusterHub=pk \
    -workhorse=pk \
    -fileServer=kolossus \
    -continue chainRun \
    -stop net \
    -blastzOutRoot /san/sanvol1/scratch/danRer3/blastzTetNig1/chrUnand2Out \
    -chainMinScore=5000 \
    `pwd`/DEF >& doNet.log &
# PID 1117   Start: Thu Sep 29 16:20   Finished: 16:24
# crashed: says it can't find [danRer3.tetNig1.]all.chain[.gz] but it
# is there.
nice /cluster/bin/scripts/doBlastzChainNet.pl \
    -bigClusterHub=pk \
    -smallClusterHub=pk \
    -workhorse=pk \
    -fileServer=kolossus \
    -continue net \
    -stop net \
    -chainMinScore=5000 \
    `pwd`/DEF >& doNet2.log &
# Took 1 minute
# TO DO: load tables
cd /cluster/data/danRer3/bed/blastz.tetNig1/chrUnand2Run/axtChain/chain
foreach f (*.chain)
    set c=$f:r
    hgLoadChain danRer3 ${c}_chainTetNig1NoScafs $f
end
cd /cluster/data/danRer3/bed/blastz.tetNig1/chrUnand2Run/axtChain
# add gap/repeat stats to net file using db tables
netClass -verbose=0 -noAr noClass.net danRer3 tetNig1 danRer3.tetNig1.net
# load nets
netFilter -minGap=10 danRer3.tetNig1.net \
    | hgLoadNet -verbose=0 danRer3 netTetNig1NoScafs stdin
# then need to load chains and net into browser with a different name
# featureBits -chrom=chr2 danRer3 refGene:cds chainTetNig1Link -enrichment
# refGene:cds 0.742%, chainTetNig1Link 7.166%, both 0.670%, cover 90.26%,
# enrich 12.60x
# featureBits -chrom=chr2 danRer3 refGene:cds chainTetNig1NoScafsLink -enrichment
# refGene:cds 0.742%, chainTetNig1NoScafsLink 7.171%, both 0.670%,
# cover 90.30%, enrich 12.59x
# featureBits -chrom=chrUn danRer3 refGene:cds chainTetNig1Link -enrichment
# refGene:cds 0.497%, chainTetNig1Link 6.175%, both 0.441%, cover 88.68%,
# enrich 14.36x
# featureBits -chrom=chrUn danRer3 refGene:cds chainTetNig1NoScafsLink -enrichment
# refGene:cds 0.497%, chainTetNig1NoScafsLink 6.179%, both 0.441%,
# cover 88.67%, enrich 14.35x
# Rows in chainTetNig1Link:
#            tetNig1    tetNig1NoScafs
#   chr2      308576            303236
#   chrUn    1133922           1114061
# nets:
# featureBits -chrom=chr2 danRer3 refGene:cds netTetNig1 -enrichment
# refGene:cds 0.742%, netTetNig1 62.053%, both 0.715%, cover 96.34%,
# enrich 1.55x
# featureBits -chrom=chr2 danRer3 refGene:cds netTetNig1NoScafs -enrichment
# refGene:cds 0.742%, netTetNig1NoScafs 63.095%, both 0.717%, cover 96.63%,
# enrich 1.53x
# featureBits -chrom=chrUn danRer3 refGene:cds netTetNig1 -enrichment
# refGene:cds 0.497%, netTetNig1 48.803%, both 0.477%, cover 95.87%,
# enrich 1.96x
# featureBits -chrom=chrUn danRer3 refGene:cds netTetNig1NoScafs -enrichment
# refGene:cds 0.497%, netTetNig1NoScafs 49.207%, both 0.478%, cover 96.01%,
# enrich 1.95x
# Rows in netTetNig1:
#            tetNig1    tetNig1NoScafs
#   chr2       17370             17415
#   chrUn      56259             56360
# featureBits -chrom=chr2 danRer2 refGene:cds chainTetNig1Link -enrichment
# refGene:cds 0.739%, chainTetNig1Link 4.463%, both 0.617%, cover 83.44%,
# enrich 18.69x
# featureBits -chrom=chr2 danRer3 refGene:cds chainNoHoxD55TetNig1Link -enrichment
# refGene:cds 0.668%, chainNoHoxD55TetNig1Link 4.815%, both 0.587%,
# cover 87.95%, enrich 18.27x
# featureBits -chrom=chr2 danRer3 refGene:cds chainHoxD55TetNig1Link -enrichment
# refGene:cds 0.668%, chainHoxD55TetNig1Link 7.846%, both 0.612%,
# cover 91.71%, enrich 11.69x
# HoxD55.q with mm6 parameters but H=2500:
# featureBits -chrom=chr2 danRer3 refGene:cds chainHoxD55v2TetNig1Link -enrichment
# refGene:cds 0.668%, chainHoxD55v2TetNig1Link 7.400%, both 0.601%,
# cover 90.10%, enrich 12.18x
# if H=2000 is used, one blastz job does not finish after a day.
# makes little difference if the mm6 parameters are used
# Database   Table                        Number of chains
# danRer2    chr2_chainTetNig1                       21176
# danRer3    chr2_chainNoHoxD55TetNig1               16076
# danRer3    chr2_chainHoxD55TetNig1                 23951
# danRer3    chr2_chainHoxD55v2TetNig1               21378
# also there are more lower-scoring chains with HoxD55 alone than with
# no HoxD55 or using the mm6 parameters with HoxD55. However, using HoxD55
# seems to increase the number of higher-scoring chains.
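# The chain row counts in the tables above can be regenerated straight
# from the database, e.g. (illustrative; hgLoadChain makes both the
# chrN_chain* and chrN_chain*Link tables):
hgsql -N -e 'select count(*) from chr2_chainTetNig1Link;' danRer3
hgsql -N -e 'select count(*) from chr2_chainTetNig1NoScafsLink;' danRer3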
# BLASTZ, CHAIN AND NET FOR OPOSSUM (monDom2) (DONE, 2005-10-18, hartera)
# MOVE ALL THE RUN FILES AND OUTPUT FROM THE SAN RUN DIRECTORY TO A DIRECTORY
# ON /cluster/data AS THIS IS MORE PERMANENT. (DONE, 2005-11-17, hartera).
ssh kkstore02
mkdir -p /cluster/data/danRer3/bed/blastz.monDom2.2005-10-07
cd /cluster/data/danRer3/bed
ln -s blastz.monDom2.2005-10-07 blastz.monDom2
# create a 2bit for danRer3 with all chroms (1-25 and M) and the
# scaffolds for NA and Un.
cd /cluster/data/danRer3
faToTwoBit [1-9]/chr*.fa [12][0-9]/chr*.fa M/chrM.fa \
    Un/scaffoldUn.fa NA/scaffoldNA.fa danRer3ChrUnNAScafs.2bit
ssh hgwdev
mkdir -p /san/sanvol1/scratch/danRer3/
mv /cluster/data/danRer3/danRer3ChrUnNAScafs.2bit \
    /san/sanvol1/scratch/danRer3/
# make output and run directories
mkdir -p /san/sanvol1/scratch/danRer3/blastzMonDom2/chromsAndScafsRun
mkdir -p /san/sanvol1/scratch/danRer3/blastzMonDom2/chromsAndScafsOut
cd /cluster/data/danRer3/bed/blastz.monDom2
ln -s /san/sanvol1/scratch/danRer3/blastzMonDom2/chromsAndScafsRun
ln -s /san/sanvol1/scratch/danRer3/blastzMonDom2/chromsAndScafsOut
cd chromsAndScafsRun
cat << '_EOF_' > DEF
# zebrafish (danRer3) vs opossum (monDom2)
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/parasol/bin

ALIGN=blastz-run
BLASTZ=blastz.v7.x86_64
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=10000
BLASTZ_K=2200
BLASTZ_Q=/san/sanvol1/scratch/blastz/HoxD55.q
#BLASTZ_ABRIDGE_REPEATS=1 if SMSK is specified
BLASTZ_ABRIDGE_REPEATS=0

# TARGET - zebrafish (danRer3) soft-masked chroms 1-25 and chrM, and
# scaffolds for NA and Un
SEQ1_DIR=/san/sanvol1/scratch/danRer3/danRer3.2bit
SEQ1_CTGDIR=/san/sanvol1/scratch/danRer3/danRer3ChrUnNAScafs.2bit
SEQ1_LIFT=/san/sanvol1/scratch/danRer3/liftNAandUnScaffoldsToChrom.lft
SEQ1_RMSK=
# lineage-specific repeats
# we don't have that information for these species
SEQ1_SMSK=
SEQ1_FLAG=
SEQ1_LIMIT=30
SEQ1_IN_CONTIGS=0
SEQ1_CHUNK=10000000
SEQ1_LAP=10000

# QUERY - Opossum (monDom2)
# soft-masked sequence in scaffolds
SEQ2_DIR=/san/sanvol1/scratch/monDom2/monDom2.2bit
SEQ2_SMSK=
SEQ2_FLAG=
SEQ2_IN_CONTIGS=0
SEQ2_CHUNK=30000000
SEQ2_LAP=0

BASE=/san/sanvol1/scratch/danRer3/blastzMonDom2/chromsAndScafsRun

DEF=$BASE/DEF
RAW=$BASE/raw
CDBDIR=$BASE
SEQ1_LEN=$BASE/S1.len
SEQ1_CTGLEN=$BASE/chromsUnNAScafs.sizes
SEQ2_LEN=$BASE/S2.len
TMPDIR=/scratch/tmp
#DEBUG=1
'_EOF_'
# << this line keeps emacs coloring happy
chmod +x DEF
cp /cluster/data/danRer3/chrom.sizes S1.len
twoBitInfo /san/sanvol1/scratch/danRer3/danRer3ChrUnNAScafs.2bit \
    chromsUnNAScafs.sizes
cp /cluster/data/monDom2/chrom.sizes S2.len
# now do the run
nice /cluster/bin/scripts/doBlastzChainNet.pl \
    -bigClusterHub=pk \
    -smallClusterHub=pk \
    -workhorse=pk \
    -fileServer=kolossus \
    -stop cat \
    -blastzOutRoot /san/sanvol1/scratch/danRer3/blastzMonDom2/chromsAndScafsOut \
    -chainMinScore=5000 \
    `pwd`/DEF >& do.log &
# chromsAndScafs PID 19811   Start: Fri Oct 7 15:16
# Friday Oct 14th 10:30 -
# Checking finished jobs
# crashed: 3271
# ranOk: 90399
# failed 4 times: 3271
# total jobs in batch: 93670
# more than 9000 crashed on one machine: kkr10u19.kilokluster.ucsc.edu,
# so remove this machine.
# run again with para push -retries=20
# still 7 jobs crashed so repush again with para push -retries=20
# Now try using the SEQ1_LIMIT option in the DEF file to limit the
# number of sequences in a partition file to 30. Before, there would
# be a lot of small sequences in a partition file that would take a long
# time to run.
# finished around 21:40 Fri Oct 14. Took about 7 days, maybe a little less,
# as a number of jobs crashed last night.
# carry on from the cat step to the end
ssh pk
cd /cluster/data/danRer3/bed/blastz.monDom2/chromsAndScafsRun/run.blastz
para time > run.time
# para time
# Completed: 93670 of 93670 jobs
# CPU time in finished jobs:  55738486s  928974.77m  15482.91h  645.12d  1.767 y
# IO & Wait Time:              1276213s   21270.22m    354.50h   14.77d  0.040 y
# Average job time:                609s      10.14m      0.17h    0.01d
# Longest running job:               0s       0.00m      0.00h    0.00d
# Longest finished job:           1470s      24.50m      0.41h    0.02d
# Submission to last job:       627367s   10456.12m    174.27h    7.26d
ssh hgwdev
cd /cluster/data/danRer3/bed/blastz.monDom2/chromsAndScafsRun
nice /cluster/bin/scripts/doBlastzChainNet.pl \
    -bigClusterHub=pk \
    -smallClusterHub=pk \
    -workhorse=pk \
    -fileServer=kolossus \
    -continue cat \
    -blastzOutRoot /san/sanvol1/scratch/danRer3/blastzMonDom2/chromsAndScafsOut \
    -chainMinScore=5000 \
    `pwd`/DEF >& doCatChainNet.log &
# Took 13 minutes to cat then chain. It had 70 jobs crash at the chaining
# step. These are empty files - when axtChain opens them using
# pslxFileOpenWithMeta (in psl.c) it aborts, as the file is empty apart from
# meta data and therefore not psLayout format. Ignore these crashed jobs
# for now and then modify psl.c so it will skip over these empty files.
# Next, the script crashed on the chainMergeSort step,
# since there are too many chains due to opossum being scaffold-based.
# chainMergeSort opens all the files at once.
# Added a flag to the DEF file to show if an assembly is scaffold-based:
# GENOME_IN_SCAFFOLDS=1
# and then modified doBlastzChainNet.pl so that if it sees this flag,
# chains are merged into one file, chainSort is run to sort the file,
# and then chainMergeSort changes the IDs so they are unique.
# chainMergeSort assumes that the input files are sorted already.
nice ./doBlastzChainNet.pl \
    -bigClusterHub=pk \
    -smallClusterHub=pk \
    -workhorse=pk \
    -fileServer=kolossus \
    -continue chainMerge \
    -chainMinScore=5000 \
    `pwd`/DEF >& doChainMergeNet.log &
# Start: Tue Oct 18 12:55   Finished: 15:02
# add trackDb.ra entries for monDom2 chain and net tracks and add html for
# these tracks too. Modified html pages to describe the process of using
# scaffolds for chrUn and chrNA for danRer3.
# Modify the downloads README.txt to include a description of the process
# of running blastz with scaffolds for the chrUn and chrNA unordered chroms.
# Finally run the swap for this to get danRer3 chains and net tracks
# on monDom2 - see makeMonDom2.doc.
# Move the run directory files to /cluster/data and remake download links
# as the san is not a permanent storage space (hartera, 2005-11-17)
ssh hgwdev
mv /san/sanvol1/scratch/danRer3/blastzMonDom2/chromsAndScafsRun \
    /cluster/data/danRer3/bed/blastz.monDom2/
# then change the symlinks in the downloads directory to point to the files
# on /cluster/data
cd /usr/local/apache/htdocs/goldenPath/danRer3/vsMonDom2/axtNet
set runDir=/cluster/data/danRer3/bed/blastz.monDom2/chromsAndScafsRun
rm *.gz
foreach f ($runDir/axtNet/*.axt.gz)
    ln -s $f .
end
cd ..
    rm *.gz
    foreach f ($runDir/axtChain/*.gz)
        ln -s $f
    end
    # remake the md5sum file
    rm md5sum.txt
    md5sum *.gz */*.gz > md5sum.txt

    # RADIATION HYBRID (RH) MAP TRACK (DONE, 2005-09-06, hartera)
    # Data from Leonard Zon's lab at the Children's Hospital, Boston
    # Provided by Anhua Song: asong@enders.tch.harvard.edu
    # Updated data provided on 2006-02-23
    ssh kkstore02
    mkdir -p /cluster/data/danRer3/bed/ZonLab/rhMap
    cd /cluster/data/danRer3/bed/ZonLab/rhMap
    # download data from e-mail to this directory
    # new sequences (2006-02-23) are available
    unzip rhSequenceSubmit022306.zip
    # sequences are in rhSequenceSubmit022306/rhSequenceSubmitSeq022306.txt
    # primer information is in rhSequenceSubmit022306/rhSequenceSubmit022306.txt
    mv rhSequenceSubmitSeq022306.txt rhMap022306.fa
    mv rhSequenceSubmit022306.txt rhMapPrimers022306.txt
    # first remove ^M from end of lines
    dos2unix rhMap022306.fa
    dos2unix rhMapPrimers022306.txt
    grep '>' rhMap022306.fa | wc -l
    # 11514
    wc -l rhMapPrimers022306.txt
    # 13438 rhMapPrimers022306.txt
    grep '>' rhMap022306.fa > rhMap.names
    # remove '>' from names and grab first field
    perl -pi.bak -e 's/>//' rhMap.names
    awk 'BEGIN {FS="|"} {print $1;}' rhMap.names | sort | uniq \
        > rhMap.namesOnly.sort
    awk 'BEGIN {FS="|"} {print $1;}' rhMapPrimers022306.txt | sort | uniq \
        > rhMapPrimers.namesOnly.sort
    wc -l *.sort
    # 11514 rhMap.namesOnly.sort
    # 13436 rhMapPrimers.namesOnly.sort (after removing blank line)
    # There are no replicates this time for rhMap sequences but there are for
    # the primers set:
    awk 'BEGIN {FS="|"} {print $1;}' rhMapPrimers022306.txt | sort | uniq -c \
        | sort -nr > rhMapPrimers.names.count
    # The apparent replicates are blank lines so there are no real
    # replicates. Total of 11514 sequences in rhMap, but 13436 primer sets.
    # get a list of headers from the FASTA file
    grep '>' rhMap022306.fa > rhMap.headers
    awk 'BEGIN {FS="|"} {print $5;}' rhMap.headers | sort | uniq
    # BAC_END
    # EST
    # GENE
    # SSLP
    # STS
    # 5 types of sequence
    awk 'BEGIN {FS="|"} {print $9;}' rhMap.headers | sort | uniq
    # BACends
    # Custom
    # Insertion_Mutant
    # Insertion_Mutants
    # MGH
    # NCBI
    # Sanger SG
    # Sequencing_Project
    # ThisseClone
    # Thisse_Clone
    # other_zfEst
    # wu_zfEst
    # wz
    # Insertion_Mutant = Insertion_Mutants; ThisseClone = Thisse_Clone;
    # So there are 11 different sources.
    awk 'BEGIN {FS="|"} {print $10;}' rhMap.headers | sort | uniq
    # CHBG
    # MPIEB
    # There are 2 sequences with problem primers. E-mailed Peter Song about
    # these and he suggested deleting those primers:
    # >fb33f01.u1|5|388|5615|EST|f|cR|f|wu_zfEst|CHBG|+++33333333333333333333.|
    # >zfishb-a976e04.p1c|14|16|158|STS|f|cR|f|Sequencing_Project|CHBG|A|A|
    # edit rhMap022306.fa and rhMapPrimers022306.txt and delete these primers.
    # need to reformat FASTA headers so they are in the format:
    # NAME.SOURCE.TYPE.ORIGIN
    # Insertion_Mutant=Insertion_Mutants; Thisse_Clone=ThisseClone
    # so change these to have the same name. Also shorten Sanger SG to
    # Shotgun. Note the plural pattern must be substituted first, otherwise
    # the Insertion_Mutant substitution would leave a stray "s" behind.
    perl -pi.bak -e 's/Insertion_Mutants/InsertMut/' rhMap022306.fa
    perl -pi.bak -e 's/Insertion_Mutant/InsertMut/' rhMap022306.fa
    perl -pi.bak -e 's/Sanger SG/Shotgun/' rhMap022306.fa
    perl -pi.bak -e 's/ThisseClone/Thisse/' rhMap022306.fa
    perl -pi.bak -e 's/Thisse_Clone/Thisse/' rhMap022306.fa
    perl -pi.bak -e 's/Sequencing_Project/Seqproj/' rhMap022306.fa
    # use a script to reformat the names for the FASTA headers to the format
    # >NAME.SOURCE where name is the first field separated by "|" and source
    # is the 9th field.
The source is used to make the name unique. Some # of these names are BAC ends that occur in the BAC ends track so there # are name clashes in the seq table if the names are not made unique. # Also make the name upper case as for those for the danRer1 and danRer2 # RH map. cat << '_EOF_' > rhFix #!/usr/bin/awk -f #>z1396|14|418|5707|SSLP|f|cR|f|MGH|MPIEB|ATCCTTCAGCCACTCCTTCA|TGGAACCTGAAAAACACACG| /^>/ { split(toupper($0), a, "\\|"); print a[1]"."a[9]; next; } /^[0-9]+ / { $0 = $2; } { print $0; } '_EOF_' # << keep emacs coloring happy chmod +x rhFix rhFix rhMap022306.fa > rhMap.fa # Blat sequences vs danRer3 genome ssh pk mkdir -p /cluster/data/danRer3/bed/ZonLab/rhMap/blatRun # make output directory mkdir -p /san/sanvol1/scratch/danRer3/rhMap/psl cd /cluster/data/danRer3/bed/ZonLab/rhMap/blatRun ln -s /san/sanvol1/scratch/danRer3/rhMap/psl . # copy input to the san cp \ /cluster/data/danRer3/bed/ZonLab/rhMap/rhSequenceSubmit022306/rhMap.fa \ /san/sanvol1/scratch/danRer3/rhMap/ # do the blat run to align RH map sequences to danRer3 and do separate # runs for chroms and scaffolds from chrUn and chrNA ls -1S /san/sanvol1/scratch/danRer3/rhMap/rhMap.fa > rhMap.lst ls -1S /san/sanvol1/scratch/danRer3/trfFaChromsAndScafs/chr[0-9M]*.fa \ > genome.lst # use the individual scaffolds for chrUn and chrNA alignments foreach f (/san/sanvol1/scratch/danRer3/trfFaChromsAndScafs/Zv5_*.fa) ls -1S $f >> genome.lst end wc -l genome.lst # 15149 genome.lst cp -p /cluster/data/danRer3/bed/ooc/danRer3_10.ooc \ /san/sanvol1/scratch/danRer3 # try same parameters as for BAC ends cat << '_EOF_' > gsub #LOOP /cluster/bin/x86_64/blat {check in line+ $(path1)} {check in line+ $(path2)} -tileSize=10 -ooc=/san/sanvol1/scratch/danRer3/danRer3_10.ooc {check out line+ /san/sanvol1/scratch/danRer3/rhMap/psl/$(root1)_$(root2).psl} #ENDLOOP '_EOF_' # << this line keeps emacs coloring happy # gensub2 genome.lst rhmap.lst gsub spec gensub2 genome.lst rhMap.lst gsub spec para create spec para try, check, push, check etc. # para time # Completed: 15149 of 15149 jobs # CPU time in finished jobs: 16326s 272.09m 4.53h 0.19d 0.001 y # IO & Wait Time: 41360s 689.34m 11.49h 0.48d 0.001 y # Average job time: 4s 0.06m 0.00h 0.00d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 61s 1.02m 0.02h 0.00d # Submission to last job: 263s 4.38m 0.07h 0.00d cd /cluster/data/danRer3/bed/ZonLab/rhMap/blatRun # Make & check the psl table # Do sort, best in genome filter, and convert to chromosome coordinates # to create rhmap.psl pslSort dirs raw.psl tmp psl pslReps -nearTop=0.0001 -minAli=0.80 -minCover=0.20 raw.psl \ contig.psl /dev/null # There are 11514 sequences in total in rhMap.fa # Experimented with different parameters: # little difference if STS markers BLAT parameters were used # i.e. -ooc=11.ooc and -stepSize=5. # For Blat parameters used above (-ooc=10.ooc and -tileSize=10), try # different pslReps parameters using minCover=0.40 and nearTop=0.0001: # minAli=0.96, 83%, most aligned sequence has 11 alignments. # minAli=0.90, 88% align, most aligned seq has 11 alignments # minAli=0.80, 88%, 10120 sequences aligned. # at minAli=0.50, there are still 10120 sequences aligned so those that # are not aligning must have very low sequence identity. Took a look at # some that are not aligning e.g. 2217C, 2791C and these are not passing # the minCover=0.40 criterion. Some sequences have Ns in them too # e.g. ZC92E13.YBF so has a lot of short alignments that do not pass # the minCover parameter. 
    # Lowering minCover increases the number of sequences aligned:
    # minAli=0.80, minCover=0.20, there are 10850 (94%) of sequences aligned.
    # minAli=0.90, minCover=0.20, there are 10837 (94%) of sequences aligned
    # with 21 fewer alignments than for minAli=0.80.
    # Most alignments for one sequence is 99, second most is 11. There are
    # about 1851 sequences with more than 1 alignment (many of these
    # have 2 alignments) while for minAli=0.80 and minCover=0.40, there were
    # 1266 sequences with more than 1 alignment. With lower minCover, more
    # sequences align, but there are more sequences with higher numbers of
    # multiple alignments. At minCover=0.0, there is 1 sequence with 1353
    # alignments, the second largest number of alignments for 1 sequence
    # is 532, then 329 etc. So use minAli=0.80 and minCover=0.20 to get the
    # most sequences aligned without having sequences aligning too many times.
    # at minAli=0.80 and minCov=0.20, there are 10850 sequences aligned (94%).
    # 88% of sequences were aligned for danRer2.
    # merge together liftAll and scaffolds lift then lift psl to chrom level.
    cat /cluster/data/danRer3/liftSuperToChrom/liftNAandUnScaffoldsToChrom.lft \
        /cluster/data/danRer3/jkStuff/liftAll.lft \
        > /cluster/data/danRer3/jkStuff/liftAllPlusliftScaffolds.lft
    liftUp rhMap.psl \
        /cluster/data/danRer3/jkStuff/liftAllPlusliftScaffolds.lft \
        warn contig.psl
    # Got 30168 lifts
    pslCheck rhMap.psl
    # psl is ok
    # Load sequence alignments into database.
    ssh hgwdev
    cd /cluster/data/danRer3/bed/ZonLab/rhMap/blatRun
    # drop old table and reload (hartera, 2006-03-26)
    echo "drop table rhMap;" | hgsql danRer3
    hgLoadPsl danRer3 rhMap.psl
    # cleanup
    rm -r /san/sanvol1/scratch/danRer3/rhMap/psl
    rm psl para.results batch batch.bak spec
    rm -r err
    gzip *.psl
    # Copy sequences to gbdb if they are not already there.
    mkdir -p /gbdb/danRer3/rhMap
    ln -s \
      /cluster/data/danRer3/bed/ZonLab/rhMap/rhSequenceSubmit022306/rhMap.fa \
      /gbdb/danRer3/rhMap/rhMap022306.fa
    # then add sequences to database:
    # reloaded (hartera, 2006-03-26)
    hgLoadSeq danRer3 /gbdb/danRer3/rhMap/rhMap022306.fa
    # Note: the first time these sequences were loaded there was a problem:
    # 2215 were not loaded into the database. These all have names with
    # extensions like .YB, .YC etc. so remove them from extFile and seq.
    # Sequences with the same IDs are already in the seq table for the
    # BAC ends tracks so need to make these RH map names unique.
    hgsql -e 'delete from seq where extFile = 736113;' danRer3
    hgsql -e 'delete from extFile where id = 736113;' danRer3
    hgsql -e 'update history set errata = "Removed sequences. Error so not all sequences loaded." where ix = 23;' danRer3
    # Check that all the headers from rhMap.headers are also in the primers
    # file which seems to contain the same headers as the FASTA file
    # as well as additional markers.
    ssh kkstore02
    cd /cluster/data/danRer3/bed/ZonLab/rhMap/rhSequenceSubmit022306
    perl -pi.bak -e 's/>//' rhMap.headers
    sort rhMap.headers > rhMap.headers.sort
    sort rhMapPrimers022306.txt > rhMapPrimers.sort
    wc -l *.sort
    # 11514 rhMap.headers.sort
    # 13437 rhMapPrimers.sort
    comm -12 rhMap.headers.sort rhMapPrimers.sort | wc -l
    # 11514 in common
    # so all FASTA headers from rhMap022306.fa are in the primers file
    # Get headers again from the edited FASTA file as the names of the
    # sources have been changed. Parse out information from the headers to
    # add to an rhMapInfo table so that this information can be displayed
    # on the details page for the RH map markers.
    # Fields: 1 - name, 2 - linkage group (chrom), 3 - position number on
    # the RH map for that linkage group, 4 - distance (in cR) from the top
    # of the linkage group (this column can also be read as the position
    # number in the entire RH map, ordered from LG1 to LG25), 5 - type of
    # marker (SSLP, BAC_END, EST, GENE, STS), 9 - source, 10 - institute
    # that mapped the marker, 11 - 5' forward primer, 12 - 3' reverse primer.
    grep '>' rhMap022306.fa > rhMap.headers2
    # then use the rhMap.headers2 file to extract the marker information
    # and to reformat the names for the FASTA headers to the format
    # >NAME.SOURCE where name is the first field separated by "|" and source
    # is the 9th field so that names in the rhMap and rhMapInfo tables are
    # the same. The source is used to make the name unique.
    cat << '_EOF_' > getRhInfo
#!/usr/bin/awk -f
#>z1396|14|418|5707|SSLP|f|cR|f|MGH|MPIEB|ATCCTTCAGCCACTCCTTCA|TGGAACCTGAAAAACACACG|
/^>/ {
    sub(/>/,"",$0);
    split(toupper($0), a, "\\|");
    print a[1]"."a[9]"\tLG"a[2]"\t"a[3]"\t"a[4]"\t"a[5]"\t"a[9]"\t"a[10]"\t"a[11]"\t"a[12];
    next;
}
'_EOF_'
    # << keep emacs coloring happy
    chmod +x getRhInfo
    getRhInfo rhMap.headers2 > rhMapInfo.tab
    # Sort headers by linkage group (LG) and by position
    sort -k 2,2 -k 3,3n rhMapInfo.tab > rhMapInfoSorted.tab
    wc -l rhMapInfoSorted.tab
    # 11514 rhMapInfoSorted.tab
    ssh hgwdev
    # Create a table with RH map item information including type, source,
    # origin and primer sequences.
    cat << 'EOF' > ~/kent/src/hg/lib/rhMapInfo.as
table rhMapInfo
"Radiation Hybrid map information"
    (
    string name;        "Name of Radiation Hybrid (RH) map marker"
    string linkageGp;   "Linkage group to which the marker was mapped"
    uint position;      "Position number in RH map for this linkage group"
    uint distance;      "Distance from the top of linkage group (cR)"
    string markerType;  "Type of marker"
    string source;      "Source of marker"
    string mapSite;     "Institution that mapped the marker"
    string leftPrimer;  "Forward primer sequence"
    string rightPrimer; "Reverse primer sequence"
    )
'EOF'
    # << happy emacs
    # create .sql, .c and .h files using autoSql
    autoSql rhMapInfo.as rhMapInfo
    mv rhMapInfo.h ../inc
    # rhMapInfo.sql - name is the primary key
    # commit rhMapInfo.as, .sql, .c and .h files to CVS.
    # create and load table (Reloaded: hartera, 2006-03-26)
    cd /cluster/data/danRer3/bed/ZonLab/rhMap/rhSequenceSubmit022306
    echo "drop table rhMapInfo;" | hgsql danRer3
    hgsql danRer3 < ~/kent/src/hg/lib/rhMapInfo.sql
    hgsql -e \
    'load data local infile "rhMapInfoSorted.tab" into table rhMapInfo' danRer3
    # edit danRer3/trackDb.ra to add the rhMap track and the search spec.
    # add and edit rhMap.html to describe the info data.
    # edit ~/kent/src/hg/hgc/hgc.c so that the rhMapInfo data is displayed
    # on the details page for each marker - edit the doRHmap function.
    # Add a rule to all.joiner to check that all names in rhMap also appear
    # in rhMapInfo.
    # commit these to CVS.
    # Changed termRegex for the rhMap search in trackDb.ra so that it works
    # for all IDs.
(2006-04-19, hartera) # SELF BLASTZ, CHAIN, NET, AXTNET, MAFNET AND DOWNLOADS # (DONE, 2005-12-02, hartera) ssh pk mkdir -p /cluster/data/danRer3/bed/blastzSelf.2005-11-30 cd /cluster/data/danRer3/bed ln -s blastzSelf.2005-11-30 blastzSelf cd /cluster/data/danRer3/bed/blastzSelf # make run directory on the san mkdir -p /san/sanvol1/scratch/danRer3/blastzSelf/chromsRun ln -s /san/sanvol1/scratch/danRer3/blastzSelf/chromsRun # make 2 bit file of chr1-25 and chrM cd /cluster/data/danRer3 faToTwoBit [1-9]/chr*.fa [12][0-9]/chr*.fa M/chrM.fa \ /san/sanvol1/scratch/danRer3/danRer3Chroms1to25andM.2bit cd /cluster/data/danRer3/bed/blastzSelf/chromsRun twoBitInfo /san/sanvol1/scratch/danRer3/danRer3Chroms1to25andM.2bit S1.len cp S1.len S2.len cat << '_EOF_' > DEF # zebrafish vs zebrafish export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/cluster/home/angie/schwartzbin:/parasol/bin BLASTZ=blastz.v7.x86_64 BLASTZ_L=5000 BLASTZ_H=2500 BLASTZ_M=50 BLASTZ_ABRIDGE_REPEATS=0 # TARGET: Zebrafish danRer3 SEQ1_DIR=/san/sanvol1/scratch/danRer3/danRer3Chroms1to25andM.2bit SEQ1_IN_CONTIGS=0 SEQ1_LIMIT=30 SEQ1_CHUNK=500000 SEQ1_LAP=5000 # QUERY: Zebrafish danRer3 SEQ2_DIR=/san/sanvol1/scratch/danRer3/danRer3Chroms1to25andM.2bit SEQ2_SELF=1 SEQ2_IN_CONTIGS=0 SEQ2_CHUNK=1800000000 SEQ2_LAP=0 BASE=/san/sanvol1/scratch/danRer3/blastzSelf/chromsRun DEF=$BASE/DEF RAW=$BASE/raw CDBDIR=$BASE SEQ1_LEN=$BASE/S1.len SEQ2_LEN=$BASE/S2.len TMPDIR=/scratch/tmp '_EOF_' chmod +x DEF ssh hgwdev cd /cluster/data/danRer3/bed/blastzSelf/chromsRun nice /cluster/bin/scripts/doBlastzChainNet.pl \ -bigClusterHub=pk \ -smallClusterHub=pk \ -workhorse=pk \ -fileServer=kolossus \ -chainMinScore=5000 \ -chainLinearGap=medium \ `pwd`/DEF >& do.log & # Start: Wed Nov 30 17:07 Finish: Thur Dec 1 06:51 # Crashed at downloads step as these exist from previous run so remove rm -r /usr/local/apache/htdocs/goldenPath/danRer3/vsSelf # para time (blastz) # Completed: 2425 of 2425 jobs # CPU time in finished jobs: 4783120s 79718.66m 1328.64h 55.36d 0.152 y # IO & Wait Time: 108014s 1800.24m 30.00h 1.25d 0.003 y # Average job time: 2017s 33.62m 0.56h 0.02d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 2762s 46.03m 0.77h 0.03d # Submission to last job: 14993s 249.88m 4.16h 0.17d # para time (axtChain) # Completed: 26 of 26 jobs # CPU time in finished jobs: 96405s 1606.74m 26.78h 1.12d 0.003 y # IO & Wait Time: 731s 12.19m 0.20h 0.01d 0.000 y # Average job time: 3736s 62.27m 1.04h 0.04d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 7405s 123.42m 2.06h 0.09d # Submission to last job: 7411s 123.52m 2.06h 0.09d # Carry on from downloads step. cd /cluster/data/danRer3/bed/blastzSelf/chromsRun nice /cluster/bin/scripts/doBlastzChainNet.pl \ -bigClusterHub=pk \ -smallClusterHub=pk \ -workhorse=pk \ -fileServer=kolossus \ -continue download \ -chainMinScore=5000 \ -chainLinearGap=medium \ `pwd`/DEF >& doDownloads.log & # Took 2 minutes. # check trackDb entry exists. Put html at danRer3 level of trackDb and edit # these and the downloads README to state that chrNA and chrUn were not # aligned for this track. # Remove extra downloads made by script: # Only chain track is pushed to the RR so remove the net and axtNet # downloads, re-make md5sum.txt and edit README.txt accordingly. 
ssh hgwdev cd /usr/local/apache/htdocs/goldenPath/danRer3/vsSelf rm danRer3.danRer3.net.gz md5sum.txt rm -r axtNet md5sum *.gz > md5sum.txt # Original run with loose linear gap matrix and scaffolds for chrNA and chrUn # done 2005-10-26. # filtering chains from above on minScore 10,000. done 2005-11-18 # Using the medium linear gap matrix for axtChain. minScore=5,000. # done 2005-11-30. # chainSelf - loose linearGap matrix, filtered minScore=5000 # chainSelfFilt10k - loose linearGap matrix, filtered minScore=10000 # chainSelfMedGap - medium linearGap matrix, filtered minScore=5000 # featureBits -chrom=chr1 danRer3 refGene:cds chainSelfLink -enrichment # refGene:cds 0.743%, chainSelfLink 65.056%, both 0.560%, cover 75.29%, # enrich 1.16x # featureBits -chrom=chr1 danRer3 refGene:cds chainSelfFilt10kLink -enrichment # refGene:cds 0.743%, chainSelfFilt10kLink 64.019%, both 0.554%, cover 74.54%, # enrich 1.16x # number of rows in tables for chr1: # chainSelf 941416 # chainSelfFilt10k 530292 # chainSelfMedGap 997525 # chainSelfLink 9110071 # chainSelfFilt10kLink 7226815 # chainSelfMedGapLink 9149100 # featureBits -chrom=chr1 danRer3 refGene:cds chainSelfMedGapLink -enrichment # refGene:cds 0.743%, chainSelfMedGapLink 64.525%, both 0.549%, cover 73.80%, # enrich 1.14x # so the medium linearGap matrix increases the number of chains by about 5% # but coverage is little different. # for the chains filtered with minScore=10000 # 12192577 chains out of 17592225 do not have chrNA or chrUn as query or # target which is about 69%. # 12192577 out of 12807964 do not have chrNA or chrUn as the query for just # chr1-25 and chrM which is about 95%. # so make the chains without chrNA and chrUn and using the medium linearGap # matrix which is for species that are not so distant. # 2005-12-02 # medium linearGap matrix for axtChain, minScore=5000 and no chrNA or chrUn. # number of rows in tables for chr1: # chainSelf 943482 # chainSelfLink 8707208 # featureBits -chrom=chr1 danRer3 refGene:cds chainSelfLink -enrichment # refGene:cds 0.743%, chainSelfLink 60.876%, both 0.503%, cover 67.65%, # enrich 1.1 # coverage dropped about 8% without chrNA and chUn alignments so not a # huge difference. # BLASTZ SWAP FOR HUMAN (hg18) (DONE, 2005-12-24, hartera) # CREATE CHAIN AND NET TRACKS, AXTNET, MAFNET AND ALIGNMENT DOWNLOADS ssh hgwdev # Blastz requires lineage-specific repeats # Treat all repeats as lineage-specific for all alignments except those # involving danRer3 chrUn and chrNA where the dynamic masking # functionality of Blastz was used. hg18 random chroms were aligned # as contigs and danRer3 chrNA and chrUn were aligned as scaffolds - # see zebrafish (danRer3) chain and net track section in makeHg18.doc # for further details. # do swap of hg18 vs. danRer3 chain and net alignments to # create danRer3 vs. hg18 see makeHg18.doc for details. cd /cluster/data/hg18/bed/blastz.danRer3/chromsRun # edit DEF file and add location of danRer3 and hg18 lineage-specific # repeats - move chrUn and chrNA lineage-specific repeats into a tmp # directory as they were not used. nohup nice /cluster/bin/scripts/doBlastzChainNet.pl \ -bigClusterHub=pk -swap -chainMinScore=5000 \ -chainLinearGap loose `pwd`/DEF >& doSwap.log & # Took about 27 minutes. # Blastz parameters are as for hg18 vs. danRer3 - see makeHg18.doc # BLASTZ_H=2000 # BLASTZ_Y=3400 # BLASTZ_L=6000 # BLASTZ_K=2200 # BLASTZ_Q=/cluster/data/blastz/HoxD55.q # BLASTZ_ABRIDGE_REPEATS=1 # make html files and trackDb.ra entry for chain and net tracks. 
    # check README.txt for downloads.
    # featureBits -chrom=chr2 danRer3 refGene:cds chainHg18Link -enrichment
    # refGene:cds 0.767%, chainHg18Link 4.370%, both 0.607%, cover 79.15%,
    # enrich 18.11x
    # featureBits -chrom=chr2 danRer2 refGene:cds chainHg17Link -enrichment
    # refGene:cds 0.769%, chainHg17Link 4.576%, both 0.605%, cover 78.69%,
    # enrich 17.20x
    # Similar coverage and enrichment as for danRer2 vs hg17 but there are
    # fewer chains: 7057 for hg18 on danRer3, 1111 for hg17 on danRer2 (chr1).

    # 5-WAY VAR_MULTIZ ALIGNMENTS (DONE, 2006-02-06, hartera)
    # MAF ANNOTATION ADDED (DONE, 2006-02-06, braney)
    # FINISHED MAKING TREE IMAGE FOR TRACK DESCRIPTION PAGE
    # (DONE, 2006-02-07, hartera)
    # Species: zebrafish(danRer3), human (hg18), mouse(mm7),
    # fugu(fr1) and tetraodon(tetNig1)
    # Opossum (monDom2) was dropped since there were many more alignments
    # for monDom2 than monDom1 and the chains were shorter on average. The
    # reason for this is unknown so they will not be included in the
    # conservation track at this time.
    # rebuild frames to get bug fix, using 1-pass maf methodology
    # (2006-06-09 markd)
    ssh kkstore02
    mkdir -p /cluster/data/danRer3/bed/multiz5way
    cd /cluster/data/danRer3/bed/multiz5way
    mkdir mafLinks
    # set up directories for links to mafs for each pairwise alignment
    mkdir mafLinks/hg18
    mkdir mafLinks/mm7
    mkdir mafLinks/fr1
    mkdir mafLinks/tetNig1
    set dir=/cluster/data/danRer3/bed
    # need to make links to all the mafNet files for pairwise blastz
    # alignments for each species. Make sure files are all called chrN.maf.gz
    ln -s $dir/blastz.hg18.swap/mafNet/*.maf.gz ./mafLinks/hg18
    ln -s $dir/blastz.mm7.swap/mafNet/*.maf.gz ./mafLinks/mm7
    ln -s $dir/blastz.fr1/mafNet/*.maf.gz ./mafLinks/fr1
    ln -s $dir/blastz.tetNig1.2005-10-11/chromsAndScafsRun/mafNet/*.maf.gz \
        ./mafLinks/tetNig1
    # copy files over to the san for the pitakluster cluster run
    ssh pk
    mkdir /san/sanvol1/scratch/danRer3/multiz5way
    cd /san/sanvol1/scratch/danRer3/multiz5way
    rsync -a --copy-links --progress \
        /cluster/data/danRer3/bed/multiz5way/mafLinks/ .
    # 277 Mb of data - took less than 1 minute
    mkdir penn
    cp -p /cluster/bin/penn/v10.5.x86_64/multiz-tba/multiz penn
    cp -p /cluster/bin/penn/v10.5.x86_64/multiz-tba/maf_project penn
    # Progressive alignment up the tree w/o stager,
    # using multiz.v10 (var_multiz)
    # Method: align internal subtrees (using 0 flag to var_multiz)
    # Then, align these to human (using 1 flag to var_multiz)
    # NOTE: must use maf_project after each multiz run, in order
    # to order output. Single-cov guaranteed by use of net MAF's,
    # so it is not necessary to run single_cov2.
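    # Before setting up the run, a quick sanity check (a sketch, not part
    # of the original build): make sure each pairwise mafNet directory on
    # the san has a maf for every chrom, since the per-chrom script below
    # silently passes a single input through when the other one is empty.
    foreach s (hg18 mm7 fr1 tetNig1)
        echo $s `ls $s/*.maf.gz | wc -l` maf files
    end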
    # make output dir and run dir
    cd /cluster/data/danRer3/bed/multiz5way
    mkdir -p maf
    mkdir -p run
    cd run
    # create scripts to run var_multiz on cluster
    cat > oneMultiz.csh << 'EOF'
#!/bin/csh -fe
set c = $1
set db = danRer3
set multi = /scratch/tmp/$db/multiz5way.$c
set pairs = /san/sanvol1/scratch/$db/multiz5way
set penn = $pairs/penn
# special mode --
# with 1 arg, cleanup
if ($#argv == 1) then
    echo "cleanup"
    echo "rm -fr $multi"
    rm -fr $multi
    echo "rmdir --ignore-fail-on-non-empty /scratch/tmp/$db"
    rmdir --ignore-fail-on-non-empty /scratch/tmp/$db
    exit
endif
# special mode --
# with 3 args, saves an alignment file
if ($#argv == 3) then
    echo "cp $multi/$2/$c.maf $3"
    ls -og $multi/$2/$c.maf
    cp $multi/$2/$c.maf $3
    exit
endif
set s1 = $2
set s2 = $3
set flag = $4
# locate input files -- in pairwise dir, or multiple dir
set d1 = $multi
set d2 = $multi
if (-d $pairs/$s1) then
    set d1 = $pairs
    set f1 = $d1/$s1/$c.maf.gz
    set t1 = /tmp/$s1.$c.maf
    zcat $f1 > $t1
else
    set f1 = $d1/$s1/$c.maf
    set t1 = /tmp/$s1.$c.maf
    cp -p $f1 $t1
endif
if (-d $pairs/$s2) then
    set d2 = $pairs
    set f2 = $d2/$s2/$c.maf.gz
    set t2 = /tmp/$s2.$c.maf
    zcat $f2 > $t2
else
    set f2 = $d2/$s2/$c.maf
    set t2 = /tmp/$s2.$c.maf
    cp -p $f2 $t2
endif
# write to output dir
set out = $multi/${s1}${s2}
mkdir -p $out
# check for empty input file
if (-s $t1 && -s $t2) then
    echo "Aligning $f1 $f2 $flag"
    $penn/multiz $t1 $t2 $flag $out/$c.unused1.maf \
        $out/$c.unused2.maf > $out/$c.full.maf
    cat $out/$c.full.maf $out/$c.unused1.maf $out/$c.unused2.maf > \
        $out/$c.tmp.maf
    echo "Ordering $c.maf"
    $penn/maf_project $out/$c.tmp.maf $db.$c > $out/$c.maf
    rm -f $t1 $t2
else if (-s $t1) then
    cp -p $t1 $out/$c.maf
    rm -f $t1
else if (-s $t2) then
    cp -p $t2 $out/$c.maf
    rm -f $t2
endif
'EOF'
    # << keep emacs coloring happy
    chmod +x oneMultiz.csh
    cp -p oneMultiz.csh \
        /san/sanvol1/scratch/danRer3/multiz5way/penn/oneMultiz.csh
    # Create 5way.nh file of tree. This was used in the distant past for
    # early versions of phastCons. Now, this is merely a convenient
    # reference to the tree under construction. This is also used to draw
    # a graphic tree as species5.nh, see below.
    cat << '_EOF_' > /cluster/data/danRer3/bed/multiz5way/5way.nh
((hg18,mm7),((tetNig1,fr1),danRer3))
'_EOF_'
    # << this line keeps emacs coloring happy
    # using the tree diagram as above, arrange these alignments
    # in order of the tree branches
    cat > allMultiz.csh << 'EOF'
#!/bin/csh -fe
# multiple alignment steps:
set c = $1
set db = danRer3
set s = "/san/sanvol1/scratch/$db/multiz5way/penn/oneMultiz.csh"
$s $c hg18 mm7 0
$s $c tetNig1 fr1 1
$s $c tetNig1fr1 hg18mm7 1
# get final alignment file
$s $c tetNig1fr1hg18mm7 /cluster/data/$db/bed/multiz5way/maf/$c.maf
#cleanup
$s $c
'EOF'
    # happy emacs
    chmod +x allMultiz.csh
    cat << 'EOF' > template
#LOOP
./allMultiz.csh $(root1) {check out line+ /cluster/data/danRer3/bed/multiz5way/maf/$(root1).maf}
#ENDLOOP
'EOF'
    awk '{print $1}' ../../../chrom.sizes > chrom.lst
    gensub2 chrom.lst single template jobList
    para create jobList
    para try, para check, para push, para check ... etc
    para time
    # Completed: 28 of 28 jobs
    # CPU time in finished jobs: 3546s 59.10m 0.98h 0.04d 0.000 y
    # IO & Wait Time: 115s 1.92m 0.03h 0.00d 0.000 y
    # Average job time: 131s 2.18m 0.04h 0.00d
    # Longest running job: 0s 0.00m 0.00h 0.00d
    # Longest finished job: 553s 9.22m 0.15h 0.01d
    # Submission to last job: 709s 11.82m 0.20h 0.01d
    # do not filter mafs as only removes a small fraction of alignments
    # better to keep them all.
    # Check for single column alignments (these just have a single base for
    # each species in the alignment). There should be none of these now.
    # Previously had to do a glueing step to deal with these. There are
    # none here.
    # Build maf annotation and load database (braney, 2006-02-06)
    cd /cluster/data/danRer3/bed/multiz5way
    mkdir anno
    cd anno
    cat ../maf/chr1.maf | awk "/^s/ {print \$2}" | sed "s/\..*$//" \
        | sort -u > species.names
    mkdir maf run
    cd run
    rm -f sizes nBeds
    for i in `cat ../species.names`
    do
        ln -s /cluster/data/$i/chrom.sizes $i.len
        ln -s /cluster/data/$i/$i.N.bed $i.bed
        echo $i.bed >> nBeds
        echo $i.len >> sizes
    done
    for i in ../../maf/*.maf
    do
        echo mafAddIRows -nBeds=nBeds -sizes=sizes $i \
            /cluster/data/danRer3/danRer3.2bit ../maf/`basename $i`
    done > jobs
    sh -x jobs
    ssh hgwdev
    cd /cluster/data/danRer3/bed/multiz5way/anno/maf
    cat *.maf | hgLoadMafSummary danRer3 multiz5waySummary stdin
    # Dropped unused indexes (2006-05-09 kate)
    # NOTE: this is not required in the future, as the loader
    # has been fixed to not generate these indexes
    hgsql danRer3 -e "alter table multiz5waySummary drop index chrom_2"
    hgsql danRer3 -e "alter table multiz5waySummary drop index chrom_3"
    mkdir /gbdb/danRer3/multiz5way
    for i in *.maf
    do
        ln -s `pwd`/$i /gbdb/danRer3/multiz5way
    done
    hgLoadMaf danRer3 multiz5way
    rm *.tab
    cd /cluster/data/danRer3/bed/multiz5way
    mkdir frames
    cd frames
    cp /cluster/data/mm7/bed/multiz17wayFrames/mkMafFrames .
    cp /cluster/data/mm7/bed/multiz17wayFrames/Makefile .
    # edit Makefile to correct species names
    mkdir -p /san/sanvol1/scratch/danRer3/multiz5wayFrames/maf
    for i in ../maf/*.maf
    do
        echo $i
        cp $i /san/sanvol1/scratch/danRer3/multiz5wayFrames/maf/
    done
    make getGenes
    make getFrames
    make loadDb
    ###
    # rebuild frames to get bug fix, using 1-pass maf methodology
    # (2006-06-09 markd)
    ssh kkstore02
    cd /cluster/data/danRer3/bed/multiz5way/frames
    mv mafFrames/ mafFrames.old
    nice tcsh  # easy way to get process niced
    (zcat ../maf/*.maf.gz \
        | time genePredToMafFrames danRer3 stdin stdout \
          danRer3 genes/danRer3.gp.gz fr1 genes/fr1.gp.gz \
          hg18 genes/hg18.gp.gz mm7 genes/mm7.gp.gz \
          tetNig1 genes/tetNig1.gp.gz \
        | gzip > multiz5way.mafFrames.gz) >& log &
    ssh hgwdev
    cd /cluster/data/danRer3/bed/multiz5way/frames
    hgLoadMafFrames danRer3 multiz5wayFrames multiz5way.mafFrames.gz >& log &
    # end of multiz5way annotation and load
    # create tree image - like 5way.nh but with common names
    # (hartera, 2006-02-07)
    ssh hgwdev
    cd /cluster/data/danRer3/bed/multiz5way
    cat << '_EOF_' > species5.nh
((human,mouse),((tetraodon,fugu),zebrafish))
'_EOF_'
    /cluster/bin/phast/$MACHTYPE/draw_tree -b -s species5.nh > species5.ps
    convert species5.ps 5way.jpg
    # using GIMP, edit tree and remove whitespace
    # Photoshop used to edit the image (kuhn, 2006-02-07)
    cp 5way.jpg /usr/local/apache/htdocs/images/phylo/danRer3_5way.jpg
    # change permissions for display
    chmod +r /usr/local/apache/htdocs/images/phylo/danRer3_5way.jpg
    # check for all.joiner entry for multiz5way - ok
    # add trackDb.ra entry in ~/kent/src/hg/makeDb/trackDb/zebrafish/danRer3:
    # track multiz5way
    # shortLabel 5-Way Conservation
    # longLabel 5-Way Vertebrate Multiz Alignment & Conservation
    # group compGeno
    # priority 104
    # visibility pack
    # color 0, 10, 100
    # altColor 0,90,10
    # type wigMaf 0.0 1.0
    # maxHeightPixels 100:40:11
    # yLineOnOff Off
    # autoScale Off
    # summary multiz5waySummary
    # speciesGroups vertebrate mammal
    # sGroup_mammal hg18 mm7
    # sGroup_vertebrate tetNig1 fr1
    # add this line to trackDb entry as above for the tree image (2006-02-07):
    # treeImage phylo/danRer3_5way.jpg
    # PHYLO-HMM (PHASTCONS) CONSERVATION TRACK FOR 5-WAY ALIGNMENT
    # (DONE, 2006-02-06, hartera)
    ssh kkstore02
    mkdir /cluster/data/danRer3/bed/multiz5way/cons
    cd /cluster/data/danRer3/bed/multiz5way/cons
    # create a starting-tree.mod based on chr5 (73 Mb), the largest chrom
    # apart from chrNA and chrUn
    /cluster/bin/phast/$MACHTYPE/msa_split ../maf/chr5.maf \
        --refseq ../../../5/chr5.fa --in-format MAF \
        --windows 100000000,1000 --out-format SS \
        --between-blocks 5000 --out-root s1
    # takes about 30 seconds
    /cluster/bin/phast/$MACHTYPE/phyloFit -i SS s1.*.ss \
        --tree "((danRer3,(tetNig1,fr1)),(mm7,hg18))" \
        --out-root starting-tree
    # took less than 1 minute
    rm s1.*ss
    # Get genome-wide average GC content (for all species together,
    # not just the reference genome). If you have a globally
    # estimated tree model, as above, you can get this from the
    # BACKGROUND line in the .mod file. E.g.,
    # ALPHABET: A C G T
    # ...
    # BACKGROUND: 0.307629 0.191708 0.192177 0.308486
    # add up the C and G:
    grep BACKGROUND starting-tree.mod | awk '{printf "%0.3f\n", $3 + $4;}'
    # 0.384 is the GC content from the tree model.
    # If you do *not* have a global tree model and you do not know your
    # GC content, you can get it directly from the MAFs with a command
    # like:
    # /cluster/bin/phast/$MACHTYPE/msa_view \
    #    --aggregate danRer3,tetNig1,fr1,mm7,hg18 -i MAF \
    #    -S /cluster/data/danRer3/bed/multiz5way/maf/chr*.maf > maf_summary.txt
    # This gives a GC content of 0.438, which is the value used in the
    # --gc argument below.
    # break up the genome-wide MAFs into pieces on the san filesystem
    ssh kkstore02
    mkdir -p /san/sanvol1/scratch/danRer3/cons/ss
    cd /san/sanvol1/scratch/danRer3/cons/ss
    bash
    for C in `awk '{print $1}' /cluster/data/danRer3/chrom.sizes`
    do
        if [ -s /cluster/data/danRer3/bed/multiz5way/maf/${C}.maf ]; then
            mkdir ${C}
            echo msa_split $C
            chrN=${C/chr/}
            /cluster/bin/phast/$MACHTYPE/msa_split \
                /cluster/data/danRer3/bed/multiz5way/maf/${C}.maf \
                --refseq /cluster/data/danRer3/${chrN}/${C}.fa \
                --in-format MAF --windows 1000000,0 --between-blocks 5000 \
                --out-format SS -I 1000 --out-root ${C}/${C}
        fi
    done
    # took about 20 minutes to run
    # Create a random list of 50 1 mb regions (do not use chrNA and chrUn)
    ls -1l chr*/chr*.ss | grep -v NA | grep -v Un | \
        awk '$5 > 4000000 {print $9;}' | randomLines stdin 50 ../randomSs.list
    # Set up parasol directory to calculate trees on these 50 regions
    ssh pk
    mkdir /san/sanvol1/scratch/danRer3/cons/treeRun1
    cd /san/sanvol1/scratch/danRer3/cons/treeRun1
    mkdir tree log
    # now set up cluster job to estimate model parameters. Parameters
    # will be estimated separately for each alignment fragment then
    # will be combined across fragments. When tuning target-coverage and
    # expected-length, come back to here and recalculate.
    # Create little script that calls phastCons with right arguments
    cat > makeTree.csh << '_EOF_'
#!/bin/csh -fe
set C=$1:h
mkdir -p log/${C} tree/${C}
/cluster/bin/phast/x86_64/phastCons ../ss/$1 \
    /cluster/data/danRer3/bed/multiz5way/cons/starting-tree.mod \
    --gc 0.438 --nrates 1,1 --no-post-probs --ignore-missing \
    --expected-length 12 --target-coverage 0.17 \
    --quiet --log log/$1 --estimate-trees tree/$1
'_EOF_'
    # emacs happy
    chmod a+x makeTree.csh
    # Make sure that the correct GC content is substituted in here. Notice
    # the target coverage of 0.17. Here we are going to aim
    # for 65% coverage of coding regions by conserved elements.
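    # For reference, phastCons derives its two HMM transition parameters
    # from these options as mu = 1/expected-length and
    # nu = (gamma/(1 - gamma)) * mu, where gamma is the target coverage.
    # The consEntropy output below reports mu=0.083333 and nu=0.017068 for
    # gamma=0.17 and omega=12; this little check (an illustration, not part
    # of the original run) reproduces those numbers:
    awk 'BEGIN {gamma=0.17; omega=12; mu=1/omega; nu=(gamma/(1-gamma))*mu; \
        printf "mu=%.6f nu=%.6f\n", mu, nu}'
    # mu=0.083333 nu=0.017068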
# Create gensub file cat > template << '_EOF_' #LOOP makeTree.csh $(path1) #ENDLOOP '_EOF_' # happy emacs # Make cluster job and run it gensub2 ../randomSs.list single template jobList para create jobList para try,check,push,check etc. # para time # Completed: 50 of 50 jobs # CPU time in finished jobs: 714s 11.90m 0.20h 0.01d 0.000 y # IO & Wait Time: 132s 2.20m 0.04h 0.00d 0.000 y # Average job time: 17s 0.28m 0.00h 0.00d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 26s 0.43m 0.01h 0.00d # Submission to last job: 353s 5.88m 0.10h 0.00d # Now combine parameter estimates. We can average the .mod files # using phyloBoot. This must be done separately for the conserved # and nonconserved models ssh kkstore02 cd /san/sanvol1/scratch/danRer3/cons/treeRun1 ls tree/chr*/*.cons.mod > cons.txt /cluster/bin/phast/$MACHTYPE/phyloBoot --read-mods '*cons.txt' \ --output-average ../ave.cons.mod > cons_summary.txt ls tree/chr*/*.noncons.mod > noncons.txt /cluster/bin/phast/$MACHTYPE/phyloBoot --read-mods '*noncons.txt' \ --output-average ../ave.noncons.mod > noncons_summary.txt cd .. cp -p ave.*.mod /cluster/data/danRer3/bed/multiz5way/cons # measuring entropy # consEntropy # ave.cons.mod ave.noncons.mod --NH 9.78 # never stops with the --NH argument # target entropy should be L_min*H=9.8 bits, (between 9.5 to 10.5 is ok) # the expected length that produces this entropy is the one # to use for phastCons. /cluster/bin/phast/$MACHTYPE/consEntropy 0.17 12 \ ave.cons.mod ave.noncons.mod # -target-coverage=0.17 -expected-lengths 12 #Transition parameters:gamma=0.170000,omega=12.000000, mu=0.083333, nu=0.017068 # Relative entropy: H=0.618383 bits/site # Expected min. length: L_min=17.978234 sites # Expected max. length: L_max=10.983828 sites # Phylogenetic information threshold: PIT=L_min*H=11.117434 bits # then the above steps from creating the treeRun directory onwards were # repeated with the target coverage and expected lengths parameters set as # below: # -target-coverage=0.25 -expected-lengths 12 #Transition parameters:gamma=0.250000, omega=12.000000, mu=0.083333,nu=0.027778 #Relative entropy: H=0.637721 bits/site #Expected min. length: L_min=15.535855 sites #Expected max. length: L_max=10.157133 sites #Phylogenetic information threshold: PIT=L_min*H=9.907536 bits #### !!! THESE PARAMETERS BELOW WERE THOSE THAT WERE FINALLY USED #### # Parameters used for danRer2 6-way conservation track: # -target-coverage=0.35 -expected-lengths 18 #Transition parameters:gamma=0.350000,omega=18.000000, mu=0.055556, nu=0.029915 # Relative entropy: H=0.592725 bits/site # Expected min. length: L_min=16.435656 sites # Expected max. length: L_max=12.564154 sites # Phylogenetic information threshold: PIT=L_min*H=9.741828 bits # need to iterate and get the right coverage and parameters # try running phastCons below with parameters used above and check the # coverage of coding regions by the most conserved elements # Create cluster dir to do main phastCons run ssh pk mkdir -p /san/sanvol1/scratch/danRer3/cons/consRun1 cd /san/sanvol1/scratch/danRer3/cons/consRun1 mkdir ppRaw bed cp -p /san/sanvol1/scratch/danRer3/cons/ave.*.mod . 
    # Create script to run phastCons with right parameters
    # This job is I/O intensive in its output files, thus it is all
    # working over in /scratch/tmp/
    cat > doPhast.csh << '_EOF_'
#!/bin/csh -fe
mkdir /scratch/tmp/${2}
cp -p ../ss/${1}/${2}.ss ave.*.mod /scratch/tmp/${2}
pushd /scratch/tmp/${2} > /dev/null
/cluster/bin/phast/x86_64/phastCons ${2}.ss ave.cons.mod,ave.noncons.mod \
    --expected-length 18 --target-coverage 0.35 --quiet \
    --seqname ${1} --idpref ${1} --viterbi ${2}.bed --score > ${2}.pp
popd > /dev/null
mkdir -p ppRaw/${1}
mkdir -p bed/${1}
mv /scratch/tmp/${2}/${2}.pp ppRaw/${1}
mv /scratch/tmp/${2}/${2}.bed bed/${1}
rm /scratch/tmp/${2}/ave.*.mod
rm /scratch/tmp/${2}/${2}.ss
rmdir /scratch/tmp/${2}
'_EOF_'
    # emacs happy
    chmod a+x doPhast.csh
    # root1 == chrom name, file1 == ss file name without .ss suffix
    # Create gsub file
    cat > template << '_EOF_'
#LOOP
doPhast.csh $(root1) $(file1)
#ENDLOOP
'_EOF_'
    # happy emacs
    # Create parasol batch and run it
    ls -1 ../ss/chr*/chr*.ss | sed 's/.ss$//' > in.list
    gensub2 in.list single template jobList
    para create jobList
    para try/check/push/etc.
    # combine predictions and transform scores to be in 0-1000 interval
    ssh kkstore02
    cd /san/sanvol1/scratch/danRer3/cons/consRun1
    # The sed's and the sort get the file names in chrom,start order
    find ./bed -type f | sed -e "s#/# x #g; s#\.# y #g; s#-# z #g" \
        | sort -k7,7 -k9,9n \
        | sed -e "s# z #-#g; s# y #\.#g; s# x #/#g" | xargs cat \
        | awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", $1, $2, $3, $5, $5;}' \
        | /cluster/bin/scripts/lodToBedScore /dev/stdin > mostConserved.bed
    # ~ 1 minute
    cp -p mostConserved.bed /cluster/data/danRer3/bed/multiz5way
    # Figure out how much is actually covered by the mostConserved data:
    cd /cluster/data/danRer3
    faSize */chr*.fa
    # 1644032962 bases (48201758 N's 1595831204 real 816464533 upper
    # 779366671 lower) in 28 sequences in 28 files
    # The non-N size is 1595831204 bases
    cd /cluster/data/danRer3/bed/multiz5way
    awk '{sum+=$3-$2} \
END{printf "%% %.2f = 100.0*%d/1595831204\n",100.0*sum/1595831204,sum}' \
        mostConserved.bed
    # for -target-coverage=0.35 and -expected-length=18:
    # % 3.06 = 100.0*48883581/1595831204
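    # A per-chrom breakdown of the conserved bases can help spot outliers
    # before loading (a sketch along the same lines as the genome-wide
    # number above):
    awk '{cov[$1] += $3 - $2} \
        END {for (c in cov) printf "%s\t%d\n", c, cov[c]}' \
        mostConserved.bed | sort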
    ssh hgwdev
    cd /cluster/data/danRer3/bed/multiz5way
    # get an or of refGene and mgcGenes CDS regions
    featureBits danRer3 refGene:cds mgcGenes:cds -or -bed=refSeqOrMgcCds.bed
    # 11338034 bases of 1630323462 (0.695%) in intersection
    featureBits danRer3 refSeqOrMgcCds.bed mostConserved.bed -enrichment
    # refSeqOrMgcCds.bed 0.695%, mostConserved.bed 2.998%, both 0.464%,
    # cover 66.71%, enrich 22.25x
    # so use this result for -target-coverage=0.35 -expected-lengths=18
    # with entropy (PIT) value of 9.74 (aiming for around 9.8) and
    # 66.7% coverage of coding regions with most conserved elements
    # (aiming for about 65%)
    # Load most conserved track into database
    ssh hgwdev
    cd /cluster/data/danRer3/bed/multiz5way
    hgLoadBed danRer3 phastConsElements mostConserved.bed
    # Loaded 552331 elements of size 5
    featureBits danRer3 mgcGenes:cds phastConsElements -enrichment
    # mgcGenes:cds 0.531%, phastConsElements 2.998%, both 0.363%,
    # cover 68.39%, enrich 22.81x
    featureBits danRer3 refGene:cds phastConsElements -enrichment
    # refGene:cds 0.658%, phastConsElements 2.998%, both 0.440%, cover 66.82%,
    # enrich 22.28x
    # Create merged posterior probability file and wiggle track data files.
    # The sed business gets the names sorted by chromName, chromStart
    # so that everything goes in numerical order into wigEncode
    ssh kkstore02
    cd /san/sanvol1/scratch/danRer3/cons/consRun1
    find ./ppRaw -type f | sed -e "s#/# x #g; s#\.# y #g; s#-# z #g" \
        | sort -k7,7 -k9,9n \
        | sed -e "s# z #-#g; s# y #\.#g; s# x #/#g" | xargs cat \
        | wigEncode stdin phastCons5way.wig phastCons5way.wib
    # takes a few minutes
    ls -l phastCons*
    # -rw-rw-r-- 1 hartera protein 198399845 Feb 6 16:05 phastCons5way.wib
    # -rw-rw-r-- 1 hartera protein 45304940 Feb 6 16:05 phastCons5way.wig
    cp -p phastCons5way.wi? /cluster/data/danRer3/bed/multiz5way/cons
    # Load gbdb and database with wiggle.
    ssh hgwdev
    cd /cluster/data/danRer3/bed/multiz5way/cons
    mkdir -p /gbdb/danRer3/wib
    ln -s `pwd`/phastCons5way.wib /gbdb/danRer3/wib/phastCons5way.wib
    # use this if you need to reload the table
    hgsql -e 'drop table phastCons5way;' danRer3
    # load table
    hgLoadWiggle danRer3 phastCons5way phastCons5way.wig
    # Create histogram to get an overview of all the data
    ssh hgwdev
    cd /cluster/data/danRer3/bed/multiz5way/cons
    bash
    time hgWiggle -doHistogram \
        -hBinSize=0.001 -hBinCount=1000 -hMinVal=0.0 -verbose=2 \
        -db=danRer3 phastCons5way > histogram.data 2>&1
    # real 2m33.069s
    # user 1m58.310s
    # sys 0m16.170s
    # create plot of histogram:
    cat << '_EOF_' > histo.gp
set terminal png small color \
    x000000 xffffff xc000ff x66ff66 xffff00 x00ffff xff0000
set size 1.4, 0.8
set key left box
set grid noxtics
set grid ytics
set title " Zebrafish danRer3 Histogram phastCons5 track"
set xlabel " phastCons5 score"
set ylabel " Relative Frequency"
set y2label " Cumulative Relative Frequency (CRF)"
set y2range [0:1]
set y2tics
set yrange [0:0.02]
plot "histogram.data" using 2:5 title " RelFreq" with impulses, \
     "histogram.data" using 2:7 axes x1y2 title " CRF" with lines
'_EOF_'
    # happy emacs
    gnuplot histo.gp > histo.png
    display histo.png &
    # add line: wiggle phastCons5way to trackDb.ra for multiz5way to
    # display the wiggle for the conservation track.
    # check all.joiner for entries for phastCons5way and
    # phastConsElements5way - ok
    # copy over html for multiz and edit.
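    # As one more check on the loaded wiggle (a sketch, not from the
    # original build), hgWiggle can summarize the phastCons scores straight
    # from the database table; the stats should show data values spanning
    # 0.0 to 1.0:
    hgWiggle -db=danRer3 -doStats phastCons5way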
    # PHASTCONS SCORES DOWNLOADABLES (DONE, 2006-02-07, hartera)
    # prepare compressed copy of ascii data values for downloads
    ssh kkstore02
    cd /san/sanvol1/scratch/danRer3/cons/consRun1
    cat << '_EOF_' > gzipAscii.sh
#!/bin/sh
TOP=`pwd`
export TOP
mkdir -p phastCons5Scores
for D in ppRaw/chr*
do
    C=${D/ppRaw\/}
    out=phastCons5Scores/${C}.data.gz
    echo "========================== ${C} ${D}"
    find ./${D} -type f | sed -e "s#/# x #g; s#\.# y #g; s#-# z #g" \
        | sort -k7,7 -k9,9n \
        | sed -e "s# z #-#g; s# y #\.#g; s# x #/#g" | xargs cat \
        | gzip > ${out}
done
'_EOF_'
    chmod +x gzipAscii.sh
    time ./gzipAscii.sh
    # 192.852u 8.835s 4:04.05 82.6% 0+0k 0+0io 1pf+0w
    # creates 331 Mb of data.
    # copy data for downloads
    ssh kkstore02
    mkdir /cluster/data/danRer3/bed/multiz5way/phastCons5wayScores
    cd /cluster/data/danRer3/bed/multiz5way/phastCons5wayScores
    rsync -a --progress \
        pk:/san/sanvol1/scratch/danRer3/cons/consRun1/phastCons5Scores/ .
    ssh hgwdev
    mkdir /usr/local/apache/htdocs/goldenPath/danRer3/phastCons5wayScores
    cd /usr/local/apache/htdocs/goldenPath/danRer3/phastCons5wayScores
    ln -s /cluster/data/danRer3/bed/multiz5way/phastCons5wayScores/*.gz .
    md5sum *.gz > md5sum.txt
    # copy over and edit README.txt from the hg17 phastCons.

    # MULTIZ 5-WAY DOWNLOADABLES (DONE, 2006-02-22, hartera)
    ssh hgwdev
    cd /usr/local/apache/htdocs/goldenPath/danRer3
    mkdir -p multiz5way
    cd multiz5way
    foreach f (/cluster/data/danRer3/bed/multiz5way/maf/*.maf)
        set c = $f:r:t
        echo $c
        nice gzip $f
        ln -s $f.gz .
    end
    md5sum *.gz > md5sum.txt
    # copy over README and edit for this 5-way multiple alignment

    ##################################################################
    # HGNEAR TABLES (also used by the Known Genes details page links)
    # GET LATEST PROTEIN SEQUENCE FOR ALL HGNEAR SPECIES
    # (DONE, 2006-02-10, hartera)
    #
    # For species with knownGene, use that; otherwise, download the latest
    # version of the main model organism database for this species.
    # Human: use knownGene proteins.
    # need to get hg18 peptide sequence:
    mkdir -p /cluster/data/hg18/bed/blastp
    cd /cluster/data/hg18/bed/blastp
    pepPredToFa hg18 knownGenePep known.faa
    #
    # Mouse: use knownGene proteins.
    # already done:
    # mkdir -p /cluster/data/mm7/bed/geneSorter/blastp
    # cd /cluster/data/mm7/bed/geneSorter/blastp
    # pepPredToFa mm7 knownGenePep known.faa
    # Rat: use knownGene proteins.
    # already done:
    # mkdir /cluster/data/rn3/bed/blastp
    # cd /cluster/data/rn3/bed/blastp
    # pepPredToFa rn3 knownGenePep known.faa
    # Fly: use FlyBase proteins - already done
    # /cluster/data/dm2/bed/flybase4.2/flybasePep.fa
    # Worm: use WormBase proteins.
    mkdir -p /cluster/data/ce2/bed/blastp
    cd /cluster/data/ce2/bed/blastp
    # Point a web browser at ftp://ftp.sanger.ac.uk/pub/databases/wormpep/
    # to find out the latest version. It is WormPep 154 so use that.
    wget --timestamping -O wormPep154.faa \
        ftp://ftp.sanger.ac.uk/pub/databases/wormpep/wormpep154/wormpep154
    # Yeast: use SGD proteins.
mkdir -p /cluster/data/sacCer1/bed/blastp cd /cluster/data/sacCer1/bed/blastp # get latest version - from Jan 26, 2006 wget -O orf_trans.fasta.jan26.gz \ ftp://genome-ftp.stanford.edu/pub/yeast/data_download/sequence/genomic_sequence/orf_protein/orf_trans.fasta.gz # rename old version of peptide sequences mv sgdPep.faa sgdPep.jan9.faa zcat orf_trans.fasta.jan26.gz > sgdPep.faa # HGNEAR PROTEIN BLAST TABLES (DONE, 2006-02-10, hartera) # RENAME SELF BLASTP TABLE AND CHANGE CONFIG.RA FILE (DONE, 2006-04-19, hartera) # NOTE: mmBlastTab was updated to mm8 as a result of running doHgNearBlastp.pl # for mm8 on 2006-03-13 (see makeMm8.doc). # RECREATE THE HGNEAR TABLES FOR RAT AND MOUSE TO UPDATE THEM # (DONE, 2006-05-31, hartera) # RE-MADE THE ZEBRAFISH BLASTP TABLES USING THE TRANSCRIPT ID INSTEAD OF THE # PEPTIDE ID FOR EACH SEQUENCE - FOR ALL OTHER SPECIES THE PEPTIDE SEQUENCES # ARE REPRESENTED BY THEIR KNOWN GENES TRANSCRIPT ID # (DONE, 2006-07-03, hartera) # CHANGED INDEX ON ensZfishBlastTab (DONE, 2006-11-03, hartera) ssh hgwdev mkdir -p /cluster/data/danRer3/bed/hgNearBlastp cd /cluster/data/danRer3/bed/hgNearBlastp # zebrafish vs fly table has already been created as a result of # creating the blastp table for dm2 (see makeDm2.doc) cat << _EOF_ > config.ra # Latest zebrafish vs. other Gene Sorter orgs: # human, mouse, rat, worm, yeast # zebrafish vs fly already done (dm2) targetGenesetPrefix ensZfish targetDb danRer3 queryDbs hg18 mm7 rn3 ce2 sacCer1 danRer3Fa /cluster/data/danRer3/bed/blastp/ensembl.faa hg18Fa /cluster/data/hg18/bed/blastp/known.faa mm7Fa /cluster/data/mm7/bed/geneSorter/blastp/known.faa rn3Fa /cluster/data/rn3/bed/blastp/known.faa ce2Fa /cluster/data/ce2/bed/blastp/wormPep154.faa sacCer1Fa /cluster/data/sacCer1/bed/blastp/sgdPep.faa buildDir /cluster/data/danRer3/bed/hgNearBlastp scratchDir /san/sanvol1/scratch/danRer3HgNearBlastp _EOF_ # << this line makes emacs coloring happy nice doHgNearBlastp.pl config.ra >& do.log & tail -f do.log # Took about 2 hours to finish. # The target geneset (self Blastp) should be prefixed with ensZfish # so change the config.ra and rename the table (2006-04-19, hartera) hgsql -e 'alter table flyBaseBlastTab rename ensZfishBlastTab;' danRer3 # Update mouse to mm8 and rat to rn4 mkdir updates cd updates hgsql -e 'drop table mmBlastTab;' danRer3 hgsql -e 'drop table rnBlastTab;' danRer3 cat << _EOF_ > config.ra # Update of zebrafish vs. other Gene Sorter orgs: # mouse mm8 and rat rn4 targetGenesetPrefix ensZfish targetDb danRer3 queryDbs mm8 rn4 danRer3Fa /cluster/data/danRer3/bed/blastp/ensembl.faa mm8Fa /cluster/data/mm8/bed/geneSorter/blastp/known.faa rn4Fa /cluster/data/rn4/bed/blastp/known.faa buildDir /cluster/data/danRer3/bed/hgNearBlastp/updates scratchDir /san/sanvol1/scratch/danRer3HgNearBlastp/updates _EOF_ # << this line makes emacs coloring happy nice doHgNearBlastp.pl config.ra >& do.log & tail -f do.log # Took about 25 minutes. # Need to remake all the BlastTab tables using the transcript Id instead # of the protein ID for zebrafish Ensembl Genes. # create ensZfishBlastTab and drBlastTab tables using the Ensembl # transcript Ids for the tables instead of the peptide Ids # (2006-07-03, hartera) ssh hgwdev # create the FASTA file of Ensembl peptide sequences with transcript IDs # there is a one to one relationship between these IDs. 
    cd /cluster/data/danRer3/bed/blastp
    # then create a fasta file of the sequences:
    pepPredToFa danRer3 ensPep ensPep.faa
    mkdir /cluster/data/danRer3/bed/hgNearBlastp/updates2
    cd /cluster/data/danRer3/bed/hgNearBlastp/updates2
    cat << _EOF_ > config.ra
# Latest zebrafish vs. other Gene Sorter orgs:
# human, mouse, rat, fly, worm, yeast
targetGenesetPrefix ensZfish
targetDb danRer3
queryDbs hg18 mm8 rn4 dm2 ce2 sacCer1
danRer3Fa /cluster/data/danRer3/bed/blastp/ensPep.faa
hg18Fa /cluster/data/hg18/bed/blastp/known.faa
mm8Fa /cluster/data/mm8/bed/geneSorter/blastp/known.faa
rn4Fa /cluster/data/rn4/bed/blastp/known.faa
dm2Fa /cluster/data/dm2/bed/flybase4.2/flybasePep.fa
ce2Fa /cluster/data/ce2/bed/blastp/wormPep154.faa
sacCer1Fa /cluster/data/sacCer1/bed/blastp/sgdPep.faa
buildDir /cluster/data/danRer3/bed/hgNearBlastp/updates2
scratchDir /san/sanvol1/scratch/danRer3HgNearBlastp/updates2
_EOF_
    # << this line makes emacs coloring happy
    nice doHgNearBlastp.pl config.ra >& do.log &
    tail -f do.log
    # Took about 45 minutes
    # update sacCer1 otherOrgs.ra to use danRer3 instead of danRer1
    # for drBlastTab.
    # also need to update:
    # dm1, hg{15,16,17}, mm{5,6,7}, rn{2,3}
    # Human (hg15 and hg16), Drosophila, mouse mm5 and rat all use danRer1.
    # Human hg17 and mouse mm6 and mm7 use danRer2.
    # Update these all to use the Zv5 (danRer3) Ensembl proteins.
    # Ensembl 38 (April 2006)
    ssh hgwdev
    cd /cluster/data/danRer3/bed/hgNearBlastp/updates2
    cat << _EOF_ > config2.ra
# Latest zebrafish vs. other Gene Sorter orgs:
# human, mouse, rat, fly - older databases
targetGenesetPrefix ensZfish
targetDb danRer3
queryDbs hg17 hg16 hg15 mm7 mm6 mm5 rn3 rn2 dm1
danRer3Fa /cluster/data/danRer3/bed/blastp/ensPep.faa
hg17Fa /cluster/data/hg17/bed/blastp/known.faa
hg16Fa /cluster/data/hg16/bed/blastp/known.faa
hg15Fa /cluster/data/hg15/bed/blastp/known.faa
mm7Fa /cluster/data/mm7/bed/geneSorter/blastp/known.faa
mm6Fa /cluster/data/mm6/bed/geneSorter/blastp/known.faa
mm5Fa /cluster/data/mm5/bed/geneSorter/blastp/known.faa
rn3Fa /cluster/data/rn3/bed/blastp/known.faa
rn2Fa /cluster/data/rn2/bed/blastp/known.faa
dm1Fa /cluster/data/dm1/bed/blastp/bdgp.faa
buildDir /cluster/data/danRer3/bed/hgNearBlastp/updates2
scratchDir /san/sanvol1/scratch/danRer3HgNearBlastp/updates2
_EOF_
    # << this line makes emacs coloring happy
    # create BlastTab tables for all queries vs target and no self blastp
    nice doHgNearBlastp.pl config2.ra -noSelf -queryOnly >& do2.log &
    tail -f do2.log
    # Took about 30 minutes
    # Update and commit hgGeneData and hgNearData files to make sure that
    # all queries and links now work for the transcript ID instead of
    # the peptide ID for the ensZfishBlastTab and drBlastTab tables.
    # Gene Sorter is very slow for danRer3. ensZfishBlastTab has an index
    # on both the query and target. All the other BlastTab tables have only
    # an index on the query so try dropping the index on the target.
    hgsql -e 'alter table ensZfishBlastTab drop index target;' danRer3
    # Gene Sorter still loads slowly.
    # The index is too short. hgLoadBlastTab was used to load the table and
    # its index on query is query(12). The first 12 characters are not
    # unique for the Ensembl IDs so extend to query(20).
    hgsql -e 'alter table ensZfishBlastTab drop index query;' danRer3
    hgsql -e 'create index query on ensZfishBlastTab (query(20));' danRer3
    # Much faster now.
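    # The query(12) prefix was useless here because zebrafish Ensembl
    # transcript IDs all start with ENSDART followed by digits, so nearly
    # every ID shares the same first 12 characters. A quick check of the
    # prefix selectivity (a sketch, not a command from the original work):
    hgsql -N -e 'select count(distinct left(query,12)), count(distinct left(query,20)), count(distinct query) from ensZfishBlastTab;' danRer3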
    # END OF HGNEAR STUFF
    ####################################################
    # GENE SET BASED ON ENSEMBL GENES (PROTEIN CODING GENES)
    # (in progress, 2005-11-23, hartera)
    # see ENSEMBL GENES section for documentation of creation of
    # the ensGene, ensGtp and ensPep tables and the track.
    # compare the Ensembl and Human Proteins tracks
    featureBits danRer3 refGene:cds ensGene:cds -enrichment
    # refGene:cds 0.658%, ensGene:cds 1.994%, both 0.589%, cover 89.60%,
    # enrich 44.94x
    featureBits danRer3 refGene:cds blastHg17KG -enrichment
    # refGene:cds 0.658%, blastHg17KG 1.292%, both 0.385%, cover 58.52%,
    # enrich 45.30x
    # little difference in enrichment and lower coverage for Human Proteins
    # so it seems like Ensembl is the best choice in terms of genome
    # coverage and intersection with RefSeq CDS regions.
    ssh kkstore02
    mkdir -p /cluster/data/danRer3/bed/ensGenes
    cd /cluster/data/danRer3/bed/ensGenes
    # use Ensembl's BioMart to download the Ensembl Genes UniProt IDs and
    # descriptions. For genes with no description, use the InterPro domain.
    # Go to http://www.ensembl.org/Multi/martview
    # Follow this sequence through the pages:
    # Page 1) Select the Ensembl dataset (now v38 here; v36 and v37 are the
    # same for Zv5 Danio rerio protein coding genes) and the Danio_rerio
    # choice (ZFISH5 here).
    # Hit next. 25541 entries total.
    # Ensembl 37 from Feb 2006 - this dataset is the same as for the
    # version 32 downloaded as above for the Ensembl Genes track.
    # (Checked on 2006-03-09, hartera)
    # Ensembl 38 from April 2006 - this dataset is the same as for the
    # version 32 downloaded as above for the Ensembl Genes track.
    # (Checked on 2006-05-31, hartera)
    # Page 2) In the GENE section, select Gene type as protein_coding.
    # Then hit next. There are now 22877 entries in this filtered version.
    # Page 3) Choose the "Features" Attribute Page from the pulldown menu
    # at the top. Make sure that under the GENE section, the Ensembl
    # Attributes checked are the Ensembl Transcript ID, External Gene ID
    # and the Description. Under External References, select Unified
    # UniProt accession, and ZFIN Primary ID. Under the Protein section,
    # select InterPro Description and InterPro ID under InterPro
    # Attributes. Select text, tab-separated for output. Choose gzip
    # compression. Hit export. Save as ensGeneInfo37Coding.tsv.gz. Same as
    # for Ensembl v36 so update to Ensembl v37. Ensembl v38 is the same too
    # so update to this version (2006-05-31, hartera). Also add External
    # Gene ID for the Ensembl Attributes.
    gunzip ensGeneInfo38Coding.txt.gz
    # this file has some errors in it - there is a newline character in the
    # middle of the descriptions for the genes with the following UniProt
    # IDs: Q5TYV0, Q5SPG7, Q5SPG5, Q5RIJ2, Q5RID3. This causes the table
    # to be loaded incorrectly. Edit the ensGeneInfo38Coding.txt file
    # manually to remove these extra newlines.
    # Repeat the above steps and get the Ensembl transcript ID from Ensembl
    # Attributes and then get EntrezGene ID, RefSeq DNA ID, and RefSeq
    # Peptide ID from the External References section. Select text,
    # tab-separated for output. Choose gzip compression. Hit export. Again
    # Ensembl v36 gives the same result for Danio rerio.
# Save as ensGeneInfo38Coding2.txt.gz cd /cluster/data/danRer3/bed/ensGenes gunzip ensGeneInfo38Coding2.txt.gz wc -l ensGeneInfo38* # 85607 ensGeneInfo38Coding.txt # 32457 ensGeneInfo38Coding2.txt # 85607 ensGeneInfo37Coding.tsv # 33233 ensGeneInfo37Coding2.tsv # find how many Transcripts have multiple SWISS-PROT IDs tail +2 ensGeneInfo38Coding.txt | awk '{FS="\t"} {OFS="\t"} \ {print $1, $2, $4}' > ensGene38UniProtandExtId.txt tail +2 ensGeneInfo38Coding.txt | awk '{FS="\t"} {OFS="\t"} \ {if ($2 != "") print $1, $4}' \ > ensGene38UniProt.txt sort ensGene38UniProt.txt | uniq > ensGene38UniProt.txt.uniq awk '{print $1}' ensGene38UniProt.txt.uniq | sort | uniq -c | sort -nr \ > ens38UniProt.count awk '{if ($1 > 1) print $2}' ens38UniProt.count \ > ens38UniProtMorethanOne.txt wc -l ens38UniProtMorethanOne.txt # 2257 ens38UniProtMorethanOne.txt awk '{if ($1 == 1) print $2}' ens38UniProt.count \ > ens38UniProtOnlyOne.txt wc -l ens38UniProtOnlyOne.txt # 8172 # get list of Ensembl transcripts with more than 1 UniProt ID and # the list of UniProt IDs. grep -f ens38UniProtMorethanOne.txt ensGene38UniProt.txt.uniq \ > ens38UniProtMorethanOne.uniProtIds # get list of Ensembl transcripts with more than 1 UniProt ID and # the list of UniProt IDs and external database IDs. sort ensGene38UniProtandExtId.txt | uniq \ > ensGene38UniProtandExtId.txt.uniq grep -f ens38UniProtMorethanOne.txt ensGene38UniProtandExtId.txt.uniq \ > ens38UniProtMorethanOne.uniProtandExtIds # to do blastp of Ensembl Proteins vs UniProt # (last uniProt update 2006-01-23): ssh hgwdev mkdir -p /cluster/data/danRer3/bed/ensGenes/blastDb cd /cluster/data/danRer3/bed/ensGenes/blastDb # create a table of Danio Rerio (Brachydanio rerio in UniProt) # SWISS-PROT sequences (2006-05-31) hgsql uniProt -e ' \ create table test.danioProt select protein.* from protein,accToTaxon \ where accToTaxon.taxon = 7955 and accToTaxon.acc = protein.acc;' # then create a fasta file of the sequences: pepPredToFa test danioProt danioUniProt.fa grep '>' danioUniProt.fa | wc -l # 14297 # then select just those UniProt IDs for the Ensembl Transcript IDs that # have multiple UniProt IDs associated with them. ssh kkstore02 cd /cluster/data/danRer3/bed/ensGenes/blastDb # get list of UniProt IDs awk '{print $2}' ../ens38UniProtMorethanOne.uniProtIds \ > ens38MultiUniProtIds.idsOnly sort ens38MultiUniProtIds.idsOnly | uniq \ > ens38MultiUniProtIds.idsOnly.uniq faSomeRecords danioUniProt.fa ens38MultiUniProtIds.idsOnly.uniq \ ens38DanioUniProt.fa # 4410 UniProt IDs but 4293 in the FASTA file so 117 are missing. grep '>' ens38DanioUniProt.fa | sort > uniProtSeq.ids perl -pi.bak -e 's/>//' uniProtSeq.ids comm -13 uniProtSeq.ids ens38MultiUniProtIds.idsOnly.uniq > uniProtMissing # these missing sequences are missing because the uniProt IDs are # secondary IDs. Find the primary ID. hgsql -N -e 'select o.acc, o.val from otherAcc as o, accToTaxon as a \ where o.acc = a.acc and a.taxon = 7955;' uniProt > otherAccs.zfish.txt wc -l otherAccs.zfish.txt # 321 otherAccs.zfish.txt grep -f uniProtMissing otherAccs.zfish.txt > uniProtMissing.otherAccs.txt # found 83 of them awk '{print $2}' uniProtMissing.otherAccs.txt | sort | uniq > otherAccsFound comm -13 otherAccsFound uniProtMissing > stillMissing # check list of deleted TrEMBL IDs - delac_tr.txt from Expasy site. sort delac_tr.txt > delac_tr.sort sort stillMissing > stillMissing.sort comm -12 delac_tr.sort stillMissing.sort | wc # 34. There are 34 in the stillMissing file and these are all in the # delac_tr.txt file. 
#This file lists the accession numbers of TrEMBL entries which have #been deleted from the database. Most deletions are due to the deletion of #the corresponding CDS in the source nucleotide sequence databases EMBL- #Bank/DDBJ/GenBank. In addition, some entries are recognised to be Open #Reading frames (ORFs) that have been wrongly predicted to code for #proteins. When there is enough evidence that these hypothetical proteins #are not real, we take the decision to remove them from TrEMBL. # Get the sequences for otherAccsFound from danioUniProt.fa awk '{print $1}' uniProtMissing.otherAccs.txt | sort | uniq \ > otherAccsFound.altAccs faSomeRecords danioUniProt.fa otherAccsFound.altAccs ens38DanioOtherAccs.fa grep '>' ens38DanioOtherAccs.fa | wc # 73 wc -l otherAccsFound.altAccs # 73 otherAccsFound.altAccs cat ens38DanioUniProt.fa ens38DanioOtherAccs.fa > ens38DanioAllUniProt.fa # create blastDb database ssh pk cd /cluster/data/danRer3/bed/ensGenes/blastDb mkdir format cd format mv ../ens38DanioAllUniProt.fa . /scratch/blast/formatdb -i ens38DanioAllUniProt.fa \ -t ensUniProt -n ensUniProt # Copy database over to the san mkdir -p /san/sanvol1/scratch/danRer3/ensGenes/blastDb cp ensUniProt* /san/sanvol1/scratch/danRer3/ensGenes/blastDb/ ssh hgwdev mkdir /cluster/data/danRer3/bed/ensGenes/blastp cd /cluster/data/danRer3/bed/ensGenes/blastp # get FASTA file of Ensembl sequences pepPredToFa danRer3 ensPep ensPep.fa # get list of Ensembl transcripts to use in Blastp cp ../blastDb/stillMissing . # need to remove the missing ones (those no longer in TrEMBL) from list grep -v -f stillMissing ../ens38UniProtMorethanOne.uniProtIds \ > ens38UniProt.uniProtIdsforBlastp # get final list of Ensembl Transcript Ids awk '{print $1}' ens38UniProt.uniProtIdsforBlastp | sort | uniq \ > ens38IdsOnlyForBlastp.txt wc -l ens38IdsOnlyForBlastp.txt # 2252 ens38IdsOnlyForBlastp.txt # grab the protein sequences just for these Ensembl Transcripts: faSomeRecords ensPep.fa ens38IdsOnlyForBlastp.txt ens38ForBlastp.fa # check that there are 2252 records # set up the Blastp run ssh pk cd /cluster/data/danRer3/bed/ensGenes/blastp # split Ensembl peptide sequences FASTA file into chunks for cluster mkdir split faSplit sequence ens38ForBlastp.fa 200 split/ens38 # make parasol run directory mkdir run cd run mkdir out # Make blast script cat << '_EOF_' > blastSome #!/bin/csh -ef setenv BLASTMAT /san/sanvol1/scratch/blast64/blast-2.2.11/data /san/sanvol1/scratch/blast64/blast-2.2.11/bin/blastall \ -p blastp -d /san/sanvol1/scratch/danRer3/ensGenes/blastDb/ensUniProt \ -i $1 -o $2 -e 0.01 -m 8 -b 1000 '_EOF_' # << keep emacs happy chmod +x blastSome # Make gensub2 file cat << '_EOF_' > gsub #LOOP blastSome {check in line+ $(path1)} {check out line out/$(root1).tab} #ENDLOOP '_EOF_' # << keep emacs happy # Create parasol batch echo ../split/*fa | wordLine stdin > split.lst gensub2 split.lst single gsub jobList para create jobList para try, check, push, check ... etc. # Completed: 190 of 190 jobs # CPU time in finished jobs: 279s 4.65m 0.08h 0.00d 0.000 y # IO & Wait Time: 2293s 38.22m 0.64h 0.03d 0.000 y # Average job time: 14s 0.23m 0.00h 0.00d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 30s 0.50m 0.01h 0.00d # Submission to last job: 37s 0.62m 0.01h 0.00d # Load these into a temporary database table. hgLoadBlastTab # picks the best hit for each of the queries (Ensembl peptide). 
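# For reference (an editorial note, not part of the original run): with
# -m 8, blastall writes one tab-separated line per hit with 12 columns:
# query, target, % identity, alignment length, mismatches, gap opens,
# qStart, qEnd, tStart, tEnd, e-value and bit score, which is the layout
# hgLoadBlastTab expects. A quick check that most queries got a hit
# (a sketch, run from the run directory):
# awk '{print $1}' out/*.tab | sort -u | wc -l
# should be close to the 2252 queries; queries with no hit at
# e-value <= 0.01 are simply absent from the output.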
ssh hgwdev cd /cluster/data/danRer3/bed/ensGenes/blastp/run/out time hgLoadBlastTab -maxPer=1 test ensUniProtBlastTab *.tab # 0.154u 0.008s 0:00.66 22.7% 0+0k 0+0io 0pf+0w # there were 2252 queries # BLASTP OF ALL ENS PEP VS ALL DANIO UNIPROT SEQS # Try doing Blastp again but this time using all the zebrafish UniProt # sequences as the database and all the Ensembl peptides as queries. # create blastDb database ssh pk cd /cluster/data/danRer3/bed/ensGenes/blastDb mkdir zfishUniProt cd zfishUniProt cp ../danioUniProt.fa . /san/sanvol1/scratch/blast64/blast-2.2.11/bin/formatdb \ -i danioUniProt.fa -t danioUniProt -n danioUniProt # Copy database over to the san mkdir -p /san/sanvol1/scratch/danRer3/ensGenes/blastDb/uniProt cp danioUniProt* /san/sanvol1/scratch/danRer3/ensGenes/blastDb/uniProt # split Ensembl peptide sequences FASTA file into chunks for cluster cd /cluster/data/danRer3/bed/ensGenes/blastp mkdir splitAll grep '>' ensPep.fa | wc -l # 32143 faSplit sequence ensPep.fa 8000 splitAll/ens38All # make parasol run directory mkdir runAll cd runAll mkdir out # Make blast script cat << '_EOF_' > blastSome #!/bin/csh -ef setenv BLASTMAT /san/sanvol1/scratch/blast64/blast-2.2.11/data /san/sanvol1/scratch/blast64/blast-2.2.11/bin/blastall \ -p blastp \ -d /san/sanvol1/scratch/danRer3/ensGenes/blastDb/uniProt/danioUniProt \ -i $1 -o $2 -e 0.01 -m 8 -b 1000 '_EOF_' # << keep emacs happy chmod +x blastSome # Make gensub2 file cat << '_EOF_' > gsub #LOOP blastSome {check in line+ $(path1)} {check out line out/$(root1).tab} #ENDLOOP '_EOF_' # << keep emacs happy # Create parasol batch echo ../splitAll/*fa | wordLine stdin > split.lst gensub2 split.lst single gsub jobList para create jobList para try, check, push, check ... etc. para time #Completed: 7609 of 7609 jobs #CPU time in finished jobs: 11414s 190.23m 3.17h 0.13d 0.000 y #IO & Wait Time: 401489s 6691.48m 111.52h 4.65d 0.013 y #Average job time: 54s 0.90m 0.02h 0.00d #Longest running job: 0s 0.00m 0.00h 0.00d #Longest finished job: 77s 1.28m 0.02h 0.00d #Submission to last job: 1096s 18.27m 0.30h 0.01d # Load these into a temporary database table. hgLoadBlastTab # picks the best hit for each of the queries (Ensembl peptide). 
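# Aside (a sketch, not run as part of this build): the identity and
# e-value filter applied in SQL below could also be applied to the raw
# tab files before loading, e.g. to keep hits with >= 95% identity and
# e-value <= 0.00001:
# awk -F'\t' '($3 >= 95) && ($11 <= 0.00001)' out/*.tab > ensAllFiltered.tab
# (awk evaluates e-value strings such as 4e-143 numerically here;
# ensAllFiltered.tab is a hypothetical file name for illustration.)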
ssh hgwdev cd /cluster/data/danRer3/bed/ensGenes/blastp/runAll/out # cat files together as argument list too long for hgLoadBlastTab foreach t (*.tab) cat $t >> ensAll.tab end time hgLoadBlastTab -maxPer=1 test ensUniProtAllBlastTab ensAll.tab # 4.168u 0.737s 0:06.03 81.0% 0+0k 0+0io 5pf+0w # filter these and select just those with identity >= 95% # and eValue <= 0.00001 hgsql -N -e 'select distinct(target) from ensUniProtAllBlastTab where \ identity >= 95 and eValue <= 0.00001;' test | sort > out # get 11910 UniProt IDs mapping to Ensembl transcripts # there are 11343 unique UniProt IDs in ensGeneInfo38Coding.txt # load the ensGeneInfo38Coding.txt file into a table cat << 'EOF' > ens38Zfish.sql CREATE TABLE ens38Zfish ( transcriptId varchar(255) not null, extDbId varchar(255) not null, description longblob not null, uniProt varchar(255) not null, zfinId varchar(255) not null, interProDesc longblob not null, interProId varchar(255) not null ); 'EOF' # << emacs chmod a+r ensGeneInfo38Coding* tail +2 ensGeneInfo38Coding.txt > ens38Coding.tab hgLoadSqlTab test ens38Zfish ens38Zfish.sql ens38Coding.tab hgsql -N -e 'select distinct(uniProt) from ens38Zfish;' test \ | sort > ens38Zfish.uniProt.uniq wc -l ens38Zfish.uniProt.uniq out # 11344 ens38Zfish.uniProt.uniq # 9208 out comm -12 ens38Zfish.uniProt.uniq out | wc # 8526 in common comm -13 ens38Zfish.uniProt.uniq out > fromBlastPOnly comm -23 ens38Zfish.uniProt.uniq out > fromEns38Only wc -l from* # 682 fromBlastPOnly # 2817 fromEns38Only # find out how many from fromEns38Only are on the list of deleted from # TrEMBL IDs comm -12 fromEns38Only ./blastDb/delac_tr.sort > deletedFromTrEMBL comm -13 deletedFromTrEMBL fromEns38Only > fromEns38Only2 # get list of transcripts matched to a UniProt by blastP that # are not in ens38Zfish hgsql -N -e 'select distinct(transcriptId) from ens38Zfish where uniProt = "";' test | sort > ens38Zfish.noUniProt hgsql -N -e 'select distinct(query) from ensUniProtAllBlastTab where \ identity >= 95 and eValue <= 0.00001;' test | sort > queryBlast.sort comm -12 queryBlast.sort ens38Zfish.noUniProt # 1967 # 9943 transcripts. # delac_sp.txt in ./blastDb - list of deleted SWISS-PROT IDs # as of May 30, 2006. 331 IDs. 
sort blastDb/delac_sp.txt > blastDb/delac_sp.sort
# compare to list of SP IDs that are not in Blastp hits
comm -12 blastDb/delac_sp.sort fromEns38Only2
# there are none in common
# get list of Danio rerio UniProt IDs
hgsql -N -e 'select distinct(acc) from danioProt;' test | sort \
    > danioProt.accs.uniq
comm -13 danioProt.accs.uniq fromEns38Only2
comm -12 danioProt.accs.uniq fromEns38Only2 > inuniProtAndfromEns38Only
hgsql -e 'create table test.ensBlastp select * from ensUniProtAllBlastTab where identity >= 95 and eValue <= 0.00001;' test
# wc -l in*Only
# 1967 inBlastpOnly
# 278 inEns38Only
# these are transcript IDs (counts superseded by the recount below)
# find the UniProt IDs for the 278 inEns38Only
cd test6/tmp
hgsql -N -e 'select distinct(query) from ensBlastp;' test | sort \
    > ensBlastp.tId.sort
hgsql -N -e 'select distinct(transcriptId) from ens38Zfish where uniProt = "";' test | sort > ens38ZfishwithUniProt.tId.sort
comm -13 ensBlastp.tId.sort ens38ZfishwithUniProt.tId.sort > inEns38Only
comm -23 ensBlastp.tId.sort ens38ZfishwithUniProt.tId.sort > inBlastpOnly
wc -l in*Only
# 9943 inBlastpOnly
# 19955 inEns38Only
wc -l *.sort
# 32143 ens38Zfish.tId.sort
# 11910 ensBlastp.tId.sort
# So there are 9943 that have Blastp hits assigned and 19955 in
# Ensembl 38 that do not have Blastp hits.
# find those with no description and also no UniProt ID:
# there are 21236, the same as the number with no description.
hgsql -N -e 'select distinct(transcriptId) from ens38Zfish where description = "" and uniProt = "";' test | sort > ens38ZfishNoDesc.tid.sort
# 21236 ens38ZfishNoDesc.tid.sort
# compare this to the set of transcript IDs in Ensembl 38 Only
# and for Blastp Only
comm -12 inEns38Only ens38ZfishNoDesc.tid.sort > noBlastHitNoDesc
comm -12 inBlastpOnly ens38ZfishNoDesc.tid.sort > blastHitNoDesc
wc -l *NoDesc
# 0 blastHitNoDesc
# 19712 noBlastHitNoDesc
# then get list of transcript IDs with no description in Ensembl 38 but
# do have a Blastp hit
comm -13 inEns38Only ens38ZfishNoDesc.tid.sort > blastpHitNoDesc.tid
wc -l blastpHitNoDesc.tid
# 1524 blastpHitNoDesc.tid
# These are sequences with a Blastp hit but no description
hgsql -N -e 'select distinct(target) from ensBlastp;' test \
    | sort > blastp.uniProt.sort
hgsql -N -e 'select distinct(uniProt) from ens38Zfish;' test \
    | sort > ens38.uniProt.sort
wc -l *uniProt.sort
# 9208 blastp.uniProt.sort
# 11344 ens38.uniProt.sort
# there are 8526 in common
comm -13 blastp.uniProt.sort ens38.uniProt.sort > ens38Only.uniProt
comm -23 blastp.uniProt.sort ens38.uniProt.sort > blastpOnly.uniProt
wc -l *.uniProt
# 682 blastpOnly.uniProt
# 2817 ens38Only.uniProt
# there are 80 in the ens38Only.uniProt list that are deleted from TrEMBL
# there are 3 in the blastpOnly.uniProt list that are deleted from TrEMBL
# Q503U2
# Q7SY13
# Q8AW80
# Remove these from each list:
comm -23 ens38Only.uniProt ../../blastDb/delac_tr.sort > ens38Only.uniProt2
comm -23 blastpOnly.uniProt ../../blastDb/delac_tr.sort > blastpOnly.uniProt2
# some of these will be ones where there were several SWISS-PROT IDs for
# each transcript ID and only one is chosen so the others are dropped.
# find how many of these ens38Only.uniProt2 are not in danioProt.accs.uniq
comm -13 ../../danioProt.accs.uniq ens38Only.uniProt2 \
    > ens38Only.uniProt.notinDanioProt
# there are 88 of these.
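# (A note on the comm idiom used heavily in this section: both input
# files must be sorted; comm -12 prints lines common to both files,
# comm -23 lines unique to the first file, and comm -13 lines unique
# to the second file.)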
# find list of zebrafish accs with alternative accs in uniProt hgsql -N -e 'select val from otherAcc as a, accToTaxon as t where a.acc = t.acc and taxon = 7955;' uniProt | sort | uniq > zfishVals.otherAccs.uniq comm -12 ens38Only.uniProt.notinDanioProt zfishVals.otherAccs.uniq \ # 88 so all of these have alternate accessions. # remove these from list so: comm -13 ens38Only.uniProt.notinDanioProt ens38Only.uniProt2 \ > ens38Only.uniProt3 wc -l ens38Only.uniProt3 # 2649 ens38Only.uniProt3 # find number of uniProt IDs belonging to transcript IDs that have multiple # uniProt IDs: ../../blastDb/ens38MultiUniProtIds.idsOnly.uniq is list of # uniProt IDs for such transcripts. comm -12 ens38Only.uniProt3 ../../blastDb/ens38MultiUniProtIds.idsOnly.uniq \ > ens38Only.multiUniProtIds # there are 2310 of these. comm -13 ens38Only.multiUniProtIds ens38Only.uniProt3 > ens38Only.uniProt4 # 339 of these left grep -f ens38Only.uniProt4 ../../ensGene38UniProt.txt \ > ens38Only.uniProt4.tIdAndUpId awk '{print $1}' ens38Only.uniProt4.tIdAndUpId | sort | uniq \ > ens38Only.uniProt4.tId.uniq wc -l ens38Only.uniProt4.tId.uniq # 368 ens38Only.uniProt4.tId.uniq # Do these all have SWISS-PROT IDs by Blastp? hgsql -N -e 'select distinct(query) from ensBlastp;' test | sort \ > ensBlastp.query.sort comm -12 ens38Only.uniProt4.tId.uniq ensBlastp.query.sort # 183 so remove these: comm -23 ens38Only.uniProt4.tId.uniq ensBlastp.query.sort \ > ens38Only.uniProt4.tId.noBlastp wc -l ens38Only.uniProt4.tId.noBlastp # 185 ens38Only.uniProt4.tId.noBlastp #e.g. ENSDART00000002826, this has only 91% ID to Q6DBUS (Q6NYR4 in BioMart # download. It is 91.7% ID to Q6DBUS in Blastp table. hgsql -e 'create table test.ensBlastp90 select * from ensUniProtAllBlastTab where identity >= 90 and eValue <= 0.00001;' test hgsql -N -e 'select distinct(query) from ensBlastp;' test | sort \ > ensBlastp.tId.sort hgsql -N -e 'select distinct(query) from ensBlastp90;' test | sort \ > ensBlastp90.tId.sort # transcript IDs in ensBlastp90 and not in ensBlastp comm -23 ensBlastp90.tId.sort ensBlastp.tId.sort > ensBlastp90Only.tId wc -l ensBlastp90Only.tId # 704 ensBlastp90Only.tId # check these against list of ens38 with no description comm -12 ens38ZfishNoDesc.tid.sort ensBlastp90Only.tId \ > ensBlastp90Only.noUniProtInEns38 # 416 # also check against list of ens38Only.uniProt4.tId.noBlastp comm -12 ens38Only.uniProt4.tId.noBlastp ensBlastp90Only.tId # 140 comm -23 ens38Only.uniProt4.tId.noBlastp ensBlastp90Only.tId \ > ens38Only.uniProt4.tId.noBlastp90 # 45 of these left # ENSDART00000009971 has only 48% Identity to Q5DTD0. maps to Q58EF8 on # Ensembl web page. # Check 10 alignments with >= 95% and 10 that have >= 90% and < 95% cd /cluster/data/danRer3/bed/ensGenes/blastp/runAll2/out # ens38Blastp.out has the alignments in NCBI format # 95-96% 226 # 96-97% 322 # 97-98% 526 # 98-99% 1333 # 99-100% 9503 (both inclusive) # lower score can be due to shorter query and target # for >= 95% identity (ensBlastp table in test db). Get BlastP results # and check Ensembl. All Ensembl records show the UniProt ID given below # except where noted. 
# Query Target Identity qLen qAli tLen tAli E-value Score misMatch Comment # ENSDART00000012253 Q9W6E8 99.51 609 609 609 609 0 978 3 # ENSDART00000013114 Q6NYT1 99.63 267 267 267 267 4e-143 502 1 # ENSDART00000067816 Q6NZZ8 95.78 433 433 471 460 0 838 2 query doesn't # begin with Met, no associated UniProt ID in Ensembl # ENSDART00000018931 Q9DG41 99.42 346 346 552 346 0 709 2 query is partial, # doesn't begin with Met # ENSDART00000023846 Q7ZUQ4 98.33 300 300 625 300 1e-179 624 5 query doesn't # begin with Met # ENSDART00000006095 Q6P2V4 99.32 443 443 443 443 0 941 3 # ENSDART00000039597 Q5G9L7 100 146 146 146 146 3e-81 295 0 100% coverage # ENSDART00000028930 Q90442 97.53 84 81 85 81 5e-42 164 2 # ENSDART00000028255 Q8JHY2 100 63 63 63 63 2e-32 132 0 # ENSDART00000042947 Q4QRH1 95.22 1849 456 479 452 0 808 10 alignment length = # 460 bp, Ensembl doesn't show a UniProt protein ID for this. # Maybe there is a coverage criterion. # >= 90% and < 95% identity from ensBlastP90 table in test db: # There are 705 of these. 11911 have identity >= 95% # Query Target Identity qLen qAli tLen tAli E-value Score misMatch Comment # ENSDART00000031211 Q6R5A4 94.21 779 779 846 789 0 1266 38 (gapOpen 6) # bases 66-846 of target is aligning. Ensembl does not have a UniProt ID # for this transcript. # ENSDART00000028390 Q5TKR3 90.87 241 240 243 241 1e-125 444 21 (gapOpen 1) # ENSDART00000053312 Q5SYD9 92.64 325 325 322 322 8e-175 608 19 (gapOpen 2) # ENSDART00000056703 Q5CZR2 91.02 323 323 323 323 7e-124 605 29 (gapOpen 0) # Ensembl has no UniProt ID for this transcript. 91 % ID to NP_001013324.1, # also 323 bp. # ENSDART00000044490 Q3ZMH2 90.74 992 985 1082 994 0 1682 64 (gapOpen 7) # Ensembl has no UniProt ID, just InterPro domains. # ENSDART00000031487 Q5RHD6 92.81 320 320 319 319 7e-172 598 22 (gapOpen 1) # Ensembl has no UniProt ID, just InterPro domain. # ENSDART00000020233 Q6DHI1 91.72 298 298 299 299 6e-145 508 18 (gapOpen 2) # ENSDART00000061435 Q6PBV8 93.72 76 76 76 76 2e-33 135 5 (gapOpen 0) # ENSDART00000056959 Q4V9F6 94.21 433 426 440 431 0 728 18 (gapOpen 2) # only InterPro domain given for Ensembl, no UniProt ID. # ENSDART00000040220 Q504G5 90.12 172 172 174 172 3e-100 358 17 (gapOpen 0) # only InterPro domain given for Ensembl, no UniProt ID. # ENSDART00000066247 Q58EK5 90.08 767 231 485 251 3e-124 441 3 (gapOpen 3) # only InterPro domain given for Ensembl, no UniProt ID. # for 95% identity and above, there are only 18 proteins that have # mismatch > 40. # for between 90-95% then there are 62 with mismatch > 40. # use grep -A 100 -w # look at examples with high mismatch but identity < 95%. # ---+------+--------+------+--------+----------+ # | query | target | identity | aliLength | mismatch | gapOpen # |qStart | qEnd | tStart | tEnd | eValue | bitScore | # ENSDART00000012435 | Q6IQX1 | 91.2 | 1932 | 163 | 5 # | 2 | 1931 | 3 | 1931 | 0 | 3093 | # this has a high number of mismatches but distributed throughout # the protein and the UniProt sequence aligns to the genome with the # same exon structure as for ENSDART00000012435. # ENSDART00000050066 | Q7M558 | 91.69 | 3008 | 249 | 1 # | 0 | 3008 | 0 | 3007 | 0 | 5543 | # this is a very large protein so the mismatch is small compared to # the protein size. has same exon structure as Ensembl protein at # chr17:18,247,969-18,259,468. Blats to several regions - could be a # processed pseudogene or assembly artifact. # If identity < 95% and mismatch > 40 then size is at least around 450bp. 
# ENSDART00000028708 | Q7T296 | 90.12 | 486 | 45 | 1
# | 0 | 486 | 18 | 501 | 0 | 907 |
# The most gaps in a sequence is 9 - only 1 sequence < 95% identity and
# most have 0-2 gaps. Same for those >= 95% identity.
# ENSDART00000039735 | Q7T1C9 | 98.15 | 1406 | 12 | 9
# | 0 | 1394 | 0 | 1404 | 0 | 2175 |
# Gaps are spread throughout the sequence and are short. Blat of this
# UniProt sequence gives the same exon structure as for the Ensembl seq.
# | ENSDART00000053813 | Q7M560 | 90.07 | 2275 | 104 | 9
# | 0 | 2178 | 99 | 2349 | 0 | 3966 |
# There are several large gaps in the first third of the sequence. The
# rest of the gaps are short. Ensembl does not have a UniProt ID for this
# transcript. Blat aligns this sequence to several places on the genome,
# all in close proximity to each other. One alignment corresponds to
# an Ensembl ID but not the one above. It does align to the region of
# ENSDART00000053813 but with a different exon structure.
# ENSDART00000044490 | Q3ZMH2 | 90.74 | 1004 | 64 | 7
# | 0 | 985 | 88 | 1082 | 0 | 1682 |
# This has a couple of larger gaps. The UniProt sequence aligns to the
# same region as ENSDART00000044490 which has 3 extra exons. There is
# another transcript with the same exon structure.
# | ENSDART00000041503 | Q3ZMH2 | 91.42 | 991 | 63 |
# 5 | 0 | 974 | 82 | 1068 | 0 | 1684 |
# This has only slightly higher identity.
# ENSDART00000025635 | Q4FE55 | 99.33 | 2545 | 6 | 7
# | 0 | 2542 | 0 | 2537 | 0 | 4859 |
# just short gaps. This Blats to the same region of ENSDART00000025635
# and gives the same exon structure.
# could filter more using pslReps but should not filter on minAli since
# either the query or target could be partial.
# Use identity >= 90% as the cutoff and then associate the RefSeqs with
# ZFIN IDs and update the official ZFIN Gene symbols.
# ssh hgwdev
# kkstore02
cd /cluster/data/danRer3/bed/ensGenes
mkdir alignments
cd alignments
# Add a proteinID column to the ensGene table:
ssh hgwdev
cd /cluster/data/danRer3/bed/ensGenes
# Add protein ID column:
hgsql -e 'alter table ensGene add proteinID varchar(40) NOT NULL;' danRer3
# Add index to this column:
hgsql -e 'alter table ensGene add index(proteinID);' danRer3
# Next step, download the ZFIN IDs and UniProt IDs
hgsql -e 'select count(*) from ensGene;' danRer3
# 32143
hgsql -e 'update ensGene set proteinID = "";' danRer3
# ensBlastp90 is the table in the test database where proteins have
# >= 90% identity to the Ensembl proteins.
hgsql -e 'select count(*) from ensGene as g, test.ensBlastp90 as p \
    where g.name = p.query;' danRer3
# for >= 90% there are
# 12614
# for >= 95%, there are
# 11910
# Use these UniProt IDs to fill in the proteinID column.
hgsql -e 'update ensGene as g, test.ensBlastp90 as p \
    set g.proteinID = p.target where g.name = p.query;' danRer3
# check that there are 12614 rows with proteinID filled.
hgsql -e 'select count(*) from ensGene where proteinID != "";' danRer3
# 12614
# once this is done, can create ensCanonical and ensIsoforms tables -
# see section on "BUILD GENE SORTER TABLES".
# Add table for Ensembl 38 Ensembl Transcript IDs and RefSeq IDs
# and Entrez Gene ID.
ssh hgwdev cd /cluster/data/danRer3/bed/ensGenes cat << 'EOF' > ens38Zfish2.sql CREATE TABLE ens38Zfish2 ( transcriptId varchar(255) not null, entrezGeneId varchar(255) not null, refSeqId varchar(255) not null, refSeqProtId varchar(255) not null ); 'EOF' # << emacs tail +2 ensGeneInfo38Coding2.txt > ens38Coding2.tab hgLoadSqlTab test ens38Zfish2 ens38Zfish2.sql ens38Coding2.tab # 24523 lines where there is no Entrez Gene Id so these are set to 0. hgsql -N -e 'select distinct(entrezGeneId) from ens38Zfish2;' test \ | sort > ens38Zfish2.geneId.uniq wc -l ens38Zfish2.geneId.uniq # 6764 ens38Zfish2.geneId.uniq hgsql -e 'select count(distinct extDbId) from ens38Zfish;' test # 9028 hgsql -N -e 'select distinct(extDbId) from ens38Zfish;' test \ | sort > ens38Zfish.extDbId.sort grep -v NM ens38Zfish.extDbId.sort > ens38Zfish.extDbIdNoNM.sort # 8982 left grep -v BRARE ens38Zfish.extDbIdNoNM.sort \ > ens38Zfish.extDbIdNoNMandNoSP.sort grep -v NP ens38Zfish.extDbIdNoNMandNoSP.sort \ > ens38Zfish.extDbIdNoNMNoSPNoNP.sort wc -l ens38Zfish.extDbIdNoNMNoSPNoNP.sort # 5284 ens38Zfish.extDbIdNoNMNoSPNoNP.sort awk '{print $2}' ens38/ensToRefSeqvsZFIN.txt | sort | uniq \ > ensToRefSeqvsZFIN.names.uniq # how many in common comm -12 ens38Zfish.extDbIdNoNMNoSPNoNP.sort ensToRefSeqvsZFIN.names.uniq \ > common wc -l common # 4176 common comm -23 ens38Zfish.extDbIdNoNMNoSPNoNP.sort ensToRefSeqvsZFIN.names.uniq \ > extDbIdNotfromZFINviaRefSeq hgsql -N -e 'select mrnaAcc from refLink where locusLinkId != "";' danRer3 | sort | uniq > mrnaAcc.refLink.dr3.uniq wc -l mrnaAcc.refLink.dr3.uniq # 8811 mrnaAcc.refLink.dr3.uniq comm -12 mrnaAcc.refLink.dr3.uniq ensToRefSeq.refseq | wc # 7738 wc -l ensToRefSeq.refseq # 7738 # merge the ens38Zfish2 table with ens38ZfishNew. # for the Known Genes details pages. Changed table name from # ensGeneXRef to ensXRefZfish as there are a number of tables already # with similar names to ensGeneXRef so this would be confusing. # create a table definition for ensXRefZfish: # (updated 2006-11-08, hartera) cd ~/kent/src/hg/lib cat << 'EOF' > ensXRefZfish.as table ensXRefZfish "Link from an Ensembl Transcript ID to other database IDs and description." ( string ensGeneId; "Ensembl Transcript ID" string zfinId; "ZFIN ID" string uniProtId; "Unified UniProt protein accession" string spDisplayId; "UniProt Display ID" string geneId; "ZFIN Gene Symbol (formerly LocusLink) ID" string geneSymbol; "Official ZFIN Gene Symbol" string refSeq; "RefSeq DNA Accession" string protAcc; "RefSeq Protein Accession" string description; "Description" ) 'EOF' autoSql ensXRefZfish.as ensXRefZfish mv ensXRefZfish.h ../inc # commit ensXRefZfish* files to CVS. # add zfinId, uniProtId, spDisplayId, geneId, geneSymbol, refSeq and # protAcc as keys. ensGeneId is already the primary key. # description field is not long enough so it must be changed to a # longblob. perl -pi.bak -e 's/description varchar\(255\)/description longblob/' \ ensXRefZfish.sql # get the gene2refseq file from NCBI to give the Entrez Gene ID # and symbol for refSeq accessions. Taxonomy ID is 7955 for Danio rerio. # columns in file are tax_id, GeneID, status, # RNA nucleotide accession.version, RNA nucleotide gi, # protein accession.version, protein gi, genomic nucleotide # accession.version, genomic nucleotide gi, start position on the genomic # accession, end position on the genomic accession, orientation. 
# for the gene_info file, column headings are:
# tax_id, GeneID, Symbol, LocusTag, Synonyms, dbXrefs, chromosome,
# map location, description, type of gene, Symbol from nomenclature
# authority, Full name from nomenclature authority, Nomenclature status.
# DOWNLOAD LATEST versions (from Nov. 8, 2006)
ssh kkstore02
mkdir /cluster/data/danRer3/bed/ensGenes/downloads
cd /cluster/data/danRer3/bed/ensGenes/downloads
wget --timestamp ftp://ftp.ncbi.nlm.nih.gov/gene/DATA/gene2refseq.gz
wget --timestamp ftp://ftp.ncbi.nlm.nih.gov/gene/DATA/gene_info.gz
gunzip gene2refseq.gz
gunzip gene_info.gz
# get records for taxon ID: 7955
awk '{if ($1 == 7955) print;}' gene2refseq > zfish.gene2refseq
wc -l zfish.gene2refseq
# 14659 zfish.gene2refseq
# 50465 zfish.gene2refseq - in March
# Most of the ones no longer in the gene2refseq file are
# PREDICTED, PROVISIONAL AND MODEL.
# 37206 MODEL
# 6278 PREDICTED
# 6174 PROVISIONAL
# 43 NA
# 13 Reviewed
# 5 REVIEWED
# 1 VALIDATED
# New sequences added:
# 7021 PROVISIONAL
# 6801 PREDICTED
# 52 NA
# 13 Reviewed
# 12 VALIDATED
# 10 INFERRED
# 5 REVIEWED
awk '{if ($1 == 7955) print;}' gene_info > zfish.gene_info
wc -l zfish.gene_info
# 38915 zfish.gene_info
# 38126 zfish.gene_info - in March
# checked that the Ensembl 38 genes for zebrafish are the same as
# for Ensembl 35 for which these files were downloaded (see above -
# updated file names to reflect v38).
# also download the file from ZFIN that gives gene Symbols, ZFIN IDs
# and RefSeq accessions. ZFIN associates more than one ZFIN ID with
# UniProt IDs but there is a one-to-one relationship for ZFIN IDs
# and RefSeq accessions. Therefore the RefSeq accessions can be used
# to identify a ZFIN ID and gene name and vice versa.
wget --timestamping http://zfin.org/data_transfer/Downloads/refseq.txt
# already loaded the ensGeneInfo38Coding.txt and ensGeneInfo38Coding2.txt
# files into tables so that the information can be put together.
# these are ens38Zfish and ens38Zfish2 in the test database.
# first copy the ens38Zfish table and then replace the uniProtId column
# with the best hits from the ensBlastp90 table.
ssh hgwdev
cd /cluster/data/danRer3/bed/ensGenes
sed -e 's/ens38Zfish/ens38ZfishNew/' ens38Zfish.sql > ens38ZfishNew.sql
# create table
hgsql test < ens38ZfishNew.sql
hgsql -e 'insert into ens38ZfishNew select * from ens38Zfish;' test
# Add spDisplayId column:
hgsql -e \
'alter table ens38ZfishNew add spDisplayId varchar(255) NOT NULL;' test
# add some indices
hgsql -e 'create index uniProt on ens38ZfishNew (uniProt);' test
hgsql -e 'create index query on ens38ZfishNew (transcriptId(20));' test
# first remove uniProt IDs and add those found by Blastp:
hgsql -e 'update ens38ZfishNew set uniProt = "";' test
hgsql -e 'select count(*) from ens38ZfishNew as g, ensBlastp90 as p \
    where g.transcriptId = p.query;' test
# 37362
hgsql -e 'update ens38ZfishNew as g, ensBlastp90 as p \
    set g.uniProt = p.target where g.transcriptId = p.query;' test
# check that 37362 rows have an entry for uniProt - ok
# add displayIds from uniProt to this table
hgsql -e 'select count(*) from ens38ZfishNew as g, uniProt.displayId as p \
    where g.uniProt = p.acc;' test
# 36647
# 36647 have display IDs in UniProt
hgsql -e 'update ens38ZfishNew as g, uniProt.displayId as p \
    set g.spDisplayId = p.val where g.uniProt = p.acc;' test
# check that 36647 of the rows have spDisplayId - ok.
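# (The check above can be done like this - a sketch, not from the
# original log:
# hgsql -N -e 'select count(*) from ens38ZfishNew \
#     where spDisplayId != "";' test
# should return the 36647 noted above.)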
# add new columns for ens38ZfishNew hgsql -e \ 'alter table ens38ZfishNew add entrezGeneId varchar(255) NOT NULL;' test hgsql -e \ 'alter table ens38ZfishNew add refSeqId varchar(255) NOT NULL;' test hgsql -e \ 'alter table ens38ZfishNew add refSeqProtId varchar(255) NOT NULL;' test # merge together the tables. hgsql -e 'update ens38ZfishNew as g, ens38Zfish2 as e \ set g.entrezGeneId = e.entrezGeneId \ where g.transcriptId = e.transcriptId;' test hgsql -e 'update ens38ZfishNew as g, ens38Zfish2 as e \ set g.refSeqId = e.refSeqId \ where g.transcriptId = e.transcriptId;' test hgsql -e 'update ens38ZfishNew as g, ens38Zfish2 as e \ set g.refSeqProtId = e.refSeqProtId \ where g.transcriptId = e.transcriptId;' test cd /cluster/data/danRer3/bed/ensGenes/downloads/ hgsql -N -e 'select * from ens38ZfishNew;' test > ens38ZfishNew.txt ssh kkstore04 cd /cluster/data/danRer3/bed/ensGenes/downloads/ # There are 308 cases where there is a RefSeq ID but no Entrez Gene ID. # There are 1046 cases where there is an Entrez Gene ID but no RefSeq ID. # Use the NCBI files to fill in the gaps where needed. # get ZFIN file of ZFIN IDs, gene name and GenBank accession # refseq.txt has ZFIN IDs, gene name and RefSeq ID. wget --timestamping http://zfin.org/data_transfer/Downloads/gene_seq.txt awk '{print $1, $2}' gene_seq.txt | sort | uniq > geneSeq.genes awk '{print $1, $2}' refseq.txt | sort | uniq > refSeq.genes comm -23 refSeq.genes geneSeq.genes > refSeqOnly comm -13 refSeq.genes geneSeq.genes > geneSeqOnly wc -l *SeqOnly # 9542 geneSeqOnly # 827 refSeqOnly # get certain fields from each file and merge awk 'BEGIN {FS="\t"} {OFS="\t"} {print $1, $2, $3, $4, $6;}' \ zfish.gene2refseq > zfish.gene2refseqSubset.txt awk 'BEGIN {FS="\t"} {OFS="\t"} \ {print $2, $3, $5, $6, $9, $10, $11, $12;}' \ zfish.gene_info > zfish.gene_infoSubset.txt # need to sort on the GeneID field (second field in refseq file and # first field in gene_info file): sort -n -k2 zfish.gene2refseqSubset.txt | uniq \ > zfish.gene2refseqSubset.sort sort -n -k1 zfish.gene_infoSubset.txt | uniq > zfish.gene_infoSubset.sort # join the two files based on the GeneID (Entrez Gene ID) which is # the second field in refseq file and first field in gene_info file. # Need to set the $tab variable in .tcshrc file: # set tab = " " join -t "$tab" -1 2 -2 1 zfish.gene2refseqSubset.sort \ zfish.gene_infoSubset.sort \ > zfish.gene2refSeqPlusInfo.txt # The program needs to be written to fill in these gaps for RefSeq ID, # Entrez Gene ID and RefSeq Peptide ID. It should then check for the # gene symbol using the ZFIN ID using RefSeq ID. # write program taking ensGene38Coding.tsv and ensGene38Coding2.tsv as # input and also the RefSeq files to find Entrez Gene IDs and Gene Symbols. # and give the tabbed output for loading into the ensXRefZfish table. # hgEnsGeneXRef.c in ~/kent/src/hg/near/hgZfishEnsXRef /cluster/home/hartera/bin/x86_64/hgZfishEnsXRef \ ensGeneInfo38.txt zfish.gene2refSeqPlusInfo.txt refseq.txt \ ens37XRefZfish.tab >& ens37XRefZfish.log # load this tabbed file into ensXRefZfish table ssh hgwdev cd /cluster/data/danRer3/bed/ensGenes # remove old table: hgsql -e 'drop table ensXRefZfish;' danRer3 hgLoadSqlTab danRer3 ensXRefZfish ~/kent/src/hg/lib/ensXRefZfish.sql \ ens38XRefZfish.tab # loaded with no problems. 
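# (A quick structural check on the loaded table - a sketch, assuming
# hgZfishEnsXRef writes one row per Ensembl transcript:
# hgsql -N -e 'select count(*), count(distinct ensGeneId) \
#     from ensXRefZfish;' danRer3
# the two counts should match if ensGeneId is unique per row.)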
# Now need to check its contents:
mkdir testing
cd testing
hgsql -N -e 'select zfinId, geneSymbol, refSeq from ensXRefZfish where \
    zfinId != "" AND refSeq != "";' test > zfinIdsymbAndrefseq.txt
sort zfinIdsymbAndrefseq.txt | uniq > zfinIdsymbAndrefseq.sort
sort ../refseq.txt | uniq > refseq.sort
perl -pi.bak -e 's/\t\n/\n/' refseq.sort
comm -23 zfinIdsymbAndrefseq.sort refseq.sort | wc
comm -12 zfinIdsymbAndrefseq.sort refseq.sort | wc
cd /cluster/data/danRer3/bed/ensGenes/testProgram/tmp3
awk 'BEGIN {FS="\t"} {print $5}' ens38ZfishNew.sort | sort | uniq \
    > ensFile.zfinIds.sort
# There are 7321 zfin IDs
# 7284 ZFIN IDs in table and 6499 with a RefSeq.
hgsql -N -e 'select distinct(zfinId) from ensXRefZfish where refseq = "" \
    and zfinId != "" and geneSymbol = "";' test \
    | sort > zfinIdwithNoRefSeqNoSymb.sort
# There are 853 with no refseq but a zfinId and no gene symbol and 690
# are unique ZFIN IDs.
# compare these to ZFIN IDs in the zfish.gene2refSeqPlusInfo.txt from
# NCBI files:
awk 'BEGIN {FS="\t"} {print $8;}' zfish.gene2refSeqPlusInfo.txt \
    | sort | uniq > zfinIds.fromNcbiFile.sort
# remove first line and "ZFIN:" prefix
tail +2 zfinIds.fromNcbiFile.sort | sed -e 's/ZFIN://' \
    > zfinIds.fromNcbiFile.sort2
comm -13 zfinIds.fromNcbiFile.sort2 zfinIdwithNoRefSeqNoSymb.sort | wc
# 251 of these with no symbols are not found in the NCBI file
comm -12 zfinIds.fromNcbiFile.sort2 zfinIdwithNoRefSeqNoSymb.sort \
    > zfinIds.inNcibFile.noRefSeqOrSymbinXRef
awk '{print $1}' refseq.txt | sort | uniq > refseq.zfId.sort
comm -13 refseq.zfId.sort zfinIdwithNoRefSeqNoSymb.sort | wc
# 176 of these with no symbols are not found in the ZFIN RefSeq file
comm -12 refseq.zfId.sort zfinIdwithNoRefSeqNoSymb.sort \
    > zfinIds.inZfinFile.noRefSeqOrSymbinXRef
# 435 are in both of these lists
wc -l *.noRefSeqOrSymbinXRef
# 439 zfinIds.inNcibFile.noRefSeqOrSymbinXRef
# 514 zfinIds.inZfinFile.noRefSeqOrSymbinXRef
# edit ~/kent/src/hg/hgGene/hgGeneData/Zebrafish/genome.ra to give
# mySQL queries to ensGtp and ensXRefZfish to retrieve name, protein and
# description. Changed XRef table name to new name.
cat << _EOF_ > ~/kent/src/hg/hgGene/hgGeneData/Zebrafish/genome.ra
name global
knownGene ensGene
knownGenePep ensPep
nameSql select gene from ensGtp where transcript = '%s'
descriptionSql select description from ensXRefZfish where ensGeneId = '%s'
proteinSql select uniProtId from ensXRefZfish where ensGeneId = '%s'
_EOF_
# << happy emacs
# created blastp hgNear tables by alignment of Zebrafish Ensembl peptide
# sequences to the equivalent "Known Genes" peptide sets for other species
# - see hgNear sections above. Then create an otherOrgs.ra file for
# zebrafish specifying the species and databases for these organisms
# with blastp homolog tables.
cat << _EOF_ > ~/kent/src/hg/hgGene/hgGeneData/Zebrafish/otherOrgs.ra
name human
db hg18

name mouse
db mm8

name rat
db rn4

name drosophila
db dm1

name cElegans
db ce2

name yeast
db sacCer1
_EOF_
# << this line makes emacs coloring happy
# add Zebrafish-specific section.ra file
cat << _EOF_ > ~/kent/src/hg/hgGene/hgGeneData/Zebrafish/section.ra
name method
shortLabel Methods
longLabel Ensembl Genes Methods, Credits, and Data Use Restrictions
priority 140
_EOF_
# << this line makes emacs coloring happy
# added links to the Zebrafish links.ra file
# update links.ra so that link for Ensembl Genes is to the correct
# stable archive link for Ensembl37 (feb 2006) and change XRef
# table name to new name.
cat << _EOF_ > ~/kent/src/hg/hgGene/hgGeneData/Zebrafish/links.ra
# Zebrafish-specific link info.
# This contains info to construct the quick links.

name genome
tables ensGene
idSql select chrom,txStart+1,txEnd from ensGene where name = '%s'

name family
tables ensGene
idSql select name from ensGene where name = '%s'

name ensemblGenes
shortLabel Ensembl Genes
tables ensGene
idSql select name from ensGene where name = '%s'
url http://feb2006.archive.ensembl.org/Danio_rerio/transview?transcript=%s
priority 25

name zfin
shortLabel ZFIN
tables ensXRefZfish
idSql select zfinId from ensXRefZfish where ensGeneId = '%s'
url http://zfin.org/cgi-bin/webdriver?MIval=aa-markerview.apg&OID=%s
priority 28

name tbSchema
shortLabel Table Schema
tables ensGene

name uniProt
shortLabel UniProt
tables ensXRefZfish
idSql select uniProtId from ensXRefZfish where ensGeneId = '%s'
priority 30

name refSeq
shortLabel RefSeq
tables ensXRefZfish
idSql select refSeq from ensXRefZfish where ensGeneId = '%s'
url http://www.ncbi.nlm.nih.gov/nuccore/%s?report=GenBank
priority 40

name refSeqPep
shortLabel RefSeq Peptide
tables ensXRefZfish
idSql select protAcc from ensXRefZfish where ensGeneId = '%s'
url http://www.ncbi.nlm.nih.gov/entrez/query.fcgi?cmd=Search&db=protein&term=%s&doptcmdl=GenPept&tool=genome.ucsc.edu
priority 42

name entrezGene
shortLabel Entrez Gene
tables ensXRefZfish
idSql select geneId from ensXRefZfish where ensGeneId = '%s'
url http://www.ncbi.nlm.nih.gov/entrez/query.fcgi?db=gene&cmd=Retrieve&dopt=Graphics&list_uids=%s&tool=genome.ucsc.edu
priority 45

name genBank
hide

name pubMed
hide

name geneCards
hide

name stanfordSource
hide

name cgap
hide

name ensembl
hide

name aceView
hide
_EOF_
# << this line makes emacs coloring happy
# then run "make my" to visualize in own sandbox
cd ~/kent/src/hg/hgGene
make my
# commit *.ra files for Zebrafish to CVS.
# edited hgGene.c so that the Gene Symbol (if available) is displayed
# in the description section of the details page.
# added ensXRefZfish to ensemblTranscriptId rules in all.joiner.
# add entry to danRer3/trackDb.ra:
# track ensGene
# shortLabel Ensembl Genes
# longLabel Ensembl v37 Gene Predictions (Protein Coding Genes)
# group genes
# priority 32.8
# visibility pack
# color 150,0,0
# type genePred ensPep
# hgGene on
# STS MARKERS (in progress, 2005-10-13, hartera)
# DOWNLOADED RECENTLY FROM NCBI
ssh kkstore02
mkdir -p /cluster/data/danRer3/bed/stsMarkers
cd /cluster/data/danRer3/bed/stsMarkers
# UniSTS is a unique subset of the STS markers from the six zebrafish
# mapping panels: GAT, HS, LN54, MGH, MOP, T51, and also ZMAP, which
# contains markers from the other panels. Among the markers in these
# maps, the subset that are STSs with available primer sequences was
# imported into UniSTS. These include submitted maps and those from
# the Zebrafish Information Network (ZFIN).
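# (The download itself was not recorded here. The usual first step
# would be along these lines - a sketch only; the FTP location is an
# assumption based on other make docs, not a confirmed path:
# wget --timestamp ftp://ftp.ncbi.nih.gov/repository/UniSTS/UniSTS_zebrafish.sts
# plus the corresponding marker files from ZFIN at http://zfin.org.)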
############################################################################
## BLASTZ swap from mm8 alignments (DONE - 2006-02-28 - Hiram)
ssh pk
cd /cluster/data/mm8/bed/blastzDanRer3.2006-02-28
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
    -swap -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
    `pwd`/DEF > swap.out 2>&1 &
time nice -n +19 featureBits danRer3 chainMm8Link
# 54831876 bases of 1630323462 (3.363%) in intersection
# SWAP CHAINS/NET RN4 (DONE 4/2/06 angie)
ssh kkstore02
mkdir /cluster/data/danRer3/bed/blastz.rn4.swap
cd /cluster/data/danRer3/bed/blastz.rn4.swap
doBlastzChainNet.pl -swap /cluster/data/rn4/bed/blastz.danRer3/DEF \
    -workhorse kkr7u00 >& do.log &
tail -f do.log
ln -s blastz.rn4.swap /cluster/data/danRer3/bed/blastz.rn4
############################################################################
## BLASTZ swap from hg17 alignments (DONE 2006-04-09 markd)
ssh pk
mkdir /cluster/data/danRer3/bed/blastz.hg17.swap
ln -s blastz.hg17.swap /cluster/data/danRer3/bed/blastz.hg17
cd /cluster/data/danRer3/bed/blastz.hg17.swap
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 -stop=net \
    -swap -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
    /cluster/data/hg17/bed/blastz.danRer3/DEF >& swap.out &
# failed due to netChains: looks like previous stage was not
# successful (can't find [danRer3.hg17.]all.chain[.gz]).
# mv swap.out swap.out.1
# rerun with -continue=net
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 -continue=net -stop=net \
    -swap -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
    /cluster/data/hg17/bed/blastz.danRer3/DEF >& swap.out &
# create the net file (DONE 2006-04-09 markd)
ssh hgwdev
cd /cluster/data/danRer3/bed/blastz.hg17.swap/axtChain
nice netClass -verbose=0 -noAr noClass.net danRer3 hg17 danRer3.hg17.net
nice gzip danRer3.hg17.net
###########################################################################
# SPLIT SEQUENCE FOR LIFTOVER CHAINS FROM OTHER ASSEMBLIES
# (DONE, 2006-04-17, hartera)
# ADD TO SAN FOR PK RUNS (DONE, 2006-05-30, hartera)
# followed instructions used in makePanTro2.doc
ssh kkr1u00
cd /cluster/data/danRer3/bed
mkdir -p liftOver
cd liftOver
makeLoChain-split danRer3 /cluster/data/danRer3/nib >&! split.log &
# Took about 30 minutes.
# add split10k to san for pk runs (2006-05-30, hartera)
ssh kk
rsync -a --progress /iscratch/i/danRer3/split10k \
    /san/sanvol1/scratch/danRer3/
###########################################################################
# LIFTOVER CHAINS TO DANRER2 (DONE, 2006-04-25 - 2006-05-03, hartera)
# CLEANUP BLAT DIRECTORY (DONE, 2006-12-14, hartera)
# Split (using makeLoChain-split) of danRer2 is doc'ed in makeDanRer2.doc
# Do what makeLoChain-split says to do next (start blat alignment)
ssh kk
mkdir -p /cluster/data/danRer3/bed/liftOver
cd /cluster/data/danRer3/bed/liftOver
makeLoChain-align danRer3 /iscratch/i/danRer3/nib danRer2 \
    /iscratch/i/danRer2/split10k \
    /iscratch/i/danRer2/11.ooc >&! align.log &
# Took about 5 minutes.
# Do what its output says to do next (start cluster job)
cd /cluster/data/danRer3/bed/blat.danRer2.2006-04-25/run
para try, check, push, check, ...
para time >&!
run.time # Completed: 782 of 784 jobs # Crashed: 2 jobs # CPU time in finished jobs: 4324484s 72074.73m 1201.25h 50.05d 0.137 y # IO & Wait Time: 35200s 586.67m 9.78h 0.41d 0.001 y # Average job time: 5575s 92.92m 1.55h 0.06d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 62741s 1045.68m 17.43h 0.73d # Submission to last job: 355469s 5924.48m 98.74h 4.11d # 2 jobs keep crashing so try them on the pk: chrUn_chrUn and chrUn_chr20 # need to copy the danRer2 split10k over to the pk ssh kkr1u00 mkdir -p /san/sanvol1/scratch/danRer2/split10k rsync -a --progress /iscratch/i/danRer2/split10k/* \ /san/sanvol1/scratch/danRer2/split10k/ # copy over 11.ooc file for danRer2 cp /iscratch/i/danRer2/11.ooc /san/sanvol1/scratch/danRer2 ssh pk cd /cluster/data/danRer3/bed/blat.danRer2.2006-04-25/run mkdir extraRun raw cd extraRun grep chrUn_chrUn ../spec > spec grep chrUn_chr20 ../spec >> spec # change directories for spec file perl -pi.bak -e 's#/iscratch/i#/san/sanvol1/scratch#g' spec rm spec.bak para create spec para push, check etc. para time >& run.time # Completed: 2 of 2 jobs # CPU time in finished jobs: 263163s 4386.05m 73.10h 3.05d 0.008 y # IO & Wait Time: 62s 1.04m 0.02h 0.00d 0.000 y # Average job time: 131613s 2193.54m 36.56h 1.52d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 147104s 2451.73m 40.86h 1.70d # Submission to last job: 147104s 2451.73m 40.86h 1.70d ssh kkr1u00 # merge all raw output: cd /cluster/data/danRer3/bed/blat.danRer2.2006-04-25 mv ./run/raw/*.psl ./raw/ # lift alignments cd /cluster/data/danRer3/bed/liftOver makeLoChain-lift danRer3 danRer2 >&! lift.log & # Took about 8 minutes to run. # chain alignments ssh kki cd /cluster/data/danRer3/bed/liftOver makeLoChain-chain danRer3 /iscratch/i/danRer3/nib \ danRer2 /iscratch/i/danRer2/nib >&! chain.log & # Do what its output says to do next (start cluster job) cd /cluster/data/danRer3/bed/blat.danRer2.2006-04-25/chainRun para try, check, push, check etc. ... para time >&! run.time # Completed: 28 of 28 jobs # CPU time in finished jobs: 2751s 45.86m 0.76h 0.03d 0.000 y # IO & Wait Time: 879s 14.64m 0.24h 0.01d 0.000 y # Average job time: 130s 2.16m 0.04h 0.00d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 598s 9.97m 0.17h 0.01d # Submission to last job: 1520s 25.33m 0.42h 0.02d # net alignment chains ssh kkstore02 cd /cluster/data/danRer3/bed/liftOver makeLoChain-net danRer3 danRer2 >&! net.log & # Took about 24 minutes to run. # load reference to over.chain into database table, # and create symlinks /gbdb and download area ssh hgwdev cd /cluster/data/danRer3/bed/liftOver makeLoChain-load danRer3 danRer2 >&! load.log & # clean up rm *.log # test by converting a region using the "convert" link on # the browser, and comparing to blat of the same region # CLEANUP for LiftOver blat directory (2006-12-14, hartera) ssh kkstore02 rm -r blat.danRer2.2006-04-25 # REDO BACENDS - bacEndPairs, bacEndSingles, bacEndBadPairs and all_bacends # (split as chrN_allBacends) ONLY (DONE, 2006-05-01 - 2006-05-08, hartera) # RELOADED chrN_allBacends TABLES (DONE, 2006-06-08, hartera) # RECREATED all_bacends table WITH ONLY RELEVANT PSLS FOR THE LFS BED # TABLES FOR PAIRS, PAIRSBAD AND SINGLES (DONE, 2006-08-04, hartera) # NOTE: there are overlapping BAC clone ends for danRer3. Some of these # are only a few kb apart (from beginning of one to end of the other) # so use stricter pslPairs parameters as for human and mouse. # These BAC Ends should be about 150-200 kb. 
Typically, they are # 50 - 300 kb apart. # NOTE: IN FUTURE, IF SPLITTING all_bacends TABLE BY CHROM AND # RENAMING AS chrN_allBacends THEN USE allBacends INSTEAD OF # all_bacends AS ARGUMENT TO pslPairs. THIS WILL THEN AUTOMATICALLY # ADD THE CORRECT PSL TABLE NAME TO THE BED (LFS) TABLES ssh kkstore02 mkdir /cluster/data/danRer3/bed/bacends/pairsNew cd /cluster/data/danRer3/bed/bacends/pairsNew set bacDir = /cluster/data/danRer3/bed/bacends/bacends.1 /cluster/bin/x86_64/pslPairs -tInsert=10000 -minId=0.91 -noBin -min=25000 \ -max=350000 -slopval=10000 -hardMax=500000 -slop -short -long -orphan \ -mismatch -verbose ../bacEnds.psl \ $bacDir/bacEndPairs.txt all_bacends bacEnds wc -l bacEnds.* # 1725 bacEnds.long # 12081 bacEnds.mismatch # 242235 bacEnds.orphan # 156444 bacEnds.pairs # 616 bacEnds.short # 1017 bacEnds.slop echo 'chr\tstart\tend\tclone\tscore\tstrand\tall\tfeatures\tstarts\tsizes'\ > ../header echo '10\t10N\t10N\t10\t10N\t10\t10\t10N\t10\t10' >> ../header # make pairs bed file cat ../header bacEnds.pairs | row score ge 300 | sorttbl chr start \ | headchg -del > bacEndPairs.bed # also need to process bacEndSingles.txt into a database table # for singles in bacEndSingles.txt, create a dummy file where they # are given zJA11B12T7 as dummy sequence pair. If the single is a forward # sequence, put the dummy sequence in the second column, if the single is # a reverse sequence put in first column. use a perl script to do this. cd /cluster/data/danRer3/bed/bacends set bacDir = /cluster/data/danRer3/bed/bacends/bacends.1 mkdir singlesNew cd singlesNew cp /cluster/data/danRer2/bed/ZonLab/bacends/singles/formatSingles.pl . perl formatSingles.pl $bacDir/bacEndSingles.txt > \ $bacDir/bacEndSingles.format # then run pslPairs on this formatted file /cluster/bin/i386/pslPairs -tInsert=10000 -minId=0.91 -noBin -min=25000 \ -max=350000 -slopval=10000 -hardMax=500000 -slop -short -long -orphan \ -mismatch -verbose ../bacEnds.psl $bacDir/bacEndSingles.format \ all_bacends bacEnds wc -l bacEnds.* # 0 bacEnds.long # 0 bacEnds.mismatch # 11439 bacEnds.orphan # 0 bacEnds.pairs # 0 bacEnds.short # 0 bacEnds.slop # there are 11439 orphans here and 242235 from pair analysis so # a total of 253674 orphans cat bacEnds.orphan ../pairsNew/bacEnds.orphan > bacEnds.singles wc -l bacEnds.singles # 253674 bacEnds.singles # make singles bed file cat ../header bacEnds.singles | row score ge 300 | sorttbl chr start \ | headchg -del > bacEndSingles.bed cp bacEndSingles.bed ../pairsNew cd ../pairsNew # all slop, short, long, mismatch and orphan pairs go into bacEndPairsBad # since orphans are already in bacEndSingles, do not add these cat ../header bacEnds.slop bacEnds.short bacEnds.long bacEnds.mismatch \ bacEnds.orphan | row score ge 300 | sorttbl chr start \ | headchg -del > bacEndPairsBad.bed # add bacEndSingles.bed to bacEnds.load.psl - must not add pair orphans # twice so create a bed file of bacEndPairsBadNoOrphans.bed without orphans cat ../header bacEnds.slop bacEnds.short bacEnds.long bacEnds.mismatch \ | row score ge 300 | sorttbl chr start \ | headchg -del > bacEndPairsBadNoOrphans.bed # use extractPslLoad later to get all_bacends.psl for database # There are rows where the aligments were the same but the lfNames are # different. This is due to the presence of multiple reads for the # same BAC end sequence. Sometimes they are slightly different lengths # so the alignments are a little different. It would be good to # consolidate all of these. 
Firstly, the identical rows were merged into # one with a list of all the lfNames corresponding to that alignment. ssh kkstore02 cd /cluster/data/danRer3/bed/bacends/pairsNew mkdir -p /cluster/data/danRer3/bed/bacends/duplicatesNew cd /cluster/data/danRer3/bed/bacends/duplicatesNew mkdir -p /cluster/bluearc/danRer3/bacends/duplicatesNew/overlapRun cd /cluster/data/danRer3/bed/bacends/duplicatesNew ln -s /cluster/bluearc/danRer3/bacends/duplicatesNew/overlapRun # write program to do this for linked feature series (lfs) which # is the type of data structure used for BAC ends. # Need a bed file sorted by chrom and chromStart cd overlapRun foreach f (bacEndPairs bacEndPairsBadNoOrphans bacEndSingles) sort -k1,2 /cluster/data/danRer3/bed/bacends/pairsNew/${f}.bed \ > ${f}.lfs end wc -l *.lfs # 155242 bacEndPairs.lfs # 15311 bacEndPairsBadNoOrphans.lfs # 221821 bacEndSingles.lfs # remove replicate rows where names match and the overlapping region # (chromEnd - chromStart) is greater than or equal to 0.999. ssh kolossus cd /cluster/data/danRer3/bed/bacends/duplicatesNew/overlapRun foreach f (bacEndPairs bacEndPairsBadNoOrphans bacEndSingles) echo "Processing $f" nohup nice /cluster/bin/x86_64/lfsOverlap ${f}.lfs \ ${f}.bed -name -minOverlap=0.999 -notBlocks end # Started: May 3 23:30 PID: 9199 # pairs started: May 5 18:10, PID: 13232 # Segmentation fault with bacEndSingles. This is a very large file so # run again using the file split into two # chr24 starts at line 109407 head -109406 bacEndSingles.lfs > bacEndSinglesPart1.lfs tail +109407 bacEndSingles.lfs > bacEndSinglesPart2.lfs # then try again: foreach f (bacEndSinglesPart1 bacEndSinglesPart2) echo "Processing $f" nohup nice /cluster/home/hartera/bin/i386/lfsOverlap ${f}.lfs \ ${f}.bed -name -minOverlap=0.999 -notBlocks end # merge results cat bacEndSinglesPart*.bed > bacEndSingles.bed ssh kkstore02 cd /cluster/data/danRer3/bed/bacends/duplicatesNew/overlapRun # check the numbers of lines are correct foreach f (bacEndPairs bacEndPairsBadNoOrphans bacEndSingles) awk 'BEGIN {OFS="\t"} {print $1,$2,$3,$4,$5}' ${f}.lfs \ | sort | uniq -c | sort -nr > ${f}.uniqCount end wc -l * # 155164 bacEndPairs.bed # 155242 bacEndPairs.lfs # 155189 bacEndPairs.uniqCount # 15293 bacEndPairsBadNoOrphans.bed # 15311 bacEndPairsBadNoOrphans.lfs # 15303 bacEndPairsBadNoOrphans.uniqCount # 221771 bacEndSingles.bed # 221821 bacEndSingles.lfs # 221799 bacEndSingles.uniqCount # 109390 bacEndSinglesPart1.bed # 109406 bacEndSinglesPart1.lfs # 112381 bacEndSinglesPart2.bed # 112415 bacEndSinglesPart2.lfs # different numbers for unique count since some of these alignments # were not identical but very close to identical (>0.999 overlap) cd /cluster/data/danRer3/bed/bacends/duplicatesNew mv ./overlapRun/* . rm -r overlapRun /cluster/bluearc/danRer3/bacends/duplicatesNew/overlapRun # Use perl script to choose 2 BAC ends to represent each BAC clone. # since there are often more than one read for each BAC end in this set, # 2 were chosen for each BAC pair or 1 for the singles. This was based on # the ones that had the largest region aligned (using lfSizes). 
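# (Illustration of the selection criterion - a sketch, not the script
# itself: the total aligned bases for a row can be summed from the
# comma-separated lfSizes field, column 10 of these lfs bed files,
# with the read names in column 11:
# awk -F'\t' '{n = split($10, sz, ","); tot = 0; \
#     for (i = 1; i <= n; i++) tot += sz[i]; \
#     print $4, $11, tot}' bacEndPairs.bed | sort -k1,1 -k3,3nr | less
# pickLfNamesv2.pl (below) keeps, per clone, the reads covering the
# largest total.)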
# copy perl script over that was used for danRer2
cp /cluster/data/danRer2/bed/ZonLab/bacends/duplicates/pickLfNames.pl \
    pickLfNamesv2.pl
# edit the script so that the regular expression for matching BAC end
# names is the same as that used in ../bacends.1/getBacEndInfov2.pl
# need to sort by chrom, chromStart
foreach f (bacEndPairs bacEndPairsBadNoOrphans bacEndSingles)
    sort -k1 -k2 -k3 ${f}.bed > ${f}Sort.bed
end
# run perl script: input bed file, pairs or singles, name of output file
perl pickLfNamesv2.pl bacEndPairsSort.bed pairs pairs2lfNames.bed
mv error.log log.pairs
# log.pairs is empty
perl pickLfNamesv2.pl bacEndSinglesSort.bed singles singles1lfName.bed
mv error.log log.singles
sort log.singles | uniq > log.singles.uniq
cp bacEndSinglesSort.bed bacEndSingles2Sort.bed
# log.singles has 15 cases where alignments for a BAC clone use
# different sequence reads for either the T7 or SP6 BAC end.
# singles may include both BAC ends for a clone in the case
# where they aligned to different chromosomes or a long way apart on
# the same chromosome (orphans). Mostly, those that have a different read
# align to an almost identical or largely overlapping region.
# CH211-189J23: zC189J23.ya and zC189J23.yb align to overlapping regions.
# Use zC189J23.yb as it aligns to a longer region and remove the other one.
# CH211-42D5
# some sequences appear to be different: CH211-98J20 - zC98J20.yb and
# zC98J20.ya do not align to each other. DKEYP-107B4 - zKp107B4.ya looks
# like it has low complexity sequence, this is discarded and zKp107B4.yb
# is kept. zKp107B4.za and zKp107B4.zb only align in the first ~ 59bp.
# zKp107B4.zb is kept in this case. DKEYP-114B4 - zKp114B4.za: 15-61 bp
# on zKp114B4.za align to 11-58 bp on zKp114B4.zb. zKp114B4.za is kept.
# In these cases, the 2 sequences align to different regions.
# Some sequences have overlapping alignments as one sequence is a bit
# longer than the other.
perl pickLfNamesv2.pl bacEndPairsBadNoOrphansSort.bed pairs \
    badPairs2lfNames.bed
mv error.log log.badPairs
# no alignments have a different pair of ends to other alignments
# for each of these new bed files, checks were made that there are
# only 2 BAC ends per alignment for pairs and 1 for singles.
# For each pair, there should only be 2 ends which can appear either
# way round depending on the orientation and there should be 1 end for
# the beginning (suffix T7, t7 or z) and one end for the end
# (suffix SP6, sp6 or y) for each BAC clone. These can appear as e.g.
# either zK7B23T7,zK7B23SP6 or zK7B23SP6,zK7B23T7 for the opposite
# orientation. For singles, there should be a single BAC end for each
# alignment and for each BAC clone, a sequence for either or both types
# of ends may appear e.g. zK153P14SP6 and zK153P14T7 appear in separate
# alignments. e.g.
wc -l pairs2lfNames.bed
grep -c ',' pairs2lfNames.bed
# should be the same number, every line should have a comma
awk '{print $11}' pairs2lfNames.bed | sort | uniq > pairs.ends
sed -e 's/,/\n/g' pairs.ends > pairs.ends2
wc -l pairs.ends2
# should be twice the number of above, just 2 end names per line
perl -pi.bak -e \
's/.+(SP6|T7|ya|za|Za|yb|zb|yc|zc|yt|yd|zd)[A-Za-z]?,?.+(SP6|T7|ya|za|Za|yb|zb|yc|zc|yt|yd|zd)[A-Za-z]?/$1,$2/g' pairs.ends
sort pairs.ends | uniq > pairs.ends.uniq
# check that each of these has the correct pair type
# Finally overlaps in BAC clone names were checked. All BAC clones
# represented in each of the pairs, badPairs and singles bed files are
# unique to that file.
Between all three bed files, 300323 BAC clones # have alignments. 512886 clone ends are aligned in these three bed files. foreach f (*.bed) awk '{print $4}' $f | sort | uniq > ${f}.names end comm -12 pairs2lfNames.bed.names badPairs2lfNames.bed.names comm -12 pairs2lfNames.bed.names singles1lfName.bed.names comm -12 badPairs2lfNames.bed.names singles1lfName.bed.names # None of these files should have any BAC clone names in common and # they do not so they are ok. # clean up: rm *Part1.bed *Part2.bed *.names *.ends *.ends2 *.Part1.lfs *Part2.lfs rm *.uniqCount # NOTE: using sort and uniq on hgwdev produces tab delimited output # after merging rows with the same BAC name, the scoring is now # wrong in the bed files. # Scores should be 1000 if there is 1 row for that name, else # 1500/number of rows for that sequence name - calculated by pslPairs. # Correct the scores. The co-ordinates for the singles also need to be # corrected. mkdir -p /cluster/data/danRer3/bed/bacends/scoresAndCoords cd /cluster/data/danRer3/bed/bacends/scoresAndCoords # copy over correctScores2.pl and checkscores.pl scripts from danRer2 and # edit so both scripts so that hits file is split on space,not on tabs cp \ /cluster/data/danRer2/bed/ZonLab/bacends/scoresAndCoords/correctScores2.pl . cp \ /cluster/data/danRer2/bed/ZonLab/bacends/scoresAndCoords/checkScores.pl . awk '{print $4}' ../duplicatesNew/pairs2lfNames.bed \ | sort | uniq -c > pairs.hits perl correctScores2.pl ../duplicatesNew/pairs2lfNames.bed pairs.hits \ noBin > bacEndPairsGoodScores.bed # same for singles awk '{print $4}' ../duplicatesNew/singles1lfName.bed \ | sort | uniq -c > singles.hits perl correctScores2.pl ../duplicatesNew/singles1lfName.bed singles.hits \ noBin > bacEndSinglesGoodScores.bed # and for badPairs awk '{print $4}' ../duplicatesNew/badPairs2lfNames.bed \ | sort | uniq -c > badPairs.hits perl correctScores2.pl ../duplicatesNew/badPairs2lfNames.bed \ badPairs.hits noBin > bacEndPairsBadGoodScores.bed # check that the scores are now correct awk '{print $4, $5}' bacEndPairsGoodScores.bed \ | sort | uniq -c > pairs.count perl checkScores.pl < pairs.count # all the BAC clones should be in good.txt and none in bad.txt # wc -l should give same number of lines in good.txt as in pairs.hits # repeat for other bed files awk '{print $4, $5}' bacEndPairsBadGoodScores.bed \ | sort | uniq -c > badPairs.count perl checkScores.pl < badPairs.count awk '{print $4, $5}' bacEndSinglesGoodScores.bed \ | sort | uniq -c > singles.count perl checkScores.pl < singles.count # for the singles, 7 ended up in bad.txt because their scores # were 214.285714285714 which is correct for 7 alignments. rounding the # score caused the discrepancy. # For singles, the co-ordinates in the lfs table are wrong. The # chromStart should be the same as the lfsStart and chromEnd - chromStart # should be the same as lfSizes. Need to correct these: # pslPairs has added min/2 to the end or subtracted min/2 from the start # depending on whether it is a left or a right BAC end and the # alignment orientation. min used here was 25000. 
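# (In the awk below: where chromStart no longer matches the lfs start
# in field 9, it was shifted by pslPairs, so reset it to field 9;
# otherwise chromEnd was padded, so pull it back by min/2 = 12500.)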
    # For singles, the co-ordinates in the lfs table are wrong. The
    # chromStart should be the same as lfsStart, and chromEnd - chromStart
    # should be the same as lfSizes. Need to correct these:
    # pslPairs has added min/2 to the end or subtracted min/2 from the
    # start, depending on whether it is a left or a right BAC end and the
    # alignment orientation. min used here was 25000, so the adjustment
    # to undo is 25000/2 = 12500.
    awk 'BEGIN {FS="\t"} {OFS="\t"} \
        {if ($2 != $9) print $1,$9,$3,$4,$5,$6,$7,$8,$9,$10,$11; \
        else print $1,$2,$3 - 12500,$4,$5,$6,$7,$8,$9,$10,$11;}' \
        bacEndSinglesGoodScores.bed \
        > bacEndSinglesGoodScoresAndCoords.bed
    # clean up
    rm error.log *.txt *.count *.hits bacEndSinglesGoodScores.bed
    ssh hgwdev
    cd /cluster/data/danRer3/bed/bacends/scoresAndCoords
    # copy over the table definition from danRer2
    cp /cluster/data/danRer2/bed/ZonLab/bacends/singles/bacEndSingles.sql \
       ../singlesNew/
    # Now load the database tables:
    hgLoadBed danRer3 bacEndPairs bacEndPairsGoodScores.bed \
        -sqlTable=$HOME/kent/src/hg/lib/bacEndPairs.sql -notItemRgb
    # Loaded 155164 elements of size 11
    hgLoadBed danRer3 bacEndSingles bacEndSinglesGoodScoresAndCoords.bed \
        -sqlTable=../singlesNew/bacEndSingles.sql -notItemRgb
    # Loaded 221754 elements of size 11
    # 221754 record(s), 0 row(s) skipped, 57 warning(s) loading bed.tab
    # the warnings are of unknown cause, but all of the bed file loaded
    # and the number of warnings is small, so ignore them
    hgLoadBed danRer3 bacEndPairsBad bacEndPairsBadGoodScores.bed \
        -sqlTable=$HOME/kent/src/hg/lib/bacEndPairsBad.sql -notItemRgb
    # Loaded 15293 elements of size 11
    # load the BAC end sequences into the seq table so alignments may be
    # viewed
    mkdir -p /gbdb/danRer3/bacends
    ln -s /cluster/data/danRer3/bed/bacends/bacSeqs/Zv5BACends.fa \
          /gbdb/danRer3/bacends/Zv5BACends.fa
    hgLoadSeq danRer3 /gbdb/danRer3/bacends/Zv5BACends.fa
    # create the file for loading the all_bacends table
    ssh kkstore02
    cd /cluster/data/danRer3/bed/bacends/scoresAndCoords
    # for the all_bacends table, just load the alignments for those
    # sequences represented in the bacEndPairs, bacEndSingles and
    # bacEndPairsBad tables
    # bacEnds.load.psl is the file of alignments
    # get all the names of sequences
    foreach f (*.bed)
        echo $f
        awk '{print $11;}' $f >> allBacEnds.names
    end
    wc -l allBacEnds.names
    # 392211 allBacEnds.names
    # this is the total number of lines in the *.bed files
    perl -pi.bak -e 's/,/\n/g' allBacEnds.names
    sort allBacEnds.names | uniq > allBacEnds.names.uniq
    wc -l allBacEnds.names.uniq
    # 512321 allBacEnds.names.uniq
    # get alignments for just the BAC ends that are in the database
    # tables and make bacEnds.load.psl
    cd /cluster/data/danRer3/bed/bacends/scoresAndCoords
    extractPslLoad -noBin ../bacEnds.psl bacEndPairsGoodScores.bed \
        bacEndPairsBadGoodScores.bed bacEndSinglesGoodScoresAndCoords.bed \
        | sorttbl tname tstart | headchg -del > bacEnds.load.psl
    # check that alignments are present for all BAC ends in
    # allBacEnds.names.uniq
    awk '{print $10}' bacEnds.load.psl | sort | uniq > bacEnds.names
    comm -12 bacEnds.names allBacEnds.names.uniq | wc -l
    # 512321
    wc -l *
    # 512321 allBacEnds.names.uniq
    # 512321 bacEnds.names
    # Reloaded the split tables. The old bacEnds.load.psl was used
    # last time. (2006-06-08, hartera)
    ssh hgwdev
    cd /cluster/data/danRer3/bed/bacends/scoresAndCoords
    # remove the old all_bacends table. This was moved over from hgwbeta
    # after the recent crash of hgwdevold after the power failure.
    hgsql -e 'drop table all_bacends;' danRer3
    # Display is very slow for BAC ends on large regions. Try splitting
    # bacEnds.load.psl and loading the tables as chrN_allBacends. The
    # parsing code is confused if there are two underscores in the
    # table name.
    foreach c (`cat /cluster/data/danRer3/chrom.lst`)
        echo "Processing $c ..."
        awk '{if ($14 == "'chr${c}'") print;}' \
        /cluster/data/danRer3/bed/bacends/scoresAndCoords/bacEnds.load.psl \
        > chr${c}.bacEnds.load.psl
    end
    # drop old tables
    foreach c (`cat /cluster/data/danRer3/chrom.lst`)
        echo $c
        hgsql -e "drop table chr${c}_allBacends;" danRer3
    end
    # load new tables
    foreach c (`cat /cluster/data/danRer3/chrom.lst`)
        nice hgLoadPsl danRer3 -table=chr${c}_allBacends \
             chr${c}.bacEnds.load.psl
    end
    # load of chr5_allBacends did not go as planned: 326147 record(s),
    # 0 row(s) skipped, 1 warning(s) loading psl.tab
    # load of chr8_allBacends did not go as planned: 212665 record(s),
    # 0 row(s) skipped, 5 warning(s) loading psl.tab
    # load of chr12_allBacends did not go as planned: 156947 record(s),
    # 0 row(s) skipped, 1 warning(s) loading psl.tab
    # load of chr15_allBacends did not go as planned: 181721 record(s),
    # 0 row(s) skipped, 1 warning(s) loading psl.tab
    # load of chr19_allBacends did not go as planned: 282423 record(s),
    # 0 row(s) skipped, 1 warning(s) loading psl.tab
    # load of chr20_allBacends did not go as planned: 315248 record(s),
    # 0 row(s) skipped, 7 warning(s) loading psl.tab
    # load of chrUn_allBacends did not go as planned: 1524765 record(s),
    # 0 row(s) skipped, 487 warning(s) loading psl.tab
    # There are still warnings on loading; most (487) are for chrUn.
    # alter the lfs (BED) tables so that the pslTable field is
    # "allBacends" instead of all_bacends (this was set by the pslPairs
    # program).
    foreach t (bacEndPairs bacEndSingles bacEndPairsBad)
        hgsql -e "update $t set pslTable = 'allBacends';" danRer3
    end
    # This improves the performance a lot.
    # corrected termRegex for some bacCloneXRef searches in trackDb.ra so
    # that they work correctly (bacPairsIntName, bacSinglesIntName,
    # bacPairsSangerSts and bacSinglesSangerSts). (2006-04-19, hartera)
    # Remake the all_bacends table. extractPslLoad extracts psl
    # alignments by name, so even those that were filtered out end up in
    # the all_bacends table. Wrote a program that matches BAC end psl
    # alignments from the bacEnd{Pairs, PairsBad, Singles} tables by
    # name, chrom, chromStart and chromEnd.
    ssh kkstore02
    cd /cluster/data/danRer3/bed/bacends
    mkdir extractPsl
    cd extractPsl
    # Some scores in bacEndSinglesGoodScoresAndCoords.bed are not
    # integers, so fix these, and also the other bacEnd files just in
    # case.
    cat << '_EOF_' > roundPslScore.pl
#!/usr/bin/perl -w
use strict;

my $file = $ARGV[0];
open(FILE, $file) || die "Can not open $file: $!\n";

while (<FILE>) {
    my (@f, $line, $num, $score);
    $line = $_;
    @f = split(/\t/, $line);
    # score is field 5 of the BED 11 input
    $num = $f[4];
    $score = round($num);
    # replace the first occurrence of the unrounded score in the line;
    # this relies on the fractional score string not appearing elsewhere
    $line =~ s/$num/$score/;
    print $line;
}
close(FILE);

sub round {
    my($number) = shift;
    return int($number + .5);
}
'_EOF_'
    # << this line makes emacs coloring happy
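    # An equivalent field-based one-liner (a sketch, not what was run;
    # it assumes the score is always column 5 of these tab-separated
    # BED 11 files) would avoid the string substitution:
    #   awk 'BEGIN {FS="\t"} {OFS="\t"} {$5 = int($5 + 0.5); print}' in.bed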
    chmod +x roundPslScore.pl
    set bacDir=/cluster/data/danRer3/bed/bacends
    perl roundPslScore.pl $bacDir/scoresAndCoords/bacEndPairsGoodScores.bed \
        > bacEndPairsRoundScore.bed
    perl roundPslScore.pl \
        $bacDir/scoresAndCoords/bacEndPairsBadGoodScores.bed \
        > bacEndPairsBadRoundScore.bed
    perl roundPslScore.pl \
        $bacDir/scoresAndCoords/bacEndSinglesGoodScoresAndCoords.bed \
        > bacEndSinglesRoundScore.bed
    nice /cluster/home/hartera/bin/x86_64/extractPslForLfs -verbose=1 \
        $bacDir/bacEnds.psl bacEndPairsRoundScore.bed bacPairs.psl
    nice /cluster/home/hartera/bin/x86_64/extractPslForLfs -verbose=1 \
        $bacDir/bacEnds.psl bacEndPairsBadRoundScore.bed bacPairsBad.psl
    nice /cluster/home/hartera/bin/x86_64/extractPslForLfs -verbose=1 \
        $bacDir/bacEnds.psl bacEndSinglesRoundScore.bed bacSingles.psl
    cat bac*.psl > allBacends.load.psl
    # Now load the database tables:
    # There is no need to reload the singles lfs table as it is still the
    # same; the scores were rounded to 214 on loading, and these are the
    # only scores that are floats rather than integers.
    # Drop the old split bacends tables and reload a single new table
    # with only those psls relevant to alignments in the lfs tables.
    ssh hgwdev
    cd /cluster/data/danRer3/bed/bacends/extractPsl
    foreach c (`cat /cluster/data/danRer3/chrom.lst`)
        hgsql -e "drop table chr${c}_allBacends;" danRer3
    end
    # change the bacEnd{Pairs, PairsBad, Singles} tables so that the
    # pslTable is all_bacends again.
    foreach b (Pairs PairsBad Singles)
        hgsql -e "update bacEnd${b} set pslTable = 'all_bacends';" \
              danRer3
    end
    # Then load the all_bacends table. There are now many fewer
    # alignments than before, so they can all go in one table; the large
    # table size was previously slowing down the Browser at zoomed-out
    # display levels due to slow access of the very large all_bacends
    # table.
    wc -l allBacends.load.psl
    # 549408 allBacends.load.psl
    hgLoadPsl danRer3 -table=all_bacends allBacends.load.psl
    hgsql -e 'select count(*) from all_bacends;' danRer3
    # 549408
    # Table contains the correct number of rows.
    # Get all the lfNames from the bed files and check that these are all
    # represented in allBacends.load.psl
    ssh kkstore02
    cd /cluster/data/danRer3/bed/bacends/extractPsl
    foreach p (*RoundScore.bed)
        awk '{print $11}' $p >> bedFiles.names
    end
    perl -pi.bak -e 's/,/\n/g' bedFiles.names
    sort bedFiles.names | uniq > bedFiles.names.uniq
    # get the psl file names
    awk '{print $10}' allBacends.load.psl | sort | uniq > pslFile.names.uniq
    wc -l *.uniq
    # 512321 bedFiles.names.uniq
    # 512321 pslFile.names.uniq
    comm -12 bedFiles.names.uniq pslFile.names.uniq | wc -l
    # 512321
    # Therefore all names from the BED files are in the PSL file.
    rm bedFiles* pslFile*
    cd /cluster/data/danRer3/bed/bacends
    rm -r all_bacends
    rm ./scoresAndCoords/*.bacEnds.load.psl
    # Duplicate rows in the bacCloneXRef and bacCloneAlias tables, so
    # remove these, reload the tables and test - see the sections on
    # CREATE BAC CLONES ALIAS AND CROSS-REFERENCE TABLES and
    # BACENDS: TESTING OF bacCloneAlias AND bacCloneXRef TABLES
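    # A quick final check that no fractional scores survived the rounding
    # (a sketch, not part of the original build; run in the extractPsl
    # directory) - it should print nothing:
    awk 'BEGIN {FS="\t"} $5 ~ /\./ {print FILENAME, $4, $5}' \
        *RoundScore.bed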
#######################################################################
# RE-DO RH MAP:
# isPcr of sequences.
# 1) Make a list from the FASTA file of sequences.
# 2) Get one record per file - need to just split on '>'.
# 3) Use rhFix to adapt to get primers, one set per file, named after
#    the sequence.
    # run isPcr as a cluster job - one per sequence and primer set
    # get the RH map info again; need to remove spaces in primer seqs
    cat << '_EOF_' > getRhInfo
#!/usr/bin/awk -f
#>z1396|14|418|5707|SSLP|f|cR|f|MGH|MPIEB|ATCCTTCAGCCACTCCTTCA|TGGAACCTGAAAAACACACG|
/^>/ {
    sub(/>/,"",$0);
    sub(/\//,"_", $0);
    gsub(/ /,"",$0);
    split(toupper($0), a, "\\|");
    print a[1]"."a[9]"\tLG"a[2]"\t"a[3]"\t"a[4]"\t"a[5]"\t"a[9]"\t"a[10]"\t"a[11]"\t"a[12];
    next;
}
'_EOF_'
    # << keep emacs coloring happy
    chmod +x getRhInfo
    getRhInfo ../../rhMap.headers2 > rhMapInfo.tab
    ssh hgwdev
    cd /cluster/data/danRer3/bed/ZonLab/rhMap/rhSequenceSubmit022306
    mkdir -p isPcr/primers
    cd isPcr/primers
    # create the primers files
    ssh kkstore02
    cd \
  /cluster/data/danRer3/bed/ZonLab/rhMap/rhSequenceSubmit022306/isPcr/primers
    awk 'BEGIN {FS="\t"} {OFS="\t"} {if ($8 != "") print $1,$8,$9 \
        > $1".primers.fa"}' rhMapInfo.tab
    # there are 7519 primer sets, which is correct.
    # get the list of sequences
    cd ..
    mkdir markerSeqs
    cd markerSeqs
    grep '>' ../../rhMap.fa | wc
    # 11514
    # get all sequences. There are 11514 in total.
    # rhMap.fa is the file. Need to fix that one name:
    perl -pi.bak -e 's/\//_/' ../../rhMap.fa
    # faSplit byname splits the sequences up, one file per record, each
    # named with the sequence name
    faSplit byname ../../rhMap.fa rhMap
    ls | wc -l
    # 11514
    ssh pk
    # make a run dir on the san and link to the isPcr dir
    mkdir -p /san/sanvol1/scratch/danRer3/bacends/isPcrRun
    cd /cluster/data/danRer3/bed/ZonLab/rhMap/rhSequenceSubmit022306/isPcr
    ln -s /san/sanvol1/scratch/danRer3/bacends/isPcrRun .
    # get the list of sequences with primers
    cd \
  /cluster/data/danRer3/bed/ZonLab/rhMap/rhSequenceSubmit022306/isPcr/isPcrRun
    awk 'BEGIN {FS="\t"} {OFS="\t"} {if ($8 != "") print $1 \
        > "primerSeqs.lst"}' \
  /cluster/data/danRer3/bed/ZonLab/rhMap/rhSequenceSubmit022306/isPcr/primers/rhMapInfo.tab
    foreach m (`cat primerSeqs.lst`)
        echo /cluster/bin/x86_64/isPcr -out=psl -minPerfect=2 \
        -maxSize=5000 -tileSize=10 \
        -ooc=/san/sanvol1/scratch/danRer3/danRer3_10.ooc -stepSize=5 \
  /cluster/data/danRer3/bed/ZonLab/rhMap/rhSequenceSubmit022306/isPcr/markerSeqs/${m}.fa \
  /cluster/data/danRer3/bed/ZonLab/rhMap/rhSequenceSubmit022306/isPcr/primers/${m}.primers.fa \
        '{'check out line+ \
  /cluster/data/danRer3/bed/ZonLab/rhMap/rhSequenceSubmit022306/isPcr/isPcrRun/out/${m}.psl'}' \
        >> jobList
    end
    para create jobList
    para try, check, push, check etc. ...
    # there are 654 that do not have isPcr results. Checked Z4664.MGH and
    # found that the primers would not align with Blat either.
    # these are listed in unmatchedPrimers. They crashed even with
    # maxSize=50000 and with -flipReverse used.
    mkdir notMatchedPrimers notMatchedSeqs
    perl -pi.bak -e 's/\.fa//' unmatchedPrimers
    foreach f (`cat unmatchedPrimers`)
      set d=/cluster/data/danRer3/bed/ZonLab/rhMap/rhSequenceSubmit022306/isPcr
      cp ${d}/primers/${f}.primers.fa ./notMatchedPrimers/
      cp ${d}/markerSeqs/${f}.fa ./notMatchedSeqs
    end
    tar cvzf primers.tar.gz notMatchedPrimers/*primers.fa
    tar cvzf markers.tar.gz notMatchedSeqs/*.fa
    # sent these to Yi Zhou by e-mail to see if they can look at them;
    # included the isPcr parameters.
    # From the PSL, extract the sequence: need tName, tStart and tEnd,
    # fields 14, 16 and 17. Then faFrag was used to get the sequence
    # from the FASTA file.
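    # For example (a sketch, not part of the original build; it assumes
    # per-target FASTA files named after tName), faFrag commands can be
    # generated straight from the PSL fields named above:
    awk '{print "faFrag " $14 ".fa " $16 " " $17 " " $10 "_" $14 ".fa"}' \
        out/*.psl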
############################################################################
## BLASTZ swap from panTro2 alignments (DONE 2006-05-07 markd)
    ssh hgwdev64
    mkdir /cluster/data/danRer3/bed/blastz.panTro2.swap
    ln -s blastz.panTro2.swap /cluster/data/danRer3/bed/blastz.panTro2
    cd /cluster/data/danRer3/bed/blastz.panTro2.swap
    time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 -stop=net \
        -swap -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
        /cluster/data/panTro2/bed/blastz.danRer3/DEF >& swap.out&
    # create the net files
    ssh hgwdev
    cd /cluster/data/danRer3/bed/blastz.panTro2.swap/axtChain
    nice netClass -verbose=0 -noAr noClass.net danRer3 panTro2 \
        danRer3.panTro2.net
###########################################################################
# LIFTOVER CHAINS TO DANRER4 (DONE, 2006-05-31 - 2006-06-06, hartera)
# CLEANUP BLAT DIRECTORY (DONE, 2006-12-14, hartera)
    # The split (using makeLoChain-split) of danRer4 is doc'ed in
    # makeDanRer4.doc
    # Do what makeLoChain-split says to do next (start the blat
    # alignment). Use pk as it runs faster than kk; the scripts only run
    # on kk, so run the steps manually.
    ssh pk
    mkdir -p /cluster/data/danRer3/bed/liftOver
    cd /cluster/data/danRer3/bed/liftOver
    cat << '_EOF_' > align.csh
#!/bin/csh -fe
set oldAssembly = $1
set oldNibDir = $2
set newAssembly = $3
set newSplitDir = $4
set ooc = $5
if ("$ooc" != "") then
    set ooc = '-ooc='$ooc
endif
set blatDir = /cluster/data/$oldAssembly/bed/blat.$newAssembly.`date +%Y-%m-%d`
echo "Setting up blat in $blatDir"
rm -fr $blatDir
mkdir $blatDir
cd $blatDir
mkdir raw psl run
cd run
echo '#LOOP' > gsub
echo 'blat $(path1) $(path2) {check out line+ ../raw/$(root1)_$(root2).psl} ' \
     '-tileSize=11 '$ooc' -minScore=100 -minIdentity=98 -fastMap' \
    >> gsub
echo '#ENDLOOP' >> gsub

# target
ls -1S $oldNibDir/*.{nib,2bit} > old.lst
# query
ls -1S $newSplitDir/*.{nib,fa} > new.lst

gensub2 old.lst new.lst gsub spec
/parasol/bin/para create spec
echo ""
echo "First two lines of para spec:"
head -2 spec
echo ""
echo "DO THIS NEXT:"
echo "    cd $blatDir/run"
echo "    para try, check, push, check, ..."
echo ""
exit 0
'_EOF_'
    # << emacs
    chmod +x align.csh
    align.csh danRer3 /san/sanvol1/scratch/danRer3/nib danRer4 \
        /san/sanvol1/scratch/danRer4/split10k \
        /san/sanvol1/scratch/danRer4/danRer4_11.ooc >&! align.log &
    # Took a few seconds.
    # Do what its output says to do next (start the cluster job)
    cd /cluster/data/danRer3/bed/blat.danRer4.2006-05-31/run
    para try, check, push, check, ...
    para time
    # Completed: 784 of 784 jobs
    # CPU time in finished jobs:    2011355s   33522.59m   558.71h   23.28d  0.064 y
    # IO & Wait Time:                  3926s      65.43m     1.09h    0.05d  0.000 y
    # Average job time:                2571s      42.84m     0.71h    0.03d
    # Longest running job:                0s       0.00m     0.00h    0.00d
    # Longest finished job:          205412s    3423.53m    57.06h    2.38d
    # Submission to last job:        219860s    3664.33m    61.07h    2.54d
    ssh pk
    cd /cluster/data/danRer3/bed/liftOver
    cat << '_EOF_' > lift.csh
#!/bin/csh -ef
set oldAssembly = $1
set newAssembly = $2
set newLiftDir = /san/sanvol1/scratch/$newAssembly/split10k
set prefix = /cluster/data/$oldAssembly/bed/blat.$newAssembly
set blatDir = `ls -td $prefix.20* | head -1`
echo "using dir $blatDir"
if ( ! -e $blatDir/raw ) then
    echo "Can't find $blatDir/raw"
    exit 1
endif
if (`ls -1 $newLiftDir/*.lft | wc -l` < 1) then
    echo "Can't find any .lft files in $newLiftDir"
    exit 1
endif
cd $blatDir/raw
foreach chr (`awk '{print $1;}' /cluster/data/$newAssembly/chrom.sizes`)
    echo $chr
    liftUp -pslQ ../psl/$chr.psl $newLiftDir/$chr.lft warn chr*_$chr.psl
end

set execDir = $0:h
echo ""
echo "DO THIS NEXT:"
echo "    ssh pk"
echo "    $execDir/makeLoChain-chain $oldAssembly <$oldAssembly-nibdir> $newAssembly <$newAssembly-nibdir>"
echo ""
exit 0
'_EOF_'
    # << emacs
    chmod +x lift.csh
    lift.csh danRer3 danRer4 >&! lift.log &
    # makeLoChain-chain can be run on pk. chain the alignments:
    makeLoChain-chain danRer3 /san/sanvol1/scratch/danRer3/nib \
        danRer4 /san/sanvol1/scratch/danRer4/nib >&! chain.log &
    cd /cluster/data/danRer3/bed/blat.danRer4.2006-05-31/chainRun
    para try, check, push, check, ...
    para time
    # Completed: 28 of 28 jobs
    # CPU time in finished jobs:       3414s      56.91m     0.95h    0.04d  0.000 y
    # IO & Wait Time:                  3256s      54.26m     0.90h    0.04d  0.000 y
    # Average job time:                 238s       3.97m     0.07h    0.00d
    # Longest running job:                0s       0.00m     0.00h    0.00d
    # Longest finished job:             280s       4.67m     0.08h    0.00d
    # Submission to last job:           280s       4.67m     0.08h    0.00d
    # net the alignment chains
    ssh kkstore02
    cd /cluster/data/danRer3/bed/liftOver
    makeLoChain-net danRer3 danRer4 >&! net.log &
    # load the reference to over.chain into the database table,
    # and create symlinks in /gbdb and the download area
    ssh hgwdev
    cd /cluster/data/danRer3/bed/liftOver
    makeLoChain-load danRer3 danRer4 >&! load.log &
    # clean up
    rm *.log
    # remake md5sum.txt to include this new liftOver file
    cd /usr/local/apache/htdocs/goldenPath/danRer3/liftOver
    rm md5sum.txt
    md5sum *.gz > md5sum.txt
    # copy README.txt from another liftOver directory if it is not there
    # already.
    # test by converting a region using the "convert" link on the
    # browser, and comparing to a blat of the same region
    # CLEANUP blat directory (2006-12-14, hartera)
    ssh kkstore02
    rm -r /cluster/data/danRer3/bed/blat.danRer4.2006-05-31
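    # A command-line spot check of the new chain (a sketch, not part of
    # the original build; test.bed is a hypothetical BED file of danRer3
    # regions):
    liftOver test.bed \
  /usr/local/apache/htdocs/goldenPath/danRer3/liftOver/danRer3ToDanRer4.over.chain.gz \
        test.danRer4.bed test.unmapped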
###########################################################################
# CREATE MICROARRAY DATA TRACK BY ADDING ZON LAB WILD TYPE MICROARRAY DATA
# TO AFFY ZEBRAFISH ALIGNMENTS (DONE, 2006-06-10, hartera)
# UPDATE ARRAY DATA TRACK AFTER PROCESSING ARRAY DATA DIFFERENTLY AND
# RELOADING INTO hgFixed (see hgFixed.txt for details).
# (DONE, 2006-10-20, hartera)
# UPDATE ARRAY DATA TRACK AFTER REPROCESSING ARRAY DATA TO ANTILOG THE
# LOG2 VALUES FROM NORMALISATION TO GET THE ABSOLUTE VALUES AND
# RELOADING INTO hgFixed (see hgFixed.txt for details).
# (DONE, 2007-01-08, hartera)
# RE-ORDERED DISPLAY IN TRACK (DONE, hartera, 2007-04-09)
    # The array data is for whole embryos of five wild-type zebrafish
    # strains. Data is in hgFixed (see hgFixed.doc) - from Len Zon's lab
    # at Children's Hospital Boston.
    # Contact: adibiase@enders.tch.harvard.edu
    ssh hgwdev
    mkdir /cluster/data/danRer3/bed/ZonLab/wtArray
    cd /cluster/data/danRer3/bed/ZonLab/wtArray
    # use the AllRatio table for mapping. There are not many arrays in
    # this dataset, so using AllRatio will allow the selection of All
    # Arrays from the track controls on the track description page. Also
    # set up the Zebrafish microarrayGroups.ra so that the Medians of
    # replicates or Means of replicates can also be selected for display.
    # Create mapped data in zebrafishZonWT.bed.
    rm zebrafishZonWT.bed
    hgsql -e 'drop table affyZonWildType;' danRer3
    hgMapMicroarray zebrafishZonWT.bed hgFixed.zebrafishZonWTAllRatio \
        /cluster/data/danRer3/bed/affyZebrafish/affyZebrafish.psl
    # Loaded 15617 rows of expression data from
    # hgFixed.zebrafishZonWTMedian
    # Mapped 14494, multiply-mapped 4102, missed 0, unmapped 1123
    # Load the mapped data into the database:
    hgLoadBed danRer3 affyZonWildType zebrafishZonWT.bed
    # Loaded 18596 elements of size 15
    # add a trackDb.ra entry at the trackDb/zebrafish level
    # look at the range of scores:
    hgsql -N -e 'select expScores from zebrafishZonWTAllRatio;' hgFixed \
        > ratioExps.out
    perl -pi.bak -e 's/,/\n/g' ratioExps.out
    sort ratioExps.out | uniq -c > ratioExps.uniq.count
    textHistogram -binSize=0.5 -real -maxBinCount=40 -minVal=-10 \
        ratioExps.out > expRatios.hist
    # Most values are between -3 and +2.
    # Therefore use the following trackDb entry:
    # track affyZonWildType
    # shortLabel Wild Type Array
    # longLabel Zon Lab Expression data for Wild Type Zebrafish strains
    # group regulation
    # priority 80
    # visibility hide
    # type expRatio
    # expScale 2.0
    # expStep 0.2
    # groupings affyZonWildTypeGroups
    # The .ra file in /usr/local/apache/cgi-bin/hgCgiData/Zebrafish
    # (from ~/kent/src/hg/makeDb/hgCgiData/Zebrafish in the source tree),
    # which is microarrayGroups.ra, defines how the array data is
    # displayed and also how it is grouped for the Medians and Means of
    # Replicates. It also defines the labels for the track controls for
    # showing All Arrays, Arrays Grouped By Replicate Means or Arrays
    # Grouped By Replicate Medians. This is in the description field.
    # RE-ORDER DISPLAY IN TRACK - (hartera, 2007-04-09)
    ssh hgwdev
    cd ~/kent/src/hg/makeDb/hgCgiData/Zebrafish
    # 14 somites and 15 somites should come before 36 hpf;
    # the 14-19 somites stage is 16-19 h.
    # from hgFixed.zebrafishZonWTAllExps:
    # for AB, 0-8 should go after 14,
    # for TL, 16-22 should go after 24,
    # for TU, 25-27 should go after 32
    # re-order accordingly in the config file:
    cd /cluster/data/danRer4/bed/ZonLab/wtArray
    cat << '_EOF_' > formatArray
#!/usr/bin/awk -f
BEGIN {FS=","} {OFS=","}
/expIds/ {
    sub(/expIds /,"",$0);
    print "expIds "$10,$11,$12,$13,$14,$15,$1,$2,$3,$4,$5,$6,$7,$8,$9,$16,$24,$25,$17,$18,$19,$20,$21,$22,$23,$29,$30,$31,$32,$33,$26,$27,$28,$34;
    next;
}
/names AB-36-hpf,AB-36-hpf 2/ {
    sub(/names /,"",$0);
    print "names "$10,$11,$12,$13,$14,$15,$1,$2,$3,$4,$5,$6,$7,$8,$9,$16,$24,$25,$17,$18,$19,$20,$21,$22,$23,$29,$30,$31,$32,$33,$26,$27,$28,$34;
    next;
}
/names AB-36-hpf,AB-14-somites/ {
    sub(/names /,"",$0);
    print "names "$2,$1,$3,$5,$4,$7,$8,$6,$9;
    next;
}
/groupSizes 9/ {
    sub(/groupSizes /,"",$0);
    print "groupSizes "$2,$1,$3,$5,$4,$7,$8,$6,$9;
    next;
}
{ print $0; }
'_EOF_'
    # << keep emacs coloring happy
    chmod +x formatArray
    formatArray ~/kent/src/hg/makeDb/hgCgiData/Zebrafish/microarrayGroups.ra \
        > microarrayGroups2.ra
    cp microarrayGroups2.ra \
       ~/kent/src/hg/makeDb/hgCgiData/Zebrafish/microarrayGroups.ra
    cd ~/kent/src/hg/makeDb/hgCgiData/
    make my
    # after doing make, check this on hgwdev-hartera,
    # then commit to CVS as it works fine.
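    # (An aside on the expScale/expStep choice above: the observed value
    # range can be confirmed directly; a sketch, not part of the original
    # build, run in the wtArray directory:)
    #   sort -g ratioExps.out | sed -n '1p;$p'
    # prints the minimum and maximum ratio values.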
###########################################################################
# BUILD GENE SORTER TABLES (AKA FAMILY BROWSER)
# (DONE, 2006-06-08 - 2006-06-12, hartera)
# Zon Lab WT Affy data tables in hgFixed renamed to reflect that the data
# is log2 transformed (DONE, 2006-07-30, hartera)
# Recreate the ensToAffyZebrafish and ensToAffyZonWildType tables after
# updating the Affy Zebrafish track with the different filtering used for
# the Blat alignments - see the UPDATE AFFY ZEBRAFISH TRACK section. Also
# the Affy Zon Lab Wild Type Array data was updated with a different
# method of processing - see hgFixed.txt (DONE, 2006-10-25, hartera)
# Recreated the ensCanonical and ensIsoforms tables after updating
# proteinID in the ensGene table (DONE, 2006-11-06, hartera)
    # This should be done after creating the ensGene, ensGtp and ensPep
    # tables for the Ensembl Genes track.
    # The BlastTab tables are already built - see HGNEAR PROTEIN BLAST
    # TABLES. Blastp of self is the ensZfishBlastTab table.
    # Other blastp ortholog tables are: hgBlastTab (hg18), mmBlastTab
    # (mm8), rnBlastTab (rn4), dmBlastTab (dm2), ceBlastTab (ce2) and
    # sacCerBlastTab (sacCer1).
    ssh hgwdev
    mkdir /cluster/data/danRer3/bed/geneSorter.2006-06-08
    ln -s /cluster/data/danRer3/bed/geneSorter.2006-06-08 \
          /cluster/data/danRer3/bed/geneSorter
    cd /cluster/data/danRer3/bed/geneSorter
    # Create a table that maps between known genes and RefSeq.
    # The index is only on the first 16 characters, too short for Ensembl
    # names, so manually changed hgMapToGene to create the index with 20
    # characters on name, and used a local copy of the program.
    $HOME/bin/x86_64/hgMapToGene danRer3 refGene ensGene ensToRefSeq
    # hgsql -e 'select count(*) from ensToRefSeq;' danRer3
    # 9707
    # Create a table that maps between Ensembl genes and LocusLink.
    # LocusLink is now called Entrez Gene.
    hgsql -N -e "select mrnaAcc,locusLinkId from refLink" danRer3 \
        > refToLl.txt
    $HOME/bin/x86_64/hgMapToGene danRer3 refGene ensGene \
        ensToLocusLink -lookup=refToLl.txt
    # Update the following three tables after the update of the Affy
    # Zebrafish and Affy Zon Lab Wild Type data (2006-10-25):
    hgsql -e 'drop table ensToAffyZebrafish;' danRer3
    hgsql -e 'drop table ensToAffyZonWildType;' danRer3
    hgsql -e 'drop table zebrafishZonWTDistance;' danRer3
    # Create a table that maps between Ensembl genes and the Affy
    # Zebrafish probeset consensus sequences.
    $HOME/bin/x86_64/hgMapToGene danRer3 affyZebrafish ensGene \
        ensToAffyZebrafish
    # Create a table that maps between Ensembl genes and the Zon lab
    # microarray expression data.
    $HOME/bin/x86_64/hgMapToGene "-type=bed 12" danRer3 affyZonWildType \
        ensGene ensToAffyZonWildType
    # Create the expression distance table.
    nice hgExpDistance danRer3 hgFixed.zebrafishZonWTMedianRatio \
        hgFixed.zebrafishZonWTMedianExps zebrafishZonWTDistance \
        -lookup=ensToAffyZebrafish &
    # Have 15617 elements in hgFixed.zebrafishZonWTMedian
    # Got 8911 unique elements in hgFixed.zebrafishZonWTMedian
    # Made zebrafishZonWTDistance.tab
    # Loaded zebrafishZonWTDistance
    # Made query index
    # Took 2 minutes.
    # To allow the data to be viewed in the Gene Sorter, add hgNearOk=1
    # to the dbDb table entry for danRer3 on hgcentraltest -
    # see the section on MAKE HGCENTRALTEST ENTRY FOR DANRER3.
    # added a protein ID field to ensGene before running hgClusterGenes
    # Cluster together the various alt-splicing isoforms; this creates
    # the ensIsoforms and ensCanonical tables (the knownIsoforms and
    # knownCanonical equivalents).
    # Rebuild these after updating the ensGene table with protein IDs
    # from UniProt with >= 90% identity to Ensembl proteins.
    # (2006-11-06, hartera)
    hgsql -e 'drop table ensIsoforms;' danRer3
    hgsql -e 'drop table ensCanonical;' danRer3
    hgClusterGenes danRer3 ensGene ensIsoforms ensCanonical
    # Got 22877 clusters, from 32143 genes in 28 chromosomes
    # There are also 22877 genes in the ensGtp table, so this is in
    # agreement.
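    # That agreement can be re-checked at any time (a sketch, not part of
    # the original build; it assumes the usual knownIsoforms-style schema
    # with a clusterId column and the ensGtp gene column):
    hgsql -N -e 'select count(distinct clusterId) from ensIsoforms;' danRer3
    hgsql -N -e 'select count(distinct gene) from ensGtp;' danRer3
    # both should report 22877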
#######################################################################
# UPDATE AFFY ZEBRAFISH TRACK USING BLAT WITHOUT THE -mask OPTION,
# USING THE -repeats OPTION AND DIFFERENT FILTERING TO REMOVE SHORT
# ALIGNMENTS (DONE, 2006-09-27, hartera)
# With the previous version of this track, QA found a number of short
# alignments of <= 30 bp, and there are a number in the <= 50 bp range.
# These do not seem to be meaningful, so the filtering was changed to try
# to remove these alignments while retaining meaningful ones.
# pslCDnaFilter was used with the same settings as used for the
# GenBank EST alignments for zebrafish.
# Also use -minIdentity=90 for Blat instead of -minIdentity=95, since the
# higher minIdentity caused alignments to be dropped that should not be.
# Blat's minIdentity seems to be more severe than that for pslReps or
# pslCDnaFilter, as it takes insertions and deletions into account.
# These are Jim's recommendations.
# Remove old Affy zebrafish directories (DONE, 2006-12-13, hartera)
    # The array chip sequences were already downloaded for danRer1.
    ssh hgwdev
    cd /projects/compbio/data/microarray/affyZebrafish
    mkdir -p /san/sanvol1/scratch/affy
    cp /projects/compbio/data/microarray/affyZebrafish/Zebrafish_consensus.fa \
       /san/sanvol1/scratch/affy/
    # Set up a cluster job to align the Zebrafish consensus sequences to
    # danRer3. Remove the old link and create a new one:
    rm /cluster/data/danRer3/bed/affyZebrafish
    mkdir -p /cluster/data/danRer3/bed/affyZebrafish.2006-09-27
    ln -s /cluster/data/danRer3/bed/affyZebrafish.2006-09-27 \
          /cluster/data/danRer3/bed/affyZebrafish
    # Align the sequences on the pitakluster. Scaffolds were aligned for
    # NA and Un and lifted to chrom level afterwards. Chroms 1-25 and M
    # were aligned as ~5 Mb chunks.
    ssh pk
    cd /cluster/data/danRer3/bed/affyZebrafish
    mv /san/sanvol1/scratch/danRer3/trfFaChromsAndScafs/scaffold*.fa \
       /san/sanvol1/scratch/danRer3/
    ls -1 /san/sanvol1/scratch/affy/Zebrafish_consensus.fa > affy.lst
    foreach f (/san/sanvol1/scratch/danRer3/trfFaChromsAndScafs/*.fa)
        ls -1 $f >> genome.lst
    end
    wc -l genome.lst
    # 15149 genome.lst
    # for output:
    mkdir -p /san/sanvol1/scratch/danRer3/affy/psl
    # use the -repeats option to report matches to repeat bases
    # separately from other matches in the PSL output.
    echo '#LOOP\n/cluster/bin/x86_64/blat -fine -repeats=lower -minIdentity=90 -ooc=/san/sanvol1/scratch/danRer3/danRer3_11.ooc $(path1) $(path2) {check out line+ /san/sanvol1/scratch/danRer3/affy/psl/$(root1)_$(root2).psl}\n#ENDLOOP' > template.sub
    gensub2 genome.lst affy.lst template.sub para.spec
    para create para.spec
    para try, check, push ... etc.
    para time
    # Completed: 15149 of 15149 jobs
    # CPU time in finished jobs:      34672s     577.87m     9.63h    0.40d  0.001 y
    # IO & Wait Time:                 41580s     692.99m    11.55h    0.48d  0.001 y
    # Average job time:                   5s       0.08m     0.00h    0.00d
    # Longest running job:                0s       0.00m     0.00h    0.00d
    # Longest finished job:             145s       2.42m     0.04h    0.00d
    # Submission to last job:          1400s      23.33m     0.39h    0.02d
    # need to do pslSort and lift up
    ssh pk
    cd /san/sanvol1/scratch/danRer3/affy
    # Do the sort, liftUp and then the best-in-genome filter.
    # Only use alignments that have at least 95% identity in the aligned
    # region. Previously did not use minCover, since a lot of sequence is
    # in Un and NA, so genes may be split up, and it is good to see all
    # alignments. However, a number of short alignments of <= 50 bp were
    # found. These are not meaningful, so minCover is needed; if it is
    # increased too much, though, hits on poor parts of the assembly will
    # be missed.
    # use pslCDnaFilter with the same parameters as used for the
    # zebrafish GenBank EST alignments.
    pslSort dirs raw.psl tmp psl
    pslCDnaFilter -localNearBest=0.005 -minQSize=20 -minNonRepSize=16 \
        -ignoreNs -bestOverlap -minId=0.95 -minCover=0.15 raw.psl contig.psl
    #                       seqs    aligns
    #              total:   14886   830753
    # drop minNonRepSize:   2753    745330
    #      drop minIdent:   2645    38916
    #      drop minCover:   2472    10516
    #         weird over:   384     1529
    #         kept weird:   308     403
    #     drop localBest:   2559    17395
    #               kept:   14494   18596
    # 97.3% were kept.
    # There were 15502 Affy sequences aligned originally, so 93.5% now
    # remain.
    # lift up the coordinates to chrom level
    # pslReps -minAli=0.95 -nearTop=0.005 raw.psl contig.psl /dev/null
    # lift up chrom contigs to chrom level
    cat /cluster/data/danRer3/jkStuff/liftAll.lft \
        /cluster/data/danRer3/liftSupertoChrom/liftNAandUnScaffoldsToChrom.lft \
        > allLift.lft
    liftUp affyZebrafish.psl allLift.lft warn contig.psl
    # Got 30168 lifts in allLift.lft
    # Lifting contig.psl
    # rsync these psl files
    rsync -a --progress /san/sanvol1/scratch/danRer3/affy/*.psl \
        /cluster/data/danRer3/bed/affyZebrafish/
    ssh kkstore02
    cd /cluster/data/danRer3/bed/affyZebrafish
    # shorten the names in the psl file
    sed -e 's/Zebrafish://' affyZebrafish.psl > affyZebrafish.psl.tmp
    mv affyZebrafish.psl.tmp affyZebrafish.psl
    pslCheck affyZebrafish.psl
    # psl is good
    # load the track into the database
    ssh hgwdev
    cd /cluster/data/danRer3/bed/affyZebrafish
    hgsql -e 'drop table affyZebrafish;' danRer3
    hgLoadPsl danRer3 affyZebrafish.psl
    # Add the consensus sequences for the Zebrafish chip.
    # Copy the sequences to gbdb if they are not there already:
    mkdir -p /gbdb/hgFixed/affyProbes
    ln -s \
      /projects/compbio/data/microarray/affyZebrafish/Zebrafish_consensus.fa \
      /gbdb/hgFixed/affyProbes
    # these sequences were loaded previously so there is no need to
    # reload them.
    hgLoadSeq -abbr=Zebrafish: danRer3 \
        /gbdb/hgFixed/affyProbes/Zebrafish_consensus.fa
    # Clean up
    rm batch.bak contig.psl raw.psl
    # check the number of short alignments:
    hgsql -e \
    'select count(*) from affyZebrafish where (qEnd - qStart) <= 50;' danRer3
    # 6
    # for the previous filtered set, there were 1195 alignments of
    # <= 50 bp, so this has improved.
    hgsql -e 'select count(distinct(qName)) from affyZebrafish;' danRer3
    # 14494
    # Previously 14335 distinct Affy sequences were aligned. Many of the
    # short alignments may also have longer alignments to different
    # regions of the genome that are good.
    # CLEANUP:
    # remove old Affy Zebrafish alignment directories (hartera, 2006-12-13)
    ssh kkstore02
    cd /cluster/data/danRer3/bed
    rm -r affyZebrafish.2005-08-19
    rm -r affyZebrafish.2005-09-25
#########################################################################
# NEW RH MAP SEQUENCES FOR TRACK (in progress, 2006-10-12, hartera)
    # Data from Yi Zhou at Boston Children's Hospital:
    # yzhou@enders.tch.harvard.edu
    ssh kkstore02
    mkdir /cluster/data/danRer3/bed/rhMap-2006-10-03
    cd /cluster/data/danRer3/bed
    ln -s rhMap-2006-10-03 rhMap
    # download the data files from e-mail:
    # rhSequenceSubmit100306.zip and rhSequenceSubmitSeq100306.zip
    unzip rhSequenceSubmit100306.zip
    unzip rhSequenceSubmitSeq100306.zip
    dos2unix rhSequenceSubmit100306.txt
    dos2unix rhSequenceSubmitSeq100306.txt
    # need to convert the format of the FASTA file to remove the line
    # numbers
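    # The sequence lines carry leading base numbering ("NNN SEQUENCE");
    # a minimal awk sketch of that cleanup (the rhFix script in the
    # REMAKE section below handles this together with the header
    # renaming; the output name here is hypothetical):
    awk '/^[0-9]+ / { $0 = $2 } { print }' rhSequenceSubmitSeq100306.txt \
        > rhMapSeqs.fa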
###########################################################################
# BACENDS CLEANUP (DONE, 2006-12-13, hartera)
    ssh kkstore02
    cd /cluster/data/danRer3/bed/bacends
    mv ./seqs/getCloneEnds.csh .
    rm CHORI73.* DH.* DHBacs.fullnames DHmorethan2.*
    rm bacEndsChroms.psl bacNAandUnScafs.psl
    rm bacends.lst genome.lst names.psl namesPls.uniq header pslCheck.log \
       raw*
    rm -r /san/sanvol1/scratch/danRer3/bacends/scaffoldsNAandUnPsl
    rm -r /cluster/data/danRer3/bed/bacends/scaffoldsNAandUnPsl
    rm -r newPairs2
    rm -r /san/sanvol1/scratch/danRer3/bacends/scaffoldsNAandUnRun
    rm -r /cluster/data/danRer3/bed/bacends/scaffoldsNAandUnRun
    rm -r singles pairs scores
    rm -r ./cloneandStsAliases/tmp
    rm ./cloneandStsAliases/*.bak ./cloneandStsAliases/*.tab \
       ./cloneandStsAliases/*.sort ./cloneandStsAliases/*.uniq
    rm DH_bacends.fa
    rm -r liftedPsl
    # the psl directory is large, so gzip the contents
    cd psl
    gzip *.psl
#########################################################################
## Reorder Fish organisms (DONE - 2006-12-22 - Hiram)
    hgsql -h genome-testdb hgcentraltest \
        -e "update dbDb set orderKey = 451 where name = 'danRer3';"
##########################################################################
# GenBank gbMiscDiff table (markd 2007-01-10)
# Supports the `NCBI Clone Validation' section of the mgcGenes details
# page.
    # genbank release 157.0 now contains misc_diff fields for MGC clones;
    # reloading the mRNAs results in the gbMiscDiff table being created.
    ./bin/gbDbLoadStep -reload -srcDb=genbank -type=mrna danRer3
###########################################################################
# REMAKE RADIATION HYBRID (RH) MAP TRACK (DONE, 2007-02-14, hartera)
# Use the update of the Radiation Hybrid map data from October 2006 and
# the method documented in danRer4.txt to map these sequences to danRer3.
# Data from Yi Zhou at Boston Children's Hospital:
# yzhou@enders.tch.harvard.edu
# Latest RH map sequences and primers received on 2006-10-03 from
# Anhua (Peter) Song - asong@enders.tch.harvard.edu
    ssh kkstore02
    mkdir /cluster/data/danRer3/bed/ZonLab/rhMap-2006-10-03
    cd /cluster/data/danRer3/bed/ZonLab
    ln -s rhMap-2006-10-03 rhMap
    cd rhMap
    # download the data files from e-mail:
    # rhSequenceSubmit100306.zip and rhSequenceSubmitSeq100306.zip
    unzip rhSequenceSubmit100306.zip
    unzip rhSequenceSubmitSeq100306.zip
    dos2unix rhSequenceSubmit100306.txt
    dos2unix rhSequenceSubmitSeq100306.txt
    # Sequences are in rhSequenceSubmitSeq100306.txt; primers and other
    # information are in rhSequenceSubmit100306.txt
    grep '>' rhSequenceSubmitSeq100306.txt | wc -l
    # 11514
    wc -l rhSequenceSubmit100306.txt
    # 13438 rhSequenceSubmit100306.txt
    grep '>' rhSequenceSubmitSeq100306.txt > rhMap.names
    # remove '>' from the names and grab the first field
    perl -pi.bak -e 's/>//' rhMap.names
    awk 'BEGIN {FS="|"} {print $1;}' rhMap.names | sort | uniq \
        > rhMap.namesOnly.sort
    awk 'BEGIN {FS="|"} {print $1;}' rhSequenceSubmit100306.txt \
        | sort | uniq > rhMapPrimers.namesOnly.sort
    wc -l *.sort
    # 11514 rhMap.namesOnly.sort
    # 13436 rhMapPrimers.namesOnly.sort (after removing a blank line)
    # get a list of headers from the FASTA file
    grep '>' rhSequenceSubmitSeq100306.txt > rhMap.headers
    awk 'BEGIN {FS="|"} {print $5;}' rhMap.headers | sort | uniq
    # BAC_END
    # EST
    # GENE
    # SSLP
    # STS
    # There are 5 types of sequence here.
    awk 'BEGIN {FS="|"} {print $9;}' rhMap.headers | sort | uniq
    # BACends
    # Custom
    # Insertion_Mutant
    # Insertion_Mutants
    # MGH
    # NCBI
    # Sanger SG
    # Sequencing_Project
    # ThisseClone
    # Thisse_Clone
    # other_zfEst
    # wu_zfEst
    # wz
    awk 'BEGIN {FS="|"} {print $10;}' rhMap.headers | sort | uniq
    # CHBG
    # MPIEB
    # Insertion_Mutant = Insertion_Mutants; ThisseClone = Thisse_Clone;
    # so there are 11 different sources.
    # There are 2 sequences with problem primers. E-mailed Peter Song
    # about these and he suggested deleting those primers:
    # >fb33f01.u1|5|388|5615|EST|f|cR|f|wu_zfEst|CHBG|+++33333333333333333333.|
    # >zfishb-a976e04.p1c|14|16|158|STS|f|cR|f|Sequencing_Project|CHBG|A|A|
    # edit rhMap022306.fa and rhMapPrimers022306.txt and delete these
    # primers.
    # Need to reformat the FASTA headers so they are in the format:
    # NAME.SOURCE.TYPE.ORIGIN
    # Insertion_Mutant = Insertion_Mutants; Thisse_Clone = ThisseClone,
    # so change these to have the same name. Also shorten Sanger SG to
    # Shotgun.
    sed -e 's/Insertion_Mutants/InsertMut/' rhSequenceSubmitSeq100306.txt \
        | sed -e 's/Insertion_Mutant/InsertMut/' \
        | sed -e 's/Sanger SG/Shotgun/' \
        | sed -e 's/ThisseClone/Thisse/' \
        | sed -e 's/Thisse_Clone/Thisse/' \
        | sed -e 's/Sequencing_Project/Seqproj/' > rhMap100306.fa
    # Do the same for the primers and information file:
    sed -e 's/Insertion_Mutants/InsertMut/' rhSequenceSubmit100306.txt \
        | sed -e 's/Insertion_Mutant/InsertMut/' \
        | sed -e 's/Sanger SG/Shotgun/' \
        | sed -e 's/ThisseClone/Thisse/' \
        | sed -e 's/Thisse_Clone/Thisse/' \
        | sed -e 's/Sequencing_Project/Seqproj/' > rhMapPrimers100306.txt
    # edit these files to remove the extra newline char after the first
    # primer for 1942c, and then change the "/" in
    # FJ34C05.Y1/FJ56G09.Y1.WU_ZFEST to an underscore
    # (2007-02-14, hartera)
    perl -pi.bak -e 's/fj34c05\.y1\/fj56g09/fj34c05\.y1_fj56g09/' \
        rhMap100306.fa
    perl -pi.bak -e 's/fj34c05\.y1\/fj56g09/fj34c05\.y1_fj56g09/' \
        rhMapPrimers100306.txt
    # Use a script to reformat the names in the FASTA headers to the
    # format >NAME.SOURCE, where name is the first field separated by "|"
    # and source is the 9th field. The source is used to make the name
    # unique: some of these names are BAC ends that occur in the BAC ends
    # track, so there would be name clashes in the seq table if the names
    # were not made unique.
    # Also make the name upper case, as for the danRer1 and danRer2 RH
    # maps, and remove the base numbering on each sequence line of the
    # FASTA file.
    cat << '_EOF_' > rhFix
#!/usr/bin/awk -f
#>z1396|14|418|5707|SSLP|f|cR|f|MGH|MPIEB|ATCCTTCAGCCACTCCTTCA|TGGAACCTGAAAAACACACG|
/^>/ {
    split(toupper($0), a, "\\|");
    print a[1]"."a[9];
    next;
}
/^[0-9]+ / {
    $0 = $2;
}
{ print $0; }
'_EOF_'
    # << keep emacs coloring happy
    chmod +x rhFix
    rhFix rhMap100306.fa > rhMap.fa
    # Blat the sequences vs the danRer3 genome
    ssh pk
    mkdir -p /cluster/data/danRer3/bed/ZonLab/rhMap/blatRun
    cd /cluster/data/danRer3/bed/ZonLab/rhMap
    # put the rhMap sequences on the san
    mkdir -p /san/sanvol1/scratch/danRer3/rhMap
    cp rhMap.fa /san/sanvol1/scratch/danRer3/rhMap/
    # do a blat run to align the RH map sequences to danRer3, using
    # chrNA_random and chrUn_random separated into scaffolds.
    cd blatRun
    ls -1S /san/sanvol1/scratch/danRer3/rhMap/rhMap.fa > rhMap.lst
    foreach f (/san/sanvol1/scratch/danRer3/trfFaChromsAndScafs/*.fa)
        ls -1S $f >> genome.lst
    end
    wc -l genome.lst
    # 15149 genome.lst
    # for output:
    mkdir -p /san/sanvol1/scratch/danRer3/rhMap/psl
    # use the -repeats option to report matches to repeat bases
    # separately from other matches in the PSL output.
    echo '#LOOP\n/cluster/bin/x86_64/blat -repeats=lower -minIdentity=80 -ooc=/san/sanvol1/scratch/danRer3/danRer3_11.ooc $(path1) $(path2) {check out line+ /san/sanvol1/scratch/danRer3/rhMap/psl/$(root1)_$(root2).psl}\n#ENDLOOP' > template.sub
    gensub2 genome.lst rhMap.lst template.sub para.spec
    para create para.spec
    para try, check, push ... etc.
    para time
    # Completed: 15149 of 15149 jobs
    # CPU time in finished jobs:      13684s     228.07m     3.80h    0.16d  0.000 y
    # IO & Wait Time:                 38258s     637.63m    10.63h    0.44d  0.001 y
    # Average job time:                   3s       0.06m     0.00h    0.00d
    # Longest running job:                0s       0.00m     0.00h    0.00d
    # Longest finished job:              24s       0.40m     0.01h    0.00d
    # Submission to last job:           901s      15.02m     0.25h    0.01d
    # need to do pslSort and lift up
    ssh pk
    cd /san/sanvol1/scratch/danRer3/rhMap
    # Do the sort, liftUp and then the best-in-genome filter.
    pslSort dirs raw.psl tmp psl
    # Only use alignments that have at least 85% identity in the aligned
    # region. Previously did not use minCover, since a lot of sequence is
    # in Un and NA, so genes may be split up, and it is good to see all
    # alignments. However, a number of short alignments of <= 50 bp were
    # found. These are not meaningful, so minCover is needed; if it is
    # increased too much, though, hits on poor parts of the assembly will
    # be missed.
    # use pslCDnaFilter as for the zebrafish GenBank EST alignments, but
    # with the parameters as determined for danRer4:
    pslCDnaFilter -localNearBest=0.005 -minQSize=20 -minNonRepSize=16 \
        -ignoreNs -bestOverlap -minId=0.85 -minCover=0.33 raw.psl contig.psl
    #                       seqs    aligns
    #              total:   11060   1767931
    #       drop invalid:   1       1
    # drop minNonRepSize:   3047    1297013
    #      drop minIdent:   763     3913
    #      drop minCover:   4065    420022
    #         weird over:   288     4267
    #         kept weird:   130     189
    #     drop localBest:   2188    34092
    #               kept:   10447   12890
    # Percent sequences aligned: 10447/11514 = 90.7%
    # This is a compromise between reducing the number of sequences
    # piling up and not losing all alignments for too many sequences.
    awk '{print $10}' contig.psl | sort | uniq -c | sort -nr > contig.count
    head contig.count
    # 33 ZKP106G9.YA.BACENDS
    # 21 BZ83M20.Z.BACENDS
    # 12 ZK4I5.T7.BACENDS
    # 10 ZC27I3.ZA.BACENDS
    # 10 ZC261G9.ZAF.BACENDS
    # 10 ZC261G9.ZA.BACENDS
    #  8 ZK8O7.T7.BACENDS
    #  8 ZC77P2.ZB.BACENDS
    #  8 FJ89A05.X1.WU_ZFEST
    #  8 FJ07G09.X1.WU_ZFEST
    cd /cluster/data/danRer3/bed/ZonLab/rhMap
    # lift up to genome-level coordinates
    liftUp rhMap.psl \
        /cluster/data/danRer3/jkStuff/liftAllPlusliftScaffolds.lft warn \
        /san/sanvol1/scratch/danRer3/rhMap/contig.psl
    # Got 30168 lifts in
    # /cluster/data/danRer3/jkStuff/liftAllPlusliftScaffolds.lft
    pslCheck rhMap.psl
    # psl looks ok
    # cleanup
    rm *.bak *.sort
    # Load the sequence alignments into the database.
    ssh hgwdev
    cd /cluster/data/danRer3/bed/ZonLab/rhMap
    # drop the old table and reload the final psl file
    hgsql -e 'drop table rhMap;' danRer3
    hgLoadPsl danRer3 rhMap.psl
    # Copy the sequences to gbdb if they are not already there.
    mkdir -p /gbdb/danRer3/rhMap
    # remove the old sequences
    rm /gbdb/danRer3/rhMap/rhMap022306.fa
    ln -s \
       /cluster/data/danRer3/bed/ZonLab/rhMap/rhMap.fa \
       /gbdb/danRer3/rhMap/rhMap20061003.fa
    # then add the sequences to the database, removing the old ones first:
    hgsql -e 'select * from extFile where path like "%rhMap%";' danRer3
    # | id     | name           | path                               | size    |
    # +--------+----------------+------------------------------------+---------+
    # | 747628 | rhMap022306.fa | /gbdb/danRer3/rhMap/rhMap022306.fa | 7456861 |
    # +--------+----------------+------------------------------------+---------+
    hgsql -e 'select count(*) from seq where extFile = 747628;' danRer3
    hgsql -e 'delete from seq where extFile = 747628;' danRer3
    # delete from extFile:
    hgsql -e 'delete from extFile where id = 747628;' danRer3
    hgLoadSeq danRer3 /gbdb/danRer3/rhMap/rhMap20061003.fa
    # loaded successfully
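    # A quick check that the reloaded sequences point at the new file
    # (a sketch, not part of the original build):
    hgsql -e 'select id, name, size from extFile where name = "rhMap20061003.fa";' danRer3
    # then confirm the seq rows reference that id:
    # hgsql -e 'select count(*) from seq where extFile = <new id>;' danRer3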
    # Check in the Browser to see if there are many pileups; much reduced
    # now on chr24. Took 10 random sequences in the pileup from
    # minCover=0.20 and found that 7 of them still align to danRer4 with
    # minCover=0.33; of those that do not, 2 also have primers that do
    # not map using the hgPcr tool.
    # Add a trackDb entry and an rhMap.html for trackDb/zebrafish/danRer3;
    # also add the search specs for hgFindSpec to trackDb.ra
    # Add a table of related information for the RH map details pages:
    # check that all the headers from rhMap.headers are also in the
    # primers file, which seems to contain the same headers as the FASTA
    # file as well as additional markers.
    ssh kkstore02
    cd /cluster/data/danRer3/bed/ZonLab/rhMap/
    # The same RH map version was used as for danRer4, so the data for
    # the info table is the same as for danRer4; copy the file over. See
    # kent/src/hg/makeDb/doc/danRer4.txt for details on how this file is
    # produced.
    cp /cluster/data/danRer4/bed/ZonLab/rhMap/rhMapInfoWithZfinIds.tab .
    # load the info table
    ssh hgwdev
    cd /cluster/data/danRer3/bed/ZonLab/rhMap
    hgLoadSqlTab danRer3 rhMapZfishInfo ~/kent/src/hg/lib/rhMapZfishInfo.sql \
        rhMapInfoWithZfinIds.tab
    # add code to hgc.c to print the ZFIN ID, if available, on the
    # details page together with the other marker-related information.
    # added the track to trackDb.ra in trackDb/zebrafish/danRer3 with a
    # URL for the ZFIN IDs to link to the relevant page at
    # http://www.zfin.org and added an html page for the track.
#########################################################################
# BACENDS CLEANUP (DONE, 2007-03-27, hartera)
    ssh kkstore02
    cd /cluster/data/danRer3/bed/bacends
    # 23G in the bacends directory
    # remove the sequence file as it is already in the bacSeqs dir
    rm Zv5Bacends.fa
    # du -sh psl
    # 12G psl
    nice rm -r psl
    cd bacends.1
    rm bacEndAccs.aliases bacEnds.log bacEnds.names.sort bacPrs.names \
       bacs.log
    rm ch211 intNames intNames.count out test test.pl bacEndSingles.txt
    rm -r test2 bacEndAccs
    rm BACClones* BACEndAccs.txt *.accs allBacEnds* bacEndSingles.names
    cd ../scoresAndCoords
    rm allBacEnds.names.* bacEndSinglesGoodScores.bed error.log *.tab \
       singles.hits bacEnds.load.psl bacEnds.names
    rm -r tmp
    cd ../pairsNew
    # bacEndSingles.bed is already in singlesNew
    rm bacEnds.* bed.tab bacEndSingles.bed
    cd ../singlesNew
    rm singles.coordcheck bed.tab bacEnds.*
    cd ../duplicates
    # duplicatesNew is the latest directory, so remove everything else
    # from the duplicates directory
    rm *
    cd duplicatesNew
    rm log* *.lfs
    cd /cluster/data/danRer3/bed/bacends/cloneandStsAliases
    rm -r tmp
    rm bacClones* bacs.names log
    cd /cluster/data/danRer3/bed/
    du -sh bacends
    # 5.0G bacends
    # The BAC ENDS track was remade in May 2006 (see the REDO BACENDS
    # section), so the old 2005 version in newBacends can be removed.
    du -sh newBacends
    # 37G newBacends
    nice rm -r newBacends
#########################################################################
# AUTOMATE UPSTREAM FILE CREATION (2008-10-15 markd)
    echo danRer3 fr1 tetNig1 mm7 hg18 \
        > /hive/data/genomes/danRer3/bed/multiz5way/species.list
    # update genbank.conf:
    #   danRer3.upstreamGeneTbl = refGene
    #   danRer3.upstreamMaf = multiz5way /hive/data/genomes/danRer3/bed/multiz5way/species.list