# for emacs: -*- mode: sh; -*-

# Danio rerio (zebrafish) from Sanger, version Zv6 (released March 2006)

# Project website:
#     http://www.sanger.ac.uk/Projects/D_rerio/
# Assembly notes:
#     http://www.sanger.ac.uk/Projects/D_rerio/
#     ftp://ftp.ensembl.org/pub/assembly/zebrafish/Zv6release/Zv6_assembly_information.shtml

# NOTE: this doc may have genePred loads that fail to include
# the bin column. Please correct that for the next build by adding
# a bin column when you make any of these tables:
#
# mysql> SELECT tableName, type FROM trackDb WHERE type LIKE "%Pred%";
# +-----------+-------------------------+
# | tableName | type                    |
# +-----------+-------------------------+
# | refGene   | genePred refPep refMrna |
# | mgcGenes  | genePred                |
# | genscan   | genePred genscanPep     |
# +-----------+-------------------------+

###########################################################################
# DOWNLOAD SEQUENCE (DONE, 2006-03-29, hartera)
# CHANGED NAME OF SCAFFOLDS AGP FILE (DONE, 2006-04-13, hartera)
    ssh kkstore01
    mkdir /cluster/store8/danRer4
    ln -s /cluster/store8/danRer4 /cluster/data/danRer4
    cd /cluster/data/danRer4
    wget --timestamp \
        ftp://ftp.ensembl.org/pub/assembly/zebrafish/Zv6release/README
    wget --timestamp \
        ftp://ftp.ensembl.org/pub/assembly/zebrafish/Zv6release/Zv6.chunks.agp
    wget --timestamp \
        ftp://ftp.ensembl.org/pub/assembly/zebrafish/Zv6release/Zv6.scaffold.agp
    wget --timestamp \
        ftp://ftp.ensembl.org/pub/assembly/zebrafish/Zv6release/Zv6_scaffolds.fa
    wget --timestamp \
        ftp://ftp.ensembl.org/pub/assembly/zebrafish/Zv6release/Zv6_scaffolds.stats
    # keep agp file name consistent with Zv5 (hartera, 2006-04-13)
    mv Zv6.scaffold.agp Zv6.scaffolds.agp

###########################################################################
# DOWNLOAD MITOCHONDRION GENOME SEQUENCE (DONE, 2006-03-29, hartera)
# ADDED CHUNKS AGP FILE (DONE, 2006-04-13, hartera)
    ssh kkstore01
    mkdir -p /cluster/data/danRer4/M
    cd /cluster/data/danRer4/M
    # go to http://www.ncbi.nih.gov/ and search the Nucleotide database for
    # "Danio mitochondrion genome". That shows the gi number:
    # 8576324 for the accession, AC024175
    # Use that number in the entrez linking interface to get fasta:
    wget -O chrM.fa \
'http://www.ncbi.nlm.nih.gov/entrez/query.fcgi?cmd=Text&db=Nucleotide&uid=8576324&dopt=FASTA'
    # Edit chrM.fa: make sure the header line says it is the
    # Danio rerio mitochondrion complete genome, and then replace the
    # header line with just ">chrM".
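    # For example, the downloaded record can be eyeballed before editing
    # (an optional check, a sketch rather than part of the original log;
    # the single record should be 16596 bases, matching the chrM.agp
    # created below):
    grep '>' chrM.fa
    faSize chrM.fa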
    perl -pi.bak -e 's/>.+/>chrM/' chrM.fa
    rm *.bak
    # Make a "pseudo-contig" for processing chrM too:
    mkdir ./chrM_1
    sed -e 's/chrM/chrM_1/' ./chrM.fa > ./chrM_1/chrM_1.fa
    mkdir ./lift
    echo "chrM_1/chrM_1.fa.out" > ./lift/oOut.lst
    echo "chrM_1" > ./lift/ordered.lst
    # make sure this is tab delimited:
    echo "0\tM/chrM_1\t16596\tchrM\t16596" > ./lift/ordered.lft
    # create a .agp file for chrM as hgGoldGapGl and other
    # programs require a .agp file so create chrM.agp
    echo "chrM\t1\t16596\t1\tF\tAC024175.3\t1\t16596\t+" \
        > chrM.agp
    # Create a chrM.chunks.agp (hartera, 2006-04-13)
    mkdir -p /cluster/data/danRer4/M/agps
    cd /cluster/data/danRer4/M/agps
    awk 'BEGIN {OFS="\t"} \
        {print $1, $2, $3, $4, $5, $6, $7, $8, $1, $7, $8}' \
        ../chrM.agp > chrM.chunks.agp
    # make sure that all above *.agp files are tab delimited

###########################################################################
# CREATE LIST OF CHROMOSOMES (DONE, 2006-04-12, hartera)
# Change names of random chroms to chrNA_random and chrUn_random
# (DONE, hartera, 2006-04-21)
    ssh kkstore01
    cd /cluster/data/danRer4
    awk '{if ($1 !~ /Zv6/) print $1;}' Zv6.scaffolds.agp \
        | sort -n | uniq > chrom.lst
    cp chrom.lst chrom1to25.lst
    # add chrM, chrUn and chrNA
    echo "M" >> chrom.lst
    echo "NA" >> chrom.lst
    echo "Un" >> chrom.lst
    # Change names of random chroms to reflect that they are unordered
    # collections of scaffolds
    perl -pi.bak -e 's/NA/NA_random/' chrom.lst
    perl -pi.bak -e 's/Un/Un_random/' chrom.lst
    rm *.bak

###########################################################################
# MAKE JKSTUFF AND BED DIRECTORIES (DONE, 2006-04-12, hartera)
    ssh kkstore01
    cd /cluster/data/danRer4
    # This used to hold scripts -- better to keep them inline here
    # Now it should just hold lift file(s) and
    # temporary scripts made by copy-paste from this file.
    mkdir /cluster/data/danRer4/jkStuff
    # This is where most tracks will be built:
    mkdir /cluster/data/danRer4/bed

###########################################################################
# CHECK AGP FILES AND FASTA SIZE CONSISTENCY (DONE, 2006-04-13, hartera)
#
    ssh kkstore01
    cd /cluster/data/danRer4
    mkdir -p /cluster/data/danRer4/scaffolds
    cd /cluster/data/danRer4/scaffolds
    faSize detailed=on ../Zv6_scaffolds.fa > Zv6.scaffolds.sizes
    # Check that these sizes correspond to the sizes in the scaffolds agp file
    # use script compareSizes2.pl
cat << '_EOF_' > ../jkStuff/compareSizes2.pl
#!/usr/bin/perl -w
use strict;

my ($file, $agp);
$file = $ARGV[0];
$agp = $ARGV[1];

open(FILE, $file) || die "Can not open $file: $!\n";
open(AGP, $agp) || die "Can not open $agp: $!\n";
open(OUT, ">log.txt") || die "Can not create log.txt: $!\n";

my ($l, @f, $name, $size, %scafsHash);
while (<FILE>) {
   $l = $_;
   @f = split(/\t/, $l);
   $name = $f[0];
   $size = $f[1];
   $scafsHash{$name} = $size;
}
close FILE;
while (<AGP>) {
   my ($line, @fi, $scaf, $end);
   $line = $_;
   if ($line =~ /Zv/) {
      @fi = split(/\t/, $line);
      $scaf = $fi[5];
      $end = $fi[7];
      if (exists($scafsHash{$scaf})) {
         if ($scafsHash{$scaf} == $end) {
            print OUT "$scaf - ok\n";
         }
         else {
            print OUT "$scaf - different size to sequence\n";
         }
      }
      else {
         print OUT "$scaf - does not exist in list of sizes\n";
      }
   }
}
close AGP;
close OUT;
'_EOF_'
    # << happy emacs
    chmod +x ../jkStuff/compareSizes2.pl
    perl /cluster/data/danRer4/jkStuff/compareSizes2.pl \
        Zv6.scaffolds.sizes ../Zv6.scaffolds.agp
    grep different log.txt
    grep not log.txt
    # these are all consistent with the sequence sizes
    # check that the co-ordinates in the agp files are consistent:
    # field 2 is the start position, field 3 is the end and field 8 is the size
    # so check that this is
    # consistent.
    cd /cluster/data/danRer4
    awk '{if ($6 ~ /^Zv6/ && (($3-$2+1) != $8)) print $6;}' Zv6.scaffolds.agp \
        > Zv6.scaffolds.coordCheck
    # this file is empty so they are ok. do the same for the chunks.agp file
    awk '{if ($6 ~ /^Zv6/ && (($3-$2+1) != $8)) print $6;}' Zv6.chunks.agp \
        > Zv6.chunks.coordCheck
    # this file is empty so ok
    # check that the difference between 7th and 8th fields is the same as the
    # difference between 11th and 12th fields.
    awk '{if ($5 != "N" && (($8 - $7) != ($12 - $11))) print $6;}' \
        Zv6.chunks.agp > Zv6.chunks.coordCheck2
    # these are all ok
    rm Zv6.*.coord*
cat << '_EOF_' > ./jkStuff/checkSizesInAgps.pl
#!/usr/bin/perl -w
use strict;

my ($ch, $sc, %scafsHash);
$sc = $ARGV[0];   # scaffolds agp
$ch = $ARGV[1];   # chunks or contigs agp

open(SCAFS, $sc) || die "Can not open $sc: $!\n";
open(CHUNKS, $ch) || die "Can not open $ch: $!\n";

while (<SCAFS>) {
   my ($l, @f, $name, $e);
   $l = $_;
   @f = split(/\t/, $l);
   if ($f[5] =~ /^Zv/) {
      $name = $f[5];
      $e = $f[2];
      $scafsHash{$name} = $e;
   }
}
close SCAFS;

my $scaf = "";
my $prev = "";
my $prevEnd = 0;
while (<CHUNKS>) {
   my ($line, @fi);
   $line = $_;
   @fi = split(/\t/, $line);
   # if it is not a gap line
   if ($fi[4] ne "N") {
      $scaf = $fi[9];
      if (($scaf ne $prev) && ($prev ne "")) {
         checkCoords($prev, $prevEnd);
      }
      $prev = $scaf;
      $prevEnd = $fi[2];
   }
}
# check last entry in file
checkCoords($prev, $prevEnd);
close CHUNKS;

sub checkCoords {
   my ($name, $end) = @_;
   if (exists($scafsHash{$prev})) {
      if ($scafsHash{$prev} != $prevEnd) {
         my $ed = $scafsHash{$prev};
         print "Scaffold $prev is not consistent between agps\n";
      }
      else {
         my $ed = $scafsHash{$prev};
         print "Scaffold $prev - ok\n";
      }
   }
}
'_EOF_'
    # << happy emacs
    chmod +x ./jkStuff/checkSizesInAgps.pl
    cd scaffolds
    perl /cluster/data/danRer4/jkStuff/checkSizesInAgps.pl \
        Zv6.scaffolds.agp Zv6.chunks.agp > Zv6.scafsvschunks
    grep "not consistent" Zv6.scafsvschunks
    # no lines where inconsistency was reported
    wc -l Zv6.scafsvschunks
    # 6653 Zv6.scafsvschunks
    grep "Zv6" Zv6.scaffolds.agp | wc -l
    # 6653
    # so all the scaffolds were checked and were ok.
    cd ..
    rm -r scaffolds

###########################################################################
# SPLIT AGP FILES BY CHROMOSOME (DONE, 2006-04-13, hartera)
# GENOME FASTA FROM SANGER WAS CREATED USING SCAFFOLDS AGP
    ssh kkstore01
    cd /cluster/data/danRer4
    # There are 2 .agp files: one for scaffolds (supercontigs on danRer1) and
    # then one for chunks (contigs on danRer1) showing how they map on to
    # scaffolds.
    # get list of scaffolds from FASTA file and check these are in agp
    grep '>' Zv6_scaffolds.fa | sed -e 's/>//' | sort | uniq > Zv6FaScafs.lst
    # get list of scaffolds from agp - do not print from gap lines
    awk '{if ($7 !~ /contig/) print $6;}' Zv6.scaffolds.agp \
        | sort | uniq > Zv6AgpScafs.lst
    diff Zv6FaScafs.lst Zv6AgpScafs.lst
    # no difference so all scaffolds are in the FASTA file
    # add "chr" prefix for the agp files
    perl -pi -e 's/^([0-9]+)/chr$1/' ./*.agp
    # for chromosomes 1 to 25, create 2 agps for each chrom, one for scaffolds
    # and one for chunks:
    foreach c (`cat chrom1to25.lst`)
      echo "Processing $c ..."
mkdir $c perl -we "while(<>){if (/^chr$c\t/) {print;}}" \ ./Zv6.chunks.agp \ > $c/chr$c.chunks.agp perl -we "while(<>){if (/^chr$c\t/) {print;}}" \ ./Zv6.scaffolds.agp \ > $c/chr$c.scaffolds.agp end ########################################################################### # CREATE AGP FILES FOR chrNA AND chrUn (DONE, 2006-04-13, hartera) # RECREATE AGP FILES WITH chrNA and chrUn RENAMED AS chrNA_random # AND chrUn_random (DONE, 2006-04-21, hartera) # NOTE: IN THIS ASSEMBLY AND IN FUTURE, NAME chrNA AND chrUn AS # chrNA_random AND chrUn_random TO REFLECT THAT THEY ARE UNORDERED # COLLECTIONS OF SCAFFOLDS. ssh kkstore01 # chrNA_random consists of WGS contigs that could not be related to any # FPC contig and the scaffolds and contigs are named Zv5_NAN in the # first field of the agp files where the second N is an number. cd /cluster/data/danRer4 mkdir ./NA_random awk '{if ($1 ~ /Zv6_NA/) print;}' Zv6.chunks.agp \ > ./NA_random/NA_random.chunks.agp awk '{if ($1 ~ /Zv6_NA/) print;}' Zv6.scaffolds.agp \ > ./NA_random/NA_random.scaffolds.agp # change the first field to "chrNA_random" then can use agpToFa to process perl -pi.bak -e 's/Zv6_NA[0-9]+/chrNA_random/' ./NA_random/*.agp wc -l ./NA_random/NA_random.scaffolds.agp # 2898 ./NA_random/NA_random.scaffolds.agp # check files and remove backup files # these are not sorted numerically by scaffold number rm ./NA_random/*.bak # then process chrUn_random - this is made from scaffolds and # contigs where the name is Zv6_scaffoldN in the first field of the # agp files. These scaffolds and contigs are unmapped to chromosomes # in the agp file. chrUn_random is made up of WGS scaffolds that mapped to # FPC contigs, but the chromosome is unknown. mkdir ./Un_random awk '{if ($1 ~ /Zv6_scaffold/) print;}' Zv6.chunks.agp \ > ./Un_random/Un_random.chunks.agp awk '{if ($1 ~ /Zv6_scaffold/) print;}' Zv6.scaffolds.agp \ > ./Un_random/Un_random.scaffolds.agp # change the first field to "chrUn_random" then can use agpToFa to process perl -pi.bak -e 's/Zv6_scaffold[0-9]+/chrUn_random/' ./Un_random/*.agp wc -l ./Un_random/Un_random.scaffolds.agp # 68 ./Un_random/Un_random.scaffolds.agp # check files and remove backup files rm ./Un_random/*.bak # get FASTA file of sequences for NA_random and Un_random and create agp with # Ns between scaffolds # from scaffolds agp, get name of scaffolds to be retrieved from the # FASTA file to make the NA_random and Un_random chromosomes. cd /cluster/data/danRer4 foreach c (NA_random Un_random) awk '{print $6;}' $c/$c.scaffolds.agp > $c/chr$c.scaffolds.lst $HOME/bin/i386/faSomeRecords /cluster/data/danRer4/Zv6_scaffolds.fa \ $c/chr$c.scaffolds.lst $c/chr$c.fa end # check that all scaffolds in the list are in the FASTA file for # NA_random and Un_random. # made a change to scaffoldFaToAgp.c so that the the number of Ns to be # inserted between scaffolds can be specified as an option. # There are less and smaller random scaffolds than before so use 50,000 Ns # between scaffolds as for the human random chromosomes. foreach c (NA_random Un_random) $HOME/bin/i386/scaffoldFaToAgp -scaffoldGapSize=50000 $c/chr$c.fa mv $c/chr$c.fa $c/chr$c.scaffolds.fa end # change chrUn to chrNA_random for NA_random, change chrUn to chrUn_random # forUn_random. Change D to W for NA_random and Un_random.. 
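    # (For reference: in AGP component-type codes "D" is draft HTG and "W"
    # is WGS contig; the agps written by scaffoldFaToAgp above carry "D",
    # so the sed below switches the type to "W" for these whole-genome-
    # shotgun scaffolds.)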
sed -e 's/chrUn/chrNA_random/' ./NA_random/chrNA_random.agp \ | sed -e 's/D/W/' > ./NA_random/chrNA_random.scaffolds.agp # the scaffolds agp for chrNA_random is now sorted numerically by # scaffold number sed -e 's/chrUn/chrUn_random/' ./Un_random/chrUn_random.agp \ | sed -e 's/D/W/' > ./Un_random/chrUn_random.scaffolds.agp # edit ./NA_random/chrNA_random.scaffolds.agp and # ./Un_random/chrUn_random.scaffolds.agp and remove last line as this # just adds an extra 50000 Ns at the # end of the sequence. rm ./NA_random/chrNA_random.agp ./Un_random/chrUn_random.agp cat << '_EOF_' > ./jkStuff/createAgpWithGaps.pl #!/usr/bin/perl use strict; # This script takes a chunks agp and inserts Ns between scaffolds for # the chunks (contigs) agp file. Could also insert Ns between scaffolds # for scaffolds agp. my ($chrom, $numN, $name, $prev, $st, $end, $prevEnd, $id); my $chrom = $ARGV[0]; # chromosome name my $numN = $ARGV[1]; # number of Ns to be inserted my $type = $ARGV[2]; # contigs or scaffolds $prev = ""; $st = 1; $prevEnd = 0; $id = 0; while () { my $l = $_; my @f = split(/\t/, $l); if ($type eq "contigs") { $name = $f[9]; } else { $name = $f[5] } my $currSt = $f[1]; my $currEnd = $f[2]; my $size = $currEnd - $currSt; $id++; $st = $prevEnd + 1; $end = $st + $size; if (($prev ne "") && ($prev ne $name)) { $st = $prevEnd + 1; $end = ($st + $numN) - 1; print "$chrom\t$st\t$end\t$id\tN\t$numN\tcontig\tno\n"; $prevEnd = $end; $id++; } $st = $prevEnd + 1; $end = $st + $size; print "$chrom\t$st\t$end\t$id\t$f[4]\t$f[5]\t$f[6]\t$f[7]\t$f[8]"; if ($type eq "contigs") { print "\t$f[9]\t$f[10]\t$f[11]"; } $prevEnd = $end; $prev = $name; } '_EOF_' chmod +x ./jkStuff/createAgpWithGaps.pl cd /cluster/data/danRer4/NA_random # for NA_random, sort the chunks.agp by contig number perl -pi.bak -e 's/Zv6_NA//' NA_random.chunks.agp sort -k6,6n NA_random.chunks.agp > NA_random.chunks2.agp # then put back Zv6_NA perl -pi.bak -e 's/([0-9]+\.[0-9]+)/Zv6_NA$1/' NA_random.chunks2.agp mv NA_random.chunks2.agp NA_random.chunks.agp # Un_random.chunks.agp is already sorted by scaffold number cd /cluster/data/danRer4 foreach c (NA_random Un_random) cd $c perl /cluster/data/danRer4/jkStuff/createAgpWithGaps.pl \ chr${c} 50000 contigs < ${c}.chunks.agp > chr${c}.chunks.agp cd .. end # check co-ordinates # field 2 is the start position, field 3 is the end and field 8 is the size # so check that this is consistent in scaffolds and chunks agp. # check that the difference between 7th and 8th fields is the same as the # difference between 11th and 12th fields for chunks agp. cd /cluster/data/danRer4 foreach c (NA_random Un_random) awk '{if ($6 ~ /^Zv6/ && (($3-$2+1) != $8)) print $6;}' \ $c/chr${c}.scaffolds.agp > $c/chr${c}.scaffolds.coordCheck awk '{if ($6 ~ /^Zv6/ && (($3-$2+1) != $8)) print $6;}' \ $c/chr${c}.chunks.agp > $c/chr${c}.chunks.coordCheck awk '{if ($5 != "N" && (($8 - $7) != ($12 - $11))) print $6;}' \ $c/chr${c}.chunks.agp > $c/chr${c}.chunks.coordCheck2 end # check the outputs are empty wc -l NA_random/*.coord* wc -l Un_random/*.coord* rm NA_random/*.coord* Un_random/*.coord* # check that the scaffolds and chunks agp files are consistent with # each other. 
cat << '_EOF_' > ./jkStuff/checkSizesInAgps.pl #!/usr/bin/perl -w use strict; my ($ch, $sc, %scafsHash); $sc = $ARGV[0]; # scaffolds agp $ch = $ARGV[1]; # chunks or contigs agp open(SCAFS, $sc) || die "Can not open $sc: $!\n"; open(CHUNKS, $ch) || die "Can not open $ch: $!\n"; while () { my ($l, @f, $name, $e); $l = $_; @f = split(/\t/, $l); if ($f[5] =~ /^Zv/) { $name = $f[5]; $e = $f[2]; $scafsHash{$name} = $e; } } close SCAFS; my $scaf = ""; my $prev = ""; my $prevEnd = 0; while () { my ($line, @fi); $line = $_; @fi = split(/\t/, $line); # if it is not a gap line if ($fi[4] ne "N") { $scaf = $fi[9]; if (($scaf ne $prev) && ($prev ne "")) { checkCoords($prev, $prevEnd); } $prev = $scaf; $prevEnd = $fi[2]; } } # check last entry in file checkCoords($prev, $prevEnd); close CHUNKS; sub checkCoords { my ($name, $end) = @_; if (exists($scafsHash{$prev})) { if ($scafsHash{$prev} != $prevEnd) { my $ed = $scafsHash{$prev}; print "Scaffold $prev is not consistent between agps\n"; } else { my $ed = $scafsHash{$prev}; print "Scaffold $prev - ok\n"; } } } '_EOF_' # << happy emacs chmod +x jkStuff/checkSizesInAgps.pl foreach c (NA_random Un_random) perl /cluster/data/danRer4/jkStuff/checkSizesInAgps.pl \ $c/chr${c}.scaffolds.agp $c/chr${c}.chunks.agp \ > $c/${c}.scafsvschunks end foreach c (NA_random Un_random) grep "not consistent" $c/${c}.scafsvschunks end wc -l NA_random/NA_random.scafsvschunks wc -l Un_random/Un_random.scafsvschunks # no lines were inconsistency was reported rm NA_random/NA_random.scafsvschunks Un_random/Un_random.scafsvschunks # clean up foreach c (NA_random Un_random) rm $c/${c}.scaffolds.agp $c/${c}.chunks.agp $c/chr${c}.scaffolds.fa \ $c/chr${c}.scaffolds.lst $c/*.bak end '_EOF_' ########################################################################### # BUILD CHROM-LEVEL SEQUENCE (DONE, 2006-04-13, hartera) # REPEAT THIS FOR chrNA_random AND chrUn_random (DONE, 2006-04-21, hartera) ssh kkstore01 cd /cluster/data/danRer4 # Ignore warnings about chrM files not existing - this chrom has # already been processed - see mitochondrion section above. # Sequence is already in upper case so no need to change foreach c (`cat chrom.lst`) echo "Processing ${c}" $HOME/bin/i386/agpToFa -simpleMultiMixed $c/chr$c.scaffolds.agp chr$c \ $c/chr$c.fa ./Zv6_scaffolds.fa echo "${c} - DONE" end # move scaffolds agp to be chrom agp and clean up foreach c (`cat chrom.lst`) cd $c cp chr${c}.scaffolds.agp chr${c}.agp mkdir -p agps mv chr${c}.*.agp ./agps/ cd .. end # Repeat just for chrNA_random and chrUn_random (2006-04-21, hartera) foreach c (NA_random Un_random) echo "Processing ${c}" $HOME/bin/i386/agpToFa -simpleMultiMixed $c/chr$c.scaffolds.agp chr$c \ $c/chr$c.fa ./Zv6_scaffolds.fa echo "${c} - DONE" end # move scaffolds agp to be chrom agp and clean up foreach c (NA_random Un_random) cd $c cp chr${c}.scaffolds.agp chr${c}.agp mkdir -p agps mv chr${c}.*.agp ./agps/ cd .. end ########################################################################## # CHECK CHROM AND VIRTUAL CHROM SEQUENCES (DONE, 2006-04-14, hartera) # RE-CHECK THESE AFTER CREATING chrNA_random AND chrUn_random SEQUENCE FILES # (DONE, 2006-04-20, hartera) # Check that the size of each chromosome .fa file is equal to the last # co-ordinate of the corresponding agp file. 
ssh hgwdev cd /cluster/data/danRer4 foreach c (`cat chrom.lst`) foreach f ( $c/chr$c.agp ) set agpLen = `tail -1 $f | awk '{print $3;}'` set h = $f:r set g = $h:r echo "Getting size of $g.fa" set faLen = `faSize $g.fa | awk '{print $1;}'` if ($agpLen == $faLen) then echo " OK: $f length = $g length = $faLen" else echo "ERROR: $f length = $agpLen, but $g length = $faLen" endif end end # all are the OK so FASTA files are the expected size ########################################################################### # CREATING DATABASE (DONE, 2006-04-14, hartera) # Create the database. # next machine ssh hgwdev echo 'create database danRer4' | hgsql '' # if you need to delete that database: !!! WILL DELETE EVERYTHING !!! echo 'drop database danRer4' | hgsql danRer4 # Use df to make sure there is at least 10 gig free on df -h /var/lib/mysql # Before loading data: # Filesystem Size Used Avail Use% Mounted on # /dev/sdc1 1.8T 1.5T 173G 90% /var/lib/mysql ########################################################################### # CREATING GRP TABLE FOR TRACK GROUPING (DONE, 2006-04-14, hartera) # next machine ssh hgwdev # the following command copies all the data from the table # grp in the database mm8 to the new database danRer4. Use one of the # newest databases to copy from to make sure that the groupings are # up to date. echo "create table grp (PRIMARY KEY(NAME)) select * from mm8.grp" \ | hgsql danRer4 # if you need to delete that table: !!! WILL DELETE ALL grp data !!! echo 'drop table grp;' | hgsql danRer4 ########################################################################### # MAKE HGCENTRALTEST ENTRY FOR DANRER4 (DONE, 2006-04-14, hartera) # CHANGE DATE FORMAT ON HGCENTRALTEST ENTRY (DONE, 2006-04-21, hartera) # Make entry into dbDb and defaultDb so test browser knows about it. ssh hgwdev # Add dbDb and defaultDb entries: echo 'insert into dbDb (name, description, nibPath, organism, \ defaultPos, active, orderKey, genome, scientificName, \ htmlPath, hgNearOk, hgPbOk, sourceName) \ values("danRer4", "March 2006", \ "/gbdb/danRer4", "Zebrafish", "chr2:15,906,734-15,926,406", 1, \ 37, "Zebrafish", "Danio rerio", \ "/gbdb/danRer4/html/description.html", 0, 0, \ "Sanger Centre, Danio rerio Sequencing Project Zv6");' \ | hgsql -h genome-testdb hgcentraltest # reformat the date (2006-04-21, hartera) echo 'update dbDb set description = "Mar. 2006" where name = "danRer4";' \ | hgsql -h genome-testdb hgcentraltest # Create /gbdb directory for danRer4 mkdir /gbdb/danRer4 # SET AS DEFAULT LATER WHEN READY FOR RELEASE # set danRer4 to be the default assembly for Zebrafish echo 'update defaultDb set name = "danRer4" \ where genome = "Zebrafish";' \ | hgsql -h genome-testdb hgcentraltest ########################################################################### # BREAK UP SEQUENCE INTO 5MB CHUNKS AT CONTIGS/GAPS FOR CLUSTER RUNS # (DONE, 2006-04-14, hartera) # RE-DONE JUST FOR chrNA_random AND chrUn_random (DONE, 2006-04-20, hartera) ssh kkstore01 cd /cluster/data/danRer4 foreach c (`cat chrom.lst`) foreach agp ($c/chr$c.agp) if (-e $agp) then set fa = $c/chr$c.fa echo splitting $agp and $fa cp -p $agp $agp.bak cp -p $fa $fa.bak splitFaIntoContigs $agp $fa . -nSize=5000000 endif end end # Repeat just for chrNA_random and chrUn_random (2006-04-21, hartera) ssh kkstore01 cd /cluster/data/danRer4 foreach c (NA_random Un_random) foreach agp ($c/chr$c.agp) if (-e $agp) then set fa = $c/chr$c.fa echo splitting $agp and $fa cp -p $agp $agp.bak cp -p $fa $fa.bak splitFaIntoContigs $agp $fa . 
-nSize=5000000 endif end end ########################################################################### # MAKE LIFTALL.LFT (DONE, 2006-04-14, hartera) # REMAKE LIFTALL.LFT WITH chrNA_random AND chrUn_random # (DONE, 2006-04-21, hartera) ssh kkstore01 cd /cluster/data/danRer4 rm jkStuff/liftAll.lft foreach c (`cat chrom.lst`) cat $c/lift/ordered.lft >> jkStuff/liftAll.lft end ########################################################################### # MAKE TRACKDB ENTRY FOR DANRER4 (DONE, 2006-04-14, hartera) # Should add this later when adding gold/gap tracks. Angie created a # temporary chromInfo table otherwise make update/alpha causes an error # (2006-04-17) # Make trackDb table so browser knows what tracks to expect. ssh hgwdev mkdir -p ~/kent/src/hg/makeDb/trackDb/zebrafish/danRer4 cd ~/kent/src/hg/makeDb/trackDb/zebrafish cvs add danRer4 cvs commit danRer4 cd ~/kent/src/hg/makeDb/trackDb cvs up -d -P # Edit that makefile to add danRer4 in all the right places and do make update DBS=danRer4 make alpha DBS=danRer4 cvs commit -m "Added danRer4." makefile ########################################################################### # MAKE DESCRIPTION/SAMPLE POSITION HTML PAGE (DONE, 2006-04-14, hartera) ssh hgwdev mkdir /cluster/data/danRer4/html # make a symbolic link from /gbdb/danRer4/html to /cluster/data/danRer4/html ln -s /cluster/data/danRer4/html /gbdb/danRer4/html # Add a description page for zebrafish cd /cluster/data/danRer4/html cp $HOME/kent/src/hg/makeDb/trackDb/zebrafish/danRer3/description.html . # Edit this for zebrafish danRer4 # create a description.html page here cd ~/kent/src/hg/makeDb/trackDb/zebrafish/danRer4 # Add description page here too cp /cluster/data/danRer4/html/description.html . cvs add description.html cvs commit -m "First draft of description page for danRer4." \ description.html cd ~/kent/src/hg/makeDb/trackDb make update DBS=danRer4 make alpha DBS=danRer4 ########################################################################### # SIMPLE REPEAT [TRF] TRACK (DONE, 2006-04-14, hartera) # RE-RUN FOR chrNA AND chrUn RENAMED AS chrNA_random AND chrUn_random # AND RELOAD THE TABLE (DONE, 2006-04-21, hartera) # MADE A NOTE IN THE HISTORY TABLE TO EXPLAIN WHY THE simpleRepeats TABLE # WAS RELOADED (DONE, 2006-04-22, hartera) # TRF can be run in parallel with RepeatMasker on the file server # since it doesn't require masked input sequence. # Run this on the kilokluster. Need to mask contig and chromosome # sequences so run trf using contig sequences. # First copy over contig sequences to iscratch and then rsync to cluster. ssh kkr1u00 rm -r /iscratch/i/danRer4/contigsNoMask mkdir -p /iscratch/i/danRer4/contigsNoMask cd /cluster/data/danRer4 foreach d (/cluster/data/danRer4/*/chr*_?{,?}) set ctg = $d:t foreach f ($d/${ctg}.fa) echo "Copyig $f ..." cp $f /iscratch/i/danRer4/contigsNoMask/ end end ls /iscratch/i/danRer4/contigsNoMask/*.fa | wc -l # 317 sequence files # rsync to cluster machines foreach R (2 3 4 5 6 7 8) rsync -a --progress /iscratch/i/danRer4/ kkr${R}u00:/iscratch/i/danRer4/ end ssh kki mkdir -p /cluster/data/danRer4/bed/simpleRepeat cd /cluster/data/danRer4/bed/simpleRepeat mkdir trf cat << '_EOF_' > runTrf #!/bin/csh -fe # set path1 = $1 set inputFN = $1:t set outpath = $2 set outputFN = $2:t mkdir -p /tmp/$outputFN cp $path1 /tmp/$outputFN pushd . 
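# Work on a node-local /tmp copy of the input, run Tandem Repeats Finder via
# trfBig (which splits large sequences into pieces small enough for trf and
# writes bed output via -bedAt), then copy the bed back and clean up /tmp.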
cd /tmp/$outputFN /cluster/bin/i386/trfBig -trf=/cluster/bin/i386/trf $inputFN /dev/null -bedAt=$outputFN -tempDir=/tmp popd rm -f $outpath cp -p /tmp/$outputFN/$outputFN $outpath rm -fr /tmp/$outputFN/* rmdir --ignore-fail-on-non-empty /tmp/$outputFN '_EOF_' # << keep emacs coloring happy chmod +x runTrf cat << '_EOF_' > gsub #LOOP ./runTrf {check in line+ $(path1)} {check out line trf/$(root1).bed} #ENDLOOP '_EOF_' # << keep emacs coloring happy ls -1S /iscratch/i/danRer4/contigsNoMask/chr*.fa > genome.lst gensub2 genome.lst single gsub jobList # 317 jobs para create jobList para try, check, push, check etc... para time # Completed: 317 of 317 jobs # CPU time in finished jobs: 25083s 418.05m 6.97h 0.29d 0.001 y # IO & Wait Time: 933s 15.55m 0.26h 0.01d 0.000 y # Average job time: 82s 1.37m 0.02h 0.00d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 2732s 45.53m 0.76h 0.03d # Submission to last job: 4604s 76.73m 1.28h 0.05d # Re-do only for chrNA_random and chrUn_random (2006-04-21, hartera) ssh kki cd /cluster/data/danRer4/bed/simpleRepeat rm trf/chrNA*.bed rm trf/chrUn*.bed rm simpleRepeat.bed mkdir -p randomsRun/trf cd randomsRun cp ../runTrf . cp ../gsub . ls -1S /iscratch/i/danRer4/contigsNoMask/chr*_random*.fa > genome.lst gensub2 genome.lst single gsub jobList para create jobList # 46 jobs para try, check, push, check etc... para time # Completed: 46 of 46 jobs # CPU time in finished jobs: 1904s 31.73m 0.53h 0.02d 0.000 y # IO & Wait Time: 103s 1.72m 0.03h 0.00d 0.000 y # Average job time: 44s 0.73m 0.01h 0.00d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 241s 4.02m 0.07h 0.00d # Submission to last job: 269s 4.48m 0.07h 0.00d cp ./trf/*.bed /cluster/data/danRer4/bed/simpleRepeat/trf/ # lift up to chrom level cd /cluster/data/danRer4/bed/simpleRepeat rm simpleRepeat.bed liftUp simpleRepeat.bed /cluster/data/danRer4/jkStuff/liftAll.lft warn \ trf/*.bed # Reload into the database ssh hgwdev cd /cluster/data/danRer4/bed/simpleRepeat hgsql -e 'drop table simpleRepeat;' danRer4 hgLoadBed danRer4 simpleRepeat simpleRepeat.bed \ -sqlTable=$HOME/kent/src/hg/lib/simpleRepeat.sql # Loaded 759659 elements of size 16 # Make a note in the history table to explain why the simpleRepeats # table was reloaded (2006-04-22, hartera) hgsql -e 'update history set errata = \ "Dropped table for reloading after changing names of random chroms." 
        where ix = 2;' danRer4

###########################################################################
# CREATE MICROSAT TRACK (done 2006-7-5 JK)
    ssh hgwdev
    cd /cluster/data/danRer4/bed
    mkdir microsat
    cd microsat
    awk '($5==2 || $5==3) && $6 >= 15 && $8 == 100 && $9 == 0 {printf("%s\t%s\t%s\t%dx%s\n", $1, $2, $3, $6, $16);}' \
        ../simpleRepeat/simpleRepeat.bed > microsat.bed
    /cluster/bin/i386/hgLoadBed danRer4 microsat microsat.bed

###########################################################################
# PROCESS SIMPLE REPEATS INTO MASK (DONE, 2005-06-14, hartera)
# RE-DO AFTER RENAMING RANDOM CHROMS AS chrNA_random AND chrUn_random
# (DONE, 2006-04-21, hartera)
    # After the simpleRepeats track has been built, make a filtered version
    # of the trf output: keep trf's with period <= 12:
    ssh kkstore01
    cd /cluster/data/danRer4/bed/simpleRepeat
    rm -r trfMask
    mkdir -p trfMask
    foreach f (trf/chr*.bed)
      awk '{if ($5 <= 12) print;}' $f > trfMask/$f:t
    end
    # Lift up filtered trf output to chrom coords as well:
    cd /cluster/data/danRer4
    rm -r ./bed/simpleRepeat/trfMaskChrom
    mkdir bed/simpleRepeat/trfMaskChrom
    foreach c (`cat chrom.lst`)
      if (-e $c/lift/ordered.lst) then
        perl -wpe 's@(\S+)@bed/simpleRepeat/trfMask/$1.bed@' \
          $c/lift/ordered.lst > $c/lift/oTrf.lst
        liftUp bed/simpleRepeat/trfMaskChrom/chr$c.bed \
          jkStuff/liftAll.lft warn `cat $c/lift/oTrf.lst`
      endif
      if (-e $c/lift/random.lst) then
        perl -wpe 's@(\S+)@bed/simpleRepeat/trfMask/$1.bed@' \
          $c/lift/random.lst > $c/lift/rTrf.lst
        liftUp bed/simpleRepeat/trfMaskChrom/chr${c}_random.bed \
          jkStuff/liftAll.lft warn `cat $c/lift/rTrf.lst`
      endif
    end

###########################################################################
# GET ADDITIONAL ZEBRAFISH REPBASE LIBRARY FOR REPEATMASKER AND ADD TO
# DANIO LIBRARY FOR REPEATMASKER (DONE, 2006-04-14, hartera)
    # Go to http://www.girinst.org/server/RepBase/RepBase11.02.fasta
    # (03-15-2006) and download zebunc.ref.txt containing unclassified zebrafish
    # repeats.
    # Need username and password. Copy to /cluster/bluearc/RepeatMasker/Libraries/
    ssh hgwdev
    cd /cluster/bluearc/RepeatMasker/Libraries
    # This is /cluster/bluearc/RepeatMasker060320/Libraries
    # Do a dummy run of RepeatMasker with the -species option. This creates
    # a zebrafish-specific library from the EMBL format RepBase library.
    # Then the zebunc.ref unclassified repeats can be added to this library.
    /cluster/bluearc/RepeatMasker/RepeatMasker -spec danio /dev/null
    # RepeatMasker version development-$Id: RepeatMasker,v 1.13 2006/03/21
    # This creates a specieslib in Libraries/20060315/danio
    # Format the zebunc.ref library:
    # Sequence is upper case, change to lower case like the specieslib
    cat zebunc.ref.txt | tr '[A-Z]' '[a-z]' > zebunc.ref.format
    perl -pi.bak -e 's/>dr([0-9]+)/>Dr$1#Unknown/' zebunc.ref.format
    grep '>' zebunc.ref.format | wc -l
    # 958
    cd /cluster/bluearc/RepeatMasker/Libraries/20060315/danio
    grep '>' specieslib | wc -l
    # 219
    mv specieslib danio.lib
    cat danio.lib ../../zebunc.ref.format > specieslib
    grep '>' specieslib | wc -l
    # 1177
    rm danio.lib
    # make a copy in Libraries directory in case this directory of libraries
    # is removed.
cp specieslib /cluster/bluearc/RepeatMasker/Libraries/danio.lib ########################################################################### # SPLIT SEQUENCE FOR REPEATMASKER RUN (DONE, 2006-04-14, hartera) # SPLIT SEQUENCE AGAIN JUST FOR chrNA_random AND chrUn_random AFTER RENAMING # THESE RANDOM CHROMS (DONE, 2006-04-21, hartera) ssh kkstore01 cd /cluster/data/danRer4 # break up into 500 kb sized chunks at gaps if possible # for RepeatMasker runs foreach c (`cat chrom.lst`) foreach d ($c/chr${c}*_?{,?}) cd $d echo "splitting $d" set contig = $d:t faSplit gap $contig.fa 500000 ${contig}_ -lift=$contig.lft \ -minGapSize=100 cd ../.. end end # took about 3 minutes. # split just for chrNA_random and chrUn_random (2006-04-21, hartera) cd /cluster/data/danRer4 foreach c (NA_random Un_random) foreach d ($c/chr${c}*_?{,?}) cd $d echo "splitting $d" set contig = $d:t faSplit gap $contig.fa 500000 ${contig}_ -lift=$contig.lft \ -minGapSize=100 cd ../.. end end ########################################################################### # REPEATMASKER RUN (DONE, 2006-04-21, hartera) # Originally run 2006-04-14. There was one sequence chr16_4_10.fa that # failed with a division by zero error. Sent this as a test case with the # danio library to Robert Hubley who fixed the bug and sent a new # version of ProcessRepeats. Checked this into CVS for # /cluster/bluearc/RepeatMasker on 2006-04-19. # When a new library is added for this version of RepeatMasker, need to # check in /cluster/bluearc/RepeatMasker/Libraries for a directory made # up of a date e.g. 20060315 here and inside this are species directories # for which RepeatMasker has already been run. In this directory it creates # a specieslib of the danio repeats. If this exists, this is used for the # RepeatMasker run for that species. Check that this contains the # unclassified Zebrafish repeats with IDs beginning with Dr. This library # with these repeats should have been created in the section above: # Use sequence split into 500 kb chunks. ssh kkstore01 cd /cluster/data/danRer4 mkdir RMRun # Record RM version used: ls -l /cluster/bluearc/RepeatMasker # lrwxrwxrwx 1 angie protein 18 Mar 20 16:50 /cluster/bluearc/RepeatMasker -> RepeatMasker060320 # March 20 2006 (open-3-1-5) version of RepeatMasker # get RM database version grep RELEASE /cluster/bluearc/RepeatMasker/Libraries/RepeatMaskerLib.embl \ > RMdatabase.version # RELEASE 20060315 cd /cluster/data/danRer4 cat << '_EOF_' > jkStuff/RMZebrafish #!/bin/csh -fe cd $1 pushd . 
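# $1 is the pseudo-contig directory and $2 the 500kb fa chunk to mask.
# Stage the chunk in a node-local /tmp directory so RepeatMasker I/O stays
# off the fileserver, then copy the .out (and .align/.tbl/.cat when present)
# back beside the input and remove the temporary directory.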
/bin/mkdir -p /tmp/danRer4/$2 /bin/cp $2 /tmp/danRer4/$2/ cd /tmp/danRer4/$2 /cluster/bluearc/RepeatMasker060320/RepeatMasker -ali -s -species danio $2 popd /bin/cp /tmp/danRer4/$2/$2.out ./ if (-e /tmp/danRer4/$2/$2.align) /bin/cp /tmp/danRer4/$2/$2.align ./ if (-e /tmp/danRer4/$2/$2.tbl) /bin/cp /tmp/danRer4/$2/$2.tbl ./ if (-e /tmp/danRer4/$2/$2.cat) /bin/cp /tmp/danRer4/$2/$2.cat ./ /bin/rm -fr /tmp/danRer4/$2/* /bin/rmdir --ignore-fail-on-non-empty /tmp/danRer4/$2 /bin/rmdir --ignore-fail-on-non-empty /tmp/danRer4 '_EOF_' # << emacs chmod +x jkStuff/RMZebrafish # move old files out the way and re-run on 2006-04-19 cd /cluster/data/danRer4 mkdir RMOutOld foreach d (*/chr*_?{,?}) set contig = $d:t echo $contig foreach c ($d/$contig*.fa.*) set t=$c:t mv $c /cluster/data/danRer4/RMOutOld/$t.bak end end cp /dev/null RMRun/RMJobs foreach c (`cat chrom.lst`) foreach d ($c/chr${c}_?{,?}) set ctg = $d:t foreach f ( $d/${ctg}_?{,?}.fa ) set f = $f:t echo /cluster/data/danRer4/jkStuff/RMZebrafish \ /cluster/data/danRer4/$d $f \ '{'check out line+ /cluster/data/danRer4/$d/$f.out'}' \ >> RMRun/RMJobs end end end # Do the run again with new version of ProcessRepeats used # for RepeatMasker. ssh pk cd /cluster/data/danRer4/RMRun para create RMJobs # 4382 jobs written to batch para try, check, push, check ... etc. para time # Completed: 4382 of 4382 jobs # CPU time in finished jobs: 11745656s 195760.94m 3262.68h 135.95d 0.372 y # IO & Wait Time: 18953s 315.88m 5.26h 0.22d 0.001 y # Average job time: 2685s 44.75m 0.75h 0.03d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 3878s 64.63m 1.08h 0.04d # Submission to last job: 41887s 698.12m 11.64h 0.48d #- Lift up the 500KB chunk .out's to 5MB ("pseudo-contig") level ssh kkstore01 cd /cluster/data/danRer4 foreach d (*/chr*_?{,?}) set contig = $d:t echo $contig liftUp $d/$contig.fa.out $d/$contig.lft warn $d/${contig}_*.fa.out \ > /dev/null end #- Lift pseudo-contigs to chromosome level foreach c (`cat chrom.lst`) echo lifting $c cd $c if (-e lift/ordered.lft && ! -z lift/ordered.lft) then liftUp chr$c.fa.out lift/ordered.lft warn `cat lift/oOut.lst` \ > /dev/null endif cd .. end # Re-run for just chrNA_random and chrUn_random (start on 2006-04-21) ssh kkstore01 mkdir /cluster/data/danRer4/RMRun/randomsRun cd /cluster/data/danRer4 cp /dev/null RMRun/randomsRun/RMJobs foreach c (NA_random Un_random) foreach d ($c/chr${c}_?{,?}) set ctg = $d:t foreach f ( $d/${ctg}_?{,?}.fa ) set f = $f:t echo /cluster/data/danRer4/jkStuff/RMZebrafish \ /cluster/data/danRer4/$d $f \ '{'check out line+ /cluster/data/danRer4/$d/$f.out'}' \ >> RMRun/randomsRun/RMJobs end end end # Do the run again for chrNA_random and chrUn_random. ssh pk cd /cluster/data/danRer4/RMRun/randomsRun para create RMJobs # 468 jobs written to batch para try, check, push, check ... etc. 
para time # Completed: 468 of 468 jobs # CPU time in finished jobs: 551863s 9197.71m 153.30h 6.39d 0.017 y # IO & Wait Time: 2217s 36.96m 0.62h 0.03d 0.000 y # Average job time: 1184s 19.73m 0.33h 0.01d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 3836s 63.93m 1.07h 0.04d # Submission to last job: 9086s 151.43m 2.52h 0.11d #- Lift up the 500KB chunk .out's to 5MB ("pseudo-contig") level ssh kkstore01 cd /cluster/data/danRer4 foreach c (NA_random Un_random) foreach d (${c}/chr*_?{,?}) set contig = $d:t echo $contig liftUp $d/$contig.fa.out $d/$contig.lft warn $d/${contig}_*.fa.out \ > /dev/null end end #- Lift pseudo-contigs to chromosome level foreach c (NA_random Un_random) echo lifting $c cd $c if (-e lift/ordered.lft && ! -z lift/ordered.lft) then liftUp chr$c.fa.out lift/ordered.lft warn `cat lift/oOut.lst` \ > /dev/null endif cd .. end # Load tables #- Load the .out files into the database with: ssh hgwdev cd /cluster/data/danRer4 hgLoadOut danRer4 */chr*.fa.out -verbose=2 >& load.log # bad rep range [5031, 4990] line 51895 of 14/chr14.fa.out # bad rep range [4559, 4558] line 59431 of 16/chr16.fa.out # bad rep range [1202, 1201] line 131633 of 16/chr16.fa.out # bad rep range [280, 252] line 93608 of 17/chr17.fa.out # bad rep range [429, 272] line 43230 of 22/chr22.fa.out # bad rep range [262, 261] line 167346 of 3/chr3.fa.out # bad rep range [889, 888] line 28495 of 5/chr5.fa.out # bad rep range [349, 348] line 113404 of 5/chr5.fa.out # bad rep range [1133, 1132] line 200654 of 5/chr5.fa.out # bad rep range [965, 920] line 3567 of 8/chr8.fa.out # bad rep range [292, 291] line 6354 of NA_random/chrNA_random.fa.out # note: 11 records dropped due to repStart > repEnd # Not too many errors so just ignore, but send examples to Arian Smit # and Robert Hubley. # check coverage of repeats masked featureBits -chrom=chr1 danRer3 rmsk # 25822888 bases of 55500710 (46.527%) in intersection featureBits -chrom=chr1 danRer4 rmsk # 32880041 bases of 70589895 (46.579%) in intersection ########################################################################### # MASK SEQUENCE WITH REPEATMASKER AND SIMPLE REPEAT/TRF AND BUILD NIB FILES # (DONE, 2006-04-22, hartera) # MASK PSEUDO-CONTIGS AS NOT DONE BEFORE (DONE, 2006-05-27, hartera) ssh kkstore01 cd /cluster/data/danRer4 # Soft-mask (lower-case) the contig and chr .fa's, # then make hard-masked versions from the soft-masked. 
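    # maskOutFa -soft lower-cases bases covered by the RepeatMasker .out;
    # -softAdd adds the filtered TRF bed intervals to that soft masking;
    # passing the literal word "hard" as the mask file then converts all
    # lower-case bases to N for the .masked copies.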
set trfCtg=bed/simpleRepeat/trfMask set trfChr=bed/simpleRepeat/trfMaskChrom # for the chromosomes: foreach f (*/chr*.fa) echo "repeat- and trf-masking $f" maskOutFa -soft $f $f.out $f set chr = $f:t:r maskOutFa -softAdd $f $trfChr/$chr.bed $f echo "hard-masking $f" maskOutFa $f hard $f.masked end # check percent sequence masked faSize /cluster/data/danRer4/1/chr1.fa # 70589895 bases (904883 N's 69685012 real 36751306 upper # 32933706 lower) in 1 sequences in 1 files faSize /cluster/data/danRer3/1/chr1.fa # 55805710 bases (1047706 N's 54758004 real 28887275 upper # 25870729 lower) in 1 sequences in 1 files # 47% of danRer4 chr1.fa is in lower case so masked # Build nib files, using the soft masking in the fa mkdir nib foreach f (*/chr*.fa) faToNib -softMask $f nib/$f:t:r.nib end ls ./nib/* | wc # 28 # for the contigs (2006-05-27, hartera) ssh kkstore04 cd /cluster/data/danRer4 set trfCtg=bed/simpleRepeat/trfMask set trfChr=bed/simpleRepeat/trfMaskChrom foreach c (`cat chrom.lst`) echo "repeat- and trf-masking contigs of chr$c" foreach d ($c/chr*_?{,?}) set ctg=$d:t set f=$d/$ctg.fa maskOutFa -soft $f $f.out $f maskOutFa -softAdd $f $trfCtg/$ctg.bed $f maskOutFa $f hard $f.masked end end ########################################################################### # STORING O+O SEQUENCE AND ASSEMBLY INFORMATION AND CREATE 2BIT FILE # (DONE, 2006-04-23, hartera) # CHANGE FILENAME TO 2BIT FILE IN CHROMINFO AND REMOVE NIB DIR IN /gbdb # (DONE, 2006-05-24, hartera) # Make symbolic links from /gbdb/danRer4/nib to the real nibs ssh hgwdev cd /cluster/data/danRer4 mkdir -p /gbdb/danRer4/nib foreach f (/cluster/data/danRer4/nib/chr*.nib) ln -s $f /gbdb/danRer4/nib end # Load /gbdb/danRer4/nib paths into database and save size info # hgNibSeq creates chromInfo table hgNibSeq -preMadeNib danRer4 /gbdb/danRer4/nib */chr*.fa echo "select chrom,size from chromInfo" | hgsql -N danRer4 > chrom.sizes # take a look at chrom.sizes, should be 28 lines wc chrom.sizes # 28 56 422 chrom.sizes # Make one big 2bit file as well, and make a link to it in # /gbdb/danRer4 because hgBlat looks there: faToTwoBit */chr*.fa danRer4.2bit # check the 2bit file twoBitInfo danRer4.2bit 2bit.tab diff 2bit.tab chrom.sizes # should be the same and they are so ok. rm 2bit.tab # add link to this 2bit file from gbdb danRer4 directory ln -s /cluster/data/danRer4/danRer4.2bit /gbdb/danRer4/ # (hartera, 2006-05-24) # change chromInfo table to have 2bit file for filename hgsql -e 'update chromInfo set fileName = "/gbdb/danRer4/danRer4.2bit";' \ danRer4 # then remove nib directory in /gbdb/danRer4 as do not need both nibs # and 2 bit file which is in /gbdb/danRer4. rm -r /gbdb/danRer4/nib ########################################################################### # MAKE GOLD AND GAP TRACKS (DONE, 2006-04-23, hartera) ssh hgwdev cd /cluster/data/danRer4 # the gold and gap tracks are created from the chrN.agp file and this is # the scaffolds or supercontigs agp hgGoldGapGl -noGl -chromLst=chrom.lst danRer4 /cluster/data/danRer4 . 
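    # As a cross-check (example only, not from the original log), the gap
    # total implied by the chrom-level agp files can be summed and compared
    # with the featureBits gap numbers below:
    awk '$5 == "N" {total += $3 - $2 + 1} END {print total}' \
        /cluster/data/danRer4/*/chr*.agp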
# featureBits danRer4 gold # 1626093931 bases of 1626093931 (100.000%) in intersection # featureBits danRer3 gold # 1630323462 bases of 1630323462 (100.000%) in intersection # featureBits danRer4 gap # 148566200 bases of 1626093931 (9.136%) in intersection # featureBits danRer3 gap # 13709500 bases of 1630323462 (0.841%) in intersection # there are larger gaps now in chrNA and chrUn so compare just chr1 # featureBits -chrom=chr1 danRer4 gap # 16000 bases of 70573895 (0.023%) in intersection # featureBits -chrom=chr1 danRer3 gap # 305000 bases of 55500710 (0.550%) in intersection # without random or chrUn chroms: # featureBits -noRandom danRer4 gap # 366200 bases of 1546950119 (0.024%) in intersection # featureBits -noRandom danRer3 gap # 6240000 bases of 1200146216 (0.520%) in intersection # Add trackDb.ra entries for gold and gap tracks and also create # gap.html and gold.html pages. ########################################################################### # PUT MASKED SEQUENCE OUT ON iSERVERS AND THE SAN FOR CLUSTER RUNS # (DONE, 2006-04-23, hartera) # TRFFA SEQUENCED WAS NOT MASKED SO ADD MASKED SEQUENCE TO iSERVERS AND # THE SAN FOR CLUSTER RUNS (DONE, 2006-05-30, hartera) ssh kkr1u00 # Chrom-level mixed nibs that have been repeat- and trf-masked: rm -rf /iscratch/i/danRer4/nib mkdir -p /iscratch/i/danRer4/nib cp -p /cluster/data/danRer4/nib/chr*.nib /iscratch/i/danRer4/nib # Pseudo-contig fa that have been repeat- and trf-masked: # Add these pseudo-contigs that have been repeat- and trf-masked # and rsync again. (2006-05-30, hartera) rm -rf /iscratch/i/danRer4/trfFa mkdir /iscratch/i/danRer4/trfFa foreach d (/cluster/data/danRer4/*/chr*_?{,?}) cp -p $d/$d:t.fa /iscratch/i/danRer4/trfFa end rm -rf /iscratch/i/danRer4/rmsk mkdir -p /iscratch/i/danRer4/rmsk cp -p /cluster/data/danRer4/*/chr*.fa.out /iscratch/i/danRer4/rmsk cp -p /cluster/data/danRer4/danRer4.2bit /iscratch/i/danRer4/ # rsync files - faster than using iSync # rsync again - still can not rsync to kkr2u00 (hartera, 2006-05-30) foreach R (2 3 4 5 6 7 8) echo "rsync for kkr${R}u00 ..." rsync -a --progress /iscratch/i/danRer4/ kkr${R}u00:/iscratch/i/danRer4/ end # error rsyncing to kkr2u00: # connect to host kkr2u00 port 22: No route to host # then add the same sequence files to the san ssh kkstore01 # Chrom-level mixed nibs that have been repeat- and trf-masked: mkdir -p /san/sanvol1/scratch/danRer4/nib rm -rf /san/sanvol1/scratch/danRer4/nib cp -p /cluster/data/danRer4/nib/chr*.nib /san/sanvol1/scratch/danRer4/nib cp /cluster/data/danRer4/danRer4.2bit /san/sanvol1/scratch/danRer4 # Pseudo-contig fa that have been repeat- and trf-masked: # Add these pseudo-contigs again (2006-05-30, hartera) ssh kkstore04 rm -rf /san/sanvol1/scratch/danRer4/trfFa mkdir /san/sanvol1/scratch/danRer4/trfFa foreach d (/cluster/data/danRer4/*/chr*_?{,?}) cp -p $d/$d:t.fa /san/sanvol1/scratch/danRer4/trfFa end ########################################################################### # ADD CONTIGS TRACK (DONE, 2006-04-23, hartera) # make ctgPos2 (contig name, size, chrom, chromStart, chromEnd) from # chunks (contigs) agp files. 
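    # (agp coordinates are 1-based and fully closed while browser tables are
    # 0-based half-open, hence the "$2-1" for chromStart in the awk below.)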
ssh kkstore01 mkdir -p /cluster/data/danRer4/bed/ctgPos2 cd /cluster/data/danRer4/bed/ctgPos2 # ctgPos2 .sql .as .c and .h files exist - see makeDanRer1.doc foreach c (`cat /cluster/data/danRer4/chrom.lst`) awk 'BEGIN {OFS="\t"} \ {if ($5 != "N") print $6, $3-$2+1, $1, $2-1, $3, $5}' \ /cluster/data/danRer4/$c/agps/chr${c}.chunks.agp >> ctgPos2.tab end # load the ctgPos2 table ssh hgwdev cd /cluster/data/danRer4/bed/ctgPos2 # use hgLoadSqlTab as it gives more error messages than using # "load data local infile ...". /cluster/bin/i386/hgLoadSqlTab danRer4 ctgPos2 \ ~/kent/src/hg/lib/ctgPos2.sql ctgPos2.tab # create trackDb.ra entry and html page for ctgPos2 track. # add search for the track and make sure the termRegex will handle # contigs named "Zv6_scaffoldN.N" where N is an integer and all the # contig accessions in the *.chunks.agp files. ########################################################################### # CREATE gc5Base WIGGLE TRACK (DONE, 2006-04-23, hartera) ssh kkstore01 mkdir -p /cluster/data/danRer4/bed/gc5Base cd /cluster/data/danRer4/bed/gc5Base nice hgGcPercent -wigOut -doGaps -file=stdout -win=5 danRer4 \ /cluster/data/danRer4 | wigEncode stdin gc5Base.wig gc5Base.wib # Calculating gcPercent with window size 5 # Using twoBit: /cluster/data/danRer4/danRer4.2bit # File stdout created # Converted stdin, upper limit 100.00, lower limit 0.00 # runs for about 7 minutes # load database with the .wig file and add .wib file to /gbdb/danRer4 ssh hgwdev cd /cluster/data/danRer4/bed/gc5Base mkdir /gbdb/danRer4/wib ln -s `pwd`/gc5Base.wib /gbdb/danRer4/wib time hgLoadWiggle -pathPrefix=/gbdb/danRer4/wib danRer4 gc5Base gc5Base.wig # 17 second load time # verify index is correct: hgsql danRer4 -e "show index from gc5Base;" # should see good numbers in Cardinality column ########################################################################### # MAKE 10.OOC, 11.OOC FILES FOR BLAT (DONE, 2005-04-24, hartera) # Use -repMatch=512 (based on size -- for human we use 1024, and # the zebrafish genome is ~50% of the size of the human genome ssh kkr1u00 mkdir /cluster/data/danRer4/bed/ooc cd /cluster/data/danRer4/bed/ooc mkdir -p /san/sanvol1/scratch/danRer4 ls -1 /cluster/data/danRer4/nib/chr*.nib > nib.lst blat nib.lst /dev/null /dev/null -tileSize=11 \ -makeOoc=/san/sanvol1/scratch/danRer4/danRer4_11.ooc -repMatch=512 # Wrote 50424 overused 11-mers to /cluster/bluearc/danRer4/11.ooc # For 10.ooc, repMatch = 4096 for human, so use 2048 blat nib.lst /dev/null /dev/null -tileSize=10 \ -makeOoc=/san/sanvol1/scratch/danRer4/danRer4_10.ooc -repMatch=2048 # Wrote 12231 overused 10-mers to /cluster/bluearc/danRer4/10.ooc # keep copies of ooc files in this directory and copy to iscratch cp /san/sanvol1/scratch/danRer4/*.ooc . cp -p /san/sanvol1/scratch/danRer4/*.ooc /iscratch/i/danRer4/ # rsync to iServers foreach R (2 3 4 5 6 7 8) rsync -a --progress /iscratch/i/danRer4/*.ooc \ kkr${R}u00:/iscratch/i/danRer4/ end ########################################################################### # MAKE HGCENTRALTEST BLATSERVERS ENTRY FOR danRer4 (DONE, 2006-04-27, hartera) ssh hgwdev # DNA port is "0", trans prot port is "1" echo 'insert into blatServers values("danRer4", "blat17", "17788", "1", "0"); insert into blatServers values("danRer4", "blat17", "17789", "0", "1");' \ | hgsql hgcentraltest # this enables blat and isPcr, isPcr is enabled by loading blat server # with tilesize=5 (ask for this when request blat servers from # cluster admin). 
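    # Once the servers are up, a spot check of the DNA server could look
    # like this (test.fa is a hypothetical query file, and this assumes the
    # server was loaded from /gbdb/danRer4/danRer4.2bit; example only):
    gfClient blat17 17789 /gbdb/danRer4 test.fa test.psl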
# if you need to delete those entries echo 'delete from blatServers where db="danRer4";' | hgsql hgcentraltest ########################################################################### # AFFYMETRIX ZEBRAFISH GENOME ARRAY CHIP (DONE, 2006-04-24, hartera) # UPDATED (2006-09-28) - see separate section, UPDATE AFFY ZEBRAFISH TRACK. # NOTE: Jim recommends that, in the future, all AFFY blat alignments should drop # -mask=lower for blat and drop -minIdentity=95 to -minIdentity=90 as the # higher minIdentity is causing alignments to be dropped that should not be. # e.g. blat -fine -minIdentity=90 -ooc=11.ooc # $(path1) $(path2) {check out line+ psl/$(root1)_$(root2).psl} # pslReps can be used to handle filtering at a later step. Blat's minIdentity # seems to be more severe than that for pslReps as it takes insertions and # deletions into account. # CHECKED ALIGNMENTS USING MASKED TRFFA AND RESULTS ARE THE SAME # (DONE, 2006-05-30, hartera) # array chip sequences already downloaded for danRer1 ssh hgwdev # need to copy sequences to the bluearc first to transfer to the iServers cd /projects/compbio/data/microarray/affyZebrafish mkdir -p /cluster/bluearc/affy cp -p \ /projects/compbio/data/microarray/affyZebrafish/Zebrafish_consensus.fa \ /cluster/bluearc/affy/ # Set up cluster job to align Zebrafish consensus sequences to danRer3 ssh kkr1u00 mkdir -p /cluster/data/danRer4/bed/affyZebrafish.2006-04-24 ln -s /cluster/data/danRer4/bed/affyZebrafish.2006-04-24 \ /cluster/data/danRer4/bed/affyZebrafish cd /cluster/data/danRer4/bed/affyZebrafish mkdir -p /iscratch/i/affy cp /cluster/bluearc/affy/Zebrafish_consensus.fa /iscratch/i/affy foreach R (2 3 4 5 6 7 8) rsync -a --progress /iscratch/i/affy/*.fa \ kkr${R}u00:/iscratch/i/affy/ end # small cluster run to align sequences ssh kki cd /cluster/data/danRer4/bed/affyZebrafish ls -1 /iscratch/i/affy/Zebrafish_consensus.fa > affy.lst ls -1 /iscratch/i/danRer4/trfFa/chr[0-9M]*.fa > genome.lst # for output: mkdir -p psl echo '#LOOP\n/cluster/bin/i386/blat -fine -minIdentity=90 -ooc=/iscratch/i/danRer4/danRer4_11.ooc $(path1) $(path2) {check out line+ psl/$(root1)_$(root2).psl}\n#ENDLOOP' > template.sub gensub2 genome.lst affy.lst template.sub para.spec para create para.spec para try, check, push, check .... etc. # para time # Completed: 271 of 271 jobs # CPU time in finished jobs: 15331s 255.51m 4.26h 0.18d 0.000 y # IO & Wait Time: 737s 12.29m 0.20h 0.01d 0.000 y # Average job time: 59s 0.99m 0.02h 0.00d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 101s 1.68m 0.03h 0.00d # Submission to last job: 1557s 25.95m 0.43h 0.02d # do pslSort and liftUp ssh kkstore04 cd /cluster/data/danRer4/bed/affyZebrafish # Do sort, best in genome filter, and convert to chromosome coordinates # to create affyZebrafish.psl pslSort dirs raw.psl tmp psl # only use alignments that have at least 95% identity in aligned region. # try minCover as now there is less sequence in chrUn and chrNA # so less likely that genes are split up. grep '>' /cluster/bluearc/affy/Zebrafish_consensus.fa | wc -l # 15502 pslReps -minAli=0.95 -nearTop=0.005 raw.psl contig.psl /dev/null # see how many sequences are aligned: awk '{print $10;}' contig.psl > contigAligned tail +6 contigAligned | sort | uniq -c | sort -nr > contigAligned.count wc -l contigAligned.count # 14819 contigAligned.count tail +6 contig.psl | wc -l # 21486 # 96% of sequences are aligned. The sequence with the most alignments # aligns 177 times, then the next is 105, then 86, 85, 69, 69, 54, 54 etc. 
# for danRer3, 14335 were aligned (92% aligned). The sequence with # the most alignments aligned 96 times, then 31, 27, 22, 20, 19 times. # also 854 sequences aligned for danRer4 that did not align for danRer3. # 370 were aligned in danRer3 but not for danRer4. # USED THESE pslReps PARAMETERS: pslReps -minCover=0.30 -minAli=0.95 -nearTop=0.005 \ raw.psl contig2.psl /dev/null # see how many sequences are aligned: awk '{print $10;}' contig2.psl > contig2Aligned tail +6 contig2Aligned | sort | uniq -c | sort -nr > contig2Aligned.count wc -l contig2Aligned.count # 14528 contig2Aligned.count tail +6 contig2.psl | wc -l # 18744 # danRer3 has 21196 total alignments and 14335 sequences aligned. # 94% of sequences are aligned. # 785 sequences were aligned for danRer4 using minCover but not for # danRer3 after using pslReps. 592 sequences were aligned for danRer3 # but not for danRer4 using minCover after using pslReps. # the sequence with the most alignments aligns 105 times, then 85, 69, # 54, 50, 47, 44, 37, 26, 31, 29: # No. of alignments Sequence Name # 105 Zebrafish:Dr.15955.1.A1_at # 85 Zebrafish:Dr.20178.1.A1_at # 69 Zebrafish:Dr.885.1.S1_at # 54 Zebrafish:Dr.15958.1.S1_at # 50 Zebrafish:Dr.25427.1.A1_at # 47 Zebrafish:Dr.16470.1.A1_at # 44 Zebrafish:Dr.490.1.S1_at # 37 Zebrafish:Dr.7806.1.A1_at # 36 Zebrafish:Dr.19.1.A1_at # 31 Zebrafish:Dr.2825.1.A1_at # 29 Zebrafish:Dr.19556.1.A1_at # aligning with the -mask=lower option doesn't make a difference to the # number of alignments and sequences aligned. # there are 291 extra sequences that align when minCover option is # not used. Only 7 of these have 22 or more alignments. # 86 Zebrafish:Dr.24316.1.S1_at # 69 Zebrafish:Dr.14452.1.A1_at # 39 Zebrafish:Dr.12372.1.S1_at # 26 Zebrafish:Dr.18296.2.S1_a_at # 23 Zebrafish:Dr.7519.1.A1_at # 22 Zebrafish:Dr.8680.1.S1_at # 22 Zebrafish:Dr.22175.1.S1_at # clean up rm contig* # use pslReps without the minCover option as it does allow quite a lot # more alignments and the number of total alignments/number of sequences # aligned is still close to that for danRer3. Using nearTop=0.001 does # decrease the number of alignments but also means that some good # alignments are lost. pslReps -minAli=0.95 -nearTop=0.005 raw.psl contig.psl /dev/null liftUp affyZebrafish.psl ../../jkStuff/liftAll.lft warn contig.psl # shorten names in psl file: sed -e 's/Zebrafish://' affyZebrafish.psl > affyZebrafish.psl.tmp mv affyZebrafish.psl.tmp affyZebrafish.psl pslCheck affyZebrafish.psl # co-ordinates are ok. psl is good. # load track into database ssh hgwdev cd /cluster/data/danRer4/bed/affyZebrafish hgLoadPsl danRer4 affyZebrafish.psl # Add consensus sequences for Zebrafish chip # Copy sequences to gbdb if they are not there already mkdir -p /gbdb/hgFixed/affyProbes ln -s \ /projects/compbio/data/microarray/affyZebrafish/Zebrafish_consensus.fa \ /gbdb/hgFixed/affyProbes hgLoadSeq -abbr=Zebrafish: danRer4 \ /gbdb/hgFixed/affyProbes/Zebrafish_consensus.fa # Clean up rm batch.bak contig.psl raw.psl # trackDb.ra entry and html are already there in trackDb/zebrafish/ ########################################################################### # CREATE ZEBRAFISH AND OTHER SPECIES LINEAGE-SPECIFIC REPEATS DIRECTORY AND # ADD CHROM SIZES FOR BLASTZ CLUSTER RUNS (DONE, 2006-04-24, hartera) # There are no lineage-specific repeats for zebrafish and other species # so use all repeats. 
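    # (With BLASTZ_ABRIDGE_REPEATS=1, blastz uses the per-chrom *.out.spec
    # files under the SEQ*_SMSK directories to strip annotated repeats from
    # the sequence before aligning and to restore coordinates afterwards;
    # lacking a true fish lineage-specific repeat set, the full RepeatMasker
    # output is copied here instead.)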
ssh pk mkdir -p /san/sanvol1/scratch/danRer4/linSpecRep.notInOthers foreach f (/cluster/data/danRer4/*/chr*.fa.out) cp -p $f \ /san/sanvol1/scratch/danRer4/linSpecRep.notInOthers/$f:t:r:r.out.spec end cp -p /cluster/data/danRer4/chrom.sizes \ /san/sanvol1/scratch/danRer4/ ########################################################################### # BLASTZ, CHAIN, NET, MAFNET, AXTNET AND ALIGNMENT DOWNLOADS FOR # HUMAN (hg18) (DONE, 2006-04-24 - 2006-04-25, hartera) # LOAD BLASTZ PSLS INTO DATABASE AND CHECK FOR HUMAN CONTAMINATION # (DONE, 2006-05-11, hartera) ssh pk # Blastz uses lineage-specific repeats. There are none for human # and zebrafish so use all repeats. # There is a lineage-specific repeats directory for zebrafish (see # section on CREATE ZEBRAFISH AND OTHER SPECIES LINEAGE-SPECIFIC REPEATS # DIRECTORY. lineage-specific repeats for hg18 already made - see # makeHg18.doc (BLASTZ ZEBRAFISH section). mkdir -p /cluster/data/danRer4/bed/blastz.hg18.2006-04-24 cd /cluster/data/danRer4/bed ln -s blastz.hg18.2006-04-24 blastz.hg18 cd /cluster/data/danRer4/bed/blastz.hg18.2006-04-24 # only 5% of the danRer4 genome is now in the random unordered chroms # so not running only scaffolds for these chroms - run as virtual chroms # and use same parameters as for danRer2. cat << 'EOF' > DEF # danRer4 zebrafish target, human hg18 query export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/cluster/home/angie/schwartzbin:/parasol/bin BLASTZ=blastz.v7.x86_64 BLASTZ_ABRIDGE_REPEATS=1 # use parameters suggested for human-fish evolutionary distance # recommended in doBlastzChainNet.pl help # (previously used for hg16-fr1, danRer1-mm5) BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=6000 BLASTZ_K=2200 BLASTZ_Q=/san/sanvol1/scratch/blastz/HoxD55.q # TARGET: zebrafish (danRer4) # Use all chroms, including both randoms (chrNA_random and chrUn_random) SEQ1_DIR=/san/sanvol1/scratch/danRer4/nib SEQ1_SMSK=/san/sanvol1/scratch/danRer4/linSpecRep.notInOthers SEQ1_LEN=/san/sanvol1/scratch/danRer4/chrom.sizes SEQ1_CHUNK=50000000 SEQ1_LAP=10000 # QUERY: human (hg18) - single chunk big enough to run each chrom by itself # Use all chroms, including all randoms SEQ2_DIR=/san/sanvol1/scratch/hg18/nib SEQ2_LEN=/san/sanvol1/scratch/hg18/hg18Chroms.len SEQ2_SMSK=/san/sanvol1/scratch/hg18/linSpecRep.notInOthers SEQ2_CHUNK=300000000 SEQ2_LAP=0 BASE=/cluster/data/danRer4/bed/blastz.hg18.2006-04-24 TMPDIR=/scratch/tmp 'EOF' # << happy emacs chmod +x DEF nohup nice /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \ `pwd`/DEF >& doBlastz.log & # Start: Mon Apr 24 19:20 Stop: Tues Apr 25 05:42 # Did not finish: # netChains: looks like previous stage was not successful # (can't find [danRer4.hg18.]all.chain[.gz]). # This file is there so run again. Continue chainMerge step so remove # all.chain file and chain directory. # NOTE: can leave these files and continue from the net step and it # will work. cd /cluster/data/danRer4/bed/blastz.hg18.2006-04-24 rm ./axtChain/*.all.chain.gz rm -r ./axtChain/chain nohup nice /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \ -continue chainMerge `pwd`/DEF >& chainMerge.log & # Took about 10 minutes. 
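    # To eyeball the result before any loading (example only): the merged
    # chain file is score-sorted, so the strongest alignments appear first.
    zcat axtChain/danRer4.hg18.all.chain.gz | grep '^chain' | head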
# Check results with featureBits and compare to those # for danRer3 and danRer2: ssh hgwdev featureBits danRer4 chainHg18Link # 64196991 bases of 1626093931 (3.948%) in intersection featureBits danRer3 chainHg18Link # 69559338 bases of 1630323462 (4.267%) in intersection featureBits danRer2 chainHg17Link # 70046373 bases of 1560497282 (4.489%) in intersection # After Genbank tracks are loaded, (hartera, 2006-04-27) featureBits -chrom=chr1 danRer4 refGene:cds chainHg18Link -enrichment # refGene:cds 0.732%, chainHg18Link 4.140%, both 0.558%, cover 76.19%, # enrich 18.40x featureBits -chrom=chr1 danRer3 refGene:cds chainHg18Link -enrichment # refGene:cds 0.769%, chainHg18Link 4.124%, both 0.604%, cover 78.49%, # enrich 19.03x featureBits -chrom=chr1 danRer4 refGene:cds netHg18 -enrichment # refGene:cds 0.732%, netHg18 31.154%, both 0.624%, cover 85.21%, # enrich 2.73x featureBits -chrom=chr1 danRer3 refGene:cds netHg18 -enrichment # refGene:cds 0.774%, netHg18 35.434%, both 0.679%, cover 87.73%, # enrich 2.48x # Similar coverage and enrichment as for hg18 chains and net on danRer3. # do the swap for Blastz chains over to human (hg18) and create net, # axtNet, mafNet, liftOver and Downloads. see also makeHg18.doc for # featureBits on these alignments. ssh pk cd /cluster/data/danRer4/bed/blastz.hg18.2006-04-24 nice /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \ -swap `pwd`/DEF >& doSwap.log & # Took about 15 minutes. # Load Blastz results into database (DONE, 2006-05-11, hartera) ssh kkstore04 cd /cluster/data/danRer4/bed/blastz.hg18/pslParts # cat together Blastz for each chrom mkdir pslChrom foreach c (`cat /cluster/data/danRer4/chrom.lst`) echo "Processing $c ..." foreach p (chr${c}.nib*) zcat $p >> ./pslChrom/chr${c}_blastzHg18.psl end end # load Blastz psls into the database ssh hgwdev cd /cluster/data/danRer4/bed/blastz.hg18/pslParts/pslChrom foreach f (*.psl) /cluster/bin/i386/hgLoadPsl danRer4 $f echo "$f Done" end # Then determine how much sequence has 100% identity to human with a # stretch of at least 300 bp. Human contamination was also found in # danRer1 and a user reported it more recently. foreach c (`cat /cluster/data/danRer4/chrom.lst`) echo "chr$c" >> humanContamination.txt hgsql -e "select count(*) from chr${c}_blastzHg18 where matches >= 300 and misMatches = 0;" danRer4 >> humanContamination.txt end # There are 4 on chr11 that fit this criteria (same if decrease to regions # of >= 200 bp with 100% ID). hgsql -e \ 'select * from chr11_blastzHg18 where matches >= 300 and mismatches = 0;' \ danRer4 > chr11HumanSeq # only 2 of these also have no query inserts and 1 of the others only has # a 1 base insert: regions are of size 303, 310 and 367 bp. 
# The region of 330 bp has a 45 bp insert on the query side - see below
#bin matches misMatches repMatches nCount qNumInsert qBaseInsert tNumInsert tBaseInsert strand qName qSize qStart qEnd tName tSize tStart tEnd blockCount blockSizes qStarts tStarts
#588 303 0 0 0 0 0 0 0 - chr4 191273063 69879746 69880049 chr11 52342180 502145 502448 1 303, 121393014, 502145,
#588 330 0 0 0 1 45 0 0 - chr4 191273063 69879319 69879694 chr11 52342180 502545 502875 2 1,329, 121393369,121393415, 502545,502546,
#588 310 0 0 0 0 0 0 0 - chr4 191273063 69878956 69879266 chr11 52342180 502928 503238 1 310, 121393797, 502928,
#588 667 0 0 0 1 1 0 0 - chr4 191273063 69878268 69878936 chr11 52342180 503258 503925 2 453,214, 121394127,121394581, 503258,503711,
###########################################################################
# BLASTZ/CHAIN/NET PREP (DONE 4/25/06 angie)
ssh kkstore04
cd /cluster/data/danRer4
cp -p danRer4.2bit /san/sanvol1/scratch/danRer4/
# Create a 2bit file for danRer4 with all chroms (1-25 and M) and the
# scaffolds for NA and Un:
awk '$1 == $6 {print $1;}' Zv6.scaffolds.agp \
| faSomeRecords Zv6_scaffolds.fa stdin stdout \
| faToTwoBit [1-9]/chr*.fa [12][0-9]/chr*.fa M/chrM.fa stdin \
  /san/sanvol1/scratch/danRer4/danRer4ChrUnNAScafs.2bit
twoBitInfo /san/sanvol1/scratch/danRer4/danRer4ChrUnNAScafs.2bit \
  /san/sanvol1/scratch/danRer4/chromsUnNAScafs.sizes
# Make a lift file for scaffolds --> {chrUn, chrNA}:
mkdir /cluster/data/danRer4/liftSupertoChrom
cd /cluster/data/danRer4/liftSupertoChrom
/cluster/bin/scripts/agpToLift \
  < ../NA_random/agps/chrNA_random.scaffolds.agp \
  > chrNA_random.lft
/cluster/bin/scripts/agpToLift \
  < ../Un_random/agps/chrUn_random.scaffolds.agp \
  > chrUn_random.lft
cat chr*.lft > liftNAandUnScaffoldsToChrom.lft
cp -p liftNAandUnScaffoldsToChrom.lft /san/sanvol1/scratch/danRer4/
# Distribute on /iscratch/i too (danRer4.2bit is already there):
ssh kkr1u00
cd /iscratch/i/danRer4
cp -p /san/sanvol1/scratch/danRer4/danRer4ChrUnNAScafs.2bit .
twoBitInfo danRer4ChrUnNAScafs.2bit chromsUnNAScafs.sizes
cp -p \
  /cluster/data/danRer4/liftSupertoChrom/liftNAandUnScaffoldsToChrom.lft .
iSync
###########################################################################
# BLASTZ/CHAIN/NET XENTRO2 (DONE 4/26/06 angie)
ssh kkstore04
mkdir /cluster/data/danRer4/bed/blastz.xenTro2.2006-04-25
cd /cluster/data/danRer4/bed/blastz.xenTro2.2006-04-25
cat << '_EOF_' > DEF
# zebrafish vs.
frog BLASTZ=/cluster/bin/penn/i386/blastz # Use same params as used for danRer1-xenTro1 (see makeXenTro1.doc) BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=6000 BLASTZ_K=2200 BLASTZ_Q=/cluster/data/blastz/HoxD55.q # TARGET: Zebrafish danRer4 SEQ1_DIR=/iscratch/i/danRer4/danRer4.2bit SEQ1_CTGDIR=/iscratch/i/danRer4/danRer4ChrUnNAScafs.2bit SEQ1_LIFT=/iscratch/i/danRer4/liftNAandUnScaffoldsToChrom.lft SEQ1_LEN=/cluster/data/danRer4/chrom.sizes SEQ1_CTGLEN=/iscratch/i/danRer4/chromsUnNAScafs.sizes SEQ1_CHUNK=50000000 SEQ1_LAP=10000 SEQ1_LIMIT=100 # QUERY: Frog xenTro2 - single chunk big enough to run two of the # largest scaffolds in one job SEQ2_DIR=/scratch/hg/xenTro2/xenTro2.2bit SEQ2_LEN=/cluster/bluearc/xenTro2/chrom.sizes SEQ2_CHUNK=20000000 SEQ2_LAP=0 SEQ2_LIMIT=100 BASE=/cluster/data/danRer4/bed/blastz.xenTro2.2006-04-25 '_EOF_' # << emacs # kkstore04 can't see /iscratch so use an iServer as fileServer: doBlastzChainNet.pl -blastzOutRoot=/cluster/bluearc/danRer4XenTro2 \ -bigClusterHub=kk -fileServer=kkr8u00 -workhorse=kkr8u00 \ -chainMinScore=5000 -chainLinearGap=loose DEF \ >& do.log & tail -f do.log ln -s blastz.xenTro2.2006-04-25 /cluster/data/danRer4/bed/blastz.xenTro2 ########################################################################### # CREATE LIFT FILES FOR RANDOM CHROMOSOMES' SCAFFOLDS # (DONE, 2006-04-25, hartera) # scaffolds lift files created by scaffoldFaToAgp when agp files created # for chrNA_random and chrUn_random. remove last line as this is an extra # gap line that was removed from the agp. ssh kkstore01 cd /cluster/data/danRer4 foreach c (NA_random Un_random) mkdir -p /cluster/data/danRer4/$c/tmp end # NA_random doesn't have .lft and .gap files from scaffoldFaToAgp so # recreate. It had no tmp dir with the NA_random.scaffolds.agp. awk '{if ($1 ~ /Zv6_NA/) print;}' Zv6.scaffolds.agp \ > ./NA_random/tmp/NA_random.scaffolds.agp # change the first field to "chrNA_random" then can use agpToFa to process perl -pi.bak -e 's/Zv6_NA[0-9]+/chrNA_random/' ./NA_random/tmp/*.agp wc -l ./NA_random/tmp/NA_random.scaffolds.agp # 2898 ./NA_random/tmp/NA_random.scaffolds.agp cd /cluster/data/danRer4 foreach c (NA_random) awk '{print $6;}' $c/tmp/$c.scaffolds.agp > $c/tmp/chr$c.scaffolds.lst $HOME/bin/i386/faSomeRecords /cluster/data/danRer4/Zv6_scaffolds.fa \ $c/tmp/chr$c.scaffolds.lst $c/tmp/chr$c.fa end cd /cluster/data/danRer4/NA_random/tmp scaffoldFaToAgp -scaffoldGapSize=50000 chrNA_random.fa # change chrUn to chrNA_random for NA_random, change chrUn to chrUn_random # forUn_random. Change D to W for NA_random and Un_random.. sed -e 's/chrUn/chrNA_random/' chrNA_random.agp \ | sed -e 's/D/W/' > chrNA_random.scaffolds.agp mv chrNA_random.fa chrNA_random.scaffolds.fa # also move the Un_random .lft and .gap files to Un_random/tmp mv ./Un_random/chrUn_random.lft ./Un_random/tmp/chrUn_random.lft mv ./Un_random/chrUn_random.gap ./Un_random/tmp/chrUn_random.gap # for chrNA_random and chrUn_random: remove last line as this is an extra # gap line that was removed from the chrN_random.agp. Add these # scaffold lift files to liftAll.lft. Also need to change the last # field so that the correct total number of bases is being shown in the # last column. 
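# (For reference, a liftUp .lft line has five fields: offset, scaffold name,
# scaffold size, chrom name, chrom size -- the "last field" being corrected
# below is the chrom size. Not part of the original build, but once the loop
# below has run, the fix can be spot-checked against chrom.sizes, e.g. for
# chrNA_random; both commands should print the same single number.)
awk '$1 == "chrNA_random" {print $2}' /cluster/data/danRer4/chrom.sizes
awk '{print $5}' NA_random/tmp/chrNA_random.scaffolds.lft | sort -u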
cd /cluster/data/danRer4 foreach c (NA_random Un_random) head -n -1 $c/tmp/chr${c}.lft > $c/tmp/chr${c}.scaffolds.lft perl -pi.bak -e "s/chrUn/chr${c}/" $c/tmp/chr${c}.scaffolds.lft if ($c == "NA_random") then perl -pi.bak -e 's/208064280/208014280/' \ $c/tmp/chrNA_random.scaffolds.lft else perl -pi.bak -e 's/19379532/19329532/' \ $c/tmp/chrUn_random.scaffolds.lft endif cat $c/tmp/chr${c}.scaffolds.lft >> ./jkStuff/liftAll.lft rm $c/tmp/chr${c}.lft $c/tmp/chr${c}.gap *.bak end ########################################################################### # AUTO UPDATE GENBANK MRNA AND EST AND MGC GENES RUN # (DONE, 2006-04-25 - 2006-04-26, hartera) ssh hgwdev cd ~kent/src/hg/makeDb/genbank cvs update -d -P etc # edit etc/genbank.conf to add danRer4 and commit this to CVS. # danRer4 (zebrafish) # Lift file partitions unplaced sequence pseudo-chroms danRer4.serverGenome = /cluster/data/danRer4/danRer4.2bit danRer4.clusterGenome = /iscratch/i/danRer4/danRer4.2bit danRer4.ooc = /iscratch/i/danRer4/danRer4_11.ooc danRer4.align.unplacedChroms = chrNA_random chrUn_random danRer4.lift = /cluster/data/danRer4/jkStuff/liftAll.lft danRer4.refseq.mrna.native.pslCDnaFilter = ${ordered.refseq.mrna.native.pslCDnaFilter} danRer4.refseq.mrna.xeno.pslCDnaFilter = ${ordered.refseq.mrna.xeno.pslCDnaFilter} danRer4.genbank.mrna.native.pslCDnaFilter = ${ordered.genbank.mrna.native.pslCDnaFilter} danRer4.genbank.mrna.xeno.pslCDnaFilter = ${ordered.genbank.mrna.xeno.pslCDnaFilter} danRer4.genbank.est.native.pslCDnaFilter = ${ordered.genbank.est.native.pslCDnaFilter} danRer4.downloadDir = danRer4 danRer4.mgcTables.default = full danRer4.mgcTables.mgc = all # end of section added to etc/genbank.conf cvs commit -m "Added danRer4." etc/genbank.conf # update /cluster/data/genbank/ make etc-update # ~/kent/src/hg/makeDb/genbank/src/lib/gbGenome.c already contains # danRer genome information ssh kkstore02 cd /cluster/data/genbank nice bin/gbAlignStep -initial danRer4 & # Start: Tues Apr 25 12:53 Finish: Wed Apr 26 08:38 # logFile: var/build/logs/2006.04.25-12:53:39.danRer4.initalign.log # check log file tail -f var/build/logs/2006.04.25-12:53:39.danRer4.initalign.log # check it has finished (last line in log file): # kkstore02 2006.04.26-08:38:36 danRer4.initalign: finish # load database when finished ssh hgwdev cd /cluster/data/genbank nice ./bin/gbDbLoadStep -drop -initialLoad danRer4 & # logFile: var/dbload/hgwdev/logs/2006.04.26-15:45:19.dbload.log # check it is finished: hgwdev 2006.04.26-17:48:07 dbload: finish # Took about 2 hours. ########################################################################### # SPLIT UP ZEBRAFISH MASKED SEQUENCE FROM chrUn AND chrNA INTO SCAFFOLDS # ADD SOFT-MASKED SCAFFOLDS TO ISERVERS AND THE SAN FOR CLUSTER RUNS # (DONE, 2006-04-27, hartera) ssh kkstore01 cd /cluster/data/danRer4 # for chrNA_random and chrUn_random, get soft-masked sequence. foreach c (NA_random Un_random) cd $c mkdir scaffoldsSoftMask awk 'BEGIN {FS="\t"}{if ($5 != "N") \ print "faFrag -mixed chr'${c}'.fa",$2-1, $3, $6".fa";}' chr${c}.agp \ >> ./scaffoldsSoftMask/faFragSoftMask.csh cd .. end # change permissions run scripts to get sequences foreach d (NA_random Un_random) chmod +x $d/scaffoldsSoftMask/faFragSoftMask.csh end # wrapper shell script to run script to get the soft-masked scaffolds cat << '_EOF_' > jkStuff/getMaskedScaffolds.csh #!/bin/csh foreach c (NA_random Un_random) set dir=/cluster/data/danRer4 echo "Processing $c" cd $dir/$c/scaffoldsSoftMask cp ../chr${c}.fa . 
echo "Getting soft-masked sequences ..." nice faFragSoftMask.csh >& faFrag.log end '_EOF_' chmod +x jkStuff/getMaskedScaffolds.csh nice ./jkStuff/getMaskedScaffolds.csh & # Took about 2.5 hours. # check a few sequences that they are correct # add name of scaffold to sequence fasta and cat together foreach c (NA_random Un_random) set dir = /cluster/data/danRer4 cd $dir/$c/scaffoldsSoftMask foreach f (Zv*) set g=$f:r set sc=scaffold${c}.fa perl -pi.bak -e "s/>chr[0-9A-Za-z\-\:_]+/>$g/" $f cat $f >> $sc rm *.bak end cp scaffold* $dir/$c/ end grep '>' NA_random/scaffoldNA_random.fa | wc -l # 2898 grep '>' Un_random/scaffoldUn_random.fa | wc -l # 68 # check sizes of final FASTA file with all sequences. check a few # sequence files to see that they are correct - ok cd /cluster/data/danRer4 cat << '_EOF_' > ./jkStuff/checkFastaSizes.csh #!/bin/csh -fe set scafName=$1 set agpLen=$2 set pref=`echo $scafName | cut -c1-2` if ($pref == "Zv") then set g=/cluster/data/danRer4/*/scaffoldsSoftMask/${scafName}.fa set h=$g:t echo "Getting size of $h" set faLen = `faSize $g | awk '{print $1;}'` if ($agpLen == $faLen) then echo " OK: apg length = $h length = $faLen" else echo "ERROR: length = $agpLen, but $h length = $faLen" endif endif '_EOF_' # << happy emacs chmod +x ./jkStuff/checkFastaSizes.csh # use bash as doing a cat in C shell seems to split the line up by space bash for c in NA_random Un_random do echo "Processing $c scaffolds ..."; cat $c/chr${c}.agp | while read line; do scaf=`echo $line | cut -d " " -f6`; size=`echo $line | cut -d " " -f8`; nice ./jkStuff/checkFastaSizes.csh $scaf $size >> checkFastaSizes.log; done done exit # back to C shell grep "ERROR:" checkFastaSizes.log | wc -l # No errors so all are the OK so FASTA files are the expected size # Add soft-masked scaffolds to the Iservers and the san for cluster runs ssh kkr1u00 cd /cluster/data/danRer4 mkdir /iscratch/i/danRer4/scaffoldsSoftMask foreach c (NA_random Un_random) foreach f (/cluster/data/danRer4/$c/scaffoldsSoftMask/Zv*.fa) cp -p $f /iscratch/i/danRer4/scaffoldsSoftMask end cp -p /cluster/data/danRer4/$c/scaffold${c}.fa /iscratch/i/danRer4 end ls /iscratch/i/danRer4/scaffoldsSoftMask/ | wc # 2966 # all files are there # rsync to cluster machines foreach R (2 3 4 5 6 7 8) rsync -a --progress /iscratch/i/danRer4/ kkr${R}u00:/iscratch/i/danRer4/ end ssh pk mkdir -p /san/sanvol1/scratch/danRer4/scaffoldsSoftMask foreach c (NA_random Un_random) foreach f (/cluster/data/danRer4/$c/scaffoldsSoftMask/Zv*.fa) rsync -a --progress $f /san/sanvol1/scratch/danRer4/scaffoldsSoftMask/ end rsync -a --progress /cluster/data/danRer4/${c}/scaffold${c}.fa \ /san/sanvol1/scratch/danRer4/ end foreach f (/san/sanvol1/scratch/danRer4/scaffoldsSoftMask/*.fa) echo $f >> files.log end wc -l files.log # 2966 files.log rm files.log # All files have transferred. 
########################################################################### ## SWAP MM8 blastz result (DONE - 2006-04-28 - Hiram) # ADD SYMBOLIC LINK TO SWAP DIR (DONE, 2006-05-04, hartera) # RE-MAKE MM8 CHAINS AND NET SWAP WITH DANRER4 RANDOM CHROMS # (DONE, 2006-05-24, hartera) ADDED LINK TO SWAP DIR (2006-05-27, hartera) ssh pk cd /cluster/data/mm8/bed/blastzDanRer4.2006-05-22 # blastz parameters used in blastz alignment of danRer4 on mm8: # BLASTZ_ABRIDGE_REPEATS=1 # BLASTZ_H=2000 # BLASTZ_Y=3400 # BLASTZ_L=6000 # BLASTZ_K=2200 # BLASTZ_M=50 # BLASTZ_Q=/cluster/data/blastz/HoxD55.q time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \ -swap `pwd`/DEF > swap.out 2>&1 & ssh hgwdev cd /cluster/data/mm8/bed/blastzDanRer4.2006-05-22 time nice -n +19 featureBits danRer4 chainMm8Link \ > fb.danRer4.chainDanRer4Link 2>&1 & cat fb.danRer4.chainDanRer4Link # 60721886 bases of 1626093931 (3.734%) in intersection # Add symbolic link to new swap directory (2006-05-27, hartera) ssh kkstore04 cd /cluster/data/danRer4/bed ln -s blastz.mm8.swap blastz.mm8 ########################################################################### # MONDOM4 BLASTZ TESTS USING LINEAGE-SPECIFIC REPEATS OR DYNAMIC MASKING # AND SWAP (DONE, 2006-04-28, hartera) # used no lineage specific-repeats and M=50 for dynamic masking featureBits danRer4 chainMonDom4 # 541863023 bases of 1626093931 (33.323%) in intersection featureBits danRer4 chainMonDom4NoDyMsk # 534445657 bases of 1626093931 (32.867%) in intersection featureBits monDom4 chainDanRer4 # 856404995 bases of 3501643220 (24.457%) in intersection featureBits monDom4 chainDanRer4NoDyMsk # 812142533 bases of 3501643220 (23.193%) in intersection featureBits -chrom=chr1 danRer4 refGene:cds chainMonDom4Link -enrichment # refGene:cds 0.732%, chainMonDom4Link 5.573%, both 0.550%, cover 75.20%, # enrich 13.49x featureBits -chrom=chr1 danRer4 refGene:cds chainMonDom4NoDyMskLink -enrichment # refGene:cds 0.732%, chainMonDom4NoDyMskLink 4.083%, both 0.550%, # cover 75.15%, enrich 18.40x featureBits -chrom=chr1 monDom4 refGene:cds chainDanRer4Link -enrichment # refGene:cds 0.001%, chainDanRer4Link 2.448%, both 0.000%, # cover 55.63%, enrich 22.73x featureBits -chrom=chr1 monDom4 refGene:cds chainDanRer4NoDyMskLink -enrichment # refGene:cds 0.001%, chainDanRer4NoDyMskLink 1.807%, both 0.000%, # cover 43.85%, enrich 24.27x # There are only 36 RefSeq genes for monDom4 so results are misleading. # Try mrna and xenoRefGene table. 
# for mrna tables, not much difference: featureBits -chrom=chr1 monDom4 mrna chainDanRer4Link -enrichment # mrna 0.004%, chainDanRer4Link 2.448%, both 0.002%, cover 54.59%, # enrich 22.30x featureBits -chrom=chr1 monDom4 mrna chainDanRer4NoDyMskLink -enrichment # mrna 0.004%, chainDanRer4NoDyMskLink 1.807%, both 0.002%, # cover 52.67%, enrich 29.15x featureBits -chrom=chr1 monDom4 xenoRefGene:cds chainDanRer4Link -enrichment # xenoRefGene:cds 0.820%, chainDanRer4Link 2.448%, both 0.655%, # cover 79.88%, enrich 32.63x featureBits -chrom=chr1 monDom4 xenoRefGene:cds chainDanRer4NoDyMskLink -enrichment # xenoRefGene:cds 0.820%, chainDanRer4NoDyMskLink 1.807%, both 0.661%, # cover 80.63%, enrich 44.63x # For the nets: featureBits -chrom=chr1 danRer4 refGene:cds netMonDom4 -enrichment # refGene:cds 0.732%, netMonDom4 31.056%, both 0.612%, # cover 83.58%, enrich 2.69x featureBits -chrom=chr1 danRer4 refGene:cds netMonDom4NoDyMsk -enrichment # refGene:cds 0.732%, netMonDom4NoDyMsk 31.002%, both 0.617%, # cover 84.31%, enrich 2.72x featureBits -chrom=chr1 monDom4 refGene:cds netDanRer4 -enrichment # refGene:cds 0.001%, netDanRer4 25.224%, both 0.000%, # cover 66.95%, enrich 2.65x featureBits -chrom=chr1 monDom4 refGene:cds netDanRer4NoDyMsk -enrichment # refGene:cds 0.001%, netDanRer4NoDyMsk 24.539%, both 0.000%, # cover 49.19%, enrich 2.00x # rows in tables for chr1 # Assembly Table Number of rows # danRer4 chainMonDom4 36931 # danRer4 chainMonDom4Link 426659 # danRer4 chainMonDom4NoDyMsk 34363 # danRer4 chainMonDom4NoDyMskLink 361572 # monDom4 chainDanRer4 170759 # monDom4 chainDanRer4Link 2552995 # monDom4 chainDanRer4NoDyMsk 139797 # monDom4 chainDanRer4NoDyMskLink 1806858 # all chroms: # danRer4 netMonDom4 399531 # danRer4 netMonDom4NoDyMsk 346482 # monDom4 netDanRer4 395881 # monDom4 netDanRer4NoDyMsk 321288 # Use lineage-specific repeats and no dynamic masking, seem to get # better enrichment and coverage compared to gene CDS regions and also # there are less chains being produced. ########################################################################### # BLASTZ, CHAIN, NET, MAFNET, AXTNET AND ALIGNMENT DOWNLOADS FOR # OPOSSUM (monDom4) (DONE, 2006-04-28 - 2006-04-29, hartera) ssh hgwdev # Remove all test chain and net tables and start again foreach c (`cat chrom.lst`) hgsql -e "drop table chr${c}_chainMonDom4;" danRer4 hgsql -e "drop table chr${c}_chainMonDom4Link;" danRer4 hgsql -e "drop table chr${c}_chainMonDom4NoDyMsk;" danRer4 hgsql -e "drop table chr${c}_chainMonDom4NoDyMskLink;" danRer4 end hgsql -e "drop table netMonDom4;" danRer4 hgsql -e "drop table netMonDom4NoDyMsk;" danRer4 # remove downloads rm -r /usr/local/apache/htdocs/goldenPath/danRer4/vsMonDom4 rm \ /usr/local/apache/htdocs/goldenPath/danRer4/liftOver/danRer4ToMonDom4.over.chain.gz rm /cluster/data/danRer4/bed/liftOver/danRer4ToMonDom4.over.chain.gz # remove old Blastz swap rm -r /cluster/data/danRer4/bed/blastz.monDom4.swap # remove link to old blastz directory rm -r /cluster/data/danRer4/bed/blastz.monDom4 # see makeMonDom4.doc for removal of test tables and download files # and swap directory on monDom4. ssh pk # Blastz uses lineage-specific repeats. There are none for human # and zebrafish so use all repeats. # There is a lineage-specific repeats directory for zebrafish (see # section on CREATE ZEBRAFISH AND OTHER SPECIES LINEAGE-SPECIFIC REPEATS # DIRECTORY. lineage-specific repeats for monDom4 made and also nibs - see # makeMonDom4.doc. Need nib files when running Blastz with # lineage-specific repeats. 
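# (Not part of the original build: a pre-flight glance that the masked inputs
# named in the DEF below are in place on the san; the paths are the same ones
# the DEF uses.)
ls /san/sanvol1/scratch/danRer4/nib | wc -l
ls /san/sanvol1/scratch/danRer4/linSpecRep.notInOthers | wc -l
ls /san/sanvol1/scratch/monDom4/nib | wc -l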
mkdir -p /cluster/data/danRer4/bed/blastz.monDom4.2006-04-28 cd /cluster/data/danRer4/bed ln -s blastz.monDom4.2006-04-28 blastz.monDom4 cd /cluster/data/danRer4/bed/blastz.monDom4.2006-04-28 # only 5% of the danRer4 genome is now in the random unordered chroms # so not running only scaffolds for these chroms - run as virtual chroms # and use same parameters as for danRer2 but use all repeats as # lineage-specific as monDom4 is now mapped to chroms. cat << 'EOF' > DEF # danRer4 zebrafish target, opossum monDom4 query export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/cluster/home/angie/schwartzbin:/parasol/bin BLASTZ=blastz.v7.x86_64 BLASTZ_ABRIDGE_REPEATS=1 # use parameters suggested for human-fish evolutionary distance # recommended in doBlastzChainNet.pl help. BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=6000 BLASTZ_K=2200 BLASTZ_Q=/san/sanvol1/scratch/blastz/HoxD55.q # TARGET: zebrafish (danRer4) # Use all chroms, including both randoms (chrNA_random and chrUn_random) SEQ1_DIR=/san/sanvol1/scratch/danRer4/nib SEQ1_SMSK=/san/sanvol1/scratch/danRer4/linSpecRep.notInOthers SEQ1_LEN=/san/sanvol1/scratch/danRer4/chrom.sizes SEQ1_CHUNK=100000000 SEQ1_LAP=10000 # QUERY: opossum (monDom4) SEQ2_DIR=/san/sanvol1/scratch/monDom4/nib SEQ2_LEN=/san/sanvol1/scratch/monDom4/chrom.sizes SEQ2_SMSK=/san/sanvol1/scratch/monDom4/linSpecRep.notInOthers SEQ2_CHUNK=50000000 SEQ2_LIMIT=100 SEQ2_LAP=0 BASE=/cluster/data/danRer4/bed/blastz.monDom4.2006-04-28 TMPDIR=/scratch/tmp 'EOF' # << happy emacs chmod +x DEF nohup nice /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \ `pwd`/DEF >& doBlastz.log & # Start: Fri Apr 28 13:27 Finish: Apr 29 01:28 # Stopped after making and merging chains: # netChains: looks like previous stage was not successful # (can't find [danRer4.monDom4.]all.chain[.gz]). # Start again with net step and continue: cd /cluster/data/danRer4/bed/blastz.monDom4.2006-04-28 nohup nice /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \ -continue net `pwd`/DEF >& net.log & # Took about 15 minutes to finish. # Do swap to get danRer4 alignments on monDom4: # see also makeMonDom4.doc cd /cluster/data/danRer4/bed/blastz.monDom4.2006-04-28 nice /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \ -swap `pwd`/DEF >& doSwap.log & # Took about 15 minutes. ########################################################################### # BLASTZ FOR FUGU (fr1) (DONE, 2006-04-28 - 2006-04-29, hartera) # CREATE CHAIN AND NET TRACKS, AXTNET, MAFNET AND ALIGNMENT DOWNLOADS # No lineage-specific repeats for this species pair. fr1 is in scaffolds # so not so easy to use repeats with this run anyway. There is a 2bit # file of scaffolds on the Iservers. # Run this with dynamic masking instead. # copy masked fr1 scaffolds 2 bit file to the san - see makeFr1.doc # size of scaffolds FASTA file: ssh kkr1u00 faSize /panasas/store/fr1/scaffolds/scaffoldMaskedUnFr1.fa # 329140338 bases ssh pk mkdir /cluster/data/danRer4/bed/blastz.fr1.2006-04-28 cd /cluster/data/danRer4/bed ln -s blastz.fr1.2006-04-28 blastz.fr1 cd /cluster/data/danRer4/bed/blastz.fr1.2006-04-28 # use parameters for fr1 in makeDanRer2.doc. Using scaffolds makes this run # slower so it is best to have the scaffolds in the query. Use HoxD55.q # matrix as Fugu is quite distant from zebrafish. 
Blastz uses # lineage-specfic repeats but there are none for these two species. # Use soft-masked scaffolds and dynamic masking. cat << '_EOF_' > DEF # zebrafish (danRer4) vs. Fugu (fr1) export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/cluster/home/angie/schwartzbin:/parasol/bin BLASTZ=blastz.v7.x86_64 BLASTZ_ABRIDGE_REPEATS=0 ALIGN=blastz-run BLASTZ=blastz BLASTZ_H=2000 BLASTZ_M=50 BLASTZ_Q=/cluster/data/blastz/HoxD55.q # TARGET - zebrafish (danRer4) SEQ1_DIR=/san/sanvol1/scratch/danRer4/danRer4.2bit SEQ1_LEN=/san/sanvol1/scratch/danRer4/chrom.sizes # 0.5 Mb chunk for target with 5 kb overlap SEQ1_LIMIT=30 SEQ1_CHUNK=500000 SEQ1_LAP=5000 # QUERY - Fugu (fr1) SEQ2_DIR=/san/sanvol1/scratch/fr1/fr1.2bit # soft-masked scaffolds in 2bit format SEQ2_CTGDIR=/san/sanvol1/scratch/fr1/UnScaffolds/fr1UnScaffolds.2bit SEQ2_LIFT=/san/sanvol1/scratch/fr1/UnScaffolds/ordered.lft SEQ2_LEN=/san/sanvol1/scratch/fr1/chrom.sizes SEQ2_CTGLEN=/san/sanvol1/scratch/fr1/UnScaffolds/scaffolds.sizes # large enough chunk to do whole genome at once SEQ2_CHUNK=500000000 SEQ2_LAP=0 BASE=/cluster/data/danRer4/bed/blastz.fr1.2006-04-28 TMPDIR=/scratch/tmp '_EOF_' # << this line keeps emacs coloring happy chmod +x DEF nohup nice /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \ `pwd`/DEF >& doBlastz.log & # Start: Fri Apr 28 18:54 Finish: Apr 29 06:35 # Stopped after making and merging chains: # netChains: looks like previous stage was not successful # (can't find [danRer4.fr1.]all.chain[.gz]). # Start again with net step and continue: cd /cluster/data/danRer4/bed/blastz.fr1.2006-04-28 nohup nice /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \ -continue net `pwd`/DEF >& net.log & # Took about an hour to finish. # check coverage: featureBits danRer4 chainFr1Link # 139280554 bases of 1626093931 (8.565%) in intersection featureBits danRer3 chainFr1Link # 137698495 bases of 1630323462 (8.446%) in intersection featureBits -chrom=chr1 danRer4 refGene:cds chainFr1Link -enrichment # refGene:cds 0.732%, chainFr1Link 8.464%, both 0.660%, # cover 90.18%, enrich 10.66x featureBits -chrom=chr1 danRer3 refGene:cds chainFr1Link -enrichment # refGene:cds 0.774%, chainFr1Link 8.364%, both 0.713%, # cover 92.09%, enrich 11.01x featureBits -chrom=chr1 danRer4 refGene:cds netFr1 -enrichment # refGene:cds 0.732%, netFr1 52.712%, both 0.710%, # cover 96.97%, enrich 1.84x featureBits -chrom=chr1 danRer3 refGene:cds netFr1 -enrichment # refGene:cds 0.774%, netFr1 58.353%, both 0.759%, # cover 97.95%, enrich 1.68x # Do the Blastz swap to get danRer4 alignments on fr1 # see also makeFr1.doc for featureBits on these alignments. ssh pk cd /cluster/data/danRer4/bed/blastz.fr1.2006-04-28 nice /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \ -swap `pwd`/DEF >& doSwap.log & # Took about 30 minutes. ########################################################################### # BLASTZ FOR TETRAODON (tetNig1) (DONE, 2006-04-29 - 2006-04-30, hartera) # CREATE CHAIN AND NET TRACKS, AXTNET, MAFNET AND ALIGNMENT DOWNLOADS # No lineage-specific repeats for this species pair. # Tetraodon also has no species-specific repeats in the RepeatMasker # library so run this using dynamic masking instead as for danRer2 and # danRer3. 
# The tetraodon 2bit file of chroms and scaffolds # (tetNig1ChromsRandomScafs.2bit) - this contains sequences for chroms # and for scaffolds of random chroms. ssh pk mkdir /cluster/data/danRer4/bed/blastz.tetNig1.2006-04-29 cd /cluster/data/danRer4/bed ln -s blastz.tetNig1.2006-04-29 blastz.tetNig1 cd /cluster/data/danRer4/bed/blastz.tetNig1.2006-04-29 # use parameters for tetNig1 in makeDanRer3.doc. Using scaffolds makes this run # slower so it is best to have the scaffolds in the query. Use HoxD55.q # matrix as tetraodon is quite distant from zebrafish. Blastz uses # lineage-specfic repeats but there are none for these two species. # Use soft-masked scaffolds and dynamic masking. cat << '_EOF_' > DEF # zebrafish (danRer4) vs. tetraodon (tetNig1) export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/cluster/home/angie/schwartzbin:/parasol/bin BLASTZ=blastz.v7.x86_64 BLASTZ_ABRIDGE_REPEATS=0 ALIGN=blastz-run BLASTZ=blastz BLASTZ_H=2500 BLASTZ_M=50 BLASTZ_Q=/cluster/data/blastz/HoxD55.q # TARGET - zebrafish (danRer4) SEQ1_DIR=/san/sanvol1/scratch/danRer4/danRer4.2bit SEQ1_LEN=/san/sanvol1/scratch/danRer4/chrom.sizes # 0.5 Mb chunk for target with 5 kb overlap SEQ1_LIMIT=30 SEQ1_CHUNK=500000 SEQ1_LAP=5000 # QUERY - Tetraodon (tetNig1) SEQ2_DIR=/san/sanvol1/scratch/tetNig1/tetNig1.2bit # soft-masked chroms and random scaffolds in 2bit format SEQ2_CTGDIR=/san/sanvol1/scratch/tetNig1/chromsAndScafs/tetNig1ChromsRandomScafs.2bit SEQ2_LIFT=/san/sanvol1/scratch/tetNig1/chromsAndScafs/chromsAndScafs.lft SEQ2_LEN=/san/sanvol1/scratch/tetNig1/chrom.sizes SEQ2_CTGLEN=/san/sanvol1/scratch/tetNig1/chromsAndScafs/chromsAndScafs.sizes # large enough chunk to do whole genome at once SEQ2_CHUNK=1000000000 SEQ2_LAP=0 BASE=/cluster/data/danRer4/bed/blastz.tetNig1.2006-04-29 TMPDIR=/scratch/tmp '_EOF_' # << this line keeps emacs coloring happy chmod +x DEF nohup nice /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \ `pwd`/DEF >& doBlastz.log & # Start: Sat Apr 29 18:10 Finish: Apr 29 22:41 # Stopped after making and merging chains: # netChains: looks like previous stage was not successful # (can't find [danRer4.tetNig1.]all.chain[.gz]). However, this file # is there so start again with net step and continue: cd /cluster/data/danRer4/bed/blastz.tetNig1.2006-04-29 nohup nice /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \ -continue net `pwd`/DEF >& net.log & # Took about 20 minutes to finish. # check coverage compared to danRer3: featureBits danRer4 chainTetNig1Link # 119439512 bases of 1626093931 (7.345%) in intersection featureBits danRer3 chainTetNig1Link # 109205244 bases of 1630323462 (6.698%) in intersection featureBits -chrom=chr1 danRer4 refGene:cds chainTetNig1Link -enrichment # refGene:cds 0.732%, chainTetNig1Link 7.536%, both 0.645%, # cover 88.08%, enrich 11.69x featureBits -chrom=chr1 danRer3 refGene:cds chainTetNig1Link -enrichment # refGene:cds 0.774%, chainTetNig1Link 6.821%, both 0.692%, # cover 89.34%, enrich 13.10x featureBits -chrom=chr1 danRer4 refGene:cds netTetNig1 -enrichment # refGene:cds 0.732%, netTetNig1 55.116%, both 0.705%, # cover 96.33%, enrich 1.75x featureBits -chrom=chr1 danRer3 refGene:cds netTetNig1 -enrichment # refGene:cds 0.774%, netTetNig1 61.540%, both 0.753%, # cover 97.24%, enrich 1.58x # Similar coverage as for tetNig1 chains and nets on zebrafish danRer3. 
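# (For the record, in featureBits -enrichment output "cover" is the fraction
# of the first track covered by the second, e.g. 0.645/0.732 = 88.1% above,
# and "enrich" is that coverage divided by the fraction of the chromosome in
# the second track, e.g. 88.08/7.536 = 11.7x -- i.e. how much more densely the
# chains hit coding bases than average bases.)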
# Do the Blastz swap to get danRer4 alignments on tetNig1 # see also makeTetNig1.doc for featureBits for these alignments on tetNig1. ssh pk cd /cluster/data/danRer4/bed/blastz.tetNig1.2006-04-29 nice /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \ -swap `pwd`/DEF >& doSwap.log & # Took about 22 minutes to run. ########################################################################### # MAKE DOWNLOADABLE SEQUENCE FILES (DONE, 2006-05-01, hartera) # RE-MAKE DOWNLOADS FOR AGP, SOFT AND HARD MASKED CHROMS, REPEATMASKER OUT # BECAUSE THEY DID NOT INCLUDE NA_RANDOM AND UN_RANDOM # (DONE, 2007-03-29, hartera) # NOTE THAT zipAll.csh MUST BE ALTERED ACCORDINGLY IN FUTURE. ssh kkstore01 cd /cluster/data/danRer4 #- Build the .tar.gz and *.gz files for bigZips cat << '_EOF_' > jkStuff/zipAll.csh rm -rf bigZips mkdir bigZips tar cvzf bigZips/chromAgp.tar.gz ?{,?}/chr*.agp tar cvzf bigZips/chromOut.tar.gz ?{,?}/chr*.fa.out tar cvzf bigZips/chromFa.tar.gz ?{,?}/chr*.fa tar cvzf bigZips/chromFaMasked.tar.gz ?{,?}/chr*.fa.masked # soft masked chrNA and chrUn scaffolds tar cvzf bigZips/scaffoldRandomsFa.tar.gz NA_random/scaffoldNA_random.fa \ Un_random/scaffoldUn_random.fa cd bed/simpleRepeat tar cvzf ../../bigZips/chromTrf.tar.gz trfMaskChrom/chr*.bed cd ../.. # get GenBank native mRNAs cd /cluster/data/genbank ./bin/i386/gbGetSeqs -db=danRer4 -native GenBank mrna \ /cluster/data/danRer4/bigZips/mrna.fa # get GenBank xeno mRNAs ./bin/i386/gbGetSeqs -db=danRer4 -xeno GenBank mrna \ /cluster/data/danRer4/bigZips/xenoMrna.fa # get native RefSeq mRNAs ./bin/i386/gbGetSeqs -db=danRer4 -native refseq mrna \ /cluster/data/danRer4/bigZips/refMrna.fa # get native GenBank ESTs ./bin/i386/gbGetSeqs -db=danRer4 -native GenBank est \ /cluster/data/danRer4/bigZips/est.fa # gzip the Genbank sequences and create upstream sequence files for RefSeq. cd /cluster/data/danRer4/bigZips gzip *.fa '_EOF_' # << this line makes emacs coloring happy chmod +x jkStuff/zipAll.csh csh -ef ./jkStuff/zipAll.csh >& zipAll.log & # Took about 35 minutes. #- Look at zipAll.log to make sure all file lists look reasonable. # Make upstream files for zebrafish RefSeq and Copy the .gz files to # hgwdev:/usr/local/apache/... ssh hgwdev cd /cluster/data/danRer4/bigZips foreach I (1000 2000 5000) featureBits danRer4 refGene:upstream:${I} -fa=stdout \ | gzip -c > upstream${I}.fa.gz echo "upstream${I} done" end set gp = /usr/local/apache/htdocs/goldenPath/danRer4 mkdir -p $gp/bigZips cp -p *.gz $gp/bigZips mkdir -p $gp/chromosomes # Add individual chromosomes and file of scaffolds for each random chrom # to chromosomes downloads directory. foreach f (../*/chr*.fa) cp $f $gp/chromosomes end foreach c (NA_random Un_random) cd /cluster/data/danRer4/$c cp scaffold${c}.fa $gp/chromosomes end # create md5sum for bigZips cd $gp/bigZips md5sum *.gz > md5sum.txt # gzip each chrom or scaffolds for chrom separately in chromosomes dir cd $gp/chromosomes foreach f (*.fa) gzip $f end # create md5sum for chromosomes md5sum *.gz > md5sum.txt # Take a look at bigZips/* and chromosomes/* # copy README.txt's from danRer3 and update # RE-MAKE DOWNLOADS FOR AGP, SOFT AND HARD MASKED CHROMS, REPEATMASKER OUT # BECAUSE THEY DID NOT INCLUDE NA_RANDOM AND UN_RANDOM # (DONE, 2007-03-29, hartera) # NOTE THAT zipAll.csh MUST BE ALTERED ACCORDINGLY IN FUTURE. 
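# (Note on the globs used in zip2.csh below: "?{,?}{,_random}" expands to
# directory names of one or two characters optionally followed by "_random",
# so it picks up 1-25, M, NA_random and Un_random -- the randoms being what
# the original chromAgp/chromOut/chromFa/chromFaMasked tarballs missed.)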
ssh kkstore04 cd /cluster/data/danRer4 #- Rebuild the .tar.gz (agp, files for bigZips cat << '_EOF_' > jkStuff/zip2.csh rm -r bigZips/chromAgp.tar.gz rm -r bigZips/chromOut.tar.gz rm -r bigZips/chromFa.tar.gz rm -r bigZips/chromFaMasked.tar.gz tar cvzf bigZips/chromAgp.tar.gz ?{,?}{,_random}/chr*.agp tar cvzf bigZips/chromOut.tar.gz ?{,?}{,_random}/chr*.fa.out tar cvzf bigZips/chromFa.tar.gz ?{,?}{,_random}/chr*.fa tar cvzf bigZips/chromFaMasked.tar.gz ?{,?}{,_random}/chr*.fa.masked '_EOF_' # << this line makes emacs coloring happy chmod +x jkStuff/zip2.csh csh -ef ./jkStuff/zip2.csh >& zip2.log & # Took about 10 minutes # Links to these files already exist from the # /usr/local/apache/htdocs/goldenpath/danRer4/bigZips directory. # Recreate the md5sum there to include these new files. cd /usr/local/apache/htdocs/goldenpath/danRer4/bigZips rm md5sum.txt md5sum *.gz > md5sum.txt ########################################################################### # HUMAN (hg18) PROTEINS TRACK FOR hg18 (DONE, 2006-04-28 - 2006-05-03, hartera) ssh kkstore01 bash # if not using bash shell already # make Blast database for non-random chrom sequences mkdir -p /cluster/data/danRer4/blastDb cd /cluster/data/danRer4/blastDb cut -f 1 ../chrom.sizes | sed "s/chr//" | sed "/NA_random/d" \ | sed "/Un_random/d" > chrom.list for i in `cat chrom.list`; do ls -1 ../$i/*/*.fa . ; done | sed -n "/.*_.*_.*_.*/p" > list ln -s `cat list` . for i in *.fa do /projects/compbio/bin/i686/formatdb -i $i -p F done rm *.log *.fa list cd /cluster/data/danRer4 for i in `cat blastDb/chrom.list`; do cat $i/chr*/*.lft ; done > jkStuff/subChr.lft rm blastDb/chrom.list # Now make Blast database for random scaffolds sequences. mkdir /cluster/data/danRer4/scaffoldBlastDb cd /cluster/data/danRer4/scaffoldBlastDb # Take file of all scaffolds for NA_random and Un_random and cat together cat ../NA_random/scaffoldNA_random.fa ../Un_random/scaffoldUn_random.fa \ > allRandomScafs.fasta grep '>' allRandomScafs.fasta | wc -l # 2966 faSplit sequence allRandomScafs.fasta 500 scaf rm allRandomScafs.fasta for i in *.fa do /projects/compbio/bin/i686/formatdb -i $i -p F done rm *.log *.fa # combine databases for chroms and random chroms mkdir -p /san/sanvol1/scratch/danRer4/comboBlastDb cd /cluster/data/danRer4/blastDb for i in nhr nin nsq; do cp *.$i /san/sanvol1/scratch/danRer4/comboBlastDb; done cd /cluster/data/danRer4/scaffoldBlastDb for i in nhr nin nsq; do cp *.$i /san/sanvol1/scratch/danRer4/comboBlastDb; done mkdir -p /cluster/data/danRer4/bed/tblastn.hg18KG cd /cluster/data/danRer4/bed/tblastn.hg18KG echo /san/sanvol1/scratch/danRer4/comboBlastDb/*.nsq \ | xargs ls -S | sed "s/\.nsq//" > query.lst wc -l query.lst # 4377 query.lst # we want around 250000 jobs calc `wc /cluster/data/hg18/bed/blat.hg18KG/hg18KG.psl | awk "{print \\\$1}"`/\(250000/`wc query.lst | awk "{print \\\$1}"`\) # 36727/(250000/4377) = 643.016316 mkdir -p /cluster/bluearc/danRer4/bed/tblastn.hg18KG/kgfa split -l 643 /cluster/data/hg18/bed/blat.hg18KG/hg18KG.psl \ /cluster/bluearc/danRer4/bed/tblastn.hg18KG/kgfa/kg ln -s /cluster/bluearc/danRer4/bed/tblastn.hg18KG/kgfa kgfa cd kgfa for i in *; do nice /cluster/home/braney/bin/x86_64/pslxToFa $i $i.fa; rm $i; done cd .. 
ls -1S kgfa/*.fa > kg.lst mkdir -p /cluster/bluearc/danRer4/bed/tblastn.hg18KG/blastOut ln -s /cluster/bluearc/danRer4/bed/tblastn.hg18KG/blastOut for i in `cat kg.lst`; do mkdir blastOut/`basename $i .fa`; done exit # back to tcsh cd /cluster/data/danRer4/bed/tblastn.hg18KG cat << '_EOF_' > blastGsub #LOOP blastSome $(path1) {check in line $(path2)} {check out exists blastOut/$(root2)/q.$(root1).psl } #ENDLOOP '_EOF_' cat << '_EOF_' > blastSome #!/bin/sh BLASTMAT=/cluster/bluearc/blast229/data export BLASTMAT g=`basename $2` f=/tmp/`basename $3`.$g for eVal in 0.01 0.001 0.0001 0.00001 0.000001 1E-09 1E-11 do if /cluster/bluearc/blast229/blastall -M BLOSUM80 -m 0 -F no -e $eVal -p tblastn -d $1 -i $2 -o $f.8 then mv $f.8 $f.1 break; fi done if test -f $f.1 then if /cluster/bin/i386/blastToPsl $f.1 $f.2 then liftUp -nosort -type=".psl" -nohead $f.3 /cluster/data/danRer4/jkStuff/subChr.lft carry $f.2 liftUp -nosort -type=".psl" -nohead $f.4 /cluster/data/danRer4/jkStuff/liftAll.lft carry $f.3 liftUp -nosort -type=".psl" -pslQ -nohead $3.tmp /cluster/data/hg18/bed/blat.hg18KG/protein.lft warn $f.4 if pslCheck -prot $3.tmp then mv $3.tmp $3 rm -f $f.1 $f.2 $f.3 $f.4 fi exit 0 fi fi rm -f $f.1 $f.2 $3.tmp $f.8 $f.3 $f.4 exit 1 '_EOF_' # << happy emacs chmod +x blastSome gensub2 query.lst kg.lst blastGsub blastSpec # then run the Blast cluster jobs ssh kk cd /cluster/data/danRer4/bed/tblastn.hg18KG para create blastSpec para try, check, push, check etc. # pushed 100,000 jobs at a time so need to do para push again later para time # Completed: 253866 of 253866 jobs # CPU time in finished jobs: 52410110s 873501.83m 14558.36h 606.60d 1.662 y # IO & Wait Time: 5508786s 91813.10m 1530.22h 63.76d 0.175 y # Average job time: 228s 3.80m 0.06h 0.00d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 2162s 36.03m 0.60h 0.03d # Submission to last job: 147825s 2463.75m 41.06h 1.71d # Took a while as had to repush some crashed jobs. ssh kkstore01 cd /cluster/data/danRer4/bed/tblastn.hg18KG tcsh mkdir chainRun cd chainRun cat << '_EOF_' > chainGsub #LOOP chainOne $(path1) #ENDLOOP '_EOF_' cat << '_EOF_' > chainOne (cd $1; cat q.*.psl | simpleChain -prot -outPsl -maxGap=75000 stdin /cluster/bluearc/danRer4/bed/tblastn.hg18KG/blastOut/c.`basename $1`.psl) '_EOF_' chmod +x chainOne ls -1dS \ /cluster/bluearc/danRer4/bed/tblastn.hg18KG/blastOut/kg?? > chain.lst gensub2 chain.lst single chainGsub chainSpec # do the cluster run for chaining ssh kk cd /cluster/data/danRer4/bed/tblastn.hg18KG/chainRun para create chainSpec para try, check, push, check etc. # Completed: 58 of 58 jobs # CPU time in finished jobs: 759034s 12650.56m 210.84h 8.79d 0.024 y # IO & Wait Time: 217724s 3628.74m 60.48h 2.52d 0.007 y # Average job time: 16841s 280.68m 4.68h 0.19d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 208828s 3480.47m 58.01h 2.42d # Submission to last job: 208891s 3481.52m 58.03h 2.42d ssh kkstore01 cd /cluster/data/danRer4/bed/tblastn.hg18KG/blastOut bash # if using another shell for i in kg?? do cat c.$i.psl | awk "(\$13 - \$12)/\$11 > 0.6 {print}" > c60.$i.psl sort -rn c60.$i.psl | pslUniq stdin u.$i.psl awk "((\$1 / \$11) ) > 0.60 { print }" c60.$i.psl > m60.$i.psl echo $i done liftUp -nohead -type=.psl stdout \ /cluster/data/danRer4/jkStuff/liftAll.lft carry u.*.psl m60* | \ sort -T /tmp -k 14,14 -k 16,16n -k 17,17n | uniq \ > /cluster/data/danRer4/bed/tblastn.hg18KG/blastHg18KG.psl pslCheck blastHg18KG.psl # this is ok. 
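# (For the record: in the awk filters above, psl field 11 is qSize, 12 is
# qStart, 13 is qEnd and 1 is matches, so the c60 files keep chained
# alignments spanning more than 60% of the query protein and the m60 files
# additionally require matches/qSize > 0.60.)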
# load table ssh hgwdev cd /cluster/data/danRer4/bed/tblastn.hg18KG hgLoadPsl danRer4 blastHg18KG.psl # check coverage featureBits danRer4 blastHg18KG # 21159392 bases of 1626093931 (1.301%) in intersection featureBits danRer3 blastHg17KG # 21063005 bases of 1630323462 (1.292%) in intersection featureBits -chrom=chr1 danRer4 refGene:cds blastHg18KG -enrichment # refGene:cds 0.732%, blastHg18KG 1.333%, both 0.428%, cover 58.43%, # enrich 43.83x featureBits -chrom=chr1 danRer3 refGene:cds blastHg17KG -enrichment # refGene:cds 0.774%, blastHg17KG 1.370%, both 0.450%, cover 58.05%, # enrich 42.38x # Similar coverage compared to refGene CDS as for hg17 proteins on danRer3. # back to kkstore04 to clean up ssh kkstore04 rm -rf /cluster/data/danRer4/bed/tblastn.hg18KG/blastOut rm -rf /cluster/bluearc/danRer4/bed/tblastn.hg18KG/blastOut # add trackDb.ra entry and html to ~/kent/src/hg/makeDb/trackDb/trackDb.ra # also added the blastHg18KG.html here. # blastKGPep04 and blastKGRef04 tables required on hg18 - these have # been created - see makeHg18.doc. update of hgc.c, hgTrackUi.c and # hgTracks.c was required - done by Brian. ########################################################################### # MULTIZ7WAY ALIGNMENTS FOR CONSERVATION TRACK # (DONE, 2006-05-04 - 2006-05-10, hartera) # RE-MAKE WITH DANRER4 RANDOMS FOR MM8 AND ADDED FRAMES TABLE AND # MULTIZ7WAY DOWNLOADS (DONE, 2006-05-28 - 2005-05-29, hartera) # for tetNig1, fr1, xenTro2, monDom4, mm8 and hg18. ssh kkstore04 mkdir /cluster/data/danRer4/bed/multiz7way.2006-05-28 cd /cluster/data/danRer4/bed cd /cluster/data/danRer4/bed/multiz7way.2006-05-28 # copy MAFs to a cluster-friendly server rm -r /san/sanvol1/scratch/danRer4/mafNet mkdir /san/sanvol1/scratch/danRer4/mafNet foreach s (tetNig1 fr1 xenTro2 monDom4 mm8 hg18) echo $s rsync -av /cluster/data/danRer4/bed/blastz.$s/mafNet/* \ /san/sanvol1/scratch/danRer4/mafNet/$s/ end # prune the hg17 17way tree to just these 7 and update db names: /cluster/bin/phast/tree_doctor \ --prune-all-but=mouse_mm8,human_hg18,monodelphis_monDom4,xenopus_xenTro1,tetraodon_tetNig1,fugu_fr1,zebrafish_danRer3 \ --rename="xenopus_xenTro1 -> xenopus_xenTro2 ; zebrafish_danRer3 -> zebrafish_danRer4" \ /cluster/data/hg18/bed/multiz17way/17way.nh > 7way.nh # carefully edit so that danRer4 is first. copy first to new file cp 7way.nh 7way_zfishFirst.nh # /cluster/bin/phast/draw_tree 7way_zfishFirst.nh > 7way.ps # also made the ps file for the 7way.nh and compared to make sure # that the tree with zebrafish at the top looks correct. 
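# (Not part of the original build: a simple presence check that the pruned,
# renamed tree still names all seven assemblies; each grep should print 1
# since the tree file is a single line.)
foreach db (danRer4 fr1 tetNig1 xenTro2 monDom4 mm8 hg18)
  grep -c $db 7way_zfishFirst.nh
end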
/cluster/bin/phast/all_dists 7way_zfishFirst.nh > 7way.distances grep danRer4 7way.distances | sort -k3,3n | \ awk '{printf ("%.4f\t%s\n", $3, $2)}' > distances.txt cat distances.txt # 1.4749 tetraodon_tetNig1 # 1.5154 fugu_fr1 # 1.7480 human_hg18 # 1.7782 monodelphis_monDom4 # 1.8771 xenopus_xenTro2 # 2.1058 mouse_mm8 # the order in the browser display will be by tree topology, # not by distance, so they will be: # danRer4 # 1.5154 fugu_fr1 # 1.4749 tetraodon_tetNig1 # 1.8771 xenopus_xenTro2 # 1.7782 monodelphis_monDom4 # 2.1058 mouse_mm8 # 1.7480 human_hg18 # create species list and stripped down tree for autoMZ sed -e 's/[a-z][a-z]*_//g; s/:[0-9\.][0-9\.]*//g; s/;//' \ 7way_zfishFirst.nh > tree-commas.nh sed -e 's/ //g; s/,/ /g' tree-commas.nh > tree.nh sed -e 's/[()]//g; s/,/ /g' tree.nh > species.lst ssh pk cd /cluster/data/danRer4/bed/multiz7way.2006-05-28 mkdir maf run cd run # stash binaries mkdir penn cp -p /cluster/bin/penn/multiz.v11.x86_64/multiz-tba/multiz penn cp -p /cluster/bin/penn/multiz.v11.x86_64/multiz-tba/maf_project penn cp -p /cluster/bin/penn/multiz.v11.x86_64/multiz-tba/autoMZ penn cat > autoMultiz.csh << 'EOF' #!/bin/csh -ef set db = danRer4 set c = $1 set maf = $2 set run = `pwd` set tmp = /scratch/tmp/$db/multiz.$c set pairs = /san/sanvol1/scratch/$db/mafNet rm -fr $tmp mkdir -p $tmp cp ../{tree.nh,species.lst} $tmp pushd $tmp foreach s (`cat species.lst`) set in = $pairs/$s/$c.maf set out = $db.$s.sing.maf if ($s == $db) then continue endif if (-e $in.gz) then zcat $in.gz > $out else if (-e $in) then cp $in $out else echo "##maf version=1 scoring=autoMZ" > $out endif end set path = ($run/penn $path); rehash $run/penn/autoMZ + T=$tmp E=$db "`cat tree.nh`" $db.*.sing.maf $c.maf popd cp $tmp/$c.maf $maf rm -fr $tmp 'EOF' # << emacs chmod +x autoMultiz.csh cat << 'EOF' > spec #LOOP ./autoMultiz.csh $(root1) {check out line+ /cluster/data/danRer4/bed/multiz7way.2006-05-28/maf/$(root1).maf} #ENDLOOP 'EOF' # << emacs awk '{print $1}' /cluster/data/danRer4/chrom.sizes > chrom.lst gensub2 chrom.lst single spec jobList para create jobList para try, check, push, check etc. ... para time # Completed: 28 of 28 jobs # CPU time in finished jobs: 7022s 117.03m 1.95h 0.08d 0.000 y IO & Wait Time: 142s 2.37m 0.04h 0.00d 0.000 y Average job time: 256s 4.26m 0.07h 0.00d Longest running job: 0s 0.00m 0.00h 0.00d Longest finished job: 368s 6.13m 0.10h 0.00d Submission to last job: 705s 11.75m 0.20h 0.01d # Make .jpg for tree and install in htdocs/images/phylo/... don't forget # to request a push of that file. The treeImage setting in trackDb.ra # is phylo/danRer4_7way.jpg (relative to htdocs/images). ssh hgwdev cd /cluster/data/danRer4/bed/multiz7way.2006-05-28 cat << '_EOF_' > species7.nh ((zebrafish,(Fugu,Tetraodon)),(X. tropicalis,(opossum,(mouse,human)))) '_EOF_' /cluster/bin/phast/draw_tree species7.nh > species7way.ps # ask Bob to resize image for Browser track description page and convert # to JPEG and rename as danRer4_7way.jpg # Build maf annotation and load dataabase ssh kolossus mkdir /cluster/data/danRer4/bed/multiz7way.2006-05-28/anno cd /cluster/data/danRer4/bed/multiz7way.2006-05-28/anno mkdir maf run cd run rm -f sizes nBeds foreach db (`cat /cluster/data/danRer4/bed/multiz7way.2006-05-28/species.lst`) ln -s /cluster/data/$db/chrom.sizes $db.len if (! 
-e /cluster/data/$db/$db.N.bed) then twoBitInfo -nBed /cluster/data/$db/$db.{2bit,N.bed} endif ln -s /cluster/data/$db/$db.N.bed $db.bed echo $db.bed >> nBeds echo $db.len >> sizes end echo date > jobs.csh # do smaller jobs first: foreach f (`ls -1rS ../../maf/*.maf`) echo nice mafAddIRows -nBeds=nBeds -sizes=sizes $f \ /cluster/data/danRer4/danRer4.2bit ../maf/`basename $f` \ >> jobs.csh echo "echo $f" >> jobs.csh end echo date >> jobs.csh csh -efx jobs.csh >&! jobs.log & tail -f jobs.log # Took 27 minutes to run. # Load anno/maf ssh hgwdev cd /cluster/data/danRer4/bed/multiz7way.2006-05-28/anno/maf mkdir -p /gbdb/danRer4/multiz7way/anno/maf ln -s /cluster/data/danRer4/bed/multiz7way.2006-05-28/anno/maf/*.maf \ /gbdb/danRer4/multiz7way/anno/maf # delete old files from extFile table hgsql -e 'delete from extFile where path like "%multiz7way/anno/maf%";' \ danRer4 cat > loadMaf.csh << 'EOF' date nice hgLoadMaf -pathPrefix=/gbdb/danRer4/multiz7way/anno/maf danRer4 multiz7way date 'EOF' # << emacs csh -efx loadMaf.csh >&! loadMaf.log & tail -f loadMaf.log # Took about 1 minute. # Do the computation-intensive part of hgLoadMafSummary on a workhorse # machine and then load on hgwdev: ssh kkr7u00 cd /cluster/data/danRer4/bed/multiz7way.2006-05-28/anno/maf cat *.maf \ | nice hgLoadMafSummary danRer4 -minSize=30000 -mergeGap=1500 \ -maxSize=200000 -test multiz7waySummary stdin # Created 820403 summary blocks from 4245668 components and # 2120803 mafs from stdin ssh hgwdev cd /cluster/data/danRer4/bed/multiz7way.2006-05-28/anno/maf sed -e 's/mafSummary/multiz7waySummary/' ~/kent/src/hg/lib/mafSummary.sql \ > /tmp/multiz7waySummary.sql time nice hgLoadSqlTab danRer4 multiz7waySummary \ /tmp/multiz7waySummary.sql multiz7waySummary.tab # 0.000u 0.000s 2:05.26 0.0% 0+0k 0+0io 209pf+0w rm *.tab /tmp/multiz7waySummary.sql # zip mafs: ssh kkstore04 cd /cluster/data/danRer4/bed/multiz7way.2006-05-28/maf cat > zipMafs.csh << 'EOF' date foreach f (chr*.maf) set c = $f:r echo $c nice gzip -c $f > $c.maf.gz end date 'EOF' time csh -efx zipMafs.csh >&! zip.log # 219.706u 1.939s 3:41.75 99.9% 0+0k 0+0io 0pf+0w rm *.maf # add Frames table: mkdir /cluster/data/danRer4/bed/multiz7way.2006-05-28/frames cd /cluster/data/danRer4/bed/multiz7way.2006-05-28/frames # The following is adapted from MarkD's Makefile used for mm7... # and used in makeRn4.doc. #------------------------------------------------------------------------ # get the genes for all genomes # using mrna for danRer4 # using knownGene for mm8 hg18 # using mgcGenes for xenTro2 # using ensGene for fr1 # no genes for monDom4 and tetNig1 # targetDb = danRer4 # queryDbs = mm8 hg18 xenTro2 fr1 (to build frames for) # genePreds; (must keep only the first 10 columns for knownGene) # mRNAs with CDS. 
single select to get cds+psl, then split that up and # create genePred # using mrna table as genes: danRer4 mkdir genes foreach queryDb (danRer4) set tmpExt = `mktemp temp.XXXXXX` set tmpMrnaCds = ${queryDb}.mrna-cds.${tmpExt} set tmpMrna = ${queryDb}.mrna.${tmpExt} set tmpCds = ${queryDb}.cds.${tmpExt} echo $queryDb hgsql -N -e 'select all_mrna.qName,cds.name,all_mrna.* \ from all_mrna,gbCdnaInfo,cds \ where (all_mrna.qName = gbCdnaInfo.acc) and \ (gbCdnaInfo.cds != 0) and (gbCdnaInfo.cds = cds.id)' \ ${queryDb} > ${tmpMrnaCds} cut -f 1-2 ${tmpMrnaCds} > ${tmpCds} cut -f 4-100 ${tmpMrnaCds} > ${tmpMrna} mrnaToGene -cdsFile=${tmpCds} -smallInsertSize=8 -quiet ${tmpMrna} \ stdout \ | genePredSingleCover stdin stdout | gzip -2c \ > /scratch/tmp/$queryDb.tmp.gz rm ${tmpMrnaCds} ${tmpMrna} ${tmpCds} mv /scratch/tmp/$queryDb.tmp.gz genes/$queryDb.gp.gz rm -f $tmpExt end # using knownGene for mm8 hg18 # using mgcGenes for xenTro2 # using enesGene for fr1 foreach queryDb (mm8 hg18 xenTro2 fr1) if ($queryDb == "xenTro2") then set geneTbl = mgcGenes else if ($queryDb == "fr1") then set geneTbl = ensGene else set geneTbl = knownGene endif hgsql -N -e "select * from $geneTbl" ${queryDb} | cut -f 1-10 \ | genePredSingleCover stdin stdout | gzip -2c \ > /scratch/tmp/$queryDb.tmp.gz mv /scratch/tmp/$queryDb.tmp.gz genes/$queryDb.gp.gz rm -f $tmpExt end #------------------------------------------------------------------------ # create frames set clusterDir = /cluster/bluearc/danRer4/multiz7wayFrames set multizDir = /cluster/data/danRer4/bed/multiz7way.2006-05-28 set mafDir = $multizDir/maf set geneDir = $multizDir/frames/genes set clusterMafDir = ${clusterDir}/maf set clusterGeneDir = ${clusterDir}/genes set clusterFramesDir = ${clusterDir}/mafFrames.kki # copy mafs to cluster storage mkdir $clusterDir ssh -x kkstore04 "rsync -av $mafDir/*.maf.gz $clusterMafDir/" # copy genes to cluster storage ssh -x kkstore04 "rsync -av $geneDir/*.gp.gz $clusterGeneDir/" # run cluster jobs set tmpExt = `mktemp temp.XXXXXX` set paraDir = $multizDir/frames/para.${tmpExt} mkdir mafFrames $paraDir rm -f $paraDir/jobList mkdir ${clusterFramesDir} foreach queryDb (`cat /cluster/data/danRer4/bed/multiz7way.2006-05-28/species.lst`) mkdir ${clusterFramesDir}/${queryDb} foreach c (`awk '{print $1;}' /cluster/data/danRer4/chrom.sizes`) if (-e ${clusterGeneDir}/${queryDb}.gp.gz) then echo /cluster/bin/scripts/mkMafFrames.pl ${queryDb} danRer4 \ ${clusterGeneDir}/${queryDb}.gp.gz ${clusterMafDir}/$c.maf.gz \ ${clusterFramesDir}/${queryDb}/$c.mafFrames \ >> $paraDir/jobList endif end end rm -f $tmpExt ssh -x kki "cd ${paraDir} && para make jobList && para time" # Completed: 140 of 140 jobs # CPU time in finished jobs: 255s 4.25m 0.07h 0.00d 0.000 y # IO & Wait Time: 360s 6.00m 0.10h 0.00d 0.000 y # Average job time: 4s 0.07m 0.00h 0.00d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 8s 0.13m 0.00h 0.00d # Submission to last job: 55s 0.92m 0.02h 0.00d # combine results from cluster foreach queryDb (`cat ../species.lst`) echo $queryDb ssh -x kolossus "cat ${clusterFramesDir}/${queryDb}/*.mafFrames | gzip -2c > ${multizDir}/frames/mafFrames/${queryDb}.mafFrames.gz" end #------------------------------------------------------------------------ # load the database hgLoadMafFrames danRer4 multiz7wayFrames mafFrames/*.mafFrames.gz #------------------------------------------------------------------------ # clean up rm -rf ${clusterDir} ### # rebuild frames to get bug fix, using 1-pass maf methodology # (2006-06-09 markd) 
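# (Note, not in the original log: the commands below replace the
# per-chromosome mkMafFrames.pl cluster run above with a single
# genePredToMafFrames pass over all the multiz mafs, pairing each assembly in
# species.lst with its genes/<db>.gp.gz file; monDom4 and tetNig1 are left
# out because, as noted above, they have no gene set to build frames from.)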
ssh kkstore04 cd /cluster/data/danRer4/bed/multiz7way/frames mv mafFrames/ mafFrames.old nice tcsh # easy way to get process niced (zcat ../maf/*.maf.gz | time genePredToMafFrames danRer4 stdin stdout danRer4 genes/danRer4.gp.gz fr1 genes/fr1.gp.gz hg18 genes/hg18.gp.gz mm8 genes/mm8.gp.gz xenTro2 genes/xenTro2.gp.gz | gzip >multiz7way.mafFrames.gz)>&log& ssh hgwdev cd /cluster/data/danRer4/bed/multiz7way/frames hgLoadMafFrames danRer4 multiz7wayFrames multiz7way.mafFrames.gz >&log& # end of multiz7way frames and load cd /cluster/data/danRer4/bed ln -s multiz7way.2006-05-28 /cluster/data/danRer4/bed/multiz7way # create and add the tree image for the description page # Make .jpg for tree and install in htdocs/images/phylo/... don't forget # to request a push of that file. The treeImage setting in trackDb.ra # is phylo/danRer4_7way.jpg (relative to htdocs/images). ssh hgwdev cd /cluster/data/danRer4/bed/multiz7way.2006-05-28 cat << '_EOF_' > species7.nh ((zebrafish,(Fugu,Tetraodon)),(X. tropicalis,(opossum,(mouse,human)))) '_EOF_' /cluster/bin/phast/draw_tree species7.nh > species7way.ps # ask Bob to resize image for Browser track description page and convert # to JPEG and rename as danRer4_7way.jpg ln -s /cluster/data/danRer4/bed/multiz7way.2006-05-28/danRer4_7way.jpg \ /usr/local/apache/htdocs/images/phylo/danRer4_7way.jpg # change permissions for display if not already readable to all chmod +r /usr/local/apache/htdocs/images/phylo/danRer4_7way.jpg # check for all.joiner entry for 7-way - it is there already. # add html and trackDb.ra entry for danRer4: # track multiz7way # shortLabel Conservation # longLabel Vertebrate Multiz Alignment & Conservation # group compGeno # priority 104 # visibility pack # color 0, 10, 100 # altColor 0,90,10 # type wigMaf 0.0 1.0 # maxHeightPixels 100:40:11 # wiggle phastCons7way # pairwiseHeight 12 # spanList 1 # yLineOnOff Off # autoScale Off # windowingFunction mean # summary multiz7waySummary # frames multiz7wayFrames # irows on # speciesGroups vertebrate mammal # sGroup_vertebrate fr1 tetNig1 xenTro2 # sGroup_mammal monDom4 mm8 hg18 # treeImage phylo/danRer4_7way.jpg ########################################################################### # MAF DOWNLOADS FOR MULTIZ7WAY (DONE, 2006-05-29, hartera) # GZIPPED UPSTREAM FILES AND ADDED TO DOWNLOADS AND RE-MADE md5sum.txt # (DONE, 2006-06-02, hartera) ssh hgwdev cd /cluster/data/danRer4/bed/multiz7way.2006-05-28 mkdir mafDownloads cd mafDownloads # upstream mafs cat > mafFrags.csh << 'EOF' date foreach i (1000 2000 5000) echo "making upstream$i.maf" nice featureBits danRer4 refGene:upstream:$i -fa=/dev/null -bed=up.bad awk -F '\t' '{printf("%s\t%s\t%s\t%s\t%s\t%s\n", $1, $2, $3, substr($4, 0, 9), 0, $5)}' up.bad > up.bed rm up.bad nice mafFrags danRer4 multiz7way up.bed upstream$i.maf \ -orgs=../species.lst rm up.bed end date 'EOF' time csh mafFrags.csh >&! 
mafFrags.log & tail -f mafFrags.log # 57.823u 105.238s 4:13.15 64.4% 0+0k 0+0io 2pf+0w # add maf downloads for annotated mafs ssh kkstore04 cd /cluster/data/danRer4/bed/multiz7way.2006-05-28/mafDownloads cat > downloads.csh << 'EOF' date foreach f (../anno/maf/chr*.maf) set c = $f:t:r echo $c nice gzip -c $f > $c.maf.gz end md5sum *.gz > md5sum.txt date 'EOF' # 446.734u 5.629s 7:38.09 98.7% 0+0k 0+0io 2pf+0w ssh hgwdev set dir = /usr/local/apache/htdocs/goldenPath/danRer4/multiz7way mkdir $dir ln -s \ /cluster/data/danRer4/bed/multiz7way.2006-05-28/mafDownloads/{*.gz,md5sum.txt} \ $dir cp /usr/local/apache/htdocs/goldenPath/danRer3/multiz5way/README.txt $dir # edit README.txt # gzip the upstream maf downloads and remake md5sum.txt # (2006-06-02, hartera) ssh kkstore04 cd /cluster/data/danRer4/bed/multiz7way.2006-05-28/mafDownloads foreach f (upstream*.maf) nice gzip -c $f > $f.maf.gz end rm md5sum.txt md5sum *.gz > md5sum.txt ssh hgwdev set dir = /usr/local/apache/htdocs/goldenPath/danRer4/multiz7way rm $dir/md5sum.txt ln -s \ /cluster/data/danRer4/bed/multiz7way.2006-05-28/mafDownloads/{upstream*.gz,md5sum.txt} $dir ########################################################################### # PHYLO-HMM (PHASTCONS) CONSERVATION TRACK FOR 7-WAY ALIGNMENT # (DONE, 2006-05-17 - 2006-05-24, hartera) # REMAKE CONSERVATION TRACK USING MULTIZ 7-WAY INCLUDING DANRER4 RANDOM CHROMS # FOR MM8 ALIGNMENTS (DONE, 2006-05-29, hartera) ssh kkstore04 # Need unzipped maf files for this. cd /cluster/data/danRer4/bed/multiz7way.2006-05-28/maf foreach f (*.maf.gz) echo $f gunzip -c $f > $f:r end mkdir /cluster/data/danRer4/bed/multiz7way.2006-05-28/phastCons cd /cluster/data/danRer4/bed/multiz7way.2006-05-28/phastCons # create a starting-tree.mod based on chr14 (92 Mb) # chr14 is the largest chrom apart from chrNA_random /cluster/bin/phast/$MACHTYPE/msa_split ../maf/chr14.maf \ --refseq ../../../14/chr14.fa --in-format MAF \ --windows 100000000,1000 --out-format SS \ --between-blocks 5000 --out-root s1 /cluster/bin/phast/$MACHTYPE/phyloFit -i SS s1.*.ss \ --tree "`cat ../tree-commas.nh`" \ --out-root starting-tree # took less than a minute rm s1.*ss # Get genome-wide average GC content (for all species together, # not just the reference genome). If you have a globally # estimated tree model, as above, you can get this from the # BACKGROUND line in the .mod file. E.g., # ALPHABET: A C G T # ... # BACKGROUND: 0.305239 0.194225 0.194292 0.306244 # add up the C and G: grep BACKGROUND starting-tree.mod | awk '{printf "%0.3f\n", $3 + $4;}' # 0.389 is the GC content. This is used in the -gc argument below. # If you do *not* have a global tree model and you do not know your # GC content, you can get it directly from the MAFs with a command # like: /cluster/bin/phast/$MACHTYPE/msa_view \ --aggregate danRer4,tetNig1,fr1,xenTro2,monDom4,mm8,hg18 -i MAF \ -S /cluster/data/danRer4/bed/multiz7way/maf/chr*.maf > maf_summary.txt # This gives a GC content of 0.426 so use this as it is from mafs for # the whole genome. 
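# (For reference: the 0.426 figure is the value passed as --gc to the
# phastCons parameter-estimation jobs below; if the alignments change it can
# be recomputed from the aggregate base frequencies as above, by summing the
# C and G fractions.)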
# break up the genome-wide MAFs into pieces on the san filesystem ssh pk set WINDOWS=/san/sanvol1/scratch/danRer4/multiz7way.2006-05-28/phastCons/ss mkdir -p $WINDOWS cd $WINDOWS cat << 'EOF' > doSplit.csh #!/bin/csh -ef set MAFS = /cluster/data/danRer4/bed/multiz7way.2006-05-28/maf set WINDOWS=/san/sanvol1/scratch/danRer4/multiz7way.2006-05-28/phastCons/ss cd $WINDOWS set c = $1 echo $c rm -fr $c mkdir $c set N = `echo $c | sed -e 's/chr//'` /cluster/bin/phast/$MACHTYPE/msa_split $MAFS/$c.maf -i MAF \ -M /cluster/data/danRer4/$N/$c.fa \ -o SS -w 10000000,0 -I 1000 -B 5000 -r $c/$c echo "Done" >> $c.done 'EOF' # << emacs chmod +x doSplit.csh rm -f jobList foreach c (`cat /cluster/data/danRer4/chrom.lst`) echo "doSplit.csh chr${c} {check out line+ $WINDOWS/chr$c.done}" >> jobList end para create jobList para push, check etc. para time # Completed: 28 of 28 jobs # CPU time in finished jobs: 831s 13.86m 0.23h 0.01d 0.000 y # IO & Wait Time: 634s 10.56m 0.18h 0.01d 0.000 y # Average job time: 52s 0.87m 0.01h 0.00d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 118s 1.97m 0.03h 0.00d # Submission to last job: 118s 1.97m 0.03h 0.00d # Create a random list of 50 1 mb regions (do not use chrNA and chrUn) ls -1l chr*/chr*.ss | grep -v NA | grep -v Un | \ awk '$5 > 4000000 {print $9;}' | randomLines stdin 50 ../randomSs.list # Set up parasol directory to calculate trees on these 50 regions ssh pk set dir = /san/sanvol1/scratch/danRer4/multiz7way.2006-05-28/phastCons mkdir -p $dir cd $dir # now set up cluster job to estimate model parameters. Parameters # will be estimated separately for each alignment fragment then # will be combined across fragments. Tuning this loop should come # back to here to recalculate. Tuning target-coverage and expected-length. # Create little script that calls phastCons with right arguments cat > makeTree.csh << 'EOF' #!/bin/csh -fe set C = $1:h set treeRun = $2 set cov = $3 set len = $4 set dir = /san/sanvol1/scratch/danRer4/multiz7way.2006-05-28/phastCons mkdir -p $dir/$treeRun/log/${C} $dir/$treeRun/tree/${C} /cluster/bin/phast/x86_64/phastCons $dir/ss/$1 \ /cluster/data/danRer4/bed/multiz7way.2006-05-28/phastCons/starting-tree.mod \ --gc 0.426 --nrates 1,1 --no-post-probs --ignore-missing \ --expected-length $len --target-coverage $cov \ --quiet --log $dir/$treeRun/log/$1 --estimate-trees $dir/$treeRun/tree/$1 'EOF' # << emacs chmod a+x makeTree.csh # Make sure that the correct GC content is substituted in here. Notice # the target coverage of 0.17. Here we are going to aim # for 65% coverage of coding regions by conserved elements. # Create gensub file # need to add cov and len parameters cat > template << '_EOF_' #LOOP makeTree.csh $(path1) $(path2) #ENDLOOP '_EOF_' # happy emacs # Make cluster job and run it to try out a few parameters close # to those used for danRer3 and danRer2 phastCons runs. echo "treeRun1 0.17 12" > tree.lst echo "treeRun2 0.32 18" >> tree.lst echo "treeRun3 0.32 20" >> tree.lst echo "treeRun4 0.35 18" >> tree.lst gensub2 randomSs.list tree.lst template jobList para create jobList para try,check,push,check etc. # para time # Completed: 200 of 200 jobs # CPU time in finished jobs: 68652s 1144.20m 19.07h 0.79d 0.002 y # IO & Wait Time: 2521s 42.02m 0.70h 0.03d 0.000 y # Average job time: 356s 5.93m 0.10h 0.00d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 629s 10.48m 0.17h 0.01d # Submission to last job: 2356s 39.27m 0.65h 0.03d # Now combine parameter estimates. 
We can average the .mod files # using phyloBoot. This must be done separately for the conserved # and nonconserved models set dir = /san/sanvol1/scratch/danRer4/multiz7way.2006-05-28/phastCons foreach d ($dir/treeRun*) cd $d ls tree/chr*/*.cons.mod > cons.txt /cluster/bin/phast/$MACHTYPE/phyloBoot --read-mods '*cons.txt' \ --output-average ave.cons.mod > cons_summary.txt ls tree/chr*/*.noncons.mod > noncons.txt /cluster/bin/phast/$MACHTYPE/phyloBoot --read-mods '*noncons.txt' \ --output-average ave.noncons.mod > noncons_summary.txt end # measuring entropy # consEntropy # ave.cons.mod ave.noncons.mod --NH 9.78 # never stops with the --NH argument # target entropy should be L_min*H=9.8 bits, (between 9.5 to 10.5 is ok) # the expected length that produces this entropy is the one # to use for phastCons. # foreach treeRun, set the appropriate coverage and length # file: treeRunN cov len # use awk to split up cd /san/sanvol1/scratch/danRer4/multiz7way.2006-05-28/phastCons cp tree.lst entropy.csh perl -pi.bak -e 's/^(treeRun[0-9]+)\s*([0-9\.]+)\s*([0-9]+)/echo \"Coverage = $2 Length = $3\"\ncd $1\n\/cluster\/bin\/phast\/x86_64\/consEntropy $2 $3 ave.cons.mod ave.noncons.mod\ncd \.\./' entropy.csh chmod +x entropy.csh entropy.csh >& entropy.out # entropy.out #Coverage = 0.17 Length = 12 #Transition parameters:gamma=0.170000,omega=12.000000, mu=0.083333, nu=0.017068 #Relative entropy: H=0.857449 bits/site #Expected min. length: L_min=12.298748 sites #Expected max. length: L_max=8.165741 sites #Phylogenetic information threshold: PIT=L_min*H=10.545544 bits #### !!! THESE PARAMETERS BELOW WERE THOSE THAT WERE FINALLY USED #### # These are the same as for danRer2 and give the targeted L_min*H value. # This is from treeRun2. #Coverage = 0.32 Length = 18 #Transition parameters:gamma=0.320000,omega=18.000000, mu=0.055556, nu=0.026144 #Relative entropy: H=0.818130 bits/site #Expected min. length: L_min=12.025818 sites #Expected max. length: L_max=9.281106 sites #Phylogenetic information threshold: PIT=L_min*H=9.838688 bits ### #Coverage = 0.32 Length = 20 #Transition parameters:gamma=0.320000,omega=20.000000, mu=0.050000, nu=0.023529 #Relative entropy: H=0.795926 bits/site #Expected min. length: L_min=12.724131 sites #Expected max. length: L_max=9.927736 sites #Phylogenetic information threshold: PIT=L_min*H=10.127467 bits #Coverage = 0.35 Length = 18 #Transition parameters:gamma=0.350000,omega=18.000000, mu=0.055556, nu=0.029915 #Relative entropy: H=0.827604 bits/site #Expected min. length: L_min=11.542637 sites #Expected max. length: L_max=9.061627 sites #Phylogenetic information threshold: PIT=L_min*H=9.552732 bits # need to iterate and get the right coverage and parameters # try running phastCons below with parameters used above and check the # coverage of coding regions by the most conserved elements # Create cluster dir to do main phastCons run ssh pk mkdir -p \ /san/sanvol1/scratch/danRer4/multiz7way.2006-05-28/phastCons/consRun cd /san/sanvol1/scratch/danRer4/multiz7way.2006-05-28/phastCons/consRun cp -p ../treeRun2/ave.*.mod . 
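    # (Optional sanity check, not in the original log: each phyloBoot-averaged
    # model copied here should still carry a BACKGROUND line and a TREE line with
    # the 7-species tree; a quick look before launching the genome-wide run:)
    egrep 'BACKGROUND|TREE' ave.cons.mod ave.noncons.mod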
cp -p ../treeRun2/ave.*.mod \ /cluster/data/danRer4/bed/multiz7way.2006-05-28/phastCons mkdir ppRaw bed # Create script to run phastCons with right parameters # This job is I/O intensive in its output files, thus it is all # working over in /scratch/tmp/ # Use the expected length and target coverage determined above and # the corresponding average conserved and nonconserved models cat > doPhast.csh << '_EOF_' #!/bin/csh -fe mkdir /scratch/tmp/${2} cp -p ../ss/${1}/${2}.ss ave.*.mod /scratch/tmp/${2} pushd /scratch/tmp/${2} > /dev/null /cluster/bin/phast/x86_64/phastCons ${2}.ss ave.cons.mod,ave.noncons.mod \ --expected-length 18 --target-coverage 0.32 --quiet \ --seqname ${1} --idpref ${1} --viterbi ${2}.bed --score > ${2}.pp popd > /dev/null mkdir -p ppRaw/${1} mkdir -p bed/${1} mv /scratch/tmp/${2}/${2}.pp ppRaw/${1} mv /scratch/tmp/${2}/${2}.bed bed/${1} rm /scratch/tmp/${2}/ave.*.mod rm /scratch/tmp/${2}/${2}.ss rmdir /scratch/tmp/${2} '_EOF_' # emacs happy chmod a+x doPhast.csh # root1 == chrom name, file1 == ss file name without .ss suffix # Create gsub file cat > template << '_EOF_' #LOOP doPhast.csh $(root1) $(file1) #ENDLOOP '_EOF_' # happy emacs # Create parasol batch and run it ls -1 ../ss/chr*/chr*.ss | sed 's/.ss$//' > in.list gensub2 in.list single template jobList para create jobList para try/check/push/etc. para time # Completed: 191 of 191 jobs # CPU time in finished jobs: 4660s 77.67m 1.29h 0.05d 0.000 y # IO & Wait Time: 2927s 48.78m 0.81h 0.03d 0.000 y # Average job time: 40s 0.66m 0.01h 0.00d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 83s 1.38m 0.02h 0.00d # Submission to last job: 2246s 37.43m 0.62h 0.03d # combine predictions and transform scores to be in 0-1000 interval ssh kkstore04 cd /san/sanvol1/scratch/danRer4/multiz7way.2006-05-28/phastCons/consRun # The sed's and the sort get the file names in chrom,start order # (Hiram tricks -- split into columns on [.-/] with # identifying x,y,z, to allow column sorting and # restoring the filename. 
Warning: the sort column # will depend on how deep you are in the dir find ./bed -type f | sed -e "s#/# x #g; s#\.# y #g; s#-# z #g" \ | sort -k7,7 -k9,9n \ | sed -e "s# z #-#g; s# y #\.#g; s# x #/#g" | xargs cat \ | awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", $1, $2, $3, $5, $5;}' \ | /cluster/bin/scripts/lodToBedScore /dev/stdin > mostConserved.bed # ~ 1 minute cp -p mostConserved.bed \ /cluster/data/danRer4/bed/multiz7way.2006-05-28/phastCons # Figure out how much is actually covered by the mostConserved data as so: cd /cluster/data/danRer4 faSize */chr*.fa # 1774660131 bases (175779328 N's 1598880803 real 816338509 upper # 782542294 lower) in 28 sequences in 28 files # Total size: mean 63380719.0 sd 33877121.9 min 16596 (chrM) # max 208014280 (chrNA_random) median 59765243 # The non-N size is 1598880803 bases cd /cluster/data/danRer4/bed/multiz7way.2006-05-28/phastCons awk '{sum+=$3-$2} END{printf "%% %.2f = 100.0*%d/1598880803\n",100.0*sum/1598880803,sum}' \ mostConserved.bed -target-coverage 0.32: % 3.18 = 100.0*50871950/1598880803 length=18 # want to aim for 65% coverage of coding regions ssh hgwdev cd /cluster/data/danRer4/bed/multiz7way/phastCons # get an or of refGene and mgcGenes CDS regions featureBits danRer4 refGene:cds mgcGenes:cds -or -bed=refSeqOrMgcCds.bed # 11770580 bases of 1626093931 (0.724%) in intersection featureBits danRer4 refSeqOrMgcCds.bed mostConserved.bed -enrichment # refSeqOrMgcCds.bed 0.724%, mostConserved.bed 3.128%, both 0.463%, # cover 63.94%, enrich 20.44x # for danRer3: featureBits danRer3 refSeqOrMgcCdsDanRer3.bed \ /cluster/data/danRer3/bed/multiz5way/mostConserved.bed -enrichment # refSeqOrMgcCdsDanRer3.bed 0.714%, # /cluster/data/danRer3/bed/multiz5way/mostConserved.bed 2.998%, # both 0.474%, cover 66.40%, enrich 22.14x # so use this result for -target-coverage=0.32 -expected-lengths=18 # with L_min*H entropy (PIT) value of 9.84 (aiming for around 9.8) and # 63.9% coverage of coding regions with most conserved elements # (aiming for about 65%) # Load most conserved track into database ssh hgwdev cd /cluster/data/danRer4/bed/multiz7way.2006-05-28/phastCons hgsql -e 'drop table phastConsElements;' danRer4 hgLoadBed danRer4 phastConsElements mostConserved.bed # Loaded 676058 elements of size 5 featureBits danRer4 mgcGenes:cds phastConsElements -enrichment # mgcGenes:cds 0.560%, phastConsElements 3.128%, both 0.366%, # cover 65.36%, enrich 20.89x # Create merged posterier probability file and wiggle track data files # the sed business gets the names sorted by chromName, chromStart # so that everything goes in numerical order into wigEncode ssh kkstore04 cd /san/sanvol1/scratch/danRer4/multiz7way.2006-05-28/phastCons/consRun find ./ppRaw -type f | sed -e "s#/# x #g; s#\.# y #g; s#-# z #g" \ | sort -k7,7 -k9,9n \ | sed -e "s# z #-#g; s# y #\.#g; s# x #/#g" | xargs cat \ | wigEncode stdin phastCons7way.wig phastCons7way.wib # takes a few minutes ls -l phastCons* # -rw-rw-r-- 1 hartera protein 255524779 May 29 19:49 phastCons7way.wib # -rw-rw-r-- 1 hartera protein 61525690 May 29 19:49 phastCons7way.wig cp -p phastCons7way.wi? /cluster/data/danRer4/bed/multiz7way/phastCons # Load gbdb and database with wiggle. 
ssh hgwdev cd /cluster/data/danRer4/bed/multiz7way.2006-05-28/phastCons mkdir -p /gbdb/danRer4/wib rm /gbdb/danRer4/wib/phastCons7way.wib ln -s `pwd`/phastCons7way.wib /gbdb/danRer4/wib/phastCons7way.wib # use this if need to reload table hgsql -e 'drop table phastCons7way;' danRer4 # load table hgLoadWiggle danRer4 phastCons7way phastCons7way.wig # Create histogram to get an overview of all the data ssh hgwdev cd /cluster/data/danRer4/bed/multiz7way.2006-05-28/phastCons bash time hgWiggle -doHistogram \ -hBinSize=0.001 -hBinCount=1000 -hMinVal=0.0 -verbose=2 \ -db=danRer4 phastCons7way > histogram.data 2>&1 # real 0m30.234s # user 0m23.721s # sys 0m3.234s # create plot of histogram: cat << '_EOF_' > histo.gp set terminal png small color \ x000000 xffffff xc000ff x66ff66 xffff00 x00ffff xff0000 set size 1.4, 0.8 set key left box set grid noxtics set grid ytics set title " Zebrafish danRer4 Histogram phastCons7 track" set xlabel " phastCons7 score" set ylabel " Relative Frequency" set y2label " Cumulative Relative Frequency (CRF)" set y2range [0:1] set y2tics set yrange [0:0.02] plot "histogram.data" using 2:5 title " RelFreq" with impulses, \ "histogram.data" using 2:7 axes x1y2 title " CRF" with lines '_EOF_' # happy emacs gnuplot histo.gp > histo.png display histo.png & # add line: wiggle phastCons7way to trackDb.ra for multiz7way to display the # wiggle for the conservation track. # check all.joiner for entries for phastCons7way and phastConsElements7way -ok # copy over html for multiz and edit. ########################################################################### # PHASTCONS SCORES DOWNLOADABLES FOR 7WAY (DONE, 2006-05-30, hartera) # prepare compressed copy of ascii data values for downloads ssh kolossus cd /cluster/data/danRer4/bed/multiz7way.2006-05-28 mkdir phastConsDownloads cd phastConsDownloads cat > downloads.csh << 'EOF' date cd /san/sanvol1/scratch/danRer4/multiz7way.2006-05-28/phastCons/consRun/ppRaw foreach chr (`awk '{print $1}' /cluster/data/danRer4/chrom.sizes`) echo $chr cat `ls -1 $chr/$chr.*.pp | sort -t\. -k2,2n` \ | nice gzip -c \ > /cluster/data/danRer4/bed/multiz7way.2006-05-28/phastConsDownloads/$chr.gz end date 'EOF' # << emacs csh -efx downloads.csh >&! downloads.log & tail -f downloads.log # Took ~5 minutes. md5sum *.gz > md5sum.txt ssh hgwdev cd /cluster/data/danRer4/bed/multiz7way.2006-05-28/phastConsDownloads set dir = /usr/local/apache/htdocs/goldenPath/danRer4/phastCons7wayScores mkdir $dir ln -s /cluster/data/danRer4/bed/multiz7way.2006-05-28/phastConsDownloads/{*.gz,md5sum.txt} $dir # copy over and edit README.txt cd $dir cp \ /usr/local/apache/htdocs/goldenPath/danRer3/phastCons5wayScores/README.txt . # Clean up after phastCons run. ssh kkstore04 rm /cluster/data/danRer4/bed/multiz7way.2006-05-28/phastCons/*.tab rm -r /san/sanvol1/scratch/danRer4/multiz7way.2006-05-28/phastCons ########################################################################### # CREATED RECIPROCAL BEST NETS AND MAF NETS FOR ALL SPECIES WITH PAIRWISE # ALIGNMENTS USED FOR MULTIZ MULTIPLE ALIGNMENT # (DONE, 2006-05-12 - 2006-05-15 , hartera) # for tetNig1, fr1, xenTro2, monDom4, mm8 and hg18. ssh kolossus mkdir /cluster/data/danRer4/bed/rBestRunForMultiz/ cd /cluster/data/danRer4/bed/rBestRunForMultiz # need to re-run chainNet and keep first output (target-referenced, # target-centric nets) and second output that we usually /dev/null # (query-referenced, target-centric nets). 
cat > rBestNet.csh << 'EOF' #!/bin/csh -ef foreach s (tetNig1 fr1 xenTro2 monDom4 mm8 hg18) echo "Creating Reciprocal Best Net for $s..." set binDir=/cluster/home/hartera/bin/i386 set dir=/cluster/data/danRer4/bed/blastz.$s/axtChain cd $dir # Run chainNet again, this time keeping the second output: chainPreNet danRer4.$s.all.chain.gz /cluster/data/danRer4/chrom.sizes \ /cluster/data/$s/chrom.sizes stdout \ | $binDir/chainNet stdin /cluster/data/danRer4/chrom.sizes \ /cluster/data/$s/chrom.sizes /dev/null stdout | \ netSyntenic stdin $dir/$s.danRer4_ref.net # get the other species chains from the other species-referenced # (but danRer4-centric) net: chainSwap danRer4.$s.all.chain.gz $s.danRer4.all.chain netChainSubset -verbose=0 $s.danRer4_ref.net \ $s.danRer4.all.chain stdout \ | chainSort stdin $s.danRer4_ref.subset.chain # Net those (sorted) danRer4 chains, and keep both outputs, to get # reciprocal best nets referenced to both species: chainPreNet $s.danRer4_ref.subset.chain \ /cluster/data/$s/chrom.sizes /cluster/data/danRer4/chrom.sizes stdout \ | $binDir/chainNet stdin /cluster/data/$s/chrom.sizes \ /cluster/data/danRer4/chrom.sizes tmp1 tmp2 netSyntenic tmp1 $s.danRer4.rbest.net netSyntenic tmp2 danRer4.$s.rbest.net rm tmp1 tmp2 nice gzip *.rbest.net end 'EOF' chmod +x rBestNet.csh nice rBestNet.csh >& rBestNet.log & # Took about 11 minutes to complete. # Then make axtNet and mafNet cat > makeMafRBestNet.csh << 'EOF' #!/bin/csh -ef foreach s (tetNig1 fr1 xenTro2 monDom4 mm8 hg18) echo "Creating mafs for $s ..." set dir=/cluster/data/danRer4/bed/blastz.$s/axtChain set seqDir=/san/sanvol1/scratch cd $dir # extract reciprocal best chains from the zebrafish-other species rbest.net echo "Get reciprocal best chains for best zebrafish-$s" netChainSubset danRer4.$s.rbest.net.gz danRer4.$s.all.chain.gz \ danRer4.$s.rbest.chain # need to make sure this is sorted and assign unique chain IDs chainSort danRer4.$s.rbest.chain stdout | chainMergeSort stdin \ > danRer4.$s.rbest.newids.chain # need to re-net with new ids chainNet danRer4.$s.rbest.newids.chain /cluster/data/danRer4/chrom.sizes \ /cluster/data/$s/chrom.sizes danRer4.$s.rbest.newids.net /dev/null # split reciprocal best chains and net chainSplit rBestChain danRer4.$s.rbest.newids.chain netSplit danRer4.$s.rbest.newids.net rBestNet mkdir ../axtRBestNet # make axtNet for reciprocal best echo "Making axtRBestNet for $s ..." foreach f (rBestNet/*.net) netToAxt $f rBestChain/$f:t:r.chain \ $seqDir/danRer4/danRer4.2bit $seqDir/$s/$s.2bit stdout \ | axtSort stdin stdout \ | gzip -c > ../axtRBestNet/$f:t:r.danRer4.$s.net.axt.gz end # make mafNet for reciprocal best cd .. mkdir mafRBestNet echo "Making mafRBestNet for $s ..." foreach f (axtRBestNet/*.danRer4.$s.net.axt.gz) axtToMaf -tPrefix=danRer4. -qPrefix=$s. $f \ /cluster/data/danRer4/chrom.sizes /cluster/data/$s/chrom.sizes stdout \ | gzip -c > mafRBestNet/$f:t:r:r:r:r:r.maf.gz end end 'EOF' chmod +x makeMafRBestNet.csh nice makeMafRBestNet.csh >& mafRBestNet.log & # Took about an hour. # NOTE: Must use chainSort and chainMergeSort to reassign unique IDs # to the chains extracted from the rbest.net and then re-net the chains # with the new IDs otherwise netToAxt crashes due to duplicate chain IDs. # Now do the multiple alignment using reciprocal best mafNets as input # for multiz. # Load up nets and chains from rBestChain and rBestNet ssh hgwdev cd /cluster/data/danRer4/bed/rBestRunForMultiz # Nets from Reciprocal Best have no type field or repeat/gap stats so need # to add these.
cat > loadRBest.csh << 'EOF' #!/bin/csh -ef foreach s (tetNig1 fr1 xenTro2 monDom4 mm8 hg18) set dir=/cluster/data/danRer4/bed/blastz.$s/axtChain if ($s == "tetNig1") then set g = TetNig1 else if ($s == "fr1") then set g = Fr1 else if ($s == "xenTro2") then set g = XenTro2 else if ($s == "monDom4") then set g = MonDom4 else if ($s == "mm8") then set g = Mm8 else if ($s == "hg18") then set g = Hg18 endif # load chains echo "Loading chains for $s ..." cd $dir/rBestChain foreach f (*.chain) set c = $f:r hgLoadChain danRer4 ${c}_chainRBest${g} $f end # load nets cd $dir echo "Loading nets for $s ..." # add type field netSyntenic danRer4.${s}.rbest.newids.net noClassRBest.net # add gap/repeat stats to net file using database tables netClass -verbose=0 -noAr noClassRBest.net danRer4 $s \ danRer4.${s}.rbest.withClass.net netFilter -minGap=10 danRer4.${s}.rbest.withClass.net \ | hgLoadNet -verbose=0 danRer4 netRBest${g} stdin end 'EOF' << emacs chmod +x loadRBest.csh nohup nice loadRBest.csh >& loadRBest.log & ########################################################################### # MULTIZ7WAY ALIGNMENTS FOR CONSERVATION TRACK - USING RECIPROCAL BEST NETS # (DONE, 2006-05-18 - 2006-05-24, hartera) # for tetNig1, fr1, xenTro2, monDom4, mm8 and hg18. ssh kkstore04 mkdir /cluster/data/danRer4/bed/multiz7way.2006-05-18 cd /cluster/data/danRer4/bed/multiz7way.2006-05-18 # copy MAFs to a cluster-friendly server # use bluearc as the san is down mkdir /cluster/bluearc/danRer4/mafRBestNet foreach s (tetNig1 fr1 xenTro2 monDom4 mm8 hg18) echo $s rsync -av /cluster/data/danRer4/bed/blastz.$s/mafRBestNet/* \ /cluster/bluearc/danRer4/mafRBestNet/$s/ end # prune the hg17 17way tree to just these 7 and update db names: /cluster/bin/phast/tree_doctor \ --prune-all-but=mouse_mm8,human_hg18,monodelphis_monDom4,xenopus_xenTro1,tetraodon_tetNig1,fugu_fr1,zebrafish_danRer3 \ --rename="xenopus_xenTro1 -> xenopus_xenTro2 ; zebrafish_danRer3 -> zebrafish_danRer4" \ /cluster/data/hg18/bed/multiz17way/17way.nh > 7way.nh # carefully edit so that danRer4 is first. copy first to new file cp 7way.nh 7way_zfishFirst.nh # DO THIS LATER AND CREATE FROM TREE WITHOUT DISTANCES /cluster/bin/phast/draw_tree 7way_zfishFirst.nh > 7way.ps # also made the ps file for the 7way.nh and compared to make sure # that the tree with zebrafish at the top looks correct. 
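    # (For orientation only, a sketch: after the pruning and the manual edit the
    # zebrafish-first tree should have the topology below, with danRer4 as the
    # first leaf; the branch lengths come from the pruned 17-way tree and are
    # not reproduced here.)
    # ((danRer4,(fr1,tetNig1)),(xenTro2,(monDom4,(mm8,hg18))))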
/cluster/bin/phast/all_dists 7way_zfishFirst.nh > 7way.distances grep danRer4 7way.distances | sort -k3,3n | \ awk '{printf ("%.4f\t%s\n", $3, $2)}' > distances.txt cat distances.txt # 1.4749 tetraodon_tetNig1 # 1.5154 fugu_fr1 # 1.7480 human_hg18 # 1.7782 monodelphis_monDom4 # 1.8771 xenopus_xenTro2 # 2.1058 mouse_mm8 # the order in the browser display will be by tree topology, # not by distance, so they will be: # danRer4 # 1.5154 fugu_fr1 # 1.4749 tetraodon_tetNig1 # 1.8771 xenopus_xenTro2 # 1.7782 monodelphis_monDom4 # 2.1058 mouse_mm8 # 1.7480 human_hg18 # create species list and stripped down tree for autoMZ sed -e 's/[a-z][a-z]*_//g; s/:[0-9\.][0-9\.]*//g; s/;//' \ 7way_zfishFirst.nh > tree-commas.nh sed -e 's/ //g; s/,/ /g' tree-commas.nh > tree.nh sed -e 's/[()]//g; s/,/ /g' tree.nh > species.lst cp tree-commas.nh 7way.nh ssh pk cd /cluster/data/danRer4/bed/multiz7way.2006-05-18 mkdir maf run cd run # stash binaries mkdir penn cp -p /cluster/bin/penn/multiz.v11.x86_64/multiz-tba/multiz penn cp -p /cluster/bin/penn/multiz.v11.x86_64/multiz-tba/maf_project penn cp -p /cluster/bin/penn/multiz.v11.x86_64/multiz-tba/autoMZ penn cat > autoMultiz.csh << 'EOF' #!/bin/csh -ef set db = danRer4 set c = $1 set maf = $2 set run = `pwd` set tmp = /scratch/tmp/$db/multiz.$c set pairs = /cluster/bluearc/$db/mafRBestNet rm -fr $tmp mkdir -p $tmp cp ../{tree.nh,species.lst} $tmp pushd $tmp foreach s (`cat species.lst`) set in = $pairs/$s/$c.maf set out = $db.$s.sing.maf if ($s == $db) then continue endif if (-e $in.gz) then zcat $in.gz > $out else if (-e $in) then cp $in $out else echo "##maf version=1 scoring=autoMZ" > $out endif end set path = ($run/penn $path); rehash $run/penn/autoMZ + T=$tmp E=$db "`cat tree.nh`" $db.*.sing.maf $c.maf popd cp $tmp/$c.maf $maf rm -fr $tmp 'EOF' # << emacs chmod +x autoMultiz.csh cat << 'EOF' > spec #LOOP ./autoMultiz.csh $(root1) {check out line+ /cluster/data/danRer4/bed/multiz7way.2006-05-18/maf/$(root1).maf} #ENDLOOP 'EOF' # << emacs awk '{print $1}' /cluster/data/danRer4/chrom.sizes > chrom.lst gensub2 chrom.lst single spec jobList para create jobList para try, check, push, check etc. ... # Took less than 10 minutes to run # Make .jpg for tree and install in htdocs/images/phylo/... don't forget # to request a push of that file. The treeImage setting in trackDb.ra # is phylo/danRer4_7way.jpg (relative to htdocs/images). # ssh hgwdev # DO LATER # cd /cluster/data/danRer4/bed/multiz7way.2006-05-04 # pstopnm -stdout 7way.ps | pnmtojpeg > danRer4_7way.jpg # ask Bob to resize image for Browser track description page. # Build maf annotation and load database ssh kolossus mkdir /cluster/data/danRer4/bed/multiz7way.2006-05-18/anno cd /cluster/data/danRer4/bed/multiz7way.2006-05-18/anno mkdir maf run cd run rm -f sizes nBeds foreach db (`cat /cluster/data/danRer4/bed/multiz7way.2006-05-18/species.lst`) ln -s /cluster/data/$db/chrom.sizes $db.len if (! -e /cluster/data/$db/$db.N.bed) then twoBitInfo -nBed /cluster/data/$db/$db.{2bit,N.bed} endif ln -s /cluster/data/$db/$db.N.bed $db.bed echo $db.bed >> nBeds echo $db.len >> sizes end echo date > jobs.csh # do smaller jobs first: foreach f (`ls -1rS ../../maf/*.maf`) echo nice mafAddIRows -nBeds=nBeds -sizes=sizes $f \ /cluster/data/danRer4/danRer4.2bit ../maf/`basename $f` \ >> jobs.csh echo "echo $f" >> jobs.csh end echo date >> jobs.csh csh -efx jobs.csh >&! 
jobs.log & tail -f jobs.log # Load anno/maf ssh hgwdev cd /cluster/data/danRer4/bed/multiz7way.2006-05-18/anno/maf mkdir -p /gbdb/danRer4/multiz7wayRBest/anno/maf ln -s /cluster/data/danRer4/bed/multiz7way.2006-05-18/anno/maf/*.maf \ /gbdb/danRer4/multiz7wayRBest/anno/maf # Reload as not working correctly. hgsql -e 'drop table multiz7wayRBest;' danRer4 hgsql -e 'delete from extFile where path like "%multiz7wayRBest%";' \ danRer4 cat > loadMaf.csh << 'EOF' date nice hgLoadMaf -pathPrefix=/gbdb/danRer4/multiz7wayRBest/anno/maf danRer4 multiz7wayRBest date 'EOF' # << emacs csh -efx loadMaf.csh >&! loadMaf.log & tail -f loadMaf.log # Do the computation-intensive part of hgLoadMafSummary on a workhorse # machine and then load on hgwdev: ssh kkr7u00 cd /cluster/data/danRer4/bed/multiz7way.2006-05-18/anno/maf cat *.maf \ | nice hgLoadMafSummary danRer4 -minSize=30000 -mergeGap=1500 \ -maxSize=200000 -test multiz7wayRBestSummary stdin # Created 526386 summary blocks from 1972659 components and 1105457 mafs # from stdin ssh hgwdev cd /cluster/data/danRer4/bed/multiz7way.2006-05-18/anno/maf sed -e 's/mafSummary/multiz7wayRBestSummary/' \ ~/kent/src/hg/lib/mafSummary.sql \ > /tmp/multiz7wayRBestSummary.sql time nice hgLoadSqlTab danRer4 multiz7wayRBestSummary \ /tmp/multiz7wayRBestSummary.sql multiz7wayRBestSummary.tab # 0.000u 0.000s 0:07.56 0.0% 0+0k 0+0io 4pf+0w rm *.tab /tmp/multiz7wayRBestSummary.sql # ln -s multiz7way.2006-05-18 /cluster/data/danRer4/bed/multiz7way # ln -s /cluster/data/danRer4/bed/multiz7way.2006-05-18/danRer4_7way.jpg \ # /usr/local/apache/htdocs/images/phylo/danRer4_7way.jpg # change permissions for display if not already readable to all # chmod +r /usr/local/apache/htdocs/images/phylo/danRer4_7way.jpg # check for all.joiner entry for 7-way - it is there already. # add trackDb.ra entry for danRer4: ########################################################################### # PHYLO-HMM (PHASTCONS) CONSERVATION TRACK FOR 7-WAY ALIGNMENT USING MAFS # FROM RECIPROCAL BEST NET (DONE, 2006-05-19 - 2005-05-24, hartera) ssh kkstore04 mkdir /cluster/data/danRer4/bed/multiz7way.2006-05-18/phastCons cd /cluster/data/danRer4/bed/multiz7way.2006-05-18/phastCons # create a starting-tree.mod based on chr14 (92 Mb) # chr14 is the largest chrom apart from chrNA_random /cluster/bin/phast/$MACHTYPE/msa_split ../maf/chr14.maf \ --refseq ../../../14/chr14.fa --in-format MAF \ --windows 100000000,1000 --out-format SS \ --between-blocks 5000 --out-root s1 /cluster/bin/phast/$MACHTYPE/phyloFit -i SS s1.*.ss \ --tree "`cat ../tree-commas.nh`" \ --out-root starting-tree # took less than a minute rm s1.*ss # Get genome-wide average GC content (for all species together, # not just the reference genome). If you have a globally # estimated tree model, as above, you can get this from the # BACKGROUND line in the .mod file. E.g., # ALPHABET: A C G T # ... # BACKGROUND: 0.309665 0.189697 0.189720 0.310918 # add up the C and G: grep BACKGROUND starting-tree.mod | awk '{printf "%0.3f\n", $3 + $4;}' # 0.379 is the GC content. This is used in the -gc argument below. # If you do *not* have a global tree model and you do not know your # GC content, you can get it directly from the MAFs with a command # like: /cluster/bin/phast/$MACHTYPE/msa_view \ --aggregate danRer4,tetNig1,fr1,xenTro2,monDom4,mm8,hg18 -i MAF \ -S /cluster/data/danRer4/bed/multiz7way/maf/chr*.maf > maf_summary.txt # This gives a GC content of 0.426 so use this as it is from mafs for # the whole genome. 
# break up the genome-wide MAFs into pieces on the san filesystem ssh pk # should use a directory on the san but it is down and para create is # not working on kk. set WINDOWS=/cluster/bluearc/danRer4/multiz7way.2006-05-18/phastCons/ss mkdir -p $WINDOWS cd $WINDOWS cat << 'EOF' > doSplit.csh #!/bin/csh -ef set MAFS = /cluster/data/danRer4/bed/multiz7way.2006-05-18/maf set WINDOWS=/cluster/bluearc/danRer4/multiz7way.2006-05-18/phastCons/ss cd $WINDOWS set c = $1 echo $c rm -fr $c mkdir $c set N = `echo $c | sed -e 's/chr//'` /cluster/bin/phast/$MACHTYPE/msa_split $MAFS/$c.maf -i MAF \ -M /cluster/data/danRer4/$N/$c.fa \ -o SS -w 10000000,0 -I 1000 -B 5000 -r $c/$c echo "Done" >> $c.done 'EOF' # << emacs chmod +x doSplit.csh rm -f jobList foreach c (`cat /cluster/data/danRer4/chrom.lst`) echo "doSplit.csh chr${c} {check out line+ $WINDOWS/chr$c.done}" >> jobList end para create jobList para push, check etc. para time # Completed: 28 of 28 jobs # CPU time in finished jobs: 847s 14.12m 0.24h 0.01d 0.000 y # IO & Wait Time: 9741s 162.35m 2.71h 0.11d 0.000 y # Average job time: 378s 6.30m 0.11h 0.00d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 539s 8.98m 0.15h 0.01d # Submission to last job: 581s 9.68m 0.16h 0.01d # Create a random list of 50 1 mb regions (do not use chrNA and chrUn) ls -1l chr*/chr*.ss | grep -v NA | grep -v Un | \ awk '$5 > 4000000 {print $9;}' | randomLines stdin 50 ../randomSs.list # Set up parasol directory to calculate trees on these 50 regions ssh pk set dir = /cluster/bluearc/danRer4/multiz7way.2006-05-18/phastCons mkdir -p $dir cd $dir # now set up cluster job to estimate model parameters. Parameters # will be estimated separately for each alignment fragment then # will be combined across fragments. Tuning this loop should come # back to here to recalculate. Tuning target-coverage and expected-length. # Create little script that calls phastCons with right arguments cat > makeTree.csh << 'EOF' #!/bin/csh -fe set C = $1:h set treeRun = $2 set cov = $3 set len = $4 set dir = /cluster/bluearc/danRer4/multiz7way.2006-05-18/phastCons mkdir -p $dir/$treeRun/log/${C} $dir/$treeRun/tree/${C} /cluster/bin/phast/x86_64/phastCons $dir/ss/$1 \ /cluster/data/danRer4/bed/multiz7way.2006-05-18/phastCons/starting-tree.mod \ --gc 0.426 --nrates 1,1 --no-post-probs --ignore-missing \ --expected-length $len --target-coverage $cov \ --quiet --log $dir/$treeRun/log/$1 --estimate-trees $dir/$treeRun/tree/$1 'EOF' # << emacs chmod a+x makeTree.csh # Make sure that the correct GC content is substituted in here. Notice # the target coverage of 0.17. Here we are going to aim # for 65% coverage of coding regions by conserved elements. # Create gensub file # need to add cov and len parameters cat > template << '_EOF_' #LOOP makeTree.csh $(path1) $(path2) #ENDLOOP '_EOF_' # happy emacs # Make cluster job and run it echo "treeRun1 0.17 12" > tree.lst echo "treeRun2 0.32 18" >> tree.lst echo "treeRun3 0.32 20" >> tree.lst echo "treeRun4 0.35 18" >> tree.lst gensub2 randomSs.list tree.lst template jobList para create jobList para try,check,push,check etc. # para time # Completed: 200 of 200 jobs # CPU time in finished jobs: 45500s 758.33m 12.64h 0.53d 0.001 y # IO & Wait Time: 31478s 524.64m 8.74h 0.36d 0.001 y # Average job time: 385s 6.41m 0.11h 0.00d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 622s 10.37m 0.17h 0.01d # Submission to last job: 821s 13.68m 0.23h 0.01d # try again, mkdir test2. 
if aim for about 5% coverage and for chr1 on # hg18, netDanRer4 covers about 31% of bases then 0.05/0.30 = 0.156 # want length of about 20 bp to influence the model towards detecting # shorter conserved regions such as TFBSs. cd test2 echo "treeRun5 0.156 20" > tree.lst gensub2 ../randomSs.list tree.lst template jobList para create jobList cd test3 echo "treeRun6 0.156 15" > tree.lst gensub2 ../randomSs.list tree.lst template jobList para create jobList cd test4 # increase coverage and compensate a bit by lowering the expected length echo "treeRun7 0.25 8" > tree.lst gensub2 ../randomSs.list tree.lst template jobList para create jobList cd test5 echo "treeRun8 0.35 12" > tree.lst gensub2 ../randomSs.list tree.lst template jobList para create jobList cd test6 echo "treeRun9 0.5 20" > tree.lst gensub2 ../randomSs.list tree.lst template jobList para create jobList cd test7 echo "treeRun10 0.5 24" > tree.lst gensub2 ../randomSs.list tree.lst template jobList para create jobList cd test8 echo "treeRun11 0.45 22" > tree.lst echo "treeRun12 0.5 26" >> tree.lst echo "treeRun13 0.5 28" >> tree.lst gensub2 ../randomSs.list tree.lst template jobList para create jobList cd test9 echo "treeRun14 0.45 24" > tree.lst echo "treeRun15 0.45 20" >> tree.lst gensub2 ../randomSs.list tree.lst template jobList para create jobList cd test10 echo "treeRun16 0.40 24" > tree.lst echo "treeRun17 0.40 20" >> tree.lst echo "treeRun18 0.42 20" >> tree.lst gensub2 ../randomSs.list tree.lst template jobList para create jobList cd test11 echo "treeRun19 0.38 24" > tree.lst echo "treeRun20 0.38 22" >> tree.lst echo "treeRun21 0.38 20" >> tree.lst gensub2 ../randomSs.list tree.lst template jobList para create jobList # Now combine parameter estimates. We can average the .mod files # Now combine parameter estimates. We can average the .mod files # using phyloBoot. This must be done separately for the conserved # and nonconserved models set dir = /cluster/bluearc/danRer4/multiz7way.2006-05-18/phastCons foreach d ($dir/treeRun*) cd $d ls tree/chr*/*.cons.mod > cons.txt /cluster/bin/phast/$MACHTYPE/phyloBoot --read-mods '*cons.txt' \ --output-average ave.cons.mod > cons_summary.txt ls tree/chr*/*.noncons.mod > noncons.txt /cluster/bin/phast/$MACHTYPE/phyloBoot --read-mods '*noncons.txt' \ --output-average ave.noncons.mod > noncons_summary.txt end # measuring entropy # consEntropy # ave.cons.mod ave.noncons.mod --NH 9.78 # never stops with the --NH argument # target entropy should be L_min*H=9.8 bits, (between 9.5 to 10.5 is ok) # the expected length that produces this entropy is the one # to use for phastCons. # foreach treeRun, set the appropriate coverage and length # file: treeRunN cov len # use awk to split up cd /cluster/bluearc/danRer4/multiz7way.2006-05-18/phastCons cp tree.lst entropy.csh perl -pi.bak -e 's/^(treeRun[0-9]+)\s*([0-9\.]+)\s*([0-9]+)/echo \"Coverage = $2 Length = $3\"\ncd $1\n\/cluster\/bin\/phast\/x86_64\/consEntropy $2 $3 ave.cons.mod ave.noncons.mod\ncd \.\./' entropy.csh chmod +x entropy.csh entropy.csh >& entropy.out # entropy.out #Coverage = 0.17 Length = 12 #Transition parameters:gamma=0.170000, omega=12.000000, mu=0.083333, nu=0.017068 #Relative entropy: H=0.782279 bits/site #Expected min. length: L_min=13.655129 sites #Expected max. length: L_max=8.801144 sites #Phylogenetic information threshold: PIT=L_min*H=10.682123 bits #Coverage = 0.32 Length = 18 #Transition parameters:gamma=0.320000, omega=18.000000, mu=0.055556, nu=0.026144 #Relative entropy: H=0.757117 bits/site #Expected min. 
length: L_min=13.055080 sites #Expected max. length: L_max=9.912578 sites #Phylogenetic information threshold: PIT=L_min*H=9.884225 bits #Coverage = 0.32 Length = 20 #Transition parameters:gamma=0.320000, omega=20.000000, mu=0.050000, nu=0.023529 #Relative entropy: H=0.736191 bits/site #Expected min. length: L_min=13.815340 sites #Expected max. length: L_max=10.615242 sites #Phylogenetic information threshold: PIT=L_min*H=10.170732 bits #Coverage = 0.35 Length = 18 #Transition parameters:gamma=0.350000, omega=18.000000, mu=0.055556, nu=0.029915 #Relative entropy: H=0.768872 bits/site #Expected min. length: L_min=12.471015 sites #Expected max. length: L_max=9.642561 sites #Phylogenetic information threshold: PIT=L_min*H=9.588610 bits #Coverage = 0.156 Length = 20 #Transition parameters:gamma=0.156000, omega=20.000000, mu=0.050000, nu=0.009242 #Relative entropy: H=0.676147 bits/site #Expected min. length: L_min=17.857722 sites #Expected max. length: L_max=12.694666 sites #Phylogenetic information threshold: PIT=L_min*H=12.074436 bits #Coverage = 0.156 Length = 15 #Transition parameters:gamma=0.156000, omega=15.000000, mu=0.066667, nu=0.012322 #Relative entropy: H=0.726430 bits/site #Expected min. length: L_min=15.713919 sites #Transition parameters: gamma=0.250000, omega=8.000000, mu=0.125000, nu=0.041667 #Relative entropy: H=0.950194 bits/site #Expected min. length: L_min=8.951612 sites #Expected max. length: L_max=5.560228 sites #Phylogenetic information threshold: PIT=L_min*H=8.505767 bits #Coverage = 0.5 Length = 20 #Transition parameters:gamma=0.500000, omega=20.000000, mu=0.050000, nu=0.050000 #Relative entropy: H=0.817081 bits/site #Expected min. length: L_min=10.397809 sites #Expected max. length: L_max=9.006386 sites #Phylogenetic information threshold: PIT=L_min*H=8.495855 bits # Coverage = 0.5 Length = 24 #Transition parameters:gamma=0.500000, omega=24.000000, mu=0.041667, nu=0.041667 #Relative entropy: H=0.772807 bits/site #Expected min. length: L_min=11.706841 sites #Expected max. length: L_max=10.170845 sites #Phylogenetic information threshold: PIT=L_min*H=9.047124 bits # Coverage = 0.5 Length = 26 #Transition parameters:gamma=0.500000,omega=26.000000, mu=0.038462, nu=0.038462 #Relative entropy: H=0.755159 bits/site #Expected min. length: L_min=12.299010 sites #Expected max. length: L_max=10.697444 sites #Phylogenetic information threshold: PIT=L_min*H=9.287712 bits #Coverage = 0.5 Length = 28 #Transition parameters:gamma=0.500000,omega=28.000000, mu=0.035714, nu=0.035714 #Relative entropy: H=0.739661 bits/site #Expected min. length: L_min=12.856932 sites #Expected max. length: L_max=11.193931 sites #Phylogenetic information threshold: PIT=L_min*H=9.509775 bits ########USED THESE PARAMETERS################## #Coverage = 0.45 Length = 24 #Transition parameters:gamma=0.450000, omega=24.000000, mu=0.041667, nu=0.034091 #Relative entropy: H=0.749572 bits/site #Expected min. length: L_min=12.663020 sites #Expected max. length: L_max=10.634682 sites #Phylogenetic information threshold: PIT=L_min*H=9.491841 bits #Coverage = 0.40 Length = 24 #Transition parameters:gamma=0.400000, omega=24.000000, mu=0.041667, nu=0.027778 #Relative entropy: H=0.730161 bits/site #Expected min. length: L_min=13.607002 sites #Expected max. length: L_max=11.092981 sites #Phylogenetic information threshold: PIT=L_min*H=9.935307 bits #Coverage = 0.38 Length = 20 #Transition parameters:gamma=0.380000, omega=20.000000, mu=0.050000, nu=0.030645 #Relative entropy: H=0.758676 bits/site #Expected min. 
length: L_min=12.652818 sites #Expected max. length: L_max=10.063048 sites #Phylogenetic information threshold: PIT=L_min*H=9.599385 bits #Coverage = 0.38 Length = 24 #Transition parameters:gamma=0.380000, omega=24.000000, mu=0.041667, nu=0.025538 #Relative entropy: H=0.723105 bits/site #Expected min. length: L_min=13.987286 sites #Expected max. length: L_max=11.279443 sites #Phylogenetic information threshold: PIT=L_min*H=10.114270 bits # Create cluster dir to do main phastCons run ssh pk mkdir -p \ /cluster/bluearc/danRer4/multiz7way.2006-05-18/phastCons/consRun cd /cluster/bluearc/danRer4/multiz7way.2006-05-18/phastCons/consRun cp -p ../treeRun1/ave.*.mod . cp -p ../treeRun1/ave.*.mod \ /cluster/data/danRer4/bed/multiz7way.2006-05-18/phastCons mkdir ppRaw bed # Create script to run phastCons with right parameters # This job is I/O intensive in its output files, thus it is all # working over in /scratch/tmp/ # Use the expected length and target coverage determined above and # the corresponding average conserved and nonconserved models cat > doPhast.csh << '_EOF_' #!/bin/csh -fe mkdir /scratch/tmp/${2} cp -p ../ss/${1}/${2}.ss ave.*.mod /scratch/tmp/${2} pushd /scratch/tmp/${2} > /dev/null /cluster/bin/phast/x86_64/phastCons ${2}.ss ave.cons.mod,ave.noncons.mod \ --expected-length 18 --target-coverage 0.32 --quiet \ --seqname ${1} --idpref ${1} --viterbi ${2}.bed --score > ${2}.pp popd > /dev/null mkdir -p ppRaw/${1} mkdir -p bed/${1} mv /scratch/tmp/${2}/${2}.pp ppRaw/${1} mv /scratch/tmp/${2}/${2}.bed bed/${1} rm /scratch/tmp/${2}/ave.*.mod rm /scratch/tmp/${2}/${2}.ss rmdir /scratch/tmp/${2} '_EOF_' # emacs happy chmod a+x doPhast.csh # root1 == chrom name, file1 == ss file name without .ss suffix # Create gsub file cat > template << '_EOF_' #LOOP doPhast.csh $(root1) $(file1) #ENDLOOP '_EOF_' # happy emacs # Create parasol batch and run it ls -1 ../ss/chr*/chr*.ss | sed 's/.ss$//' > in.list gensub2 in.list single template jobList para create jobList para try/check/push/etc. para time # Completed: 191 of 191 jobs # CPU time in finished jobs: 4421s 73.69m 1.23h 0.05d 0.000 y # IO & Wait Time: 121036s 2017.26m 33.62h 1.40d 0.004 y # Average job time: 657s 10.95m 0.18h 0.01d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 726s 12.10m 0.20h 0.01d # Submission to last job: 874s 14.57m 0.24h 0.01d # combine predictions and transform scores to be in 0-1000 interval ssh kkstore04 cd /cluster/bluearc/danRer4/multiz7way.2006-05-18/phastCons/consRun # The sed's and the sort get the file names in chrom,start order # (Hiram tricks -- split into columns on [.-/] with # identifying x,y,z, to allow column sorting and # restoring the filename. 
Warning: the sort column # will depend on how deep you are in the dir find ./bed -type f | sed -e "s#/# x #g; s#\.# y #g; s#-# z #g" \ | sort -k7,7 -k9,9n \ | sed -e "s# z #-#g; s# y #\.#g; s# x #/#g" | xargs cat \ | awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", $1, $2, $3, $5, $5;}' \ | /cluster/bin/scripts/lodToBedScore /dev/stdin > mostConserved.bed cp -p mostConserved.bed \ /cluster/data/danRer4/bed/multiz7way.2006-05-18/phastCons # Figure out how much is actually covered by the mostConserved data as so: cd /cluster/data/danRer4 faSize */chr*.fa # 1774660131 bases (175779328 N's 1598880803 real 816338509 upper # 782542294 lower) in 28 sequences in 28 files # Total size: mean 63380719.0 sd 33877121.9 min 16596 (chrM) # max 208014280 (chrNA_random) median 59765243 # The non-N size is 1598880803 bases cd /cluster/data/danRer4/bed/multiz7way.2006-05-18/phastCons awk '{sum+=$3-$2} END{printf "%% %.2f = 100.0*%d/1598880803\n",100.0*sum/1598880803,sum}' \ mostConserved.bed -target-coverage 0.17: % 1.51 = 100.0*24186350/1598880803 length=12 -target-coverage 0.156: % 1.44 = 100.0*22973222/1598880803 length=20 -target-coverage 0.156: % 1.32 = 100.0*21177329/1598880803 length=15 -target-coverage 0.25: % 1.32 = 100.0*21104503/1598880803 length=8 -target-coverage 0.32: % 1.88 = 100.0*30014509/1598880803 length=20 -target-coverage 0.5: % 3.00 = 100.0*47931076/1598880803 length=20 -target-coverage 0.5: % 2.95 = 100.0*47170018/1598880803 length=24 -target-coverage 0.5: % 2.24 = 100.0*35801661/1598880803 length=28 -target-coverage 0.45: % 2.50 = 100.0*39965003/1598880803 length=24 -target-coverage 0.40: % 2.22 = 100.0*35436744/1598880803 length=24 -target-coverage 0.38: % 2.12 = 100.0*33911465/1598880803 length=20 -target-coverage 0.38: % 2.13 = 100.0*33986115/1598880803 length=24 # want to aim for 65% coverage of coding regions ssh hgwdev cd /cluster/data/danRer4/bed/multiz7way.2006-05-18/phastCons # get an or of refGene and mgcGenes CDS regions featureBits danRer4 refGene:cds mgcGenes:cds -or -bed=refSeqOrMgcCds.bed # 11753378 bases of 1626093931 (0.723%) in intersection # featureBits danRer3 refGene:cds mgcGenes:cds -or \ # -bed=refSeqOrMgcCdsDanRer3.bed # 11633092 bases of 1630323462 (0.714%) in intersection featureBits danRer4 refSeqOrMgcCds.bed mostConserved.bed -enrichment # refSeqOrMgcCds.bed 0.723%, mostConserved.bed 1.487%, both 0.332%, # cover 45.97%, enrich 30.90x # for length = 12 and cov = 0.17 PIT=10.7 # refSeqOrMgcCds.bed 0.723%, mostConserved.bed 1.846%, both 0.388%, # cover 53.74%, enrich 29.12x # for length = 20 and cov = 0.156 PIT=12.1 # refSeqOrMgcCds.bed 0.723%, mostConserved.bed 1.413%, both 0.333%, # cover 46.04%, enrich 32.59x # for length = 15 and cov = 0.156 PIT=11.4 # refSeqOrMgcCds.bed 0.723%, mostConserved.bed 1.302%, both 0.313%, # cover 43.36%, enrich 33.30x # decrease length and increase coverage to compensate # for length = 8 and cov = 0.25 PIT=8.5, PIT is too low # refSeqOrMgcCds.bed 0.723%, mostConserved.bed 1.298%, both 0.304%, # cover 42.06%, enrich 32.40x # try length = 20 and cov = 0.32 PIT=10.8 # refSeqOrMgcCds.bed 0.723%, mostConserved.bed 1.846%, both 0.388%, # cover 53.74%, enrich 29.12x # length = 20 and cov = 0.5 PIT=8.5 # refSeqOrMgcCds.bed 0.723%, mostConserved.bed 2.948%, both 0.459%, # cover 63.53%, enrich 21.55x # coverage good, need to increase the PIT value so increase the length.
# length = 24 and cov = 0.5 PIT=9.05 # refSeqOrMgcCds.bed 0.723%, mostConserved.bed 2.901%, both 0.458%, # cover 63.35%, enrich 21.84x # length = 28 and cov = 0.5 PIT=9.5 # refSeqOrMgcCds.bed 0.724%, mostConserved.bed 2.202%, both 0.431%, # cover 59.57%, enrich 27.06x # length = 24 and cov = 0.45 PIT=9.5 featureBits danRer4 refGene:cds mgcGenes:cds -or -bed=refSeqOrMgcCds.bed # 11770580 bases of 1626093931 (0.724%) in intersection featureBits danRer4 refSeqOrMgcCds.bed mostConserved.bed -enrichment # refSeqOrMgcCds.bed 0.724%, mostConserved.bed 2.458%, both 0.438%, # cover 60.57% enrich 24.64x # length = 20 and cov = 0.38 PIT=9.6 # refSeqOrMgcCds.bed 0.724%, mostConserved.bed 2.085%, both 0.411%, # cover 56.76%, enrich 27.22x # length = 24 and cov = 0.38 PIT=10.1 # refSeqOrMgcCds.bed 0.724%, mostConserved.bed 2.090%, both 0.413%, # cover 57.07%, enrich 27.30x # with L_min*H entropy (PIT) value of 9.84 (aiming for around 9.8) and # 53.3% coverage of coding regions with most conserved elements # (aiming for about 65%) # use consRun14 length = 24 cov=0.45 # Load most conserved track into database ssh hgwdev cd /cluster/data/danRer4/bed/multiz7way.2006-05-18/phastCons hgLoadBed danRer4 phastConsRBestElements mostConserved.bed # Loaded elements of size 5 featureBits danRer4 mgcGenes:cds phastConsRBestElements -enrichment # mgcGenes:cds 0.560%, phastConsRBestElements 2.458%, both 0.349%, # cover 62.23%, enrich 25.32x # Create merged posterier probability file and wiggle track data files # the sed business gets the names sorted by chromName, chromStart # so that everything goes in numerical order into wigEncode ssh kkstore04 cd /cluster/bluearc/danRer4/multiz7way.2006-05-18/phastCons/consRun14 find ./ppRaw -type f | sed -e "s#/# x #g; s#\.# y #g; s#-# z #g" \ | sort -k7,7 -k9,9n \ | sed -e "s# z #-#g; s# y #\.#g; s# x #/#g" | xargs cat \ | wigEncode stdin phastConsRBest7way.wig phastConsRBest7way.wib # Converted stdin, upper limit 1.00, lower limit 0.00 # takes a few minutes ls -l phastCons* #-rw-rw-r-- 1 hartera protein 133817339 May 24 22:48 phastConsRBest7way.wib #-rw-rw-r-- 1 hartera protein 36947021 May 24 22:48 phastConsRBest7way.wig cp -p phastConsRBest7way.wi? \ /cluster/data/danRer4/bed/multiz7way.2006-05-18/phastCons # Load gbdb and database with wiggle. 
ssh hgwdev cd /cluster/data/danRer4/bed/multiz7way.2006-05-18/phastCons mkdir -p /gbdb/danRer4/wib ln -s `pwd`/phastConsRBest7way.wib /gbdb/danRer4/wib/phastConsRBest7way.wib # use this if need to reload table hgsql -e 'drop table phastConsRBest7way;' danRer4 # load table hgLoadWiggle danRer4 phastConsRBest7way phastConsRBest7way.wig # Create histogram to get an overview of all the data ssh hgwdev cd /cluster/data/danRer4/bed/multiz7way.2006-05-04/phastCons bash time hgWiggle -doHistogram \ -hBinSize=0.001 -hBinCount=1000 -hMinVal=0.0 -verbose=2 \ -db=danRer4 phastCons7way > histogram.data 2>&1 # real 2m33.069s # user 1m58.310s # sys 0m16.170s # create plot of histogram: cat << '_EOF_' > histo.gp set terminal png small color \ x000000 xffffff xc000ff x66ff66 xffff00 x00ffff xff0000 set size 1.4, 0.8 set key left box set grid noxtics set grid ytics set title " Zebrafish danRer4 Histogram phastCons7 track" set xlabel " phastCons7 score" set ylabel " Relative Frequency" set y2label " Cumulative Relative Frequency (CRF)" set y2range [0:1] set y2tics set yrange [0:0.02] plot "histogram.data" using 2:5 title " RelFreq" with impulses, \ "histogram.data" using 2:7 axes x1y2 title " CRF" with lines '_EOF_' # happy emacs gnuplot histo.gp > histo.png display histo.png & # add line: wiggle phastCons7way to trackDb.ra for multiz7way to display the # wiggle for the conservation track. # check all.joiner for entries for phastCons7way and phastConsElements7way -ok # copy over html for multiz and edit. ########################################################################### # BACENDS TRACK (DONE, 2006-08-25, hartera) # Obtain these from the NCBI Trace archive ssh kolossus mkdir -p /san/sanvol1/scratch/danRer4/bacEnds/sequences cd /cluster/data/danRer4/bed/bacEnds/ ln -s /san/sanvol1/scratch/danRer4/bacEnds/sequences . cd sequences # go to NCBI Trace Archive # http://www.ncbi.nlm.nih.gov/Traces/trace.cgi? cat << '_EOF_' > query_tracedb #!/usr/bin/perl -w use strict; use LWP::UserAgent; use HTTP::Request::Common 'POST'; $ENV{'LANG'}='C'; $ENV{'LC_ALL'}='C'; my $query = join ' ', @ARGV; $query = 'help' if $query =~ /^(\-h|\-\-help|\-)$/; $query = join('', ) if ! $query; my $req = POST 'http://www.ncbi.nlm.nih.gov/Traces/trace.cgi?cmd=raw', [query=>$query]; my $res = LWP::UserAgent->new->request($req, sub { print $_[0] }); die "Couldn't connect to TRACE server\n" if ! $res->is_success; '_EOF_' chmod +x query_tracedb # ./query_tracedb usage # command to see the help screen with usage examples # count number of entries for zebrafish query_tracedb "query count species_code='DANIO RERIO' AND trace_type_code = 'CLONEEND'" # 473060 # 428904 (08-16-06) # Therefore this is 11 files of 40000 results each. # so get from ftp site: cat << '_EOF_' > getZfishSeqs.csh #!/bin/csh -fe foreach n (0 1 2 3 4 5 6 7 8 9 10) echo "Fetching page $n ..." (echo -n "retrieve_tgz all 0b"; query_tracedb "query page_size 40000 page_number $n binary species_code='DANIO RERIO' AND trace_type_code = 'CLONEEND'") | query_tracedb > data${n}.tgz end '_EOF_' chmod +x getZfishSeqs.csh mkdir -p downloads cp query_tracedb getZfishSeqs.csh ./downloads cd downloads nohup nice getZfishSeqs.csh >& zfishSeqs.log & # Took 5 hours 14 minutes. 
## Start: Wed May 10 09:57 Finished: 14:51 # Start: May 2 21:43 Finish: May 3 03:08 ssh kkstore04 # unzip and untar the downloads cd /cluster/data/danRer4/bed/bacEnds/sequences/downloads gunzip *.tgz cat << '_EOF_' > unTarBacs.csh #!/bin/csh -fe foreach t (0 1 2 3 4 5 6 7 8 9 10 11) tar xvf data${t}.tar end '_EOF_' chmod +x unTarBacs.csh nohup unTarBacs.csh >& unTarBacs.log & cat << '_EOF_' > catBacs.csh #!/bin/csh -fe foreach d (2006*) echo "Processing $d" nice cat ${d}/TRACEINFO.xml >> allTraceInfo.xml end '_EOF_' chmod +x catBacs.csh nice catBacs.csh >& catBacs.log & # The last archive obtained is empty so try downloading from the ftp site # to be sure to get everything. # get BAC end sequences from NCBI Trace archive ftp site: ssh kkstore04 mkdir /cluster/data/danRer4/bed/bacEnds/sequences2 mkdir /cluster/bluearc/danRer4/bacEndsDownloads cd /cluster/data/danRer4/bed/bacEnds/sequences2 ln -s /cluster/bluearc/danRer4/bacEndsDownloads cd /cluster/data/danRer4/bed/bacEnds/sequences2/bacEndsDownloads # get index page and ftp for the trace server wget --timestamping \ ftp://ftp.ncbi.nih.gov/pub/TraceDB/danio_rerio/ # grab just the ftp link for each file. grep "anc" index.html > ancillary.lst perl -pi.bak -e 's/.+[a-zA-Z]+.+/$1/' ancillary.lst rm *.bak # this contains just the ftp link for each file to get the ancillary # information files. cat << '_EOF_' > getFtpFiles.csh #!/bin/csh -fe set s=$1 foreach f (`cat "${s}"`) echo $f nice wget --timestamping $f end '_EOF_' chmod +x getFtpFiles.csh nohup nice getFtpFiles.csh ancillary.lst >& ancillary.log & # Took about 25 minutes. grep "fasta" index.html > otherFiles.lst grep "mate_pairs" index.html >> otherFiles.lst grep "xml" index.html >> otherFiles.lst perl -pi.bak -e 's/.+[a-zA-Z]+.+/$1/' otherFiles.lst rm *.bak mkdir otherFiles cd otherFiles cp ../otherFiles.lst . # then get these files by ftp nice ../getFtpFiles.csh otherFiles.lst >& otherFiles.log & # Took about 6 hours and 50 minutes. # There are 181 files as expected. foreach f (*.gz) nice gunzip $f end cd .. cat ./otherFiles/fasta* > danRerBacEnds.fa # Took about 20 minutes grep '>' danRerBacEnds.fa | wc -l # 14566448 cat ./otherFiles/xml* > danRer.xml # Took 4 hours and 40 minutes. # find out which have CLONEEND information in them cat << '_EOF_' > findCloneEnds.csh #!/bin/csh -fe foreach f (otherFiles/xml.*) echo $f >> cloneEndsXml.txt grep CLONEEND $f >> cloneEndsXml.txt end '_EOF_' chmod +x findCloneEnds.csh nice findCloneEnds.csh & # Took 1.5 hours # CLONEEND is only in xml.danio_rerio.024 and xml.danio_rerio.033 cd /cluster/data/danRer4/bed/bacEnds/sequences2/bacEndsDownloads cat otherFiles/xml.danio_rerio.024 otherFiles/xml.danio_rerio.033 \ > cloneEnds.xml # cleanup xml files rm otherFiles/xml.* # get list of libraries: grep "LIBRARY_ID" cloneEnds.xml | sort | uniq > libraries.xml.txt grep "TRACE_NAME" cloneEnds.xml | wc -l # 985980 grep "TRACE_NAME" cloneEnds.xml | sort | uniq -c > traceName.xml.count # Hard to tell which are the BAC clone end sequences. These ftp files # contain a mixture of sequences from different sources # Try downloading sequences from Sanger instead. Not all of the sequences # may have been submitted to NCBI anyway yet.
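    # (Optional, not in the original log: the same LIBRARY_ID grep with counts
    # gives a quick view of which libraries dominate the clone-end records in
    # cloneEnds.xml, useful before deciding to switch to the Sanger download:)
    grep "LIBRARY_ID" cloneEnds.xml | sort | uniq -c | sort -nr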
ssh kkstore04 cd /cluster/data/danRer4/bed/bacEnds mkdir -p /san/sanvol1/danRer4/bacEnds/ensemblSeqs ln -s /san/sanvol1/danRer4/bacEnds/ensemblSeqs cd ensemblSeqs wget --timestamping \ ftp://ftp.ensembl.org/pub/traces/danio_rerio/fasta/ # gets index.html page # get list of cloneEnd FASTA files grep cloneEnd index.html > cloneEndsFile perl -pi.bak -e 's/.+[a-zA-Z]+.+/$1/' cloneEndsFile rm *.bak foreach f (`cat cloneEndsFile`) echo $f wget --timestamping $f end # then do the same to get the trace info xml files: wget --timestamping \ ftp://ftp.ensembl.org/pub/traces/danio_rerio/traceinfo/ grep cloneEnd index.html > cloneEndsXmlFile perl -pi.bak -e 's/.+[a-zA-Z]+.+/$1/' cloneEndsXmlFile rm *.bak foreach f (`cat cloneEndsXmlFile`) echo $f wget --timestamping $f end gunzip *.gz # check for multiple occurrences of same sequence ID grep trace_name *.xml | sort | uniq -c | sort -nr > traceNames.count # top of list has count of 1 so the end names are unique. grep clone_id *.xml | sort | uniq -c | sort -nr > cloneIds.count # top of list has count of 4. All those clone IDs that appear 3 or 4 times # do so in the CHORI-1073 library - this is the fosmid library. # move CHORI-1073 out of the way mkdir fosmids mv sanger-zfish-CHORI-1073-cloneEnd* ./fosmids # FASTA files have clone end names as sequence names # concatenate the 18 fasta files cat *.fasta > Zv6BacEnds.fa grep '>' Zv6BacEnds.fa | wc -l # 694170 # Zv5 had 729101 but these were not unique reads for each sequence. faSize Zv6BacEnds.fa >& Zv6.faSize.txt # there are 31 sequence names with no sequence. awk '{print $10}' Zv6.faSize.txt > cloneEnds.noSeq # remove extra lines at end of file # list of FASTA files that they are in. grep -f cloneEnds.noSeq *.fasta > cloneEnds.noSeq.files # sent this list of sequence names and files to Kerstin Howe # at Sanger: kj2@sanger.ac.uk . Sanger said that these are just missing # sequences due to poor quality. # invalid FASTA file format # remove these from FASTA file: grep -v -f cloneEnds.noSeq Zv6BacEnds.fa > tmp.fa grep '>' tmp.fa | wc -l # 694139 mv tmp.fa Zv6BacEnds.fa faSize Zv6BacEnds.fa # 728424771 bases (11822219 N's 716602552 real 716602552 upper 0 lower) in # 694139 sequences in 1 files # Total size: mean 1049.4 sd 277.3 min 4 (zKp108D7.za) max 5403 (zC259G13.zb) # median 982 # N count: mean 17.0 sd 42.1 # U count: mean 1032.4 sd 265.3 # L count: mean 0.0 sd 0.0 # Blat these BAC ends vs the danRer4 genome assembly. Gaps between # scaffolds in the NA_random and Un_random chroms are 50,000 so # alignments of BAC ends across adjacent scaffolds are unlikely, # but alignments done separately just in case: ssh pk mkdir -p /san/sanvol1/scratch/danRer4/bacEnds/sequences cd /cluster/data/danRer4/bed/bacEnds/ensemblSeqs cp Zv6BacEnds.fa /san/sanvol1/scratch/danRer4/bacEnds/sequences mkdir -p /cluster/data/danRer4/bed/bacEnds/chromsRun cd /cluster/data/danRer4/bed/bacEnds/chromsRun ls -1S /san/sanvol1/scratch/danRer4/bacEnds/sequences/Zv6BacEnds.fa \ > bacends.lst ls -1S /san/sanvol1/scratch/danRer4/trfFa/chr[0-9M]*.fa > seqs.lst # create out dir mkdir -p /san/sanvol1/scratch/danRer4/bacEnds/chromsPsl # use Blat parameters as for mm5 and hg17 cat << '_EOF_' > template #LOOP /cluster/bin/x86_64/blat $(path1) $(path2) -ooc=/san/sanvol1/scratch/danRer4/danRer4_11.ooc {check out line+ /san/sanvol1/scratch/danRer4/bacEnds/chromsPsl/$(root1)_$(root2).psl} #ENDLOOP '_EOF_' # << this line keeps emacs coloring happy gensub2 seqs.lst bacends.lst template jobList para create jobList para try, check, push, check, ... 
# para time # Completed: 271 of 271 jobs # CPU time in finished jobs: 1063126s 17718.77m 295.31h 12.30d 0.034 y # IO & Wait Time: 2531s 42.18m 0.70h 0.03d 0.000 y # Average job time: 3932s 65.54m 1.09h 0.05d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 9404s 156.73m 2.61h 0.11d # Submission to last job: 9891s 164.85m 2.75h 0.11d # Repeat for random chroms, but use separate scaffolds: mkdir -p /cluster/data/danRer4/bed/bacEnds/randomsRun cd /cluster/data/danRer4/bed/bacEnds/randomsRun ls -1S /san/sanvol1/scratch/danRer4/bacEnds/sequences/Zv6BacEnds.fa \ > bacends.lst foreach f (/san/sanvol1/scratch/danRer4/scaffoldsSoftMask/Zv6*.fa) ls -1S $f >> seqs.lst end # create out dir mkdir -p /san/sanvol1/scratch/danRer4/bacEnds/randomsPsl # use Blat parameters as for mm5 and hg17 cat << '_EOF_' > template #LOOP /cluster/bin/x86_64/blat $(path1) $(path2) -ooc=/san/sanvol1/scratch/danRer4/danRer4_11.ooc {check out line+ /san/sanvol1/scratch/danRer4/bacEnds/randomsPsl/$(root1)_$(root2).psl} #ENDLOOP '_EOF_' # << this line keeps emacs coloring happy gensub2 seqs.lst bacends.lst template jobList para create jobList para try, check, push, check, ... # para time # Completed: 2966 of 2966 jobs # CPU time in finished jobs: 240259s 4004.31m 66.74h 2.78d 0.008 y # IO & Wait Time: 84042s 1400.71m 23.35h 0.97d 0.003 y # Average job time: 109s 1.82m 0.03h 0.00d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 997s 16.62m 0.28h 0.01d # Submission to last job: 11925s 198.75m 3.31h 0.14d # lift chrom alignments and randoms alignments and then merge and filter. ssh kolossus cd /cluster/data/danRer4/bed/bacEnds/ nice pslSort dirs rawChroms.psl tmp \ /san/sanvol1/scratch/danRer4/bacEnds/chromsPsl >& chromSort.log # Took 2 hours # very large output so do the randoms on the san cd /san/sanvol1/scratch/danRer4/bacEnds/ nice pslSort dirs rawRandoms.psl tmp randomsPsl >& randomsSort.log # Took 12 minutes # move the rawChroms.psl over to the san mv /cluster/data/danRer4/bed/bacEnds/rawChroms.psl \ /san/sanvol1/scratch/danRer4/bacEnds/ cd /san/sanvol1/scratch/danRer4/bacEnds/ # for danRer3, hg18 etc.: pslReps -nearTop=0.02 -minCover=0.60 -minAli=0.85 -noIntrons \ rawChroms.psl bacEndsChroms.psl /dev/null # Took about 1 hour. pslReps -nearTop=0.02 -minCover=0.60 -minAli=0.85 -noIntrons \ rawRandoms.psl bacEndsRandoms.psl /dev/null # Took 2 minutes. # merge files. There is a single liftOver file that works for both the # pseudocontigs and the scaffolds. # remove header for bacEndsRandoms.psl tail +6 bacEndsRandoms.psl > tmp.psl cat bacEndsChroms.psl tmp.psl > bacEndsNoLift.psl # liftUp file to chrom coordinates. liftUp bacEnds.psl \ /cluster/data/danRer4/jkStuff/liftAll.lft warn bacEndsNoLift.psl # Took 2 minutes # REPROCESS BACENDS - see section at end (2006-10-06 - 2006-10-11, hartera) # Now put together the pairs information: ssh kkstore04 cd /cluster/data/danRer4/bed/bacEnds mv /san/sanvol1/danRer4/bacEnds/bacEnds.psl . # cat together the xml files of BAC clone end information cat ensemblSeqs/*.xml > danRerBacEnds.xml # get mate-pair information from xml, forward is SP6, reverse is T7 # edit getBacInfo.pl used for canFam1 and adapt for use with zebrafish # BAC ends. Not all entries in the xml file have clone_id or trace_end # but sometimes they have trace_direction instead of trace_end. 
# correct directions:
cat << '_EOF_' > getZfishBacInfo.pl
#!/usr/bin/perl -w
use strict;
my ($file, $outFile, $name, $clone, $library, $dir);
$file = $ARGV[0];
$outFile = $ARGV[1];
open (FILE, $file) || die "Can not open $file : $!\n";
open (OUT, ">$outFile") || die "Can not create $outFile : $!\n";
open (STDERR, ">error.log") || die "Can not create error.log : $!\n";
# translation of Sanger internal sequence name prefixes to external
# library prefixes
my %cloneHash = qw { zC CH211- zK DKEY- zKp DKEYP- bZ RP71- dZ BUSM1- CHORI73_ CH73- };
$name = "";
$clone = "";
$dir = "";
# read the trace info xml one line at a time; each <trace> record holds the
# trace name, clone id, library id and the end (or direction) fields
while (<FILE>) {
    chomp;
    my $l = $_;
    if ($l =~ /<trace_name>([A-Za-z0-9\_\.]+)<\/trace_name>/) {
        $name = $1;
    }
    elsif ($l =~ /<clone_id>([A-Z0-9]+\-[0-9A-Z]+)<\/clone_id>/) {
        $clone = $1;
    }
    elsif ($l =~ /<library_id>([A-Z0-9a-z\s]+\-?[0-9A-Z]*)<\/library_id>/) {
        $library = $1;
        if ($library eq "Daniokey Pilot") {
            $library = "DKEYP";
        }
    }
    elsif ($l =~ /<trace_end>(F|R)<\/trace_end>/) {
        $dir = $1;
    }
    elsif ($l =~ /<trace_direction>(F|R)<\/trace_direction>/) {
        $dir = $1;
    }
    # find end of record and print out end information
    if ($l =~ /^\s+<\/trace>/) {
        printInfo($name, $clone, $library, $dir);
        $name = $clone = $dir = $library = "";
    }
}
close FILE;
close OUT;
close STDERR;

sub printInfo {
    my ($name, $clone, $lib, $d) = @_;
    # if no clone name read from file then create from trace name
    if ($clone eq "") {
        foreach my $c (keys(%cloneHash)) {
            if ($name =~ /$c/) {
                if (exists($cloneHash{$c})) {
                    my $prefix = $cloneHash{$c};
                    $clone = $name;
                    # change to clone name
                    $clone =~ s/$c/$prefix/;
                    # remove suffix
                    $clone =~ s/\.[a-z]+|SP6|T7//;
                }
            }
        }
    }
    # convert forward or reverse direction to T7 or SP6
    if ($d ne "") {
        if ($d eq "F") {
            $d = "T7";
        }
        elsif ($d eq "R") {
            $d = "SP6";
        }
    }
    else {
        print STDERR "No direction for $name found\n";
    }
    # print clone end information
    print OUT "$clone\t$name\t0\t$lib\t0\t$d\n";
}
'_EOF_'
# << for emacs
chmod +x getZfishBacInfo.pl
perl getZfishBacInfo.pl danRerBacEnds.xml bacEndInfo.txt
# check all the names are there
grep '>' ./ensemblSeqs/Zv6BacEnds.fa > names
perl -pi.bak -e 's/>//' names
sort names | uniq > names.sort
awk '{print $2}' bacEndInfo.txt | sort | uniq > bacEndInfo.names.sort
comm -13 bacEndInfo.names.sort names.sort
# no difference so all clone ends in the FASTA file are also
# in the xml file.
rm *.bak *.sort names
# create mate-pair information
cp /cluster/bin/scripts/convertBacEndPairInfo convertZfishBacEndInfo
# comment out line 43 as this removes the suffix after a . from the
# trace names. In this case, we need to keep those.
# line 43: ($acc, $ver) = split(/\./,$acc);
# NOTE: the wrong script was used here (an old version, without the edits
# above); the BAC ends are reprocessed in the REPROCESS BAC ENDS section
# below.
./convertZfishBacEndInfo bacEndInfo.txt
# creates pairs and singles files
# 312901 pairs and 35479 singles
# looks like pairs were made for both DKEY-32B21A and DKEY-32B21
# need to find singles that could be used in pairs.
awk '{print $2}' bacEndSingles.txt > singles.names
perl -pi.bak -e 's/(DKEY\-[0-9]+[A-Z][0-9]+)W/$1/' singles.names
perl -pi.bak -e 's/(DKEY\-[0-9]+[A-Z][0-9]+)A/$1/' singles.names
sort singles.names | uniq -c | sort -nr > singles.names.count
# 209 have 2 ends for the BAC clone.
# some are duplicates of the same end e.g. .ya and .yb but these
# have the same BAC clone name.
head -209 singles.names.count | awk '{print $2}' > singles.withPairs.names awk '{print $2}' bacEndPairs.txt > pairs.names perl -pi.bak -e 's/(DKEY\-[0-9]+[A-Z][0-9]+)W/$1/' pairs.names perl -pi.bak -e 's/(DKEY\-[0-9]+[A-Z][0-9]+)A/$1/' pairs.names mkdir -p /cluster/data/danRer4/bed/bacEnds/pairs cd /cluster/data/danRer4/bed/bacEnds/pairs set dir = /cluster/data/danRer4/bed/bacEnds # use parameters from REDO of danRer3 BAC ends /cluster/bin/x86_64/pslPairs -tInsert=10000 -minId=0.91 -noBin -min=25000 -max=350000 -slopval=10000 -hardMax=500000 -slop -short -long -orphan -mismatch -verbose $dir/bacEnds.psl $dir/bacEndPairs.txt all_bacends bacEnds wc -l * # 1714 bacEnds.long # 14889 bacEnds.mismatch # 109213 bacEnds.orphan # 105294 bacEnds.pairs # 347 bacEnds.short # 782 bacEnds.slop # create header required by "rdb" tools echo 'chr\tstart\tend\tclone\tscore\tstrand\tall\tfeatures\tstarts\tsizes' \ > ../header echo '10\t10N\t10N\t10\t10N\t10\t10\t10N\t10\t10' >> ../header # edit header to make sure \t is/become tab character cat header bacEnds.pairs | row score ge 300 | sorttbl chr start \ | headchg -del > bacEndPairs.bed # create bad BAC ends set cat header bacEnds.slop bacEnds.short bacEnds.long bacEnds.mismatch \ bacEnds.orphan | row score ge 300 | sorttbl chr start \ | headchg -del > bacEndPairsBad.bed # Also create a bad BAC ends set with no orphans since orphans are # already added to the singles track and do not want to add these orphans # twice when extracting PSL. Use this bacEndPairsBadNoOrphans.bed # file when extracting PSLs for adding to the all_bacends table. cat header bacEnds.slop bacEnds.short bacEnds.long bacEnds.mismatch \ | row score ge 300 | sorttbl chr start \ | headchg -del > bacEndPairsBadNoOrphans.bed # To create singles set: # also need to process bacEndSingles.txt into a database table # for singles in bacEndSingles.txt, create a dummy file where they # are given zJA11B12T7 as dummy sequence pair. If the single is a forward # sequence, put the dummy sequence in the second column, if the single is # a reverse sequence put in first column. use a perl script to do this. cd /cluster/data/danRer4/bed/bacends set bacDir = /cluster/data/danRer4/bed/bacEnds mkdir singles cd singles cp /cluster/data/danRer2/bed/ZonLab/bacends/singles/formatSingles.pl . perl formatSingles.pl $bacDir/bacEndSingles.txt > \ $bacDir/bacEndSingles.format # then run pslPairs on this formatted file /cluster/bin/x86_64/pslPairs -tInsert=10000 -minId=0.91 -noBin -min=25000 \ -max=350000 -slopval=10000 -hardMax=500000 -slop -short -long -orphan \ -mismatch -verbose ../bacEnds.psl $bacDir/bacEndSingles.format \ all_bacends bacEnds wc -l bacEnds.* # 0 bacEnds.long # 0 bacEnds.mismatch # 22036 bacEnds.orphan # 0 bacEnds.pairs # 0 bacEnds.short # 0 bacEnds.slop cat bacEnds.orphan ../pairs/bacEnds.orphan > bacEnds.singles wc -l bacEnds.singles # 131249 bacEnds.singles # Of these, 109213 are from pair analysis and 22036 from singles. # For danRer3: there are 11439 orphans from singles and 242235 from # pair analysis so a total of 253674 orphans so this has improved. # Although for danRer3, some of these could be replicate reads for the # same BAC clone end. # make singles bed file cat ../header bacEnds.singles | row score ge 300 | sorttbl chr start \ | headchg -del > bacEndSingles.bed # check if there are any overlapping alignments that can be removed. 
cd /cluster/data/danRer4/bed/bacEnds mkdir -p duplicates/overlapRun cd duplicates/overlapRun sort -k1,2 /cluster/data/danRer4/bed/bacEnds/pairs/bacEndPairs.bed \ > bacEndPairs.lfs wc -l *.lfs # 104546 bacEndPairs.lfs nice /cluster/bin/x86_64/lfsOverlap bacEndPairs.lfs bacEndPairs.bed \ -name -minOverlap=0.999 -notBlocks # Loaded 104546 elements of size 11 # only 5 lines removed sort -k1,2 /cluster/data/danRer4/bed/bacEnds/singles/bacEndSingles.bed \ > bacEndSingles.lfs nice /cluster/bin/x86_64/lfsOverlap bacEndSingles.lfs bacEndSingles.bed \ -name -minOverlap=0.999 -notBlocks # Loaded 125695 elements of size 11 # No lines removed. sort -k1,2 \ /cluster/data/danRer4/bed/bacEnds/pairs/bacEndPairsBadNoOrphans.bed \ > bacEndPairsBadNoOrphans.lfs wc -l *.lfs # 17611 bacEndPairsBadNoOrphans.lfs nice /cluster/bin/x86_64/lfsOverlap bacEndPairsBadNoOrphans.lfs \ bacEndPairsBadNoOrphans.bed -name -minOverlap=0.999 -notBlocks # Loaded 17611 elements of size 11 # Saving 17608 records to bacEndPairsBadNoOrphans.bed # Only 3 alignments were removed. # Therefore no point in doing using these files. Use the original bed # files for pairs and singles. No further processing of BED files is # needed as they have not been changed in any way. # Remove duplicates directory. rm -r /cluster/data/danRer4/bed/bacEnds/duplicates # use new extract program that extracts PSLs using name and position: ssh kkstore04 set bacDir=/cluster/data/danRer4/bed/bacEnds cd $bacDir/pairs nice /cluster/home/hartera/bin/x86_64/extractPslForLfs -verbose=1 \ $bacDir/bacEnds.psl bacEndPairs.bed bacPairs.psl # for this, use bacEndPairsBadNoOrphans since pairs orphans are already # included in bacEndSingles nice /cluster/home/hartera/bin/x86_64/extractPslForLfs -verbose=1 \ $bacDir/bacEnds.psl bacEndPairsBadNoOrphans.bed bacPairsBadNoOrphans.psl # then for singles cd $bacDir/singles nice /cluster/home/hartera/bin/x86_64/extractPslForLfs -verbose=1 \ $bacDir/bacEnds.psl bacEndSingles.bed bacSingles.psl cd $bacDir cat pairs/*.psl singles/bacSingles.psl > allBacends.load.psl # try old program and compare extractPslLoad -noBin bacEnds.psl pairs/bacEndPairs.bed \ pairs/bacEndPairsBadNoOrphans.bed singles/bacEndSingles.bed \ | sorttbl tname tstart | headchg -del > bacEnds.load.psl wc -l *.load.psl # 364457 allBacends.load.psl # 4568907 bacEnds.load.psl # Much reduced by using only BAC end alignments that are in BED files. # load into database ssh hgwdev cd /cluster/data/danRer4/bed/bacEnds/pairs hgLoadBed danRer4 bacEndPairs bacEndPairs.bed -notItemRgb \ -sqlTable=$HOME/kent/src/hg/lib/bacEndPairs.sql # Loaded 104546 elements of size 11 # note - this next track isn't pushed to RR, just used for assembly QA hgLoadBed danRer4 bacEndPairsBad bacEndPairsBad.bed -notItemRgb \ -sqlTable=$HOME/kent/src/hg/lib/bacEndPairsBad.sql # Loaded 121728 elements of size 11 cd /cluster/data/danRer4/bed/bacEnds/singles cp /cluster/data/danRer3/bed/bacends/singles/bacEndSingles.sql . hgLoadBed danRer4 bacEndSingles bacEndSingles.bed -notItemRgb \ -sqlTable=bacEndSingles.sql # Loaded 125695 elements of size 11 cd /cluster/data/danRer4/bed/bacEnds hgLoadPsl danRer4 -table=all_bacends allBacends.load.psl # All alignments were loaded into the table - no problems. 
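# (optional sanity check, not part of the original build) confirm that the
# loaded row counts match the bed files loaded above:
foreach t (bacEndPairs bacEndPairsBad bacEndSingles)
  echo $t
  hgsql -N -e "select count(*) from ${t};" danRer4
end
wc -l pairs/bacEndPairs.bed pairs/bacEndPairsBad.bed \
  singles/bacEndSingles.bed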
# load BAC end sequences into seq table so alignments may be viewed # symlink to FASTA sequence file in ncbi directory # move BAC ends to the ncbi directory mkdir -p /cluster/data/ncbi/bacends/zebrafish/bacends.1 # remove some files cd ensemblSeqs rm tmp clone* index.html cd /cluster/data/danRer4/bed/bacEnds mv /cluster/data/danRer4/bed/bacEnds/ensemblSeqs/* \ /cluster/data/ncbi/bacends/zebrafish/bacends.1 rm -r ensemblSeqs mkdir -p /gbdb/danRer4/bacends ln -s /cluster/data/ncbi/bacends/zebrafish/bacends.1/Zv6BacEnds.fa \ /gbdb/danRer4/bacends/Zv6BacEnds.fa hgLoadSeq danRer4 /gbdb/danRer4/bacends/Zv6BacEnds.fa # check trackDb.ra entry and description # cleanup: ssh kkstore04 cd /cluster/data/danRer4/bed/bacEnds/ rm -r sequences rm -r /san/sanvol1/scratch/danRer4/bacEnds/sequences rm -r sequences2 rm changes.txt bacEnds.load.psl *.log du -sh /cluster/data/danRer4/bed/bacEnds # 2.4G /cluster/data/danRer4/bed/bacEnds gzip *.psl *.txt danRerBacEnds.xml du -sh /cluster/data/danRer4/bed/bacEnds # 599M /cluster/data/danRer4/bed/bacEnds # (hartera, 2006-10-02) # NOTE: Some BAC clones have duplicate reads and these end in the # suffixes SP6A, T7A, SP6W and T7W. There is a corresponding read name # without the W or A suffix. The names of the BAC clones # are also suffixed with A or W for these reads. e.g There is a BAC # clone called DKEY-32M8. DKEY-32M8A is the same one sequenced with # different read ending in SP6A and T7A. The BAC ends names are # zK32M8SP6A and zK32M8T7A. # Check if there are any cases where both the version without the W or A # suffix and the version with it are in the same track: ssh hgwdev cd /cluster/data/danRer4/bed/bacEnds mkdir duplicates cd duplicates # found that there are some alignments in all_bacends where there # is SP6W, SP6A, T7W, T7A suffixes for BAC ends. These are duplicate # reads, there is a corresponding read name without the W or A suffix. # Suffix Alignments Unique Names # SP6W 179 153 # SP6A 254 245 # T7W 53 48 # T7A 247 238 hgsql -e 'select count(*) from bacEndPairs where lfNames like "%SP6A%";' \ danRer4 # 126 were found hgsql -e \ 'select count(distinct(name)) from bacEndPairs where lfNames like "%SP6A%";' \ danRer4 # 122 with distinct names hgsql -N -e \ 'select name, lfNames from bacEndPairs where lfNames like "%SP6A%";' \ danRer4 | sort > names.SP6A.txt awk '{print $1}' names.SP6A.txt | sed -e 's/A$//' > names.SP6.txt hgsql -N -e \ 'select name, lfNames from bacEndPairs where lfNames not like "%SP6A%";' \ danRer4 | sort > pairs.nameswithoutA.txt grep -w -f names.SP6.txt pairs.nameswithoutA.txt | sort | uniq \ > pairs.withAandwithout.txt # there are 23 BAC clones in the bacEndPairs table where there are # entries for both the clone names ending in A and that without the A. hgsql -N -e 'select name, lfNames from bacEndSingles where (lfNames like "%SP6A%") or (lfNames like "%SP6W%") or (lfNames like "%T7A%") or (lfNames like "%T7W%");' danRer4 | sort | uniq > singles.names.sort awk '{print $1}' singles.names.sort | sed -e 's/A$//' | sed -e 's/W$//' \ > names.SP6andT7.txt wc -l names.SP6andT7.txt # 372 names.SP6andT7.txt sort names.SP6andT7.txt | uniq > names.SP6andT7.uniq wc -l names.SP6andT7.uniq # 309 names.SP6andT7.uniq # Some may have both names ending in W and in A or could those # where the SP6 and T7 end are both present. 
hgsql -N -e 'select name, lfNames from bacEndSingles;' danRer4 \ > singles.names.txt grep -w -f names.SP6andT7.uniq singles.names.txt | sort | uniq \ > singles.withAorWandwithout.txt wc -l singles.withAorWandwithout.txt # 212 singles.withAandwithout.txt ssh kkstore04 cd /cluster/data/danRer4/bed/bacEnds/duplicates/tmp/singles # Check to see if any pairs can be made that do not have the same # suffix: A, W or without. Only for cases where there is not a pair # already. awk '{print $2}' singles.names.sort | sort | uniq > bacEnds.namesAorW.sort # also add the BAC ends for those with the same name but withour A or W awk '{print $2}' singles.withAorWandwithout.txt | sort | uniq \ > singles.withAorWandwithout.ends cat bacEnds.namesAorW.sort singles.withAorWandwithout.ends \ | sort | uniq > bacEnds.namesAorWorwithout.sort # make pairs where there is none with the same ending already. If an end # has W and/or A suffix and/or no suffix, use just one and discard others. # use a script to do this. wc -l *.txt # 93 diffSuffix.txt # 69 sameSuffix.txt # 212 singles.withAorWandwithout.txt # 92 singlesEnds.txt # changed program to do second pass using the extra ends. # 76 diffSuffix.txt # 78 extraEnds.txt # 39 extraEnds2.txt # 86 sameSuffix.txt # 92 singlesEnds.txt # /cluster/data/danRer4/bed/bacEnds/duplicates/tmp/singles/test2 # now check to see if any of the BACs represented by singles or pairs # are already in the original file created. # extraEnds2.txt are those to be removed # diffSuffix.txt, sameSuffix.txt and singlesEnds.txt should all # be checked against the entries in the bacEndPairs table since # these are sequences that already passed all the criteria for # being in the BAC end pairs track. mkdir /cluster/data/danRer4/bed/bacEnds/duplicates/remove cd /cluster/data/danRer4/bed/bacEnds/duplicates/remove cp /cluster/data/danRer4/bed/bacEnds/pairs/bacEndPairs.bed . # those that have the same suffix will have already been paired. It is # the ones that are different that should be put into the pairs file # and those that are singles should go into the singles file before # processing the BAC ends. # first remove the 23 that are duplicated in the bacEndPairs table. cp ../pairs.withAandwithout.txt . # cd /cluster/data/danRer4/bed/bacEnds/duplicates/remove awk '{print $1"A"}' pairs.withAandwithout.txt > bacsToRemove.txt # remove these from the BAC end pairs file grep -wv -f bacsToRemove.txt bacEndPairs.bed > bacEndPairsRemBacA.bed wc -l *.bed # 104546 bacEndPairs.bed # 104523 bacEndPairsRemBacA.bed # then find out if there are any BACs with more than one set of pairs # in each of the lists: sameSuffix.txt and diffSuffix.txt cp ../*Suffix.txt . # the first column has the stem of the BAC end names without the # SP6 or T7 part of the suffix. 
awk '{print $1;}' sameSuffix.txt | sort | uniq -c | sort -nr \ > sameSuff.count # no duplicates within the file awk '{print $1;}' diffSuffix.txt | sort | uniq -c | sort -nr \ > diffSuff.count # no duplicates within the file cat sameSuffix.txt diffSuffix.txt > allSuff.txt awk '{print $1;}' allSuff.txt | sort | uniq -c | sort -nr \ > allSuff.count # no duplicates between files rm *.count # then check if any of these are represented in the pairs table: # All of these BAC end names begin with zK, these are DKEY- BAC clones # translate names in column 1 to BAC clone names awk '{print $1}' allSuff.txt | sed -e 's/zK/DKEY\-/' | sort \ > allSuff.BACclones.txt grep -w -f allSuff.BACclones.txt bacEndPairsRemBacA.bed \ > newPairsDupsInPairsBed.txt # only one is found: DKEY-32B21: zK32B21T7,zK32B21SP6 awk '{print $4}' newPairsDupsInPairsBed.txt \ > newPairsDupsInPairsBed.name grep "zK32B21" *.txt # found in sameSuffix.txt so delete from this file and from allSuff.txt grep -wv "zK32B21" sameSuffix.txt > sameSuffix2.txt grep -wv "zK32B21" allSuff.txt > allSuff2.txt # in this case the zK32B21T7A alignment is much better than the # zK32B21T7 alignment, also zK32B21SP6A is better than the zK32B21SP6 # alignment therefore it should be replaced with the SP6A and T7A # versions. cp /cluster/data/danRer4/bed/bacEnds/singles/bacEndSingles.bed . grep "zK32B21" bacEndSingles.bed # then repeat this for the singles and see if any of those already # have pairs in the bacEndPairsRemBacA.bed file. cp ../singlesEnds.txt . cp ../extraEnds2.txt . # all these ends begin with "zK" so from "DKEY-" library. # get BAC end prefixes and conver to DKEY BAC clone names. awk '{print $1}' singlesEnds.txt | sed -e 's/zK/DKEY\-/' | sort \ > singles.BACclones.txt grep -w -f singles.BACclones.txt bacEndPairsRemBacA.bed \ > singlesInPairsBed.txt wc -l singlesInPairsBed.txt # 40 singlesInPairsBed.txt # get those names from the clone name in bacEndPairsRemBacA.bed awk '{print $4}' singlesInPairsBed.txt | sed -e 's/DKEY\-/zK/' \ | sort | uniq > singlesDupsInPairs.txt wc -l singlesDupsInPairs.txt # 37 singlesDupsInPairs.txt # All of these versions are in Genbank. cat newPairsDupsInPairsBed.name singlesDupsInPairs.txt \ | sed -e 's/zK/DKEY\-/' > allDupsInPairs.txt # BEST WAY FORWARD IS TO START AGAIN WITH PROCESSING THE BAC ENDS AND # PROCESS DUPLICATES AS FOR danRer3. ############################################################################## # REPROCESS BAC ENDS TO DEAL WITH DUPLICATES AND REDO BACENDS TRACKS # (2006-10-06 - 2006-10-11, hartera) # The bacEnds.psl from the first BACENDS TRACK section is used so all # processing is the same up to that point. # Now put together the pairs information: ssh kkstore04 # move old bacends dir out the way mv /cluster/data/danRer4/bed/bacEnds /cluster/data/danRer4/bed/bacEndsOld mkdir /cluster/data/danRer4/bed/bacEnds cd /cluster/data/danRer4/bed/bacEnds # mv /cluster/data/danRer4/bed/bacEndsOld/bacEnds.psl . # cat together the xml files of BAC clone end information cat ensemblSeqs/*.xml > danRerBacEnds.xml # get mate-pair information from xml, # in convertBacEndInfo, forward is T7, reverse is SP6. Use this # although before used the other way round. Arbitrary really as long # as use the same in the same library. CHORI73 library has it the opposite # way round to above. # edit getBacInfo.pl used for canFam1 and adapt for use with zebrafish # BAC ends. Not all entries in the xml file have clone_id or trace_end # but sometimes they have trace_direction instead of trace_end. 
# correct directions:
cat << '_EOF_' > getZfishBacInfo.pl
#!/usr/bin/perl -w
use strict;
my ($file, $outFile, $name, $clone, $library, $dir);
$file = $ARGV[0];
$outFile = $ARGV[1];
open (FILE, $file) || die "Can not open $file : $!\n";
open (OUT, ">$outFile") || die "Can not create $outFile : $!\n";
open (STDERR, ">error.log") || die "Can not create error.log : $!\n";
# translation of Sanger internal sequence name prefixes to external
# library prefixes
my %cloneHash = qw { zC CH211- zK DKEY- zKp DKEYP- bZ RP71- dZ BUSM1- CHORI73_ CH73- };
$name = "";
$clone = "";
$dir = "";
# read the trace info xml one line at a time; each <trace> record holds the
# trace name, clone id, library id and the end (or direction) fields
while (<FILE>) {
    chomp;
    my $l = $_;
    if ($l =~ /<trace_name>([A-Za-z0-9\_\.]+)<\/trace_name>/) {
        $name = $1;
    }
    elsif ($l =~ /<clone_id>([A-Z0-9]+\-[0-9A-Z]+)<\/clone_id>/) {
        $clone = $1;
    }
    elsif ($l =~ /<library_id>([A-Z0-9a-z\s]+\-?[0-9A-Z]*)<\/library_id>/) {
        $library = $1;
        if ($library eq "Daniokey Pilot") {
            $library = "DKEYP";
        }
    }
    elsif ($l =~ /<trace_end>(F|R)<\/trace_end>/) {
        $dir = $1;
    }
    elsif ($l =~ /<trace_direction>(F|R)<\/trace_direction>/) {
        $dir = $1;
    }
    # find end of record and print out end information
    if ($l =~ /^\s+<\/trace>/) {
        printInfo($name, $clone, $library, $dir);
        $name = $clone = $dir = $library = "";
    }
}
close FILE;
close OUT;
close STDERR;

sub printInfo {
    my ($name, $clone, $lib, $d) = @_;
    # if no clone name read from file then create from trace name
    if ($clone eq "") {
        foreach my $c (keys(%cloneHash)) {
            if ($name =~ /$c/) {
                if (exists($cloneHash{$c})) {
                    my $prefix = $cloneHash{$c};
                    $clone = $name;
                    # change to clone name
                    $clone =~ s/$c/$prefix/;
                    # remove suffix
                    $clone =~ s/\.[a-z]+|SP6|T7//;
                }
            }
        }
    }
    # convert forward or reverse direction to T7 or SP6
    if ($d ne "") {
        if ($d eq "F") {
            $d = "T7";
        }
        elsif ($d eq "R") {
            $d = "SP6";
        }
    }
    else {
        print STDERR "No direction for $name found\n";
    }
    # print clone end information
    print OUT "$clone\t$name\t0\t$lib\t0\t$d\n";
}
'_EOF_'
# << for emacs
chmod +x getZfishBacInfo.pl
perl getZfishBacInfo.pl danRerBacEnds.xml bacEndInfo.txt
# check all the names are there
grep '>' ./ensemblSeqs/Zv6BacEnds.fa > names
perl -pi.bak -e 's/>//' names
sort names | uniq > names.sort
awk '{print $2}' bacEndInfo.txt | sort | uniq > bacEndInfo.names.sort
comm -13 bacEndInfo.names.sort names.sort
# no difference so all clone ends in the FASTA file are also
# in the xml file.
rm *.bak *.sort names
# create mate-pair information
# convertBacEndPairInfo does not handle replicate end names; these need to
# go into a comma-separated list in the pairs and singles files, so edit the
# script to do this and to parse the bacEndInfo.txt file.
cp /cluster/bin/scripts/convertBacEndPairInfo convertZfishBacEndInfo
# comment out line 43 as this removes the suffix after a . from the
# trace names. In this case, we need to keep those.
# line 43: ($acc, $ver) = split(/\./,$acc); cat << 'EOF' > convertZfishBacEndInfo #!/usr/local/bin/perl # File: convertBacEndPairZfishInfo # Date: 10/2006 # Description: Converts bacends.cl_acc_gi_len_primer format file to # bacEnds.pair file used for creating BAC End Pairs tracks # Usage message if ($#ARGV < 0) { print stderr "USAGE: convertBacEndPairInfo \n"; exit(1); } $file = shift(@ARGV); open(FILE, "$file") || die("Could not open $file\n"); $pair = $single = 0; # Read in and record end info print stderr "Reading in end info\n"; while ($line = ) { chomp($line); ($clone, $acc, $gi, $center, $length, $end) = split('\t',$line); # ($acc, $ver) = split(/\./,$acc); $end =~ tr/a-z/A-Z/; $found{$clone} = 1; $clone{$acc} = $clone; $printa{$acc} = 0; $print{$clone} = 0; $end{$acc} = $end; if (&isForward($end)) { # print "Adding $acc for $clone as $end \n"; $t7{$clone} .= "$acc,"; # print "The entry for $clone is $t7{$clone}\n"; } elsif (&isReverse($end)) { $sp6{$clone} .= "$acc,"; } elsif ($end) { print stderr "End $end for $acc / $clone\n"; } } close(OUT); # Print out pairs open(OUT, ">bacEndPairs.txt"); print stderr "Writing out pair info\n"; foreach $clone (keys %found) { if ($t7{$clone} && $sp6{$clone}) { print OUT "$t7{$clone}\t$sp6{$clone}\t$clone\n"; $print{$clone} = 1; @acc = split(/\,/,$t7{$clone}); for ($i = 0; $i <= $#acc; $i++) { $printa{$acc[$i]} = 1; } @acc = split(/\,/,$sp6{$clone}); for ($i = 0; $i <= $#acc; $i++) { $printa{$acc[$i]} = 1; } $pair++; } } close(OUT); # Print out singletons print stderr "Writing out singleton info\n"; open(OUT, ">bacEndSingles.txt"); %sp6Singles; %t7Singles; foreach $acc (keys %printa) { $clone = $clone{$acc}; # if not printed already then add to a new hash for singles if (!$printa{$acc}) { if (&isForward($end{$acc})) { $t7Singles{$clone} .= "$acc,"; } elsif (&isReverse($end{$acc})) { $sp6Singles{$clone} .="$acc,"; } else { print stderr "$acc has unknown end\n"; } } } # then print out the singles: foreach $cl (keys %t7Singles) { print OUT "$t7Singles{$cl}\t$cl\tT7\n"; $single++; } foreach $cl (keys %sp6Singles) { print OUT "$sp6Singles{$cl}\t$cl\tSP6\n"; $single++; } close(OUT); print stderr "$pair pairs and $single singles\n"; sub isForward { $end = shift(@_); if (($end =~ /FORWARD/) || ($end =~ /^T7/) || ($end eq "F") || ($end eq "M13-21") || ($end eq "1") || ($end =~ /^TK/) || ($end =~ /^EC1/) || ($end =~ /^RM1/)) { return 1; } else { return 0; } } sub isReverse { if (($end =~ /REVERSE/) || ($end =~ /^SP6/) || ($end eq "R") || ($end =~ /^TJ/)) { return 1; } else { return 0; } } 'EOF' # remove all W and A suffixes from the end of bacEndInfo.txt clone names cp bacEndInfo.txt bacEndInfo2.txt perl -pi.bak -e 's/(DKEY\-[0-9]+[A-Z][0-9]+)W/$1/' bacEndInfo2.txt perl -pi.bak -e 's/(DKEY\-[0-9]+[A-Z][0-9]+)A/$1/' bacEndInfo2.txt ./convertZfishBacEndInfo bacEndInfo2.txt # creates pairs and singles files # 312850 pairs and 34935 singles mkdir -p /cluster/data/danRer4/bed/bacEnds/pairs cd /cluster/data/danRer4/bed/bacEnds/pairs set dir = /cluster/data/danRer4/bed/bacEnds # use parameters from REDO of danRer3 BAC ends /cluster/bin/x86_64/pslPairs -tInsert=10000 -minId=0.91 -noBin -min=25000 -max=350000 -slopval=10000 -hardMax=500000 -slop -short -long -orphan -mismatch -verbose $dir/bacEnds.psl $dir/bacEndPairs.txt all_bacends bacEnds wc -l * # 2724 bacEnds.long # 22959 bacEnds.mismatch # 179405 bacEnds.orphan # 156241 bacEnds.pairs # 565 bacEnds.short # 1196 bacEnds.slop # create header required by "rdb" tools echo 
'chr\tstart\tend\tclone\tscore\tstrand\tall\tfeatures\tstarts\tsizes' \ > ../header echo '10\t10N\t10N\t10\t10N\t10\t10\t10N\t10\t10' >> ../header # edit header to make sure \t is/become tab character cat ../header bacEnds.pairs | row score ge 300 | sorttbl chr start \ | headchg -del > bacEndPairs.bed # create bad BAC ends set cat ../header bacEnds.slop bacEnds.short bacEnds.long bacEnds.mismatch \ bacEnds.orphan | row score ge 300 | sorttbl chr start \ | headchg -del > bacEndPairsBad.bed # Also create a bad BAC ends set with no orphans since orphans are # already added to the singles track and do not want to add these orphans # twice when extracting PSL. Use this bacEndPairsBadNoOrphans.bed # file when extracting PSLs for adding to the all_bacends table. cat ../header bacEnds.slop bacEnds.short bacEnds.long bacEnds.mismatch \ | row score ge 300 | sorttbl chr start \ | headchg -del > bacEndPairsBadNoOrphans.bed # To create singles set: # also need to process bacEndSingles.txt into a database table # for singles in bacEndSingles.txt, create a dummy file where they # are given zJA11B12T7 as dummy sequence pair. If the single is a forward # sequence, put the dummy sequence in the second column, if the single is # a reverse sequence put in first column. use a perl script to do this. cd /cluster/data/danRer4/bed/bacEnds set bacDir = /cluster/data/danRer4/bed/bacEnds mkdir singles cd singles cp /cluster/data/danRer2/bed/ZonLab/bacends/singles/formatSingles.pl . perl formatSingles.pl $bacDir/bacEndSingles.txt > \ $bacDir/bacEndSingles.format # then run pslPairs on this formatted file /cluster/bin/x86_64/pslPairs -tInsert=10000 -minId=0.91 -noBin -min=25000 \ -max=350000 -slopval=10000 -hardMax=500000 -slop -short -long -orphan \ -mismatch -verbose ../bacEnds.psl $bacDir/bacEndSingles.format \ all_bacends bacEnds wc -l bacEnds.* # 0 bacEnds.long # 0 bacEnds.mismatch # 23398 bacEnds.orphan # 0 bacEnds.pairs # 0 bacEnds.short # 0 bacEnds.slop cat bacEnds.orphan ../pairs/bacEnds.orphan > bacEnds.singles wc -l bacEnds.singles # 202803 bacEnds.singles # Of these, 179405 are from pair analysis and 23398 from singles. # For danRer3: there are 11439 orphans from singles and 242235 from # pair analysis so a total of 253674 orphans so this has improved. # Although for danRer3, some of these could be more replicate reads for the # same BAC clone end. # make singles bed file cat ../header bacEnds.singles | row score ge 300 | sorttbl chr start \ | headchg -del > bacEndSingles.bed # check if there are any overlapping alignments that can be removed. cd /cluster/data/danRer4/bed/bacEnds mkdir -p duplicates/overlapRun cd duplicates/overlapRun sort -k1,2 /cluster/data/danRer4/bed/bacEnds/pairs/bacEndPairs.bed \ > bacEndPairs.lfs wc -l *.lfs # 154732 bacEndPairs.lfs nice /cluster/bin/x86_64/lfsOverlap bacEndPairs.lfs bacEndPairs.bed \ -name -minOverlap=0.999 -notBlocks # Loaded 154732 elements of size 11 # Took about 2.5 hours. 
wc -l bacEndPairs* # 154634 bacEndPairs.bed # 154732 bacEndPairs.lfs sort -k1,2 /cluster/data/danRer4/bed/bacEnds/singles/bacEndSingles.bed \ > bacEndSingles.lfs nice /cluster/bin/x86_64/lfsOverlap bacEndSingles.lfs bacEndSingles.bed \ -name -minOverlap=0.999 -notBlocks # Loaded 187638 elements of size 11 # Took about 4.5 hours sort -k1,2 \ /cluster/data/danRer4/bed/bacEnds/pairs/bacEndPairsBadNoOrphans.bed \ > bacEndPairsBadNoOrphans.lfs wc -l *.lfs # 27301 bacEndPairsBadNoOrphans.lfs nice /cluster/bin/x86_64/lfsOverlap bacEndPairsBadNoOrphans.lfs \ bacEndPairsBadNoOrphans.bed -name -minOverlap=0.999 -notBlocks # Loaded 27301 elements of size 11 # Took 5 minutes # check the numbers of lines are correct foreach f (bacEndPairs bacEndPairsBadNoOrphans bacEndSingles) awk 'BEGIN {OFS="\t"} {print $1,$2,$3,$4,$5}' ${f}.lfs \ | sort | uniq -c | sort -nr > ${f}.uniqCount end wc -l * # 154634 bacEndPairs.bed # 154732 bacEndPairs.lfs # 154656 bacEndPairs.uniqCount # 27282 bacEndPairsBadNoOrphans.bed # 27301 bacEndPairsBadNoOrphans.lfs # 27293 bacEndPairsBadNoOrphans.uniqCount # 187601 bacEndSingles.bed # 187638 bacEndSingles.lfs # 187624 bacEndSingles.uniqCount # different numbers for unique count since some of these alignments # were not identical but very close to identical (>0.999 overlap) rm *.uniqCount cd /cluster/data/danRer4/bed/bacEnds/duplicates mv ./overlapRun/* . rm -r overlapRun # copy perl script used for danRer3 to choose 2 BAC ends to represent # each BAC clone since there are often more than one read for each BAC end # in this set, 2 were chosen for each BAC pair or 1 for the singles. This # was based on the ones that had the largest region aligned (using lfSizes). cp /cluster/data/danRer3/bed/bacends/duplicatesNew/pickLfNamesv2.pl . # need to sort by chrom, chromStart foreach f (bacEndPairs bacEndPairsBadNoOrphans bacEndSingles) sort -k1 -k2 -k3 ${f}.bed > ${f}Sort.bed end # run perl script: input bed file, pairs or singles, name of output file perl pickLfNamesv2.pl bacEndPairsSort.bed pairs pairs2lfNames.bed mv error.log log.pairs perl pickLfNamesv2.pl bacEndSinglesSort.bed singles singles1lfName.bed mv error.log log.singles perl pickLfNamesv2.pl bacEndPairsBadNoOrphansSort.bed pairs \ badPairs2lfNames.bed mv error.log log.badPairs wc -l log* # 1 log.badPairs # 3 log.pairs # 13 log.singles # In future, could pick which set of alignments to pick based on the # Blat score computed by pslScore. # For badPairs, CH211-115F14 has 2 sets of pairs: zC115F14.zb,zC115F14.ya # has a longer region between ends than for zC115F14.za,zC115F14.ya. # so the latter was removed. # for Pairs, CH211-74D17: the alignment with zC74D17.zb,zC74D17.yb was # removed but there is also one with zC74D17.zb,zC74D17.yb to the same # region that was retained so remove this one as zC74D17.zb,zC74D17.ya # covers a longer region. # CH211-98O15 has zC98O15.ya,zC98O15.za aligning to chr3 and # zC98O15.yb,zC98O15.zb align to chr17. There is no similarity between # zC98O15.ya and zC98O15.yb by bl2seq. # CH211-98E22 has zC98E22.ya,zC98E22.za aligning to chr3 and # zC98E22.yb,zC98E22.zb aligning to chr14. zC98E22.ya and zC98E22.yb # has no similarity by bl2seq. # For singles, there are 13 will alignments to more than 1 read for a # BAC end: # CH211-66E17: remove zC66E17.za as it has more mismatches and inserts. # CH211-74O5: remove zC74O5.ya as is has more mismatches. # CH211-42B4: remove zC42B4.yb as it has a shorter alignment. Not much # difference in mismatches or inserts between this and zC42B4.ya. 
# CH211-98O3: zC98O3.yb aligns to chr13 and zC98O3.ya aligns to chr16 and # they have no similarity to each other. # CH211-89J7: remove zC89J7.zb as it has more mismatches and inserts. # CH211-97A18: remove zC97A18.yb has more mismatches and inserts. # CH211-48O20: zC48O20.zb aligns to chr22 and zC48O20.za aligns twice # to chr16. No similarity by bl2seq. # CH211-60H17: remove zC60H17.ya as it has a more mismatches. # CH211-189J23: remove zC189J23.yb as it has a large tNumInsert. # CH211-124G12: remove zC124G12.za as it has more mismatches and inserts. # CH211-60P6: remove zC60P6.ya as it has more inserts. # CH211-42A6: remove zC42A6.za as it has more inserts. # CH211-69K2: remove zC69K22.za as it has more inserts. # Reported discrepancies to Mario Caccamo at Sanger (mc2@sanger.ac.uk) # Here is his reply: # This looks like a clone swap problem where names where associated to # the wrong clones. All the examples you mention below are from # projects sequenced at Max Planck (Germany). # CH211-98O15 - the right place for this one is in chr3. This clone is # currently assigned to ctg247 in chr3. # CH211-98O3 - should go to chr14 (there is a problem in Zv6 most # probably). This clone is assigned to ctg3009. The b ends are correct. # CH211-48O20 - unfortunately this clone is not fingerprinted so I don't # have any independent information to confirm the right placement. # So for pairs, # CH211-98O15: retain zC98O15.ya,zC98O15.za aligning to chr3 # CH211-98O3: retain zC98O3.yb and zC98O3.zb (should go to chr14) # NOTE: For some singles, the lfStart does not equal the chromStart. # Also chromStart - chromEnd should equal lfSizes. # pslPairs has added min/2 to the end or subtracted min/2 from the start # depending on whether it is a left or a right BAC end and the # alignment orientation. min used here was 25000. # That is ok. This is what gives the display where the aligning block is # shown with a line with arrows on it showing the direction. ssh kkstore04 cd /cluster/data/danRer4/bed/bacEnds/duplicates # create remove lists for each set of alignments. cat << 'EOF' > pairsToRemove zC74D17.zb,zC74D17.yb zC98O15.yb,zC98O15.zb zC98E22.ya,zC98E22.za zC98E22.yb,zC98E22.zb 'EOF' cat << 'EOF' > singlesToRemove zC66E17.za zC74O5.ya zC42B4.yb zC98O3.ya zC89J7.zb zC97A18.yb zC48O20.zb zC48O20.za zC60H17.ya zC189J23.yb zC124G12.za zC60P6.ya zC42A6.za 'EOF' mv pairs2lfNames.bed pairs2lfNamesOld.bed mv singles1lfName.bed singles1lfNameOld.bed # recreate these files removing alignments for ends in lists above grep -wv -f pairsToRemove bacEndPairsSort.bed > pairs2lfNames.bed grep -wv -f singlesToRemove bacEndSinglesSort.bed > singles1lfName.bed # for each of these new bed files, checks were made that there are # only 2 BAC ends per alignments for pairs and 1 for singles. # For each pair, there should only be 2 ends which can appear either # way round depending on the orientation and there should be 1 end for # the beginning (suffix T7, t7 or z) and one end for the end # (suffix SP6, sp6 or y) for each BAC clone. These can appear as e.g. # either zK7B23T7,zK7B23SP6 or zK7B23SP6,zK7B23T7 for the opposite # orientation. For singles, there should be a single BAC end for each # alignment and for each BAC clone, a sequence for either or both types # of ends may appear e.g. zK153P14SP6 and zK153P14T7 appear in separate # alignments. e.g. 
wc -l pairs2lfNames.bed # 154632 pairs2lfNames.bed grep ',' pairs2lfNames.bed | wc -l # 154632 # should be the same number, every line should have a comma # should be twice the number of above, just 2 end names per line awk '{print $11}' pairs2lfNames.bed | sort | uniq > pairs.ends wc -l pairs.ends # 147668 pairs.ends sed -e 's/,/\n/g' pairs.ends > pairs.ends2 wc -l pairs.ends2 # 295336 pairs.ends2 # should be twice the number of above, just 2 end names per lines so # correct. perl -pi.bak -e \ 's/.+(SP6|T7|ya|za|Za|yb|zb|yc|zc|yt|yd|zd)[A-Za-z]?,?.+(SP6|T7|ya|za|Za|yb|zb|yc|zc|yt|yd|zd)[A-Za-z]?/$1,$2/g' pairs.ends sort pairs.ends | uniq > pairs.ends.uniq # check that these have the right combination of ends - one forward and # one reverse. all ok. # repeat for badPairs and singles # badPairs: wc -l badPairs2lfNames.bed # 27281 badPairs2lfNames.bed grep ',' badPairs2lfNames.bed | wc -l # 27281 # should be the same number, every line should have a comma # should be twice the number of above, just 2 end names per line awk '{print $11}' badPairs2lfNames.bed | sort | uniq > badPairs.ends wc -l badPairs.ends # 25795 badPairs.ends sed -e 's/,/\n/g' badPairs.ends > badPairs.ends2 wc -l badPairs.ends2 # 51590 badPairs.ends2 # should be twice the number of above, just 2 end names per lines so # correct. perl -pi.bak -e \ 's/.+(SP6|T7|ya|za|Za|yb|zb|yc|zc|yt|yd|zd)[A-Za-z]?,?.+(SP6|T7|ya|za|Za|yb|zb|yc|zc|yt|yd|zd)[A-Za-z]?/$1,$2/g' badPairs.ends sort badPairs.ends | uniq > badPairs.ends.uniq # check that these have the right combination of ends - one forward and # one reverse. all ok. # for singles wc -l singles1lfName.bed # 187587 singles1lfName.bed grep ',' singles1lfName.bed | wc -l # 0 # should be 0 as there should only be one BAC end name per line. awk '{print $11}' singles1lfName.bed | sort | uniq > singles.ends wc -l singles.ends # 172981 singles.ends # some singles have more than 1 alignment so appear more than once. perl -pi.bak -e \ 's/.+(SP6|T7|ya|za|Za|yb|zb|yc|zc|yt|yd|zd)[A-Za-z]?/$1/g' singles.ends sort singles.ends | uniq > singles.ends.uniq # check that these have the the right suffixes for the BAC ends. all ok. # clean up rm *.bak *.ends *.ends2 *.uniq # Finally overlaps in BAC clone names were checked. All BAC clones # represented in each of the pairs, badPairs and singles bed files are # unique to that file. Between all three bed files, 302606 BAC clones # have alignments. foreach f (pairs2lfNames.bed badPairs2lfNames.bed singles1lfName.bed) awk '{print $4}' $f | sort | uniq > ${f}.names end wc -l *.names # 25421 badPairs2lfNames.bed.names # 147501 pairs2lfNames.bed.names # 129684 singles1lfName.bed.names # 302606 total comm -12 pairs2lfNames.bed.names badPairs2lfNames.bed.names comm -12 pairs2lfNames.bed.names singles1lfName.bed.names comm -12 badPairs2lfNames.bed.names singles1lfName.bed.names # None of these files should have any BAC clone names in common and # they do not so they are ok. # NOTE: using sort and uniq on hgwdev produces tab delimited output # after merging rows with the same BAC name, the scoring is now # wrong in the bed files. # Scores should be 1000 if there is 1 row for that name, else # 1500/number of rows for that sequence name - calculated by pslPairs. # Correct the scores. 
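# For illustration only (correctScores2.pl copied from danRer3 is what is
# actually run below): the scoring rule amounts to score = 1000 when a clone
# name has a single row in the bed file, otherwise 1500/numberOfRows. A rough
# awk equivalent, reading the bed file twice (file names are hypothetical):
#   awk 'BEGIN {FS=OFS="\t"} NR==FNR {n[$4]++; next} \
#        {$5 = (n[$4] == 1) ? 1000 : 1500/n[$4]; print}' in.bed in.bed > out.bed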
ssh kkstore04 mkdir -p /cluster/data/danRer4/bed/bacEnds/scoresAndCoords cd /cluster/data/danRer4/bed/bacEnds/scoresAndCoords # copy over correctScores2.pl and checkscores.pl scripts from danRer3 and # scripts were edited so that hits file is split on space,not on tabs cp \ /cluster/data/danRer3/bed/bacends/scoresAndCoords/correctScores2.pl . cp \ /cluster/data/danRer3/bed/bacends/scoresAndCoords/checkScores.pl . awk '{print $4}' ../duplicates/pairs2lfNames.bed \ | sort | uniq -c > pairs.hits perl correctScores2.pl ../duplicates/pairs2lfNames.bed pairs.hits \ noBin > bacEndPairsGoodScores.bed # same for singles awk '{print $4}' ../duplicates/singles1lfName.bed \ | sort | uniq -c > singles.hits perl correctScores2.pl ../duplicates/singles1lfName.bed singles.hits \ noBin > bacEndSinglesGoodScores.bed # and for badPairs awk '{print $4}' ../duplicates/badPairs2lfNames.bed \ | sort | uniq -c > badPairs.hits perl correctScores2.pl ../duplicates/badPairs2lfNames.bed \ badPairs.hits noBin > bacEndPairsBadGoodScores.bed # check that the scores are now correct awk '{print $4, $5}' bacEndPairsGoodScores.bed \ | sort | uniq -c > pairs.count perl checkScores.pl < pairs.count # all the BAC clones should be in good.txt and none in bad.txt # wc -l should give same number of lines in good.txt as in pairs.hits # and therefore bad.txt should be empty. # repeat for other bed files awk '{print $4, $5}' bacEndPairsBadGoodScores.bed \ | sort | uniq -c > badPairs.count perl checkScores.pl < badPairs.count awk '{print $4, $5}' bacEndSinglesGoodScores.bed \ | sort | uniq -c > singles.count perl checkScores.pl < singles.count # for the singles, 6 ended up in bad.txt because their scores are # 214.285714285714 which is correct for 7 alignments. Rounding the score # caused the discrepancy. # round these values otherwise get a loading error when loading database: perl -pi.bak -e 's/214\.285714285714/214/' bacEndSinglesGoodScores.bed # clean up rm error.log *.txt *.count *.hits ssh hgwdev cd /cluster/data/danRer4/bed/bacEnds/scoresAndCoords # copy over table definition from danRer3 cp /cluster/data/danRer3/bed/bacends/singles/bacEndSingles.sql \ ../singles # Now load database tables: hgsql -e 'drop table bacEndPairs;' danRer4 hgLoadBed danRer4 bacEndPairs bacEndPairsGoodScores.bed \ -sqlTable=$HOME/kent/src/hg/lib/bacEndPairs.sql -notItemRgb # Loaded 154632 elements of size 11 hgsql -e 'drop table bacEndSingles;' danRer4 hgLoadBed danRer4 bacEndSingles bacEndSinglesGoodScores.bed \ -sqlTable=../singles/bacEndSingles.sql -notItemRgb # Loaded 187587 elements of size 11 hgsql -e 'drop table bacEndPairsBad;' danRer4 hgLoadBed danRer4 bacEndPairsBad bacEndPairsBadGoodScores.bed \ -sqlTable=$HOME/kent/src/hg/lib/bacEndPairsBad.sql -notItemRgb # Loaded 27281 elements of size 11 # clean up rm *.tab *.bak error.log # The Zv6 BAC end sequences are already in /gbdb/danRer4/bacends/ and # they have been loaded into the seq table - this is from the first section # on BACENDS tracks. No need to repeat this here. 
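# (optional check, not part of the original build) the seq table entries
# should point at the BAC end FASTA via the extFile table; assuming the
# standard seq/extFile schema (seq.extFile -> extFile.id, extFile.path):
#   hgsql -N -e 'select count(*) from seq s, extFile e where s.extFile = e.id and e.path like "%Zv6BacEnds.fa%";' danRer4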
# loaded BAC end sequences into seq table so alignments may be viewed # moved BAC ends to the ncbi directory previously # symlink to FASTA sequence file in ncbi directory mkdir -p /gbdb/danRer4/bacends ln -s /cluster/data/ncbi/bacends/zebrafish/bacends.1/Zv6BacEnds.fa \ /gbdb/danRer4/bacends/Zv6BacEnds.fa hgLoadSeq danRer4 /gbdb/danRer4/bacends/Zv6BacEnds.fa # use new extract program that extracts PSLs using name and position: ssh kkstore04 set bacDir=/cluster/data/danRer4/bed/bacEnds cd $bacDir/scoresAndCoords nice /cluster/home/hartera/bin/x86_64/extractPslForLfs -verbose=1 \ $bacDir/bacEnds.psl bacEndPairsGoodScores.bed bacPairs.psl # for this, use bacEndPairsGoodScores.bed which was derived from # bacEndPairsBadNoOrphans since pairs orphans are already # included in bacEndSingles nice /cluster/home/hartera/bin/x86_64/extractPslForLfs -verbose=1 \ $bacDir/bacEnds.psl bacEndPairsBadGoodScores.bed bacPairsBad.psl # then for singles nice /cluster/home/hartera/bin/x86_64/extractPslForLfs -verbose=1 \ $bacDir/bacEnds.psl bacEndSinglesGoodScores.bed bacSingles.psl cd $bacDir cat $bacDir/scoresAndCoords/*.psl > allBacends.load.psl wc -l *.load.psl # 542725 allBacends.load.psl # load PSL file into database ssh hgwdev cd /cluster/data/danRer4/bed/bacEnds/ hgsql -e 'drop table all_bacends;' danRer4 hgLoadPsl danRer4 -table=all_bacends allBacends.load.psl # All alignments were loaded into the table - no problems. # check trackDb.ra entry and modify description. # Moved the searches up to the top level zebrafish trackDb.ra file # in trackDb/zebrafish/ since the searches are common to all zebrafish # assemblies. Deleted searches from each assembly trackDb.ra. ########################################################################### # CREATE BAC CLONES ALIAS AND CROSS-REFERENCE TABLES # (bacEndAlias, bacCloneAlias and bacCloneXRef) # (DONE, 2006-09-29 - 2006-10-27, hartera) # Process data and create bacEndAlias table ssh kkstore04 # create a list of BAC end names and their accessions # Downloaded BAC ends accessions from SRS # SRS at Sanger is no longer available. # Go to http://www.ncbi.nlm.nih.gov/entrez/query.fcgi?CMD=search&DB=nucgss # This is dbGSS at NCBI: GSS is Genomic Sequence Survey # Search: Danio rerio[Organism] AND BAC # There are 159020 entries. This is the same as for the BACEndAccs.txt # for danRer3 in: /cluster/data/danRer3/bed/bacends/bacends.1 # getBacEndInfo.pl and extToIntNames.pl was used to create # BACEnd_accessions.txt. Use this from danRer3 to load table. cd /cluster/data/danRer4/bed/bacEnds cp /cluster/data/danRer3/bed/bacends/bacends.1/BACEnd_accessions.txt . grep '>' /cluster/data/ncbi/bacends/zebrafish/bacends.1/Zv6BacEnds.fa \ | sed -e 's/>//' > allBacEnds.names # copy over getBacEndInfov2.pl - this produces the bacEndAccs.aliases file cp /cluster/data/danRer3/bed/bacends/bacends.1/getBacEndInfov2.pl . 
# edit to remove section that creates pairs and singles files # and rename to getBacEndAliases.pl cat << 'EOF' > getBacEndAliases.pl #!/usr/bin/perl -w use strict; my $file = $ARGV[0]; # list of BAC end sequence read Sanger names my $file2 = $ARGV[1]; # list of BAC ends and GenBank accessions # translation for sequence prefixes from Sanger internal names to external names my %cloneHash = qw { zC CH211- ZC CH211- zK DKEY- zKp DKEYP- bZ RP71- dZ BUSM1- CHORI73_ CH73- }; # need to get bacends into pairs and singles # find duplicates also # Get and store BAC ends and accessions my %bacEnds; open (BACENDS, $file2) || die "Can not open $file2: $!"; while () { chomp; my ($be, $a) = split(/\t/); print "bac end $be and acc is $a\n"; $bacEnds{$be} = $a; } close BACENDS; my %bacs; my %bacAccs; open(FILE, $file) || die "Can not open $file: $!"; open(STDERR, ">bacs.log") || die "Can not create bacs.log: $!"; open(OUT, ">direction.txt") || die "Can not create direction.txt:$!"; open(ACCS, ">bacEndAccs.aliases") || die "Can not create bacEndsAccs.aliases: $!"; while () { chomp; my $seqName = $_; print "seqName is $seqName here\n"; $seqName =~ /^([CHORI73]*[|z|Z|b|d]?[C|K|Z|_]p?)([0-9]+[A-Z][0-9]+)\.?[pq1k]*(SP6|T7|ASP6|AT7|SP6W|T7W|y|z|Z)/; my $prefix = $1; my $rest = $2; print "prefix is $prefix, rest is $rest\n"; my $dir = $3; print STDERR "dir is $dir\n"; print OUT "$dir\n"; my $direction; # forward or reverse direction if (($dir =~ /SP6/) || ($dir =~ /T7/) ) { $direction = $dir; } # reverse direction (as in convertZfishBacEndInfo) elsif ($dir =~ /(sp6)/i || $dir =~ /y/i) { $direction = "SP6"; } # forward direction (as in convertZfishBacEndInfo) elsif ($dir =~ /(t7)/i ||$dir =~ /z/i) { $direction = "T7"; } else { print STDERR "seqName is $seqName - direction not found\n"; } print "dir is $dir and direction is $direction\n"; my $extName = ""; my $intName = $prefix.$rest; print "prefix is $prefix\n"; my $mid = ""; $mid = $rest; $mid =~ s/\-//; $mid =~ tr/a-z/A-Z/; print "after trans, mid is $mid here\n"; if ($mid =~ /^([A-Z]*)0*([0-9]+[A-Z]+)0*([0-9]+$)/) { print "matched mid $mid here\n"; my $new = $1.$2.$3; $mid = $new; print "new mid is $mid\n"; } if (exists ($cloneHash{$prefix})) { my $extPrefix = $cloneHash{$prefix}; $extName = $extPrefix.$mid; print "External name is $extName\n"; } else { $extName = ""; } # need to get duplicate clones, if switch to lower case and remove # . 
and - and use as key to bacs hash # add the internal and external name for BAC to hash my $fullName = $seqName; # my $intNameStem = $intName; my $upDir = $dir; $dir =~ tr/a-z/A-Z/; # preserve prefix and change middle part of name to upper case my $upperIntName = $prefix.$mid; my $upperFullName = $prefix.$mid.$dir; print "upper internal name is $upperIntName here\n"; # my $newFullName = ""; print "internal name is $intName, altered seq name is $upperIntName\n"; print "full name is now $upperFullName\n"; if (exists($bacEnds{$upperFullName})) { my $ac = $bacEnds{$upperFullName}; print "seq is $upperFullName; acc is $ac\n"; $bacs{$upperIntName}->{$upperFullName}->{acc} = $ac; } push (@{$bacs{$upperIntName}->{$upperFullName}->{seqs} }, $seqName); $bacs{$upperIntName}->{$upperFullName}->{extName} = $extName; $bacs{$upperIntName}->{$upperFullName}->{direction} = $direction; if (exists($bacAccs{$upperIntName}) ){ my $bacAcc = $bacAccs{$upperIntName}; print "bacacc is $bacAcc\n"; $bacs{$upperIntName}->{$upperFullName}->{bacAcc} = $bacAcc; } if (exists($bacEnds{$upperFullName} )) { my $bacEndAcc = $bacEnds{$upperFullName}; print "bacendacc is $bacEndAcc\n"; $bacs{$upperIntName}->{$upperFullName}->{bacEndAcc} = $bacEndAcc; } } close FILE; # print accessions for BacEnds with BAC end aliases my $count = 0; print "printing accessions.\n"; foreach my $a (keys(%bacs)) { print "$a is bac end from bacEnds hash\n"; foreach my $f (keys %{ $bacs{$a} } ) { if (exists($bacs{$a}->{$f}->{acc} ) ) { my $acc = $bacs{$a}->{$f}->{acc}; my @ids = @{$bacs{$a}->{$f}->{seqs} }; foreach my $i (@ids) { $count++; print ACCS "$i\t$count\t$acc\n"; } } } } 'EOF' chmod +x getBacEndAliases.pl perl getBacEndAliases.pl allBacEnds.names BACEnd_accessions.txt \ > bacEnds.log wc -l bacEndAccs.aliases # 159370 bacEndAccs.aliases # clean up rm *.log direction.txt # Only the DKEY- library clone ends have accessions in Genbank # load this alias table and accessions for clone ends ssh hgwdev cd /cluster/data/danRer4/bed/bacEnds # Carry on and process this file into the bacEndAlias table. hgLoadSqlTab danRer4 bacEndAlias ~/kent/src/hg/lib/bacEndAlias.sql \ bacEndAccs.aliases # Loaded successfully. # Get the latest versions of the clonemarkers, contig names and markers # files from Sanger: Provided by Mario Caccamo (mc2@sanger.ac.uk) # at the Sanger Institute. ssh kkstore04 mkdir -p /cluster/data/danRer4/bed/bacEnds/cloneandStsAliases cd /cluster/data/danRer4/bed/bacEnds/cloneandStsAliases # Problem with the markers file - generated incorrectly. Contacted # Sanger to ask for new markers file on 10/12/06 and new set of files # were put up for ftp on 10/26/06. Another problem with markers file # was found - there was a number in the second field instead of the # sanger sts name which is an ID beginning with "et" or "st". Notified' # Sanger and new files put out for ftp on 10/27/06. 
wget --timestamp \ ftp://ftp.sanger.ac.uk/pub/human/zebrafish/ZFIN/README wget --timestamp \ ftp://ftp.sanger.ac.uk/pub/mc2/webfpc_dump/clonemarkers.27.10.06.txt wget --timestamp \ ftp://ftp.sanger.ac.uk/pub/mc2/webfpc_dump/ctgnames.27.10.06.txt wget --timestamp \ ftp://ftp.sanger.ac.uk/pub/mc2/webfpc_dump/markers.27.10.06.txt wc -l *27.10.06.txt # 32612 clonemarkers.27.10.06.txt # 168828 ctgnames.27.10.06.txt # 12407 markers.27.10.06.txt # get list of BAC end names, lfNames foreach f (../scoresAndCoords/*.bed) echo $f awk '{print $11;}' $f >> allBacEnds.names end wc -l allBacEnds.names # 369500 allBacEnds.names # this is the total number of lines in the *.bed files perl -pi.bak -e 's/,/\n/g' allBacEnds.names sort allBacEnds.names | uniq > allBacEnds.names.uniq # get list of BAC clone names foreach f (bacEndPairs bacEndPairsBad bacEndSingles) awk '{print $4}' \ /cluster/data/danRer4/bed/bacEnds/scoresAndCoords/${f}GoodScores.bed \ >> bacs.names end sort bacs.names | uniq > bacs.names.uniq wc -l *.uniq # 518827 allBacEnds.names.uniq # 302606 bacs.names.uniq # from psl file awk '{print $10;}' ../bacEnds.psl > bacEndsPsl.names # remove first few lines with no names tail +6 bacEndsPsl.names | sort | uniq > bacEndsPsl.names.uniq wc -l bacEndsPsl.names.uniq # 549034 bacEndsPsl.names.uniq # this is all the BAC ends that originally had alignments # Add an alias table for BAC clones # bacCloneAlias.sql is in $HOME/kent/src/hg/lib - see makeDanRer1.doc # Add a xref table to give external clone registry names, internal names # sanger name, relationship between STS and BAC clone (method of finding # STS), UniSTS ID, chromosomes(s) to which BAC clone is mapped by BLAT, # Genbank accession and STS primer sequences # bacCloneXRef.sql is in $HOME/kent/src/hg/lib - see makeDanRer1.doc set dir=/cluster/data/danRer4/bed/bacEnds/ awk 'BEGIN {OFS="\t"}{print $4, $1}' \ $dir/scoresAndCoords/bacEndPairsGoodScores.bed > bacClones.namesandchrom awk 'BEGIN {OFS="\t"}{print $4, $1}' \ $dir/scoresAndCoords/bacEndSinglesGoodScores.bed >> bacClones.namesandchrom sort bacClones.namesandchrom | uniq > bacClones.namesandchrom.uniq wc -l bacClones.namesandchrom.uniq # 306079 bacClones.namesandchrom.uniq # so created a list of names and chroms for BAC clones only in pairs # and singles, exclude bad Pairs since this track is not shown on RR. # use a list of internal names,Genbank accessions, and BAC clone names # use BACClonesIdsandAccs.txt. 
# get list of UniSTS IDs using aliases to search alias file # print Sanger name, alias and UniSTS ID, use find_markers3.pl cat << '_EOF_' > find_markers3.pl # example: # perl find_markers.pl UniSTS.aliases markers.02.12.04.txt use strict; my $verbose = 0; my ($a, $b, $f, $m, $s, $t, $aliases, @alias, @rest); my $aliasFile = $ARGV[0]; my $markersFile = $ARGV[1]; open(ALIAS, $aliasFile) || die "Can not open $aliasFile\n"; open(MARKERS, $markersFile) || die "Can not open $markersFile\n"; # store aliases from aliasFile my ($id, $al, @alsArray, %aliasHash); while () { chomp; ($id, $al) = split /\t/; @alsArray = split(/;/, $al); foreach my $as (@alsArray) { push (@{$aliasHash{$as} }, $id); } } close ALIAS; while () { my @idArray; ($f, $t, $m, $idArray[0]) = 0; my @ids; chomp; ($a, $b, $aliases, @rest) = split /\|/; if ($verbose > 3) { printf "aliases $aliases \n"; } @alias = split /;/, $aliases; ALIAS: foreach $s (@alias) { if ($s =~ /[\D]+/) { if ($verbose > 5) { printf "this $s \n"; } if (exists($aliasHash{$s})) { @idArray = @{$aliasHash{$s}}; } if ($idArray[0]) { $f = 1; $t = $s; @ids = @idArray; if ($verbose) { printf "this $s found $m \n"; } last ALIAS; } } } if ($f) { my @sNames = split(/;/, $b); foreach my $sn (@sNames) { foreach my $i (@ids) { printf "$sn\t$i\n"; } } } } close MARKERS; '_EOF_' chmod +x find_markers3.pl # download latest version of UniSTS (2006-10-26) ssh kkstore02 mkdir -p /cluster/store5/sts.2006-10-26 ln -s /cluster/store5/sts.2006-10-26 /cluster/data/ncbi cd /cluster/data/ncbi/sts.2006-10-26 wget ftp://ftp.ncbi.nih.gov/blast/db/FASTA/sts.gz mkdir -p /cluster/store5/UniSTS.2006-10-26 ln -s /cluster/store5/UniSTS.2006-10-26 /cluster/data/ncbi cd /cluster/data/ncbi/UniSTS.2006-10-26 wget --timestamp ftp://ftp.ncbi.nih.gov/repository/UniSTS/UniSTS.sts wget --timestamp ftp://ftp.ncbi.nih.gov/repository/UniSTS/UniSTS.aliases wget --timestamp -r l1 \ ftp://ftp.ncbi.nih.gov/repository/UniSTS/UniSTS_MapReports/Danio_rerio/ mv /cluster/data/ncbi/UniSTS.2006-10-26/ftp.ncbi.nih.gov/repository/UniSTS/UniSTS_MapReports/Danio_rerio /cluster/data/ncbi/UniSTS.2006-10-26 rm -r /cluster/data/ncbi/UniSTS.2006-10-26/ftp.ncbi.nih.gov # then back to danRer4 BAC ends tables: ssh kkstore04 cd /cluster/data/danRer4/bed/bacEnds/cloneandStsAliases # change internal names in files to have CHORI73 instead of zH to # keep names the same as those used in the BAC end tables. perl -pi.bak -e 's/zH([0-9]+)/CHORI73_$1/' *.27.10.06.txt perl find_markers3.pl /cluster/data/ncbi/UniSTS.2006-10-26/UniSTS.aliases \ markers.27.10.06.txt > sangerandUniSTSId.txt # Need to sort and uniq this file since the UniSTS IDs are being # replicated for each instance of the sanger name in field 2 of the # markers file. In some cases the sanger name is replicated. sort sangerandUniSTSId.txt | uniq > sangerandUniSTSId.uniq # No need to reformat this for zfishBacClonesandSts # FPC contig information (i.e. FPC contig number) from ctgnames file is # not included in the tables as these are dynamic and constantly # changing with the assembly. # bacs.names.uniq has the list of BACS in this track # Get accessions for BAC clones from Genbank (as for danRer3) # go to http://www.ncbi.nlm.nih.gov # 1) select "Nucleotide" as the search database. # 2) Search string: # Danio rerio[ORGN] AND clone[TITL] NOT survey[TITL] # Including only those with BAC in the record seems to exclude some of the # BAC clones as well as other types of sequence so this "BAC" was not # used in the search. 
# Those sequences with "genomic survey" in the title appear to be
# BAC clone end accessions. Here, we want only BAC clone accessions.
# 3) There are 1148560 sequences (2006-10-27). Select File from the Send To
#    pulldown menu and name the file "BACClones.gbAccs.txt".
# use the script from danRer3 to parse out the clone ID and the accession:
cat << '_EOF_' > getAccsandIdsFromGb.pl
#!/usr/bin/perl -w
use strict;
my @clonePrefixes = ("CH211-", "ch211-", "DKEY-", "DKEYP-", "RP71-",
                     "BUSM1-", "CH73-", "CHORI-");
my %cloneHash = qw {
    CH211- zC
    DKEY- zK
    DKEYP- zKp
    RP71- bZ
    BUSM1- dZ
    CH73- CHORI73_
};
my $found = "FALSE";
my $acc = "";
my $id = "";
while (<STDIN>) {
    my ($l, @f, $intId, $extPref, $intPref);
    $intPref = "";
    $extPref = "";
    chomp;
    $l = $_;
    if ($l =~ /^[0-9]+:\s+([A-Z]+[0-9]{3,})/) {
        $acc = "";
        $acc = $1;
        $found = "FALSE";
    }
    elsif ($l =~ /clone/) {
        $id = "";
        # check for clone name in this line
        foreach my $p (@clonePrefixes) {
            if ($l =~ /clone:?\s?($p[0-9-A-Za-z]+)/) {
                $id = $1;
                # translate to upper case
                $id =~ tr/a-z/A-Z/;
                $extPref = $p;
                $found = "TRUE";
            }
        }
    }
    if ($found eq "TRUE") {
        if (exists($cloneHash{$extPref})) {
            $intPref = $cloneHash{$extPref};
        }
        $intId = $id;
        # translate this to internal ID
        $intId =~ s/$extPref/$intPref/;
        print "$intId\t$acc\t$id\n";
        $found = "FALSE";
    }
}
'_EOF_'
chmod +x getAccsandIdsFromGb.pl
nice perl getAccsandIdsFromGb.pl < BACClones.gbAccs.txt \
    > BACClonesIdsandAccs.txt &
# Took about 1 minute
# compare the BAC clones for which accessions were found to those
# for danRer3:
awk '{print $3}' BACClonesIdsandAccs.txt | sort | uniq \
    > clonesWithAccs.dr4
awk '{print $3}' \
    /cluster/data/danRer3/bed/bacends/bacends.1/BACClonesIdsandAccs.txt \
    | sort | uniq > clonesWithAccs.dr3
comm -13 clonesWithAccs.dr4 clonesWithAccs.dr3
# DKEY-188F22
# DKEY-30O13
# Checked these by searching for each one in the Nucleotide database
# at GenBank. DKEY-30O13 only has accessions for the
# end sequences. DKEY-188F22 has an accession for the BAC clone: AP007256.
# For some reason this was not found by the search.
# Add this to the list:
echo "zK188F22\tAP007256\tDKEY-188F22" >> BACClonesIdsandAccs.txt
# use zfishBacClonesandSts to create tab files for loading into the
# bacCloneAlias and bacCloneXRef tables
# make output directory
mkdir out
# Asked Sanger for another version of the file with the Sanger sts aliases
# instead of these numbers in the second field of the markers file
# (2006-10-26). Received new file (2006-10-27).
# Increased NUMSANGER from 5 to 40 and MAXSANGER from 50 to 60
# because there are multiple occurrences of Sanger names in the second
# field of the markers file and this can be quite a long list.
# The clonemarkers file now has 0 for relationship where before it was blank.
# Change this to blank again otherwise it is processed incorrectly.
perl -pi.bak -e 's/\|0/\|/' clonemarkers.27.10.06.txt
nice $HOME/bin/x86_64/zfishBacClonesandSts ctgnames.27.10.06.txt \
    clonemarkers.27.10.06.txt markers.27.10.06.txt \
    bacClones.namesandchrom.uniq BACClonesIdsandAccs.txt \
    sangerandUniSTSId.uniq ./out > ./out/zfishBacs.out &
# output is in the out directory so copy over
cp ./out/*.tab .
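# Optional check, not in the original build: before sorting and loading,
# confirm that each output file from zfishBacClonesandSts is consistently
# tab-delimited; every row of a given .tab file should report the same
# field count (no assumption is made here about what that count is).
foreach t (*.tab)
    echo $t
    awk 'BEGIN {FS="\t"} {print NF}' $t | sort -n | uniq -c
end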
# sort alias tab file by sangerName wc -l *.tab # 120211 bacAlias.tab # 507274 bacXRef.tab # make sure there are no replicate lines: # also sort alias tab file by sangerName sort bacAlias.tab | uniq | sort -k2 > bacAlias.sort.tab.uniq sort bacXRef.tab | uniq > bacXRef.tab.uniq wc -l bac*.tab.uniq # 58758 bacAlias.sort.tab.uniq # 353042 bacXRef.tab.uniq ssh hgwdev cd /cluster/data/danRer4/bed/bacEnds/cloneandStsAliases hgsql -e 'drop table bacCloneAlias;' danRer4 hgsql -e 'drop table bacCloneXRef;' danRer4 hgLoadSqlTab danRer4 bacCloneAlias \ $HOME/kent/src/hg/lib/bacCloneAlias.sql bacAlias.sort.tab.uniq hgLoadSqlTab danRer4 bacCloneXRef \ $HOME/kent/src/hg/lib/bacCloneXRef.sql bacXRef.tab.uniq ########################################################################### # BACENDS: TESTING OF bacCloneAlias AND bacCloneXRef TABLES # (DONE, 2006-10-27, hartera) # The following tests were carried out to check that all the data # in the bacCloneAlias and bacCloneXRef tables is correct. ssh hgwdev cd /cluster/data/danRer4/bed/bacEnds/cloneandStsAliases mkdir -p testTables cd testTables # copy scripts over from danRer3: cp /cluster/data/danRer3/bed/bacends/cloneandStsAliases/getName*.pl . cp /cluster/data/danRer3/bed/bacends/cloneandStsAliases/getSanger*.pl . cp /cluster/data/danRer3/bed/bacends/cloneandStsAliases/formatUniSts.pl . # scripts were created for danRer2 - see danRer2.txt # Check that the correct aliases are associated with their Sanger STS names awk 'BEGIN {FS="|"} {OFS="\t"} {print $2, $3;}' \ ../markers.27.10.06.txt > sNameandaliases # use script to get one Sanger name and one alias on each line perl getSangerAndAlias.pl < sNameandaliases > sNameandaliases.format sort sNameandaliases.format | uniq > sNameandaliases.sort # get Sanger names and aliases from database hgsql -N -e 'select sangerName, alias from bacCloneAlias;' danRer4 \ | sort | uniq > alias.db.sort wc -l alias.db.sort # 58758 alias.db.sort diff sNameandaliases.sort alias.db.sort # No difference between data file and data from database so ok # Check Sanger STS names correspond in bacAlias and bacCloneXRef tables # get Sanger names from alias table hgsql -N -e 'select sangerName from bacCloneAlias;' danRer4 \ | sort | uniq > sName.alias.sort wc -l sName.alias.sort # 15595 sName.alias.sort # get Sanger names from xRef table hgsql -N -e 'select sangerName from bacCloneXRef where sangerName \ is not null;' danRer4 | sort | uniq > sName.xRef.sort wc -l sName.xRef.sort # 15946 sName.xRef.sort comm -23 sName.alias.sort sName.xRef.sort # nothing unique to alias file so all Sanger names in the alias table are # also in the xRef table comm -13 sName.alias.sort sName.xRef.sort > sNamexRefNotAlias wc -l sNamexRefNotAlias # 351 sNamexRefNotAlias awk 'BEGIN {FS="|"}{print $2}' ../clonemarkers.27.10.06.txt | sort | uniq \ > clonemarkers.sNames.sort # get Sanger names from markers file awk 'BEGIN {FS="|"}{print $2}' ../markers.27.10.06.txt > markers.sNames # remove semi-colons and sort sed -e 's/;/\n/g' markers.sNames | sort | uniq > markers.sNames.sort # sanger names unique to markers file comm -13 clonemarkers.sNames.sort markers.sNames.sort # there are none comm -23 clonemarkers.sNames.sort markers.sNames.sort \ > sNames.clonemarkersOnly wc -l sNames.clonemarkersOnly # 351 sNames.clonemarkersOnly diff sNames.clonemarkersOnly sNamexRefNotAlias # No difference so all the extra Sanger Names in the xRef # table are from the clonemarkers file and these have no aliases in # the markers file so they are not in the alias table so this is all 
ok. # Check that Sanger STS names and primers are associated correctly cd /cluster/data/danRer4/bed/bacEnds/cloneandStsAliases/testTables # get sanger names and primers from markers file awk 'BEGIN {FS="|"} {OFS="\t"} {print $2, $4, $5;}' \ ../markers.27.10.06.txt > sNameandPrimers # use script to reformat and write with one Sanger name per line chmod +x getSangerandPrimers.pl perl getSangerandPrimers.pl < sNameandPrimers > sNameandPrimers.format # Need to sort and uniq due to multiple occurrences of the same # Sanger name in some lines of the markers file. sort sNameandPrimers.format | uniq > sNameandPrimers.format.sort wc -l sNameandPrim* # 12407 sNameandPrimers # 32098 sNameandPrimers.format # 15595 sNameandPrimers.format.sort # get Sanger names and primers from database hgsql -N -e \ 'select sangerName, leftPrimer, rightPrimer from bacCloneXRef \ where sangerName is not null and leftPrimer is not null and \ rightPrimer is not null;' danRer4 | sort | uniq \ > sNamesandprimers.fromdb.sort wc -l sNamesandprimers.fromdb.sort # 15595 sNamesandprimers.fromdb.sort diff sNamesandprimers.fromdb.sort sNameandPrimers.format.sort # No difference so ok. # Check that UniSTS IDs and Sanger STS names are associated correctly # get Sanger names and UniSTS IDs from the database hgsql -N -e 'select sangerName, uniStsId from bacCloneXRef where \ uniStsId is not null;' danRer4 | sort | uniq > sNameUniSTS.fromdb.sort wc -l sNameUniSTS.fromdb.sort # 5699 sNameUniSTS.fromdb.sort # Need to reformat the sNameUniSTS.fromdb.sort chmod +x formatUniSts.pl perl formatUniSts.pl < sNameUniSTS.fromdb.sort | sort \ > sNameUniSTS.fromdb.format.sort # get Sanger names from data file and see how many UniSTS IDs there are # for each name awk '{print $1}' ../sangerandUniSTSId.txt | sort | uniq -c | sort -nr \ > sangerandUniSTSId.count # the most is 160 - this is high due to replicate occurrences of sanger # STS names (sangerName) in the markers file. Replicates are removed # during processing. # 160 etID9511.14 # 132 etID8743.18 # 124 etID9682.15 # 124 etID9681.15 # 96 etID10372.18 # 84 etID8170.14 # 76 etID10495.5 # 66 etID9328.14 # 56 etID9708.3 # use uniq'd file used to create database tables. sort ../sangerandUniSTSId.uniq > sangerandUniSTSId.txt.sort diff sangerandUniSTSId.txt.sort sNameUniSTS.fromdb.format.sort # No difference between data from original file and that in database so ok # Check that chrom mappings and external BAC clone names are correct # get extNames and chroms they map to from the database hgsql -N -e 'select name, chroms from bacCloneXRef where \ chroms is not null;' danRer4 | sort | uniq \ > nameandchromsfromdb.sort # reformat nameandchromsfromdb.sort perl formatUniSts.pl < nameandchromsfromdb.sort | sort \ > nameandchromsfromdb.format.sort # compare extNames and chroms from db to those in data file cp ../bacClones.namesandchrom . sort -u bacClones.namesandchrom > bacClones.namesandchrom.uniq diff bacClones.namesandchrom.uniq nameandchromsfromdb.format.sort # no difference - all ok # Check Genbank accessions and internal BAC clone names hgsql -N -e 'select intName,genbank from bacCloneXRef where \ genbank is not null;' danRer4 | sort | uniq \ > intNamesandAccs.fromdb.sort # this should be a subset of zfish_accsMerged.txt - not all BAC clones # listed here appear in either our BAC ends tracks or the markers files. 
awk 'BEGIN {OFS="\t"} {print $1,$2}' ../BACClonesIdsandAccs.txt \ | sort -u > BACClonesIntandAccs.sort comm -23 intNamesandAccs.fromdb.sort BACClonesIntandAccs.sort # there is nothing in the database that is not in BACClonesIntandAccs.sort comm -13 intNamesandAccs.fromdb.sort BACClonesIntandAccs.sort \ > onlyinzfishAccs wc -l onlyinzfishAccs # 86 onlyinzfishAccs hgsql -N -e 'select intName from bacCloneXRef where genbank is null;' \ danRer4 | sort | uniq > intNamesNoAcc.fromdb.sort awk '{print $1;}' BACClonesIntandAccs.sort > intNames.withAccs.sort comm -12 intNamesNoAcc.fromdb.sort intNames.withAccs.sort \ > indbNoAccsandAccs.out # none of these names are common to both so all accessions from # BACClonesIdsandAccs.txt are in the database for the internal names stored # where there are accessions available. # Test Sanger STS names, internal names and external names are all correct # Test Sanger STS name and internal BAC clone names are associated correctly # get internal names and Sanger names from data file awk 'BEGIN {FS="|"} {OFS="\t"} {print $1,$2}' ../clonemarkers.27.10.06.txt \ | sort | uniq > intNameandSanger.sort hgsql -N -e 'select intName, sangerName from bacCloneXRef \ where sangerName is not null;' danRer4 \ | sort | uniq > intNameandSanger.fromdb.sort diff intNameandSanger.sort intNameandSanger.fromdb.sort # No difference between data from file and that from database so ok # Check BAC clone internal name and relationship fields # get internal names and relationships from data file awk 'BEGIN {FS="|"} {OFS="\t"} {print $1,$3}' ../clonemarkers.27.10.06.txt \ | sort | uniq > intNameandRelation.sort # get internal names and relationships from database, some internal names # may have different relationships associated with each internal name # and Sanger sts name pair hgsql -N -e 'select intName, relationship from bacCloneXRef \ where relationship != 0;' danRer4 \ | sort | uniq > intNameandrelation.fromdb.sort # differences unique to database file comm -13 intNameandRelation.sort intNameandrelation.fromdb.sort \ > intNameRelation.indbonly # differences unique to data file comm -23 intNameandRelation.sort intNameandrelation.fromdb.sort \ > intNameRelation.incloneMarkersonly wc -l intNameRelation* # 5051 intNameRelation.incloneMarkersonly # 5051 intNameRelation.indbonly awk '{print $1}' intNameRelation.indbonly > intNameRelation.indbonly.names awk '{print $1}' intNameRelation.incloneMarkersonly \ > intNameRelation.incloneMarkersonly.names diff intNameRelation.indbonly.names intNameRelation.incloneMarkersonly.names # there is no difference in the internal names with relationship fields # no difference in names and the only places these should differ is that # the second column should all be 3 in the data from the database only. # this is because all the relationship entries that were blank were # in the clonemarkers file were changed to 3 when entered into the database. awk '{print $2}' intNameRelation.indbonly | sort | uniq # 3 - correct so all ok # all the differences should be that those that are blank in clonemarkers # are 3 in the database. 
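# Optional spot check, not part of the original build: look at the overall
# distribution of relationship codes actually loaded. Per the notes above
# and below, blank relationships from the clonemarkers file become 3 in the
# table and internal names with no marker at all get 0.
hgsql -N -e 'select relationship, count(*) from bacCloneXRef group by relationship;' danRer4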
# Check that those that have 0 in the database bacCloneXRef relationship
# field are not in the list from clonemarkers
# select these internal names with 0 relationship from the database
hgsql -N -e 'select intName from bacCloneXRef where relationship = 0;' \
    danRer4 | sort | uniq > intNameNoRelation.fromdb.sort
# get all the internal names from the data file
awk 'BEGIN {FS="|"} {print $1}' ../clonemarkers.27.10.06.txt \
    | sort | uniq > intNamefromCloneMarkers.sort
comm -12 intNameNoRelation.fromdb.sort intNamefromCloneMarkers.sort
# nothing in common between these two files, as expected, so there are
# no internal names in the db with 0 in the relationship field that
# appear in the clonemarkers file.
# Check that all BAC clone internal names and external names from the
# ctgnames file are in the database
# get intName and extName from the ctgnames file
awk 'BEGIN {FS="|"} {OFS="\t"} {print $2,$3}' ../ctgnames.27.10.06.txt \
    | sort | uniq > intNameandextNamefromCtgNames.sort
# get intName and extName from the database
hgsql -N -e 'select intName,name from bacCloneXRef;' danRer4 \
    | sort | uniq > intNameandextName.fromdb.sort
wc -l intNameandextName*
# 334890 intNameandextName.fromdb.sort
# 168828 intNameandextNamefromCtgNames.sort
comm -12 intNameandextName.fromdb.sort intNameandextNamefromCtgNames.sort \
    > intandextindbAndCtgNames
wc -l intandextindbAndCtgNames
# 168828 intandextindbAndCtgNames
# there are 168828 name pairs in common between the file and the database
# and this is the same number of name pairs as in the data file
diff intandextindbAndCtgNames intNameandextNamefromCtgNames.sort
# no difference between those name pairs from the data file and those that
# are common between the data file and the database, so all internal and
# external names from the ctgnames file are in the database
# get the list of extra ones from the db
comm -23 intNameandextName.fromdb.sort intNameandextNamefromCtgNames.sort \
    > intandextNamesindbNotinCtgNames
wc -l intandextNamesindbNotinCtgNames
# 166062 intandextNamesindbNotinCtgNames
# get list of internal names from the clonemarkers file
awk 'BEGIN {FS="|"} {print $1}' ../clonemarkers.27.10.06.txt | sort | uniq \
    > clonemarkers.intName.sort
wc -l clonemarkers.intName.sort
# 14460 clonemarkers.intName.sort
# compare these intNames to those from the database not in the ctgnames file
comm -12 clonemarkers.intName.sort intandextNamesindbNotinCtgNames
# none of these clonemarkers internal names are in this list so they
# must all be in the ctgnames file too. These extra internal names will be
# translations of external names found in the list of mappings of BAC clones
# to chroms.
# Check that all the BAC clone external names from the list of chromosome
# mappings and from the ctgnames file are in the database.
# get all extNames from baclones.namesandchrom.uniq and from ctgnames awk '{print $1}' ../bacClones.namesandchrom.uniq > \ extNames.ctgnamesandbacClones awk 'BEGIN {FS="|"} {print $3;}' ../ctgnames.27.10.06.txt \ >> extNames.ctgnamesandbacClones wc -l extNames.ctgnamesandbacClones # 474907 extNames.ctgnamesandbacClones sort extNames.ctgnamesandbacClones | uniq \ > extNames.ctgnamesandbacClones.sort wc -l extNames.ctgnamesandbacClones.sort # 334890 extNames.ctgnamesandbacClones.sort # get extNames from the database hgsql -N -e 'select name from bacCloneXRef;' danRer4 | sort | uniq \ > extNames.fromdb.sort wc -l extNames.fromdb.sort # 334890 extNames.fromdb.sort comm -12 extNames.fromdb.sort extNames.ctgnamesandbacClones.sort \ > extNames.fromdbandfiles wc -l extNames.fromdbandfiles # 334890 extNames.fromdbandfiles # find extNames in common from data files and database diff extNames.fromdb.sort extNames.fromdbandfiles # no difference, all extNames from files are in db # Check that all BAC clone internal names from the ctgnames and clonemarkers # files are in the database # get internal names from ctgnames and clonemarkers files awk 'BEGIN {FS="|"} {print $2;}' ../ctgnames.27.10.06.txt \ > intNames.ctgnamesandclonemarkers awk 'BEGIN {FS="|"} {print $1;}' ../clonemarkers.27.10.06.txt \ >> intNames.ctgnamesandclonemarkers wc -l intNames.ctgnamesandclonemarkers # 201440 intNames.ctgnamesandclonemarkers sort intNames.ctgnamesandclonemarkers | uniq \ > intNames.ctgnamesandclonemarkers.sort wc -l intNames.ctgnamesandclonemarkers.sort # 168828 intNames.ctgnamesandclonemarkers.sort # get internal names from database hgsql -N -e 'select intName from bacCloneXRef;' danRer4 | sort | uniq \ > intNames.fromdb.sort wc -l intNames.fromdb.sort # 334890 intNames.fromdb.sort # some of these intNames are derived from the corresponding extNames # all of the intNames from the file should be in the db comm -12 intNames.fromdb.sort intNames.ctgnamesandclonemarkers.sort \ > intNames.fromdbandfiles wc -l intNames.fromdbandfiles # 168828 intNames.fromdbandfiles comm -13 intNames.fromdbandfiles intNames.ctgnamesandclonemarkers.sort comm -23 intNames.fromdbandfiles intNames.ctgnamesandclonemarkers.sort # no difference, all intNames from files are in db # Check that all translations are correct between BAC clone # external and internal names. # write script to get the prefixes from internal and external names chmod +x getNamePrefixes.pl hgsql -N -e 'select name, intName from bacCloneXRef;' danRer4 \ | sort | uniq > extandintNames.fromdb.sort perl getNamePrefixes.pl < extandintNames.fromdb.sort \ > extandintNames.prefixes sort extandintNames.prefixes | uniq > extandintNames.prefixes.uniq # these all look good # BUSM1 dZ # CH211 zC # CH211 zc # CH73 CHORI # CT7 bP # DKEY zK # DKEY zk # DKEYP zKp # RP71 bZ # XX bY # zk is a internal name prefix for the external name prefix, DKEY-. There # is only one example where this is used (DKEY-81G7) and this in the # ctgnames file and is in the bacCloneXRef table so that is ok. # All data looks good in these tables now. ########################################################################### # SPLIT SEQUENCE FOR LIFTOVER CHAINS FROM OTHER DANRER ASSEMBLIES # (DONE, 2006-06-27, hartera) # ADD TO SAN FOR PK RUNS (DONE, 2006-05-30, hartera) ssh kkr3u00 # change script to do this and only rsync to 4,5,6,7, and 8 as # kkr1u00 and kkr2u00 are down. 
cd /cluster/data/danRer4/bed mkdir -p liftOver cd liftOver # commented out lines in local copy that makes the script abort if # kkr1u00 not used. can not connect to kkr1u00 at the moment. ~/kent/src/hg/makeDb/makeLoChain/makeLoChain-split.csh danRer4 \ /cluster/data/danRer4/nib >&! split.log & # rsync didn't work properly so do manually foreach R (4 5 6 7 8) rsync -a --progress /iscratch/i/danRer4/ kkr${R}u00:/iscratch/i/danRer4/ end ssh kk # add split10k to san for pk runs (2006-05-30, hartera) rsync -a --progress /iscratch/i/danRer4/split10k \ /san/sanvol1/scratch/danRer4/ ########################################################################### # LIFTOVER CHAINS TO DANRER3 (DONE, 2006-05-30 = 2006-05-31, hartera) # Split (using makeLoChain-split) of danRer3 is doc'ed in makeDanRer3.doc # Do what makeLoChain-split says to do next (start blat alignment) # Took too long on kk. Try pk. Scripts only run on kk so run manually. ssh pk mkdir -p /cluster/data/danRer4/bed/liftOver cd /cluster/data/danRer4/bed/liftOver cat << '_EOF_' > align.csh #!/bin/csh -fe set oldAssembly = $1 set oldNibDir = $2 set newAssembly = $3 set newSplitDir = $4 set ooc = $5 if ("$ooc" != "") then set ooc = '-ooc='$ooc endif set blatDir = /cluster/data/$oldAssembly/bed/blat.$newAssembly.`date +%Y-%m-%d` echo "Setting up blat in $blatDir" rm -fr $blatDir mkdir $blatDir cd $blatDir mkdir raw psl run cd run echo '#LOOP' > gsub echo 'blat $(path1) $(path2) {check out line+ ../raw/$(root1)_$(root2).psl} ' \ '-tileSize=11 '$ooc' -minScore=100 -minIdentity=98 -fastMap' \ >> gsub echo '#ENDLOOP' >> gsub # target ls -1S $oldNibDir/*.{nib,2bit} > old.lst # query ls -1S $newSplitDir/*.{nib,fa} > new.lst gensub2 old.lst new.lst gsub spec /parasol/bin/para create spec echo "" echo "First two lines of para spec:" head -2 spec echo "" echo "DO THIS NEXT:" echo " cd $blatDir/run" echo " para try, check, push, check, ..." echo "" exit 0 '_EOF_' # << emacs chmod +x align.csh align.csh danRer4 /san/sanvol1/scratch/danRer4/nib danRer3 \ /san/sanvol1/scratch/danRer3/split10k \ /san/sanvol1/scratch/danRer3/danRer3_11.ooc >&! align.log & # Took a few seconds. # Do what its output says to do next (start cluster job) cd /cluster/data/danRer4/bed/blat.danRer3.2006-05-30/run para try, check, push, check, ... para time >&! run.time # Completed: 784 of 784 jobs # CPU time in finished jobs: 1482693s 24711.54m 411.86h 17.16d 0.047 y # IO & Wait Time: 2873s 47.89m 0.80h 0.03d 0.000 y # Average job time: 1895s 31.58m 0.53h 0.02d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 11350s 189.17m 3.15h 0.13d # Submission to last job: 13914s 231.90m 3.87h 0.16d ssh pk cd /cluster/data/danRer4/bed/liftOver cat << '_EOF_' > lift.csh #!/bin/csh -ef set oldAssembly = $1 set newAssembly = $2 set newLiftDir = /san/sanvol1/scratch/$newAssembly/split10k set prefix = /cluster/data/$oldAssembly/bed/blat.$newAssembly set blatDir = `ls -td $prefix.20* | head -1` echo "using dir $blatDir" if ( ! 
-e $blatDir/raw ) then echo "Can't find $blatDir/raw" endif if (`ls -1 $newLiftDir/*.lft | wc -l` < 1) then echo "Can't find any .lft files in $newLiftDir" exit 1 endif cd $blatDir/raw foreach chr (`awk '{print $1;}' /cluster/data/$newAssembly/chrom.sizes`) echo $chr liftUp -pslQ ../psl/$chr.psl $newLiftDir/$chr.lft warn chr*_$chr.psl end set execDir = $0:h echo "" echo "DO THIS NEXT:" echo " ssh pk" echo " $execDir/makeLoChain-chain $oldAssembly <$oldAssembly-nibdir> $newAssembly <$newAssembly-nibdir>" echo "" exit 0 '_EOF_' # << emacs chmod +x lift.csh lift.csh danRer4 danRer3 >&! lift.log & # makeLoChain-chain can be run on pk. chain alignments makeLoChain-chain danRer4 /san/sanvol1/scratch/danRer4/nib \ danRer3 /san/sanvol1/scratch/danRer3/nib >&! chain.log & cd /cluster/data/danRer4/bed/blat.danRer3.2006-05-30/chainRun para try, check, push, check, ... para time # Completed: 28 of 28 jobs # CPU time in finished jobs: 4030s 67.16m 1.12h 0.05d 0.000 y # IO & Wait Time: 939s 15.66m 0.26h 0.01d 0.000 y # Average job time: 177s 2.96m 0.05h 0.00d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 797s 13.28m 0.22h 0.01d # Submission to last job: 953s 15.88m 0.26h 0.01d # net alignment chains ssh kkstore04 cd /cluster/data/danRer4/bed/liftOver makeLoChain-net danRer4 danRer3 >&! net.log & # load reference to over.chain into database table, # and create symlinks /gbdb and download area ssh hgwdev cd /cluster/data/danRer4/bed/liftOver makeLoChain-load danRer4 danRer3 >&! load.log & # clean up rm *.log # add md5sum.txt to include this new liftOver file cd /usr/local/apache/htdocs/goldenPath/danRer4/liftOver md5sum *.gz > md5sum.txt # copy README.txt from another liftOver directory. # test by converting a region using the "convert" link on # the browser, and comparing to blat of the same region ########################################################################### # PRODUCING GENSCAN PREDICTIONS (DONE, 2006-05-27, hartera) # Use scaffolds for random chroms to avoid getting false predictions # spanning scaffolds in chrNA_random and chrUn_random. ssh kkstore04 cd /cluster/data/danRer4 # already have a file of soft-masked scaffolds for chrNA_random and # chrUn_random. Use this to create hard-masked scaffolds FASTA file # for Genscan run. foreach c (NA_random Un_random) cd /cluster/data/danRer4/$c mkdir scaffoldsHardMask echo "Hard-masking scaffolds for $c ..." cd scaffoldsSoftMask foreach f (*.fa) maskOutFa $f hard ../scaffoldsHardMask/${f}.masked end end ssh hgwdev mkdir /cluster/data/danRer4/bed/genscan cd /cluster/data/danRer4/bed/genscan cvs co hg3rdParty/genscanlinux ssh pk cd /cluster/data/danRer4/bed/genscan # Make 3 subdirectories for genscan to put their output files in mkdir gtf pep subopt # Generate a list file, genome.list, of all the hard-masked contigs that # *do not* consist of all-N's (which would cause genscan to blow up) cp /dev/null genome.list foreach c (`cat /cluster/data/danRer4/chrom.lst`) echo $c if (($c == "NA_random") || ($c == "Un_random")) then foreach s (/cluster/data/danRer4/${c}/scaffoldsHardMask/Zv6_*.fa.masked) egrep '[ACGT]' $s > /dev/null if ($status == 0) echo $s >> genome.list end else foreach f ( `ls -1S /cluster/data/danRer4/$c/chr*_*/chr*_?{,?}.fa.masked` ) egrep '[ACGT]' $f > /dev/null if ($status == 0) echo $f >> genome.list end endif end wc -l genome.list # 3237 genome.list # Create template file, gsub, for gensub2. 
For example (3-line file): cat << '_EOF_' > gsub #LOOP /cluster/bin/x86_64/gsBig {check in line+ $(path1)} {check out line gtf/$(root1).gtf} -trans={check out line pep/$(root1).pep} -subopt={check out line subopt/$(root1).bed} -exe=hg3rdParty/genscanlinux/genscan -par=hg3rdParty/genscanlinux/HumanIso.smat -tmp=/tmp -window=2400000 #ENDLOOP '_EOF_' # << this line makes emacs coloring happy gensub2 genome.list single gsub jobList para create jobList para try, check, push, check ... etc. para time # Completed: 3236 of 3237 jobs # Crashed: 1 jobs # CPU time in finished jobs: 46601s 776.69m 12.94h 0.54d 0.001 y # IO & Wait Time: 10409s 173.48m 2.89h 0.12d 0.000 y # Average job time: 18s 0.29m 0.00h 0.00d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 363s 6.05m 0.10h 0.00d # Submission to last job: 445s 7.42m 0.12h 0.01d # If there are crashes, diagnose with "para problems" / "para crashed". # If a job crashes due to genscan running out of memory, re-run it # manually with "-window=1200000" instead of "-window=2400000". para problems > problems nice /cluster/bin/x86_64/gsBig /cluster/data/danRer4/8/chr8_5/chr8_5.fa.masked gtf/chr8_5.fa.gtf -trans=pep/chr8_5.fa.pep -subopt=subopt/chr8_5.fa.bed -exe=hg3rdParty/genscanlinux/genscan -par=hg3rdParty/genscanlinux/HumanIso.smat -tmp=/tmp -window=1200000 >& chr8_5.fa.log & # Took about 5 minutes to run # check log and then remove it rm chr8_5.fa.log ssh kkstore04 cd /cluster/data/danRer4/bed/genscan liftUp genscan.gtf ../../jkStuff/liftAll.lft warn gtf/*.gtf liftUp genscanSubopt.bed ../../jkStuff/liftAll.lft warn subopt/*.bed cat pep/*.pep > genscan.pep # Load into the database as so: ssh hgwdev cd /cluster/data/danRer4/bed/genscan ldHgGene danRer4 genscan genscan.gtf # Read 44534 transcripts in 325488 lines in 1 files # 44534 groups 28 seqs 1 sources 1 feature types # 44534 gene predictions hgPepPred danRer4 generic genscanPep genscan.pep hgLoadBed danRer4 genscanSubopt genscanSubopt.bed # Loaded 332782 elements of size 6 # compare to other assemblies: featureBits danRer4 genscan # 64448019 bases of 1626093931 (3.963%) in intersection featureBits rn4 genscan # 54781052 bases of 2571531505 (2.130%) in intersection featureBits monDom4 genscan # 45991425 bases of 3501643220 (1.313%) in intersection featureBits tetNig1 genscan # 30459626 bases of 342403326 (8.896%) in intersection featureBits -chrom=chr1 refGene genscan -enrichment # refGene 1.129%, genscan 4.195%, both 0.653%, cover 57.80%, enrich 13.78x # check CDS only featureBits -chrom=chr1 danRer4 refGene:cds genscan:cds -enrichment # refGene:cds 0.746%, genscan:cds 4.195%, both 0.631%, cover 84.52%, # enrich 20.15x ########################################################################### # BLASTZ/CHAIN/NET GALGAL3 (DONE 5/30/06 angie) ssh pk mkdir /cluster/data/danRer4/bed/blastz.galGal3.2006-05-30 cd /cluster/data/danRer4/bed/blastz.galGal3.2006-05-30 cat << '_EOF_' > DEF # zebrafish vs. 
chicken BLASTZ=/cluster/bin/penn/i386/blastz # Use same params as used for danRer1-xenTro1 (see makeXenTro1.doc) BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=6000 BLASTZ_K=2200 BLASTZ_Q=/cluster/data/blastz/HoxD55.q # TARGET: Zebrafish danRer4 SEQ1_DIR=/san/sanvol1/scratch/danRer4/danRer4.2bit SEQ1_CTGDIR=/san/sanvol1/scratch/danRer4/danRer4ChrUnNAScafs.2bit SEQ1_LIFT=/san/sanvol1/scratch/danRer4/liftNAandUnScaffoldsToChrom.lft SEQ1_LEN=/cluster/data/danRer4/chrom.sizes SEQ1_CTGLEN=/san/sanvol1/scratch/danRer4/chromsUnNAScafs.sizes SEQ1_CHUNK=50000000 SEQ1_LAP=10000 SEQ1_LIMIT=100 # QUERY: Chicken galGal3 - single chunk big enough to run while chrom SEQ2_DIR=/san/sanvol1/galGal3/nib SEQ2_LEN=/cluster/data/galGal3/chrom.sizes SEQ2_CHUNK=20000000 SEQ2_LAP=0 SEQ2_LIMIT=100 BASE=/cluster/data/danRer4/bed/blastz.galGal3.2006-05-30 '_EOF_' # << emacs doBlastzChainNet.pl -blastzOutRoot=/san/sanvol1/scratch/danRer4GalGal3 \ -bigClusterHub=pk -smallClusterHub=pk \ -chainMinScore=5000 -chainLinearGap=loose DEF \ >& do.log & tail -f do.log ln -s blastz.galGal3.2006-05-30 /cluster/data/danRer4/bed/blastz.galGal3 ########################################################################### # CREATE MICROARRAY DATA TRACK BY ADDING ZON LAB WILD TYPE MICROARRAY DATA TO # AFFY ZEBRAFISH ALIGNMENTS (DONE, 2006-06-10, hartera) # UPDATE ARRAY DATA TRACK AFTER PROCESSING ARRAY DATA DIFFERENTLY AND # RELOADING INTO hgFixed (see hgFixed.txt for details). # (DONE, 2006-10-20, hartera) # UPDATE ARRAY DATA TRACK AFTER REPROCESSING ARRAY DATA TO ANTILOG THE LOG2 # VALUES FROM NORMALISATION TO GET THE ABSOLUTE VALUES AND # RELOADING INTO hgFixed (see hgFixed.txt for details). # (DONE, 2007-01-08, hartera) # RE-ORDERED DISPLAY IN TRACK - see ZON LAB WILD TYPE MICROARRAY DATA section # in danRer3.txt make doc. (DONE, hartera, 2007-04-09) # Array data is for whole embryos of five wild type zebrafish strains. # Data is in hgFixed (see hgFixed.doc) - from Len Zon's lab at Children's # Hospital Boston. Contact: adibiase@enders.tch.harvard.edu ssh hgwdev mkdir /cluster/data/danRer4/bed/ZonLab/wtArray cd /cluster/data/danRer4/bed/ZonLab/wtArray # use AllRatio table for mapping. There are not many arrays in this # dataset so using AllRatio will allow the selection of All Arrays # from the track controls on the track description page. Also set up the # Zebrafish microarrayGroups.ra so that the Medians of replicates or # Means of replicates can also be selected for display. # Create mapped data in zebrafishZonWT.bed. rm zebrafishZonWT.bed hgsql -e 'drop table affyZonWildType;' danRer4 hgMapMicroarray zebrafishZonWT.bed hgFixed.zebrafishZonWTAllRatio \ /cluster/data/danRer4/bed/affyZebrafish/affyZebrafish.psl # Loaded 15617 rows of expression data from hgFixed.zebrafishZonWTMedian # Mapped 14952, multiply-mapped 3867, missed 0, unmapped 665 hgLoadBed danRer4 affyZonWildType zebrafishZonWT.bed # Loaded 18819 elements of size 15 # add trackDb.ra entry at trackDb/zebrafish level # look at range of scores: hgsql -N -e 'select expScores from zebrafishZonWTAllRatio;' hgFixed \ > ratioExps.out perl -pi.bak -e 's/,/\n/g' ratioExps.out sort ratioExps.out | uniq -c > ratioExps.uniq.count textHistogram -binSize=0.5 -real -maxBinCount=40 -minVal=-10 \ ratioExps.out > expRatios.hist # Most values are between -3 and +2. 
# Therefore use the following trackDb entry: # track affyZonWildType # shortLabel Wild Type Array # longLabel Zon Lab Expression data for Wild Type Zebrafish strains # group regulation # priority 80 # visibility hide # type expRatio # expScale 2.0 # expStep 0.2 # groupings affyZonWildTypeGroups # The .ra file in /usr/local/apache/cgi-bin/hgCgiData/Zebrafish # (from ~/kent/src/hg/makeDb/hgCgiData/Zebrafish in the source tree) # which is microarrayGroups.ra defines how the array data is # displayed and also grouped for the Medians and Means of Replicates. # It also defines the labels for the track controls for showing # All Arrays, Arrays Grouped By Replicate Means or # Arrays Grouped By Replicate Medians. This is in the description field. # RE-ORDERED DISPLAY IN TRACK - see danRer3.txt make doc # (hartera, 2007-04-09) # 14 somites and 15 somites should come before 36 hpf # 14-19 somites stage is 16-19h. # from hgFixed.zebrafishZonWTAllExps # for AB, 0-8 should go after 14, # for TL, 16-22 should go after 24 # for TU, 25-27 should go after 32 # re-order accordingly in the config file: # ~/kent/src/hg/makeDb/hgCgiData/Zebrafish/microarrayGroups.ra ########################################################################### # HUMAN ORTHOLOGS ADDED TO AFFY ZEBRAFISH TRACK DETAILS # (DONE, 2006-06-08, hartera) # Human orthologs were mapped to Affy Zebrafish probes by # Tony DiBiase (adibiase@enders.tch.harvard.edu) from Len Zon's group # at Children's Hospital, Boston. They map to human hg16. ssh kkstore04 mkdir -p /cluster/data/danRer4/bed/affyZebrafish/humanOrthologs cd /cluster/data/danRer4/bed/affyZebrafish/humanOrthologs sed -e 's/"//g' cumuList.gedi.2005oct12.txt > hg16Orthologs.txt awk \ 'BEGIN {FS="\t"} {OFS="\t"} {if ($2 == $1) print $1,"",""; else print;}' \ hg16Orthologs.txt > hg16Orthologs.tab # create a table definition for this set: cat << 'EOF' > orthologs.sql # Link together an item with an ortholog CREATE TABLE affyToHg16Orthologs ( name varchar(255) not null, # Item ID geneSymbol longblob not null, # Gene Symbol of ortholog description longblob not null, # Description of ortholog # Indices INDEX(name(20)), INDEX(geneSymbol(20)) ); 'EOF' # load table ssh hgwdev cd /cluster/data/danRer4/bed/affyZebrafish/humanOrthologs hgsql -e 'drop table affyToHg16Orthologs;' danRer4 hgLoadSqlTab danRer4 affyToHg16Orthologs orthologs.sql hg16Orthologs.tab # edit hgc.c to use this table on affyZebrafish details page and add # a search to use the human ortholog gene symbol in a search: # affyZebrafishHg16Ortholog, put in trackDb/zebrafish/trackDb.ra ########################################################################### # SWAP rn4 BLASTZ CHAIN/NET (DONE, 2006-06-19, hartera) # See also makeRn4.doc ssh pk cd /cluster/data/rn4/bed/blastzDanRer4.2006-06-19 # blastz parameters used in blastz alignment of danRer4 on mm8: # BLASTZ_ABRIDGE_REPEATS=0 # BLASTZ_H=2000 # BLASTZ_Y=3400 # BLASTZ_L=6000 # BLASTZ_K=2200 # BLASTZ_M=50 # BLASTZ_Q=/cluster/data/blastz/HoxD55.q nice /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \ -swap `pwd`/DEF >& swap.log & ssh hgwdev featureBits danRer4 chainRn4Link # 68978593 bases of 1626093931 (4.242%) in intersection featureBits danRer4 refGene:cds chainRn4Link -chrom=chr1 -enrichment # refGene:cds 0.746%, chainRn4Link 4.333%, both 0.564%, # cover 75.55%, enrich 17.43x featureBits danRer3 refGene:cds chainRn4Link -chrom=chr1 -enrichment # refGene:cds 0.786%, chainRn4Link 4.320%, both 0.604%, # cover 
#   76.87%, enrich 17.80x
featureBits danRer4 refGene:cds netRn4 -chrom=chr1 -enrichment
# refGene:cds 0.746%, netRn4 29.601%, both 0.623%, cover 83.49%, enrich 2.82x
featureBits danRer3 refGene:cds netRn4 -chrom=chr1 -enrichment
# refGene:cds 0.786%, netRn4 33.103%, both 0.671%, cover 85.33%, enrich 2.58x
# Add symbolic link to new swap directory
ssh kkstore04
cd /cluster/data/danRer4/bed
ln -s blastz.rn4.swap blastz.rn4
# Check README.txt for downloads.
#######################################################################
# VEGA GENES (DONE, 2006-08-14 - 2006-08-25, hartera)
# ADD DESCRIPTIONS FOR VEGA GENES (DONE, 2006-09-25 - 2006-09-26, hartera)
# Data provided by Kerstin Howe from Sanger: kj2@sanger.ac.uk
# and also Mario Caccamo: mc2@sanger.ac.uk
ssh kkstore04
mkdir /cluster/data/danRer4/bed/vega
cd /cluster/data/danRer4/bed/vega
wget --timestamping \
    ftp://ftp.sanger.ac.uk/pub/kj2/gff/vega_in_ensembl.gff
wget --timestamping \
    ftp://ftp.sanger.ac.uk/pub/kj2/ZFIN/genes_for_tom_new.txt
# checked the list of genes found in vega_in_ensembl.gff but not in
# genes_for_tom_new.txt against this file
grep -f genesWithNoInfo.txt genes_for_tom_20060725.txt
# got a list of 20 that were not in this file: genesWithNoInfo2.txt
# e-mailed Kerstin at Sanger and got the information for these 20 genes:
# moreInfo.txt
# Need to rewrite this file using tabs.
# Checked the format for VEGA genes in hg17. Includes an alternate name.
cd /cluster/data/hg17/bed/vega30
# to look at human VEGA
# vegaInfo is transcriptId, otterId, geneId, method and geneDesc
awk '{if ((($9 ~ /^ID=OTTDART/) && ($9 ~ /Parent=OTTDARG/)) || \
    (($9 ~ /^ID=OTTDART/) && ($9 ~ /Parent=OTTDART/))) print $9;}' \
    vega_in_ensembl.gff | sort | uniq > vegaIDs.txt
perl -pi.bak -e 's/ID=//' vegaIDs.txt
# list of transcript ID and corresponding gene ID for Vega
perl -pi.bak -e 's/;Parent=/\t/' vegaIDs.txt
perl -pi.bak -e 's/;Note=Only//' vegaIDs.txt
# write a script to reformat the GFF3 file to GFF format.
# Some exon and CDS items belong to more than one transcript ID so these
# lines can just be duplicated. Those items that are labelled as mRNA or
# gene can be ignored and not added to the GFF file. Some of these lines
# have an extra comment, e.g. Note="...". These will be ignored anyway as
# they are on the lines with mRNA or gene in them so they will not be in
# the final GFF file.
cat << '_EOF_' > formatGff3ToGff.pl
#!/usr/bin/perl -w
use strict;
my (%idsHash, $gffFile, $idsFile);
$gffFile = $ARGV[0];
open(GFF, $gffFile) || die "Can not open $gffFile\n";
while (<GFF>) {
    my ($line, @f, $t, @trans, $r, $chr);
    $line = $_;
    if ($line !~ /^#/) {
        @f = split(/\t/, $line);
        $chr = "chr" . $f[0];
        if (($f[2] ne "gene") && ($f[2] ne "mRNA")) {
            $f[8] =~ /Parent=(OTTDART[0-9]+[A-Z0-9,]+)/;
            $t = $1;
            @trans = split(/,/, $t);
            foreach $r (@trans) {
                print "$chr\t$f[1]\t$f[2]\t$f[3]\t$f[4]\t$f[5]\t$f[6]\t$f[7]\t$r\n";
            }
        }
    }
    else {
        # print lines beginning with "#"
        print $line;
    }
}
close GFF;
'_EOF_'
chmod +x formatGff3ToGff.pl
# Use the script to convert the GFF3 file to GFF format in order to load it
# using ldHgGene
perl formatGff3ToGff.pl vega_in_ensembl.gff > vega.gff
# then use the info file to grab those genes that are pseudogenes, get the
# transcript ID from the vegaIDs.txt file. Then grep out the pseudogenes
# to a separate file. Create an info file. Remove the .KNOWN, .NOVEL or
# .PUTATIVE suffix from the method column and add it as a separate
# confidence column.
# check the number of items on each line: there are 4 or 6.
# Some genes have more than one clone ID in a comma-separated list
# so create two files for loading into two tables.
# Found that some of the clone ID fields have comma-separated lists
# and for OTTDARG00000006367, there are 30. Therefore create two info
# tables where one is just for clone IDs.
# NOTE: in future, make sure each row of the vegaInfoZfish.txt output has
# 8 fields. The pseudogene entries are missing an entry in the
# confidence field so this should be an empty field.
cat << '_EOF_' > formatVegaInfo.pl
#!/usr/bin/perl -w
use strict;
# format Vega additional information into one file for the vegaInfoZfish table
# and another for the vegaToCloneIdZfish table which contains the
# geneId and cloneId for each gene since there are multiple clone IDs for
# some of the genes.
my ($idsFile, $infoFile, $outFile1, $outFile2, %idsHash);
$idsFile = $ARGV[0];
$infoFile = $ARGV[1];
$outFile1 = $ARGV[2];
$outFile2 = $ARGV[3];
open (IDS, $idsFile) || die "Can not open $idsFile: $!\n";
open (INFO, $infoFile) || die "Can not open $infoFile: $!\n";
open (OUT1, ">$outFile1") || die "Can not create $outFile1: $!\n";
open (OUT2, ">$outFile2") || die "Can not create $outFile2: $!\n";
open (STDERR, ">info.log") || die "Can not create info.log: $!\n";
while (<IDS>) {
    my ($line, @f);
    chomp;
    $line = $_;
    @f = split(/\t/, $line);
    $idsHash{$f[1]} = $f[0];
}
close IDS;
while (<INFO>) {
    my ($line,@fi,$id,$gene,$trans,@transIds, $tr,@clones, $c,@t, $method, $conf);
    chomp;
    $gene = "";
    $line = $_;
    @fi = split(/\t/, $line);
    $id = $gene = $fi[1];
    # get all the transcript IDs for a gene
    while (exists($idsHash{$id})) {
        $trans = $idsHash{$id};
        push(@transIds, $trans);
        $id = $trans;
    }
    # push clone IDs into an array:
    @clones = split(/,/, $fi[2]);
    @t = split(/\./, $fi[3]);
    $method = $t[0];
    if ($#t > 0) {
        $conf = $t[1];
    }
    elsif ($#t == 0) {
        $conf = "";
    }
    else {
        print STDERR "Should be 4 or 6 items per row, found $#fi \n";
    }
    foreach $tr (@transIds) {
        print OUT1 "$tr\t$fi[1]\t$fi[0]";
        if ($#fi == 5) {
            print OUT1 "\t$fi[4]\t$fi[5]\t$method\t\t$conf\n";
        }
        elsif ($#fi == 3) {
            print OUT1 "\t\t\t$method\t\t$conf\n";
        }
        # print out clone IDs for each transcript
        foreach $c (@clones) {
            print OUT2 "$tr\t$c\n";
        }
    }
    if($gene && !exists($idsHash{$gene})) {
        print STDERR "$gene\n";
    }
}
close IDS;
close INFO;
close OUT1;
close OUT2;
close STDERR;
'_EOF_'
chmod +x formatVegaInfo.pl
wc -l genes_for_tom_new.txt
# 4822 genes_for_tom_new.txt
awk '{print $2}' genes_for_tom_new.txt | sort | uniq > genesWithInfo.txt
awk '{if ($2 ~ /OTTDARG/) print $2;}' vegaIDs.txt \
    | sort | uniq > genesFromGff.txt
wc -l genesFromGff.txt
# 4947 genesFromGff.txt
comm -12 genesWithInfo.txt genesFromGff.txt | wc -l
# 4033
comm -13 genesWithInfo.txt genesFromGff.txt | wc -l
# 914
comm -13 genesWithInfo.txt genesFromGff.txt > genesWithNoInfo.txt
# sent this list to Sanger to ask about getting additional information
# for these genes.
comm -23 genesWithInfo.txt genesFromGff.txt | wc -l
# 789
# got another file from Sanger that should contain the information
# for the 914 genes missing information above:
# ftp://ftp.sanger.ac.uk/pub/kj2/ZFIN/060725/genes_for_tom_20060725.txt
# check if this contains all of the list missing before
sort genesWithNoInfo.txt | uniq > genesWithNoInfo.sort
awk '{print $2}' genes_for_tom_20060725.txt | sort | uniq > genes.txt
comm -13 genes.txt genesWithNoInfo.sort > genesWithNoInfo2.txt
# there are 20 of these. Sent these to Sanger and received
# information for these. Copied and pasted these from e-mail into
# moreInfo.txt.
Write script to reformat: addTabs.pl perl addTabs.pl < moreInfo.txt > geneInfo3.txt grep -f genesWithInfo.txt genes_for_tom_20060725.txt > tmp wc -l tmp # 4738 wc -l genesWithInfo.txt # 4822 genesWithInfo.txt # Not all of these are in genes_for_tom_20060725.txt so merge all the # info files and uniq: cat genes_for_tom_new.txt genes_for_tom_20060725.txt geneInfo3.txt \ | sort | uniq > allGeneInfo.txt awk '{print $2}' allGeneInfo.txt | sort | uniq -c | sort -nr > count # counts gene names - often occur twice but with more information in # one case than the other. Seems like newer file has most information for # each gene. grep -f genesFromGff.txt genes_for_tom_20060725.txt > info1.txt # then list genes in info1.txt comm -13 genesInInfo1.sort genesFromGff.txt > genes1 wc -l genes1 # 55 genes1 grep -f genes1 genes_for_tom_new.txt > info2.txt awk '{print $2}' info2.txt | sort | uniq > genesInInfo2.txt comm -13 genesInInfo2.sort genes1 > genes2 wc -l genes2 # 20 genes2 # genes2 is list of genes not found in either file. Should be 20 left. awk '{print $2}' geneInfo3.txt | sort | uniq > genes3 comm -12 genes2 genes3 | wc -l # 20 - so these are the same 20 that are in geneInfo3.txt # These are in geneInfo3.txt. cat all these files together cat info1.txt info2.txt geneInfo3.txt > allGeneInfo2.txt # Recreate the tab file for loading into the vegaInfoZfish table: rm vegaInfoZfish.txt # Use new version that prints out one row for each accession in field 3. perl formatVegaInfo.pl vegaIDs.txt allGeneInfo2.txt vegaInfoZfish.txt \ vegaToCloneId.txt # info.log contains genes for which are not in the gff file of VEGA # and this is empty as it should be. wc -l vegaInfoZfish.txt # 6606 vegaInfoZfish.txt wc -l vegaToCloneId.txt # 7245 vegaToCloneId.txt awk '{print $1}' vegaInfoZfish.txt | sort | uniq -c | sort -nr > out2 # transcripts only have 1 entry awk '{print $2}' vegaInfoZfish.txt | sort | uniq > infogenes.txt comm -13 infogenes.txt genesFromGff.txt # There are no genes in the GFF file that are not in vegaInfoZfish.txt # Then remake the pseudogenes track from this. # Next step is to find which transcripts are pseudogenes. grep pseudogene vegaInfoZfish.txt | sort | uniq | wc -l # There are only 51 in the info file, and all of these are in the GFF # file. Anyway, this is too sparse for a separate track, but # a subtrack could be created. # Get transcript IDs for pseudogenes. grep pseudogene vegaInfoZfish.txt | awk '{print $1}' > pseudogenes.ids grep -f pseudogenes.ids vega.gff > vegaPseudoGene.gff awk '{print $9}' vegaPseudoGene.gff |sort | uniq | wc -l # 51 grep -v -f pseudogenes.ids vega.gff > vegaGene.gff wc -l vega*ff # 98170 vega.gff # 97999 vegaGene.gff # 171 vegaPseudoGene.gff # load gff files: ssh hgwdev cd /cluster/data/danRer4/bed/vega hgsql -e 'drop table vegaGene;' danRer4 hgsql -e 'drop table vegaPseudoGene;' danRer4 ldHgGene danRer4 vegaGene vegaGene.gff # Read 6555 transcripts in 88104 lines in 1 files # 6555 groups 25 seqs 1 sources 2 feature types # 6555 gene predictions ldHgGene danRer4 vegaPseudoGene vegaPseudoGene.gff # Read 51 transcripts in 171 lines in 1 files # 51 groups 9 seqs 1 sources 1 feature types # 51 gene predictions # Then create SQL table for adding the zebrafish-specific information # Add clone_id to a separate table instead of this one. 
cat << '_EOF_' > ~/kent/src/hg/lib/vegaInfoZfish.as table vegaInfoZfish "Vega Genes track additional information" ( string transcriptId; "Vega transcript ID" string geneId; "Vega gene ID (OTTER ID)" string sangerName; "Sanger gene name" string zfinId; "ZFIN ID" string zfinSymbol; "ZFIN gene symbol" string method; "GTF method field" string geneDesc; "Vega gene description" string confidence; "Status (KNOWN, NOVEL, PUTATIVE, PREDICTED)" ) '_EOF_' cd ~/kent/src/hg/lib/ autoSql vegaInfoZfish.as vegaInfoZfish mv vegaInfoZfish.h ../inc/ # commit vegaInfoZfish{.h,.c,.as,.sql} files to CVS # add INDEX(geneId) to vegaInfoZfish.sql # Need to change geneDesc to longblob type because some descriptions # are long (2006-09-26, hartera) cd ~/kent/src/hg/lib perl -pi.bak -e 's/geneDesc varchar\(255\)/geneDesc longblob/' \ vegaInfoZfish.sql # create a second table for the cloneId accessions since there # are multiple ids for some VEGA genes. Otherwise, there would be # a comma separated list in this field or many rows repeated but just # different in the cloneId field. Associate transcript ID to clone IDs. grep ',' allGeneInfo2.txt | wc -l # 378 cat << '_EOF_' > ~/kent/src/hg/lib/vegaToCloneId.as table vegaToCloneId "Vega Genes track cloneId information" ( string transcriptId; "Vega transcript ID" string cloneId; "clone ID" ) '_EOF_' cd ~/kent/src/hg/lib/ autoSql vegaToCloneId.as vegaToCloneId # replace PRIMARY KEY(transcriptId) with Indices on geneId and cloneId: perl -pi.bak -e \ 's/PRIMARY KEY\(transcriptId\)/INDEX\(transcriptId\),\nINDEX\(cloneId\)/' \ vegaToCloneId.sql rm *.bak # mv vegaInfoZfish.h ../inc/ cd /cluster/data/danRer4/bed/vega hgsql -e 'drop table vegaInfoZfish;' danRer4 hgLoadSqlTab danRer4 vegaInfoZfish ~/kent/src/hg/lib/vegaInfoZfish.sql \ vegaInfoZfish.txt hgsql -e 'drop table vegaToCloneId;' danRer4 hgLoadSqlTab danRer4 vegaToCloneId ~/kent/src/hg/lib/vegaToCloneId.sql \ vegaToCloneId.txt # Add code to hgc.c so that this works for Zebrafish and creates the # relevant links. Add searches by vega transcript ID, ZFIN ID and # clone ID. Add a Vega zebrafish-specific description to # trackDb/zebrafish. The Pseudogenes are a subtrack of the Genes track # because it is too sparse to show as a separate track. # Added entry in zebrafish/trackDb.ra to create these tracks as subtracks of # a Vega Genes track. # track vegaGeneZfish # compositeTrack on # shortLabel Vega Genes # longLabel Vega Annotations # group genes # priority 37 # visibility hide # chromosomes chr1,chr2,chr3,chr4,chr5,chr6,chr7,chr8,chr9,chr10,chr11,chr12,chr13,chr14,chr15,chr16,chr17,chr18,chr19,chr20,chr21,chr22,chr23,chr24,chr25 # type genePred # url http://vega.sanger.ac.uk/Danio_rerio/geneview?transcript=$$ # track vegaGene # subTrack vegaGeneZfish # shortLabel Vega Genes # longLabel Vega Gene Annotations # priority 1 # color 0,100,180 # track vegaPseudoGene # subTrack vegaGeneZfish # shortLabel Vega Pseudogenes # longLabel Vega Annotated Pseudogenes # priority 2 # color 155,0,125 # ADD Descriptions for Vega Genes (2006-09-25 - 2006-09-26, hartera) # Looked into using description from BioMart for VEGA genes but easier # to get them all directly from Sanger. 
# Kerstin sent a list of descriptions: for_rachel.txt
ssh kkstore04
mkdir /cluster/data/danRer4/bed/vega/description
# copy the file here and rename it
cd /cluster/data/danRer4/bed/vega/description
mv for_rachel.txt vegaDesc.txt
# get the list of VEGA gene IDs in vegaInfoZfish
ssh hgwdev
cd /cluster/data/danRer4/bed/vega/description
hgsql -e 'select distinct(geneId) from vegaInfoZfish;' danRer4 | sort \
    > geneIdsFromInfo.sort
# get a sorted list of gene IDs from the description file:
awk '{print $2;}' vegaDesc.txt | sort | uniq > vegaDesc.ids.sort
wc -l *.sort
comm -12 geneIdsFromInfo.sort vegaDesc.ids.sort | wc
# 4892
comm -23 geneIdsFromInfo.sort vegaDesc.ids.sort > genesNoDesc.txt
# 55 with no description. Sent this list to Sanger and got the
# descriptions for these too: descriptions_for_Rachel.txt
awk '{print $1}' descriptions_for_Rachel.txt | sort | uniq \
    > geneIds.newDesc.sort
comm -12 genesNoDesc.txt geneIds.newDesc.sort | wc
# 55 gene names in common
ssh kkstore04
cd /cluster/data/danRer4/bed/vega/description
cat vegaDesc.txt descriptions_for_Rachel.txt > vegaAllDesc.txt
wc -l vegaAllDesc.txt
# 6440 vegaAllDesc.txt
# clean up
rm genesNoDesc.txt geneIds* vegaDesc.ids.sort
# Then add these descriptions to the vegaInfoZfish table
cat << 'EOF' > addDesc.pl
#!/usr/bin/perl -w
use strict;
my ($infoFile, $descFile, %descHash);
$infoFile = $ARGV[0];   # vegaInfoZfish.txt file
$descFile = $ARGV[1];   # file of descriptions
open(INFO, $infoFile) || die "Can not open $infoFile : $!\n";
open(DESC, $descFile) || die "Can not open $descFile : $!\n";
while (<DESC>) {
    my($line, @f, $id, $desc);
    chomp;
    $line = $_;
    @f = split(/\t/, $line);
    if ($#f > 0 && $f[1] =~ /^OTTDARG/) {
        $id = $f[1];
        $desc = $f[2];
    }
    elsif ($f[0] =~ /^(OTTDARG[0-9]+)\s*(.+)/) {
        # some lines have just the id and description with only a space between
        $id = $1;
        $desc = $2;
    }
    else {
        print "OTTDARG ID not found \n";
    }
    $descHash{$id} = $desc;
}
close DESC;
while (<INFO>) {
    my ($li, @fi, $de, $i, $last);
    $de = "";
    chomp;
    $li = $_;
    @fi = split(/\t/, $li);
    if ($fi[1] =~ /OTTDARG/) {
        if (exists($descHash{$fi[1]})) {
            $de = $descHash{$fi[1]};
        }
        else {
            print "There is no description for $fi[1] available.\n";
        }
    }
    $last = $#fi;
    for ($i = 0; $i <= 5; $i++ ) {
        print "$fi[$i]\t";
    }
    print "$de\t";
    if ($last == 5) {
        # the empty confidence field was dropped by the split, so add an
        # extra tab to keep 8 fields per row
        print "\t\n";
    }
    else {
        print "$fi[$last]\n";
    }
}
close INFO;
'EOF'
chmod +x addDesc.pl
# add the new descriptions to the vegaInfoZfish.txt file
perl addDesc.pl ../vegaInfoZfish.txt vegaAllDesc.txt \
    > vegaInfoZfishWithDesc.txt
# Reload the vegaInfoZfish table
ssh hgwdev
cd /cluster/data/danRer4/bed/vega/description
# 105 warnings when loading the table
# remove "\N" from the desc field
perl -pi.bak -e 's/\\N//' vegaInfoZfishWithDesc.txt
# this removed 3 warnings
# after dumping the contents of the table and diffing with the input
# file, found that the pseudogenes are missing the confidence field
# and so there is a tab missing from the file. Modified addDesc.pl to
# add the extra tab when only 7 tabbed fields instead of 8 are found
# in a row.
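# Optional check, not in the original build: as noted above, every row of
# the final info file should now have 8 tab-separated fields (the
# confidence field may be empty, but its tab must be present).
awk 'BEGIN {FS="\t"} {print NF}' vegaInfoZfishWithDesc.txt | sort -n | uniq -c
# expect a single count line with NF = 8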
hgsql -e 'drop table vegaInfoZfish;' danRer4
hgLoadSqlTab danRer4 vegaInfoZfish ~/kent/src/hg/lib/vegaInfoZfish.sql \
    vegaInfoZfishWithDesc.txt
# Try loading the GTF format file (2006-10-19)
ssh kkstore04
cd /cluster/data/danRer4/bed/vega/new
wget ftp://ftp.sanger.ac.uk/pub/is1/vega.gtf
ssh hgwdev
cd /cluster/data/danRer4/bed/vega/new
ldHgGene -bin -genePredExt danRer4 vegaNew vega.gtf
# Error: Read 6371 transcripts in 88275 lines in 1 files
# 6371 groups 25 seqs 4 sources 2 feature types
# invalid gffGroup detected on line: chr22 NOVEL exon 6782575 6783240
#   0.000000 - . gene_id "si:rp71-1i20.2"; transcript_id "si:rp71-1i20.2-001";
# GFF/GTF group si:rp71-1i20.2-001 on chr22+, this line is on chr22-, all group
# members must be on same seq and strand
# transcript_id is not unique. otter_transcript_id is unique so switch these.
cp vega.gtf vegaNew.gtf
# ldHgGene groups by transcript_id so use the OTTER IDs instead
perl -pi.bak -e 's/transcript_id/other_transcript_id/' vegaNew.gtf
perl -pi.bak -e 's/otter_transcript_id/transcript_id/' vegaNew.gtf
ldHgGene -bin -genePredExt danRer4 vegaNew vegaNew.gtf
# worked ok
# Added this as a vegaGeneNew subtrack for Vega Genes
ssh kkstore04
cd /cluster/data/danRer4/bed/vega/new
# find genes that have the same transcript IDs for different OTTER gene_ids
awk 'BEGIN {FS="\t"} {print $9}' vega.gtf > vegaAttributes
awk 'BEGIN {FS=";"} {print $2, $5}' vegaAttributes \
    > vegaAttrib.transIdandotterId
sort vegaAttrib.transIdandotterId | uniq \
    > vegaAttrib.transIdandotterId.uniq
awk '{print $2}' vegaAttrib.transIdandotterId.uniq | sed -e 's/\s//' \
    | sort | uniq -c | sort -nr > vegaAttrib.transId.count
# 88 of these transcripts have more than one entry in the gtf file. Need
# to check if they have different OTTER gene ids in each case.
head -88 vegaAttrib.transId.count | awk '{print $2}' > transIds.morethan1
grep -w -f transIds.morethan1 vegaAttrib.transIdandotterId.uniq \
    > transIdswithDiffOtterGeneIds.txt
awk '{print $2}' transIdswithDiffOtterGeneIds.txt | sort | uniq \
    > transIds.diffOtterGeneIds.txt
# sent transIdswithDiffOtterGeneIds.txt to Kerstin at Sanger: a list
# of transcript IDs with different instances of OTTER gene ids.
# WAITING NOW FOR VEGA GENE UPDATE (2006-10-19)
# Received e-mail from Ian Sealy at Sanger (is1@sanger.ac.uk) that the
# Vega gene update is ready in gtf format (2006-11-02)
ssh kkstore04
cd /cluster/data/danRer4/bed/vega
mkdir update
cd update
wget --timestamping ftp://ftp.sanger.ac.uk/pub/is1/vega.gtf
ssh hgwdev
cd /cluster/data/danRer4/bed/vega/update
ldHgGene -bin -genePredExt danRer4 vegaUpdate vega.gtf
# Read 6823 transcripts in 93253 lines in 1 files
# 6823 groups 25 seqs 4 sources 2 feature types
# invalid gffGroup detected on line: chr22 PUTATIVE exon 6790927 6791256
#   0.000000 - . gene_id "si:rp71-1i20.2"; transcript_id "RP71-1I20.1-001";
# GFF/GTF group RP71-1I20.1-001 on chr22+, this line is on chr22-, all group
# members must be on same seq and strand
# Still has non-unique transcript IDs - need to wait for the next release
# of VEGA genes and Ensembl for this to be fixed.
# Received new update of VEGA from Ian Sealy (is1@sanger.ac.uk) on
# 2007-02-14.
ssh kkstore04
cd /cluster/data/danRer4/bed/vega
wget --timestamping ftp://ftp.sanger.ac.uk/pub/is1/vega.gtf
# Load into the database (2007-03-09)
ssh hgwdev
cd /cluster/data/danRer4/bed/vega
ldHgGene -bin -genePredExt danRer4 vega vega.gtf
# invalid gffGroup detected on line: chr4 NOVEL exon 35259893 35259994
#   0.000000 + .
gene_id "sinup"; transcript_id "siah2l-001"; GFF/GTF group siah2l-001 on chr21-, this line is on chr4+, all group members must be on same seq and strand # still get duplicate transcript IDs on different chromosome. # Below is what Kerston Howe (kj2@sanger.ac.uk) advised on these cases: # "this will continue to happen as long as the map still changes. The # gene in question was annotated on two adjacent clones which were # apparently then broken up and assigned to different chromosomes. # Usually, this is not too alarming (just delete those cases, please)" # Find other such cases: awk 'BEGIN{OFS="\t"} {print $1, $12}' vega.gtf > vegachromAndId.txt sort vegachromAndId.txt | uniq > vegachromAndId.uniq awk '{print $2}' vegachromAndId.uniq | sort | uniq -c | sort -nr \ > vegaIds.count # These transcript IDs all appear twice on different chromosomes. There could # be cases where there are transcripts that are duplicated on the same # chromosome. # 2 "taf6-001"; # 2 "siah2l-001"; # 2 "rasgrf2-001"; # 2 "lmx1b-001"; # 2 "fvt1-001"; # 2 "ckmt2-002"; # 2 "ckmt2-001"; # 2 "accn2c-001"; # There are some cases where the gene is on the same chrom but different # strands. awk 'BEGIN{OFS="\t"} {print $1, $7, $12}' vega.gtf \ | sort | uniq > vegachromStrandAndId.uniq awk '{print $1, $3}' vegachromStrandAndId.uniq | sort | uniq -c \ | sort -nr > vegaIdsAndChroms.count # These occur twice on different strands of the same chromosome: # 2 chr19 "DKEY-264N13.5-001"; # 2 chr14 "stx5a-001"; # Remove these from the GTF file as suggested by Kerstin Howe (Sanger) head -8 vegaIds.count | awk '{print $2}' > transcriptIds.remove head -2 vegaIdsAndChroms.count | awk '{print $3}' >> transcriptIds.remove grep -v -f transcriptIds.remove vega.gtf > vega2.gtf # reload into danRer4 database hgsql -e 'drop table vegaUpdate;' danRer4 ldHgGene -bin -genePredExt danRer4 vegaUpdate vega2.gtf # successfully loads now. # ldHgGene groups by transcript Id so use OTTER IDS instead sed -e 's/transcript_id/other_transcript_id/' vega.gtf > vegaFormat.gtf perl -pi.bak -e 's/otter_transcript_id/transcript_id/' vegaFormat.gtf # Now it loads ok without removing duplicate transcript IDs: ldHgGene -bin -genePredExt danRer4 vegaFormat vegaFormat.gtf # Read 8817 transcripts in 119707 lines in 1 files # 8817 groups 29 seqs 4 sources 2 feature types # 8817 gene predictions hgsql -N -e 'select distinct(name2) from vegaFormat;' danRer4 > name2 # Extra information obtained from Sanger: ssh kkstore04 cd /cluster/data/danRer4/bed/vega wget --timestamping \ ftp://ftp.sanger.ac.uk/pub/kj2/ZFIN/061111/noH/genes_for_tom.txt sort name2 > name2.sort awk '{print $1}' genes_for_tom.txt | sort | uniq > genesfortom.symbs.sort comm -23 name2.sort genesfortom.symbs.sort > vegaGtfOnly wc -l vegaGtfOnly # 4021 awk '{print $6}' genes_for_tom.txt | sort | uniq > genesfortom.altsymb.sort comm -23 vegaGtfOnly genesfortom.altsymb.sort # rest of symbols found as alternate symbols in column 6 of this file # subtract this from original list comm -13 vegaGtfOnly name2.sort > genesincol1 # Also received descriptions file and additional information from Sanger. # Now the track can be updated since the vega.gtf file loads into the # database see VEGA UPDATE section below. 
####################################################################### # VEGA UPDATE (DONE, 2007-03-26 - 2007-03-28, hartera) # Data provided by Kerstin Howe from Sanger: kj2@sanger.ac.uk # and also Ian Sealy: is1@sanger.ac.uk # GTF file sent on 2007-02-14 # Updated formatVegaInfo.pl script as not all transcripts were being included # in the vegaInfoZfish and the vegaToCloneId tables so all tables were # re-made (DONE, 2007-04-06, hartera) ssh kkstore04 mkdir /cluster/data/danRer4/bed/vega.2007-02-14 cd /cluster/data/danRer4/bed/vega ln -s /cluster/data/danRer4/bed/vega.2007-02-14 \ /cluster/data/danRer4/bed/vega wget --timestamping ftp://ftp.sanger.ac.uk/pub/is1/vega.gtf wget --timestamping \ ftp://ftp.sanger.ac.uk/pub/kj2/ZFIN/061111/noH/genes_for_tom.txt # list of gene descriptions by Kerstin Howe (2007-03-12) mv for_rachel.txt vegaDescriptions.txt mv genes_for_tom.txt vegaInformation.txt # vegaInfo is transcriptId, otterId, geneId, method and geneDesc # Get otter transcript ID and otter gene ID: awk 'BEGIN{OFS="\t"} \ {if (($17 ~ /otter_gene_id/) && ($19 ~ /otter_transcript_id/)) \ print $20, $18;}' vega.gtf \ > vegaIDs.txt perl -pi.bak -e 's/;//g' vegaIDs.txt perl -pi.bak -e 's/\"//g' vegaIDs.txt # list of transcript ID and corresponding gene ID for Vega sort vegaIDs.txt | uniq > vegaIDs.uniq # then use the info file to grab those genes that are pseudogenes, get the # transcript ID from the vegaIDs.txt file. Then grep out the pseudogenes # to a separate file. Create an info file. Remove the .NOVEL or .PUTATIVE # or .KNOWN or .NOVEL from the method column and add as a separate # confidence column. # check number of items on each line: there are 4 or 6. # Some genes have more than one clone ID in a comma separated list # so create two files for loading into two tables. # Found that some of the clone ID fields have comma separated lists # and for OTTDARG00000006367, there are 30. Therefore create two info # tables where one is just for clone IDs. # NOTE: in future, make sure each row of vegaInfoZfish.txt output has # 8 fields. The pseudogene entries are missing an entry in the # confidence field so this should be an empty field. # Updated formatVegaInfo.pl as not getting all transcript IDs in the # vegaInfoZfish table (hartera, 2007-04-06) cat << '_EOF_' > formatVegaInfo.pl #!/usr/bin/perl -w use strict; # format Vega additional information into one file for vegaInfoZfish table # and another for the vegaToCloneIdZfish table which contains the # geneId and cloneId for each gene since there are multiple clone IDs for # some of the genes. 
my ($idsFile, $infoFile, $outFile1, $outFile2, %idsHash);
$idsFile = $ARGV[0];  # list of Transcript IDs and Gene IDs
$infoFile = $ARGV[1]; # information file for Vega genes
$outFile1 = $ARGV[2]; # output1 is the formatted file of Vega info for table
$outFile2 = $ARGV[3]; # output2 is a vega to clone ID conversion table

open (IDS, $idsFile) || die "Can not open $idsFile: $!\n";
open (INFO, $infoFile) || die "Can not open $infoFile: $!\n";
open (OUT1, ">$outFile1") || die "Can not create $outFile1: $!\n";
open (OUT2, ">$outFile2") || die "Can not create $outFile2: $!\n";
open (STDERR, ">info.log") || die "Can not create info.log: $!\n";

while (<IDS>) {
   my ($line, @f);
   chomp;
   $line = $_;
   @f = split(/\t/, $line);
   # hash is keyed by gene ID but there could be more than one transcript
   # associated with a gene ID so need to create an array for the hash
   push @{$idsHash{$f[1]}}, $f[0];
}
close IDS;
while (<INFO>) {
   my ($line,@fi,$id,$gene,@transIds, $tr,@clones, $c,@t, $method, $conf);
   chomp;
   $gene = "";
   $line = $_;
   @fi = split(/\t/, $line);
   $id = $gene = $fi[1];
   # get all the transcript IDs for a gene
   if (exists($idsHash{$id})) {
      @transIds = @{$idsHash{$id}};
   }
   # push clone IDs into an array:
   @clones = split(/,/, $fi[2]);
   @t = split(/\./, $fi[3]);
   $method = $t[0];
   if ($#t > 0) {
      $conf = $t[1];
   }
   elsif ($#t == 0) {
      $conf = "";
   }
   else {
      print STDERR "Should be 4 or 6 items per row, found $#fi \n";
   }
   foreach $tr (@transIds) {
      print OUT1 "$tr\t$fi[1]\t$fi[0]";
      if ($#fi == 5) {
         print OUT1 "\t$fi[4]\t$fi[5]\t$method\t\t$conf\n";
      }
      elsif ($#fi == 3) {
         print OUT1 "\t\t\t$method\t\t$conf\n";
      }
      # print out clone IDs for each transcript
      foreach $c (@clones) {
         print OUT2 "$tr\t$c\n";
      }
   }
   if ($gene && !exists($idsHash{$gene})) {
      print STDERR "$gene\n";
   }
}
close INFO;
close OUT1;
close OUT2;
close STDERR;
'_EOF_'
    chmod +x formatVegaInfo.pl
    wc -l vegaInformation.txt
    # 7169 vegaInformation.txt
    awk '{print $2}' vegaInformation.txt | sort | uniq > genesWithInfo.txt
    awk '{if ($2 ~ /OTTDARG/) print $2;}' vegaIDs.uniq \
        | sort | uniq > genesFromGtf.txt
    # Number of genes with info AND in gtf file:
    wc -l genesFromGtf.txt
    # 6171 genesFromGtf.txt
    comm -12 genesWithInfo.txt genesFromGtf.txt | wc -l
    # 6171
    # Number of genes with no info:
    comm -13 genesWithInfo.txt genesFromGtf.txt | wc -l
    # 0
    # Use perl script above to extract vegaInfo table information.
    # Re-did this with updated perl script to get all transcript IDs
    # (hartera, 2007-04-07)
    perl formatVegaInfo.pl vegaIDs.uniq vegaInformation.txt \
         vegaInfoZfish.txt vegaToCloneId.txt
    # info.log lists genes from the information file that are not in the
    # gff file of VEGA, and this is empty as it should be.
    wc -l vegaInfoZfish.txt
    # 8817 vegaInfoZfish.txt
    wc -l vegaToCloneId.txt
    # 9698 vegaToCloneId.txt
    # The vegaToCloneId.txt file is also larger than before as it now
    # has all the transcript IDs (hartera, 2007-04-05).
    awk '{print $1}' vegaInfoZfish.txt | sort | uniq -c | sort -nr > out2
    # transcripts only have 1 entry
    awk '{print $2}' vegaInfoZfish.txt | sort | uniq > infogenes.txt
    comm -13 infogenes.txt genesFromGtf.txt
    # There are no genes in the GFF file that are not in vegaInfoZfish.txt
    # However, there are genes in the information file that do not have
    # transcripts represented that are in the GFF file.
    # Then remake the pseudogenes track from this.
    # Next step is to find which transcripts are pseudogenes.
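    # Before moving on to the pseudogenes, the NOTE above about field counts
    # can be checked quickly; a sketch, assuming (as stated above) that every
    # row of vegaInfoZfish.txt should have 8 tab-separated fields, so any row
    # printed here needs fixing (e.g. a missing empty confidence field for
    # pseudogenes):
    awk 'BEGIN{FS="\t"} NF != 8 {print FNR": "NF" fields"}' vegaInfoZfish.txt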
grep pseudogene vegaInfoZfish.txt | sort | uniq | wc -l # Once vegaInfoZfish.txt updated, found 53 pseudogenes so need to update # the pseudogene track # There are only 53 in the info file, and all of these are in the GFF # file. Anyway, this is too sparse for a separate track, but # a subtrack could be created. # Get transcript IDs for pseudogenes. grep pseudogene vegaInfoZfish.txt | awk '{print $1}' > pseudogenes.ids grep -w -f pseudogenes.ids vega.gtf > vegaPseudoGene.gtf awk '{print $20}' vegaPseudoGene.gtf | sort | uniq | wc -l # 53 # Need to remake the vegGene table: grep -vw -f pseudogenes.ids vega.gtf > vegaGene.gtf wc -l vega*gtf # 119707 vega.gtf # 119529 vegaGene.gtf # 178 vegaPseudoGene.gtf # Need to relabel IDs to get the name to be the otter transcript ID # and name 2 to be the transcript_id (needs to be labeled as gene_id) # Also, relabel the otter_transcript_id to be transcript_id as ldHgGene # groups the rows by this ID. sed -e 's/gene_id/tmp_id/' vegaGene.gtf > vegaGeneFormat.gtf perl -pi.bak -e 's/transcript_id/gene_id/' vegaGeneFormat.gtf perl -pi.bak -e 's/otter_transcript_id/transcript_id/' vegaGeneFormat.gtf # Do the same for the pseudogene GTF files: sed -e 's/gene_id/tmp_id/' vegaPseudoGene.gtf > vegaPseudoGeneFormat.gtf perl -pi.bak -e 's/transcript_id/gene_id/' vegaPseudoGeneFormat.gtf perl -pi.bak -e 's/otter_transcript_id/transcript_id/' \ vegaPseudoGeneFormat.gtf rm *.bak # load GTF files for Vega genes and pseudogenes: # Reloaded all tables after updating as above (2007-04-06, hartera) ssh hgwdev cd /cluster/data/danRer4/bed/vega hgsql -e 'drop table vegaGene;' danRer4 hgsql -e 'drop table vegaPseudoGene;' danRer4 ldHgGene -bin -genePredExt danRer4 vegaGene vegaGeneFormat.gtf # Read 8764 transcripts in 119529 lines in 1 files # 8764 groups 29 seqs 3 sources 2 feature types # 8764 gene predictions ldHgGene -bin -genePredExt danRer4 vegaPseudoGene vegaPseudoGeneFormat.gtf # Read 53 transcripts in 178 lines in 1 files # 53 groups 11 seqs 1 sources 1 feature types # 53 gene predictions hgsql -N -e 'select distinct(chrom) from vegaGene;' danRer4 \ | sort | uniq hgsql -N -e 'select distinct(chrom) from vegaPseudoGene;' danRer4 \ | sort | uniq # vegaGene includes several scaffolds so need to lift file to chrom # level for these and reload vegaGene. vegaPseudoGene has no scaffolds. # scaffolds in vegaGene: # chrZv6_scaffold3697 # chrZv6_scaffold3723 # chrZv6_scaffold3731 # chrZv6_scaffold3734 # These are all on the chrUn_random virtual chrom ssh kkstore04 cd /cluster/data/danRer4/bed/vega sed -e 's/chrZv6_scaffold/Zv6_scaffold/g' vegaGeneFormat.gtf \ > vegaGeneFormat2.gtf liftUp vegaGeneFormatLifted.gtf \ /cluster/data/danRer4/jkStuff/liftAll.lft carry vegaGeneFormat2.gtf # Reload vegaGene table: ssh hgwdev cd /cluster/data/danRer4/bed/vega hgsql -e 'drop table vegaGene;' danRer4 ldHgGene -bin -genePredExt danRer4 vegaGene vegaGeneFormatLifted.gtf # Read 8764 transcripts in 119529 lines in 1 files # 8764 groups 26 seqs 3 sources 2 feature types # 8764 gene predictions # Vega information tables: # mySQL table definition and autosql-generated files created previously # for zebrafish-specific information (vegaInfoZfish) in the VEGA GENES # section above. # Add clone_id to a separate table instead of this one. # created a second table for the cloneId accessions since there # are multiple ids for some VEGA genes. Otherwise, there would be # a comma separated list in this field or many rows repeated but just # different in the cloneId field. 
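    # Once both tables are loaded (below), the clone IDs for a transcript can
    # be pulled back with a join; a sketch, assuming the shared key column is
    # named transcriptId in both tables and the accession column in the clone
    # table is cloneId (check the .sql files for the real column names):
    hgsql -N -e 'select i.transcriptId, group_concat(c.cloneId) \
        from vegaInfoZfish i, vegaToCloneId c \
        where i.transcriptId = c.transcriptId \
        group by i.transcriptId limit 10;' danRer4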
Associate transcript ID to clone IDs. # see VEGA GENES section # load these tables: cd /cluster/data/danRer4/bed/vega hgsql -e 'drop table vegaInfoZfish;' danRer4 hgLoadSqlTab danRer4 vegaInfoZfish ~/kent/src/hg/lib/vegaInfoZfish.sql \ vegaInfoZfish.txt hgsql -e 'drop table vegaToCloneId;' danRer4 hgLoadSqlTab danRer4 vegaToCloneId ~/kent/src/hg/lib/vegaToCloneId.sql \ vegaToCloneId.txt # Add code to hgc.c so that this works for Zebrafish and creates the # relevant links. Add searches by vega transcript ID, ZFIN ID and # clone ID. trackDb entry added as in VEGA GENES section above. # Added track handler to hgTracks.c for vegaGeneZfish so that the # transcript names from the name2 column of the genePred table is # used for the item name displayed in the track. # Add a Vega zebrafish-specific html description to trackDb/zebrafish. # The Pseudogenes are a subtrack of the Genes track # because it is too sparse to show as a separate track. # ADD Descriptions for Vega Genes # Looked into using description from BioMart for VEGA genes but easier # to get them all directly from Sanger. Kerstin sent a list of # descriptions: for_rachel.txt # Add these again to updated tables (2007-04-06, hartera) ssh kkstore04 mkdir -p /cluster/data/danRer4/bed/vega/description # copy file here and rename cd /cluster/data/danRer4/bed/vega/description mv ../vegaDescriptions.txt . # get list of VEGA gene IDs in vegaInfoZfish ssh hgwdev cd /cluster/data/danRer4/bed/vega/description hgsql -N -e 'select distinct(geneId) from vegaInfoZfish;' danRer4 | sort \ > geneIdsFromInfo.sort # get sorted list of gene IDs from description file: awk '{print $1;}' vegaDescriptions.txt | sort | uniq > vegaDesc.ids.sort wc -l *.sort # 6171 geneIdsFromInfo.sort # 14150 vegaDesc.ids.sort comm -12 geneIdsFromInfo.sort vegaDesc.ids.sort | wc # 6168 comm -23 geneIdsFromInfo.sort vegaDesc.ids.sort > genesNoDesc.txt # There are 3 with no description # OTTDARG00000004654 # OTTDARG00000018757 # OTTDARG00000018760 # Searched for these three at # http://vega.sanger.ac.uk/Danio_rerio/index.html # and found that these three do not have a description. 
# add them to the descriptions list
ssh kkstore04
cd /cluster/data/danRer4/bed/vega/description
# add the three with no description to the descriptions list
cat vegaDescriptions.txt genesNoDesc.txt > vegaAll.txt
# remove header
tail +2 vegaAll.txt | sort | uniq > vegaAllDesc.txt
wc -l vegaAll*
# 23058 vegaAll.txt
# 15460 vegaAllDesc.txt
# clean up
rm genesNoDesc.txt geneIds* vegaDesc.ids.sort
# Then add these to the vegaInfoZfish table
cat << 'EOF' > addDesc.pl
#!/usr/bin/perl -w
use strict;
my ($infoFile, $descFile, %descHash);
$infoFile = $ARGV[0]; # vegaInfoZfish.txt file
$descFile = $ARGV[1]; # file of descriptions
open(INFO, $infoFile) || die "Can not open $infoFile : $!\n";
open(DESC, $descFile) || die "Can not open $descFile : $!\n";
open(ERROR, ">error.log") || die "Can not create error.log : $!\n";
open(OUT, ">out.txt") || die "Can not create out.txt: $!\n";
while (<DESC>) {
   my($line, @f, $id, $desc);
   chomp;
   $line = $_;
   @f = split(/\t/, $line);
   if ($f[0] =~ /^OTTDARG/) {
      $id = $f[0];
      $desc = $f[1];
   }
   else {
      print ERROR "OTTDARG ID is not found on a line of the descriptions file.\n";
   }
   $descHash{$id} = $desc;
}
close DESC;
while (<INFO>) {
   my ($li, @fi, $de, $i, $last);
   $de = "";
   chomp;
   $li = $_;
   @fi = split(/\t/, $li);
   if ($fi[1] =~ /OTTDARG/) {
      if (exists($descHash{$fi[1]})) {
         $de = $descHash{$fi[1]};
      }
      else {
         print ERROR "There is no description for $fi[1] available.\n";
      }
   }
   $last = $#fi;
   for ($i = 0; $i <= 5; $i++ ) {
      print OUT "$fi[$i]\t";
   }
   print OUT "$de\t";
   if ($last == 5) {
      # if there are only 5 fields, the last one is missing so add extra tab
      print OUT "\t\n";
   }
   else {
      print OUT "$fi[$last]\n";
   }
}
close INFO;
close OUT;
close ERROR;
'EOF'
chmod +x addDesc.pl
# add new descriptions to vegaInfoZfish.txt file
perl addDesc.pl ../vegaInfoZfish.txt vegaAllDesc.txt
# check output in out.txt then rename
mv out.txt vegaInfoZfishWithDesc.txt
rm error.log # empty
# Reload vegaInfoZfish table
ssh hgwdev
cd /cluster/data/danRer4/bed/vega/description
hgsql -e 'drop table vegaInfoZfish;' danRer4
hgLoadSqlTab danRer4 vegaInfoZfish ~/kent/src/hg/lib/vegaInfoZfish.sql \
    vegaInfoZfishWithDesc.txt
# No errors loading
# Already added code to hgc.c so that this works for zebrafish and creates
# the relevant links. Add searches by vega transcript ID, ZFIN ID and
# clone ID. trackDb entry added as in VEGA GENES section above.
# Added track handler to hgTracks.c for vegaGeneZfish so that the
# transcript names from the name2 column of the genePred table are
# used for the item name displayed in the track.
# Add a Vega zebrafish-specific html description to trackDb/zebrafish.
# The Pseudogenes are a subtrack of the Genes track because it is too sparse
# to show as a separate track.
##########################################################################
# N-SCAN gene predictions (nscanGene) - (2006-08-30 markd)
cd /cluster/data/danRer4/bed/nscan/
# obtained NSCAN predictions from michael brent's group
# at WUSTL
wget -nv -r -np http://ardor.wustl.edu/jeltje/zebrafish/chr_gtf
wget -nv -r -np http://ardor.wustl.edu/jeltje/zebrafish/chr_ptx
# clean up and rename downloaded directories:
mv ardor.wustl.edu/jeltje/zebrafish/chr_gtf .
mv ardor.wustl.edu/jeltje/zebrafish/chr_ptx .
rm -rf ardor.wustl.edu
rm chr_*/index.html*
gzip chr_*/*
chmod a-w chr_*/*.gz
# load tracks. Note that these have *utr features, rather than
# exon features. currently ldHgGene creates separate genePred exons
# for these.
ldHgGene -bin -gtf -genePredExt danRer4 nscanGene chr_gtf/chr*.gtf.gz # load protein, add .1 suffix to match transcript id hgPepPred -suffix=.1 danRer4 generic nscanPep chr_ptx/chr*.fa.gz rm *.tab # update trackDb; need a danRer4-specific page to describe informants zebrafish/danRer4/nscanGene.html (copy from mm8 and edit) zebrafish/danRer4/trackDb.ra # changed search regex to termRegex chr[0-9a-zA-Z_].*\.[0-9]+\.[0-9] ####################################################################### # UPDATE AFFY ZEBRAFISH TRACK USING BLAT WITHOUT -mask OPTION AND # USING -repeats OPTION AND DIFFERENT FILTERING TO REMOVE SHORT # ALIGNMENTS (DONE, 2006-09-27 - 2006-09-28, hartera) # With the previous version of this track, QA found a number of short # alignments of <= 30 bp and there are a number in the <= 50bp range. # These do not seem to be meaningful so filtering was changed to try to # remove these alignments while retaining meaningful alignments. # pslCDnaFilter was used with the same settings as used for the # Genbank EST alignments for zebrafish. # Also use -minIdentity=90 for Blat instead of -minIdentity=95 since as the # higher minIdentity is causing alignments to be dropped that should not be. # Blat's minIdentity seems to be more severe than that for pslReps or # pslCDnaFilter as it takes insertions and deletions into account. # These are Jim's recommendations. # NOTE: Also added alignments for NA_random and Un_random, these had not # been done for the original affyZebrafish track but should have been. # Array chip sequences already downloaded for danRer1 ssh hgwdev cd /projects/compbio/data/microarray/affyZebrafish mkdir -p /san/sanvol1/scratch/affy cp /projects/compbio/data/microarray/affyZebrafish/Zebrafish_consensus.fa \ /san/sanvol1/scratch/affy/ # Set up cluster job to align Zebrafish consensus sequences to danRer3 mkdir -p /cluster/data/danRer4/bed/affyZebrafish.2006-09-27 # remove old link and create new one rm /cluster/data/danRer4/bed/affyZebrafish ln -s /cluster/data/danRer4/bed/affyZebrafish.2006-09-27 \ /cluster/data/danRer4/bed/affyZebrafish # Align sequences on the pitakluster. Scaffolds were aligned for NA_random # and Un_random and lifted to chrom level afterwards. Chroms 1-25 and M # were aligned as ~5 Mb chunks. ssh pk cd /cluster/data/danRer4/bed/affyZebrafish ls -1 /san/sanvol1/scratch/affy/Zebrafish_consensus.fa > affy.lst ls -1 /san/sanvol1/scratch/danRer4/trfFa/chr[0-9M]*.fa > genome.lst foreach f (/san/sanvol1/scratch/danRer4/scaffoldsSoftMask/*.fa) ls -1 $f >> genome.lst end wc -l genome.lst # 3237 genome.lst # for output: mkdir -p /san/sanvol1/scratch/danRer4/affy/psl # use -repeats option to report matches to repeat bases separately # to other matches in the PSL output. echo '#LOOP\n/cluster/bin/x86_64/blat -fine -repeats=lower -minIdentity=90 -ooc=/san/sanvol1/scratch/danRer4/danRer4_11.ooc $(path1) $(path2) {check out line+ /san/sanvol1/scratch/danRer4/affy/psl/$(root1)_$(root2).psl}\n#ENDLOOP' > template.sub gensub2 genome.lst affy.lst template.sub para.spec para create para.spec para try, check, push ... etc. 
para time # Completed: 3237 of 3237 jobs #CPU time in finished jobs: 19319s 321.98m 5.37h 0.22d 0.001 y #IO & Wait Time: 9297s 154.95m 2.58h 0.11d 0.000 y #Average job time: 9s 0.15m 0.00h 0.00d #Longest running job: 0s 0.00m 0.00h 0.00d #Longest finished job: 98s 1.63m 0.03h 0.00d #Submission to last job: 3135s 52.25m 0.87h 0.04d # need to do pslSort and lift up ssh pk cd /san/sanvol1/scratch/danRer4/affy # Do sort, liftUp and then best in genome filter. # only use alignments that have at least # 95% identity in aligned region. # Previously did not use minCover since a lot of sequence is in # Un and NA so genes may be split up so good to see all alignments. # However, found a number of short alignments of <= 50 bp. These are # not meaningful so maybe need to use minCover. If increased too much, # then hits on poor parts of the assembly will be missed. # use pslCDnaFilter with the same parameters as used for zebrafish # Genbank EST alignments. pslSort dirs raw.psl tmp psl pslCDnaFilter -localNearBest=0.005 -minQSize=20 -minNonRepSize=16 \ -ignoreNs -bestOverlap -minId=0.95 -minCover=0.15 raw.psl contig.psl # seqs aligns # total: 15272 828202 #drop minNonRepSize: 2763 741674 # drop minIdent: 2656 39188 # drop minCover: 2550 10784 # weird over: 359 1439 # kept weird: 277 347 # drop localBest: 2830 17737 # kept: 14952 18819 # Kept 97.9% of alignments. There are 15502 Affy sequences originally # aligned so there are now 96.5% remaining. # lift up the coordinates to chrom level liftUp affyZebrafish.psl \ /cluster/data/danRer4/jkStuff/liftAll.lft warn contig.psl # Got 6247 lifts in /cluster/data/danRer4/jkStuff/liftAll.lft # Lifting contig.psl # rsync these psl files rsync -a --progress /san/sanvol1/scratch/danRer4/affy/*.psl \ /cluster/data/danRer4/bed/affyZebrafish/ ssh kkstore04 cd /cluster/data/danRer4/bed/affyZebrafish # shorten names in psl file sed -e 's/Zebrafish://' affyZebrafish.psl > affyZebrafish.psl.tmp mv affyZebrafish.psl.tmp affyZebrafish.psl pslCheck affyZebrafish.psl # psl is good # load track into database ssh hgwdev cd /cluster/data/danRer4/bed/affyZebrafish hgsql -e 'drop table affyZebrafish;' danRer4 hgLoadPsl danRer4 affyZebrafish.psl # Add consensus sequences for Zebrafish chip # Copy sequences to gbdb if they are not there already mkdir -p /gbdb/hgFixed/affyProbes ln -s \ /projects/compbio/data/microarray/affyZebrafish/Zebrafish_consensus.fa \ /gbdb/hgFixed/affyProbes # these sequences were loaded previously so no need to reload. hgLoadSeq -abbr=Zebrafish: danRer3 \ /gbdb/hgFixed/affyProbes/Zebrafish_consensus.fa # Clean up rm batch.bak contig.psl raw.psl # check number of short alignments: hgsql -e \ 'select count(*) from affyZebrafish where (qEnd - qStart) <= 50;' danRer4 # 7 # for previous filtered set, there were 1272 alignments of <= 50 bp so # this has improved. hgsql -e 'select count(distinct(qName)) from affyZebrafish;' danRer4 # 14952 # Previously, there were 14819 so more sequences have aligned but less # short alignments are retained. Many of the short alignments may also # have longer alignments to different regions of the genome that are good. ######################################################################### # COMPUGEN ZEBRAFISH OLIGOS TRACK (in progress, 2006-10-20, hartera) # Align the zebrafish oligos from Compugen used to create the arrays # used by GIS to study expression at different developmental stages. ssh hgwdev mkdir -p /projects/compbio/data/microarray/compugen/zebrafish # save Compugen oligos FASTA file here. 
obtained from # Sinnakaruppan Mathavan at the # Genome Institute of Singapore (GIS). # Permission was obtained from Compugen to display the sequences # along with a disclaimer. see README.txt cd /projects/compbio/data/microarray/compugen/zebrafish unzip Zebrafish\ Oligos_Compugen_XEBLIB96_pov_070704.zip # this gives an Excel file, XEBLIB96_pov_070704.xls # save as a tab separated text file using Excel: XEBLIB96_pov_070704.txt # Remove quotation marks sed -e 's/"//g' XEBLIB96_pov_070704.txt > GISArray.txt # also remove other unwanted characters, ^@, which is ASCII for NULL tr -d '\0' < GISArray.txt > GISArray.format.txt awk 'BEGIN{FS="\t"} {if ($2 !~ /Serial/ && ($2 != "")) print ">"$2"\n"$4}' \ GISArray.format.txt > GISZfishArray.fa grep '>' GISZfishArray.fa | wc -l # 16399 # align sequences to the zebrafish genome on pk mkdir -p /san/sanvol1/scratch/compugen cp /projects/compbio/data/microarray/compugen/zebrafish/GISZfishArray.fa \ /san/sanvol1/scratch/compugen/ # Set up cluster job to align Zebrafish consensus sequences to danRer4 mkdir -p /cluster/data/danRer4/bed/compugenZebrafish.2006-11-03 ln -s /cluster/data/danRer4/bed/compugenZebrafish.2006-11-03 \ /cluster/data/danRer4/bed/compugenZebrafish # Align sequences on the pitakluster. Scaffolds were aligned for NA_random # and Un_random and lifted to chrom level afterwards. Chroms 1-25 and M # were aligned as ~5 Mb chunks. ssh pk cd /cluster/data/danRer4/bed/compugenZebrafish ls -1 /san/sanvol1/scratch/compugen/GISZfishArray.fa > oligos.lst ls -1 /san/sanvol1/scratch/danRer4/trfFa/chr[0-9M]*.fa > genome.lst foreach f (/san/sanvol1/scratch/danRer4/scaffoldsSoftMask/*.fa) ls -1 $f >> genome.lst end wc -l genome.lst # 3237 genome.lst # oligos are 65 bp in length. # for output: mkdir -p /san/sanvol1/scratch/danRer4/compugen/psl # use -repeats option to report matches to repeat bases separately # to other matches in the PSL output. echo '#LOOP\n/cluster/bin/x86_64/blat -fine -repeats=lower -minIdentity=90 -ooc=/san/sanvol1/scratch/danRer4/danRer4_11.ooc $(path1) $(path2) {check out line+ /san/sanvol1/scratch/danRer4/compugen/psl/$(root1)_$(root2).psl}\n#ENDLOOP' > template.sub gensub2 genome.lst oligos.lst template.sub para.spec para create para.spec para try, check, push ... etc. para time # Completed: 3237 of 3237 jobs # CPU time in finished jobs: 1948s 32.46m 0.54h 0.02d 0.000 y # IO & Wait Time: 11145s 185.75m 3.10h 0.13d 0.000 y # Average job time: 4s 0.07m 0.00h 0.00d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 428s 7.13m 0.12h 0.00d # Submission to last job: 621s 10.35m 0.17h 0.01d # need to do pslSort and lift up ssh pk cd /san/sanvol1/scratch/danRer4/compugen # Do sort, liftUp and then best in genome filter. # only use alignments that have at least # 95% identity in aligned region. # Previously did not use minCover since a lot of sequence is in # Un and NA so genes may be split up so good to see all alignments. # However, found a number of short alignments of <= 50 bp. These are # not meaningful so maybe need to use minCover. If increased too much, # then hits on poor parts of the assembly will be missed. # use pslCDnaFilter with the same parameters as used for zebrafish # Genbank EST alignments. pslSort dirs raw.psl tmp psl pslCDnaFilter -localNearBest=0.005 -minQSize=20 -minNonRepSize=8 \ -ignoreNs -bestOverlap -minId=0.95 -minCover=0.15 raw.psl contig.psl # for Compugen: # Dropping minCover to 0.10 doesn't make a difference. 
Decreasing the minId to # 0.92 increases the number of sequences aligned and does not increase # the number of alignments for sequences with the most alignments. # Removed the minimum non repeat filter does significantly increase the # number of alignments for some sequences. 145 CGENZEB_456015402_0 79 CGENZEB_456008445_0 72 CGENZEB_456015991_0 53 CGENZEB_456012678_0 46 CGENZEB_456004521_0 # Total sequences: 16399 seqs aligns total: 15544 102554 drop minNonRepSize: 1004 72545 drop minIdent: 825 3549 weird over: 13 48 kept weird: 8 16 drop localBest: 1288 7040 kept: 14632 19420 # 89.2% are kept. # minCov = 0.10 minNonRepSize = 8 # seqs aligns seqs aligns total: 15544 102554 drop minNonRepSize: 1004 72545 drop minIdent: 825 3549 weird over: 13 48 kept weird: 8 16 drop localBest: 1288 7040 kept: 14632 19420 # 89.2% are kept. # minCov=0.10 minNonRepSize = 10 seqs aligns total: 15544 102554 drop minNonRepSize: 1015 72795 drop minIdent: 811 3462 weird over: 13 48 kept weird: 8 16 drop localBest: 1278 6901 kept: 14616 19396 # 89.1% kept. # minNonRepSize = 0 seqs aligns total: 15544 102554 drop minIdent: 1344 23893 weird over: 42 271 kept weird: 24 44 drop localBest: 1772 49794 kept: 15338 28867 # 93.8% kept from total # but there are large numbers of alignments for some probes: # 62 CGENZEB_456005547_0 603 CGENZEB_456005221_0 454 CGENZEB_456010007_0 409 CGENZEB_456014900_0 372 CGENZEB_456009900_0 # try increase identity but low minReps pslCDnaFilter -localNearBest=0.005 -minQSize=20 -minNonRepSize=8 \ -ignoreNs -bestOverlap -minId=0.97 -minCover=0.15 raw.psl \ contigMinRep8minId97.psl # seqs aligns total: 15544 102554 drop minNonRepSize: 1004 72545 drop minIdent: 1982 8772 weird over: 9 29 kept weird: 7 14 drop localBest: 766 2915 kept: 13715 18322 # this has improved highest number of hits a lot but this is similar to # that achieved with higher identity too # but only kept 80% of seqeuences. 145 CGENZEB_456015402_0 79 CGENZEB_456008445_0 72 CGENZEB_456015991_0 53 CGENZEB_456012678_0 46 CGENZEB_456004521_0 # lower minCov: pslCDnaFilter -localNearBest=0.005 -minQSize=20 -minNonRepSize=8 \ -ignoreNs -bestOverlap -minId=0.95 -minCover=0.08 raw.psl \ contigMinCov8.psl # seqs aligns total: 15544 102554 drop minNonRepSize: 1004 72545 drop minIdent: 825 3549 weird over: 13 48 kept weird: 8 16 drop localBest: 1288 7040 kept: 14632 19420 # 89.2%, now nearBest = 0.1% pslCDnaFilter -localNearBest=0.001 -minQSize=20 -minNonRepSize=8 \ -ignoreNs -bestOverlap -minId=0.95 -minCover=0.10 raw.psl \ contigMinCov10NearBest1percent.psl # seqs aligns total: 15544 102554 drop minNonRepSize: 1004 72545 drop minIdent: 825 3549 weird over: 13 48 kept weird: 7 15 drop localBest: 1350 7451 kept: 14632 19009 # same number of sequences aligning but less overall alignments: # 115 CGENZEB_456015402_0 # 71 CGENZEB_456015991_0 # 71 CGENZEB_456008445_0 # 46 CGENZEB_456004521_0 # 38 CGENZEB_456008610_0 # CGENZEB_456012678_0 now went down to 1. 
# 89.2% aligned # use minCover = 0.40 pslCDnaFilter -localNearBest=0.001 -minQSize=20 -minNonRepSize=8 \ -ignoreNs -bestOverlap -minId=0.95 -minCover=0.40 raw.psl \ contig.psl seqs aligns total: 15544 102554 drop minNonRepSize: 1004 72545 drop minIdent: 825 3549 weird over: 13 48 kept weird: 7 15 drop localBest: 1350 7451 kept: 14632 19009 # little difference using minCover=0.60 cd /san/sanvol1/scratch/danRer4/compugen rm contig* # Use these parameters: pslCDnaFilter -localNearBest=0.001 -minQSize=20 -minNonRepSize=8 \ -ignoreNs -bestOverlap -minId=0.95 -minCover=0.40 raw.psl \ contig.psl # use minCover = 0.40 seqs aligns total: 15544 102554 drop minNonRepSize: 1004 72545 drop minIdent: 825 3549 weird over: 13 48 kept weird: 7 15 drop localBest: 1350 7451 kept: 14632 19009 # use minCover=0.60 seqs aligns total: 15544 102554 drop minNonRepSize: 1004 72545 drop minIdent: 825 3549 drop minCover: 198 507 weird over: 9 39 kept weird: 4 12 drop localBest: 1285 7009 kept: 14588 18944 # lift up the coordinates to chrom level liftUp compugenZebrafish.psl \ /cluster/data/danRer4/jkStuff/liftAll.lft warn contig.psl # Got 6247 lifts in /cluster/data/danRer4/jkStuff/liftAll.lft # Lifting contig.psl # rsync these psl files rsync -a --progress /san/sanvol1/scratch/danRer4/compugen/*.psl \ /cluster/data/danRer4/bed/compugenZebrafish ssh kkstore04 cd /cluster/data/danRer4/bed/compugenZebrafish # shorten names in psl file pslCheck compugenZebrafish.psl # psl is good # load track into database ssh hgwdev cd /cluster/data/danRer4/bed/compugenZebrafish hgsql -e 'drop table compugenZebrafish;' danRer4 hgLoadPsl danRer4 compugenZebrafish.psl # Add entry in trackDb/zebrafish/trackDb.ra and a search for hgFindSpec # Add a description page. # Need to add disclaimer for sequences. # Add consensus sequences for Zebrafish chip # Copy sequences to gbdb if they are not there already mkdir -p /gbdb/hgFixed/compugenProbes ln -s \ /projects/compbio/data/microarray/affyZebrafish/Zebrafish_consensus.fa \ /gbdb/hgFixed/affyProbes # these sequences were loaded previously so no need to reload. hgLoadSeq -abbr=Zebrafish: danRer3 \ /gbdb/hgFixed/affyProbes/Zebrafish_consensus.fa # Clean up rm batch.bak contig.psl raw.psl # check number of short alignments: hgsql -e \ 'select count(*) from affyZebrafish where (qEnd - qStart) <= 50;' danRer4 # 7 # for previous filtered set, there were 1272 alignments of <= 50 bp so # this has improved. hgsql -e 'select count(distinct(qName)) from affyZebrafish;' danRer4 # 14952 # Previously, there were 14819 so more sequences have aligned but less # short alignments are retained. Many of the short alignments may also # have longer alignments to different regions of the genome that are good. ######################################################################### # ENSEMBL GENES TRACKS FOR ENSEMBL VERSION 42 # ENSEMBL GENES (PROTEIN-CODING) AND ENSEMBL NON-CODING GENES # (DONE, 2007-01-08 - 2007-01-09 hartera) # Obtained from BioMart at Ensembl (The Wellcome Trust Sanger Institute) # Starting downloading Ensembl v41 genes (2006-12-13) # get "unexpected end of file" error with the peptide download. # Notified Ensembl (2006-12-15). # Ensembl helpdesk say that the files sometimes get terminated early # for large downloads so try using this link to BioMart instead: # http://www.biomart.org/biomart/martview # Repeat above using this link. This has Ensembl42 though so e-mailed # Ensembl to ask if they are releasing Ensembl42 soon (2006-12-18) # Ensembl was updated to v42 in Dec. 
2006 so use this new data set # (2007-01-08): ssh kkstore04 mkdir -p /cluster/data/danRer4/bed/ensembl42 cd /cluster/data/danRer4/bed/ensembl42 # Get the Ensembl gene data from BioMart at: # http://www.biomart.org/biomart/martview # Follow this sequence through the pages: (NOTE: this interface has changed # significantly since danRer3). Ensembl version is 42 (Dec 2006). # 1) The Dataset link in the left panel is selected. Select the # Ensembl dataset (v42 here) and the Danio_rerio choice (ZFISH6 here). # 2) Click on the Attributes link in the left panel. # 3) Select Structures. Click on the + next to GENE to expand it # and check the boxes for the Ensembl Gene ID and Ensembl # Transcript ID. # 4) Clicking on the "Count" link on the top black menu shows that there # are 28,508 / 28,508 Genes selected in Danio rerio genes (ZFISH6) # 5) Click on the "Results" link on the top black menu and then select GFF # as the format and select to export all results to a # "Compressed web file (notify by e-mail)" and hit the "Go" button and # enter e-mail address as requested. # When results are ready, you will receive an e-mail with a link to # download the results, save as ensemblGene42.gff.gz # Save as and move file to # /cluster/data/danRer4/bed/ensembl42 gunzip ensemblGene42.gff.gz # file unzips ok. # Repeat above but at step 3, selec the Features Attribute # select Ensembl Transcript ID and Biotype under the GENE section. # Select "Text, tab separated" as the output format and gzip # compression. Biotype gives information to separate the genes into # protein-coding and RNA genes and pseudogenes. # For step 5, select CSV as the output and then select to export all # results to a "Compressed web file (notify by e-mail)" and hit the # "Go" button and enter e-mail address as requested. # Save as ensemblGene42Biotype.tsv.gz and move to # /cluster/data/danRer4/bed/ensembl42 gunzip ensemblGene42Biotype.tsv.gz # file unzips ok. # The Ensembl gene predictions are mapped to chromosomes except for # chrNA and chrUn. Use lift files for scaffolds to these chroms. # get chrUn and chrNA Ensembl records. ssh kkstore04 cd /cluster/data/danRer4/bed/ensembl42 # need to lift up the NA and Un scaffolds to chroms liftUp -type=.gtf ensemblGene42.lifted \ /cluster/data/danRer4/jkStuff/liftAll.lft carry ensemblGene42.gff # Got 6247 lifts in /cluster/data/danRer4/jkStuff/liftAll.lft # Lifting ensemblGene42.gff wc -l ensemblGene42* # 807871 ensemblGene42.gff # 807871 ensemblGene42.lifted # 39626 ensemblGene42Biotype.tsv # check there are no scaffolds left in lifted file: grep Zv6_NA ensemblGene42.lifted grep Zv6_scaffold ensemblGene42.lifted # there are none so ok. # add chr at beginning of each line. NA and Un already have "chr" # prefix so then remove the extra one. sed -e "s/^/chr/" ensemblGene42.lifted | sed -e "s/chrchr/chr/" \ > ensGene42.gff # check file sizes -ok and some of the lifted co-ordinates # Also remove the suffix that denotes the transcript version number. # This is not in the ensGtp or ensPep tables. perl -pi.bak -e 's/\.[0-9]+//'g ensGene42.gff # Next split up the gff into a protein-coding gene set and a RNA gene and # pseudogene set and load into different tracks. 
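    # The split below greps for the biotype string; the same two ID lists
    # could also be made in one awk pass over the biotype file (a sketch,
    # assuming column 1 is the transcript ID, column 2 the biotype, and a
    # one-line header; the output names here are placeholders, not the files
    # used below):
    awk 'BEGIN{FS="\t"}
         NR > 1 {
             if ($2 == "protein_coding") { print $1 > "coding.ids" }
             else { print $1 > "noncoding.ids" }
         }' ensemblGene42Biotype.tsv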
# get transcript IDs only for protein coding transcripts
grep "protein_coding" ensemblGene42Biotype.tsv | awk '{print $1}' \
    > ensGene42ProteinCoding.ids
# skip header line and grab everything else from the file
tail +2 ensemblGene42Biotype.tsv | grep -v "protein_coding" \
    | awk '{print $1}' > ensGene42NonCoding.ids
wc -l ensGene42*ids
# 3560 ensGene42NonCoding.ids
# 36065 ensGene42ProteinCoding.ids
# 39625 total
wc -l ensemblGene42Biotype.tsv
# 39626 ensemblGene42Biotype.tsv
# extra line is the header line
# then get only the protein-coding transcripts from the GFF file
# write a script to do this as grep is slow
cat << 'EOF' > getIds.pl
#!/usr/bin/perl -w
use strict;
my ($in, $file, %ids);
$in = $ARGV[0];    # list of ids
$file = $ARGV[1];  # GFF file or other data file
open(IN, $in) || die "Can not open $in :$!\n";
open (FILE, $file) || die "Can not open $file :$!\n";
open (FOUND, ">found.log") || die "Can not create found.log: $!\n";
while (<IN>) {
   chomp;
   my $l = $_;
   $ids{$l} = 1;
}
close(IN);
# read GFF file or other data file and check whether transcript ID is in
# the hash before printing out that line.
while (<FILE>) {
   my ($line, $transId);
   $line = $_;
   $transId = "";
   if ($line =~ /(ENSDART[0-9]+)/) {
      $transId = $1;
   }
   if (exists($ids{$transId})) {
      print $line;
      print FOUND "$transId\n";
   }
}
close(FILE);
close(FOUND);
'EOF'
chmod +x getIds.pl
perl getIds.pl ensGene42ProteinCoding.ids ensGene42.gff \
    > ensGene42ProteinCoding.gff
# uniq found.log and check against input ids
sort found.log | uniq > foundProtein.uniq
sort ensGene42ProteinCoding.ids > ens42ProteinIds.sort
comm -13 foundProtein.uniq ens42ProteinIds.sort
# All ids were found in the gff file
perl getIds.pl ensGene42NonCoding.ids ensGene42.gff \
    > ensGene42NonCoding.gff
sort found.log | uniq > foundNonCoding.uniq
sort ensGene42NonCoding.ids > ens42NonCodingIds.sort
comm -13 foundNonCoding.uniq ens42NonCodingIds.sort
# All ids were found in the gff file
rm *.sort *.uniq *.bak found.log
wc -l ensGene42*.gff
# 807871 ensGene42.gff
# 3695 ensGene42NonCoding.gff
# 804176 ensGene42ProteinCoding.gff
# load into database
ssh hgwdev
cd /cluster/data/danRer4/bed/ensembl42
hgsql -e 'drop table ensGene;' danRer4
hgsql -e 'drop table ensGeneNonCoding;' danRer4
/cluster/bin/x86_64/ldHgGene danRer4 ensGene ensGene42ProteinCoding.gff
# Read 36065 transcripts in 804176 lines in 1 files
# 36065 groups 27 seqs 1 sources 4 feature types
# 36065 gene predictions
/cluster/bin/x86_64/ldHgGene danRer4 ensGeneNonCoding ensGene42NonCoding.gff
# Read 3560 transcripts in 3695 lines in 1 files
# 3560 groups 27 seqs 1 sources 1 feature types
# 3560 gene predictions
# The only difference between Ensembl v42 and v41 for zebrafish is two
# extra gene predictions in the non-coding category in v42.
# ensGtp associates geneId/transcriptId/proteinId for hgPepPred and
# hgKnownToSuper. Use BioMart to create it as above, except:
# Step 3) Choose "Features". Expand the GENE section and under
# "Ensembl Attributes", check boxes for Ensembl Gene ID,
# Ensembl Transcript ID, Ensembl Peptide ID.
# After clicking on the Results link in the top black menu,
# choose CSV as the output format and export all results to a
# "Compressed web file (notify by e-mail)", hit the
# "Go" button and enter an e-mail address as requested.
# Result name: ensembl42Gtp.tsv.gz
ssh kkstore04
cd /cluster/data/danRer4/bed/ensembl42
gunzip ensembl42Gtp.tsv.gz
# separate the IDs for protein-coding genes and the rest (RNA genes and
# pseudogenes).
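    # Before reordering the columns (next step), the layout of the BioMart
    # export can be confirmed quickly, since the column order has changed
    # between releases; a small check (a sketch; head -1 shows either a
    # header line or the first record, and the field counts show whether
    # every row carries all three IDs):
    head -1 ensembl42Gtp.tsv
    awk 'BEGIN{FS="\t"} {print NF}' ensembl42Gtp.tsv | sort | uniq -c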
# transcript ID and gene ID are in different columns than before so switch # Gene ID should be in first column and Transcrip ID in the second column. awk 'BEGIN {FS="\t"} {OFS="\t"} {print $2,$1,$3}' ensembl42Gtp.tsv \ > ens42GtpFormat.tsv perl getIds.pl ensGene42ProteinCoding.ids ens42GtpFormat.tsv \ > ensGtpProteinCoding.txt # uniq found.log and check against input ids sort found.log | uniq > foundProtein.uniq sort ensGene42ProteinCoding.ids > ens42ProteinIds.sort comm -13 foundProtein.uniq ens42ProteinIds.sort perl getIds.pl ensGene42NonCoding.ids ens42GtpFormat.tsv \ > ensGtpNonCoding.txt # uniq found.log and check against input ids sort found.log | uniq > foundNonCoding.uniq sort ensGene42NonCoding.ids > ens42NonCodingIds.sort comm -13 foundNonCoding.uniq ens42NonCodingIds.sort # All ids were found in the gff file rm *.sort *.uniq *.bak found.log wc -l ensGtp*.txt # 3560 ensGtpNonCoding.txt # 36065 ensGtpProteinCoding.txt # The non-coding set have only gene ids and transcript ids and # no protein ids. # Load database ssh hgwdev cd /cluster/data/danRer4/bed/ensembl42/ hgsql -e 'drop table ensGtp;' danRer4 # load ensGtp for protein-coding genes hgLoadSqlTab danRer4 ensGtp ~/kent/src/hg/lib/ensGtp.sql \ ensGtpProteinCoding.txt # only load IDs for the protein coding genes. The non-coding genes # have no protein ID. # Get the ensembl peptide sequences from # http://www.biomart.org/biomart/martview # Follow this sequence: # 1) Choose the Ensembl Genes 42 as the database and then # Danio Rerio genese (ZFISH6) as the dataset. # 2) Click on the Attributes link in the left panel. Select sequences. # 3) Expand the SEQUENCES section and choose Peptide as type of sequence # to export and then expand the Header Information section and select # Ensembl Gene ID from Gene Attributes and # Ensembl Transcript ID and Ensembl Peptide ID from # Transcript Attributes # 4) Click on the Filters link in the left panel and expand the GENE # section. Select the Gene type box and then select protein_coding as # these are the only genes with an associated protein sequence. # 5) Click on the Results link in the top black menu bar and # choose FASTA for the output and export all results to # Compressed file (notify by e-mail). # save the file as ensembl42Pep.fasta.gz and move to # /cluster/data/danRer4/bed/ensembl42 # Got results URL by e-mail but BioMart seems to be currently inaccessible ssh kkstore04 cd /cluster/data/danRer4/bed/ensembl42 gunzip ensembl42Pep.fasta.gz grep '>' ensembl42Pep.fasta | wc -l # 36048 grep '>' ensembl42Pep.fasta > headers awk 'BEGIN {FS="|"} {print $2;}' headers > pepTranscript.ids sort pepTranscript.ids | uniq > pepTranscript.ids.sort sort ensGene42ProteinCoding.ids | uniq > proteinCoding.ids.sort comm -13 proteinCoding.ids.sort pepTranscript.ids.sort # no difference comm -23 proteinCoding.ids.sort pepTranscript.ids.sort > noPep # There are 17 of these. # found some of them on the Ensembl zebrafish Genome Browser and found # the peptide sequences. E-mailed Ensembl's helpdesk to ask how to get # peptide sequences for these 17 transcript IDs (2007-01-09). # Then downloaded peptide sequences for just this set of 17, but only got # 16 of them. To do this, follow the instructions as above for the # obtaining the peptide sequences but on the Filters page, expand the GENE # section and check the box for ID list limit and select # Ensembl Transcript ID(s) and paste in the list. 
Name output file # otherIDs.fasta.gz gunzip otherIDs.fasta.gz grep '>' otherIDs.fasta > headers2 awk 'BEGIN {FS="|"} {print $2;}' headers2 > otherPepTranscript.ids sort otherPepTranscript.ids | uniq > otherPepTranscript.ids.sort comm -13 noPep otherPepTranscript.ids.sort # no difference comm -23 noPep otherPepTranscript.ids.sort # ENSDART00000049311 # Repeat above procedure to query for peptide sequence for just this # transcript ID and name file: otherIDs2.fasta.gz # E-mailed helpdesk@ensembl.org to report all these problems (2007-01-09) gunzip otherIDs2.fasta.gz # Concatenate all sequences: cat ensembl42Pep.fasta otherIDs.fasta otherIDs2.fasta > ens42Pep.fasta grep '>' ens42Pep.fasta | wc # 36065 grep '>' ens42Pep.fasta > all.headers awk 'BEGIN {FS="|"} {print $2;}' all.headers | sort | uniq > allTxIds.sort comm -13 proteinCoding.ids.sort allTxIds.sort # no difference comm -23 proteinCoding.ids.sort allTxIds.sort # no difference so got all protein sequences for the protein-coding # trancsript IDs now. # load into database ssh hgwdev cd /cluster/data/danRer4/bed/ensembl42 hgsql -e 'drop table ensPep;' danRer4 hgPepPred danRer4 ensembl ensembl42Pep.fasta # edit trackDb/zebrafish/danRer4 to have an ensGene entry with the # archive date for Enembl v42 which is used for creating stable archive # links for the transcript ID and protein ID to make sure that these # always connect to the correct version of Ensembl Genes. # added track handler to hgTracks.c for ensGeneNonCoding and added # code to hgc.c to handle creating the correct stable archive link for # a particular version of Ensembl. # trackDb/zebrafish/danRer4/trackDb.ra entries for ensGene and # ensGeneNonCoding include these lines for creating the correct URLs: # url http://dec2006.archive.ensembl.org/Danio_rerio/transview?transcript=$$ # urlName gene # archive dec2006 # Add Biotype and External Gene ID to the Ensembl Non-Coding genes table # These can be retrieved from BioMart using the method as above for # Biotype but also selecting the External Gene ID. Click on the Filter # link on the left panel and expand the GENE section and check the box # for Gene Type and select all types except for protein_coding. # Select TSV as the output and Compressed file (*.gz) as the format. 
# save as ensNonCoding.biotype.txt.gz
ssh hgwdev
cd /cluster/data/danRer4/bed/ensembl42
gunzip ensNonCoding.biotype.txt.gz
tail +2 ensNonCoding.biotype.txt > ensNonCoding.biotype.tab
cat << 'EOF' > ensBiotype.sql
CREATE TABLE ensBiotype (
    transcriptId varchar(255) not null,
    biotype varchar(255) not null,
    extGeneId varchar(255) not null
);
'EOF'
hgLoadSqlTab danRer4 ensBiotype ensBiotype.sql ensNonCoding.biotype.tab
# Add extra fields to the ensGeneNonCoding genePred table:
hgsql -e \
  'alter table ensGeneNonCoding add biotype varchar(255) NOT NULL;' \
  danRer4
hgsql -e \
  'alter table ensGeneNonCoding add extGeneId varchar(255) NOT NULL;' \
  danRer4
# Add index to the extGeneId column:
hgsql -e 'alter table ensGeneNonCoding add index(extGeneId);' danRer4
hgsql -e 'select count(*) from ensGeneNonCoding;' danRer4
# 3560
hgsql -e 'update ensGeneNonCoding set biotype = "";' danRer4
hgsql -e 'update ensGeneNonCoding set extGeneId = "";' danRer4
# Now populate these columns with data from the ensBiotype table
hgsql -e 'select count(*) from ensGeneNonCoding as g, ensBiotype as b \
    where g.name = b.transcriptId;' danRer4
# 3560
hgsql -e 'update ensGeneNonCoding as g, ensBiotype as b \
    set g.biotype = b.biotype where g.name = b.transcriptId;' danRer4
hgsql -e 'select count(*) from ensGeneNonCoding where biotype != "";' \
    danRer4
# 3560
# then set the External Gene ID:
hgsql -e 'update ensGeneNonCoding as g, ensBiotype as b \
    set g.extGeneId = b.extGeneId where g.name = b.transcriptId;' danRer4
hgsql -e 'select count(*) from ensGeneNonCoding where extGeneId != "";' \
    danRer4
# 3393
# This is correct since 167 rows in the ensNonCoding.biotype.tab have no
# external Gene ID:
awk '{if ($3 == "") print;}' ensNonCoding.biotype.tab | wc -l
# 167
# 3393 + 167 = 3560
# Now check code in hgc.c for handling the details page for this track.
#########################################################################
# RADIATION HYBRID (RH) MAP TRACK (DONE, 2007-01-12 - 2007-01-23, hartera)
# Data from Yi Zhou at Boston Children's Hospital:
# yzhou@enders.tch.harvard.edu
# Latest RH map sequences and primers received on 2006-10-03 from
# Anhua (Peter) Song - asong@enders.tch.harvard.edu
# Changed the name of rhMapInfo table and related files to rhMapZfishInfo
# to make the name more zebrafish-specific (2007-02-08, hartera)
# Remake track as one of the primer sequences was in the sequence for
# 1942C.INSERTMUT and also changed another marker name to remove a forward
# slash. Remade rhMapZfishInfo table and removed spaces from primer sequences.
# (2007-02-14, hartera) # Collected stats on RH map alignments for Yi Zhou (DONE, 2007-03-28, hartera) ssh kkstore04 mkdir /cluster/data/danRer4/bed/ZonLab/rhMap-2006-10-03 cd /cluster/data/danRer4/bed/ZonLab ln -s rhMap-2006-10-03 rhMap cd rhMap # download data files from e-mail: # rhSequenceSubmit100306.zip and rhSequenceSubmitSeq100306.zip unzip rhSequenceSubmit100306.zip unzip rhSequenceSubmitSeq100306.zip dos2unix rhSequenceSubmit100306.txt dos2unix rhSequenceSubmitSeq100306.txt # Sequences are in rhSequenceSubmitSeq100306.txt and primers and other # information are in rhSequenceSubmi100306.txt grep '>' rhSequenceSubmitSeq100306.txt | wc -l # 11514 wc -l rhSequenceSubmit100306.txt # 13438 rhSequenceSubmit100306.txt grep '>' rhSequenceSubmitSeq100306.txt > rhMap.names # remove '>' from names and grab first field perl -pi.bak -e 's/>//' rhMap.names awk 'BEGIN {FS="|"} {print $1;}' rhMap.names | sort | uniq \ > rhMap.namesOnly.sort awk 'BEGIN {FS="|"} {print $1;}' rhSequenceSubmit100306.txt | sort | uniq \ > rhMapPrimers.namesOnly.sort wc -l *.sort # 11514 rhMap.namesOnly.sort # 13436 rhMapPrimers.namesOnly.sort (after removing blank line) # get a list of headers from the FASTA file grep '>' rhSequenceSubmitSeq100306.txt > rhMap.headers awk 'BEGIN {FS="|"} {print $5;}' rhMap.headers | sort | uniq # BAC_END # EST # GENE # SSLP # STS # There are 5 types of sequence here. awk 'BEGIN {FS="|"} {print $9;}' rhMap.headers | sort | uniq #BACends #Custom #Insertion_Mutant #Insertion_Mutants #MGH #NCBI #Sanger SG #Sequencing_Project #ThisseClone #Thisse_Clone #other_zfEst #wu_zfEst #wz awk 'BEGIN {FS="|"} {print $10;}' rhMap.headers | sort | uniq # CHBG # MPIEB # Insertion_Mutant = Insertion_Mutants; ThisseClone = Thisse_Clone; # So there are 11 different sources. # There are 2 sequences with problem primers. E-mailed Peter Song about # these and he suggested to delete thoser primers: # >fb33f01.u1|5|388|5615|EST|f|cR|f|wu_zfEst|CHBG|+++33333333333333333333.| # >zfishb-a976e04.p1c|14|16|158|STS|f|cR|f|Sequencing_Project|CHBG|A|A| # edit rhMap022306.fa and rhMapPrimers022306.txt and delete these primers. # need to reformat FASTA headers so they are in the format: # NAME.SOURCE.TYPE.ORIGIN # Insertion_Mutant=Insertion_Mutants; Thisse_Clone=ThisseClone # so change these to have the same name. Also shorten Sanger SG to # Shotgun. sed -e 's/Insertion_Mutants/InsertMut/' rhSequenceSubmitSeq100306.txt \ | sed -e 's/Insertion_Mutant/InsertMut/' \ | sed -e 's/Sanger SG/Shotgun/' \ | sed -e 's/ThisseClone/Thisse/' \ | sed -e 's/Thisse_Clone/Thisse/' \ | sed -e 's/Sequencing_Project/Seqproj/' > rhMap100306.fa # Do the same for the primers and information file: sed -e 's/Insertion_Mutants/InsertMut/' rhSequenceSubmit100306.txt \ | sed -e 's/Insertion_Mutant/InsertMut/' \ | sed -e 's/Sanger SG/Shotgun/' \ | sed -e 's/ThisseClone/Thisse/' \ | sed -e 's/Thisse_Clone/Thisse/' \ | sed -e 's/Sequencing_Project/Seqproj/' > rhMapPrimers100306.txt # edit these files to remove the extra newline char after the first primer # for 1942c and then change "/" in FJ34C05.Y1/FJ56G09.Y1.WU_ZFEST to # an underscore (2007-02-14, hartera) perl -pi.bak -e 's/fj34c05\.y1\/fj56g09/fj34c05\.y1_fj56g09/' \ rhMap100306.fa perl -pi.bak -e 's/fj34c05\.y1\/fj56g09/fj34c05\.y1_fj56g09/' \ rhMapPrimers100306.txt # use a script to reformat the names for the FASTA headers to the format # >NAME.SOURCE where name is the first field separated by "|" and source # is the 9th field. The source is used to make the name unique. 
Some # of these names are BAC ends that occur in the BAC ends track so there # are name clashes in the seq table if the names are not made unique. # Also make the name upper case as for those for the danRer1 and danRer2 # RH map and remove base numbering on each sequence line of FASTA file. cat << '_EOF_' > rhFix #!/usr/bin/awk -f #>z1396|14|418|5707|SSLP|f|cR|f|MGH|MPIEB|ATCCTTCAGCCACTCCTTCA|TGGAACCTGAAAAACACACG| /^>/ { split(toupper($0), a, "\\|"); print a[1]"."a[9]; next; } /^[0-9]+ / { $0 = $2; } { print $0; } '_EOF_' # << keep emacs coloring happy chmod +x rhFix rhFix rhMap100306.fa > rhMap.fa # Blat sequences vs danRer4 genome ssh pk mkdir -p /cluster/data/danRer4/bed/ZonLab/rhMap/blatRun cd /cluster/data/danRer4/bed/ZonLab/rhMap # put the rhMap sequences on the san mkdir -p /san/sanvol1/scratch/danRer4/rhMap cp rhMap.fa /san/sanvol1/scratch/danRer4/rhMap/ # do blat run to align RH map sequences to danRer4 and and use # chrNA_random and chrUn_random separated into scaffolds. cd blatRun ls -1S /san/sanvol1/scratch/danRer4/rhMap/rhMap.fa > rhMap.lst ls -1 /san/sanvol1/scratch/danRer4/trfFa/chr[0-9M]*.fa > genome.lst foreach f (/san/sanvol1/scratch/danRer4/scaffoldsSoftMask/*.fa) ls -1 $f >> genome.lst end wc -l genome.lst # 3237 genome.lst # for output: mkdir -p /san/sanvol1/scratch/danRer4/rhMap/psl # use -repeats option to report matches to repeat bases separately # to other matches in the PSL output. echo '#LOOP\n/cluster/bin/x86_64/blat -repeats=lower -minIdentity=80 -ooc=/san/sanvol1/scratch/danRer4/danRer4_11.ooc $(path1) $(path2) {check out line+ /san/sanvol1/scratch/danRer4/rhMap/psl/$(root1)_$(root2).psl}\n#ENDLOOP' > template.sub gensub2 genome.lst rhMap.lst template.sub para.spec para create para.spec para try, check, push ... etc. para time # Completed: 3237 of 3237 jobs #CPU time in finished jobs: 4787s 79.78m 1.33h 0.06d 0.000 y #IO & Wait Time: 8080s 134.67m 2.24h 0.09d 0.000 y #Average job time: 4s 0.07m 0.00h 0.00d #Longest running job: 0s 0.00m 0.00h 0.00d #Longest finished job: 18s 0.30m 0.01h 0.00d #Submission to last job: 752s 12.53m 0.21h 0.01d # need to do pslSort and lift up ssh pk cd /san/sanvol1/scratch/danRer4/rhMap # Do sort, liftUp and then best in genome filter. # only use alignments that have at least # 95% identity in aligned region. # Previously did not use minCover since a lot of sequence is in # Un and NA so genes may be split up so good to see all alignments. # However, found a number of short alignments of <= 50 bp. These are # not meaningful so maybe need to use minCover. If increased too much, # then hits on poor parts of the assembly will be missed. # use pslCDnaFilter with the same parameters as used for zebrafish # Genbank EST alignments. 
pslSort dirs raw.psl tmp psl pslCDnaFilter -localNearBest=0.005 -minQSize=20 -minNonRepSize=16 \ -ignoreNs -bestOverlap -minId=0.95 -minCover=0.15 raw.psl contig.psl # seqs aligns # total: 11326 1628158 # drop invalid: 1 1 # drop minNonRepSize: 3068 1286657 # drop minIdent: 3442 104586 # drop minCover: 2838 205568 # weird over: 163 1124 # kept weird: 107 172 # drop localBest: 3011 17130 # kept: 11121 14216 # 11514 # The percentage aligned is 11121/11514 = 96.6% # Number of alignments for markers with most alignments after filtering: # 35 BZ83M20.Z.BACENDS # 17 ZKP63A5.YA.BACENDS # 17 ZKP117C9.YA.BACENDS # 16 ZK30E10.SP6.BACENDS # 15 ZC133H17.ZA.BACENDS # 12 Z13442.MGH # 11 ZK105J10.T7.BACENDS # 10 ZC261G9.ZAF.BACENDS # 10 ZC261G9.ZA.BACENDS # 9 ZK19H9.SP6.BACENDS # 9 Z4910.MGH # 9 FJ07G09.X1.WU_ZFEST # 8 ZK4I5.T7.BACENDS # 8 ZC27I3.ZA.BACENDS pslCDnaFilter -localNearBest=0.005 -minQSize=20 -minNonRepSize=16 \ -ignoreNs -bestOverlap -minId=0.92 -minCover=0.15 raw.psl contig.psl # seqs aligns # total: 11326 1628158 # drop invalid: 1 1 # drop minNonRepSize: 3068 1286657 # drop minIdent: 2740 60578 # drop minCover: 3083 223430 # weird over: 318 3132 # kept weird: 154 249 # drop localBest: 3480 43022 # kept: 11212 14470 # Percentage aligned is 11212/11514 = 97.4% pslCDnaFilter -localNearBest=0.005 -minQSize=20 -minNonRepSize=10 \ -ignoreNs -bestOverlap -minId=0.92 -minCover=0.15 raw.psl contig.psl # seqs aligns # total: 11326 1628158 # drop invalid: 1 1 #drop minNonRepSize: 3026 1258275 # drop minIdent: 2902 72521 # drop minCover: 3256 231002 # weird over: 344 3365 # kept weird: 157 252 # drop localBest: 3604 51799 # kept: 11228 14560 # There isn't much difference 11228/11514 = 97.5% awk '{print $10}' contig.psl | sort | uniq -c | sort -nr # Top numbers of hits: # 35 BZ83M20.Z.BACENDS # 17 ZKP63A5.YA.BACENDS # 17 ZKP117C9.YA.BACENDS # 16 ZK30E10.SP6.BACENDS # 15 ZC133H17.ZA.BACENDS # 13 FJ07G09.X1.WU_ZFEST # 12 Z13442.MGH # 11 ZK105J10.T7.BACENDS # 10 ZC261G9.ZAF.BACENDS # 10 ZC261G9.ZA.BACENDS # 9 ZK19H9.SP6.BACENDS # 9 Z4910.MGH # 9 Z3157.MGH # 8 ZK4I5.T7.BACENDS pslCDnaFilter -localNearBest=0.005 -minQSize=20 -minNonRepSize=16 \ -ignoreNs -bestOverlap -minId=0.90 -minCover=0.15 raw.psl contig.psl # seqs aligns # total: 11326 1628158 # drop invalid: 1 1 # drop minNonRepSize: 3068 1286657 # drop minIdent: 2306 34000 # drop minCover: 3166 230461 # weird over: 388 5030 # kept weird: 168 270 # drop localBest: 3647 62505 # kept: 11232 14534 # Percent sequences aligned: 11232/11514 = 97.6% pslCDnaFilter -localNearBest=0.005 -minQSize=20 -minNonRepSize=16 \ -ignoreNs -bestOverlap -minId=0.90 -minCover=0.20 raw.psl contig.psl # seqs aligns # total: 11326 1628158 # drop invalid: 1 1 # drop minNonRepSize: 3068 1286657 # drop minIdent: 2306 34000 # drop minCover: 3418 245102 # weird over: 343 4235 # kept weird: 159 252 # drop localBest: 3206 48291 # kept: 11189 14107 # Percent sequences aligned: 11189/11514 = 97.2% pslCDnaFilter -localNearBest=0.005 -minQSize=20 -minNonRepSize=16 \ -ignoreNs -bestOverlap -minId=0.80 -minCover=0.20 raw.psl contig.psl # seqs aligns # total: 11326 1628158 # drop invalid: 1 1 #drop minNonRepSize: 3068 1286657 # drop minIdent: 1 2 # drop minCover: 3599 256955 # weird over: 414 8594 # kept weird: 173 270 # drop localBest: 3410 70389 # kept: 11205 14154 # Percent sequences aligned: 11205/11514 = 97.3% # 35 BZ83M20.Z.BACENDS # 17 ZKP63A5.YA.BACENDS # 17 ZKP117C9.YA.BACENDS # 16 ZK30E10.SP6.BACENDS # 15 ZC133H17.ZA.BACENDS # 13 FJ07G09.X1.WU_ZFEST # 11 ZK105J10.T7.BACENDS # 10 
ZC261G9.ZAF.BACENDS # 10 ZC261G9.ZA.BACENDS # 9 ZK19H9.SP6.BACENDS # 9 Z4910.MGH # 8 ZK4I5.T7.BACENDS # 8 ZC27I3.ZA.BACENDS # 8 Z7243.MGH # 8 Z3157.MGH pslCDnaFilter -localNearBest=0.005 -minQSize=20 -minNonRepSize=16 \ -ignoreNs -bestOverlap -minId=0.80 -minCover=0.15 raw.psl contig.psl # seqs aligns # total: 11326 1628158 # drop invalid: 1 1 #drop minNonRepSize: 3068 1286657 # drop minIdent: 1 2 # drop minCover: 3322 238087 # weird over: 470 9995 # kept weird: 181 288 # drop localBest: 3876 88821 # kept: 11246 14590 # Percent sequences aligned: 11246/11514 = 97.7% # Use lower minId and higher minCover (0.20) as for the BAC ends and for # the RH map on other zebrafish assemblies. pslCDnaFilter -localNearBest=0.005 -minQSize=20 -minNonRepSize=16 \ -ignoreNs -bestOverlap -minId=0.85 -minCover=0.20 raw.psl contig.psl # seqs aligns # total: 11326 1628158 # drop invalid: 1 1 #drop minNonRepSize: 3068 1286657 # drop minIdent: 775 3806 # drop minCover: 3552 255528 # weird over: 403 7578 # kept weird: 171 268 # drop localBest: 3358 68020 # kept: 11203 14146 # 97.3% (11203/11514) of sequences are aligned using these filter criteria # Loaded these sequences as below and then checked the rhMap track in the # danRer4 Genome Browser to see if there are any pileups. # there is one big pileup on chr24 that is in the same region as # that was found for danRer3 after using liftOver: # i.e. chr13:8,112,962-8,113,055 on danRer3 which lifts over to # chr24:8,191,404-8,191,497 on danRer4 and there is also a pileup # of RH map sequences here. If you look at Z33743, it has 3 alignments # to chr23, chr24 and chrNA_random. The chr23 alignment is the best and # this is where its primers map to. If a higher threshold is taken # for min coverage in the filtering, this may be avoided. Checked all the # whole chromosome views in the Browser and chr24 is the only one that # appears to have this large pileup. 
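    # Pileups can also be screened for outside the Browser; a sketch that
    # bins alignment starts into 100 kb windows on the chrom-level psl
    # (rhMap.psl, created below) and reports the densest windows
    # (psl column 14 is tName, column 16 is tStart):
    awk '{print $14 "\t" int($16/100000)}' rhMap.psl \
        | sort | uniq -c | sort -nr | head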
    # try increasing the minCover parameter:
    pslCDnaFilter -localNearBest=0.005 -minQSize=20 -minNonRepSize=16 \
        -ignoreNs -bestOverlap -minId=0.85 -minCover=0.25 raw.psl contig.psl
    #                      seqs    aligns
    # total:              11326   1628158
    # drop invalid:           1         1
    # drop minNonRepSize:  3068   1286657
    # drop minIdent:        775      3806
    # drop minCover:       3754    271241
    # weird over:           358      6379
    # kept weird:           157       252
    # drop localBest:      2916     52769
    # kept:               11100     13684
    # Percent sequences aligned: 11100/11514 = 96.4%
    pslCDnaFilter -localNearBest=0.005 -minQSize=20 -minNonRepSize=16 \
        -ignoreNs -bestOverlap -minId=0.85 -minCover=0.30 raw.psl contig.psl
    #                      seqs    aligns
    # total:              11326   1628158
    # drop invalid:           1         1
    # drop minNonRepSize:  3068   1286657
    # drop minIdent:        775      3806
    # drop minCover:       3929    283124
    # weird over:           310      5451
    # kept weird:           145       236
    # drop localBest:      2549     41325
    # kept:               10938     13245
    # Percent sequences aligned: 10938/11514 = 95.0%
    pslCDnaFilter -localNearBest=0.005 -minQSize=20 -minNonRepSize=16 \
        -ignoreNs -bestOverlap -minId=0.85 -minCover=0.40 raw.psl contig.psl
    #                      seqs    aligns
    # total:              11326   1628158
    # drop invalid:           1         1
    # drop minNonRepSize:  3068   1286657
    # drop minIdent:        775      3806
    # drop minCover:       4293    298517
    # weird over:           245      4052
    # kept weird:           128       211
    # drop localBest:      2079     26658
    # kept:               10489     12519
    # Percent sequences aligned: 10489/11514 = 91.1%
    pslCDnaFilter -localNearBest=0.005 -minQSize=20 -minNonRepSize=16 \
        -ignoreNs -bestOverlap -minId=0.85 -minCover=0.35 raw.psl contig.psl
    #                      seqs    aligns
    # total:              11326   1628158
    # drop invalid:           1         1
    # drop minNonRepSize:  3068   1286657
    # drop minIdent:        775      3806
    # drop minCover:       4119    292022
    # weird over:           274      4640
    # kept weird:           137       227
    # drop localBest:      2279     32801
    # kept:               10724     12871
    # Percent sequences aligned: 10724/11514 = 93.1%
    pslCDnaFilter -localNearBest=0.005 -minQSize=20 -minNonRepSize=16 \
        -ignoreNs -bestOverlap -minId=0.85 -minCover=0.32 raw.psl contig.psl
    #                      seqs    aligns
    # total:              11326   1628158
    # drop invalid:           1         1
    # drop minNonRepSize:  3068   1286657
    # drop minIdent:        775      3806
    # drop minCover:       4001    287002
    # weird over:           296      5113
    # kept weird:           144       235
    # drop localBest:      2437     37599
    # kept:               10862     13093
    # Percent sequences aligned: 10862/11514 = 94.3%
    rm contig*
    # Final parameters: use minCover=0.33
    pslCDnaFilter -localNearBest=0.005 -minQSize=20 -minNonRepSize=16 \
        -ignoreNs -bestOverlap -minId=0.85 -minCover=0.33 raw.psl contig.psl
    #                      seqs    aligns
    # total:              11326   1628158
    # drop invalid:           1         1
    # drop minNonRepSize:  3068   1286657
    # drop minIdent:        775      3806
    # drop minCover:       4045    288763
    # weird over:           287      4946
    # kept weird:           142       233
    # drop localBest:      2375     35906
    # kept:               10818     13025
    # Percent sequences aligned: 10818/11514 = 94.0%
    # This is a compromise between reducing the number of sequences piling
    # up on chr24 and not losing all alignments for too many sequences.
    cd /cluster/data/danRer4/bed/ZonLab/rhMap
    # lift up to genome level coordinates
    rm rhMap*psl
    liftUp rhMap.psl \
        /cluster/data/danRer4/jkStuff/liftAll.lft warn \
        /san/sanvol1/scratch/danRer4/rhMap/contig.psl
    # Got 6247 lifts in /cluster/data/danRer4/jkStuff/liftAll.lft
    pslCheck rhMap.psl
    # psl looks ok
    # cleanup
    rm *.bak rhMap.headers rhMap.names *.sort headers.new
    # Load sequence alignments into the database
    ssh hgwdev
    cd /cluster/data/danRer4/bed/ZonLab/rhMap
    # drop test tables and reload final psl file
    # drop old rhMap table
    hgsql -e 'drop table rhMap;' danRer4
    hgLoadPsl danRer4 rhMap.psl
    # Copy sequences to gbdb if they are not already there.
    mkdir -p /gbdb/danRer4/rhMap
    # remove old sequences
    rm /gbdb/danRer4/rhMap/rhMap20061003.fa
    ln -s \
        /cluster/data/danRer4/bed/ZonLab/rhMap/rhMap.fa \
        /gbdb/danRer4/rhMap/rhMap20061003.fa
    # then add sequences to database:
    # remove old sequences (2007-02-14, hartera)
    hgsql -e 'select * from extFile where path like "%rhMap%";' danRer4
    # +--------+------------------+--------------------------------------+---------+
    # | id     | name             | path                                 | size    |
    # +--------+------------------+--------------------------------------+---------+
    # | 709793 | rhMap20061003.fa | /gbdb/danRer4/rhMap/rhMap20061003.fa | 7456887 |
    # +--------+------------------+--------------------------------------+---------+
    hgsql -e 'select count(*) from seq where extFile = 709793;' danRer4
    # 11514
    hgsql -e 'delete from seq where extFile = 709793;' danRer4
    hgsql -e 'delete from extFile where id = 709793;' danRer4
    # then reload the new sequence file
    hgLoadSeq danRer4 /gbdb/danRer4/rhMap/rhMap20061003.fa
    # loaded successfully
    # Check in the Browser and see if there are many pileups.
    # Much reduced now on chr24. Took 10 random sequences in the pileup from
    # minCover=0.20 and found that 7 of them still align to danRer4 with
    # minCover=0.33, and 2 of those that do not also have primers that do not
    # map using the hgPcr tool.
    # Add trackDb entry and also an rhMap.html for trackDb/zebrafish/danRer4;
    # also add the search specs for hgFindSpec to trackDb.ra.
    # Add table of related information for the RH map details pages:
    # Check that all the headers from rhMap.headers are also in the primers
    # file, which seems to contain the same headers as the FASTA file
    # as well as additional markers.
    # Remake the rhMapZfishInfo table too (hartera, 2007-02-14) so that the
    # newline is removed from the 1942C.INSERTMUT line and the underscore is
    # added to the FJ34C05.Y1_FJ56G09.Y1.WU_ZFEST ID in place of "/".
    ssh kkstore04
    cd /cluster/data/danRer4/bed/ZonLab/rhMap/
    grep '>' rhMap100306.fa > rhMap.headers
    perl -pi.bak -e 's/>//' rhMap.headers
    sort rhMap.headers > rhMap.headers.sort
    sort rhMapPrimers100306.txt > rhMapPrimers.sort
    wc -l *.sort
    # 11514 rhMap.headers.sort
    # 13437 rhMapPrimers.sort
    comm -12 rhMap.headers.sort rhMapPrimers.sort | wc -l
    # 11514 in common
    # so all FASTA headers from rhMap100306.fa are in the primers file
    # Get headers again from the rhMap.fa file as the names of the sources have
    # been changed. Parse out information from the headers to add to an
    # rhMapInfo table so that this information can be displayed on the details
    # page for the RH map markers.
    # Fields used from each "|"-separated header: 1 - name, 2 - linkage group
    # (chrom), 3 - position number on the RH map for that linkage group,
    # 4 - distance (in cR) from the top of the linkage group, 5 - type of
    # marker (SSLP, BAC_END, EST, GENE, STS), 9 - source, 10 - institute that
    # mapped the marker, 11 - 5' forward primer, 12 - 3' reverse primer.
    # Sort headers by linkage group and by position
    grep '>' rhMap100306.fa > rhMap.headers2
    # then use the rhMap.headers2 file to extract the marker information
    # and to reformat the names from the FASTA headers to the format
    # >NAME.SOURCE, where name is the first "|"-separated field and source
    # is the 9th field, so that names in the rhMap and rhMapInfo tables are
    # the same. The source is used to make the name unique.
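    # As a worked example of that reformatting (using the sample header quoted
    # in the getRhInfo script below), a FASTA header such as
    #   >z1396|14|418|5707|SSLP|f|cR|f|MGH|MPIEB|ATCCTTCAGCCACTCCTTCA|TGGAACCTGAAAAACACACG|
    # becomes the upper-cased, tab-separated rhMapInfo row:
    #   Z1396.MGH  LG14  418  5707  SSLP  MGH  MPIEB  ATCCTTCAGCCACTCCTTCA  TGGAACCTGAAAAACACACG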
    cat << '_EOF_' > getRhInfo
#!/usr/bin/awk -f
#>z1396|14|418|5707|SSLP|f|cR|f|MGH|MPIEB|ATCCTTCAGCCACTCCTTCA|TGGAACCTGAAAAACACACG|
/^>/ {
    sub(/>/,"",$0);
    split(toupper($0), a, "\\|");
    print a[1]"."a[9]"\tLG"a[2]"\t"a[3]"\t"a[4]"\t"a[5]"\t"a[9]"\t"a[10]"\t"a[11]"\t"a[12];
    next;
}
'_EOF_'
    # << keep emacs coloring happy
    chmod +x getRhInfo
    getRhInfo rhMap.headers2 > rhMapInfo.tab
    # Sort headers by linkage group (LG) and by position
    sort -k 2,2 -k 3,3n rhMapInfo.tab > rhMapInfoSorted.tab
    wc -l rhMapInfoSorted.tab
    # 11514 rhMapInfoSorted.tab
    # Need to add ZFIN IDs - data received on 2006-06-23:
    # rhSeqWithZdbNameToRachel.zip
    unzip rhSeqWithZdbNameToRachel.zip
    tail +3 rhSeqWithZdbNameToRachel.txt \
        | awk 'BEGIN {OFS= "\t"} {print $1, $7}' \
        | sort | uniq > rhSeqZfinIds.txt
    # translate names to upper case
    cat rhSeqZfinIds.txt | tr '[a-z]' '[A-Z]' > rhSeqZfinIds.format.txt
    # then map these marker names and ZFIN IDs to markers in
    # rhMapInfoSorted.tab. Also remove spaces - some of the primer sequences
    # have spaces (hartera, 2007-02-14)
    cat << 'EOF' > mapZfinIds.pl
#!/usr/bin/perl -w
use strict;
my ($zf, $rh, %zfinIds);
$zf = $ARGV[0];   # file of ZFIN IDs and marker names
$rh = $ARGV[1];   # rhMapInfo.tab file
open(ZFIN, $zf) || die "Can not open $zf :$!\n";
open(RH, $rh) || die "Can not open $rh : $!\n";
while (<ZFIN>) {
   my ($line, @fi);
   chomp;
   $line = $_;
   @fi = split(/\t/, $line);
   # store ZFIN ID in hash keyed by marker name
   $zfinIds{$fi[1]} = $fi[0];
}
close ZFIN;
# read in the markers from rhMapInfo file
while (<RH>) {
   my ($li, @f, $marker, @m, $mName, $j, $i);
   $mName = "";
   $zf = "";
   chomp;
   $li = $_;
   @f = split(/\t/, $li);
   $marker = $f[0];
   # split by "." and remove the extension after the last "."
   @m = split(/\./, $marker);
   $mName = $m[0];
   if (($mName ne "") && (exists($zfinIds{$mName}))) {
      $zf = $zfinIds{$mName};
   }
   for ($j = 1; $j < $#m; $j++) {
      $mName = $mName . "." . $m[$j];
   }
   if (($mName ne "") && (exists($zfinIds{$mName}))) {
      $zf = $zfinIds{$mName};
   }
   print "$f[0]\t$zf";
   # print other fields and remove spaces
   for ($i = 1; $i <= $#f; $i++) {
      $f[$i] =~ s/\s//g;
      print "\t$f[$i]";
   }
   if ($#f == 6) {
      print "\t\t";
   }
   print "\n";
}
'EOF'
    chmod +x mapZfinIds.pl
    perl mapZfinIds.pl rhSeqZfinIds.format.txt rhMapInfoSorted.tab \
        > rhMapInfoWithZfinIds.tab
    # There are 1867 markers with no ZFIN ID
    wc -l rhMapInfo*
    # 11514 rhMapInfo.tab
    # 11514 rhMapInfoSorted.tab
    # 11514 rhMapInfoWithZfinIds.tab
    # When loading, found that 1942.C has only 1 primer. Problem with
    # rhMapPrimers100306.txt: there was a newline between the primers in this
    # file, so remove it there and in rhMap100306.fa and then process it again
    # (now this was done at an earlier step, 2007-02-14).
    # Create a table with RH map item information including type, source,
    # origin and primer sequences.
    # Already created rhMapInfo.sql, rhMapInfo.c and rhMapInfo.h files
    # using autoSql - see danRer3.txt. None of the assemblies with the RH
    # map on the RR have this rhMapInfo table so it can be redefined.
    # Load these into a table called rhMapInfo2 - this is rhMapInfo
    # with an extra column for the ZFIN ID.
    # Use autoSql to create a .sql file.
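    # Before loading, a quick sanity check on the merged file (a sketch; the
    # 1867 figure comes from the note above). The ZFIN ID is the second
    # tab-separated column, left empty when no ID was found:
    wc -l rhMapInfoWithZfinIds.tab
    # expect 11514
    awk -F'\t' '$2 == ""' rhMapInfoWithZfinIds.tab | wc -l
    # expect 1867 markers with no ZFIN ID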
    ssh hgwdev
    # rename the information table and make it zebrafish specific
    # (2007-02-08, hartera)
    cat << 'EOF' > ~/kent/src/hg/lib/rhMapZfishInfo.as
table rhMapZfishInfo
"Zebrafish Radiation Hybrid map information"
    (
    string name;        "Name of Radiation Hybrid (RH) map marker"
    string zfinId;      "ZFIN ID for the marker"
    string linkageGp;   "Linkage group to which the marker was mapped"
    uint position;      "Position number in RH map for this linkage group"
    uint distance;      "Distance from the top of linkage group (cR)"
    string markerType;  "Type of marker"
    string source;      "Source of marker"
    string mapSite;     "Institution that mapped the marker"
    string leftPrimer;  "Forward primer sequence"
    string rightPrimer; "Reverse primer sequence"
    )
'EOF'
    # << happy emacs
    # create .sql, .c and .h files using autoSql
    cd ~/kent/src/hg/lib
    autoSql rhMapZfishInfo.as rhMapZfishInfo
    mv rhMapZfishInfo.h ../inc
    # edit rhMapZfishInfo.sql and add an index (INDEX(zfinId)).
    # commit these files (*.as, *.sql, *.c and *.h) to CVS, replacing
    # the original rhMapInfo* files.
    # make changes to hgc so that it prints the ZFIN ID in addition to the
    # other rhMapZfishInfo fields.
    # reload table with new name (2007-02-08, hartera):
    cd /cluster/data/danRer4/bed/ZonLab/rhMap
    hgsql -e 'drop table rhMapInfo;' danRer4
    # reloaded the rhMapZfishInfo table (2007-02-08, hartera)
    hgsql -e 'drop table rhMapZfishInfo;' danRer4
    hgLoadSqlTab danRer4 rhMapZfishInfo ~/kent/src/hg/lib/rhMapZfishInfo.sql \
        rhMapInfoWithZfinIds.tab
    # add code to hgc.c to print the ZFIN ID, if available, on the details page
    # together with the other marker-related information.
    # added track to trackDb.ra in trackDb/zebrafish/danRer4 with a URL for
    # the ZFIN IDs to link to the relevant page at http://www.zfin.org
    # and added an html page for the track.
    # Added the rhMapZfishInfo.h file to the makefile in src/hg/lib
    # and replaced rhMapInfo with rhMapZfishInfo in src/hg/hgc/hgc.c

    # RH MAP STATISTICS
    # Get some stats for Yi Zhou at Harvard (2007-03-20 & 2007-03-28)
    # Of the 11514 markers with sequence information, 10818 aligned (94%)
    # using a filter requiring 85% sequence identity, with all portions of all
    # alignments for a sequence within 0.5% of the identity of the best
    # alignment for each portion of the marker. At least 0.33 of the query
    # sequence must be aligned and at least 16 bases must lie outside repeat
    # regions.
    cd /cluster/data/danRer4/bed/ZonLab/rhMap
    mkdir stats
    cd stats
    hgsql -e 'select count(distinct(qName)) from rhMap;' danRer4
    # 10818
    hgsql -N -e 'select qName from rhMap;' danRer4 | sort | uniq -c \
        | sort -nr > qNames.count
    # send this list too
    # 1701 markers have 2 or more BLAT alignments that pass the filter.
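    # The "2 or more alignments" figure can be read straight from qNames.count
    # (a sketch; uniq -c puts the alignment count in column 1):
    awk '$1 >= 2' qNames.count | wc -l
    # expect 1701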
    hgsql -N -e 'select name, linkageGp from rhMapZfishInfo;' danRer4 \
        > markers.linkageGroups
    hgsql -N -e 'select qName, tName from rhMap;' danRer4 > rhMap.align.chroms
    ssh kkstore04
    cd /cluster/data/danRer4/bed/ZonLab/rhMap/stats
    sed -e 's/LG/chr/' markers.linkageGroups > markers.rhMap.chroms
    # some marker names contain "LG"
    awk '{print $1}' markers.linkageGroups | grep "LG"
    # there are 18 and all begin with "TLG"
    sed -e 's/Tchr/TLG/' markers.rhMap.chroms > markers.rhMap.chroms2
    sort markers.rhMap.chroms2 | uniq > markers.rhMap.chroms.sort
    wc -l markers.rhMap.chroms*
    # 11514 markers.rhMap.chroms
    # 11514 markers.rhMap.chroms.sort
    # 11514 markers.rhMap.chroms2
    # same when uniqued
    sort rhMap.align.chroms | uniq > rhMap.align.chroms.sort
    wc -l rhMap.align*
    # 13025 rhMap.align.chroms
    # 11344 rhMap.align.chroms.sort
    # Find how well the RH map and Zv6 agree in terms of chromosome
    # assignment, given that the linkage group number is the same as the
    # chromosome number.
    comm -23 rhMap.align.chroms.sort markers.rhMap.chroms.sort \
        > diffChromInGenome
    # need to find just those in rhMap.align.chroms.sort that are
    # in rhMap.
    awk '{print $1}' rhMap.align.chroms.sort | sort | uniq > rhMap.align.names
    foreach n (`cat rhMap.align.names`)
        echo $n
        grep -w $n markers.rhMap.chroms.sort >> markers.rhMap.chroms.aligned
    end
    # 10818 in markers.rhMap.chroms.aligned
    # 10818 rhMap.align.names
    # then compare this list to the ones that are aligned to the genome
    comm -13 rhMap.align.chroms.sort markers.rhMap.chroms.aligned \
        > diffChromInRHMap
    wc -l diffChromInRHMap
    # 1392 diffChromInRHMap
    # These are the markers that have a different chromosome (linkage group)
    # assigned in the RH map from that found by BLAT alignment of the marker
    # sequence to the genome. This list shows the linkage group (chr)
    # assignments in the RH map; next, generate a list of where these markers
    # align in the genome. (Markers with at least one alignment to the same
    # chrom as in the linkage map may also be aligning to other chroms.)
    awk '{print $1}' diffChromInRHMap > diffChromInRHMap.names
    foreach n (`cat diffChromInRHMap.names`)
        echo $n
        grep -w $n rhMap.align.chroms.sort >> rhMap.genomeAlign.diffInRHmap
    end
    wc -l rhMap.genomeAlign.diffInRHmap
    # 1562 rhMap.genomeAlign.diffInRHmap
    # This is the list of markers that differ in chrom between the RH map
    # and the genome alignment, together with the chroms to which each marker
    # is aligned by BLAT. There are more lines in this file than markers
    # because some markers align more than once to the genome and so appear
    # more than once.
    # Therefore, of the 10818 markers aligned, 1392 (12.9%) align to a
    # different chromosome.
    # Some of these may be aligning to chrUn_random or chrNA_random.
    grep random rhMap.genomeAlign.diffInRHmap | awk '{print $1}' \
        | sort | uniq > diffInRHmap.alignedToRandom
    wc -l diffInRHmap.alignedToRandom
    # 142 diffInRHmap.alignedToRandom
    # Of the markers with different chroms in the genome alignment and the
    # linkage map, 142 (1.3% of 10818) align to chrUn_random or chrNA_random,
    # so the sequence containing these markers has not yet been placed on a
    # chromosome.
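    # The two csh foreach loops above run one grep per marker. An equivalent
    # way to build the same per-marker subsets (a sketch, not how it was done
    # here) is to join on the marker name; this assumes both inputs are sorted
    # on the first field, as the sort/uniq steps above produce, and note that
    # join writes space-separated output rather than tabs:
    # markers with at least one alignment, joined to their RH map chrom
    join rhMap.align.names markers.rhMap.chroms.sort \
        > markers.rhMap.chroms.aligned
    # genome alignments for the markers whose RH map chrom disagrees
    join diffChromInRHMap.names rhMap.align.chroms.sort \
        > rhMap.genomeAlign.diffInRHmap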
#########################################################################
## Reorder Fish organisms (DONE - 2006-12-22 - Hiram)
    hgsql -h genome-testdb hgcentraltest \
        -e "update dbDb set orderKey = 450 where name = 'danRer4';"
##########################################################################
# GenBank gbMiscDiff table (markd 2007-01-10)
# Supports `NCBI Clone Validation' section of mgcGenes details page
# genbank release 157.0 now contains misc_diff fields for MGC clones
# reloading mRNAs results in the gbMiscDiff table being created.
    ./bin/gbDbLoadStep -reload -srcDb=genbank -type=mrna danRer4
#########################################################################
# BLASTZ/CHAIN/NET oryLat1 (DONE - 2007-01-19,20 - Hiram)
    ssh kkstore04
    mkdir /cluster/data/danRer4/bed/blastz.oryLat1.2007-01-19
    cd /cluster/data/danRer4/bed/blastz.oryLat1.2007-01-19
    cat << '_EOF_' > DEF
# Zebrafish vs. Medaka
# Try "human-fugu" (more distant, less repeat-killed than mammal) params
# +M=50:
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_M=50
BLASTZ_Q=/cluster/data/blastz/HoxD55.q

# TARGET: Zebrafish danRer4, no randoms or Un in this sequence
SEQ1_DIR=/san/sanvol1/scratch/danRer4/danRer4.noUn.sdTrf.2bit
SEQ1_LEN=/san/sanvol1/scratch/danRer4/danRer4.noUn.sdTrf.sizes
SEQ1_CHUNK=40000000
SEQ1_LAP=10000
SEQ1_LIMIT=30

# QUERY: Medaka oryLat1 (40M chunks covers the largest chroms in one gulp)
# chrUn in Scaffolds for this alignment run
SEQ2_DIR=/san/sanvol1/scratch/oryLat1/oryLat1.sdTrf.2bit
SEQ2_LEN=/san/sanvol1/scratch/oryLat1/chrom.sizes
SEQ2_CTGDIR=/san/sanvol1/scratch/oryLat1/oryLat1UnScaffolds.2bit
SEQ2_CTGLEN=/san/sanvol1/scratch/oryLat1/oryLat1UnScaffolds.sizes
SEQ2_LIFT=/san/sanvol1/scratch/oryLat1/chrUn.lift
SEQ2_CHUNK=40000000
SEQ2_LIMIT=30
SEQ2_LAP=0

BASE=/cluster/data/danRer4/bed/blastz.oryLat1.2007-01-19
TMPDIR=/scratch/tmp
'_EOF_'
    # << this line keeps emacs coloring happy
    time doBlastzChainNet.pl DEF -verbose=2 \
        -chainMinScore=2000 -chainLinearGap=loose \
        -tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
        -bigClusterHub=pk \
        -blastzOutRoot /cluster/bluearc/danRer4OryLat1 > do.log 2>&1 &
    # real    556m6.806s
    cat fb.danRer4.chainOryLat1Link.txt
    # 209746583 bases of 1626093931 (12.899%) in intersection
    cd /cluster/data/danRer4/bed
    ln -s blastz.oryLat1.2007-01-19 blastz.oryLat1

    ## swap to oryLat1 - also in oryLat1.txt
    mkdir /cluster/data/oryLat1/bed/blastz.swap.danRer4
    cd /cluster/data/oryLat1/bed/blastz.swap.danRer4
    time doBlastzChainNet.pl -verbose=2 \
        /cluster/data/danRer4/bed/blastz.oryLat1.2007-01-19/DEF \
        -chainMinScore=2000 -chainLinearGap=loose \
        -tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
        -swap -bigClusterHub=pk > swap.log 2>&1 &
    cat fb.oryLat1.chainDanRer4Link.txt
    # 156014546 bases of 700386597 (22.275%) in intersection
    cd /cluster/data/oryLat1/bed
    ln -s blastz.swap.danRer4 blastz.danRer4
#########################################################################
# BLASTZ/CHAIN/NET fr2 (DONE - 2007-01-29 - Hiram)
    ssh kkstore04
    mkdir /cluster/data/danRer4/bed/blastz.fr2.2007-01-29
    cd /cluster/data/danRer4/bed/blastz.fr2.2007-01-29
    cat << '_EOF_' > DEF
# Zebrafish vs. Fugu
# Try "human-fugu" (more distant, less repeat-killed than mammal) params
# +M=50:
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_M=50
BLASTZ_Q=/cluster/data/blastz/HoxD55.q

# TARGET: Zebrafish danRer4, no randoms or Un in this sequence
SEQ1_DIR=/san/sanvol1/scratch/danRer4/danRer4.noUn.sdTrf.2bit
SEQ1_LEN=/san/sanvol1/scratch/danRer4/danRer4.noUn.sdTrf.sizes
SEQ1_CHUNK=40000000
SEQ1_LAP=10000
SEQ1_LIMIT=30

# QUERY: Fugu fr2
# Align to the scaffolds, results lifted up to chrUn.sdTrf coordinates
SEQ2_DIR=/san/sanvol1/scratch/fr2/fr2.2bit
SEQ2_LEN=/san/sanvol1/scratch/fr2/chrom.sizes
SEQ2_CTGDIR=/san/sanvol1/scratch/fr2/fr2.scaffolds.2bit
SEQ2_CTGLEN=/san/sanvol1/scratch/fr2/fr2.scaffolds.sizes
SEQ2_LIFT=/san/sanvol1/scratch/fr2/liftAll.lft
SEQ2_CHUNK=20000000
SEQ2_LIMIT=30
SEQ2_LAP=0

BASE=/cluster/data/danRer4/bed/blastz.fr2.2007-01-29
TMPDIR=/scratch/tmp
'_EOF_'
    # << happy emacs
    time doBlastzChainNet.pl DEF -verbose=2 \
        -chainMinScore=2000 -chainLinearGap=loose \
        -tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
        -bigClusterHub=pk \
        -blastzOutRoot /cluster/bluearc/danRer4Fr2 > do.log 2>&1 &
    ## recover from pk kluster problems and finish blastz job
    time doBlastzChainNet.pl DEF -verbose=2 \
        -chainMinScore=2000 -chainLinearGap=loose \
        -tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
        -continue=cat -bigClusterHub=pk \
        -blastzOutRoot /cluster/bluearc/danRer4Fr2 > cat.log 2>&1 &
    ## recover from kki kluster problems and finish chain job
    time doBlastzChainNet.pl DEF -verbose=2 \
        -chainMinScore=2000 -chainLinearGap=loose \
        -tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
        -continue=chainMerge -bigClusterHub=pk \
        -blastzOutRoot /cluster/bluearc/danRer4Fr2 > chainMerge.log 2>&1 &
    # real    554m13.214s

    ## swap
    mkdir /cluster/data/fr2/bed/blastz.danRer4.swap
    cd /cluster/data/fr2/bed/blastz.danRer4.swap
    time doBlastzChainNet.pl -verbose=2 \
        /cluster/data/danRer4/bed/blastz.fr2.2007-01-29/DEF \
        -chainMinScore=2000 -chainLinearGap=loose \
        -tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
        -swap -bigClusterHub=pk > swap.log 2>&1 &
    # running 2007-01-30 - 16:35
    time doBlastzChainNet.pl -verbose=2 \
        /cluster/data/danRer4/bed/blastz.fr2.2007-01-29/DEF \
        -chainMinScore=2000 -chainLinearGap=loose \
        -tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
        -continue=net -swap -bigClusterHub=pk > net_swap.log 2>&1 &

    ssh hgwdev
    cd /cluster/data/danRer4/bed/blastz.fr2.2007-01-29
    time nice -n +19 featureBits danRer4 chainFr2Link \
        > fb.danRer4.chainFr2Link.txt 2>&1
    # 138918185 bases of 1626093931 (8.543%) in intersection
    time nice -n +19 featureBits fr2 chainDanRer4Link \
        > fb.fr2.chainDanRer4Link.txt 2>&1
    # 80963231 bases of 393312790 (20.585%) in intersection
    # ASZ (3-22-2007) this process failed to create four tables, so I created
    # them and left them empty (as discussed with Hiram).
    CREATE TABLE `danRer4`.`chrUn_random_chainFr2` (
        `bin` smallint(5) unsigned NOT NULL default '0',
        `score` double NOT NULL default '0',
        `tName` varchar(255) NOT NULL default '',
        `tSize` int(10) unsigned NOT NULL default '0',
        `tStart` int(10) unsigned NOT NULL default '0',
        `tEnd` int(10) unsigned NOT NULL default '0',
        `qName` varchar(255) NOT NULL default '',
        `qSize` int(10) unsigned NOT NULL default '0',
        `qStrand` char(1) NOT NULL default '',
        `qStart` int(10) unsigned NOT NULL default '0',
        `qEnd` int(10) unsigned NOT NULL default '0',
        `id` int(10) unsigned NOT NULL default '0',
        KEY `bin` (`bin`),
        KEY `id` (`id`)
    ) TYPE=MYISAM;

    CREATE TABLE `danRer4`.`chrUn_random_chainFr2Link` (
        `bin` smallint(5) unsigned NOT NULL default '0',
        `tName` varchar(255) NOT NULL default '',
        `tStart` int(10) unsigned NOT NULL default '0',
        `tEnd` int(10) unsigned NOT NULL default '0',
        `qStart` int(10) unsigned NOT NULL default '0',
        `chainId` int(10) unsigned NOT NULL default '0',
        KEY `bin` (`bin`),
        KEY `chainId` (`chainId`)
    ) TYPE=MYISAM;

    CREATE TABLE `danRer4`.`chrNA_random_chainFr2` (
        `bin` smallint(5) unsigned NOT NULL default '0',
        `score` double NOT NULL default '0',
        `tName` varchar(255) NOT NULL default '',
        `tSize` int(10) unsigned NOT NULL default '0',
        `tStart` int(10) unsigned NOT NULL default '0',
        `tEnd` int(10) unsigned NOT NULL default '0',
        `qName` varchar(255) NOT NULL default '',
        `qSize` int(10) unsigned NOT NULL default '0',
        `qStrand` char(1) NOT NULL default '',
        `qStart` int(10) unsigned NOT NULL default '0',
        `qEnd` int(10) unsigned NOT NULL default '0',
        `id` int(10) unsigned NOT NULL default '0',
        KEY `bin` (`bin`),
        KEY `id` (`id`)
    ) TYPE=MYISAM;

    CREATE TABLE `danRer4`.`chrNA_random_chainFr2Link` (
        `bin` smallint(5) unsigned NOT NULL default '0',
        `tName` varchar(255) NOT NULL default '',
        `tStart` int(10) unsigned NOT NULL default '0',
        `tEnd` int(10) unsigned NOT NULL default '0',
        `qStart` int(10) unsigned NOT NULL default '0',
        `chainId` int(10) unsigned NOT NULL default '0',
        KEY `bin` (`bin`),
        KEY `chainId` (`chainId`)
    ) TYPE=MYISAM;

###########################################################################
# CREATE LIFTOVER FROM danRer4 TO danRer5
# (DONE, 2007-09-21 - 2007-09-22, hartera)
    ssh kkstore04
    mkdir /cluster/data/danRer4/bed/blat.danRer5
    cd /cluster/data/danRer4/bed/blat.danRer5
    time nice doSameSpeciesLiftOver.pl danRer4 danRer5 \
        -bigClusterHub pk \
        -ooc /san/sanvol1/scratch/danRer4/danRer4_11.ooc \
        -buildDir=/cluster/data/danRer4/bed/blat.danRer5 >& do.log &
    # 0.337u 0.208s 4:58:26.59 0.0%  0+0k 0+0io 28pf+0w
    # Remove symbolic link to liftOver chains and copy over the file
    rm ../liftOver/danRer4ToDanRer5.over.chain.gz
    cp -p danRer4ToDanRer5.over.chain.gz ../liftOver
    # a link in /usr/local/apache/htdocs/goldenPath/danRer5/liftOver has
    # already been made to this file and md5sum.txt needs to be updated
    ssh hgwdev
    cd /usr/local/apache/htdocs/goldenPath/danRer4/liftOver
    md5sum *.gz > md5sum.txt
    md5sum *.gz > ../../goldenPath/liftOver/md5sum.txt
    ssh hgwdev
    cd /usr/local/apache/htdocs/goldenPath/danRer4/liftOver
    ln -s /cluster/data/danRer5/bed/liftOver/danRer4ToDanRer5.over.chain.gz .
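    # A quick spot check of the new chain file (a sketch, not part of the
    # original log; rhMapSample.bed is a hypothetical small BED of danRer4
    # positions, e.g. a few rhMap intervals). liftOver should map most of
    # them and report the rest in the unmapped file:
    liftOver rhMapSample.bed \
        /cluster/data/danRer4/bed/liftOver/danRer4ToDanRer5.over.chain.gz \
        rhMapSample.danRer5.bed rhMapSample.unmapped.bed
    wc -l rhMapSample.danRer5.bed rhMapSample.unmapped.bed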
#############################################################################
# CONTRAST GENES (2007-10-02 markd)
    # received predictions from Sam Gross
    cd /cluster/data/danRer4/bed/contrastGene/
    wget http://www.stanford.edu/~ssgross/contrast.danRer4.bed
    # this is a custom track, not a pure BED
    tail +2 contrast.danRer4.bed | hgLoadBed -tab danRer4 contrastGene stdin
    # verify
    # load track db (ra and contrastGene.html are global)
    # request push of contrastGene
###########################################################################
################################################
# AUTOMATE UPSTREAM FILE CREATION (2008-10-15 markd)
    update genbank.conf:
        danRer4.upstreamGeneTbl = refGene
        danRer4.upstreamMaf = multiz7way /hive/data/genomes/danRer4/bed/multiz7way/species.lst
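    # Quick checks (a sketch, not part of the original log): confirm the
    # contrastGene load and that the upstream settings above point at
    # existing inputs.
    hgsql -e 'select count(*) from contrastGene;' danRer4
    featureBits danRer4 contrastGene
    hgsql -e 'select count(*) from refGene;' danRer4
    ls -l /hive/data/genomes/danRer4/bed/multiz7way/species.lst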