# for emacs: -*- mode: sh; -*-

# Danio rerio (zebrafish) from Sanger, version Zv4 (released 6/30/04)

# Project website:
#   http://www.sanger.ac.uk/Projects/D_rerio/
# Assembly notes:
#   http://www.sanger.ac.uk/Projects/D_rerio/Zv4_assembly_information.shtml

# NOTE: Error in scaffolds agp file. Notified Sanger and got new scaffolds
# agp and recreated FASTA files from this new one (2004-11-29).
# The previous agp file was missing a scaffold from the end of most
# chromosomes.
# There is also a chrUn set of scaffolds that are in the chunks agp file -
# these just have the identifier Zv4_scaffoldN instead of a chromosome
# number. They are scaffolds that correspond to FPC contigs but their
# position is unknown so they are not mapped to a chromosome.

# DOWNLOAD SEQUENCE (DONE, 2004-10-18, hartera)
# ERRORS IN SCAFFOLDS AGP SO GET NEW AGP FROM SANGER AND DOWNLOAD
# (hartera, 2004-11-29) from Mario Caccamo: mc2@sanger.ac.uk
    ssh kksilo
    mkdir /cluster/store8/danRer2
    ln -s /cluster/store8/danRer2 /cluster/data/danRer2
    cd /cluster/data/danRer2
    wget --timestamp \
      ftp://ftp.ensembl.org/pub/assembly/zebrafish/Zv4release/README
    wget --timestamp \
      ftp://ftp.ensembl.org/pub/assembly/zebrafish/Zv4release/stats
    wget --timestamp \
      ftp://ftp.ensembl.org/pub/assembly/zebrafish/Zv4release/Zv4.chunks.agp
    wget --timestamp \
      ftp://ftp.ensembl.org/pub/assembly/zebrafish/Zv4release/Zv4.scaffolds.agp
    wget --timestamp \
      ftp://ftp.ensembl.org/pub/assembly/zebrafish/Zv4release/Zv4.fasta
    # get new agp file and download to /cluster/data/danRer2
    # (hartera, 2004-11-29)
    # Remove all chrom directories to start processing with new agp file
    ssh kksilo
    cd /cluster/data/danRer2
    foreach c (`cat chrom.lst`)
        rm -r $c
    end

# DOWNLOAD MITOCHONDRION GENOME SEQUENCE (DONE, 2004-10-18, hartera)
# DOWNLOAD SEQUENCE AND CREATE FILES AGAIN (hartera, 2004-11-30)
# ADD chrM.chunks.agp (DONE, 2004-12-06, hartera)
# Error reported by a user: chrM.agp is space delimited rather
# than tab delimited so correct this. NCBI defines the agp format as
# being tab delimited. (DONE, 2005-04-25, hartera)
    ssh kksilo
    mkdir -p /cluster/data/danRer2/M
    cd /cluster/data/danRer2/M
    # go to http://www.ncbi.nih.gov/ and search Nucleotide for
    # "Danio mitochondrion genome". That shows the gi number:
    # 8576324 for the accession, AC024175
    # Use that number in the entrez linking interface to get fasta:
    wget -O chrM.fa \
      'http://www.ncbi.nlm.nih.gov/entrez/query.fcgi?cmd=Text&db=Nucleotide&uid=8576324&dopt=FASTA'
    # Edit chrM.fa: make sure the header line says it is the
    # Danio rerio mitochondrion complete genome, and then replace the
    # header line with just ">chrM".
    perl -pi.bak -e 's/>.+/>chrM/' chrM.fa
    rm *.bak
    # Make a "pseudo-contig" for processing chrM too:
    mkdir ./chrM_1
    sed -e 's/chrM/chrM_1/' ./chrM.fa > ./chrM_1/chrM_1.fa
    mkdir ./lift
    echo "chrM_1/chrM_1.fa.out" > ./lift/oOut.lst
    echo "chrM_1" > ./lift/ordered.lst
    echo "0 M/chrM_1 16596 chrM 16596" > ./lift/ordered.lft
    # create a .agp file for chrM as hgGoldGapGl and other
    # programs require a .agp file so create chrM.agp
cat << '_EOF_' > ./chrM.agp
chrM 1 16596 1 F AC024175.3 1 16596 +
'_EOF_'
    # Create a chrM.chunks.agp (hartera, 2004-12-06)
    cd /cluster/data/danRer2/M/agps
    awk 'BEGIN {OFS="\t"} \
        {print $1, $2, $3, $4, $5, $6, $7, $8, $1, $7, $8}' ../chrM.agp \
        > chrM.chunks.agp
    # chrM.agp is space delimited instead of tab delimited
    # so correct this (hartera, 2005-04-25)
    cd /cluster/data/danRer2/M
    # edit chrM.agp and change field delimiters from spaces to tabs.
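    # One way to make that edit non-interactively (a sketch; the file was
    # fixed by hand in an editor, and the run-of-spaces pattern assumes no
    # field itself contains a space):
    perl -pi.bak -e 's/ +/\t/g' chrM.agp
    rm chrM.agp.bak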
# add this new file to the zipped files of agps and get it pushed to the
# downloads area - see "MAKE DOWNLOADABLE SEQUENCE FILES" section of
# this make doc.

# Create list of chromosomes (DONE, 2004-10-18, hartera)
# Add "M" for mitochondrion chromosome (2004-10-25, hartera)
# Add "Un" for chrUn (2004-11-29, hartera)
    ssh kksilo
    cd /cluster/data/danRer2
    awk '{print $1;}' Zv4.scaffolds.agp | sort -n | uniq > chrom.lst
    # add NA - these are contigs in the chunks agp
    echo "NA" >> chrom.lst
    # add chrM
    echo "M" >> chrom.lst
    # add chrUn
    echo "Un" >> chrom.lst

# SPLIT AGP FILES BY CHROMOSOME (DONE, 2004-10-19, hartera)
# AGP USED TO CREATE FASTA WAS SCAFFOLDS AGP
# RE-DO SPLITTING AGP FILES AFTER GETTING NEW SCAFFOLDS AGP
# (hartera, 2004-11-29)
    ssh kksilo
    cd /cluster/data/danRer2
    # There are 2 .agp files: one for scaffolds (supercontigs on danRer1)
    # and one for chunks (contigs on danRer1) showing how they map on to
    # scaffolds.
    # add "chr" prefix in the agp files
    perl -pi -e 's/^([0-9]+)/chr$1/' ./*.agp
    # for chromosomes:
    foreach c (`cat chrom.lst`)
        mkdir $c
        perl -we "while(<>){if (/^chr$c\t/) {print;}}" \
            ./Zv4.chunks.agp > $c/chr$c.chunks.agp
        perl -we "while(<>){if (/^chr$c\t/) {print;}}" \
            ./Zv4.scaffolds.agp > $c/chr$c.scaffolds.agp
    end
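    # (sketch) sanity check the split: together the per-chromosome
    # scaffolds agps should account for every chr-prefixed line in the
    # genome-wide file, so these two counts should match:
    cat */chr*.scaffolds.agp | wc -l
    grep -c "^chr" Zv4.scaffolds.agp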
# CREATE AGP AND FASTA FOR chrNA (DONE, 2004-10-25, hartera)
# REMAKE AGP WITH NEW SCAFFOLDS AGP FILE AND CREATE chrUn AGP
# (hartera, 2004-11-29)
    ssh kksilo
    cd /cluster/data/danRer2
    # for NA, make agp files
    grep "Zv4_NA" Zv4.chunks.agp > NA.chunks.agp
    # make a scaffolds agp for NA - use first 9 fields of chunks file
    # and remove ".1" from 6th field
    awk 'BEGIN {OFS="\t"} {print $1, $2, $3, $4, $5, $6, $7, $8, $9}' \
        NA.chunks.agp | perl -pe 's/(Zv4_NA[0-9]+)\.1+/$1/' \
        > NA.scaffolds.agp
    # move agps to NA directory created above
    mv NA.scaffolds.agp NA.chunks.agp ./NA
    # from scaffolds agp, get names of scaffolds to extract from the
    # FASTA file for NA
    foreach c (NA)
        awk '{print $1;}' $c/$c.scaffolds.agp > $c/chr$c.scaffolds.lst
        $HOME/bin/i386/faSomeRecords /cluster/data/danRer2/Zv4.fasta \
            $c/chr$c.scaffolds.lst $c/chr$c.fa
    end
    # create agp with 1000 Ns between scaffolds as contig gaps for chrNA
    foreach c (NA)
        $HOME/bin/i386/scaffoldFaToAgp $c/chr$c.fa
        mv $c/chr$c.fa $c/chr$c.scaffolds.fa
        perl -pi -e "s/chrUn/chr$c/" $c/chr$c.*
    end
    # change the type to "W" in the agp for WGS
    perl -pi.bak -e "s/D/W/;" NA/chrNA.agp
    cd NA
    mv chrNA.agp chrNA.scaffolds.agp
    rm *.bak
    # use this chrNA.scaffolds.agp to create chrNA.chunks.agp
cat << '_EOF_' > /cluster/data/danRer2/jkStuff/createChunksAgp.pl
#!/usr/bin/perl -w
use strict;
# input is a list of scaffolds and chunks, and a file of chrN.scaffolds.agp
# with Ns.
my $accs = $ARGV[0];
my $scaf = $ARGV[1];

open (ACCS, $accs) || die "Can not open $accs: $!";
open (SCAFS, $scaf) || die "Can not open $scaf: $!";

my %accsHash;
while (<ACCS>) {
    chomp;
    my @fi = split(/\t/);
    $accsHash{$fi[0]} = $fi[1];
}
close ACCS;
while (my $line = <SCAFS>) {
    chomp $line;
    my @f = split(/\t/, $line);
    my $acc;
    if ($f[4] ne "N") {
        if (exists($accsHash{$f[5]})) {
            $acc = $accsHash{$f[5]};
        } else {
            print "$f[5] can not be found\n";
        }
        print "$f[0]\t$f[1]\t$f[2]\t$f[3]\t$f[4]\t$acc\t$f[6]\t$f[7]\t$f[8]";
        print "\t$f[5]\t$f[6]\t$f[7]";
    } else {
        print $line;
    }
    print "\n";
}
'_EOF_'
    chmod +x /cluster/data/danRer2/jkStuff/createChunksAgp.pl
    awk 'BEGIN {OFS="\t";} { if ($1 ~ /^Zv4_NA/) print $1,$6 }' \
        /cluster/data/danRer2/Zv4.chunks.agp > NA.accs
    perl ../jkStuff/createChunksAgp.pl NA.accs chrNA.scaffolds.agp \
        > chrNA.chunks.agp

# Also create agp for chrUn - these are scaffolds that map to FPC contigs
# but are unplaced on the chromosomes
    # for Un, make agp files
    ssh kksilo
    cd /cluster/data/danRer2
    mkdir Un
    # make a scaffolds agp for Un - use first 9 fields of chunks file
    # and remove ".1" from 6th field
    awk 'BEGIN {OFS="\t"} {if ($1 ~ /Zv4_scaffold/) \
        print $1, $2, $3, $4, $5, $6, $7, $8, $9}' \
        Zv4.chunks.agp | perl -pe 's/(Zv4_NA[0-9]+)\.1+/$1/' \
        > Un/Un.scaffolds.agp
    # from scaffolds agp, get names of scaffolds to extract from the
    # FASTA file for Un
    foreach c (Un)
        awk '{print $1;}' $c/$c.scaffolds.agp > $c/chr$c.scaffolds.lst
        $HOME/bin/i386/faSomeRecords /cluster/data/danRer2/Zv4.fasta \
            $c/chr$c.scaffolds.lst $c/chr$c.fa
    end
    # create agp with 1000 Ns between scaffolds as contig gaps for chrUn
    foreach c (Un)
        $HOME/bin/i386/scaffoldFaToAgp $c/chr$c.fa
        mv $c/chr$c.fa $c/chr$c.scaffolds.fa
    end
    # change the type to "W" in the agp for WGS
    perl -pi.bak -e "s/D/W/;" Un/chrUn.agp
    cd Un
    mv chrUn.agp chrUn.scaffolds.agp
    rm *.bak
    # create chunks agp for chrUn
    # this includes accessions so need to get them from Zv4.chunks.agp
    # modify perl script above to do this (2004-12-06, hartera)
    awk 'BEGIN {OFS="\t";} { if ($1 ~ /^Zv4_/) print $1,$6 }' \
        /cluster/data/danRer2/Zv4.chunks.agp > Un.accs
    perl ../jkStuff/createChunksAgp.pl Un.accs chrUn.scaffolds.agp \
        > chrUn.chunks.agp
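    # Note that createChunksAgp.pl prints its "... can not be found"
    # diagnostics to stdout, i.e. into the same stream as the agp rows,
    # so it is worth checking that none slipped into the output (a sketch;
    # expect 0 for both files):
    grep -c "can not be found" /cluster/data/danRer2/NA/chrNA.chunks.agp \
        /cluster/data/danRer2/Un/chrUn.chunks.agp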
# BUILD CHROM-LEVEL SEQUENCE (DONE, 2004-10-21, hartera)
# Move scaffolds files for NA into scaffolds directory (2004-11-22, hartera)
# RE-BUILD SEQUENCE WITH NEW AGPS FROM CORRECTED SCAFFOLDS AGP
# (2004-11-30, hartera)
    ssh kksilo
    cd /cluster/data/danRer2
    # Sequence is already in upper case so no need to change
    foreach c (`cat chrom.lst`)
        echo "Processing ${c}"
        $HOME/bin/i386/agpToFa -simpleMultiMixed $c/chr$c.scaffolds.agp \
            chr$c $c/chr$c.fa ./Zv4.fasta
        echo "${c} - DONE"
    end
    # some Ns in sequence files are in lower case so change to upper case
    foreach c (`cat chrom.lst`)
        cat $c/chr${c}.fa | tr 'n' 'N' > $c/chr${c}.fa.tr
        if ($c == "Un") then
            perl -pi.bak -e 's/^>chrUN/>chrUn/' $c/chr${c}.fa.tr
        endif
        mv $c/chr${c}.fa.tr $c/chr${c}.fa
    end
    # move scaffolds agp to be chrom agp and clean up (2004-11-30)
    foreach c (`cat chrom.lst`)
        cd $c
        rm *.bak
        cp chr${c}.scaffolds.agp chr${c}.agp
        mkdir agps
        mv chr${c}.*.agp ./agps/
        cd ..
    end
    # move scaffolds files for NA and Un into scaffolds directory
    # (2004-11-22) and again (2004-11-30)
    foreach c (NA Un)
        mkdir -p /cluster/data/danRer2/$c/scaffolds
        cd /cluster/data/danRer2/$c
        mv chr$c.scaffolds.* ./scaffolds
        rm $c.*.agp
        cd ..
    end

# CHECK CHROM AND VIRTUAL CHROM SEQUENCES (DONE, 2004-10-21, hartera)
# CHECKED THESE ARE OK (hartera, 2004-11-30)
# Check that the size of each chromosome .fa file is equal to the
# last coord of the .agp:
    ssh hgwdev
    cd /cluster/data/danRer2
    foreach c (`cat chrom.lst`)
      foreach f ( $c/chr$c.agp )
        set agpLen = `tail -1 $f | awk '{print $3;}'`
        set h = $f:r
        set g = $h:r
        echo "Getting size of $g.fa"
        set faLen = `faSize $g.fa | awk '{print $1;}'`
        if ($agpLen == $faLen) then
            echo "  OK: $f length = $g length = $faLen"
        else
            echo "ERROR: $f length = $agpLen, but $g length = $faLen"
        endif
      end
    end
    # all are OK so the FASTA files are the expected size

# BREAK UP SEQUENCE INTO 5MB CHUNKS AT CONTIGS/GAPS FOR CLUSTER RUNS
# (DONE, 2004-10-25, hartera)
# RE-DONE (2004-11-30, hartera)
    ssh kksilo
    cd /cluster/data/danRer2
    foreach c (`cat chrom.lst`)
      foreach agp ($c/chr$c.agp)
        if (-e $agp) then
          set fa = $c/chr$c.fa
          echo splitting $agp and $fa
          cp -p $agp $agp.bak
          cp -p $fa $fa.bak
          splitFaIntoContigs $agp $fa . -nSize=5000000
        endif
      end
    end

# MAKE JKSTUFF AND BED DIRECTORIES (DONE, 2004-10-25, hartera)
# This used to hold scripts -- better to keep them inline here.
# Now it should just hold lift file(s) and temporary scripts made by
# copy-paste from this file.
    mkdir /cluster/data/danRer2/jkStuff
    # This is where most tracks will be built:
    mkdir /cluster/data/danRer2/bed

# CREATING DATABASE (DONE, 2004-10-25, hartera)
# Create the database.
    # next machine
    ssh hgwdev
    echo 'create database danRer2' | hgsql ''
    # if you need to delete that database:  !!! WILL DELETE EVERYTHING !!!
    echo 'drop database danRer2' | hgsql danRer2
    # Delete and re-create database as above (hartera, 2004-11-30)
    # Use df to make sure there is at least 10 gig free on /var/lib/mysql:
    df -h /var/lib/mysql
    # Before loading data:
    # Filesystem            Size  Used Avail Use% Mounted on
    # /dev/sdc1             1.8T  637G 1023G  39% /var/lib/mysql

# CREATING GRP TABLE FOR TRACK GROUPING (DONE, 2004-10-25, hartera)
# RECREATE GRP TABLE (hartera, 2004-11-30)
    # next machine
    ssh hgwdev
    # the following command copies all the data from the table
    # grp in the database danRer1 to the new database danRer2
    echo "create table grp (PRIMARY KEY(NAME)) select * from danRer1.grp" \
      | hgsql danRer2
    # if you need to delete that table:  !!! WILL DELETE ALL grp data !!!
    echo 'drop table grp;' | hgsql danRer2

# REPEAT MASKING - Run RepeatMasker on chroms (DONE, 2004-10-26, hartera)
# There is a new repeat library at WUSTL that has new repeats for
# Danio rerio: Dr.repeats.020501
# Add a README about these repeats
    ssh kksilo
    cd /cluster/data/danRer2
    wget --timestamp \
      http://www.genetics.wustl.edu/fish_lab/repeats/Dr.repeats.020501
    mv Dr.repeats.020501 /cluster/bluearc/RepeatMasker/Libraries/danioRW.lib
    # add danioRW.lib to danio.lib
    cd /cluster/bluearc/RepeatMasker/Libraries
    cp danio.lib danioRMandRW.lib
    cat danioRW.lib >> danioRMandRW.lib
    # add type as "Unknown" to this file as these repeats are not
    # classified
    perl -pi.bak -e 's/^(>Dr[0-9]+)/$1#Unknown/' danioRMandRW.lib
    # these new repeats are not classified by type so add "DrRW" as the
    # type later
    # Add a README about these repeats from WUSTL
    wget --timestamp \
      http://www.genetics.wustl.edu/fish_lab/repeats/Readme.txt

    #- Split contigs into 500kb chunks, at gaps if possible:
    foreach c (`cat chrom.lst`)
      foreach d ($c/chr${c}*_?{,?})
        cd $d
        echo "splitting $d"
        set contig = $d:t
        ~/bin/i386/faSplit gap $contig.fa 500000 ${contig}_ \
            -lift=$contig.lft -minGapSize=100
        cd ../..
      end
    end
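    # (sketch) spot-check one contig: faSplit -lift writes one lift row
    # per 500kb piece, so these two counts should agree:
    wc -l 1/chr1_1/chr1_1.lft
    ls -1 1/chr1_1/chr1_1_*.fa | wc -l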
    #- Make the run directory and job list:
    cd /cluster/data/danRer2
    # use RepeatMasker from January 2004
cat << '_EOF_' > jkStuff/RMZebrafish
#!/bin/csh -fe

cd $1
pushd .
/bin/mkdir -p /tmp/danRer2/$2
/bin/cp $2 /tmp/danRer2/$2/
cd /tmp/danRer2/$2
/cluster/bluearc/RepeatMasker/RepeatMasker -ali -s -lib danioRMandRW.lib $2
popd
/bin/cp /tmp/danRer2/$2/$2.out ./
if (-e /tmp/danRer2/$2/$2.align) /bin/cp /tmp/danRer2/$2/$2.align ./
if (-e /tmp/danRer2/$2/$2.tbl) /bin/cp /tmp/danRer2/$2/$2.tbl ./
if (-e /tmp/danRer2/$2/$2.cat) /bin/cp /tmp/danRer2/$2/$2.cat ./
/bin/rm -fr /tmp/danRer2/$2/*
/bin/rmdir --ignore-fail-on-non-empty /tmp/danRer2/$2
/bin/rmdir --ignore-fail-on-non-empty /tmp/danRer2
'_EOF_'
    chmod +x jkStuff/RMZebrafish
    mkdir RMRun
    cp /dev/null RMRun/RMJobs
    foreach c (`cat chrom.lst`)
      foreach d ($c/chr${c}_?{,?})
        set ctg = $d:t
        foreach f ( $d/${ctg}_?{,?}.fa )
          set f = $f:t
          echo /cluster/data/danRer2/jkStuff/RMZebrafish \
               /cluster/data/danRer2/$d $f \
             '{'check out line+ /cluster/data/danRer2/$d/$f.out'}' \
            >> RMRun/RMJobs
        end
      end
    end

    #- Do the run
    ssh kk
    cd /cluster/data/danRer2/RMRun
    para create RMJobs
    para try, para check, para check, para push, para check,...
    # para time
    # CPU time in finished jobs:  10326858s  172114.29m  2868.57h  119.52d  0.327 y
    # IO & Wait Time:                33702s     561.71m     9.36h    0.39d  0.001 y
    # Average job time:               3081s      51.35m     0.86h    0.04d
    # Longest job:                    4065s      67.75m     1.13h    0.05d
    # Submission to last job:        39673s     661.22m    11.02h    0.46d

    #- Lift up the 500KB chunk .out's to 5MB ("pseudo-contig") level
    ssh kksilo
    cd /cluster/data/danRer2
    foreach d (*/chr*_?{,?})
      set contig = $d:t
      echo $contig
      liftUp $d/$contig.fa.out $d/$contig.lft warn $d/${contig}_*.fa.out \
        > /dev/null
    end

    #- Lift pseudo-contigs to chromosome level
    foreach c (`cat chrom.lst`)
      echo lifting $c
      cd $c
      if (-e lift/ordered.lft && ! -z lift/ordered.lft) then
        liftUp chr$c.fa.out lift/ordered.lft warn `cat lift/oOut.lst` \
          > /dev/null
      endif
      cd ..
    end

    #- Load the .out files into the database with:
    ssh hgwdev
    cd /cluster/data/danRer2
    hgLoadOut danRer2 */chr*.fa.out
# When trying to mask the sequence (see below) there are 60 instances of
# negative co-ordinates:
#     2 Dr000074
#    48 Dr000158
#     6 Dr000375
#     1 Dr000511
#     1 Dr000759
#     1 Dr000975
#     5 Dr001181
# Sent sample output from chr1_3_21 to Arian Smit and he suggested that
# Dr000158 is a Satellite. When this classification is added to the FASTA
# header, >Dr000158#Satellite, the negative co-ordinates disappear.
# If the classification is changed to "Unknown" then there are negative
# co-ordinates.
# Took a look at these 7 repeats above and found overlapping matches.
# Yi Zhou at Boston Children's Hospital looked at the repeats and split
# them up into smaller chunks: danioRMandRWsplit.lib
# Running RepeatMasker with this library removed a lot of negative
# co-ordinates but some new ones appeared. There are 7 instances of
# negative co-ordinates: TDR5, (TAGA)n, Dr000355, Dr001182.
# Dr001182 has two repeats, the second being an imperfect replicate of the
# first, so this was split into two repeats and RepeatMasker run again
# (11/18/04). This time there were TDR5, (TAGA)n, Dr000355, Dr000509 with
# negative repeats, but only 5 instances.
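    # (sketch) the per-repeat tallies above can be reproduced by pulling
    # the repeat name (column 10) out of .out records carrying a negative
    # repeat co-ordinate in columns 12-14:
    awk 'FNR > 3 && ($12 ~ /^-/ || $13 ~ /^-/ || $14 ~ /^-/) {print $10}' \
        */chr*.fa.out | sort | uniq -c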
# 11/13/04
# try RepeatMasker with the 7 repeats with negative co-ords split into
# smaller repeats:
    # get a list of repeat names without these, then get those sequences
    ssh kksilo
    cd /cluster/data/danRer2/bed/
    $HOME/bin/i386/faSomeRecords \
        /cluster/bluearc/RepeatMasker/Libraries/danioRMandRW.lib \
        rmandrw.txt danioRMandRWsplit.lib
    # splitreps is the list of split repeats
    cat splitreps >> danioRMandRWsplit.lib
    mv danioRMandRWsplit.lib /cluster/bluearc/RepeatMasker/Libraries/
    # then run RepeatMasker on problem repeat areas e.g. chr1_3_21.fa
    mkdir /cluster/data/danRer2/RMRun/testSplitReps
    cd /cluster/data/danRer2/RMRun/testSplitReps
    nice /cluster/bluearc/RepeatMasker/RepeatMasker -ali -s \
        -lib danioRMandRWsplit.lib chr1_3_21.fa
    # works well so change all the classes to Unknown and re-run with
    # this library
    perl -pi.bak -e 's/^>(Dr[0-9a-z\-]+)#DrRW/>$1#Unknown/' \
        danioRMandRWsplit.lib
    # then run RepeatMasker as above with the new split library
    # (DONE, 2004-11-16)
    # edit RMZebrafish to use this library (danioRMandRWsplit.lib)
    # and run from the RMRun2 directory
    # need to remove the chrN.fa.masked files
    # then convert chrN.fa back to upper case
    ssh kksilo
    cd /cluster/data/danRer2
    foreach c (`cat chrom.lst`)
      cd $c
      echo "Processing $c ..."
      rm chr${c}.fa.masked
      cat chr${c}.fa | tr '[a-z]' '[A-Z]' > chr${c}.tmp
      perl -pi.bak -e 's/^>CHR([0-9A-Z]+)/>chr$1/' chr${c}.tmp
      mv chr${c}.tmp chr${c}.fa
      rm chr${c}.tmp.bak
      cd ..
    end
    # still get some negative co-ordinates when trying to mask.
    # Dr001182 is a repeat sequence which contains two instances of a
    # repeat, the second not being a perfect match to the first;
    # split these into two repeats and then re-run RepeatMasker.
    #
    # e-mailed Kerstin Jekosch at Sanger as it looks like they have used
    # this WUSTL repeat library to mask the Zv4 assembly for Ensembl.
    # Kerstin recommended downloading this new library from RepBase as it
    # has been properly formatted for RepBase version 10. In RepBase, it
    # is the Zebrafish Unclassified library and it consists of 958
    # unfinished consensus sequences of unclassified repeats extracted
    # from the library at
    # http://www.genetics.wustl.edu/fish_lab/repeats/Dr.repeats.020501
    # which has a total of 1225 repeats. 267 repeats present in the
    # library have been replaced by a set of consensus sequences of
    # classified transposable elements that are reported in Repbase Update
    # and Repbase Reports.

# DOWNLOAD NEW VERSION OF THE WUSTL ZEBRAFISH REPEATS FROM REPBASE
# (DONE, 2004-11-18, hartera)
    # Go to http://www.girinst.org/server/RepBase/
    # and select zebunc (Zebrafish unclassified library),
    # click on repeatmaskerlibraries and then download
    # repeatmaskerlibrariesJuly2004.tar.gz
    gunzip repeatmaskerlibrariesJuly2004.tar.gz
    tar xvf repeatmaskerlibrariesJuly2004.tar
    perl -pi.bak -e 's/^>(Dr[0-9]+)/>$1#Unknown/' zebunc.ref
    cat danio.lib zebunc.ref >> danioandzebunc.lib

# REDO REPEATMASKER RUN AND LOAD NEW RESULTS (DONE, 2004-11-22, hartera)
# REDONE (hartera, 2004-12-01)
    # sequences already split into 500kb chunks - see above
    # use RepeatMasker open-3.0 version, Sep 2004; this was in
    # /cluster/bluearc/RepeatMasker.040909 and is now the default
    # new zebrafish library was downloaded from RepBase - zebunc.ref
    # copy to /cluster/bluearc/RepeatMasker.040909/Libraries
    # add "Unknown" as classification for these repeats
    perl -pi.bak -e 's/>(Dr[0-9]+)/>$1#Unknown \@danio [S:]/' zebunc.ref
    # add to RepeatMasker library
    mv RepeatMasker.lib RepeatMasker.lib.old
    cat RepeatMasker.lib.old zebunc.ref >> RepeatMasker.lib
cat << '_EOF_' > jkStuff/RMZebrafish
#!/bin/csh -fe

cd $1
pushd .
/bin/mkdir -p /tmp/danRer2/$2
/bin/cp $2 /tmp/danRer2/$2/
cd /tmp/danRer2/$2
/cluster/bluearc/RepeatMasker/RepeatMasker -ali -s -species danio $2
popd
/bin/cp /tmp/danRer2/$2/$2.out ./
if (-e /tmp/danRer2/$2/$2.align) /bin/cp /tmp/danRer2/$2/$2.align ./
if (-e /tmp/danRer2/$2/$2.tbl) /bin/cp /tmp/danRer2/$2/$2.tbl ./
if (-e /tmp/danRer2/$2/$2.cat) /bin/cp /tmp/danRer2/$2/$2.cat ./
/bin/rm -fr /tmp/danRer2/$2/*
/bin/rmdir --ignore-fail-on-non-empty /tmp/danRer2/$2
/bin/rmdir --ignore-fail-on-non-empty /tmp/danRer2
'_EOF_'
    chmod +x jkStuff/RMZebrafish
    rm -r RMRun
    mkdir -p RMRun
    cp /dev/null RMRun/RMJobs
    foreach c (`cat chrom.lst`)
      foreach d ($c/chr${c}_?{,?})
        set ctg = $d:t
        foreach f ( $d/${ctg}_?{,?}.fa )
          set f = $f:t
          echo /cluster/data/danRer2/jkStuff/RMZebrafish \
               /cluster/data/danRer2/$d $f \
             '{'check out line+ /cluster/data/danRer2/$d/$f.out'}' \
            >> RMRun/RMJobs
        end
      end
    end

    #- Do the run
    ssh kk
    cd /cluster/data/danRer2/RMRun
    para create RMJobs
    para try, para check, para check, para push, para check,...
    # para time
    # Completed: 3899 of 3899 jobs
    # CPU time in finished jobs:  11636116s  193935.26m  3232.25h  134.68d  0.369 y
    # IO & Wait Time:                39078s     651.31m    10.86h    0.45d  0.001 y
    # Average job time:               2994s      49.91m     0.83h    0.03d
    # Longest job:                    4064s      67.73m     1.13h    0.05d
    # Submission to last job:        19022s     317.03m     5.28h    0.22d

    #- Lift up the 500KB chunk .out's to 5MB ("pseudo-contig") level
    ssh kksilo
    cd /cluster/data/danRer2
    foreach d (*/chr*_?{,?})
      set contig = $d:t
      echo $contig
      liftUp $d/$contig.fa.out $d/$contig.lft warn $d/${contig}_*.fa.out \
        > /dev/null
    end

    #- Lift pseudo-contigs to chromosome level
    foreach c (`cat chrom.lst`)
      echo lifting $c
      cd $c
      if (-e lift/ordered.lft && ! -z lift/ordered.lft) then
        liftUp chr$c.fa.out lift/ordered.lft warn `cat lift/oOut.lst` \
          > /dev/null
      endif
      cd ..
    end
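    # (sketch) a quick completeness check for the run above: every 500kb
    # chunk should have produced a .out before the lift, so these two
    # counts should match:
    find . -name 'chr*_*_*.fa' | wc -l
    find . -name 'chr*_*_*.fa.out' | wc -l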
    #- Load the .out files into the database with:
    ssh hgwdev
    cd /cluster/data/danRer2
    hgLoadOut danRer2 */chr*.fa.out
    # Note: 23 records dropped due to repStart > repEnd (2004-12-01)
    # Processing 1/chr1.fa.out
    # bad rep range in: 1354 157 0 0 chr1 7313059 7313288 -54894735 - (0) (917) (917) 689 5 3
    # bad rep range in: 794 247 63 0 chr1 22624284 22624474 -39583549 - (0) (860) (860) 659 0 1 *
    # bad rep range in: 841 234 44 0 chr1 27730487 27730692 -34477331 - (0) (555) (555) 342 5 1
    # Processing 11/chr11.fa.out
    # bad rep range in: 2007 124 74 108 chr11 6853360 6853376 -30760002 + HATN10_DR DNA hAT 167 166 -289 2
    # Processing 13/chr13.fa.out
    # bad rep range in: 939 65 0 158 chr13 11182894 11182923 -26476056 + DIRS1_DR LTR DIRS1 6133 6132 0 1
    # Processing 14/chr14.fa.out
    # bad rep range in: 350 125 29 137 chr14 39592288 39592300 -20407524 + HAT1_DR DNA hAT 612 611 -2681 8
    # Processing 16/chr16.fa.out
    # bad rep range in: 311 225 0 136 chr16 38708823 38708835 -4054346 + Dr000294 Unknown Unknown 403 400 -549 6
    # Processing 18/chr18.fa.out
    # bad rep range in: 9780 128 42 58 chr18 12479604 12480762 -35227702 + Dr000158 Unknown Unknown 45 -2229 -3930 1
    # Processing 19/chr19.fa.out
    # bad rep range in: 249 169 0 167 chr19 25584255 25584266 -26439321 + Dr000331 Unknown Unknown 314 311 -844 3
    # Processing 2/chr2.fa.out
    # bad rep range in: 596 206 44 0 chr2 24978519 24978655 -27097055 - (1) (317) (317) 176 5 4
    # bad rep range in: 326 56 19 0 chr2 40153004 40153058 -11922652 - (129) (79) (79) 25 5 2
    # bad rep range in: 1454 56 0 0 chr2 50993722 50993901 -1081809 - (0) (1155) (1155) 977 5 8
    # Processing 3/chr3.fa.out
    # bad rep range in: 605 76 0 11 chr3 42820214 42820307 -1974931 - (6) (5062) (5062) 4971 5 3
    # Processing 4/chr4.fa.out
    # bad rep range in: 1072 143 21 100 chr4 920087 920366 -32359418 - (1) (540) (540) 284 5 1
    # bad rep range in: 330 194 109 29 chr4 1398685 1398823 -31880961 - (204) (375) (375) 227 5 14
    # bad rep range in: 978 212 58 90 chr4 24351201 24351580 -8928204 - (594) (553) (553) 187 5 1
    # Processing 5/chr5.fa.out
    # bad rep range in: 4380 113 49 44 chr5 4937637 4937659 -62846677 + TDR23 DNA DNA 889 888 -259 1
    # Processing 6/chr6.fa.out
    # bad rep range in: 649 14 0 0 chr6 7782737 7782809 -25429807 - (956) (419) (419) 348 5 1
    # Processing 7/chr7.fa.out
    # bad rep range in: 272 158 0 50 chr7 19882140 19882200 -42635991 - (800) (419) (419) 363 5 7
    # Processing NA/chrNA.fa.out
    # bad rep range in: 1068 181 113 122 chrNA 75025954 75025966 -243271514 + TDR18 DNA DNA 188 187 -385 5
    # bad rep range in: 493 109 3 153 chrNA 202800523 202800585 -115496895 + Dr000876 Unknown Unknown 1 -32 -157 1
    # bad rep range in: 444 271 4 164 chrNA 249521533 249521548 -68775932 + CR1-1_DR LINE L2 4833 4832 -490 9
    # Processing Un/chrUn.fa.out
    # bad rep range in: 237 149 0 152 chrUn 96855194 96855206 -76596190 + Dr000331 Unknown Unknown 312 311 -844 1

    # To display the new repeats which are classed as "Unknown", add this
    # class to $HOME/kent/src/hg/hgTracks/rmskTrack.c
    # in the repeatClassNames and repeatClasses arrays.
    # check coverage of repeats masked:
    # featureBits -chrom=chr1 danRer1 rmsk
    # 11589712 bases of 40488791 (28.624%) in intersection
    # featureBits -chrom=chr1 danRer2 rmsk
    # 26879295 bases of 61678023 (43.580%) in intersection

# MAKE LIFTALL.LFT (DONE, 2004-10-26, hartera)
# RE-DONE (hartera, 2004-12-01)
    ssh kksilo
    cd /cluster/data/danRer2
    cat */lift/ordered.lft > jkStuff/liftAll.lft
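    # (sketch) liftAll.lft should name each of the 28 chromosomes
    # (chr1-25, chrM, chrNA, chrUn) in its destination column:
    awk '{print $4;}' jkStuff/liftAll.lft | sort -u | wc -l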
# SIMPLE REPEAT [TRF] TRACK (DONE, 2004-10-26, hartera)
# RE-DONE (hartera, 2004-12-02)
    # TRF runs pretty quickly now... it takes a few hours total runtime,
    # so instead of binrsyncing and para-running, just do this on the
    # local fileserver
    ssh kksilo
    rm -r /cluster/data/danRer2/bed/simpleRepeat
    mkdir -p /cluster/data/danRer2/bed/simpleRepeat
    cd /cluster/data/danRer2/bed/simpleRepeat
    mkdir trf
    cp /dev/null jobs.csh
    foreach d (/cluster/data/danRer2/*/chr*_?{,?})
      set ctg = $d:t
      foreach f ($d/${ctg}.fa)
        set fout = $f:t:r.bed
        echo $fout
        echo "/cluster/bin/i386/trfBig -trf=/cluster/bin/i386/trf $f /dev/null -bedAt=trf/$fout -tempDir=/tmp" \
          >> jobs.csh
      end
    end
    chmod a+x jobs.csh
    csh -ef jobs.csh >&! jobs.log &
    # check on this with
    tail -f jobs.log
    wc -l jobs.csh
    ls -1 trf | wc -l
    endsInLf trf/*
    # kksilo overloaded, so take chrNA_35 onwards as jobs2.csh and run
    # those on kolossus
    liftUp simpleRepeat.bed /cluster/data/danRer2/jkStuff/liftAll.lft warn \
      trf/*.bed
    # Load into database
    ssh hgwdev
    cd /cluster/data/danRer2/bed/simpleRepeat
    hgLoadBed danRer2 simpleRepeat simpleRepeat.bed \
      -sqlTable=$HOME/src/hg/lib/simpleRepeat.sql

# PROCESS SIMPLE REPEATS INTO MASK (DONE, 2004-10-26, hartera)
# RE-DONE (2004-12-02, hartera)
    # After the simpleRepeat track has been built, make a filtered version
    # of the trf output: keep trf's with period <= 12:
    ssh kksilo
    cd /cluster/data/danRer2/bed/simpleRepeat
    mkdir -p trfMask
    foreach f (trf/chr*.bed)
      awk '{if ($5 <= 12) print;}' $f > trfMask/$f:t
    end
    # Lift up filtered trf output to chrom coords as well:
    cd /cluster/data/danRer2
    mkdir bed/simpleRepeat/trfMaskChrom
    foreach c (`cat chrom.lst`)
      if (-e $c/lift/ordered.lst) then
        perl -wpe 's@(\S+)@bed/simpleRepeat/trfMask/$1.bed@' \
          $c/lift/ordered.lst > $c/lift/oTrf.lst
        liftUp bed/simpleRepeat/trfMaskChrom/chr$c.bed \
          jkStuff/liftAll.lft warn `cat $c/lift/oTrf.lst`
      endif
      if (-e $c/lift/random.lst) then
        perl -wpe 's@(\S+)@bed/simpleRepeat/trfMask/$1.bed@' \
          $c/lift/random.lst > $c/lift/rTrf.lst
        liftUp bed/simpleRepeat/trfMaskChrom/chr${c}_random.bed \
          jkStuff/liftAll.lft warn `cat $c/lift/rTrf.lst`
      endif
    end
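    # (sketch) see what fraction of the trf annotations the period <= 12
    # filter keeps:
    cat bed/simpleRepeat/trf/chr*.bed | wc -l
    cat bed/simpleRepeat/trfMask/chr*.bed | wc -l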
# MASK SEQUENCE WITH REPEATMASKER AND SIMPLE REPEAT/TRF
# (DONE, 2004-11-22, hartera)
# RE-DONE (2004-12-02, hartera)
# RE-MAKE MASKED CHR1 SEQUENCE AS IT WAS INCOMPLETELY MASKED
# (2005-06-06, hartera)
# Although there was an error in the RepeatMasker output for chr1, this
# loaded ok for the chr1_rmsk table so that is complete. It is just the
# actual sequence that was not masked properly.
    ssh kksilo
    cd /cluster/data/danRer2
    # Soft-mask (lower-case) the contig and chr .fa's,
    # then make hard-masked versions from the soft-masked.
    set trfCtg=bed/simpleRepeat/trfMask
    set trfChr=bed/simpleRepeat/trfMaskChrom
    # errors in the RepeatMasker output with negative co-ordinates
    # (60 instances) - mainly for specific sequences in the new zebrafish
    # library, so took a look at these together with Yi Zhou from Boston
    # Children's Hospital, who split these repeats into smaller sequences.
    # These were used to replace the original repeats in the repeat
    # library and RepeatMasker was re-run. Finally, it was found that the
    # newer RepeatMasker open-3.0.5 version reduced the negative
    # co-ordinates to 2 instances, and a cleaned-up version of the new
    # zebrafish library from RepBase was also used - see above.
    foreach f (*/chr*.fa)
      echo "repeat- and trf-masking $f"
      maskOutFa -soft $f $f.out $f
      set chr = $f:t:r
      maskOutFa -softAdd $f $trfChr/$chr.bed $f
      echo "hard-masking $f"
      maskOutFa $f hard $f.masked
    end
    # with the new version of RepeatMasker, there are 2 instances of
    # negative co-ordinates which can just be ignored:
    # WARNING: negative rEnd: -2229 chr18:12479605-12480762 Dr000158
    # WARNING: negative rEnd: -32 chrNA:202800524-202800585 Dr000876
    # with the new agp and chrom sequence, there is also an instance of a
    # "*" instead of a co-ordinate (2004-12-02):
    # repeat- and trf-masking 1/chr1.fa
    # invalid signed number: "*"

    # mask contigs also
    foreach c (`cat chrom.lst`)
      echo "repeat- and trf-masking contigs of chr$c"
      foreach d ($c/chr*_?{,?})
        set ctg=$d:t
        set f=$d/$ctg.fa
        maskOutFa -soft $f $f.out $f
        maskOutFa -softAdd $f $trfCtg/$ctg.bed $f
        maskOutFa $f hard $f.masked
      end
    end
    # same warnings here too:
    # WARNING: negative rEnd: -2229 chr18_3:2110746-2111903 Dr000158
    # WARNING: negative rEnd: -32 chrNA_41:1844381-1844442 Dr000876
    # repeat- and trf-masking contigs of chr1
    # invalid signed number: "*" (2004-12-02), and the co-ordinates for
    # chr18_3 are now:
    # WARNING: negative rEnd: -2229 chr18_3:1862490-1863647 Dr000158

    # Build nib files, using the soft masking in the fa
    mkdir nib
    foreach f (*/chr*.fa)
      faToNib -softMask $f nib/$f:t:r.nib
    end

    # Re-do masking for chr1.fa. A user noticed that the masking for this
    # chrom is very low compared to other chroms (2% vs almost 50%).
    # Re-running maskOutFa and debugging revealed that the error reported
    # above, invalid signed number: "*", causes the program to abort
    # so the chrom is incompletely masked. This row of the RepeatMasker
    # file (chr1.fa.out) is incorrect and the maskOutFa program fails when
    # it tries to load this line into a data structure.
    # Solution: remove the erroneous line (line 52575) and re-run
    # maskOutFa
    ssh kkstore01
    mkdir /cluster/data/danRer2/tmpMask
    cd /cluster/data/danRer2/tmpMask
    cp /cluster/data/danRer2/1/chr1.fa .
    cp /cluster/data/danRer2/1/chr1.fa.out .
    # make sure it is all upper case
    cat chr1.fa | tr '[a-z]' '[A-Z]' > chr1.tmp
    sed -e 's/>CHR1/>chr1/' chr1.tmp > chr1.fa
    # remove line 52575 from chr1.fa.out
    # re-run maskOutFa
    set trfCtg=/cluster/data/danRer2/bed/simpleRepeat/trfMask
    set trfChr=/cluster/data/danRer2/bed/simpleRepeat/trfMaskChrom
    foreach f (chr1.fa)
      echo "repeat- and trf-masking $f"
      maskOutFa -soft $f $f.out $f
      set chr = $f:t:r
      maskOutFa -softAdd $f $trfChr/$chr.bed $f
      echo "hard-masking $f"
      maskOutFa $f hard $f.masked
    end
    # then add chr1New as the FASTA header and copy to the directory
    # for chr1
    sed -e 's/>chr1/>chr1New/' chr1.fa > chr1New.fa
    sed -e 's/>chr1/>chr1New/' chr1.fa.masked > chr1New.fa.masked
    mv chr1.fa.out /cluster/data/danRer2/1/chr1New.fa.out
    mv chr1New.fa /cluster/data/danRer2/1/
    mv chr1New.fa.masked /cluster/data/danRer2/1/
    cd ..
    rm -r tmpMask
    cd 1
    faSize chr1.fa
    # 62208023 bases (3421437 N's 58786586 real 57507587 upper
    # 1278999 lower)
    # 2.1% are in lower case
    faSize chr1New.fa
    # 62208023 bases (3421437 N's 58786586 real 31874160 upper
    # 26912426 lower)
    # 43% masked as lower case
    faSize chr1.fa.masked
    # 62208023 bases (4700436 N's 57507587 real 57507587 upper 0 lower)
    # 7.6% are Ns
    faSize chr1New.fa.masked
    # 62208023 bases (30333863 N's 31874160 real 31874160 upper 0 lower)
    # 49% are Ns
    # just use this new masked sequence for downloads and replace the
    # masked chr1 sequences for downloads - see DOWNLOADABLE SEQUENCE
    # section below
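    # The "remove line 52575 from chr1.fa.out" step above was done by hand
    # in an editor; a non-interactive equivalent would be something like
    # (a sketch, assuming GNU sed):
    #   sed -i.bak '52575d' chr1.fa.out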
# STORING O+O SEQUENCE AND ASSEMBLY INFORMATION (DONE, 2004-11-22, hartera)
# RE-DONE (2004-12-02, hartera)
# MOVE danRer2.2bit out of gbdb nib directory (2004-12-15, hartera)
    # Make symbolic links from /gbdb/danRer2/nib to the real nibs.
    ssh hgwdev
    cd /cluster/data/danRer2
    mkdir -p /gbdb/danRer2/nib
    foreach f (/cluster/data/danRer2/nib/chr*.nib)
      ln -s $f /gbdb/danRer2/nib
    end
    # Load /gbdb/danRer2/nib paths into database and save size info.
    # hgNibSeq creates the chromInfo table
    hgNibSeq -preMadeNib danRer2 /gbdb/danRer2/nib */chr*.fa
    echo "select chrom,size from chromInfo" | hgsql -N danRer2 \
      > chrom.sizes
    # take a look at chrom.sizes, should be 28 lines
    wc chrom.sizes
    # Make one big 2bit file as well, and make a link to it in
    # /gbdb/danRer2/nib because hgBlat looks there:
    faToTwoBit */chr*.fa danRer2.2bit
    ln -s /cluster/data/danRer2/danRer2.2bit /gbdb/danRer2/nib/
    # Move this link out of the nib directory (2004-12-15, hartera)
    rm /gbdb/danRer2/nib/danRer2.2bit
    ln -s /cluster/data/danRer2/danRer2.2bit /gbdb/danRer2/

# MAKE GOLD AND GAP TRACKS (DONE, 2004-11-22, hartera)
# RE-DONE (2004-12-03, hartera)
    ssh hgwdev
    cd /cluster/data/danRer2
    # the gold and gap tracks are created from the chrN.agp file, which is
    # the scaffolds or supercontigs agp
    # move the other agp files out of the way to an agps directory
    foreach c (`cat chrom.lst`)
      mkdir ./$c/agps
      mv ./$c/chr${c}.chunks.agp ./$c/agps/
      mv ./$c/chr${c}.scaffolds.agp ./$c/agps/
    end
    # move the scaffolds agp for NA to the agps directory
    mv ./NA/scaffolds/chrNA.scaffolds.agp ./NA/agps/
    # the gold and gap tracks are created from the chrN.agp file
    hgGoldGapGl -noGl -chromLst=chrom.lst danRer2 /cluster/data/danRer2 .

# MAKE HGCENTRALTEST ENTRY AND TRACKDB TABLE FOR DANRER2
# (DONE, 2004-11-22, hartera)
# Correct nibPath for danRer2 in dbDb table (2004-12-15, hartera)
    # Make trackDb table so browser knows what tracks to expect:
    ssh hgwdev
    mkdir -p ~/kent/src/hg/makeDb/trackDb/zebrafish/danRer2
    cd $HOME/kent/src/hg/makeDb/trackDb
    cvs up -d -P
    # Edit the makefile to add danRer2 in all the right places and do
    make update
    make alpha
    cvs commit -m "Added danRer2." makefile
    # Add dbDb and defaultDb entries:
    echo 'insert into dbDb (name, description, nibPath, organism, \
          defaultPos, active, orderKey, genome, scientificName, \
          htmlPath, hgNearOk, hgPbOk, sourceName) \
          values("danRer2", "June 2004", \
          "/gbdb/danRer1/nib", "Zebrafish", "chr2:16,330,443-16,335,196", 1, \
          37, "Zebrafish", "Danio rerio", \
          "/gbdb/danRer2/html/description.html", 0, 0, \
          "Sanger Centre, Danio rerio Sequencing Project Zv4");' \
      | hgsql -h genome-testdb hgcentraltest
    # set danRer2 to be the default assembly for Zebrafish
    echo 'update defaultDb set name = "danRer2" \
          where genome = "Zebrafish";' \
      | hgsql -h genome-testdb hgcentraltest
    # the dbDb table has danRer1 instead of danRer2 in the nib path;
    # it should point to the location of the danRer2.2bit file
    # (2004-12-15, hartera)
    echo 'update dbDb set nibPath = "/gbdb/danRer2" \
          where name = "danRer2";' | hgsql hgcentraltest

# MAKE DESCRIPTION/SAMPLE POSITION HTML PAGE (DONE, 2004-11-22, hartera)
    ssh hgwdev
    mkdir /cluster/data/danRer2/html
    # make a symbolic link from /gbdb/danRer2/html to
    # /cluster/data/danRer2/html
    ln -s /cluster/data/danRer2/html /gbdb/danRer2/html
    # Add a description page for zebrafish
    cd /cluster/data/danRer2/html
    cp $HOME/kent/src/hg/makeDb/trackDb/zebrafish/danRer1/description.html .
    # Edit this for zebrafish danRer2 to create a description.html page
    mkdir -p ~/kent/src/hg/makeDb/trackDb/zebrafish/danRer2
    cd ~/kent/src/hg/makeDb/trackDb/zebrafish
    cvs add danRer2
    cvs commit danRer2
    # Add the description page here too
    cd danRer2
    cp /cluster/data/danRer2/html/description.html .
    cvs add description.html
    cvs commit -m "First draft of description page for danRer2." \
        description.html
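    # (sketch) to confirm the dbDb and defaultDb entries made above:
    echo 'select * from dbDb where name = "danRer2";' \
      | hgsql -h genome-testdb hgcentraltest
    echo 'select * from defaultDb where genome = "Zebrafish";' \
      | hgsql -h genome-testdb hgcentraltest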
# PUT MASKED SEQUENCE OUT FOR CLUSTER RUNS (DONE, 2004-11-22, hartera)
# RE-DONE (2004-12-03, hartera)
# PUT MASKED SEQUENCE ON BLUEARC (DONE, 2004-12-22 and 2004-12-23, hartera)
    ssh kkr1u00
    # Chrom-level mixed nibs that have been repeat- and trf-masked:
    rm -rf /iscratch/i/danRer2/nib
    mkdir -p /iscratch/i/danRer2/nib
    cp -p /cluster/data/danRer2/nib/chr*.nib /iscratch/i/danRer2/nib
    # Pseudo-contig fa's that have been repeat- and trf-masked:
    rm -rf /iscratch/i/danRer2/trfFa
    mkdir /iscratch/i/danRer2/trfFa
    foreach d (/cluster/data/danRer2/*/chr*_?{,?})
      cp -p $d/$d:t.fa /iscratch/i/danRer2/trfFa
    end
    rm -rf /iscratch/i/danRer2/rmsk
    mkdir -p /iscratch/i/danRer2/rmsk
    cp -p /cluster/data/danRer2/*/chr*.fa.out /iscratch/i/danRer2/rmsk
    cp -p /cluster/data/danRer2/danRer2.2bit /iscratch/i/danRer2/
    iSync
    # add to the bluearc
    ssh kksilo
    mkdir -p /cluster/bluearc/danRer2/nib
    cp -p /cluster/data/danRer2/nib/chr*.nib /cluster/bluearc/danRer2/nib
    mkdir -p /cluster/bluearc/danRer2/trfFa
    foreach d (/cluster/data/danRer2/*/chr*_?{,?})
      cp -p $d/$d:t.fa /cluster/bluearc/danRer2/trfFa
    end

# CREATE gc5Base WIGGLE TRACK (DONE, 2004-12-03, hartera)
    ssh kksilo
    mkdir -p /cluster/data/danRer2/bed/gc5Base
    cd /cluster/data/danRer2/bed/gc5Base
    # The number of bases that hgGcPercent claimed it measured is
    # calculated, which is not necessarily always 5 if it ran into gaps,
    # and then the division by 10.0 scales down the numbers from
    # hgGcPercent to the range [0-100]. wigEncode now replaces
    # wigAsciiToBinary and the previous processing step between these two
    # programs. The result file is *.wig. Each value represents the
    # measurement over five bases beginning with that position. wigEncode
    # also calculates the zoomed set of data.
    nice hgGcPercent -wigOut -doGaps -file=stdout -win=5 danRer2 \
        /iscratch/i/danRer2/nib | \
        wigEncode stdin gc5Base.wig gc5Base.wib
    # load the .wig file back on hgwdev:
    ssh hgwdev
    cd /cluster/data/danRer2/bed/gc5Base
    hgLoadWiggle -pathPrefix=/gbdb/danRer2/wib/gc5Base \
        danRer2 gc5Base gc5Base.wig
    # and symlink the .wib file into /gbdb
    mkdir -p /gbdb/danRer2/wib/gc5Base
    ln -s `pwd`/gc5Base.wib /gbdb/danRer2/wib/gc5Base

# MAKE 10.OOC, 11.OOC FILES FOR BLAT (DONE, 2004-12-03, hartera)
    # Use -repMatch=512 (based on size -- for human we use 1024, and
    # the zebrafish genome is ~50% of the size of the human genome)
    ssh kkr1u00
    mkdir /cluster/data/danRer2/bed/ooc
    cd /cluster/data/danRer2/bed/ooc
    mkdir -p /cluster/bluearc/danRer2
    ls -1 /cluster/data/danRer2/nib/chr*.nib > nib.lst
    blat nib.lst /dev/null /dev/null -tileSize=11 \
        -makeOoc=/cluster/bluearc/danRer2/11.ooc -repMatch=512
    # Wrote 44008 overused 11-mers to /cluster/bluearc/danRer2/11.ooc
    # For 10.ooc, repMatch = 4096 for human, so use 2048
    blat nib.lst /dev/null /dev/null -tileSize=10 \
        -makeOoc=/cluster/bluearc/danRer2/10.ooc -repMatch=2048
    # Wrote 10639 overused 10-mers to /cluster/bluearc/danRer2/10.ooc
    # keep copies of the ooc files in this directory and copy them to
    # iscratch
    cp /cluster/bluearc/danRer2/*.ooc .
    cp -p /cluster/bluearc/danRer2/*.ooc /iscratch/i/danRer2/
    iSync
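    # For reference, downstream cluster jobs consume these ooc files in
    # blat commands of the form below (a sketch; query.fa and out.psl are
    # placeholder names - see the affy and RH map runs further down for
    # the real invocations):
    #   blat /iscratch/i/danRer2/trfFa/chr1_1.fa query.fa out.psl \
    #       -mask=lower -ooc=/iscratch/i/danRer2/11.ooc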
# ADD CONTIGS TRACK (DONE, 2004-12-06, hartera)
# make ctgPos2 (contig name, size, chrom, chromStart, chromEnd) from lift
# RELOAD CONTIGS TRACK - NOT ALL CONTIGS HAD LOADED INTO DATABASE TABLE
# (DONE, 2005-06-15, hartera)
    ssh kksilo
    mkdir -p /cluster/data/danRer2/bed/ctgPos2
    cd /cluster/data/danRer2/bed/ctgPos2
    # ctgPos2 .sql .as .c and .h files exist - see makeDanRer1.doc
    foreach c (`cat /cluster/data/danRer2/chrom.lst`)
      awk 'BEGIN {OFS="\t"} \
          {if ($5 != "N") print $6, $3-$2+1, $1, $2-1, $3, $5}' \
          /cluster/data/danRer2/$c/agps/chr${c}.chunks.agp >> ctgPos2.tab
    end
    ssh hgwdev
    cd /cluster/data/danRer2/bed/ctgPos2
    # Reload table (2005-06-16, hartera) - not all the contigs had loaded
    # into the table because the unique primary key for ctgPos2.sql
    # consisted of only the first 14 characters of the contig name.
    # However, contigs for danRer2 have names such as "Zv4_scaffold99.1"
    # and "Zv4_scaffold99.3" for which the first 14 characters are not
    # unique, so only the first one is loaded. Also, there are cases where
    # the contig has an accession and more than one contig shares the same
    # accession as its name, so names are not unique. Therefore, this was
    # changed in ctgPos2.sql so that the first 20 characters of the contig
    # name and the chromStart form the composite primary key:
    # PRIMARY KEY(contig(20),chromStart)
    echo "drop table ctgPos2" | hgsql danRer2
    hgsql danRer2 < ~/kent/src/hg/lib/ctgPos2.sql
    echo "load data local infile 'ctgPos2.tab' into table ctgPos2" \
      | hgsql danRer2
    # Add trackDb.ra entry and ctgPos2.html
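    # (sketch) confirm the reloaded table now holds every contig; the two
    # counts below should match:
    wc -l ctgPos2.tab
    hgsql -N -e "select count(*) from ctgPos2;" danRer2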
# ENSEMBL GENES (DONE, 2004-12-06, hartera)
    ssh hgwdev
    mkdir -p /cluster/data/danRer2/bed/ensembl
    cd /cluster/data/danRer2/bed/ensembl
    # Get the ensembl gene data from
    # http://www.ensembl.org/Danio_rerio/martview
    # Follow this sequence through the pages:
    # Page 1) Make sure that the Danio_rerio choice is selected. Hit next.
    # Page 2) Uncheck the "Limit to" box in the region choice. Then hit
    #         next.
    # Page 3) Choose the "Structures" box.
    # Page 4) Choose GTF as the output. Choose gzip compression. Hit
    #         export. Save as ensemblGene.gtf.gz
    # The Ensembl gene predictions are mapped to chromosomes except for
    # chrNA and chrUn, so a lift file needs to be created to lift from
    # scaffold to chromosome co-ordinates.
    ssh kksilo
    mkdir -p /cluster/data/danRer2/bed/ensembl/liftSupertoChrom
    cd /cluster/data/danRer2/bed/ensembl/liftSupertoChrom
    # the lift files for scaffolds to chrom for NA and Un are already
    # created
    cp /cluster/data/danRer2/Un/chrUn.lft .
    cp /cluster/data/danRer2/NA/chrNA.lft .
    cat *.lft >> liftUnScaffoldToChrom.lft
    # get chrUn and chrNA records
    cd /cluster/data/danRer2/bed/ensembl
    awk '$1 ~ /^Zv4_NA[0-9]+/ || $1 ~ /^Zv4_scaffold[0-9]+/' \
        ensemblGene.gtf > ensemblGenechrUns.gtf
    # get records for all other chroms
    awk '$1 ~ /^[0-9]+/' ensemblGene.gtf > ensemblGenechroms.gtf
    liftUp -type=.gtf ensemblGenechrUns.lifted \
        ./liftSupertoChrom/liftUnScaffoldToChrom.lft warn \
        ensemblGenechrUns.gtf
    # Got 38882 lifts in ./liftSupertoChrom/liftUnScaffoldToChrom.lft
    sed -e "s/^/chr/" ensemblGenechroms.gtf > ensGene.gtf
    cat ensemblGenechrUns.lifted >> ensGene.gtf
    # check some of the lifted co-ordinates
    # load into database
    ssh hgwdev
    cd /cluster/data/danRer2/bed/ensembl
    /cluster/bin/i386/ldHgGene danRer2 ensGene \
        /cluster/data/danRer2/bed/ensembl/ensGene.gtf
    # Read 32062 transcripts in 592139 lines in 1 files
    # 32062 groups 27 seqs 1 sources 4 feature types
    # 32062 gene predictions
    # ensGtp associates geneId/transcriptId/proteinId for hgPepPred and
    # hgKnownToSuper. Use ensMart to create it as above, except:
    # Page 3) Choose the "Features" box. In "Ensembl Attributes", check
    #         Ensembl Gene ID, Ensembl Transcript ID, Ensembl Peptide ID.
    #         Choose Text, tab-separated as the output format. Result
    #         name: ensGtp.
    gunzip ensGtp.tsv.gz
    hgsql danRer2 < ~/kent/src/hg/lib/ensGtp.sql
    # remove the header line from ensGtp.tsv
    echo "load data local infile 'ensGtp.tsv' into table ensGtp" \
      | hgsql -N danRer2
    # Get the ensembl peptide sequences from
    # http://www.ensembl.org/Danio_rerio/martview
    # Choose Danio rerio as the organism.
    # Follow this sequence through the pages:
    # Page 1) Choose the Ensembl Genes choice. Hit next.
    # Page 2) Uncheck the "Limit to" box in the region choice. Then hit
    #         next.
    # Page 3) Choose the "Sequences" tab.
    # Page 4) Choose Transcripts/Proteins and peptide Only as the output,
    #         choose text/fasta and gzip compression,
    #         name the file ensemblPep.fa.gz and then hit export.
    gunzip ensemblPep.fa.gz
    hgPepPred danRer2 ensembl ensemblPep.fa
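    # (sketch) cross-check the Ensembl loads: ensGene and ensGtp should
    # both cover the 32062 transcripts reported by ldHgGene above:
    hgsql -N -e "select count(*) from ensGene;" danRer2
    hgsql -N -e "select count(*) from ensGtp;" danRer2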
# AFFYMETRIX ZEBRAFISH GENOME ARRAY CHIP (DONE, 2004-12-06, hartera)
    # sequences already downloaded for danRer1
    ssh hgwdev
    cd /projects/compbio/data/microarray/affyZebrafish
    cp /projects/compbio/data/microarray/affyZebrafish/Zebrafish_consensus.fa \
       /cluster/bluearc/affy/
    # Set up cluster job to align Zebrafish consensus sequences to danRer2
    ssh kkr1u00
    mkdir -p /cluster/data/danRer2/bed/affyZebrafish.2004-12-06
    ln -s /cluster/data/danRer2/bed/affyZebrafish.2004-12-06 \
          /cluster/data/danRer2/bed/affyZebrafish
    cd /cluster/data/danRer2/bed/affyZebrafish
    mkdir -p /iscratch/i/affy
    cp /cluster/bluearc/affy/Zebrafish_consensus.fa /iscratch/i/affy
    iSync
    ssh kk
    cd /cluster/data/danRer2/bed/affyZebrafish
    ls -1 /iscratch/i/affy/Zebrafish_consensus.fa > affy.lst
    ls -1 /iscratch/i/danRer2/trfFa/ > genome.lst
cat << '_EOF_' > template.sub
#LOOP
/cluster/bin/i386/blat -fine -mask=lower -minIdentity=95 -ooc=/iscratch/i/danRer2/11.ooc /iscratch/i/danRer2/trfFa/$(path1) $(path2) {check out line+ psl/$(root1)_$(root2).psl}
#ENDLOOP
'_EOF_'
    # << this line keeps emacs coloring happy
    gensub2 genome.lst affy.lst template.sub para.spec
    mkdir psl
    para create para.spec
    para try, check, push ... etc.
    # para time
    # Completed: 299 of 299 jobs
    # CPU time in finished jobs:  4702s  78.37m  1.31h  0.05d  0.000 y
    # IO & Wait Time:             3077s  51.28m  0.85h  0.04d  0.000 y
    # Average job time:             26s   0.43m  0.01h  0.00d
    # Longest job:                 128s   2.13m  0.04h  0.00d
    # Submission to last job:      685s  11.42m  0.19h  0.01d
    ssh kksilo
    cd /cluster/data/danRer2/bed/affyZebrafish
    # Do sort, best in genome filter, and convert to chromosome
    # coordinates to create affyZebrafish.psl
    pslSort dirs raw.psl tmp psl
    # only use alignments that have at least 95% identity in the aligned
    # region.
    # do not use minCover since a lot of sequence is in Un, NA and
    # Finished chroms so genes may be split up, and it is good to see all
    # alignments
    pslReps -minAli=0.95 -nearTop=0.005 raw.psl contig.psl /dev/null
    liftUp affyZebrafish.psl ../../jkStuff/liftAll.lft warn contig.psl
    # shorten names in the psl file
    sed -e 's/Zebrafish://' affyZebrafish.psl > affyZebrafish.psl.bak
    mv affyZebrafish.psl.bak affyZebrafish.psl
    pslCheck affyZebrafish.psl
    # psl is good
    # load track into database
    ssh hgwdev
    cd /cluster/data/danRer2/bed/affyZebrafish
    hgLoadPsl danRer2 affyZebrafish.psl
    # Add consensus sequences for the Zebrafish chip.
    # Copy sequences to gbdb if they are not there already
    mkdir -p /gbdb/hgFixed/affyProbes
    ln -s \
      /projects/compbio/data/microarray/affyZebrafish/Zebrafish_consensus.fa \
      /gbdb/hgFixed/affyProbes
    hgLoadSeq -abbr=Zebrafish: danRer2 \
      /gbdb/hgFixed/affyProbes/Zebrafish_consensus.fa
    # Clean up
    rm batch.bak contig.psl raw.psl
    # add entry to trackDb.ra in
    # ~kent/src/hg/makeDb/trackDb/zebrafish/danRer2

# RH MAP TRACK (DONE, 2004-12-07, hartera)
# create sym links to RH sequences in /gbdb/danRer2 (2004-12-17, hartera)
# RE-DO RH MAP TRACK WITH MORE STRINGENT POST-BLAT FILTERING
# (DONE, 2005-05-15, hartera)
    # Data from Leonard Zon's lab at the Children's Hospital, Boston.
    # Radiation hybrid (RH) map data consists of 8707 genomic sequences:
    # (1) 2835 BAC clone ends (Max Planck Inst., Robert Geisler's lab,
    #     Tuebingen, Germany)
    # (2) 2015 NCBI ESTs (submitted from around the planet)
    # (3) 171 RH shotgun sequences (Max Planck Inst. & Children's Hosp.
    #     Boston)
    # (4) 3686 WZ compiled ESTs from WashU
    ssh kkr1u00
    mkdir -p /cluster/data/danRer2/bed/ZonLab/rhMap
    cd /cluster/data/danRer2/bed/ZonLab/rhMap
    # sequences for the RH map are in
    # /cluster/data/danRer1/bed/ZonLab/rhmap/seq/rh.fa
    # rhcontigs.fa is the sequences all in one file with formatted headers
    # copy to iscratch if not there already
    mkdir -p /iscratch/i/danRer2/rhmap
    cp -p /cluster/data/danRer1/bed/ZonLab/rhmap/seq/rhcontigs.fa \
          /iscratch/i/danRer2/rhmap
    iSync
    # do the blat run to map RH map sequences to danRer2
    ssh kk
    cd /cluster/data/danRer2/bed/ZonLab/rhMap
    mkdir psl
    ls -1S /iscratch/i/danRer2/rhmap/rhcontigs.fa > rhmap.lst
    ls -1S /iscratch/i/danRer2/trfFa/*.fa > genome.lst
    # try the same parameters as for bacEnds
cat << '_EOF_' > gsub
#LOOP
/cluster/bin/i386/blat {check in line+ $(path1)} {check in line+ $(path2)} -tileSize=10 -ooc=/iscratch/i/danRer2/10.ooc {check out line+ psl/$(root1)_$(root2).psl}
#ENDLOOP
'_EOF_'
    # << this line keeps emacs coloring happy
    gensub2 genome.lst rhmap.lst gsub spec
    para create spec
    para try
    para check
    para push, try ... etc.
    # para time
    # Completed: 299 of 299 jobs
    # CPU time in finished jobs:  7977s  132.94m  2.22h  0.09d  0.000 y
    # IO & Wait Time:             3114s   51.91m  0.87h  0.04d  0.000 y
    # Average job time:             37s    0.62m  0.01h  0.00d
    # Longest job:                  54s    0.90m  0.01h  0.00d
    # Submission to last job:      155s    2.58m  0.04h  0.00d
    ssh kksilo
    cd /cluster/data/danRer2/bed/ZonLab/rhMap
    # Make & check the psl table.
    # Do sort, best in genome filter, and convert to chromosome
    # coordinates to create rhMap.psl
    pslSort dirs raw.psl tmp psl
    # only use alignments that have at least 80% identity in the aligned
    # region; these are the parameters used for STS markers:
    # pslReps -nearTop=0.0001 -minAli=0.8 -noIntrons raw.psl \
    #     contig.psl /dev/null
    # Processed 667164 alignments
    # try more stringent filtering for these: minAli=0.96 and
    # minCover=0.40 was suggested by Jim (DONE, 2005-05-15, hartera) -
    # these parameters worked well for danRer1 - see makeDanRer1.doc
    mv rhMap.psl rhMap.psl.old
    mv contig.psl contig.psl.old
    pslReps -nearTop=0.0001 -minAli=0.96 -minCover=0.40 raw.psl \
        contig.psl /dev/null
    # Processed 667164 alignments
    liftUp rhMap.psl /cluster/data/danRer2/jkStuff/liftAll.lft warn \
        contig.psl
    pslCheck rhMap.psl
    # psl is ok.
    # with the previous parameters, the sequence with the most alignments
    # had 1128; over 1400 sequences had multiple alignments and 7064
    # sequences had 1 alignment. Now the most is 8: 78 sequences have 3-8
    # alignments, 820 have 2 and 6709 have 1 alignment.
    # the new filtered psl has 8609 alignments; there were 13821 for the
    # previous parameters (8492 sequences aligned, 98%)
    # Load sequence alignments into database.
    # Reload rhMap alignments - more stringent filtering
    # (hartera, 2005-05-15)
    ssh hgwdev
    cd /cluster/data/danRer2/bed/ZonLab/rhMap
    echo "drop table rhMap;" | hgsql danRer2
    hgLoadPsl danRer2 rhMap.psl
    hgsql -N -e "select qName from rhMap;" danRer2 | sort | uniq | wc
    # 7607 out of 8690 sequences aligned (88%)
    # Add RH map sequences.
    # Copy sequences to gbdb if they are not there already;
    # create the sym link in the /gbdb/danRer2 directory instead of
    # /gbdb/danRer1 (2004-12-17, hartera)
    mkdir -p /gbdb/danRer2/rhMap
    ln -s \
      /cluster/data/danRer1/bed/ZonLab/rhmap/seq/rhcontigs.fa \
      /gbdb/danRer2/rhMap
    cd /cluster/data/danRer2/bed/ZonLab/rhMap
    hgLoadSeq danRer2 /gbdb/danRer1/rhmap/rhcontigs.fa
    # change the extFile entry to point to the danRer2 symlink
    echo 'update extFile set path = "/gbdb/danRer2/rhMap/rhcontigs.fa" \
          where id = 15513;' | hgsql danRer2
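    # (sketch) the extFile id used above (15513) is installation-specific;
    # it can be looked up instead of hard-coded:
    hgsql -N -e 'select id, path from extFile where path like "%rhcontigs%";' \
        danRer2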
# MAKE HGCENTRALTEST BLATSERVERS ENTRY FOR DANRER2
# (DONE, 2004-12-08, hartera)
    # hgcentraltest is now on hgwdev
    ssh hgwdev
    # DNA port is "0", trans prot port is "1"
    echo 'insert into blatServers values("danRer2", "blat14", "17789", "0", "0"); \
          insert into blatServers values("danRer2", "blat14", "17788", "1", "0");' \
      | hgsql hgcentraltest
    # if you need to delete those entries
    echo 'delete from blatServers where db="danRer2";' \
      | hgsql hgcentraltest
    # to check the entries:
    echo 'select * from blatServers where db="danRer2";' \
      | hgsql hgcentraltest

# AUTO UPDATE GENBANK MRNA AND EST RUN (DONE, 2004-12-09, hartera)
    ssh eieio
    cd /cluster/data/genbank
    # This is a new assembly; edit the etc/genbank.conf file and add:
# danRer2 (zebrafish)
danRer2.genome = /iscratch/i/danRer2/nib/chr*.nib
danRer2.lift = /cluster/data/danRer2/jkStuff/liftAll.lft
danRer2.downloadDir = danRer2
    # Default includes native genbank mRNAs and ESTs,
    # genbank xeno mRNAs but no xeno ESTs, native RefSeq mRNAs but
    # not xeno RefSeq
    cvs commit -m "Added danRer2" etc/genbank.conf
    # No need to edit ~/kent/src/hg/makeDb/genbank/src/align/gbBlat
    # since the 11.ooc file for danRer1 will be good for both assemblies:
    # no new sequence was added, just an improvement on the mapping of
    # unmapped sequence to chromosomes.
    # ~/kent/src/hg/makeDb/genbank/src/lib/gbGenome.c already contains
    # danRer genome information
    # Install to /cluster/data/genbank:
    make install-server
    ssh eieio
    cd /cluster/data/genbank
    # This is an -initial run, all sequences:
    nice bin/gbAlignStep -verbose=1 -initial danRer2
    # Load results for all sequences:
    ssh hgwdev
    cd /cluster/data/genbank
    nice bin/gbDbLoadStep -verbose=1 -drop -initialLoad danRer2
    # Results need to be moved out of the way:
    cd /cluster/data/genbank/work
    mv initial.danRer2 initial.danRer2.allseqs

# BLASTZ SWAP FOR hg17 vs. danRer2 BLASTZ TO CREATE danRer2 vs. hg17
# (DONE, 2004-12-09, hartera)
# USE RESCORED ALIGNMENTS (see makeHg17.doc)
    ssh kolossus
    mkdir -p /cluster/data/danRer2/bed/blastz.hg17.swap
    cd /cluster/data/danRer2/bed/blastz.hg17.swap
    # use rescored axtChrom from blastzDanRer2 on hg17
    set aliDir = /cluster/data/hg17/bed/blastz.danRer2
    cp $aliDir/S1.len S2.len
    cp $aliDir/S2.len S1.len
    mkdir unsorted axtChrom
    cat $aliDir/axtChrom/chr*.axt \
      | axtSwap stdin $aliDir/S1.len $aliDir/S2.len stdout \
      | axtSplitByTarget stdin unsorted
    # Sort the shuffled .axt files.
    foreach f (unsorted/*.axt)
      echo sorting $f:t:r
      axtSort $f axtChrom/$f:t
    end
    du -sh $aliDir/axtChrom unsorted axtChrom
    # 1.1G    /cluster/data/hg17/bed/blastz.danRer2/axtChrom
    # 1.1G    unsorted
    # 1.1G    axtChrom
    rm -r unsorted
    # translate sorted axt files into psl
    cd /cluster/data/danRer2/bed/blastz.hg17.swap
    mkdir -p pslChrom
    set tbl = "blastzHg17"
    foreach f (axtChrom/chr*.axt)
      set c=$f:t:r
      echo "Processing chr $c"
      /cluster/bin/i386/axtToPsl $f S1.len S2.len pslChrom/${c}_${tbl}.psl
    end
    # Load database tables
    ssh hgwdev
    cd /cluster/data/danRer2/bed/blastz.hg17.swap/pslChrom
    foreach f (./*.psl)
      /cluster/bin/i386/hgLoadPsl danRer2 $f
      echo "$f Done"
    end

# CHAIN HUMAN (hg17) BLASTZ (DONE, 2004-12-10, hartera)
# APPLY chainAntiRepeat TO REMOVE CHAINS THAT ARE THE RESULTS OF REPEATS
# AND DEGENERATE DNA (DONE, 2004-12-21, hartera)
    # Run axtChain on little cluster
    ssh kki
    cd /cluster/data/danRer2/bed/blastz.hg17.swap
    mkdir -p axtChain/run1
    cd axtChain/run1
    mkdir out chain
    # Reuse gap penalties from hg16 vs chicken run (fields in this file
    # are whitespace-separated; the ^V's in the original transcript were
    # just the shell's quote prefix for typing literal tabs):
cat << '_EOF_' > ../../chickenHumanTuned.gap
tablesize 11
smallSize 111
position 1 2 3 11 111 2111 12111 32111 72111 152111 252111
qGap 325 360 400 450 600 1100 3600 7600 15600 31600 56600
tGap 325 360 400 450 600 1100 3600 7600 15600 31600 56600
bothGap 625 660 700 750 900 1400 4000 8000 16000 32000 57000
'_EOF_'
    # << this line makes emacs coloring happy
cat << '_EOF_' > doChain
#!/bin/csh
axtChain -scoreScheme=/cluster/data/blastz/HoxD55.q \
    -linearGap=../../chickenHumanTuned.gap $1 \
    /iscratch/i/danRer2/nib \
    /iscratch/i/gs.18/build35/bothMaskedNibs $2 >& $3
'_EOF_'
    # << this line makes emacs coloring happy
    chmod a+x doChain
cat << '_EOF_' > gsub
#LOOP
doChain {check in exists $(path1)} {check out line+ chain/$(root1).chain} {check out line+ out/$(root1).out}
#ENDLOOP
'_EOF_'
    # create input list and batch for cluster run
    ls -1S /cluster/data/danRer2/bed/blastz.hg17.swap/axtChrom/*.axt \
      > input.lst
    gensub2 input.lst single gsub jobList
    para create jobList
    para try, check, push, check...
    # para time
    # Completed: 28 of 28 jobs
    # CPU time in finished jobs:  2416s  40.27m  0.67h  0.03d  0.000 y
    # IO & Wait Time:              351s   5.85m  0.10h  0.00d  0.000 y
    # Average job time:             99s   1.65m  0.03h  0.00d
    # Longest job:                 144s   2.40m  0.04h  0.00d
    # Submission to last job:      524s   8.73m  0.15h  0.01d

    # now on the cluster server, sort chains
    ssh kksilo
    cd /cluster/data/danRer2/bed/blastz.hg17.swap/axtChain
    chainMergeSort run1/chain/*.chain > all.chain
    chainSplit chain all.chain
    # take a look at score distributions
    foreach f (chain/*.chain)
      grep chain $f | awk '{print $2;}' | sort -nr > /tmp/score.$f:t:r
      echo $f:t:r >> hist5000.out
      textHistogram -binSize=5000 /tmp/score.$f:t:r >> hist5000.out
      echo ""
    end
    # only chrNA has a large number of chains with score <= 5000
    # (>200,000 chains); chrUn has about 90,000 chains with score <= 5000
    # apply filter with minScore=5000
    rm -r chain
    mv all.chain all.chain.unfiltered
    chainFilter -minScore=5000 all.chain.unfiltered > all.chain
    chainSplit chain all.chain
    # remove repeats from chains and reload into database
    # (2004-12-21, hartera)
    mv chain chainRaw
    mkdir chain
    cd chainRaw
    foreach f (*.chain)
      set c = $f:r
      echo $c
      nice chainAntiRepeat /iscratch/i/danRer2/nib \
        /iscratch/i/gs.18/build35/bothMaskedNibs $f \
        ../chain/$c.chain
    end
    cd ..
    chainMergeSort ./chain/*.chain > all.chain.antirepeat
    chainSplit chainAR all.chain.antirepeat
    # load filtered chains and check
    ssh hgwdev
    cd /cluster/data/danRer2/bed/blastz.hg17.swap/axtChain/chainAR
    foreach i (*.chain)
      set c = $i:r
      hgLoadChain danRer2 ${c}_chainHg17 $i
      echo done $c
    end
    # featureBits -chrom=chr1 danRer2 refGene:cds chainHg17 -enrichment
    # refGene:cds 0.512%, chainHg17 34.242%, both 0.444%, cover 86.63%,
    # enrich 2.53x
    # featureBits -chrom=chr1 danRer2 refGene:cds chainHg17Link -enrichment
    # refGene:cds 0.512%, chainHg17Link 4.760%, both 0.399%, cover 77.80%,
    # enrich 16.3
    # featureBits -chrom=chr1 danRer1 refGene:cds chainHg17Link -enrichment
    # refGene:cds 0.529%, chainHg17Link 3.924%, both 0.409%, cover 77.24%,
    # enrich 19.69x
    # number of rows for chr1:
    # chainHg17Link    185694
    # after using chainAntiRepeat:
    # chainHg17ARLink  176455
    # after chainAntiRepeat done on filtered chains:
    # featureBits -chrom=chr1 danRer2 refGene:cds chainHg17ARLink \
    #     -enrichment
    # refGene:cds 0.512%, chainHg17ARLink 4.625%, both 0.398%,
    # cover 77.79%, enrich 16.82x

# NET HUMAN (hg17) BLASTZ (DONE, 2004-12-10, hartera)
# RE-DO NET WITH CHAINS FILTERED BY chainAntiRepeat
# (DONE, 2004-12-21, hartera)
    ssh kksilo
    cd /cluster/data/danRer2/bed/blastz.hg17.swap/axtChain
    mkdir preNet
    cd chainAR
    foreach i (*.chain)
      echo preNetting $i
      /cluster/bin/i386/chainPreNet $i ../../S1.len ../../S2.len \
        ../preNet/$i
    end
    cd ..
    mkdir n1
    cd preNet
    foreach i (*.chain)
      set n = $i:r.net
      echo primary netting $i
      /cluster/bin/i386/chainNet $i -minSpace=1 ../../S1.len ../../S2.len \
        ../n1/$n /dev/null
    end
    cd ..
cat n1/*.net | /cluster/bin/i386/netSyntenic stdin noClass.net # memory usage 126734336, utime 657 s/100, stime 69 # Add classification info using db tables: # netClass looks for ancient repeats in one of the databases # hg17 has this table - hand-curated by Arian but this is for # human-rodent comparisons so do not use here, use -noAr option # linSpecRep.notInHuman and linSpecRep.notInZebrafish exist # (see makeHg17.doc) ssh hgwdev cd /cluster/data/danRer2/bed/blastz.hg17.swap/axtChain time netClass noClass.net danRer2 hg17 humanhg17.net \ -tNewR=/cluster/bluearc/danRer2/linSpecRep.notInHuman \ -qNewR=/cluster/bluearc/hg17/linSpecRep.notInZebrafish -noAr # 87.940u 51.110s 4:05.95 56.5% 0+0k 0+0io 631pf+0w # load net into database netFilter -minGap=10 humanhg17.net | hgLoadNet danRer2 netHg17 stdin # featureBits danRer1 refGene:cds netHg17 -enrichment # refGene:cds 0.462%, netHg17 27.868%, both 0.384%, cover 83.21%, enrich 2.99x # featureBits danRer2 refGene:cds netHg17 -enrichment # refGene:cds 0.468%, netHg17 30.608%, both 0.406%, cover 86.82%, enrich 2.84x # BLASTZ SWAP FOR mm5 vs danRer2 BLASTZ TO CREATE danRer2 vs mm5 BLASTZ # (DONE, 2004-12-13, hartera) ssh kolossus mkdir -p /cluster/data/danRer2/bed/blastz.mm5.swap cd /cluster/data/danRer2/bed/blastz.mm5.swap # use rescored axtChrom from blastzDanRer2 on mm5 set aliDir = /cluster/data/mm5/bed/blastz.danRer2 cp $aliDir/S1.len S2.len cp $aliDir/S2.len S1.len mkdir unsorted axtChrom cat $aliDir/axtChrom/chr*.axt \ | axtSwap stdin $aliDir/S1.len $aliDir/S2.len stdout \ | axtSplitByTarget stdin unsorted # Sort the shuffled .axt files. foreach f (unsorted/*.axt) echo sorting $f:t:r axtSort $f axtChrom/$f:t end du -sh $aliDir/axtChrom unsorted axtChrom # 919M /cluster/data/mm5/bed/blastz.danRer2/axtChrom # 921M unsorted # 919M axtChrom rm -r unsorted # translate sorted axt files into psl ssh kolossus cd /cluster/data/danRer2/bed/blastz.mm5.swap mkdir -p pslChrom set tbl = "blastzMm5" foreach f (axtChrom/chr*.axt) set c=$f:t:r echo "Processing chr $c" /cluster/bin/i386/axtToPsl $f S1.len S2.len pslChrom/${c}_${tbl}.psl end # Load database tables ssh hgwdev cd /cluster/data/danRer2/bed/blastz.mm5.swap/pslChrom foreach f (./*.psl) /cluster/bin/i386/hgLoadPsl danRer2 $f echo "$f Done" end # CHAIN MOUSE (mm5) BLASTZ (DONE, 2004-12-13, hartera) # APPLY chainAntiRepeat TO REMOVE CHAINS THAT ARE THE RESULTS OF REPEATS # AND DEGENERATE DNA (DONE, 2004-12-21, hartera) # Run axtChain on little cluster ssh kki cd /cluster/data/danRer2/bed/blastz.mm5.swap mkdir -p axtChain/run1 cd axtChain/run1 mkdir out chain ls -1S /cluster/data/danRer2/bed/blastz.mm5.swap/axtChrom/*.axt \ > input.lst cat << '_EOF_' > gsub #LOOP doChain {check in exists $(path1)} {check out line+ chain/$(root1).chain} {check out line+ out/$(root1).out} #ENDLOOP '_EOF_' # << this line makes emacs coloring happy # Reuse gap penalties from hg16 vs chicken run. 
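# (About the gap file below: axtChain's -linearGap file is a small
# tab-separated table of piecewise-linear gap costs - 'position' gives
# gap-length breakpoints and qGap/tGap/bothGap the corresponding
# penalties. The ^V characters appear to mark where a quoted literal tab
# was typed into the heredoc.)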
cat << '_EOF_' > ../../chickenHumanTuned.gap tablesize^V 11 smallSize^V 111 position^V 1^V 2^V 3^V 11^V 111^V 2111^V 12111^V 32111^V 72111^V 152111^V 252111 qGap^V 325^V 360^V 400^V 450^V 600^V 1100^V 3600^V 7600^V 15600^V 31600^V 56600 tGap^V 325^V 360^V 400^V 450^V 600^V 1100^V 3600^V 7600^V 15600^V 31600^V 56600 bothGap^V 625^V 660^V 700^V 750^V 900^V 1400^V 4000^V 8000^V 16000^V 32000^V 57000 '_EOF_' # << this line makes emacs coloring happy # create chains with only default filtering on score - minScore = 1000 cat << '_EOF_' > doChain #!/bin/csh axtChain -scoreScheme=/cluster/data/blastz/HoxD55.q \ -linearGap=../../chickenHumanTuned.gap $1 \ /iscratch/i/danRer2/nib \ /cluster/bluearc/scratch/mus/mm5/softNib $2 >& $3 '_EOF_' chmod a+x doChain gensub2 input.lst single gsub jobList para create jobList para try, check, push, check... # para time # Completed: 28 of 28 jobs # CPU time in finished jobs: 2405s 40.08m 0.67h 0.03d 0.000 y # IO & Wait Time: 447s 7.46m 0.12h 0.01d 0.000 y # Average job time: 102s 1.70m 0.03h 0.00d # Longest job: 147s 2.45m 0.04h 0.00d # Submission to last job: 523s 8.72m 0.15h 0.01d # now on the cluster server, sort chains ssh kksilo cd /cluster/data/danRer2/bed/blastz.mm5.swap/axtChain chainMergeSort run1/chain/*.chain > all.chain chainSplit chain all.chain # take a look at score distr's foreach f (chain/*.chain) grep chain $f | awk '{print $2;}' | sort -nr > /tmp/score.$f:t:r echo $f:t:r >> hist5000.out textHistogram -binSize=5000 /tmp/score.$f:t:r >> hist5000.out echo "" end # only chrUn and chrNA have >100,000 chains with score <=5000 # filter on minScore=5000 mv all.chain all.chain.unfiltered chainFilter -minScore=5000 all.chain.unfiltered > all.chainFilt5k rm -r chain chainSplit chain all.chainFilt5k # remove repeats from chains and reload into database # (2004-12-21, hartera) ssh kksilo cd /cluster/data/danRer2/bed/blastz.mm5.swap/axtChain mv chain chainRaw mkdir chain cd chainRaw foreach f (*.chain) set c = $f:r echo $c nice chainAntiRepeat /iscratch/i/danRer2/nib \ /cluster/bluearc/scratch/mus/mm5/softNib $f \ ../chain/$c.chain end cd .. 
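# (Optional sanity check - a sketch, not part of the original run:
# compare per-chrom chain counts before and after chainAntiRepeat)
#   grep -c '^chain ' chainRaw/chr1.chain chain/chr1.chain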
chainMergeSort ./chain/*.chain > all.chain.antirepeat chainSplit chainAR all.chain.antirepeat # load chains into database ssh hgwdev cd /cluster/data/danRer2/bed/blastz.mm5.swap/axtChain/chainAR foreach i (*.chain) set c = $i:r hgLoadChain danRer2 ${c}_chainMm5 $i echo done $c end # featureBits -chrom=chr1 danRer2 refGene:cds chainMm5 -enrichment # refGene:cds 0.512%, chainMm5 34.341%, both 0.436%, cover 85.08%, enrich 2.48x # featureBits -chrom=chr1 danRer2 refGene:cds chainMm5Link -enrichment # refGene:cds 0.512%, chainMm5Link 4.588%, both 0.395%, cover 77.08%, # enrich 16.80x # featureBits -chrom=chr1 danRer1 refGene:cds chainMm5 -enrichment # refGene:cds 0.529%, chainMm5 42.554%, both 0.459%, cover 86.70%, enrich 2.04x # featureBits -chrom=chr1 danRer1 refGene:cds chainMm5Link -enrichment # refGene:cds 0.529%, chainMm5Link 7.948%, both 0.412%, cover 77.80%, # enrich 9.79x # after chainAntiRepeat: # featureBits -chrom=chr1 danRer2 refGene:cds chainMm5Link -enrichment # refGene:cds 0.512%, chainMm5Link 4.499%, both 0.395%, cover 77.08%, # enrich 17.13x # before chainAntiRepeat: # chr1_chainMm5Link 167661 # after chainAntiRepeat: # chr1_chainMm5Link 162853 # NET MOUSE (mm5) BLASTZ (DONE, 2004-12-13, hartera) # RE-DO NET WITH CHAINS FILTERED BY chainAntiRepeat (DONE, 2004-12-21, hartera) ssh kksilo cd /cluster/data/danRer2/bed/blastz.mm5.swap/axtChain mkdir preNet cd chainAR foreach i (*.chain) echo preNetting $i /cluster/bin/i386/chainPreNet $i ../../S1.len ../../S2.len \ ../preNet/$i end cd .. mkdir n1 cd preNet foreach i (*.chain) set n = $i:r.net echo primary netting $i /cluster/bin/i386/chainNet $i -minSpace=1 ../../S1.len ../../S2.len \ ../n1/$n /dev/null end cd .. cat n1/*.net | /cluster/bin/i386/netSyntenic stdin noClass.net # memory usage 119873536, utime 849 s/100, stime 81 # Add classification info using db tables: # netClass looks for ancient repeats in one of the databases # hg17 has this table - hand-curated by Arian but this is for # human-rodent comparisons so do not use here, use -noAr option mkdir -p /cluster/bluearc/mm5/linSpecRep.notInZebrafish mkdir -p /cluster/bluearc/danRer2/linSpecRep.notInMouse cp /iscratch/i/mm5/linSpecRep.notInZebrafish/* \ /cluster/bluearc/mm5/linSpecRep.notInZebrafish cp /iscratch/i/danRer2/linSpecRep.notInMouse/* \ /cluster/bluearc/danRer2/linSpecRep.notInMouse ssh hgwdev cd /cluster/data/danRer2/bed/blastz.mm5.swap/axtChain time netClass noClass.net danRer2 mm5 mouseMm5.net \ -tNewR=/cluster/bluearc/danRer2/linSpecRep.notInMouse \ -qNewR=/cluster/bluearc/mm5/linSpecRep.notInZebrafish -noAr # 86.210u 51.710s 4:02.29 56.9% 0+0k 0+0io 206pf+0w netFilter -minGap=10 mouseMm5.net | hgLoadNet danRer2 netMm5 stdin # featureBits danRer2 refGene:cds netMm5 -enrichment # refGene:cds 0.468%, netMm5 30.706%, both 0.404%, cover 86.40%, enrich 2.81x # featureBits danRer1 refGene:cds netMm5 -enrichment # refGene:cds 0.461%, netMm5 36.622%, both 0.395%, cover 85.70%, enrich 2.34x # add html and trackDb.ra entries # after chainAntiRepeat: # featureBits danRer2 refGene:cds netMm5 -enrichment # refGene:cds 0.468%, netMm5 30.565%, both 0.405%, cover 86.40%, enrich 2.83x # MAKE DOWNLOADABLE SEQUENCE FILES (DONE, 2004-12-14, hartera) # CORRECTION MADE TO chrM.agp FILE - MADE TAB DELIMITED INSTEAD OF # SPACE DELIMITED SO REPLACE OLD AGP FILE IN DOWNLOADS # (DONE, 2005-04-25, hartera) # REPLACE chr1 WITH chr1New IN chromFa and chromFaMasked - chr1 was # incompletely masked due to an error in the RepeatMasker output for chr1. 
# See section on "MASK SEQUENCE WITH REPEATMASKER AND SIMPLE REPEAT/TRF" # (2005-06-06, hartera) # UPDATED README FILES WITH NEW SEQUENCE FTP FOR SANGER (2005-08-04, hartera) ssh kksilo cd /cluster/data/danRer2 #- Build the .zip files cat << '_EOF_' > jkStuff/zipAll.csh rm -rf zip mkdir zip # chrom AGP's zip -j zip/chromAgp.zip [0-9A-Z]*/chr*.agp # chrom RepeatMasker out files zip -j zip/chromOut.zip */chr*.fa.out # soft masked chrom fasta zip -j zip/chromFa.zip */chr*.fa # hard masked chrom fasta zip -j zip/chromFaMasked.zip */chr*.fa.masked # chrom TRF output files cd bed/simpleRepeat zip ../../zip/chromTrf.zip trfMaskChrom/chr*.bed cd ../.. # get GenBank native mRNAs cd /cluster/data/genbank ./bin/i386/gbGetSeqs -db=danRer2 -native GenBank mrna \ /cluster/data/danRer2/zip/mrna.fa # get GenBank xeno mRNAs ./bin/i386/gbGetSeqs -db=danRer2 -xeno GenBank mrna \ /cluster/data/danRer2/zip/xenoMrna.fa # get native RefSeq mRNAs ./bin/i386/gbGetSeqs -db=danRer2 -native refseq mrna \ /cluster/data/danRer2/zip/refMrna.fa # get native GenBank ESTs ./bin/i386/gbGetSeqs -db=danRer2 -native GenBank est \ /cluster/data/danRer2/zip/est.fa cd /cluster/data/danRer2/zip # zip GenBank native and xeno mRNAs, native ESTs and RefSeq mRNAs zip -j mrna.zip mrna.fa zip -j xenoMrna.zip xenoMrna.fa zip -j refMrna.zip refMrna.fa zip -j est.zip est.fa '_EOF_' # << this line makes emacs coloring happy csh ./jkStuff/zipAll.csh |& tee ./jkStuff/zipAll.log cd zip # remake just the agp files zip with corrected chrM.agp (2005-04-25, hartera) ssh kksilo cd /cluster/data/danRer2 rm /cluster/data/danRer2/zip/chromAgp.zip # chrom AGP's zip -j zip/chromAgp.zip [0-9A-Z]*/chr*.agp #- Look at zipAll.log to make sure all file lists look reasonable. # Make upstream files and copy the .zip files to # hgwdev:/usr/local/apache/... 
ssh hgwdev cd /cluster/data/danRer2/zip # make upstream files for zebrafish RefSeq featureBits danRer2 refGene:upstream:1000 -fa=upstream1000.fa zip upstream1000.zip upstream1000.fa featureBits danRer2 refGene:upstream:2000 -fa=upstream2000.fa zip upstream2000.zip upstream2000.fa #- Check zip file integrity: foreach f (*.zip) unzip -t $f > $f.test tail -1 $f.test end wc -l *.zip.test set gp = /usr/local/apache/htdocs/goldenPath/danRer2 mkdir -p $gp/bigZips cp -p *.zip $gp/bigZips mkdir -p $gp/chromosomes foreach f (../*/chr*.fa) zip -j $gp/chromosomes/$f:t.zip $f end cd $gp/bigZips md5sum *.zip > md5sum.txt cd $gp/chromosomes md5sum *.zip > md5sum.txt # Take a look at bigZips/* and chromosomes/*, update their README.txt's # Remake just the chromAgp.zip with correct chrM.agp (2005-04-25, hartera) ssh kksilo cd /cluster/data/danRer2 rm /cluster/data/danRer2/zip/chromAgp.zip # chrom AGP's zip -j zip/chromAgp.zip [0-9A-Z]*/chr*.agp ssh hgwdev cd /cluster/data/danRer2/zip #- Check zip file integrity: foreach f (chromAgp.zip) unzip -t $f > $f.test tail -1 $f.test end wc -l chromAgp.zip.test # looks good set gp = /usr/local/apache/htdocs/goldenPath/danRer2 rm $gp/bigZips/chromAgp.zip rm $gp/bigZips/md5sum.txt cp -p chromAgp.zip $gp/bigZips # go to directory with zip files and remake the md5sum.txt file cd $gp/bigZips md5sum *.zip > md5sum.txt # Remake chr1 zips using the re-masked chr1New.fa (2005-06-06, hartera) ssh kkstore01 cd /cluster/data/danRer2 # move old chr1 files temporarily out of the way mkdir 1/tmp mv 1/chr1.fa.out 1/tmp mv 1/chr1.fa 1/tmp mv 1/chr1.fa.masked 1/tmp rm zip/chromOut.zip zip/chromFa.zip zip/chromFaMasked.zip # chrom RepeatMasker out files zip -j zip/chromOut.zip */chr*.fa.out # soft masked chrom fasta zip -j zip/chromFa.zip */chr*.fa # hard masked chrom fasta zip -j zip/chromFaMasked.zip */chr*.fa.masked cd /cluster/data/danRer2/zip #- Check zip file integrity: foreach f (*.zip) unzip -t $f > $f.test tail -1 $f.test end wc -l *.zip.test ssh hgwdev cd /cluster/data/danRer2/zip set gp = /usr/local/apache/htdocs/goldenPath/danRer2 # copy over just those modified zip files cp -p chromOut.zip $gp/bigZips cp -p chromFa.zip $gp/bigZips cp -p chromFaMasked.zip $gp/bigZips # remove chr1.fa.zip and zip chr1New rm $gp/chromosomes/chr1.fa.zip zip -j $gp/chromosomes/chr1New.fa.zip ../1/chr1New.fa # replace md5sum cd $gp/bigZips rm md5sum.txt md5sum *.zip > md5sum.txt cd $gp/chromosomes rm md5sum.txt md5sum *.zip > md5sum.txt # Update README.txt for each to state that a new masked chr1 was produced # updated bigZips and chromosomes README.txt files with new ftp site # for the assembly sequence at Sanger (2005-08-04, hartera): # now ftp://ftp.ensembl.org/pub/assembly/zebrafish/Zv4release/ # MAKE VSHG17 DOWNLOADABLES (DONE, 2004-12-14, hartera) # REMAKE FOR CHAINS AND NET AFTER USING chainAntiRepeat # (DONE, 2004-12-21, hartera) ssh hgwdev cd /cluster/data/danRer2/bed/blastz.hg17.swap/axtChrom set gp = /usr/local/apache/htdocs/goldenPath/danRer2 mkdir -p $gp/vsHg17/axtChrom cp -p *.axt $gp/vsHg17/axtChrom cd $gp/vsHg17/axtChrom gzip *.axt md5sum *.gz > md5sum.txt # copy chains and net to downloads area # re-make chains and net downloadables (2004-12-21, hartera) rm $gp/vsHg17/human*.gz $gp/vsHg17/md5sum.txt cd /cluster/data/danRer2/bed/blastz.hg17.swap/axtChain gzip -c all.chain.antirepeat > /cluster/data/danRer2/zip/humanHg17.chain.gz gzip -c humanhg17.net > /cluster/data/danRer2/zip/humanHg17.net.gz cd $gp/vsHg17 mv /cluster/data/danRer2/zip/human*.gz . 
md5sum *.gz > md5sum.txt # Copy over & edit README.txt w/pointers to chain, net formats. # MAKE VSMM5 DOWNLOADABLES (DONE, 2004-12-14, hartera) # REMAKE FOR CHAINS AND NET AFTER USING chainAntiRepeat # (DONE, 2004-12-21, hartera) ssh hgwdev cd /cluster/data/danRer2/bed/blastz.mm5.swap/axtChrom set gp = /usr/local/apache/htdocs/goldenPath/danRer2 mkdir -p $gp/vsMm5/axtChrom cp -p *.axt $gp/vsMm5/axtChrom cd $gp/vsMm5/axtChrom gzip *.axt md5sum *.gz > md5sum.txt # copy chains and nets to downloads area # re-make chains and net downloadables (2004-12-21, hartera) rm $gp/vsMm5/mouse*.gz $gp/vsMm5/md5sum.txt cd /cluster/data/danRer2/bed/blastz.mm5.swap/axtChain gzip -c all.chain.antirepeat > /cluster/data/danRer2/zip/mouseMm5.chain.gz gzip -c mouseMm5.net > /cluster/data/danRer2/zip/mouseMm5.net.gz cd $gp/vsMm5 mv /cluster/data/danRer2/zip/mouse*.gz . md5sum *.gz > md5sum.txt # Copy over & edit README.txt w/pointers to chain, net formats. # BLASTZ HG17 CLEANUP (DONE, 2004-12-14, hartera) # RE-DONE (DONE, 2004-12-21, hartera) ssh kksilo cd /cluster/data/danRer2/bed/blastz.hg17.swap nice rm axtChain/run1/chain/* & nice rm -fr axtChain/n1 axtChain/noClass.net & nice gzip axtChrom/* pslChrom/* axtChain/all.chain axtChain/all.chain.unfiltered axtChain/*.net & nice gzip axtChain/all.chain.antirepeat axtChain/chainAR/*.chain & nice rm -fr axtChain/chain axtChain/chainRaw axtChain/preNet & # BLASTZ MM5 CLEANUP (DONE, 2004-12-14, hartera) # RE-DONE (DONE, 2004-12-21, hartera) ssh kksilo cd /cluster/data/danRer2/bed/blastz.mm5.swap nice rm axtChain/run1/chain/* & nice rm -fr axtChain/n1 axtChain/noClass.net & nice gzip axtChrom/* pslChrom/* axtChain/all.chain axtChain/all.chain.unfiltered axtChain/*.net & nice gzip axtChain/all.chain.antirepeat axtChain/chainAR/*.chain & nice rm -fr axtChain/chain axtChain/chainRaw axtChain/preNet & # BLASTZ FOR FUGU (fr1) (DONE, 2004-12-14, hartera) # Fugu is quite distant from zebrafish, more distant than human/chicken # so use the HoxD55.q matrix in future for blastz (2005-06-01, hartera) # Blastz requires lineage-specific repeats but there are none # for these two fish. ssh kk mkdir -p /cluster/data/danRer2/bed/blastz.fr1.2004-12-13 ln -s /cluster/data/danRer2/bed/blastz.fr1.2004-12-13 \ /cluster/data/danRer2/bed/blastz.fr1 cd /cluster/data/danRer2/bed/blastz.fr1 # use same parameters as for danRer1 vs fr1 cat << '_EOF_' > DEF # zebrafish (danRer2) vs. Fugu (fr1) export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/i386:/cluster/home/angie/schwartzbin ALIGN=blastz-run BLASTZ=blastz BLASTZ_H=2000 #BLASTZ_ABRIDGE_REPEATS=1 if SMSK is specified BLASTZ_ABRIDGE_REPEATS=0 # TARGET - zebrafish (danRer2) SEQ1_DIR=/iscratch/i/danRer2/nib SEQ1_RMSK= # lineage-specific repeats # we don't have that information for these species SEQ1_SMSK= SEQ1_FLAG= SEQ1_IN_CONTIGS=0 # 10 MB chunk for target SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY - Fugu (fr1) # soft-masked chrom nib SEQ2_DIR=/cluster/bluearc/fugu/fr1/chromNib SEQ2_RMSK= SEQ2_SMSK= SEQ2_FLAG= SEQ2_IN_CONTIGS=0 # 10 Mbase for query SEQ2_CHUNK=10000000 SEQ2_LAP=0 BASE=/cluster/data/danRer2/bed/blastz.fr1 DEF=$BASE/DEF RAW=$BASE/raw CDBDIR=$BASE SEQ1_LEN=$BASE/S1.len SEQ2_LEN=$BASE/S2.len #DEBUG=1 '_EOF_' # << this line keeps emacs coloring happy # save the DEF file in the current standard place chmod +x DEF cp DEF ~angie/hummus/DEF.danRer2-fr1.2004-12-13 # setup cluster run # source the DEF file bash . 
./DEF /cluster/data/danRer2/jkStuff/BlastZ_run0.sh cd run.0 # check batch looks ok then para try, check, push, check, .... # para time # Completed: 6055 of 6055 jobs # CPU time in finished jobs: 1440485s 24008.08m 400.13h 16.67d 0.046 y # IO & Wait Time: 132074s 2201.24m 36.69h 1.53d 0.004 y # Average job time: 260s 4.33m 0.07h 0.00d # Longest job: 2054s 34.23m 0.57h 0.02d # Submission to last job: 4060s 67.67m 1.13h 0.05d ssh kki cd /cluster/data/danRer2/bed/blastz.fr1 bash # if a csh/tcsh user . ./DEF /cluster/data/danRer2/jkStuff/BlastZ_run1.sh cd run.1 para try, check, push, etc ... # para time # Completed: 173 of 173 jobs # CPU time in finished jobs: 249s 4.16m 0.07h 0.00d 0.000 y # IO & Wait Time: 695s 11.58m 0.19h 0.01d 0.000 y # Average job time: 5s 0.09m 0.00h 0.00d # Longest job: 16s 0.27m 0.00h 0.00d # Submission to last job: 296s 4.93m 0.08h 0.00d # Third cluster run to convert lav's to axt's ssh kki cd /cluster/data/danRer2/bed/blastz.fr1 mkdir axtChrom # a new run directory mkdir run.2 cd run.2 cat << '_EOF_' > do.csh #!/bin/csh cd $1 cat `ls -1 *.lav | sort -g` \ | lavToAxt stdin /iscratch/i/danRer2/nib \ /cluster/bluearc/fugu/fr1/chromNib stdout \ | axtSort stdin $2 '_EOF_' # << this line makes emacs coloring happy chmod a+x do.csh cat << '_EOF_' > gsub #LOOP ./do.csh {check in exists $(path1)} {check out line+ /cluster/data/danRer2/bed/blastz.fr1/axtChrom/$(root1).axt} #ENDLOOP '_EOF_' # << this line makes emacs coloring happy \ls -1Sd ../lav/chr* > chrom.list gensub2 chrom.list single gsub jobList wc -l jobList head jobList para create jobList para try, check, push, check,... # para time # Completed: 28 of 28 jobs # CPU time in finished jobs: 71s 1.19m 0.02h 0.00d 0.000 y # IO & Wait Time: 765s 12.75m 0.21h 0.01d 0.000 y # Average job time: 30s 0.50m 0.01h 0.00d # Longest job: 66s 1.10m 0.02h 0.00d # Submission to last job: 223s 3.72m 0.06h 0.00d # translate sorted axt files into psl ssh kolossus cd /cluster/data/danRer2/bed/blastz.fr1 mkdir -p pslChrom set tbl = "blastzFr1" foreach f (axtChrom/chr*.axt) set c=$f:t:r echo "Processing chr $c" /cluster/bin/i386/axtToPsl $f S1.len S2.len pslChrom/${c}_${tbl}.psl end # Load database tables ssh hgwdev cd /cluster/data/danRer2/bed/blastz.fr1/pslChrom foreach f (./*.psl) /cluster/bin/i386/hgLoadPsl danRer2 $f echo "$f Done" end # featureBits -chrom=chr1 danRer2 refGene:cds blastzFr1 -enrichment # refGene:cds 0.512%, blastzFr1 10.880%, both 0.439%, cover 85.71%, enrich 7.88x # featureBits -chrom=chr1 danRer1 refGene:cds blastzFr1 -enrichment # refGene:cds 0.529%, blastzFr1 5.542%, both 0.463%, cover 87.50%, enrich 15.79x # CHAIN FUGU (fr1) BLASTZ (DONE, 2004-12-14, hartera) # APPLY chainAntiRepeat TO REMOVE CHAINS THAT ARE THE RESULTS OF REPEATS # AND DEGENERATE DNA (DONE, 2004-12-21, hartera) # Run axtChain on little cluster ssh kki cd /cluster/data/danRer2/bed/blastz.fr1 mkdir -p axtChain/run1 cd axtChain/run1 mkdir out chain ls -1S /cluster/data/danRer2/bed/blastz.fr1/axtChrom/*.axt \ > input.lst cat << '_EOF_' > gsub #LOOP doChain {check in exists $(path1)} {check out line+ chain/$(root1).chain} {check out line+ out/$(root1).out} #ENDLOOP '_EOF_' # << this line makes emacs coloring happy cat << '_EOF_' > doChain #!/bin/csh axtChain $1 /iscratch/i/danRer2/nib \ /cluster/bluearc/fugu/fr1/chromNib $2 >& $3 '_EOF_' # << this line makes emacs coloring happy chmod a+x doChain gensub2 input.lst single gsub jobList para create jobList para try, check, push, check... 
# para time # Completed: 28 of 28 jobs # CPU time in finished jobs: 402s 6.71m 0.11h 0.00d 0.000 y # IO & Wait Time: 276s 4.59m 0.08h 0.00d 0.000 y # Average job time: 24s 0.40m 0.01h 0.00d # Longest job: 56s 0.93m 0.02h 0.00d # Submission to last job: 101s 1.68m 0.03h 0.00d # now on the cluster server, sort chains ssh kksilo cd /cluster/data/danRer2/bed/blastz.fr1/axtChain chainMergeSort run1/chain/*.chain > all.chain chainSplit chain all.chain # take a look at score distr's foreach f (chain/*.chain) grep chain $f | awk '{print $2;}' | sort -nr > /tmp/score.$f:t:r echo $f:t:r >> hist5000.out textHistogram -binSize=5000 /tmp/score.$f:t:r >> hist5000.out echo "" end # filter with minScore=5000 mv all.chain all.chain.unfiltered chainFilter -minScore=5000 all.chain.unfiltered > all.chainFilt5k rm -r chain chainSplit chain all.chainFilt5k # remove repeats from chains and reload into database # (2004-12-21, hartera) ssh kksilo cd /cluster/data/danRer2/bed/blastz.fr1/axtChain gunzip all.chainFilt5k.gz chainSplit chainRaw all.chainFilt5k mkdir chain cd chainRaw foreach f (*.chain) set c = $f:r echo $c nice chainAntiRepeat /iscratch/i/danRer2/nib \ /cluster/bluearc/fugu/fr1/chromNib $f \ ../chain/$c.chain end cd .. chainMergeSort ./chain/*.chain > all.chain.antirepeat chainSplit chainAR all.chain.antirepeat # Load chains into database ssh hgwdev cd /cluster/data/danRer2/bed/blastz.fr1/axtChain/chainAR foreach i (*.chain) set c = $i:r hgLoadChain danRer2 ${c}_chainFr1 $i echo done $c end # featureBits -chrom=chr1 danRer2 refGene:cds chainFr1 -enrichment # refGene:cds 0.512%, chainFr1 23.334%, both 0.451%, cover 88.07%, enrich 3.77x # featureBits -chrom=chr1 danRer2 refGene:cds chainFr1Link -enrichment # refGene:cds 0.512%, chainFr1Link 7.794%, both 0.426%, cover 83.16%, enrich 10.67x # featureBits -chrom=chr1 danRer1 refGene:cds chainFr1 -enrichment # refGene:cds 0.529%, chainFr1 19.003%, both 0.479%, cover 90.41%, enrich 4.76x # featureBits -chrom=chr1 danRer1 refGene:cds chainFr1Link -enrichment # refGene:cds 0.529%, chainFr1Link 4.686%, both 0.450%, cover 85.11%, enrich 18.16x # after chainAntiRepeat: # featureBits -chrom=chr1 danRer2 refGene:cds chainFr1Link -enrichment # refGene:cds 0.512%, chainFr1Link 7.726%, both 0.426%, cover 83.16%, # enrich 10.76x # NET FUGU (fr1) BLASTZ (DONE, 2004-12-14, hartera) # REMADE NET FOR FUGU (2004-12-15, hartera) # RE-DO NET WITH CHAINS FILTERED BY chainAntiRepeat (DONE, 2004-12-21,hartera) ssh kksilo cd /cluster/data/danRer2/bed/blastz.fr1/axtChain mkdir preNet cd chainAR foreach i (*.chain) echo preNetting $i /cluster/bin/i386/chainPreNet $i ../../S1.len ../../S2.len \ ../preNet/$i end cd .. mkdir n1 cd preNet foreach i (*.chain) set n = $i:r.net echo primary netting $i /cluster/bin/i386/chainNet $i -minSpace=1 ../../S1.len ../../S2.len \ ../n1/$n /dev/null end cd .. 
cat n1/*.net | /cluster/bin/i386/netSyntenic stdin noClass.net # memory usage 108306432, utime 690 s/100, stime 58 # Add classification info using db tables: ssh hgwdev cd /cluster/data/danRer2/bed/blastz.fr1/axtChain # use -noAr option - don't look for ancient repeats # redo net as danRer1 was used for netClass instead of danRer2 (2004-12-15) # and load new net into database netClass -noAr noClass.net danRer2 fr1 fuguFr1.net # Load the nets into database ssh hgwdev cd /cluster/data/danRer2/bed/blastz.fr1/axtChain netFilter -minGap=10 fuguFr1.net | hgLoadNet danRer2 netFr1 stdin # featureBits danRer2 refGene:cds netFr1 -enrichment # refGene:cds 0.468%, netFr1 19.389%, both 0.411%, cover 87.95%, enrich 4.54x # featureBits danRer1 refGene:cds netFr1 -enrichment # refGene:cds 0.461%, netFr1 17.530%, both 0.391%, cover 84.78%, enrich 4.84x # after chainAntiRepeat: # featureBits danRer2 refGene:cds netFr1 -enrichment # refGene:cds 0.468%, netFr1 19.372%, both 0.412%, cover 87.97%, enrich 4.54x # MAKE VSFR1 DOWNLOADABLES (DONE, 2004-12-15, hartera) # REPLACE FR1 NET WITH NEW VERSION IN DOWNLOADS (2004-12-15, hartera) # REMAKE FOR CHAINS AND NET AFTER USING chainAntiRepeat # (DONE, 2004-12-21, hartera) ssh hgwdev cd /cluster/data/danRer2/bed/blastz.fr1/axtChrom set gp = /usr/local/apache/htdocs/goldenPath/danRer2 mkdir -p $gp/vsFr1/axtChrom cp -p *.axt $gp/vsFr1/axtChrom cd $gp/vsFr1/axtChrom gzip *.axt md5sum *.gz > md5sum.txt # copy chains and nets to downloads area # re-make chains and net downloadables (2004-12-21, hartera) rm $gp/vsFr1/fugu*.gz $gp/vsFr1/md5sum.txt cd /cluster/data/danRer2/bed/blastz.fr1/axtChain gzip -c all.chain.antirepeat > /cluster/data/danRer2/zip/fuguFr1.chain.gz gzip -c fuguFr1.net > /cluster/data/danRer2/zip/fuguFr1.net.gz cd $gp/vsFr1 mv /cluster/data/danRer2/zip/fugu*.gz . md5sum *.gz > md5sum.txt # Copy over & edit README.txt w/pointers to chain, net formats. # BLASTZ FR1 CLEANUP (DONE, 2004-12-15, hartera) # RE-DONE (DONE, 2004-12-21, hartera) ssh kksilo cd /cluster/data/danRer2/bed/blastz.fr1 nice rm axtChain/run1/chain/* & nice rm -fr axtChain/n1 axtChain/noClass.net & nice gzip axtChrom/* pslChrom/* axtChain/all.chain axtChain/all.chain.unfiltered axtChain/all.chainFilt5k axtChain/*.net & nice gzip axtChain/all.chain.antirepeat axtChain/chainAR/*.chain & nice rm -fr axtChain/chain axtChain/chainRaw axtChain/preNet & # BLASTZ FOR TETRAODON (tetNig1) (DONE, 2004-12-21, hartera) # Tetraodon is quite distant from zebrafish, more distant than human/chicken # so use the HoxD55.q matrix in future for blastz (2005-06-01, hartera) # blastz requires lineage-specific repeats but there are none # available between these two fish species ssh kk mkdir -p /cluster/data/danRer2/bed/blastz.tetNig1.2004-12-21 ln -s /cluster/data/danRer2/bed/blastz.tetNig1.2004-12-21 \ /cluster/data/danRer2/bed/blastz.tetNig1 cd /cluster/data/danRer2/bed/blastz.tetNig1 # use tetraodon sequence in contigs for dynamic masking - see below # for dynamic masking: M=50 cat << '_EOF_' > DEF # zebrafish (danRer2) vs. tetraodon (tetNig1)
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/i386:/cluster/home/angie/schwartzbin # use BLASTZ_M=50 in blastz-run1 ALIGN=blastz-run1 BLASTZ=blastz BLASTZ_H=2500 #BLASTZ_ABRIDGE_REPEATS=1 if SMSK is specified BLASTZ_ABRIDGE_REPEATS=0 # TARGET - zebrafish (danRer2) SEQ1_DIR=/iscratch/i/danRer2/nib SEQ1_RMSK= # lineage-specific repeats # we don't have that information for these species SEQ1_SMSK= SEQ1_FLAG= SEQ1_IN_CONTIGS=0 # 500 kb chunk for target SEQ1_CHUNK=500000 SEQ1_LAP=500 # QUERY - Tetraodon (tetNig1) # soft-masked sequence in contigs SEQ2_DIR=/iscratch/i/tetNig1/contigs SEQ2_RMSK= SEQ2_SMSK= SEQ2_FLAG= SEQ2_IN_CONTIGS=1 SEQ2_CHUNK= SEQ2_LAP= BASE=/cluster/data/danRer2/bed/blastz.tetNig1 DEF=$BASE/DEF RAW=$BASE/raw CDBDIR=$BASE SEQ1_LEN=$BASE/S1.len SEQ2_LEN=$BASE/S2.len #DEBUG=1 '_EOF_' # << this line keeps emacs coloring happy # save the DEF file in the current standard place chmod +x DEF cp DEF ~angie/hummus/DEF.danRer2-tetNig1.2004-12-21 # setup cluster run # source the DEF file bash . ./DEF /cluster/data/danRer2/jkStuff/BlastZ_run0.sh cd run.0 # check batch looks ok then para try, check, push, check, .... # para time # Completed: 3193 of 3193 jobs # CPU time in finished jobs: 4460310s 74338.50m 1238.97h 51.62d 0.141 y # IO & Wait Time: 41176s 686.27m 11.44h 0.48d 0.001 y # Average job time: 1410s 23.50m 0.39h 0.02d # Longest job: 2398s 39.97m 0.67h 0.03d # Submission to last job: 12372s 206.20m 3.44h 0.14d ssh kki cd /cluster/data/danRer2/bed/blastz.tetNig1 bash # if a csh/tcsh user . ./DEF /cluster/data/danRer2/jkStuff/BlastZ_run1.sh cd run.1 para try, check, push, etc ... # para time # Completed: 3193 of 3193 jobs # CPU time in finished jobs: 327s 5.46m 0.09h 0.00d 0.000 y # IO & Wait Time: 8483s 141.38m 2.36h 0.10d 0.000 y # Average job time: 3s 0.05m 0.00h 0.00d # Longest job: 9s 0.15m 0.00h 0.00d # Submission to last job: 560s 9.33m 0.16h 0.01d # Third cluster run to convert lav's to axt's ssh kki cd /cluster/data/danRer2/bed/blastz.tetNig1 mkdir axtChrom # a new run directory mkdir run.2 cd run.2 cat << '_EOF_' > do.csh #!/bin/csh cd $1 cat `ls -1 *.lav | sort -g` \ | lavToAxt -fa stdin /iscratch/i/danRer2/nib \ /iscratch/i/tetNig1/contigs/tetNig1Contigs.fa stdout \ | axtSort stdin $2 '_EOF_' # << this line makes emacs coloring happy chmod a+x do.csh cat << '_EOF_' > gsub #LOOP ./do.csh {check in exists $(path1)} {check out line+ /cluster/data/danRer2/bed/blastz.tetNig1/axtChrom/$(root1).axt} #ENDLOOP '_EOF_' # << this line makes emacs coloring happy \ls -1Sd ../lav/chr* > chrom.list gensub2 chrom.list single gsub jobList wc -l jobList head jobList para create jobList para try, check, push, check,... 
# para time # Completed: 28 of 28 jobs # CPU time in finished jobs: 224s 3.74m 0.06h 0.00d 0.000 y # IO & Wait Time: 1240s 20.66m 0.34h 0.01d 0.000 y # Average job time: 52s 0.87m 0.01h 0.00d # Longest job: 148s 2.47m 0.04h 0.00d # Submission to last job: 157s 2.62m 0.04h 0.00d # lift up query sequences #- Lift up the 500KB chunk .axt's to 5MB ("pseudo-contig") level ssh kksilo cd /cluster/data/danRer2/bed/blastz.tetNig1 foreach d (axtChrom/chr*.axt) echo $d liftUp -axtQ ${d}.out.axt \ /cluster/data/tetNig1/bed/blastzSelf/contigSeqs/500kbcontigs.lft warn $d \ > /dev/null end #- Lift pseudo-contigs to chromosome level # check this is working correctly set dr = "/cluster/data/danRer2" foreach c (`cat ${dr}/chrom.lst`) echo lifting $c liftUp -axtQ ./axtChrom/chr${c}.out2.axt \ /cluster/data/tetNig1/jkStuff/liftAll.lft warn \ ./axtChrom/chr${c}.axt.out.axt > /dev/null end cd axtChrom foreach c (`cat ${dr}/chrom.lst`) mv chr${c}.axt chr${c}.axt.old mv chr${c}.out2.axt chr${c}.axt rm chr${c}.axt.out.axt end # translate sorted axt files into psl ssh kolossus cd /cluster/data/danRer2/bed/blastz.tetNig1 mkdir -p pslChrom # need to copy S2.len for whole tetNig1 genome - S2contigs.len cp /cluster/data/tetNig1/bed/blastzSelf/S1.len S2contigs.len set tbl = "blastzTetNig1" foreach f (axtChrom/chr*.axt) set c=$f:t:r echo "Processing chr $c" /cluster/bin/i386/axtToPsl $f \ S1.len S2contigs.len pslChrom/${c}_${tbl}.psl end # Load database tables ssh hgwdev cd /cluster/data/danRer2/bed/blastz.tetNig1/pslChrom foreach f (./*.psl) /cluster/bin/i386/hgLoadPsl danRer2 $f echo "$f Done" end # H=2000 # featureBits -chrom=chr1 danRer2 refGene:cds blastzTetNig1 -enrichment # refGene:cds 0.512%, blastzTetNig1 22.057%, both 0.435%, cover 84.94%, # enrich 3.85x # H=2000 and L=8000 # featureBits -chrom=chr1 danRer2 refGene:cds blastzTetNig1L8k -enrichment # refGene:cds 0.512%, blastzTetNig1L8k 16.326%, both 0.401%, cover 78.28%, # enrich 4.80x # use query in contigs in one file and use dynamic masking with M=100 # should mask out more sequence in query, tetraodon has no specific repeats # library so it is not fully repeat masked. H=2500, M=100 # featureBits -chrom=chr1 danRer2 refGene:cds blastzTetNig1Contigs -enrichment # refGene:cds 0.512%, blastzTetNig1Contigs 13.052%, both 0.427%, cover 83.43%, # enrich 6.39x # contigs used as above but H=2000 #featureBits -chrom=chr1 danRer2 refGene:cds blastzTetNig1ContigsH2k -enrichment # refGene:cds 0.512%, blastzTetNig1ContigsH2k 17.517%, both 0.433%, # cover 84.51%, enrich 4.82x # contigs used with H=2500, L=6000 # refGene:cds 0.512%, blastzTetNig1ContigsL6k 11.380%, both 0.415%, # cover 80.94%, enrich 7.11x # use M=50 and H=2500 #featureBits -chrom=chr1 danRer2 refGene:cds blastzTetNig1ContigsM50 -enrichment # refGene:cds 0.512%, blastzTetNig1ContigsM50 12.643%, both 0.427%, # cover 83.42%, enrich 6.60x # Using contigs and dynamic masking improves the enrichment and makes little # difference to coverage. The blastz track % is lower since more sequence # is being masked and, since coverage is not much different, it is probably # being masked in non-coding regions. # at chr1:6,128,109-6,135,734 there are a lot of short alignments in low # complexity regions that are removed using L=6000. 
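# (How to read the featureBits -enrichment lines above: cover is
# both/refGene:cds and enrich is cover/track%. Worked example with the
# M50 row: 0.427/0.512 = 83.4% cover, and 83.42/12.643 = 6.60x enrich,
# matching the reported values.)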
# Use contigs with H=2500, keep lower scoring alignments until after chaining # Number of rows: # blastzTetNig1 2360350 # blastzTetNig1L8k 1177874 # blastzTetNig1Contigs 725076 # blastzTetNig1ContigsH2k 825927 # blastzTetNig1ContigsL6k 538149 # blastzTetNig1ContigsM50 479228 # Using M=50 reduces the number of alignments but coverage is the same as # for using M=100 so use the dynamic masking with M=50 and H=2500 # CHAIN TETRAODON (tetNig1) BLASTZ (DONE, 2004-12-21, hartera) # Run axtChain on little cluster ssh kki cd /cluster/data/danRer2/bed/blastz.tetNig1 mkdir -p axtChain/run1 cd axtChain/run1 mkdir out chain ls -1S /cluster/data/danRer2/bed/blastz.tetNig1/axtChrom/*.axt \ > input.lst cat << '_EOF_' > gsub #LOOP doChain {check in exists $(path1)} {check out line+ chain/$(root1).chain} {check out line+ out/$(root1).out} #ENDLOOP '_EOF_' # << this line makes emacs coloring happy cat << '_EOF_' > doChain #!/bin/csh axtChain $1 /iscratch/i/danRer2/nib \ /iscratch/i/tetNig1/nib $2 >& $3 '_EOF_' # << this line makes emacs coloring happy chmod a+x doChain gensub2 input.lst single gsub jobList para create jobList para try, check, push, check... # para time # Completed: 28 of 28 jobs # CPU time in finished jobs: 446s 7.43m 0.12h 0.01d 0.000 y # IO & Wait Time: 355s 5.92m 0.10h 0.00d 0.000 y # Average job time: 29s 0.48m 0.01h 0.00d # Longest job: 98s 1.63m 0.03h 0.00d # Submission to last job: 98s 1.63m 0.03h 0.00d # now on the cluster server, sort chains ssh kksilo cd /cluster/data/danRer2/bed/blastz.tetNig1/axtChain mkdir chain # remove repeats from chains cd run1/chain foreach f (*.chain) set c = $f:r echo $c nice chainAntiRepeat /iscratch/i/danRer2/nib \ /iscratch/i/tetNig1/nib $f \ ../../chain/$c.chain end cd /cluster/data/danRer2/bed/blastz.tetNig1/axtChain chainMergeSort ./chain/*.chain > all.chain.antirepeat chainSplit chainAR all.chain.antirepeat # take a look at score distr's foreach f (chain/*.chain) grep chain $f | awk '{print $2;}' | sort -nr > /tmp/score.$f:t:r echo $f:t:r >> hist5000.out textHistogram -binSize=5000 /tmp/score.$f:t:r >> hist5000.out echo "" end # filter with minScore=5000 mv all.chain.antirepeat all.chainAR.unfiltered chainFilter -minScore=5000 all.chainAR.unfiltered > all.chainARFilt5k rm -r chain chainSplit chainAR all.chainARFilt5k # before filtering, only chr1 had more than 100,000 chains with a score # of 5000 or less # Load chains into database ssh hgwdev cd /cluster/data/danRer2/bed/blastz.tetNig1/axtChain/chainAR foreach i (*.chain) set c = $i:r hgLoadChain danRer2 ${c}_chainTetNig1 $i echo done $c end # featureBits -chrom=chr1 danRer2 refGene:cds chainTetNig1Link -enrichment # refGene:cds 0.512%, chainTetNig1Link 11.005%, both 0.416%, cover 81.22%, # enrich 7.38x # NET TETRAODON (tetNig1) BLASTZ (DONE, 2004-12-21, hartera) ssh kksilo cd /cluster/data/danRer2/bed/blastz.tetNig1/axtChain mkdir preNet cd chainAR foreach i (*.chain) echo preNetting $i /cluster/bin/i386/chainPreNet $i ../../S1.len ../../S2contigs.len \ ../preNet/$i end cd .. mkdir n1 cd preNet foreach i (*.chain) set n = $i:r.net echo primary netting $i /cluster/bin/i386/chainNet $i -minSpace=1 ../../S1.len \ ../../S2contigs.len ../n1/$n /dev/null end cd .. 
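# (This is the same chain -> net pipeline used for hg17 and mm5 above;
# the only difference is S2contigs.len on the query side, since the
# tetNig1 query was aligned as contigs rather than chromosome nibs.)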
cat n1/*.net | /cluster/bin/i386/netSyntenic stdin noClass.net # memory usage 102502400, utime 670 s/100, stime 69 # Add classification info using db tables: ssh hgwdev cd /cluster/data/danRer2/bed/blastz.tetNig1/axtChain # use -noAr option - don't look for ancient repeats netClass -noAr noClass.net danRer2 tetNig1 tetraTetNig1.net # Load the nets into database ssh hgwdev cd /cluster/data/danRer2/bed/blastz.tetNig1/axtChain netFilter -minGap=10 tetraTetNig1.net | hgLoadNet danRer2 netTetNig1 stdin # featureBits danRer2 refGene:cds netTetNig1 -enrichment # refGene:cds 0.468%, netTetNig1 19.068%, both 0.410%, cover 87.50%, # enrich 4.59x # BACENDS (DONE, 2005-05-20, hartera) # Removed incorrect row from bacCloneXRef table (DONE, 2006-04-19, hartera) # BAC end data provided by Anthony DiBiase, Yi Zhou and Leonard Zon at the # Boston Children's Hospital. Collaborated with them on this track. # Anthony DiBiase: adibiase@enders.tch.harvard.edu # Yi Zhou: yzhou@enders.tch.harvard.edu # BAC clone end sequences are from Robert Geisler's lab, # Max Planck Institute for Developmental Biology, Tuebingen, Germany # REMAKE AND RELOAD bacCloneAlias AND bacCloneXRef TABLES AFTER FIXING # BUGS IN THE zfishBacClonesandSts PROGRAM TO STORE ALL THE ALIASES FOR EACH # SANGER STS NAME AND TO STORE CORRECT INTERNAL AND EXTERNAL NAMES IN ALL # CASES. DATA FILE OF UniSTS IDS AND SANGER STS NAMES WAS ALSO RE-CREATED # AS THIS WAS INCORRECT. (DONE, 2005-05-24, hartera) # ALSO DO TESTING OF THE DATA IN THESE TABLES TO MAKE SURE IT IS CORRECT - SEE # SECTION ON TESTING BELOW. ssh kksilo mkdir -p /cluster/data/danRer2/bed/ZonLab/bacends cd /cluster/data/danRer2/bed/ZonLab/bacends # copy BAC ends sequence from previous assembly cp /cluster/data/danRer1/bed/ZonLab/bacends/bacends.fa . faSize bacends.fa # 486978153 bases (39070196 N's 447907957 real 447907957 upper 0 lower) # in 594614 sequences in 1 files # Total size: mean 819.0 sd 230.3 min 0 (zKp108-H09.za) max 5403 (zC259G13.zb) median 796 # N count: mean 65.7 sd 154.7 # U count: mean 753.3 sd 302.6 # L count: mean 0.0 sd 0.0 ssh kkr1u00 cd /cluster/data/danRer2/bed/ZonLab/bacends mkdir -p /iscratch/i/danRer2/bacends # if not split already, split up sequence for cluster runs faSplit sequence bacends.fa 20 /iscratch/i/danRer2/bacends/bacends # iSync bacends to kilokluster /cluster/bin/iSync ssh kk cd /cluster/data/danRer2/bed/ZonLab/bacends ls -1S /iscratch/i/danRer2/bacends/*.fa > bacends.lst ls -1S /iscratch/i/danRer2/trfFa/*.fa > genome.lst cat << '_EOF_' > template #LOOP /cluster/bin/i386/blat $(path1) $(path2) -tileSize=10 -ooc=/iscratch/i/danRer2/10.ooc {check out line+ psl/$(root1)_$(root2).psl} #ENDLOOP '_EOF_' # << this line keeps emacs coloring happy mkdir psl gensub2 genome.lst bacends.lst template jobList para create jobList para try, check, push, check, ... # para time # Completed: 5980 of 5980 jobs # CPU time in finished jobs: 5317533s 88625.55m 1477.09h 61.55d 0.169 y # IO & Wait Time: 34446s 574.10m 9.57h 0.40d 0.001 y # Average job time: 895s 14.92m 0.25h 0.01d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 9892s 164.87m 2.75h 0.11d # Submission to last job: 12309s 205.15m 3.42h 0.14d # back on kksilo (now kkstore01), filter and lift results: ssh kksilo cd /cluster/data/danRer2/bed/ZonLab/bacends pslSort dirs raw.psl temp psl # took 9 hours # With minCover=0.4, 78% of sequences are aligned # No minCover, aligns 90% of sequences but with a large number # of extra alignments - a lot of noise even compared to using minCover of # 0.05. 
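# (Sketch of how one row of the filter comparison below can be
# regenerated from raw.psl - the output filename is illustrative only:
#   nice pslReps -nearTop=0.02 -minCover=0.10 -minAli=0.92 -noIntrons \
#       raw.psl cov10ali92.psl /dev/null
#   awk '{print $10;}' cov10ali92.psl | sort | uniq | wc -l  # sequences aligned
#   wc -l cov10ali92.psl  # alignments, minus any psl header lines )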
# Total BAC ends sequences: 594614 # minCover minAli #alignments #sequences aligned %total sequences aligned # 0 0.85 13969946 535465 90 # 0.05 0.85 8140775 531299 89 # 0.10 0.75 5099823 519238 87 # 0.10 0.85 5098714 519084 87 # 0.10 0.90 5096205 518616 87 # 0.10 0.92 4984170 515024 87 # 0.10 0.95 4264695 498380 84 # 0.20 0.85 4515869 501415 84 # 0.40 0.85 3806487 464067 78 # Using minCover of 0.10 seems good, there are 55017 extra BAC ends # being aligned with an extra 1292227 alignments compared to 0.40, this # is an average of 23.5 alignments per extra sequence compared to 8 alignments # per sequence on average for minCover of 0.40. # Using minAli of 0.90 for minCover 0.10 gives 468 less sequences # aligned and 2509 less alignments. Using minAli of 0.75 increases # the number of alignments by about 1100 compared to using 0.85 and # sequences aligned only increased by about 150. Using minAli of 0.95 # for minCover of 0.10 gives 834,019 less alignments with 20704 less # sequences being aligned than for using minAli of 0.85. # for minAli=0.92 and minCover=0.10, there are 114,544 less alignments # than for minAli=0.85 and 4060 less sequences are aligned. # try minAli=0.85 and minCover=0.40 and also minAli=0.92 and # minCover=0.10 and compare. More alignments are gained - see below # but minCover=0.10 means there are a lot of alignments where < 40% of the # BAC end aligns and this does not seem reliable so stick to minCover=0.4 # Stats for minCover=0.10, minAli=0.92: # Pairs: there are 205360 orphans and 97458 pairs, 1014 long, 19738 mismatch # and 239 slop # Singles: 17829 bacEnds.orphan and 205360 from pair analysis so a total of # 223189 orphans, 7828 more than for danRer1 and more than for using # minCover=0.40 and minAli=0.85. # for minCov=0.10, minAli=0.92: # 90450 pairs.txt # 69699 pairs.txt.uniq - about 4500 more than for minCover=0.40 and minAli=0.85 # 202661 singles.txt # 178337 singles.txt.uniq - about 5399 more than for minCover=0.40,minAli=0.85 # Below, used same parameters as for danRer1: minCover=0.40 and minAli=0.85 # all files below and tables are derived from the alignments using # these parameters for pslReps filtering. # location of store8 has changed to kkstore01 instead of kksilo so now # login there (2005-04-28) ssh kkstore01 cd /cluster/data/danRer2/bed/ZonLab/bacends nice pslReps -nearTop=0.02 -minCover=0.40 -minAli=0.85 -noIntrons raw.psl \ bacEnds.psl /dev/null liftUp bacEnds.lifted.psl /cluster/data/danRer2/jkStuff/liftAll.lft \ warn bacEnds.psl # Got 299 lifts in /cluster/data/danRer2/jkStuff/liftAll.lft wc -l bacEnds.lifted.psl # 3806457 bacEnds.lifted.psl # 4984175 (for minCov=0.10, minAli=0.92) rm -r temp pslCheck bacEnds.lifted.psl >& pslCheck.log # 536 problems with overlapping exons reported by pslCheck awk '{print $10;}' bacEnds.lifted.psl | sort | uniq > bacEnds.names.uniq # edit to remove header lines wc -l bacEnds.names.uniq # 464067 bacEnds.names.uniq grep '>' bacends.fa | wc -l # 594614 bacends.fa # 78% of BAC ends have aligned # Need to add accession information and BAC end sequence pairs mkdir -p /cluster/data/danRer2/bed/ZonLab/bacends/bacends.1 cd /cluster/data/danRer2/bed/ZonLab/bacends/bacends.1 grep ">" ../bacends.fa > bacEnds # remove '>' at beginning of each line perl -pi.bak -e 's/>//' bacEnds # remove ^M char from end of lines ssh hgwdev cd /cluster/data/danRer2/bed/ZonLab/bacends/bacends.1 # copy over necessary files from danRer1 set dr1Bacs = "/cluster/data/danRer1/bed/ZonLab/bacends" cp $dr1Bacs/bacends.1/stripEndChar.pl.gz . 
cp $dr1Bacs/bacends.1/BACEnd_accessions.txt.gz . cp $dr1Bacs/bacends.1/zfish_accs.txt . cp $dr1Bacs/bacends.1/getBacEndInfo.pl.gz . gunzip *.gz ssh kkstore01 cd /cluster/data/danRer2/bed/ZonLab/bacends/bacends.1 perl stripEndChar.pl < bacEnds > allBacEnds.names rm bacEnds bacEnds.bak # Kerstin Jekosch at the Sanger Centre: kj2@sanger.ac.uk # provided accession data for BAC End clones - zfish_accs.txt # and for BAC End pairs - BACEnd_accessions.txt - put these in bacends.1 dir # write and use perl script to split BAC Ends into pairs and singles # bacEndPairs.txt and bacEndSingles.txt # For pslPairs, the reverse primer (T7 or .z*) should be in the first column # and the forward primer (SP6 or .y*) should be in the second column # There are several reads for some sequences and these have similar names. # Reads for the same sequence should be in a comma separated list. # Sequence read names can be translated to external clone names as found # in NCBI's clone registry using the name without the suffix and the # translation of prefixes as follows after removal of dashes # and extra zeros in the name: # library external prefix internal prefix # CHORI-211 CH211- zC # DanioKey DKEY- zK # DanioKey Pilot DKEYP- zKp # RZPD-71 RP71- bZ # BUSM1 (PAC) BUSM1- dZ # e.g. zC001-A03.za becomes CH211-1A03 # make table of accessions for BAC ends to load into bacEndAlias table # find all suffixes used for sequence names cat << '_EOF_' > getSuffixes.pl #!/usr/bin/perl -w use strict; while (<STDIN>) { chomp; my $l = $_; if ($l =~ /^[t1_]*[z|Z|b|d][C|K|Z]p?[0-9]+\-?[a-zA-Z]+[0-9]+\.?([A-Z0-9a-z]+)/) { my $suffix = $1; print "$suffix\n"; } } '_EOF_' chmod +x getSuffixes.pl perl getSuffixes.pl < allBacEnds.names > allBacEnds.suffixes sort allBacEnds.suffixes | uniq > allBacEnds.suffixes.uniq wc -l allBacEnds.suffixes.uniq # 22 different suffixes # SP6, SP6A,SP6W,T7,T7A,T7W,Za,p1kSP6,p1kSP6w,q1kT7,q1kT7w,sp6,t7,ya # yb,yc,yd,yt,za,zb,zc,zd # get suffixes used in BAC ends accessions file perl getSuffixes.pl < BACEnd_accessions.txt > BACEnd.names.suffixes sort BACEnd.names.suffixes | uniq # SP6, SP6A, T7 and T7A - all these endings occur in list of BAC ends # in FASTA file but if the ending is something different, need to add "A" # to SP6 and T7 and check this ending also in the BAC end accessions list # edit getBacEndInfo.pl to make sure it includes all these suffixes # it also treats zK32A7SP6 and zK32A7SP6A as different perl getBacEndInfo.pl allBacEnds.names BACEnd_accessions.txt > bacEnds.log wc -l *.txt # 31317 BACEnd_accessions.txt # 180280 bacEndPairs.txt # 16020 bacEndSingles.txt # 594614 direction.txt wc -l bacEndAccs.aliases # 47558 bacEndAccs.aliases # bacEndAccs.aliases contains sequence read names and their # Genbank accessions. 
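# (Sketch of the cross-check described next; it assumes the read names
# sit in comma-separated lists in the first two columns of
# bacEndPairs.txt and bacEndSingles.txt - the exact column layout is an
# assumption here:
#   awk '{print $1"\n"$2}' bacEndPairs.txt bacEndSingles.txt \
#       | tr ',' '\n' | grep -v '^$' | sort | uniq > namesInLists.txt
#   sort allBacEnds.names | uniq | comm -23 - namesInLists.txt | wc -l
# 0 expected, i.e. no read name is missing from the two lists)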
# checked that all names from allBacEnds.names appear in either # bacEndPairs.txt or bacEndSingles.txt # First process BAC end alignments ssh kkstore01 mkdir -p /cluster/data/danRer2/bed/ZonLab/bacends/pairs cd /cluster/data/danRer2/bed/ZonLab/bacends/pairs set bacDir = /cluster/data/danRer2/bed/ZonLab/bacends/bacends.1 # these bacEnds vary in size from around 2800 bp to 626,000 bp ~/bin/i386/pslPairs -tInsert=10000 -minId=0.91 -noBin -min=2000 \ -max=650000 -slopval=10000 -hardMax=800000 -slop -short -long -orphan \ -mismatch -verbose ../bacEnds.lifted.psl $bacDir/bacEndPairs.txt \ all_bacends bacEnds wc -l bacEnds.* # There are 1411 more orphans and 18147 more pairs than for danRer1 # 980 bacEnds.long # 18083 bacEnds.mismatch # 201646 bacEnds.orphan # 90967 bacEnds.pairs # 0 bacEnds.short # 190 bacEnds.slop # create header required by "rdb" tools ssh hgwdev cd /cluster/data/danRer2/bed/ZonLab/bacends/pairs echo 'chr\tstart\tend\tclone\tscore\tstrand\tall\tfeatures\tstarts\tsizes'\ > ../header echo '10\t10N\t10N\t10\t10N\t10\t10\t10N\t10\t10' >> ../header # make pairs bed file cat ../header bacEnds.pairs | row score ge 300 | sorttbl chr start \ | headchg -del > bacEndPairs.bed # also need to process bacEndSingles.txt into a database table # for singles in bacEndSingles.txt, create a dummy file where they # are given zJA11B12T7 as dummy sequence pair. If the single is a forward # sequence, put the dummy sequence in the second column, if the single is # a reverse sequence put it in the first column. Use a perl script to do this. ssh hgwdev cd /cluster/data/danRer2/bed/ZonLab/bacends set bacDir = /cluster/data/danRer2/bed/ZonLab/bacends/bacends.1 mkdir singles cd singles cp /cluster/data/danRer1/bed/ZonLab/bacends/singles/formatSingles.pl.gz . gunzip formatSingles.pl.gz perl formatSingles.pl $bacDir/bacEndSingles.txt > \ $bacDir/bacEndSingles.format # then run pslPairs on this formatted sequence ~/bin/i386/pslPairs -tInsert=10000 -minId=0.91 -noBin -min=2000 \ -max=650000 -slopval=10000 -hardMax=800000 -slop -short -long -orphan \ -mismatch -verbose ../bacEnds.lifted.psl $bacDir/bacEndSingles.format \ all_bacends bacEnds wc -l bacEnds* # 0 bacEnds.long # 0 bacEnds.mismatch # 15853 bacEnds.orphan # 0 bacEnds.pairs # 0 bacEnds.short # 0 bacEnds.slop # there are 15853 orphans here (14126 in danRer1) and 201646 from # pair analysis, so a total of 217499 orphans (2138 more than danRer1) cat bacEnds.orphan ../pairs/bacEnds.orphan > bacEnds.singles wc -l bacEnds.singles # 217499 bacEnds.singles # make singles bed file cat ../header bacEnds.singles | row score ge 300 | sorttbl chr start \ | headchg -del > bacEndSingles.bed cp bacEndSingles.bed ../pairs cd ../pairs # all slop, short, long, mismatch and orphan pairs go into bacEndPairsBad # since orphans are already in bacEndSingles, do not add these cat ../header bacEnds.slop bacEnds.short bacEnds.long bacEnds.mismatch \ bacEnds.orphan | row score ge 300 | sorttbl chr start \ | headchg -del > bacEndPairsBad.bed # add bacEndSingles.bed to bacEnds.load.psl - must not add pair orphans # twice so create a bed file of bacEndPairsBadNoOrphans.bed without orphans cat ../header bacEnds.slop bacEnds.short bacEnds.long bacEnds.mismatch \ | row score ge 300 | sorttbl chr start \ | headchg -del > bacEndPairsBadNoOrphans.bed extractPslLoad -noBin ../bacEnds.lifted.psl bacEndPairs.bed \ bacEndPairsBadNoOrphans.bed bacEndSingles.bed | \ sorttbl tname tstart | headchg -del > bacEnds.load.psl # 
There are rows where the alignments were the same but the lfNames are # different. This is due to the presence of multiple reads for the # same BAC end sequence. Sometimes they are slightly different lengths # so the alignments are a little different. It would be good to consolidate # all of these. Firstly, the identical rows were merged into one with a # list of all the lfNames corresponding to that alignment. # load all alignments into temporary database ssh hgwdev echo "create database bacsDr2_rah;" | hgsql danRer2 cd /cluster/data/danRer2/bed/ZonLab/bacends/pairs hgLoadBed bacsDr2_rah bacEndPairs bacEndPairs.bed \ -sqlTable=$HOME/kent/src/hg/lib/bacEndPairs.sql # Loaded 84405 elements of size 11 # Loaded 72272 elements of size 11 (danRer1) # create a bacEndSingles table like bacEndPairs if not created already cp $HOME/kent/src/hg/lib/bacEndPairs.sql ../singles/bacEndSingles.sql # edit to give correct table name # or copy over table definition from danRer1 cp /cluster/data/danRer1/bed/ZonLab/bacends/singles/bacEndSingles.sql.gz \ ../singles/ gunzip ../singles/bacEndSingles.sql.gz hgLoadBed bacsDr2_rah bacEndSingles bacEndSingles.bed \ -sqlTable=../singles/bacEndSingles.sql # Loaded 196492 elements of size 11 # Loaded 203683 elements of size 11 (danRer1) # load all_bacends later # NOTE - this track isn't pushed to RR, just used for assembly QA # Use bacEndPairsBadNoOrphans.bed as orphans are in the singles bed file hgLoadBed bacsDr2_rah bacEndPairsBad bacEndPairsBadNoOrphans.bed \ -sqlTable=$HOME/kent/src/hg/lib/bacEndPairsBad.sql # Loaded 18544 elements of size 11 # Need to consolidate similar rows for bacEndPairs and bacEndSingles - same # name, different lfNames and same alignments. mkdir -p /cluster/data/danRer2/bed/ZonLab/bacends/duplicates cd /cluster/data/danRer2/bed/ZonLab/bacends/duplicates hgsql -N -e "select chrom, chromStart, chromEnd, name, strand from \ bacEndPairs order by name, chrom, chromStart;" \ bacsDr2_rah > pairs.txt sort pairs.txt | uniq -c > pairs.txt.uniq wc -l pairs* # 84405 pairs.txt # 65105 pairs.txt.uniq # for danRer1: # 72272 pairs.txt # 53992 pairs.txt.uniq # for replicate rows, find all the unique lfNames and put these # in one row with the relevant lfStarts, lfSizes and correct lfCount cp \ /cluster/data/danRer1/bed/ZonLab/bacends/duplicates/removeReplicates.pl.gz . gunzip removeReplicates.pl.gz # modify script so that the database to use is supplied as an argument nice perl removeReplicates.pl pairs.txt.uniq bacEndPairs bacsDr2_rah \ > pairsNoReps.bed # repeat for singles hgsql -N -e "select chrom, chromStart, chromEnd, name, strand from \ bacEndSingles order by name, chrom, chromStart;" bacsDr2_rah \ > singles.txt sort singles.txt | uniq -c > singles.txt.uniq wc -l singles* # 196492 singles.txt # 173025 singles.txt.uniq # 203683 singles.txt (danRer1) # 179170 singles.txt.uniq (danRer1) nice perl removeReplicates.pl singles.txt.uniq bacEndSingles bacsDr2_rah \ > singlesNoReps.bed # Using badPairs with no orphans hgsql -N -e "select chrom, chromStart, chromEnd, name, strand from \ bacEndPairsBad order by name, chrom, chromStart;" bacsDr2_rah \ > badPairs.txt sort badPairs.txt | uniq -c > badPairs.txt.uniq wc -l badPairs* # 18544 badPairs.txt # 13803 badPairs.txt.uniq nice perl removeReplicates.pl badPairs.txt.uniq bacEndPairsBad bacsDr2_rah \ > badPairsNoReps.bed # sort by chrom, chromStart - done already # Wrote perl script to choose 2 BAC ends to represent each BAC clone. 
# since there are often more than one read for each BAC end in this set, # 2 were chosen for each BAC pair or 1 for the singles. This was based on # the ones that had the largest region aligned (using lfSizes). ssh kkstore01 cd /cluster/data/danRer2/bed/ZonLab/bacends/duplicates # run perl script: input bed file, pairs or singles, name of output file perl pickLfNames.pl pairsNoReps.bed pairs pairs2lfNames.bed mv error.log log.pairs # this log.pairs lists the 30 cases where alignments for a BAC clone use # a different pair of sequence reads for the ends. These were checked out # and, in most cases, the alignments are almost identical or overlap for # the most part so it is ok to have these extra alignments missing. # CH211-69C14: zC069-C14.ya hits almost 100,000 bp downstream # of the .yb version. It is slightly longer and it has a good hit by BLAT # also to chrom 15. This is the alignment that is missing of the two for # this clone. # run script for singles: perl pickLfNames.pl singlesNoReps.bed singles singles1lfName.bed mv error.log log.singles # Singles: there are 80 cases where the alignments for the BAC clone use # a different sequence for the reads. badPairs: there are 13 cases. # These were not included in the new bed file and they were checked out - # the BAC clone names for these are in the log file. # For Singles: In all but 3 cases, the alignments were almost # identical or mostly overlapped the alignments for the other sequence # read chosen to represent that BAC end in the bed file. # CH211-98J20, CH211-98O21 and CH211-98G4 have alignments for other # sequence reads for the same end represented but the alignments are to # different chromosomes (see singlesNoReps.bed). perl pickLfNames.pl badPairsNoReps.bed pairs badPairs2lfNames.bed mv error.log log.badPairs # for each of these new bed files, checks were made that there are # only 2 BAC ends per alignment for pairs and 1 for singles. # For each pair, there should only be 2 ends which can appear either # way round depending on the orientation and there should be 1 end for # the beginning (suffix T7, t7 or z) and one end for the end # (suffix SP6, sp6 or y) for each BAC clone. These can appear as e.g. # either zK7B23T7,zK7B23SP6 or zK7B23SP6,zK7B23T7 for the opposite # orientation. For singles, there should be a single BAC end for each # alignment and for each BAC clone, a sequence for either or both types # of ends may appear e.g. zK153P14SP6 and zK153P14T7 appear in separate # alignments. # Finally overlaps in BAC clone names were checked. All BAC clones # represented in each of the pairs, badPairs and singles bed files are # unique to that file. Between all three bed files, 173188 BAC clones # have alignments. ssh hgwdev # NOTE: using sort and uniq on hgwdev produces tab delimited output # after merging rows with the same BAC name, the scoring is now # wrong in the bed files. # Scores should be 1000 if there is 1 row for that name, else # 1500/number of rows for that sequence name - calculated by pslPairs. # Correct the scores. mkdir -p /cluster/data/danRer2/bed/ZonLab/bacends/scores cd /cluster/data/danRer2/bed/ZonLab/bacends/scores # copy over correctScores.pl script from danRer1 cp /cluster/data/danRer1/bed/ZonLab/bacends/scores/correctScores.pl.gz . cp /cluster/data/danRer1/bed/ZonLab/bacends/scores/checkScores.pl.gz . gunzip *.gz # modify correctScores.pl to correctScores2.pl with extra argument to # specify if there is a bin field. 
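# (The scoring rule restated as a sketch - correctScores2.pl is assumed
# to apply this per clone name, bed column 4, to whole bed lines; the
# integer truncation here only approximates the rounding that happens on
# database load:
#   awk '{n[$4]++} END {for (c in n) \
#       printf "%s\t%d\n", c, (n[c]==1 ? 1000 : 1500/n[c])}' \
#       ../duplicates/pairs2lfNames.bed )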
awk '{print $4}' ../duplicates/pairs2lfNames.bed \ | sort | uniq -c > pairs.hits perl correctScores2.pl ../duplicates/pairs2lfNames.bed pairs.hits noBin \ > bacEndPairsGoodScores.bed # same for singles awk '{print $4}' ../duplicates/singles1lfName.bed \ | sort | uniq -c > singles.hits perl correctScores2.pl ../duplicates/singles1lfName.bed singles.hits noBin \ > bacEndSinglesGoodScores.bed # and for badPairs awk '{print $4}' ../duplicates/badPairs2lfNames.bed \ | sort | uniq -c > badPairs.hits perl correctScores2.pl ../duplicates/badPairs2lfNames.bed badPairs.hits \ noBin > bacEndPairsBadGoodScores.bed # check that the scores are correct now: awk 'BEGIN {OFS = "\t"} {print $4, $5}' bacEndPairsGoodScores.bed \ | sort | uniq -c > pairs.count perl checkScores.pl < pairs.count # all the BAC clones should be in good.txt and none in bad.txt # wc -l should give same number of lines in good.txt as in pairs.hits # check all the GoodScores bed files in this manner - all are correct # for singles, 3 end up in bad.txt but it is a rounding error, # there are 7 alignments for each of these: CH211-210N9, CH211-30H16 # and DKEY-31H18. Score is: 214.285714285714 # this will be rounded to 214 when loading the database table # Now load database tables: hgLoadBed danRer2 bacEndPairs bacEndPairsGoodScores.bed \ -sqlTable=$HOME/kent/src/hg/lib/bacEndPairs.sql # Loaded 65075 elements of size 11 hgLoadBed danRer2 bacEndSingles bacEndSinglesGoodScores.bed \ -sqlTable=../singles/bacEndSingles.sql # Loaded 172945 elements of size 11 # load of bacEndSingles did not go as planned: 172945 record(s), # 0 row(s) skipped, 29 warning(s) loading bed.tab # warnings are unknown but all of bed file loaded and the number # of warnings is small so ignore hgLoadBed danRer2 bacEndPairsBad bacEndPairsBadGoodScores.bed \ -sqlTable=$HOME/kent/src/hg/lib/bacEndPairsBad.sql # Loaded 13790 elements of size 11 # load BAC end sequences into seq table so alignments may be viewed # symlink to bacends.fa sequences in danRer1 mkdir -p /gbdb/danRer2/bacends ln -s /cluster/data/danRer1/bed/ZonLab/bacends/bacends.fa \ /gbdb/danRer2/bacends/BACends.fa hgLoadSeq danRer2 /gbdb/danRer2/bacends/BACends.fa ssh kkstore01 cd /cluster/data/danRer2/bed/ZonLab/bacends/pairs # for all_bacends table, just load the alignments for those sequences # represented in the bacEndPairs, bacEndSingles and bacEndPairsBad tables # bacEnds.load.psl is the file of alignments # get all the names of sequences foreach f (../scores/*GoodScores.bed) echo $f awk '{print $11;}' $f >> allBacEnds.names end wc -l allBacEnds.names # 251810 allBacEnds.names # this is the total number of lines in the *GoodScores.bed files perl -pi.bak -e 's/,/\n/g' allBacEnds.names sort allBacEnds.names | uniq > allBacEnds.names.uniq # write script to go through and pick alignments cat > getAlignments.pl << 'EOF' #!/usr/bin/perl -w use strict; my $bacAligns = $ARGV[0]; # psl alignments of BAC ends my $bacs = $ARGV[1]; # list of BAC ends for which to select alignments open(ALIGNS, $bacAligns) || die "Can not open $bacAligns: $!\n"; open(BACS, $bacs) || die "Can not open $bacs:$!\n"; my %alignsHash; # read alignments and store while (<ALIGNS>) { my $l = $_; my @fields = split(/\t/, $l); # hash key is BAC end name push (@{$alignsHash{$fields[9]}}, $l); } close ALIGNS; while (<BACS>) { chomp; my $end = $_; if (exists($alignsHash{$end})) { my @als = @{$alignsHash{$end}}; foreach my $a (@als) { print $a; } } } close BACS; 'EOF' chmod +x getAlignments.pl # get alignments for just the BAC ends that are in the database tables 
perl getAlignments.pl bacEnds.load.psl allBacEnds.names.uniq \
    > bacEnds.load.inTables.psl
# check that alignments are present for all the BAC ends in the list
# alignments for all BAC ends in allBacEnds.names.uniq are there
ssh hgwdev
cd /cluster/data/danRer2/bed/ZonLab/bacends/pairs
# load all_bacends table
hgLoadPsl danRer2 -table=all_bacends bacEnds.load.inTables.psl
# load of all_bacends did not go as planned: 2615155 record(s), 0 row(s)
# skipped, 81 warning(s) loading psl.tab
ssh kkstore01
cd /cluster/data/danRer2/bed/ZonLab/bacends/bacends.1
# make bacEndAlias table with Genbank accessions for ends
# need to run getBacEndInfo.pl for the BAC end names in the
# BAC tables.
# in the pairs directory, there is the allBacEnds.names.uniq file
# so use this.
mkdir bacEndAccs
cd bacEndAccs
perl ../getBacEndInfo.pl ../../pairs/allBacEnds.names.uniq \
    ../BACEnd_accessions.txt > bacEndAccs.aliases.log
mv bacEndAccs.aliases bacEndAccs.aliases.intables
ssh hgwdev
cd /cluster/data/danRer2/bed/ZonLab/bacends/bacends.1/bacEndAccs
echo "drop table bacEndAlias" | hgsql danRer2
hgsql danRer2 < $HOME/kent/src/hg/lib/bacEndAlias.sql
echo "load data local infile 'bacEndAccs.aliases.intables' into table \
    bacEndAlias" | hgsql danRer2
# clonemarkers.02.12.04.txt, ctgnames.02.12.04.txt and markers.02.12.04.txt
# already downloaded from Sanger for danRer1 (see makeDanRer1.doc)
ssh kkstore01
mkdir -p /cluster/data/danRer2/bed/ZonLab/bacends/cloneandStsAliases
cd /cluster/data/danRer2/bed/ZonLab/bacends/cloneandStsAliases
set dr1=/cluster/data/danRer1/bed/ZonLab/bacends/cloneandStsAliases
set dr2=/cluster/data/danRer2/bed/ZonLab/bacends/cloneandStsAliases
ln -s $dr1/clonemarkers.02.12.04.txt $dr2
ln -s $dr1/ctgnames.02.12.04.txt $dr2
ln -s $dr1/markers.02.12.04.txt $dr2
ln -s $dr1/zfish_accs.txt $dr2
ssh hgwdev
cd /cluster/data/danRer2/bed/ZonLab/bacends/cloneandStsAliases
hgsql -N -e 'select name from bacEndPairs;' danRer2 > bacEnds.names
hgsql -N -e 'select name from bacEndSingles;' danRer2 >> bacEnds.names
hgsql -N -e 'select name from bacEndPairsBad;' danRer2 >> bacEnds.names
sort bacEnds.names | uniq > bacEnds.names.uniq
# 173188 bacEnds.names.uniq
# 594614 bacEnd sequences to start with
# some of the bacEnd sequences are replicates so count lfNames from table
hgsql -N -e 'select lfNames from bacEndPairs;' danRer2 > bacEnds.lfNames
hgsql -N -e 'select lfNames from bacEndSingles;' danRer2 >> bacEnds.lfNames
hgsql -N -e 'select lfNames from bacEndPairsBad;' danRer2 >> bacEnds.lfNames
perl -pi.bak -e 's/,/\n/g' bacEnds.lfNames
sort bacEnds.lfNames | uniq > bacEnds.lfNames.uniq
# 303946 bacEnds.lfNames.uniq
# pslCheck reports 536 bad alignments
# get the names from the psl file
awk '{print $10;}' ../bacEnds.psl > bacEndsPsl.names
# edit to remove first few lines with no names
sort bacEndsPsl.names | uniq > bacEndsPsl.names.uniq
# 464067 bacEndsPsl.names.uniq
# about 78% align - after filtering
# Add an alias table for BAC clones
# bacCloneAlias.sql is in $HOME/kent/src/hg/lib - see makeDanRer1.doc
# Add an xref table to give external clone registry names, internal names,
# Sanger name, relationship between STS and BAC clone (method of finding
# STS), UniSTS ID, chromosome(s) to which the BAC clone is mapped by BLAT,
# Genbank accession and STS primer sequences
# bacCloneXRef.sql is in $HOME/kent/src/hg/lib - see makeDanRer1.doc
ssh hgwdev
cd /cluster/data/danRer2/bed/ZonLab/bacends/cloneandStsAliases
wc -l *.02.12.04.txt
#  28305 clonemarkers.02.12.04.txt
# 167441 ctgnames.02.12.04.txt
#  12008 markers.02.12.04.txt
# create list of BAC clones in the tables with the chromosome aligned to
hgsql -N -e 'select name, chrom from bacEndPairs;' danRer2 \
    > bacClones.namesandchrom
hgsql -N -e 'select name, chrom from bacEndSingles;' danRer2 \
    >> bacClones.namesandchrom
sort bacClones.namesandchrom | uniq > bacClones.namesandchrom.uniq
# use zfish_accs.txt provided by the Sanger Centre to create a list
# of BAC clone names, internal names and Genbank accessions
ssh kkstore01
cd /cluster/data/danRer2/bed/ZonLab/bacends/cloneandStsAliases
# get list of UniSTS IDs using aliases to search alias file
# print Sanger name, alias and UniSTS ID; based on print_markers2.pl,
# a modified version of a program written by Tony DiBiase (Boston
# Children's Hospital)
# create find_markers3.pl as the previous version was finding too
# many UniSTS IDs for some markers because it used grep
# (2005-05-24, hartera)
cat << '_EOF_' > find_markers3.pl
# example:
# perl find_markers3.pl UniSTS.aliases markers.02.12.04.txt
use strict;
my $verbose = 0;
my ($a, $b, $f, $m, $s, $t, $aliases, @alias, @rest);
my $aliasFile = $ARGV[0];
my $markersFile = $ARGV[1];
open(ALIAS, $aliasFile) || die "Can not open $aliasFile\n";
open(MARKERS, $markersFile) || die "Can not open $markersFile\n";
# store aliases from aliasFile
my ($id, $al, @alsArray, %aliasHash);
while (<ALIAS>) {
   chomp;
   ($id, $al) = split /\t/;
   @alsArray = split(/;/, $al);
   foreach my $as (@alsArray) {
      push (@{$aliasHash{$as} }, $id);
   }
}
close ALIAS;
while (<MARKERS>) {
   my @idArray;
   ($f, $t, $m, $idArray[0]) = (0, 0, 0, 0);
   my @ids;
   chomp;
   ($a, $b, $aliases, @rest) = split /\|/;
   if ($verbose > 3) { printf "aliases $aliases \n"; }
   @alias = split /;/, $aliases;
   ALIAS: foreach $s (@alias) {
      if ($s =~ /[\D]+/) {
         if ($verbose > 5) { printf "this $s \n"; }
         if (exists($aliasHash{$s})) {
            @idArray = @{$aliasHash{$s}};
         }
         if ($idArray[0]) {
            $f = 1;
            $t = $s;
            @ids = @idArray;
            if ($verbose) { printf "this $s found $m \n"; }
            last ALIAS;
         }
      }
   }
   if ($f) {
      my @sNames = split(/;/, $b);
      foreach my $sn (@sNames) {
         foreach my $i (@ids) {
            printf "$sn\t$i\n";
         }
      }
   }
}
close MARKERS;
'_EOF_'
chmod +x find_markers3.pl
perl find_markers3.pl /cluster/data/ncbi/UniSTS.2005-04-12/UniSTS.aliases \
    markers.02.12.04.txt > sangerandUniSTSId.txt
# No need to reformat this for zfishBacClonesandSts
# FPC contig information (i.e. FPC contig number) from ctgnames file is
# not included in the tables as these are dynamic and constantly
# changing with the assembly.
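# As a quick sanity check (illustrative only, not part of the original
# run), confirm the field layout find_markers3.pl assumes: markers lines
# are pipe-delimited with Sanger names in field 2, aliases in field 3 and
# primers in fields 4-5, so at least 5 fields per line; UniSTS.aliases
# lines are tab-delimited ID/alias pairs, so 2 fields per line.
awk 'BEGIN {FS="|"} {print NF}' markers.02.12.04.txt | sort -n | uniq -c
awk 'BEGIN {FS="\t"} {print NF}' \
    /cluster/data/ncbi/UniSTS.2005-04-12/UniSTS.aliases | sort -n | uniq -c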
# Received a new version of the zfish_accs.txt file:
# zfish_accs050605.txt (from Mario Caccamo at Sanger), move this to
# /cluster/data/danRer2/bed/ZonLab/bacends/cloneandStsAliases
awk 'BEGIN {OFS="\t"} {print $1, $2}' zfish_accs.txt | sort \
    > zfishOld.accs.sort
awk 'BEGIN {OFS="\t"} {print $1, $2}' zfish_accs050605.txt \
    | sort > zfishNew.accs.sort
comm -13 zfishOld.accs.sort zfishNew.accs.sort > zfishNewAccs.only
comm -23 zfishOld.accs.sort zfishNew.accs.sort > zfishOldAccs.only
# 1974 zfishNewAccs.only
# 303 zfishOldAccs.only
# this has 1974 new BAC clone and accession pairs, and 303 of the previous
# BAC internal name and accession pairs are missing compared to
# zfish_accs.txt
awk '{print $1};' zfishOldAccs.only | sort > zfishOld.namesonly.sort
awk '{print $1};' zfishNewAccs.only | sort > zfishNew.namesonly.sort
# also uniq and check no name has more than one accession - each BAC
# name only appears once in the file
comm -12 zfishOld.namesonly.sort zfishNew.namesonly.sort \
    > accs.OldandNew.common
wc -l accs.OldandNew.common
# 24 accs.OldandNew.common
# therefore 24 BAC names have new accessions in the new file
# (a join-based sketch for listing these appears at the end of this section)
# zfishBacClonesandSts.c only reads the first two fields of each line
# in the zfish_accs.txt file so create a new file with just these 2 fields.
# Take the 303 BAC clones and accessions from zfish_accs.txt that are
# not in zfish_accs050605.txt and add them to the 9000 accessions in
# zfishNew.accs.sort
cat zfishNew.accs.sort zfishOldAccs.only > zfish_accsMerged.txt
# 9303 zfish_accsMerged.txt - BAC clone internal names and accessions
# use zfishBacClonesandSts to create tab files for loading into
# bacCloneAlias and bacCloneXRef tables
mkdir -p /cluster/bluearc/danRer2/bacEnds/out
# added code so that output directory now must be specified
# fixed bug in code so that all aliases are now stored for each Sanger
# STS name and the correct BAC clone external name is stored for internal
# names in all cases.
# remake tab files using the new file mapping Sanger names and UniSTS IDs
# and with the corrected program (2005-05-24, hartera)
nice $HOME/bin/i386/zfishBacClonesandSts ctgnames.02.12.04.txt \
    clonemarkers.02.12.04.txt markers.02.12.04.txt \
    bacClones.namesandchrom.uniq zfish_accsMerged.txt \
    sangerandUniSTSId.txt \
    /cluster/bluearc/danRer2/bacEnds/out > zfishBacs.out
# output is in /cluster/bluearc/danRer2/bacEnds/out so copy over
# sort alias tab file by sangerName
sort -k2 /cluster/bluearc/danRer2/bacEnds/out/bacAlias.tab \
    > bacAlias.sort.tab
cp /cluster/bluearc/danRer2/bacEnds/out/bacXRef.tab .
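# An equivalent way to list the 24 clones whose accession changed (a
# sketch for illustration; this was not run as part of the build): join
# the old and new name/accession pairs on clone name and keep rows where
# the accessions differ.
join zfishOld.accs.sort zfishNew.accs.sort \
    | awk '$2 != $3 {print $1, $2, $3}' > accsChanged.txt
wc -l accsChanged.txt
# should be 24, matching accs.OldandNew.common above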
wc -l *.tab # 55836 bacAlias.sort.tab # 233418 bacXRef.tab # there are 234056 entries for danRer1 bacXRef.tab - the difference # reflects differences in BAC ends mapped to the assemblies ssh hgwdev cd /cluster/data/danRer2/bed/ZonLab/bacends/cloneandStsAliases # Remove and reload tables after bug fixes in the zfishBacClonesandSts # program and the new UniSTS IDs and Sanger names mapping file which # were used to create these tab files (2005-05-25, hartera) hgsql danRer2 -e 'drop table bacCloneXRef' hgsql danRer2 -e 'drop table bacCloneAlias' hgsql danRer2 < $HOME/kent/src/hg/lib/bacCloneAlias.sql hgsql danRer2 < $HOME/kent/src/hg/lib/bacCloneXRef.sql nice hgsql danRer2 -e \ 'load data local infile "bacAlias.sort.tab" into table bacCloneAlias' nice hgsql danRer2 -e \ 'load data local infile "bacXRef.tab" into table bacCloneXRef' cd $HOME/kent/src/hg/makeDb/trackDb/zebrafish/danRer2 # checked data in tables to check that everything was correctly loaded # from the files and errors were corrected - see TEST section below # edit trackDb.ra to add bacEnds tracks and searches for the bacEndPairs # and bacEndSingles tracks as for danRer1. copy over html from danRer1 # for bacEndPairs and bacEndSingles tracks. Edit to include description # of filtering alignments so that there is only one pair of sequence reads # for each BAC pair and only one sequence read for each BAC single alignment. # Remove incorrect row from bacCloneXRef table (hartera, 2006-04-19) # There is one row where the name is "CH211-155M11__" and the # intName is "zC155M11__" which is incorrect. # There is a correct row for intName zC155M11 and zC155M11__ and # CH211-155M11__ is not in bacEnd{Singles,Pairs,PairsBad} tables and # zC155M11__ is not an alias in bacEndAlias. hgsql -e 'delete from bacCloneXRef where name = "CH211-155M11__";' danRer2 # corrected termRegex for some bacCloneXRef searches so that they work # correctly (bacPairsSangerSts and bacSinglesSangerSts) # (2006-04-19, hartera) # BACENDS: TESTING FOR bacCloneAlias and bacCloneXRef TABLES # (DONE, 2005-05-25, hartera) # ADDED TEST TO CHECK SANGERNAMES IN ALIAS AND XREF TABLES # (DONE, 2005-05-26, hartera) # The following tests were carried out to check that all the data # in the bacCloneAlias and bacCloneXRef tables is correct. 
mkdir -p \
    /cluster/data/danRer2/bed/ZonLab/bacends/cloneandStsAliases/testTables
cd /cluster/data/danRer2/bed/ZonLab/bacends/cloneandStsAliases/testTables
# Check that the correct aliases are associated with their Sanger STS names
awk 'BEGIN {FS="|"} {OFS="\t"} {print $2, $3;}' \
    ../markers.02.12.04.txt > sNameandaliases
# write script to get one Sanger name and one alias on each line
# (a rough equivalent is sketched at the end of this section)
chmod +x getSangerAndAlias.pl
perl getSangerAndAlias.pl < sNameandaliases > sNameandaliases.format
sort sNameandaliases.format | uniq > sNameandaliases.sort
# get Sanger names and aliases from database
hgsql -N -e 'select sangerName, alias from bacCloneAlias;' danRer2 \
    | sort | uniq > alias.db.sort
wc -l alias.db.sort
# 55836 alias.db.sort
diff sNameandaliases.sort alias.db.sort
# No difference between data file and data from database so ok
# Check Sanger STS names correspond in the bacCloneAlias and bacCloneXRef
# tables (2005-05-26, hartera)
# get Sanger names from alias table
hgsql -N -e 'select sangerName from bacCloneAlias;' danRer2 \
    | sort | uniq > sName.alias.sort
wc -l sName.alias.sort
# 14940 sName.alias.sort
# get Sanger names from xRef table
hgsql -N -e 'select sangerName from bacCloneXRef where sangerName \
    is not null;' danRer2 | sort | uniq > sName.xRef.sort
wc -l sName.xRef.sort
# 15153 sName.xRef.sort
comm -23 sName.alias.sort sName.xRef.sort
# nothing in alias file only so all Sanger names in the alias table are
# also in the xRef table
comm -13 sName.alias.sort sName.xRef.sort > sNamexRefNotAlias
wc -l sNamexRefNotAlias
# 213 sNamexRefNotAlias - 213 sangerNames in the xRef table that are not
# in the alias table
# get Sanger names from clonemarkers file
awk 'BEGIN {FS="|"}{print $2}' ../clonemarkers.02.12.04.txt | sort | uniq \
    > clonemarkers.sNames.sort
# get Sanger names from markers file
awk 'BEGIN {FS="|"}{print $2}' ../markers.02.12.04.txt > markers.sNames
# remove semi-colons and sort
sed -e 's/;/\n/g' markers.sNames | sort | uniq > markers.sNames.sort
# sanger names unique to markers file
comm -13 clonemarkers.sNames.sort markers.sNames.sort
# there are none
comm -23 clonemarkers.sNames.sort markers.sNames.sort \
    > sNames.clonemarkersOnly
wc -l sNames.clonemarkersOnly
# 213 sNames.clonemarkersOnly
diff sNames.clonemarkersOnly sNamexRefNotAlias
# no difference so all the extra Sanger names in the xRef table are
# from the clonemarkers file; these have no aliases in the markers file,
# so they are not in the alias table, and this is all ok.
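# getSangerAndAlias.pl itself is not reproduced in this doc; a rough awk
# equivalent of what it must do, assuming input lines of
# "sangerName<TAB>alias1;alias2;...", is:
awk 'BEGIN {FS="\t"; OFS="\t"} \
    {n = split($2, a, ";"); \
     for (i = 1; i <= n; i++) if (a[i] != "") print $1, a[i];}' \
    sNameandaliases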
# Check that Sanger STS names and primers are associated correctly cd /cluster/data/danRer2/bed/ZonLab/bacends/cloneandStsAliases/testTables # get sanger names and primers from markers file awk 'BEGIN {FS="|"} {OFS="\t"} {print $2, $4, $5;}' \ ../markers.02.12.04.txt > sNameandPrimers # write script to reformat and write with one Sanger name per line chmod +x getSangerandPrimers.pl perl getSangerandPrimers.pl < sNameandPrimers > sNameandPrimers.format sort sNameandPrimers.format > sNameandPrimers.format.sort wc -l sNameandPrim* # 12008 sNameandPrimers # 14940 sNameandPrimers.format # 14940 sNameandPrimers.format.sort # get Sanger names and primers from database hgsql -N -e \ 'select sangerName, leftPrimer, rightPrimer from bacCloneXRef \ where sangerName is not null and leftPrimer is not null and \ rightPrimer is not null;' danRer2 | sort | uniq \ > sNamesandprimers.fromdb.sort wc -l sNamesandprimers.fromdb.sort # 14940 sNamesandprimers.fromdb.sort diff sNamesandprimers.fromdb.sort sNameandPrimers.format.sort # No difference so ok # Check that UniSTS IDs and Sanger STS names are associated correctly # get Sanger names and UniSTS IDs from the database hgsql -N -e 'select sangerName, uniStsId from bacCloneXRef where \ uniStsId is not null;' danRer2 | sort | uniq > sNameUniSTS.fromdb.sort wc -l sNameUniSTS.fromdb.sort # 5456 sNameUniSTS.fromdb.sort # Need to reformat the sNameUniSTS.fromdb.sort chmod +x formatUniSts.pl perl formatUniSts.pl < sNameUniSTS.fromdb.sort | sort \ > sNameUniSTS.fromdb.format.sort # get Sanger names from data file and see how many UniSTS IDs there are # for each name awk '{print $1}' ../sangerandUniSTSId.txt | sort | uniq -c | sort -nr \ > sangerandUniSTSId.count # the most is 3 UniSTS IDs # 3 etID9056.23 # 3 etID9042.2 # 3 etID8627.2 # 3 etID8281.9 # 3 etID11096.5 # 2 etID9986.14 # 2 etID9968.17 sort ../sangerandUniSTSId.txt > sangerandUniSTSId.txt.sort diff sangerandUniSTSId.txt.sort sNameUniSTS.fromdb.format.sort \ > sangerandUniSTSIdvsdb # No difference between data from original file and that in database so ok # Check that chrom mappings and external BAC clone names are correct # get extNames and chroms they map to from the database hgsql -N -e 'select name, chroms from bacCloneXRef where \ chroms is not null;' danRer2 | sort | uniq \ > nameandchromsfromdb.sort # reformat nameandchromsfromdb.sort perl formatUniSts.pl < nameandchromsfromdb.sort | sort \ > nameandchromsfromdb.format.sort # compare extNames and chroms from db to those in data file diff ../bacClones.namesandchrom.uniq nameandchromsfromdb.format.sort # no difference - all ok # Check Genbank accessions and internal BAC clone names hgsql -N -e 'select intName,genbank from bacCloneXRef where \ genbank is not null;' danRer2 | sort | uniq \ > intNamesandAccs.fromdb.sort # this should be a subset of zfish_accsMerged.txt - not all BAC clones # listed here appear in either our BAC ends tracks or the markers files. 
sort ../zfish_accsMerged.txt > zfish_accsMerged.sort
comm -23 intNamesandAccs.fromdb.sort zfish_accsMerged.sort
# there is nothing in the database that is not in zfish_accsMerged.sort
comm -13 intNamesandAccs.fromdb.sort zfish_accsMerged.sort > onlyinzfishAccs
wc -l onlyinzfishAccs
# 87 onlyinzfishAccs
hgsql -N -e 'select intName from bacCloneXRef where genbank is null;' \
    danRer2 | sort | uniq > intNamesNoAcc.fromdb.sort
awk '{print $1;}' zfish_accsMerged.sort | sort > intNames.withAccs.sort
comm -12 intNamesNoAcc.fromdb.sort intNames.withAccs.sort \
    > indbNoAccsandAccs.out
# none of these names are common to both so all accessions from
# zfish_accsMerged.txt are in the database for the internal names stored
# where there are accessions available.
# Test Sanger STS names, internal names and external names are all correct
# Test Sanger STS name and internal BAC clone names are associated correctly
# get internal names and Sanger names from data file
awk 'BEGIN {FS="|"} {OFS="\t"} {print $1,$2}' ../clonemarkers.02.12.04.txt \
    | sort | uniq > intNameandSanger.sort
hgsql -N -e 'select intName, sangerName from bacCloneXRef \
    where sangerName is not null;' danRer2 \
    | sort | uniq > intNameandSanger.fromdb.sort
diff intNameandSanger.sort intNameandSanger.fromdb.sort
# No difference between data from file and that from database so ok
# Check BAC clone internal name and relationship fields
# get internal names and relationships from data file
awk 'BEGIN {FS="|"} {OFS="\t"} {print $1,$3}' ../clonemarkers.02.12.04.txt \
    | sort | uniq > intNameandRelation.sort
# get internal names and relationships from database
hgsql -N -e 'select intName, relationship from bacCloneXRef \
    where relationship != 0;' danRer2 \
    | sort | uniq > intNameandrelation.fromdb.sort
# differences unique to database file
comm -13 intNameandRelation.sort intNameandrelation.fromdb.sort \
    > intNameRelation.indbonly
# differences unique to data file
comm -23 intNameandRelation.sort intNameandrelation.fromdb.sort \
    > intNameRelation.incloneMarkersonly
wc -l intNameRelation*
# 4345 intNameRelation.incloneMarkersonly
# 4345 intNameRelation.indbonly
awk '{print $1}' intNameRelation.indbonly > intNameRelation.indbonly.names
awk '{print $1}' intNameRelation.incloneMarkersonly \
    > intNameRelation.incloneMarkersonly.names
diff intNameRelation.indbonly.names intNameRelation.incloneMarkersonly.names
# there is no difference in the internal names with relationship fields.
# Since the names match, the only place these files should differ is the
# second column, which should always be 3 in the data from the database.
# This is because all the relationship entries that were blank in the
# clonemarkers file were changed to 3 when entered into the database.
awk '{print $2}' intNameRelation.indbonly | sort | uniq
# 3 - correct so all ok
# all the differences are that those that are blank in clonemarkers
# are 3 in the database.
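# To make the blank-to-3 rule concrete (illustrative only; this was not
# run during the build), the transform can be applied directly to the
# data file and the result compared with the database dump:
awk 'BEGIN {FS="|"; OFS="\t"} {r = ($3 == "") ? 3 : $3; print $1, r}' \
    ../clonemarkers.02.12.04.txt | sort | uniq \
    > intNameandRelation.asdb.sort
diff intNameandRelation.asdb.sort intNameandrelation.fromdb.sort
# should show no difference if the load behaved as described above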
# check that those that have 0 in the database bacCloneXRef relationship
# field are not in the list from cloneMarkers
# select these internal names with 0 relationship from the database
hgsql -N -e 'select intName from bacCloneXRef where relationship = 0;' \
    danRer2 | sort | uniq > intNameNoRelation.fromdb.sort
# get all the internal names from the data file
awk 'BEGIN {FS="|"} {print $1}' ../clonemarkers.02.12.04.txt \
    | sort | uniq > intNamefromCloneMarkers.sort
comm -12 intNameNoRelation.fromdb.sort intNamefromCloneMarkers.sort
# nothing in common between these two files as expected so there are
# no internal names in the db with 0 in the relationship field that
# appear in the clonemarkers file.
# Check all BAC clone internal names and external names from the
# ctgnames file are in the database
# get intName and extName from ctgnames file
awk 'BEGIN {FS="|"} {OFS="\t"} {print $2,$3}' ../ctgnames.02.12.04.txt \
    | sort | uniq > intNameandextNamefromCtgNames.sort
# get intName and extName from database
hgsql -N -e 'select intName,name from bacCloneXRef;' danRer2 \
    | sort | uniq > intNameandextName.fromdb.sort
wc -l intNameandextName*
# 218100 intNameandextName.fromdb.sort
# 167441 intNameandextNamefromCtgNames.sort
comm -12 intNameandextName.fromdb.sort intNameandextNamefromCtgNames.sort \
    > intandextindbAndCtgNames
wc -l intandextindbAndCtgNames
# 167441 intandextindbAndCtgNames
# there are 167441 name pairs common between the file and the database
# and this is the same number of name pairs as in the data file
diff intandextindbAndCtgNames intNameandextNamefromCtgNames.sort
# no difference between those name pairs from the data file and those that
# are common between the data file and the database so all internal and
# external names from the ctgNames file are in the database
# get the list of extra ones from db
comm -23 intNameandextName.fromdb.sort intNameandextNamefromCtgNames.sort \
    > intandextNamesindbNotinCtgNames
wc -l intandextNamesindbNotinCtgNames
# 50659 intandextNamesindbNotinCtgNames
# get list of internal names from the clonemarkers file
awk 'BEGIN {FS="|"} {print $1}' ../clonemarkers.02.12.04.txt | sort | uniq \
    > clonemarkers.intName.sort
wc -l clonemarkers.intName.sort
# 12987 clonemarkers.intName.sort
# compare these intNames to those from the database not in the ctgnames file
comm -12 clonemarkers.intName.sort intandextNamesindbNotinCtgNames
# none of these clone markers internal names are in this list so they
# must all be in the ctgnames file too. These extra internal names will be
# translations of external names found in the list of mappings of BAC clones
# to chroms.
# Check that all the BAC clone external names from the list of chromosome
# mappings and from the ctgnames file are in the database.
# get all extNames from bacClones.namesandchrom.uniq and from ctgnames
awk '{print $1}' ../bacClones.namesandchrom.uniq > \
    extNames.ctgnamesandbacClones
awk 'BEGIN {FS="|"} {print $3;}' ../ctgnames.02.12.04.txt \
    >> extNames.ctgnamesandbacClones
wc -l extNames.ctgnamesandbacClones
# 384421 extNames.ctgnamesandbacClones
sort extNames.ctgnamesandbacClones | uniq \
    > extNames.ctgnamesandbacClones.sort
wc -l extNames.ctgnamesandbacClones.sort
# 218100 extNames.ctgnamesandbacClones.sort
# get extNames from the database
hgsql -N -e 'select name from bacCloneXRef;' danRer2 | sort | uniq \
    > extNames.fromdb.sort
wc -l extNames.fromdb.sort
# 218100 extNames.fromdb.sort
comm -12 extNames.fromdb.sort extNames.ctgnamesandbacClones.sort \
    > extNames.fromdbandfiles
wc -l extNames.fromdbandfiles
# 218100 extNames.fromdbandfiles
# find extNames in common from data files and database
diff extNames.fromdb.sort extNames.fromdbandfiles
# no difference, all extNames from files are in db
# Check that all BAC clone internal names from the ctgnames and clonemarkers
# files are in the database
# get internal names from ctgnames and clonemarkers files
awk 'BEGIN {FS="|"} {print $2;}' ../ctgnames.02.12.04.txt \
    > intNames.ctgnamesandclonemarkers
awk 'BEGIN {FS="|"} {print $1;}' ../clonemarkers.02.12.04.txt \
    >> intNames.ctgnamesandclonemarkers
wc -l intNames.ctgnamesandclonemarkers
# 195746 intNames.ctgnamesandclonemarkers
sort intNames.ctgnamesandclonemarkers | uniq \
    > intNames.ctgnamesandclonemarkers.sort
wc -l intNames.ctgnamesandclonemarkers.sort
# 167441 intNames.ctgnamesandclonemarkers.sort
# get internal names from database
hgsql -N -e 'select intName from bacCloneXRef;' danRer2 | sort | uniq \
    > intNames.fromdb.sort
wc -l intNames.fromdb.sort
# 218100 intNames.fromdb.sort
# some of these intNames are derived from the corresponding extNames
# all of the intNames from the file should be in the db
comm -12 intNames.fromdb.sort intNames.ctgnamesandclonemarkers.sort \
    > intNames.fromdbandfiles
wc -l intNames.fromdbandfiles
# 167441 intNames.fromdbandfiles
diff intNames.fromdbandfiles intNames.ctgnamesandclonemarkers.sort
# no difference, all intNames from files are in db
# Check that all translations are correct between BAC clone
# external and internal names.
# write script to get the prefixes from internal and external names
# (a rough equivalent is sketched at the end of this section)
chmod +x getNamePrefixes.pl
hgsql -N -e 'select name, intName from bacCloneXRef;' danRer2 \
    | sort | uniq > extandintNames.fromdb.sort
perl getNamePrefixes.pl < extandintNames.fromdb.sort \
    > extandintNames.prefixes
sort extandintNames.prefixes | uniq > extandintNames.prefixes.uniq
# these all look good
# BUSM1 dZ
# CH211 zC
# CH211 zc
# CH73 zH
# CT7 bP
# DKEY zK
# DKEY zk
# DKEYP zKp
# RP71 bZ
# XX bY
# zk is an internal name prefix for the external name prefix, DKEY-. There
# is only one example where this is used (DKEY-81G7); this is in the
# ctgnames file and in the bacCloneXRef table so that is ok.
# All data looks good in these tables now.
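# getNamePrefixes.pl is also not shown in this doc; a rough awk
# equivalent, assuming the external prefix is everything before the
# first "-" and the internal prefix is the leading run of letters
# before the first digit:
awk 'BEGIN {OFS="\t"} \
    {e = $1; i = $2; sub(/-.*$/, "", e); sub(/[0-9].*$/, "", i); \
     print e, i}' extandintNames.fromdb.sort | sort | uniq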
# Pick up photo from NHGRI (DONE - 2004-12-22 - Hiram) ssh hgwdev cd /tmp wget \ 'http://www.genome.gov/Pages/News/Photos/Science/zebrafish_image.jpg' -O zebrafish_image.jpg # no crop images for this one, make the thumbnail directly: convert -geometry 300x200 -quality 80 -sharpen 0 -normalize \ zebrafish_image.jpg Danio_rerio.jpg cp -p Danio_rerio.jpg /usr/local/apache/htdocs/images # add links to this image in the description.html page, request push # EXTRACT AXTs and MAFs FROM FUGU (fr1) NET (DONE, 2004-12-22, hartera) # GZIP AXTs and MAFs (DONE, 2005-05-16, hartera) ssh kksilo # create axts cd /cluster/data/danRer2/bed/blastz.fr1/axtChain gunzip fuguFr1.net.gz gunzip chainAR/*.chain.gz netSplit fuguFr1.net fuguFr1Net mkdir -p ../axtNet cat > axtNet.csh << 'EOF' foreach f (fuguFr1Net/chr*.net) set c = $f:t:r echo "axtNet on $c" netToAxt fuguFr1Net/$c.net chainAR/$c.chain \ /cluster/data/danRer2/nib \ /cluster/data/fr1/nib ../axtNet/$c.axt echo "Complete: $c.net -> $c.axt" end 'EOF' chmod +x axtNet.csh csh axtNet.csh >&! axtNet.log & tail -100f axtNet.log # sort axts before making mafs - must be sorted for multiz cd /cluster/data/danRer2/bed/blastz.fr1 mv axtNet axtNet.unsorted mkdir axtNet foreach f (axtNet.unsorted/*.axt) set c = $f:t:r echo "Sorting $c" axtSort $f axtNet/$c.axt end # create maf ssh kksilo cd /cluster/data/danRer2/bed/blastz.fr1 cd axtNet mkdir ../mafNet cat > makeMaf.csh << 'EOF' foreach f (chr*.axt) set maf = $f:t:r.fr1.maf echo translating $f to $maf axtToMaf $f \ /cluster/data/danRer2/chrom.sizes /cluster/data/fr1/chrom.sizes \ ../mafNet/$maf -tPrefix=danRer2. -qPrefix=fr1. end 'EOF' chmod +x makeMaf.csh csh makeMaf.csh >&! makeMaf.log & tail -100f makeMaf.log cd /cluster/data/danRer2/bed/blastz.fr1/axtChain nice gzip fuguFr1.net chainAR/*.chain & # gzip axts and mafs (hartera, 2005-05-16) ssh kkstore01 cd /cluster/data/danRer2/bed/blastz.fr1 nice gzip axtNet/*.axt mafNet/*.maf & # EXTRACT AXTs and MAFs FROM TETRAODON (tetNig1) NET (DONE, 2004-12-22, hartera) # GZIP AXTs and MAFs AND CHAINS (DONE, 2005-05-16, hartera) ssh kksilo # create axts cd /cluster/data/danRer2/bed/blastz.tetNig1/axtChain netSplit tetraTetNig1.net tetraTetNig1Net mkdir -p ../axtNet cat > axtNet.csh << 'EOF' foreach f (tetraTetNig1Net/chr*.net) set c = $f:t:r echo "axtNet on $c" netToAxt tetraTetNig1Net/$c.net chainAR/$c.chain \ /cluster/data/danRer2/nib \ /cluster/data/tetNig1/nib ../axtNet/$c.axt echo "Complete: $c.net -> $c.axt" end 'EOF' chmod +x axtNet.csh csh axtNet.csh >&! axtNet.log & tail -100f axtNet.log # sort axts before making mafs - must be sorted for multiz cd /cluster/data/danRer2/bed/blastz.tetNig1 mv axtNet axtNet.unsorted mkdir axtNet foreach f (axtNet.unsorted/*.axt) set c = $f:t:r echo "Sorting $c" axtSort $f axtNet/$c.axt end # create maf ssh kksilo cd /cluster/data/danRer2/bed/blastz.tetNig1 cd axtNet mkdir ../mafNet cat > makeMaf.csh << 'EOF' foreach f (chr*.axt) set maf = $f:t:r.tetNig1.maf echo translating $f to $maf axtToMaf $f \ /cluster/data/danRer2/chrom.sizes /cluster/data/tetNig1/chrom.sizes \ ../mafNet/$maf -tPrefix=danRer2. -qPrefix=tetNig1. end 'EOF' chmod +x makeMaf.csh csh makeMaf.csh >&! 
makeMaf.log & tail -100f makeMaf.log # gzip axts and mafs (hartera, 2005-05-16) ssh kkstore01 cd /cluster/data/danRer2/bed/blastz.tetNig1 nice gzip axtNet/*.axt mafNet/*.maf & # gzip chains again cd axtChain nice gzip chainAR/*.chain & # TIGR GENE INDEX (DONE, 2004-12-22, hartera) # Data from rsultana@tigr.org (Razvan Sultana at TIGR) ssh kksilo mkdir -p /cluster/data/danRer2/bed/tigr cd /cluster/data/danRer2/bed/tigr wget ftp://ftp.tigr.org/pub/data/tgi/Danio_rerio/TGI_track_danRer2_12-2004.tgz tar xvzf TGI*.tgz foreach f (*g_gallus*) set f1 = `echo $f | sed -e 's/g_gallus/chicken/g'` mv $f $f1 end foreach f (*drosoph*) set f1 = `echo $f | sed -e 's/drosoph/Dmelano/g'` mv $f $f1 end foreach o (Dmelano chicken elegans human mouse rat zfish) echo $o setenv O $o foreach f (chr*_$o*s) tail +2 $f | perl -wpe 's /THC/TC/; s/(TH?C\d+)/$ENV{O}_$1/;' > $f.gff end end ssh hgwdev cd /cluster/data/danRer2/bed/tigr hgsql danRer2 -e "drop table tigrGeneIndex" nice ldHgGene -exon=TC danRer2 tigrGeneIndex *.gff # Read 114013 transcripts in 395310 lines in 196 files # 114013 groups 28 seqs 1 sources 1 feature types # 114013 gene predictions hgsql danRer2 -e "update tigrGeneIndex set cdsStart = txStart;" hgsql danRer2 -e "update tigrGeneIndex set cdsEnd = txEnd;" checkTableCoords danRer2 tigrGeneIndex /cluster/bin/scripts/runGeneCheck /cluster/data/danRer2/bed/tigr # 135739 badUtrSplice # 114013 noCDS - fixed these in table as above # 30191 gap gzip *.gff *TCs # make changes in doTigrGeneIndex function in hgc to add original species # name back when constructing URL to TIGR web site. # 3-WAY MULTIZ MULTIPLE ALIGNMENT # (zebrafish danRer2, fugu fr1 and tetraodon tetNig1) # (DONE, 2004-12-23, hartera) # use v.8 of multiz (see makeHg17.doc for 8-WAY alignment) ssh kksilo set multizDir = multiz.2004-12-22 set workingDir = /cluster/bluearc/danRer2/$multizDir ln -s $workingDir /cluster/bluearc/danRer2/multiz3way mkdir -p $workingDir mkdir -p /cluster/data/danRer2/bed/$multizDir ln -s /cluster/data/danRer2/bed/$multizDir \ /cluster/data/danRer2/bed/multiz3way cd /cluster/data/danRer2/bed/multiz3way # wrapper script for multiz # NOTE: first arg is pairwise, 2nd arg is multiple (to add to) # NOTE: next time, modify script so it only needs one arg -- saves the # multiple dirname in a file for use by the next run cat << 'EOF' > doMultiz.csh #!/bin/csh -fe mkdir -p $3:h /cluster/bin/penn/multiz $1 $2 - > $3 'EOF' # << for emacs cat << 'EOF' > gsub #LOOP ../doMultiz.csh {check in line /cluster/bluearc/danRer2/multiz3way/$(dir1)/$(root2).maf} {check in line /cluster/bluearc/danRer2/multiz3way/$(root1)/$(root2).maf} {check out line+ /cluster/bluearc/danRer2/multiz3way/$(root1)$(dir1)/$(root2).maf} #ENDLOOP 'EOF' # << for emacs chmod +x doMultiz.csh ssh kksilo set workingDir = /cluster/bluearc/danRer2/multiz3way # copy mafs to bluearc for tetraodon (tetNig1) mkdir $workingDir/tetNig1 cd /cluster/data/danRer2/bed/blastz.tetNig1/mafNet foreach f (*.maf) set c = $f:t:r set g = $c:t:r echo $g cp $f $workingDir/tetNig1/${g}.maf end cd /cluster/data/danRer2/bed/multiz3way ls $workingDir/tetNig1/*.maf > chrom.lst # fugu (fr1) mkdir $workingDir/fr1 cd /cluster/data/danRer2/bed/blastz.fr1/mafNet foreach f (*.maf) set c = $f:t:r set g = $c:t:r echo $g cp $f $workingDir/fr1/${g}.maf end # one multiz - add in fugu to zebrafish/tetraodon ssh kki set workingDir = /cluster/bluearc/danRer2/multiz3way cd /cluster/data/danRer2/bed/multiz3way mkdir run.fr1 cd run.fr1 echo "fr1/tetNig1" > species.lst gensub2 species.lst ../chrom.lst ../gsub 
jobList para create jobList # 28 jobs in batch para try, check, push, check ... # para time # CPU time in finished jobs: 337s 5.62m 0.09h 0.00d 0.000 y # IO & Wait Time: 153s 2.55m 0.04h 0.00d 0.000 y # Average job time: 18s 0.29m 0.00h 0.00d # Longest job: 66s 1.10m 0.02h 0.00d # Submission to last job: 134s 2.23m 0.04h 0.00d cd .. # copy 3-way mafs to build directory ssh kksilo set workingDir = /cluster/bluearc/danRer2/multiz3way ln -s $workingDir/tetNig1fr1 $workingDir/maf cd /cluster/data/danRer2/bed/multiz3way mkdir maf cp $workingDir/maf/*.maf maf # WZ EST CLUSTERS ALIGNMENTS (DONE, 2004-12-23, hartera) # WZ ESTs are compiled ESTs from WashU. They were provided by # Anthony DiBiase from Leonard Zon's lab at the Children's Hospital, Harvard # Contact: adibiase@enders.tch.harvard.edu # http://zon.tchlab.org ssh kksilo mkdir -p /cluster/data/danRer2/bed/ZonLab/wzESTs cd /cluster/data/danRer2/bed/ZonLab/wzESTs # put WZ ESTs in this directory as wzcontigs.txt - # obtained from the Zon Lab, these are unmasked. # There are 42857 ESTs in this file. # Translated to upper case for danRer1 cp /cluster/data/danRer1/bed/ZonLab/wzESTs/wzcontigs.fa . cd /cluster/data/danRer2/bed/ZonLab/wzESTs mkdir -p /cluster/bluearc/danRer2/wzESTs faSplit sequence wzcontigs.fa 20 /cluster/bluearc/danRer2/wzESTs/wzcontigs ssh kki cd /cluster/data/danRer2/bed/ZonLab/wzESTs mkdir psl ls -1 /cluster/bluearc/danRer2/wzESTs/*.fa > est.lst ls -1S /cluster/bluearc/danRer2/trfFa/*.fa > genome.lst cat << '_EOF_' > gsub #LOOP /cluster/bin/i386/blat {check in line+ $(path1)} {check in line+ $(path2)} -ooc={check in exists /iscratch/i/danRer2/11.ooc} {check out line+ psl/$(root1)_$(root2).psl} #ENDLOOP '_EOF_' gensub2 genome.lst est.lst gsub spec para create spec para try, check, push, try .... # para time # Completed: 5980 of 5980 jobs # CPU time in finished jobs: 17176s 286.27m 4.77h 0.20d 0.001 y # IO & Wait Time: 22870s 381.16m 6.35h 0.26d 0.001 y # Average job time: 7s 0.11m 0.00h 0.00d # Longest job: 49s 0.82m 0.01h 0.00d # Submission to last job: 2890s 48.17m 0.80h 0.03d # Do sort, best in genome filter, and convert to chromosome coordinates # to create wzEsts.psl ssh kksilo cd /cluster/data/danRer2/bed/ZonLab/wzESTs pslSort dirs raw.psl tmp psl # only use alignments that have at least # 96% identity in aligned region. use parameters used by auto # GenBank update for native ESTs pslReps -minAli=0.96 -nearTop=0.005 raw.psl contig.psl /dev/null liftUp wz_ests.psl /cluster/data/danRer2/jkStuff/liftAll.lft warn contig.psl # check psl pslCheck wz_ests.psl >& pslCheck.log # looks good # Load EST alignments into database. ssh hgwdev cd /cluster/data/danRer2/bed/ZonLab/wzESTs hgLoadPsl danRer2 wz_ests.psl # Add WZ EST sequences # Copy sequences to gbdb if they are not there already mkdir -p /gbdb/danRer2/wzESTs ln -s \ /cluster/data/danRer1/bed/ZonLab/wzESTs/wzcontigs.fa \ /gbdb/danRer2/wzESTs hgLoadSeq danRer2 /gbdb/danRer2/wzESTs/wzcontigs.fa # MAKE Human Proteins track (hg17 DONE 2004-12-15 braney) ssh kksilo mkdir -p /cluster/data/danRer2/blastDb cd /cluster/data/danRer2/blastDb cut -f 1 ../chrom.sizes | sed "s/chr//" | sed "/NA/d" | sed "/Un/d" > chrom.list for i in `cat chrom.list`; do ls -1 ../$i/*/*.fa . ; done | sed -n "/.*_.*_.*_.*/p" > list ln -s `cat list` . for i in *.fa do formatdb -i $i -p F done rm *.log *.fa list cd .. 
for i in `cat blastDb/chrom.list`; do cat $i/chr*/*.lft ; done > jkStuff/subChr.lft rm blastDb/chrom.list mkdir /cluster/data/danRer2/scaffoldBlastDb cd /cluster/data/danRer2/scaffoldBlastDb cat ../Un/scaffolds/*.fa ../NA/scaffolds/*.fa | faSplit sequence stdin 500 scaf for i in *.fa do formatdb -i $i -p F done rm *.log *.fa ssh kkr1u00 mkdir -p /iscratch/i/danRer2/blastDb cd /cluster/data/danRer2/blastDb for i in nhr nin nsq; do cp *.$i /iscratch/i/danRer2/blastDb ; echo $i; done mkdir -p /iscratch/i/danRer2/scaffoldBlastDb cd /cluster/data/danRer2/scaffoldBlastDb for i in nhr nin nsq; do cp *.$i /iscratch/i/danRer2/scaffoldBlastDb ; echo $i; done cd (iSync) > sync.out mkdir -p /cluster/data/danRer2/bed/tblastn.hg17KG cd /cluster/data/danRer2/bed/tblastn.hg17KG echo /iscratch/i/danRer2/blastDb/*.nsq | xargs ls -S | sed "s/\.nsq//" > query.lst echo /iscratch/i/danRer2/scaffoldBlastDb/*.nsq | xargs ls -S | sed "s/\.nsq//" > scaffold.lst # back to kksilo exit # we want around 250000 jobs cd /cluster/data/danRer2/bed/tblastn.hg17KG calc `wc /cluster/data/hg17/bed/blat.hg17KG/hg17KG.psl | awk "{print \\\$1}"`/\(250000/`wc query.lst | awk "{print \\\$1}"`\) # 42156/(250000/3899) = 657.464976 mkdir -p /cluster/bluearc/danRer2/bed/tblastn.hg17KG/kgfa split -l 657 /cluster/data/hg17/bed/blat.hg17KG/hg17KG.psl /cluster/bluearc/danRer2/bed/tblastn.hg17KG/kgfa/kg ln -s /cluster/bluearc/danRer2/bed/tblastn.hg17KG/kgfa kgfa cd kgfa for i in *; do pslxToFa $i $i.fa; rm $i; done cd .. ls -1S kgfa/*.fa > kg.lst mkdir -p /cluster/bluearc/danRer2/bed/tblastn.hg17KG/blastOut ln -s /cluster/bluearc/danRer2/bed/tblastn.hg17KG/blastOut for i in `cat kg.lst`; do mkdir blastOut/`basename $i .fa`; done tcsh cat << '_EOF_' > blastGsub #LOOP blastSome $(path1) {check in line $(path2)} {check out exists blastOut/$(root2)/q.$(root1).psl } #ENDLOOP '_EOF_' cat << '_EOF_' > blastSome #!/bin/sh BLASTMAT=/iscratch/i/blast/data export BLASTMAT g=`basename $2` f=/tmp/`basename $3`.$g for eVal in 0.01 0.001 0.0001 0.00001 0.000001 1E-09 1E-11 do if /scratch/blast/blastall -M BLOSUM80 -m 0 -F no -e $eVal -p tblastn -d $1 -i $2 -o $f.8 then mv $f.8 $f.1 break; fi done if test -f $f.1 then if /cluster/bin/i386/blastToPsl $f.1 $f.2 then liftUp -nosort -type=".psl" -nohead $f.3 ../../jkStuff/subChr.lft carry $f.2 liftUp -nosort -type=".psl" -nohead $f.4 ../../jkStuff/liftAll.lft carry $f.3 liftUp -nosort -type=".psl" -pslQ -nohead $3.tmp /cluster/data/hg17/bed/blat.hg17KG/protein.lft warn $f.4 if pslCheck -prot $3.tmp then mv $3.tmp $3 rm -f $f.1 $f.2 $f.3 $f.4 fi exit 0 fi fi rm -f $f.1 $f.2 $3.tmp $f.8 $f.3 $f.4 exit 1 '_EOF_' chmod +x blastSome gensub2 query.lst kg.lst blastGsub blastSpec # I ended up doing the scaffolds separately from the chopped up chrom segments # but next time I wouldn't gensub2 scaffold.lst kg.lst blastGsub scaffoldBlastSpec ssh kk cd /cluster/data/danRer2/bed/tblastn.hg17KG para create blastSpec para push # Completed: 253435 of 253435 jobs # CPU time in finished jobs: 58003988s 966733.13m 16112.22h 671.34d 1.839 y # IO & Wait Time: 13196517s 219941.96m 3665.70h 152.74d 0.418 y # Average job time: 281s 4.68m 0.08h 0.00d # Longest job: 6238s 103.97m 1.73h 0.07d # Submission to last job: 174503s 2908.38m 48.47h 2.02d cat << '_EOF_' > chainGsub #LOOP chainSome $(path1) #ENDLOOP '_EOF_' cat << '_EOF_' > chainSome (cd $1; cat q.*.psl | simpleChain -prot -outPsl -maxGap=50000 stdin ../c.`basename $1`.psl) '_EOF_' chmod +x chainSome ls -1dS `pwd`/blastOut/kg?? 
> chain.lst
gensub2 chain.lst single chainGsub chainSpec
ssh kki
cd /cluster/data/danRer2/bed/tblastn.hg17KG
para create chainSpec
para push
# Completed: 1287 of 1287 jobs
# CPU time in finished jobs:      72236s    1203.94m    20.07h    0.84d  0.002 y
# IO & Wait Time:                 22886s     381.43m     6.36h    0.26d  0.001 y
# Average job time:                  74s       1.23m     0.02h    0.00d
# Longest job:                     3374s      56.23m     0.94h    0.04d
# Submission to last job:          5982s      99.70m     1.66h    0.07d
exit
# back to kksilo
cd /cluster/data/danRer2/bed/tblastn.hg17KG/blastOut
# again some weirdness because I did the NA and Un scaffolds separately
for i in kg??
do
    cat c.$i.psl | awk "(\$13 - \$12)/\$11 > 0.6 {print}" > cs60.$i.psl
    sort -rn cs60.$i.psl | pslUniq stdin us.$i.psl
    awk "((\$1 / \$11) ) > 0.60 { print }" cs60.$i.psl > ms60.$i.psl
    echo $i
done
for i in kg??
do
    cat $i/c.*.psl | awk "(\$13 - \$12)/\$11 > 0.6 {print}" > c60.$i.psl
    sort -rn c60.$i.psl | pslUniq stdin u.$i.psl
    awk "((\$1 / \$11) ) > 0.60 { print }" c60.$i.psl > m60.$i.psl
    echo $i
done
cat u.*.psl m60* | sort -T /tmp -k 14,14 -k 16,16n -k 17,17n | uniq \
    > /cluster/data/danRer2/bed/tblastn.hg17KG/blastHg17KG.psl
cd ..
# lift the chrUn and chrNA scaffolds
ssh hgwdev
cd /cluster/data/danRer2/bed/tblastn.hg17KG
hgLoadPsl danRer2 blastHg17KG.psl
# back to kksilo
rm -rf blastOut
# End tblastn
# ACCESSIONS FOR CONTIGS (WORKING 2005-03-17 kate)
# Used for Assembly track details, and also ENCODE AGP's
cd /cluster/data/danRer2/bed
mkdir -p contigAcc
cd contigAcc
# extract WGS contig to accession mapping from Entrez Nucleotide summaries
# To get the summary file, access the Genbank page for the project
# by searching:
#    genus[ORGN] AND WGS[KYWD]
# At the end of the page, click on the list of accessions for the contigs.
# Select summary format, and send to file.
# output to file zfish.acc
grep scaffold zfish.acc | wc -l
# 21333
# edit hg/encode/regionAgp/contigAccession.pl to recognize zfish format
# and install in /cluster/bin/scripts
contigAccession zfish.acc > contigAcc.tab
wc -l contigAcc.tab
# 21333
grep Zv4 /cluster/data/danRer2/?{,?}/*.agp | wc -l
# 21333
hgsql danRer2 < $HOME/kent/src/hg/lib/contigAcc.sql
echo "LOAD DATA LOCAL INFILE 'contigAcc.tab' INTO TABLE contigAcc" | \
    hgsql danRer2
hgsql danRer2 -e "SELECT COUNT(*) FROM contigAcc"
# 21333
# KNOWN GENES TRACK (in progress, 2005-02-14, hartera)
# Found number of loci for Ensembl and mRNA by clustering
cd $HOME/kent/src/hg/near/hgClusterGenes
hgClusterGenes -noProt danRer2 ensGene ensCluster ensCanonical
# creates 23675 clusters, there are 32062 entries in ensGene
# remove extra tables created by this program
echo 'drop table ensCluster;' | hgsql danRer2
echo 'drop table ensCanonical;' | hgsql danRer2
cd $HOME/kent/src/hg/geneBounds/clusterRna
clusterRna danRer2 rna.bed est.bed -noEst
wc -l rna.bed
# 10,383 clusters, RefGene has 8397 accessions so that is about right
rm *.bed
# Therefore Ensembl should be the basis for the Known Genes tracks
# since it represents the most loci.
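# The counts behind this decision can be re-checked directly from the
# database (illustrative queries; table names as used elsewhere in this
# doc, expected numbers from the clustering runs above):
hgsql -N -e 'select count(*) from ensGene;' danRer2
# 32062 transcripts, clustering into 23675 loci
hgsql -N -e 'select count(distinct name) from refGene;' danRer2
# 8397 RefSeq accessions, in line with the 10,383 mRNA clusters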
# move Ensembl to be the first track on the browser display and
# with default visibility of pack
# edit zebrafish/danRer2/trackDb.ra
# track ensGene
# shortLabel Ensembl Genes
# longLabel Ensembl Gene Predictions
# group genes
# priority 32.8
# visibility pack
# color 150,0,0
# type genePred ensPep
# url http://www.ensembl.org/perl/transview?transcript=$$
# hgGene on
cd $HOME/kent/src/hg/hgGene
# edit hgGene.c to add special case of ensGene table for Zebrafish
# from kent/src/hg/near/makeNear.doc
ssh hgwdev
cd $HOME/kent/src/hg/near/hgNear/hgNearData
mkdir Zebrafish
cvs add Zebrafish
cd Zebrafish
cp ../C_elegans/genome.ra .
# edit this to be specific for zebrafish and commit to CVS
cvs add genome.ra
cvs commit genome.ra
cd $HOME/kent/src/hg/near/hgClusterGenes
# Cluster together various alt splice forms
hgClusterGenes -noProt danRer2 ensGene ensIsoforms ensCanonical
# Got 23675 clusters, from 32062 genes in 28 chromosomes
# need to add genome.ra in hgGeneData
cd $HOME/kent/src/hg/hgGene/hgGeneData
mkdir Zebrafish
cd Zebrafish
cp ../C_elegans/genome.ra .
# edit genome.ra and commit to CVS
cp ../C_elegans/links.ra .
# edit links.ra and commit to CVS
# download protein accessions from Ensembl
# go to http://www.ensembl.org/Multi/martview
# select Ensembl 29 and Danio rerio genes (ZFISH 4)
# go to Features and select Ensembl Gene ID from Ensembl Attributes,
# then from External References, select UniProt/Swiss-Prot AC,
# UniProt AC and RefSeq DNA accession
# copy to this directory and unzip
gunzip dr2EnsemblUniProt.gz
wc -l dr2EnsemblUniProt
# 34115 dr2EnsemblUniProt
awk 'BEGIN{FS="\t"} {print $3;}' dr2EnsemblUniProt > ensembl.uniprot
sort ensembl.uniprot | uniq > ensembl.uniprot.uniq
# 4361
# download uniProt AC, UniProt/SPTREMBL ID, UniProt/Swiss-Prot ID
# remove blank lines from ensembl.uniprot
sed -e '/^$/d' ensembl.uniprot > ensembl.uniprot.accsonly
# there are 6171 UniProt accessions so some Ensembl IDs share accessions
#########################################################################
# MOUSE NET/CHAINS MM6 - Info contained in makeMm6.doc (200503 Hiram)
# EXTRACT AXTs and MAFs FROM MOUSE (mm6) NET - already done, see
# /cluster/data/danRer2/bed/blastz.mm6.swap/mafNet and makeMm6.doc
# EXTRACT AXTs and MAFs FROM OPOSSUM (monDom1) NET
# (DONE, 2005-05-13, hartera) - see makeMonDom1.doc
# EXTRACT AXTs AND MAFs FROM HUMAN NET (DONE, 2005-05-15, hartera)
ssh kkstore01
# create axts
cd /cluster/data/danRer2/bed/blastz.hg17.swap/axtChain
gunzip humanhg17.net.gz
gunzip chainAR/*.chain.gz
netSplit humanhg17.net humanHg17Net
mkdir -p ../axtNet
cat > axtNet.csh << 'EOF'
foreach f (humanHg17Net/chr*.net)
    set c = $f:t:r
    echo "axtNet on $c"
    netToAxt humanHg17Net/$c.net chainAR/$c.chain \
        /cluster/data/danRer2/nib \
        /cluster/data/hg17/nib ../axtNet/$c.axt
    echo "Complete: $c.net -> $c.axt"
end
'EOF'
chmod +x axtNet.csh
csh axtNet.csh >&! axtNet.log &
tail -100f axtNet.log
# sort axts before making mafs - must be sorted for multiz
cd /cluster/data/danRer2/bed/blastz.hg17.swap
mv axtNet axtNet.unsorted
mkdir axtNet
foreach f (axtNet.unsorted/*.axt)
    set c = $f:t:r
    echo "Sorting $c"
    axtSort $f axtNet/$c.axt
end
# create maf
ssh kkstore01
cd /cluster/data/danRer2/bed/blastz.hg17.swap/axtNet
mkdir ../mafNet
cat > makeMaf.csh << 'EOF'
foreach f (chr*.axt)
    set maf = $f:t:r.hg17.maf
    echo translating $f to $maf
    axtToMaf $f \
        /cluster/data/danRer2/chrom.sizes /cluster/data/hg17/chrom.sizes \
        ../mafNet/$maf -tPrefix=danRer2. -qPrefix=hg17.
end
'EOF'
chmod +x makeMaf.csh
csh makeMaf.csh >&!
makeMaf.log &
tail -100f makeMaf.log
cd /cluster/data/danRer2/bed/blastz.hg17.swap/axtChain
nice gzip humanhg17.net chainAR/*.chain &
cd /cluster/data/danRer2/bed/blastz.hg17.swap
nice gzip axtNet/*.axt mafNet/*.maf &
# PREP FOR LIFTOVER CHAINS FOR THIS ASSEMBLY (DONE, 2005-05-15, hartera)
# split into 3k chunks
ssh kkr1u00
cd /cluster/data/danRer2
set liftDir = /iscratch/i/danRer2/liftOver/liftSplit
mkdir -p $liftDir
cd $liftDir
mkdir -p split lift
cat > split.csh << 'EOF'
set liftDir = /iscratch/i/danRer2/liftOver/liftSplit
cd /cluster/data/danRer2
foreach n (`ls ?{,?}/*.fa`)
    set d = $n:h
    set c = $n:t:r
    echo $c
    faSplit -lift=$liftDir/lift/$c.lft size \
        /cluster/data/danRer2/$d/$c.fa -oneFile 3000 $liftDir/split/$c
end
'EOF'
# << for emacs
chmod +x split.csh
csh split.csh >&! split.log &
tail -100f split.log
mkdir /iscratch/i/danRer2/liftOver/liftSplit/split/UnandNA
mv /iscratch/i/danRer2/liftOver/liftSplit/split/chrUn.fa \
    /iscratch/i/danRer2/liftOver/liftSplit/split/chrNA.fa \
    /iscratch/i/danRer2/liftOver/liftSplit/split/UnandNA
iSync
# MAKE VSTETNIG1 DOWNLOADABLES (DONE, 2005-05-16, hartera)
ssh hgwdev
cd /cluster/data/danRer2/bed/blastz.tetNig1/axtChrom
set gp = /usr/local/apache/htdocs/goldenPath/danRer2
mkdir -p $gp/vsTetNig1/axtChrom
cp -p *.axt $gp/vsTetNig1/axtChrom
cd $gp/vsTetNig1/axtChrom
gzip *.axt
md5sum *.gz > md5sum.txt
# copy chains and net to downloads area
cd /cluster/data/danRer2/bed/blastz.tetNig1/axtChain
gzip -c all.chainARFilt5k > /cluster/data/danRer2/zip/tetraTetNig1.chain.gz
gzip -c tetraTetNig1.net > /cluster/data/danRer2/zip/tetraTetNig1.net.gz
cd $gp/vsTetNig1
mv /cluster/data/danRer2/zip/tetra*.gz .
md5sum *.gz > md5sum.txt
# Copy over & edit README.txt w/pointers to chain, net formats.
# 6-WAY VAR_MULTIZ ALIGNMENTS (DONE, 2005-05-20, hartera)
# Species: zebrafish(danRer2), human (hg17), mouse(mm6), opossum(monDom1),
# fugu(fr1) and tetraodon(tetNig1)
# Tetraodon and Fugu are quite distant from zebrafish, more distant than
# human/chicken so use the HoxD55.q matrix in future for blastz with these
# species. Here the pairwise alignments of zebrafish with Tetraodon and Fugu
# were done using the default matrix for blastz. (2005-06-01, hartera)
# Prepared tree image for Conservation track details page (2005-06-07, hartera)
ssh kkstore01
mkdir -p /cluster/data/danRer2/bed/multiz6way
cd /cluster/data/danRer2/bed/multiz6way
mkdir mafLinks
# set up directories for links to mafs for each pairwise alignment
mkdir mafLinks/hg17
mkdir mafLinks/mm6
mkdir mafLinks/monDom1
mkdir mafLinks/fr1
mkdir mafLinks/tetNig1
set dir=/cluster/data/danRer2/bed
set dirMonDom=/cluster/data/monDom1/bed
# need to make all the links so that they are just chrN.maf.gz
# without the species db in the middle, e.g.
chrN.hg17.maf.gz ln -s $dir/blastz.mm6.swap/mafNet/*.maf.gz ./mafLinks/mm6 echo "$dir/blastz.hg17.swap" > dir.lst echo "$dirMonDom/blastz.monDom1.to.danRer2" >> dir.lst echo "$dir/blastz.fr1" >> dir.lst echo "$dir/blastz.tetNig1" >> dir.lst foreach d (`cat dir.lst`) foreach m ($d/mafNet/*.maf.gz) foreach c (`cat /cluster/data/danRer2/chrom.lst`) foreach sp (hg17 monDom1 fr1 tetNig1) set maf = ${d}/mafNet/chr${c}.${sp}.maf.gz if ($m == $maf) then set s = $sp echo $s endif end ln -s $d/mafNet/chr${c}.${s}.maf.gz ./mafLinks/$s/chr${c}.maf.gz end end end # Copy MAFS to Iservers for kluster run ssh kkr1u00 mkdir /iscratch/i/danRer2/multiz6way cd /iscratch/i/danRer2/multiz6way rsync -a --copy-links --progress \ /cluster/data/danRer2/bed/multiz6way/mafLinks/ . # 321 Mb of data - takes seconds mkdir penn cp -p /cluster/bin/penn/psuCVS/multiz-tba/multiz penn cp -p /cluster/bin/penn/maf_project penn /cluster/bin/iSync # Progressive alignment up the tree w/o stager, # using multiz.v10 (var_multiz) # Method: align internal subtrees (using 0 flag to var_multiz) # Then, align these to human (using 1 flag to var_multiz) # NOTE: must use maf_project after each multiz run, in order # to order output. Single-cov guaranteed by use of net MAF's, # so it is not necessary to run single_cov2. # make output dir and run dir ssh kki cd /cluster/data/danRer2/bed/multiz6way mkdir -p maf mkdir -p run cd run # create scripts to run var_multiz on cluster cat > oneMultiz.csh << 'EOF' #!/bin/csh -fe set c = $1 set multi = /scratch/danRer2/multiz6way.$c set pairs = /iscratch/i/danRer2/multiz6way # special mode -- # with 1 arg, cleanup if ($#argv == 1) then rm -fr $multi exit endif # special mode -- # with 3 args, saves an alignment file if ($#argv == 3) then cp $multi/$2/$c.maf $3 exit endif set s1 = $2 set s2 = $3 set flag = $4 # locate input files -- in pairwise dir, or multiple dir set d1 = $multi set d2 = $multi if (-d $pairs/$s1) then set d1 = $pairs set f1 = $d1/$s1/$c.maf.gz set t1 = /tmp/$s1.$c.maf zcat $f1 > $t1 else set f1 = $d1/$s1/$c.maf set t1 = /tmp/$s1.$c.maf cp -p $f1 $t1 endif if (-d $pairs/$s2) then set d2 = $pairs set f2 = $d2/$s2/$c.maf.gz set t2 = /tmp/$s2.$c.maf zcat $f2 > $t2 else set f2 = $d2/$s2/$c.maf set t2 = /tmp/$s2.$c.maf cp -p $f2 $t2 endif # write to output dir set out = $multi/${s1}${s2} mkdir -p $out # check for empty input file if (-s $t1 && -s $t2) then echo "Aligning $f1 $f2 $flag" /iscratch/i/danRer2/multiz6way/penn/multiz $t1 $t2 $flag $out/$c.unused1.maf $out/$c.unused2.maf > $out/$c.full.maf cat $out/$c.full.maf $out/$c.unused1.maf $out/$c.unused2.maf > \ $out/$c.tmp.maf echo "Ordering $c.maf" /iscratch/i/danRer2/multiz6way/penn/maf_project $out/$c.tmp.maf danRer2.$c > $out/$c.maf rm -f $t1 $t2 else if (-s $t1) then cp -p $t1 $out/$c.maf rm -f $t1 else if (-s $t2) then cp -p $t2 $out/$c.maf rm -f $t2 endif 'EOF' # << keep emacs coloring happy chmod +x oneMultiz.csh # Copy this script to iscratch ssh kkr1u00 cd /iscratch/i/danRer2/multiz6way/penn cp -p /cluster/data/danRer2/bed/multiz6way/run/oneMultiz.csh . /cluster/bin/iSync # back to run the job ssh kki cd /cluster/data/danRer2/bed/multiz6way/run # This tree.nh was used in the distant past for early versions # of phastCons. Now, this is merely a convenient reference to the # tree under construction. This is also used to draw a graphic # tree as species.nh, see below. 
cat << '_EOF_' > tree.nh
(((hg17,mm6),monDom1),((tetNig1,fr1),danRer2))
'_EOF_'
# << this line keeps emacs coloring happy
cat > allMultiz.csh << 'EOF'
#!/bin/csh -fe
# multiple alignment steps:
set c = $1
/iscratch/i/danRer2/multiz6way/penn/oneMultiz.csh $c hg17 mm6 0
/iscratch/i/danRer2/multiz6way/penn/oneMultiz.csh $c monDom1 hg17mm6 0
/iscratch/i/danRer2/multiz6way/penn/oneMultiz.csh $c tetNig1 fr1 1
/iscratch/i/danRer2/multiz6way/penn/oneMultiz.csh $c tetNig1fr1 monDom1hg17mm6 1
# get final alignment file
/iscratch/i/danRer2/multiz6way/penn/oneMultiz.csh $c tetNig1fr1monDom1hg17mm6 /cluster/data/danRer2/bed/multiz6way/maf/$c.maf
#cleanup
/iscratch/i/danRer2/multiz6way/penn/oneMultiz.csh $c
'EOF'
# << keep emacs coloring happy
chmod +x allMultiz.csh
cat << 'EOF' > template
#LOOP
./allMultiz.csh $(root1) {check out line+ /cluster/data/danRer2/bed/multiz6way/maf/$(root1).maf}
#ENDLOOP
'EOF'
cd /cluster/data/danRer2/bed/multiz6way/run
awk '{print $1}' ../../../chrom.sizes > chrom.lst
gensub2 chrom.lst single template jobList
para create jobList
para try; para check
para push
# para time
# Completed: 28 of 28 jobs
# CPU time in finished jobs:       5903s      98.39m     1.64h    0.07d  0.000 y
# IO & Wait Time:                   227s       3.78m     0.06h    0.00d  0.000 y
# Average job time:                 219s       3.65m     0.06h    0.00d
# Longest running job:                0s       0.00m     0.00h    0.00d
# Longest finished job:            1328s      22.13m     0.37h    0.02d
# Submission to last job:          2098s      34.97m     0.58h    0.02d
# do not filter mafs as this only removes a small fraction of alignments -
# better to keep them all. Check for single column alignments (these
# just have a single base for each species in the alignment); if
# present, need to do a glueing step.
ssh hgwdev
# create tab file for chr1.maf
cd /cluster/data/danRer2/bed/multiz6way
hgLoadMaf danRer2 multiz6way -test=./maf/chr1.maf
awk 'BEGIN {OFS="\t"} {print $2, $3, $4, $5, $6, $4-$3;}' \
    multiz6way.tab > chr1.tab.lengths
sort -n -k6 chr1.tab.lengths > chr1.tab.lengths.sort
# chr1 - 10 single column alignments
# also looked at chrUn - 14 single column alignments
# there are also a small number of 2,3,4 column alignments.
# This is ok as it is such a small number.
# use mafAddIRows to 'chain' the alignments
ssh kkstore01
cd /cluster/data/danRer2/bed/multiz6way
mkdir mafAddIRows
foreach m (./maf/*.maf)
    set n=$m:t
    echo "Processing $n ..."
mafAddIRows $m /cluster/data/danRer2/danRer2.2bit ./mafAddIRows/$n end # cat maf files into one file for those alignments without iRows, # do not use alignments with iRows in browser at the moment catDir maf > multiz6way.maf # 978 Mb of data # -rw-rw-r-- 1 hartera protein 977796776 May 20 12:06 multiz6way.maf # Load into database ssh hgwdev cd /cluster/data/danRer2/bed/multiz6way mkdir /gbdb/danRer2/multiz6way ln -s /cluster/data/danRer2/bed/multiz6way/multiz6way.maf \ /gbdb/danRer2/multiz6way hgLoadMaf danRer2 multiz6way # Loaded 1230321 mafs in 1 files from /gbdb/danRer2/multiz6way # took about 2 minutes to load hgLoadMafSummary -minSize=10000 -mergeGap=500 -maxSize=50000 danRer2 \ multiz6waySummary multiz6way.maf # Processed 2565228 components in 1230321 mafs from multiz6way.maf # took about 2 minutes to load # Dropped unused indexes (2006-05-09 kate) # NOTE: this is not required in the future, as the loader # has been fixed to not generate these indexes hgsql danRer2 -e "alter table multiz6waySummary drop index chrom_2" hgsql danRer2 -e "alter table multiz6waySummary drop index chrom_3" # create tree image - like tree.nh but with common names, added zebrafish # to tree (2005-06-07, hartera): cat << '_EOF_' > species6.nh (((human,mouse),opossum),((tetraodon,fugu),zebrafish)) '_EOF_' /cluster/bin/phast/draw_tree -b -s species6.nh > species6.ps # used GIMP to reduce the amount of whitespace to make it # smaller, then save as jpg display species6.jpg # use the Transform->Chop and then Direction->horizontal or vertical # to reduce the tree size while keeping label sizes the same cp species6.jpg /usr/local/apache/htdocs/images/phylo/DanRer2_6way.jpg # change permissions for display chmod +r /usr/local/apache/htdocs/images/phylo/DanRer2_6way.jpg # add all.joiner entry for danRer2 multiz6way (2005-05-31, hartera) # add trackDb.ra entry # track multiz6way # shortLabel 6-Way Conservation # longLabel 6-Way Vertebrate Multiz Alignment & Conservation # group compGeno # priority 109 # visibility hide # color 0, 10, 100 # altColor 0,90,10 # type wigMaf 0.0 1.0 # maxHeightPixels 100:40:11 # #wiggle phastCons6way # yLineOnOff Off # autoScale Off # summary multiz6waySummary # speciesGroups mammal vertebrate # sGroup_mammal hg17 mm6 monDom1 # sGroup_vertebrate fr1 tetNig1 # treeImage phylo/DanRer2_6way.jpg # PHYLO-HMM (PHASTCONS) CONSERVATION TRACK FOR 6-WAY ALIGNMENT # (DONE, 2005-06-01, hartera) # Tetraodon and Fugu are quite distant from zebrafish, more distant than # human/chicken so use the HoxD55.q matrix in future for blastz with these # species. Here the pairwise alignments of zebrafish with Tetraodon and Fugu # that were used for the 6-way multiz were done using the default matrix # for blastz. (2005-06-01, hartera) ssh kkstore01 mkdir /cluster/data/danRer2/bed/multiz6way/cons cd /cluster/data/danRer2/bed/multiz6way/cons # create a starting-tree.mod based on chr5 (67Mb - largest chrom) # chr5 is the largest chrom apart from NA and Un /cluster/bin/phast/msa_split ../maf/chr5.maf \ --refseq ../../../5/chr5.fa --in-format MAF \ --windows 100000000,1000 --out-format SS \ --between-blocks 5000 --out-root s1 # takes about 2 minutes /cluster/bin/phast/phyloFit -i SS s1.*.ss \ --tree "((tetNig1,fr1),((mm6,hg17),monDom1))" \ --out-root starting-tree # only takes about 5 minutes rm s1.*.ss # Get genome-wide average GC content (for all species together, # not just the reference genome). If you have a globally # estimated tree model, as above, you can get this from the # BACKGROUND line in the .mod file. 
# Get the genome-wide average GC content (for all species together,
# not just the reference genome). If you have a globally estimated
# tree model, as above, you can get this from the BACKGROUND line
# in the .mod file, e.g.:
# ALPHABET: A C G T
# ...
# BACKGROUND: 0.309863 0.189473 0.189412 0.311252
# This implies a GC content of 0.1895 + 0.1894 = 0.379
# If you do *not* have a global tree model and you do not know
# your GC content, you can get it directly from the MAFs with
# a command like:
# /cluster/bin/phast/msa_view \
#    --aggregate danRer2,tetNig1,fr1,mm6,hg17,monDom1 -i MAF \
#    --summary-only /cluster/data/danRer2/bed/multiz6way/maf/chr*.maf \
#    > maf_summary.txt
# this gives a similar GC content of 0.4026

# break up the genome-wide MAFs into pieces
ssh kkstore01
mkdir -p /cluster/bluearc/danRer2/cons/ss
cd /cluster/bluearc/danRer2/cons/ss
bash
for C in `awk '{print $1}' /cluster/data/danRer2/chrom.sizes`
do
    if [ -s /cluster/data/danRer2/bed/multiz6way/maf/${C}.maf ]; then
        echo msa_split $C
        chrN=${C/chr/}
        /cluster/bin/phast/msa_split \
            /cluster/data/danRer2/bed/multiz6way/maf/${C}.maf \
            --refseq /cluster/data/danRer2/${chrN}/${C}.fa \
            --in-format MAF --windows 1000000,0 --between-blocks 5000 \
            --out-format SS -I 1000 --out-root ${C}
    fi
done
# took 15 minutes to run

# Create a random list of 50 1 Mb regions (do not use NA and Un)
ls -l | grep -v "NA" | grep -v "Un" > fileList
cat fileList | awk '$5 > 4000000 {print $9;}' | \
    randomLines stdin 50 ../randomSs
rm fileList

# Set up parasol directory to calculate trees on these 50 regions
ssh kk9
mkdir -p /cluster/bluearc/danRer2/cons/treeRun1
cd /cluster/bluearc/danRer2/cons/treeRun1
mkdir tree log
# Now set up a cluster job to estimate model parameters. Parameters
# will be estimated separately for each alignment fragment and then
# combined across fragments. When tuning, come back to here to
# recalculate.
# Create a little script that calls phastCons with the right arguments
cat > makeTree << '_EOF_'
/cluster/bin/phast/phastCons ../ss/$1.ss \
/cluster/data/danRer2/bed/multiz6way/cons/starting-tree.mod \
--gc 0.403 --nrates 1,1 --no-post-probs --ignore-missing \
--expected-lengths 12 --target-coverage 0.17 \
--quiet --log log/$1 --estimate-trees tree/$1
'_EOF_'
# emacs happy
chmod a+x makeTree
# The --gc 0.403 value is the GC content from msa_view, as this is
# calculated from the whole genome. Also, notice the target coverage
# of 0.17: here we are aiming for 65% coverage of coding regions by
# conserved elements.
# Create gensub file
cat > template << '_EOF_'
#LOOP
makeTree $(root1)
#ENDLOOP
'_EOF_'
# happy emacs
# Make cluster job and run it
gensub2 ../randomSs single template jobList
para create jobList
para try/push/check/etc.
# para time
# Completed: 50 of 50 jobs
# CPU time in finished jobs:    4013s    66.89m   1.11h   0.05d  0.000 y
# IO & Wait Time:                139s     2.31m   0.04h   0.00d  0.000 y
# Average job time:               83s     1.38m   0.02h   0.00d
# Longest running job:             0s     0.00m   0.00h   0.00d
# Longest finished job:          135s     2.25m   0.04h   0.00d
# Submission to last job:       5260s    87.67m   1.46h   0.06d

# Now combine the parameter estimates. The .mod files can be averaged
# with phyloBoot. This must be done separately for the conserved
# and nonconserved models.
ls tree/*.cons.mod > cons.txt
/cluster/bin/phast/phyloBoot --read-mods '*cons.txt' \
    --output-average ../ave.cons.mod > cons_summary.txt
ls tree/*.noncons.mod > noncons.txt
/cluster/bin/phast/phyloBoot --read-mods '*noncons.txt' \
    --output-average ../ave.noncons.mod > noncons_summary.txt
cd ..
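# As a cross-check of the --gc value used above, a one-liner sketch
# (assuming the BACKGROUND: line format shown earlier) to read the GC
# content implied by the averaged nonconserved model:
awk '/^BACKGROUND:/ {printf "GC content: %.4f\n", $3 + $4}' ave.noncons.mod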
cp -p ave.*.mod /cluster/data/danRer2/bed/multiz6way/cons
# Measure the entropy of the averaged models:
#   consEntropy <target-coverage> <expected-lengths> ave.cons.mod ave.noncons.mod
# (consEntropy never stops when given the --NH 9.78 argument, so omit it)
# The target entropy should be L_min*H = 9.8 bits; the expected length
# that produces this entropy is the one to use for phastCons.
/cluster/bin/phast/consEntropy 0.17 12 \
    ave.cons.mod ave.noncons.mod
# The above steps were repeated with the target coverage and expected
# lengths parameters set as below (a loop sketch for scripting these
# repeats follows the list):

# -target-coverage=0.17 -expected-lengths=12
# Transition parameters: gamma=0.170000, omega=12.000000, mu=0.083333, nu=0.017068
# Relative entropy: H=0.768727 bits/site
# Expected min. length: L_min=13.932145 sites
# Expected max. length: L_max=8.869545 sites
# Total entropy: L_min*H=10.710017 bits

# -target-coverage=0.20 -expected-lengths=10
# Transition parameters: gamma=0.200000, omega=10.000000, mu=0.100000, nu=0.025000
# Relative entropy: H=0.799627 bits/site
# Expected min. length: L_min=12.358880 sites
# Expected max. length: L_max=7.593192 sites
# Total entropy: L_min*H=9.882496 bits

# -target-coverage=0.20 -expected-lengths=9
# Transition parameters: gamma=0.200000, omega=9.000000, mu=0.111111, nu=0.027778
# Relative entropy: H=0.805459 bits/site
# Expected min. length: L_min=12.022440 sites
# Expected max. length: L_max=7.111297 sites
# Total entropy: L_min*H=9.683580 bits

# -target-coverage=0.25 -expected-lengths=10
# Transition parameters: gamma=0.250000, omega=10.000000, mu=0.100000, nu=0.033333
# Relative entropy: H=0.843206 bits/site
# Expected min. length: L_min=10.846871 sites
# Expected max. length: L_max=6.973477 sites
# Total entropy: L_min*H=9.146148 bits

# -target-coverage=0.35 -expected-lengths=12
# Transition parameters: gamma=0.350000, omega=12.000000, mu=0.083333, nu=0.044872
# Relative entropy: H=0.917684 bits/site
# Expected min. length: L_min=9.169807 sites
# Expected max. length: L_max=6.687135 sites
# Total entropy: L_min*H=8.414989 bits

# -target-coverage=0.35 -expected-lengths=14
# Transition parameters: gamma=0.350000, omega=14.000000, mu=0.071429, nu=0.038462
# Relative entropy: H=0.903639 bits/site
# Expected min. length: L_min=9.778765 sites
# Expected max. length: L_max=7.300778 sites
# Total entropy: L_min*H=8.836478 bits

# -target-coverage=0.35 -expected-lengths=18
# Transition parameters: gamma=0.350000, omega=18.000000, mu=0.055556, nu=0.029915
# Relative entropy: H=0.881986 bits/site
# Expected min. length: L_min=10.798324 sites
# Expected max. length: L_max=8.320602 sites
# Total entropy: L_min*H=9.523968 bits

# -target-coverage=0.35 -expected-lengths=19
# Transition parameters: gamma=0.350000, omega=19.000000, mu=0.052632, nu=0.028340
# Relative entropy: H=0.877572 bits/site
# Expected min. length: L_min=11.021340 sites
# Expected max. length: L_max=8.542773 sites
# Total entropy: L_min*H=9.672025 bits

# -target-coverage=0.32 -expected-lengths=20
# Transition parameters: gamma=0.320000, omega=20.000000, mu=0.050000, nu=0.023529
# Relative entropy: H=0.853466 bits/site
# Expected min. length: L_min=11.824480 sites
# Expected max. length: L_max=9.084203 sites
# Total entropy: L_min*H=10.091797 bits

#### !!! THE PARAMETERS BELOW WERE THOSE THAT WERE FINALLY USED !!! ####
# -target-coverage=0.32 -expected-lengths=18
# Transition parameters: gamma=0.320000, omega=18.000000, mu=0.055556, nu=0.026144
# Relative entropy: H=0.860401 bits/site
# Expected min. length: L_min=11.402969 sites
# Expected max. length: L_max=8.648243 sites
# Total entropy: L_min*H=9.811131 bits
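# The loop sketch referenced above (hypothetical helper, not part of the
# original run) to script the consEntropy repeats over candidate
# target-coverage/expected-lengths pairs:
foreach p ("0.17 12" "0.20 10" "0.20 9" "0.25 10" "0.35 19" "0.32 18")
    echo "== target-coverage expected-lengths: $p =="
    /cluster/bin/phast/consEntropy $p ave.cons.mod ave.noncons.mod
end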
# Need to iterate to get the right coverage and parameters: run
# phastCons below with the parameters chosen above and check the
# coverage of coding regions by the most conserved elements.

# Create cluster dir to do the main phastCons run
ssh kk9
mkdir /cluster/bluearc/danRer2/cons/consRun1
cd /cluster/bluearc/danRer2/cons/consRun1
mkdir ppRaw bed
# Create script to run phastCons with the right parameters.
# This job is I/O intensive in its output files. To make it
# cluster safe, it would be better to do this work somewhere over
# in /tmp/... and copy the final result back. kk9 can do this
# run, but kk cannot.
# Run the whole genome; this goes fast in this case.
cat > doPhast << '_EOF_'
mkdir -p ppRaw/$2
/cluster/bin/phast/phastCons ../ss/$1.ss ../ave.cons.mod,../ave.noncons.mod \
--expected-lengths 12 --target-coverage 0.17 --quiet --seqname $2 \
--idpref $2 --viterbi bed/$1.bed --score --require-informative 0 > \
ppRaw/$2/$1.pp
'_EOF_'
# emacs happy
chmod a+x doPhast
# Create gsub file
cat > template << '_EOF_'
#LOOP
doPhast $(file1) $(root1)
#ENDLOOP
'_EOF_'
# happy emacs
# Create parasol batch and run it for the whole genome
ls -1 ../ss | sed 's/.ss//' > in.lst
gensub2 in.lst single template jobList
para create jobList
para try/check/push/etc.
# para time
# Completed: 1604 of 1604 jobs
# CPU time in finished jobs:   11005s   183.42m   3.06h   0.13d  0.000 y
# IO & Wait Time:              30912s   515.19m   8.59h   0.36d  0.001 y
# Average job time:               26s     0.44m   0.01h   0.00d
# Longest running job:             0s     0.00m   0.00h   0.00d
# Longest finished job:           55s     0.92m   0.02h   0.00d
# Submission to last job:        608s    10.13m   0.17h   0.01d

# combine predictions and transform scores to be in the 0-1000 interval;
# this uses a lot of memory, so run on kolossus:
ssh kolossus
cd /cluster/bluearc/danRer2/cons/consRun1
catDir bed | awk ' \
    {printf "%s\t%d\t%d\tlod=%d\t%s\n", $1, $2, $3, $5, $5;}' | \
    /cluster/bin/scripts/lodToBedScore /dev/stdin > \
    /cluster/data/danRer2/bed/multiz6way/mostConserved.bed

# Figure out how much is actually covered by the bed files as follows:
ssh kkstore01
cd /cluster/data/danRer2/bed/multiz6way
foreach f ( `cat /cluster/data/danRer2/chrom.lst` )
    cat $f/chr${f}.fa >> allChroms.fa
end
faSize allChroms.fa
# 1589273282 bases (101846391 N's 1487426891 real 813899141 upper
# 673527750 lower) in 28 sequences in 1 files
# Total size: mean 56759760.1 sd 58536773.4 min 16596 (chrM)
# max 318297480 (chrNA) median 42763181
awk '{sum+=$3-$2} \
    END{printf "%% %.2f = 100.0*%d/1487426891\n",100.0*sum/1487426891,sum}' \
    mostConserved.bed
# Coverage of the non-N genome (1487426891 bases, from faSize on all
# chroms) for each parameter setting tried:
# -target-coverage 0.17, length 12: % 1.99 = 100.0*29631918/1487426891
# -target-coverage 0.20, length 10: % 1.96 = 100.0*29183170/1487426891
# -target-coverage 0.25, length 10: % 2.11 = 100.0*31391360/1487426891
# -target-coverage 0.35, length 19: % 2.69 = 100.0*39940068/1487426891
# -target-coverage 0.32, length 18: % 2.60 = 100.0*38730791/1487426891

# check with featureBits
ssh hgwdev
cd /cluster/data/danRer2/bed/multiz6way
# get an OR of refGene and mgcGenes CDS regions
featureBits danRer2 refGene:cds mgcGenes:cds -or -bed=refSeqOrMgcCds.bed
# 9745850 bases of 1560497282 (0.625%) in intersection
featureBits danRer2 refSeqOrMgcCds.bed mostConserved.bed -enrichment
# Results for each parameter setting tried (a loop sketch over the
# candidate element sets follows this list):
# -target-coverage=0.17 -expected-lengths=12
#   refSeqOrMgcCds.bed 0.625%, mostConserved.bed 1.899%, both 0.350%,
#   cover 56.07%, enrich 29.53x
# -target-coverage=0.20 -expected-lengths=10
#   refSeqOrMgcCds.bed 0.625%, mostConserved.bed 1.870%, both 0.347%,
#   cover 55.54%, enrich 29.70x
# -target-coverage=0.25 -expected-lengths=10
#   refSeqOrMgcCds.bed 0.625%, mostConserved4.bed 2.012%, both 0.364%,
#   cover 58.35%, enrich 29.01x
# -target-coverage=0.35 -expected-lengths=19
#   refSeqOrMgcCds.bed 0.625%, mostConserved5.bed 2.559%, both 0.431%,
#   cover 68.98%, enrich 26.95x
# -target-coverage=0.32 -expected-lengths=18
#   refSeqOrMgcCds.bed 0.625%, mostConserved6.bed 2.482%, both 0.423%,
#   cover 67.75%, enrich 27.30x
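# The loop sketch referenced above (assumes the numbered
# mostConserved*.bed candidate files named in the results were kept
# side by side in this directory):
foreach b (mostConserved*.bed)
    echo $b
    featureBits danRer2 refSeqOrMgcCds.bed $b -enrichment
end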
# looking at ensGene:
# featureBits danRer2 ensGene:cds mostConserved.bed -enrichment
# ensGene:cds 2.135%, mostConserved.bed 2.482%, both 1.210%,
# cover 56.67%, enrich 22.83x
# So use the result with -target-coverage=0.32 -expected-lengths=18:
# total entropy is 9.81, which is good as we are aiming for 9.8 and for
# 65% coverage of coding regions by most conserved elements; here the
# coverage is 67.8%.

# Load most conserved track into database
ssh hgwdev
cd /cluster/data/danRer2/bed/multiz6way
hgLoadBed danRer2 phastConsElements mostConserved.bed
# Loaded 393654 elements of size 5
# less than 1 minute load time
featureBits danRer2 -enrichment refSeqOrMgcCds.bed phastConsElements
# refSeqOrMgcCds.bed 0.625%, phastConsElements 2.482%, both 0.423%,
# cover 67.75%, enrich 27.30x
featureBits danRer2 mgcGenes:cds phastConsElements -enrichment
# mgcGenes:cds 0.486%, phastConsElements 2.482%, both 0.336%,
# cover 69.06%, enrich 27.82x
featureBits danRer2 refGene:cds phastConsElements -enrichment
# refGene:cds 0.597%, phastConsElements 2.482%, both 0.400%,
# cover 66.97%, enrich 26.98x

# Create merged posterior probability file and wiggle track data files
ssh kkstore01
cd /cluster/bluearc/danRer2/cons/consRun1
# Interesting sort here on the chrom name and position: first convert
# all . and - characters to the special strings x/ and x_/ to get a
# consistent delimiter of / for all fields to be sorted; then sort on
# the chrom name and the start position; after the sort, convert the
# special strings x_/ and x/ back to - and . respectively. This gets
# everything in order by chrom name and chrom start. (A tiny
# illustration of the delimiter trick follows this step.)
find ./ppRaw -type f | sed -e "s#\.#x/#g; s#-#x_/#g" | \
    sort -t"/" -k4,4 -k6,6n | sed -e "s#x_/#-#g; s#x/#.#g" | xargs cat | \
    wigEncode stdin phastCons6way.wig phastCons6way.wib
# about 3 minutes for the above
ssh kkstore01
cd /cluster/bluearc/danRer2/cons/consRun1
cp -p phastCons6way.wi? /cluster/data/danRer2/bed/multiz6way/cons
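# A tiny illustration of the delimiter trick above, run on a
# hypothetical file name (not part of the original build):
echo "./ppRaw/chr1/chr1.1-1000000.pp" | sed -e "s#\.#x/#g; s#-#x_/#g"
# gives: x//ppRaw/chr1/chr1x/1x_/1000000x/pp
# With / as the only delimiter, field 4 (chrom) and field 6 (start
# coordinate) become clean keys for sort -t"/" -k4,4 -k6,6n.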
# Load gbdb and database with wiggle
ssh hgwdev
cd /cluster/data/danRer2/bed/multiz6way/cons
mkdir -p /gbdb/danRer2/wib
ln -s `pwd`/phastCons6way.wib /gbdb/danRer2/wib/phastCons6way.wib
# use this if the table needs to be reloaded:
hgsql -e 'drop table phastCons6way;' danRer2
# load table
hgLoadWiggle danRer2 phastCons6way phastCons6way.wig

# Create histogram to get an overview of all the data
ssh hgwdev
cd /cluster/data/danRer2/bed/multiz6way/cons
bash
time hgWiggle -doHistogram \
    -hBinSize=0.001 -hBinCount=1000 -hMinVal=0.0 -verbose=2 \
    -db=danRer2 phastCons6way > histogram.data 2>&1
# real 3m1.185s  user 2m19.440s  sys 0m11.850s

# create plot of histogram:
cat << '_EOF_' > histo.gp
set terminal png small color \
x000000 xffffff xc000ff x66ff66 xffff00 x00ffff xff0000
set size 1.4, 0.8
set key left box
set grid noxtics
set grid ytics
set title " Zebrafish danRer2 Histogram phastCons6 track"
set xlabel " phastCons6 score"
set ylabel " Relative Frequency"
set y2label " Cumulative Relative Frequency (CRF)"
set y2range [0:1]
set y2tics
set yrange [0:0.02]
plot "histogram.data" using 2:5 title " RelFreq" with impulses, \
"histogram.data" using 2:7 axes x1y2 title " CRF" with lines
'_EOF_'
# happy emacs
gnuplot histo.gp > histo.png
display histo.png &

# Add the line "wiggle phastCons6way" to the trackDb.ra entry for
# multiz6way to display the wiggle for the conservation track.
# Copy over the html for multiz and edit.

# PHASTCONS SCORES DOWNLOADABLES (DONE, 2005-05-31, hartera)
# prepare compressed copy of ascii data values for downloads
ssh kkstore01
cd /cluster/bluearc/danRer2/cons/consRun1
cat << '_EOF_' > gzipAscii.sh
#!/bin/sh
TOP=`pwd`
export TOP
mkdir -p phastCons6wayScores
ls ppRaw | while read D
do
    out=${TOP}/phastCons6wayScores/${D}.gz
    echo -n "$out ... "
    cd ${TOP}/ppRaw/${D}
    gzip -c `ls *.pp | sed -e "s#-#.x-x.#g;" | \
        sort -t"." -k1,1 -k2,2n | sed -e "s#.x-x.#-#g;"` > ${out}
    echo "done"
done
'_EOF_'
# happy emacs
chmod +x gzipAscii.sh
time ./gzipAscii.sh
# 104.769u 8.789s 3:00.18 63.0%   0+0k 0+0io 11pf+0w
# 182 Mb of data

# copy scores for downloads
ssh kkstore01
mkdir /cluster/data/danRer2/bed/multiz6way/phastCons6wayScores
cd /cluster/data/danRer2/bed/multiz6way/phastCons6wayScores
rsync -a --progress \
    /cluster/bluearc/danRer2/cons/consRun1/phastCons6wayScores/ .
ssh hgwdev
mkdir /usr/local/apache/htdocs/goldenPath/danRer2/phastCons6wayScores
cd /usr/local/apache/htdocs/goldenPath/danRer2/phastCons6wayScores
ln -s /cluster/data/danRer2/bed/multiz6way/phastCons6wayScores/*.gz .
md5sum *.gz > md5sum.txt
# copy over and edit README.txt
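# A quick integrity check on the compressed score files before
# announcing the downloads (a sketch, not part of the original run):
foreach f (*.gz)
    gzip -t $f || echo "corrupt: $f"
end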
# MULTIZ 6-WAY DOWNLOADABLES (DONE, 2005-05-31, hartera)
ssh hgwdev
cd /usr/local/apache/htdocs/goldenPath/danRer2
mkdir -p multiz6way
cd multiz6way
foreach f (/cluster/data/danRer2/bed/multiz6way/maf/*.maf)
    set c = $f:r:t
    echo $c
    nice gzip -c $f > $c.maf.gz
end
# copy README and edit

# Create upstream files for download
ssh hgwdev
cd /cluster/data/danRer2/bed/multiz6way
echo danRer2 tetNig1 fr1 mm6 hg17 monDom1 > org.txt
# mafFrags is quick for these
foreach i (1000 2000 5000)
    echo "making upstream$i.maf"
    nice featureBits danRer2 refGene:upstream:$i -fa=/dev/null -bed=up.bad
    awk -F '\t' '{printf("%s\t%s\t%s\t%s\t%s\t%s\n", $1, $2, $3, substr($4, 0, 9), 0, $5)}' up.bad > up.bed
    rm up.bad
    nice mafFrags danRer2 multiz6way up.bed upstream$i.maf -orgs=org.txt
    rm up.bed
end
# took less than 3 minutes
ssh kkstore01
cd /cluster/data/danRer2/bed/multiz6way
nice gzip upstream{1000,2000,5000}.maf
ssh hgwdev
cd /usr/local/apache/htdocs/goldenPath/danRer2
mv /cluster/data/danRer2/bed/multiz6way/upstream*.maf.gz multiz6way
cd multiz6way
md5sum *.gz > md5sum.txt

###########################################################################
# LIFTOVER CHAINS TO DANRER3 (DONE, 2006-04-18, hartera)
# The split (using makeLoChain-split) of danRer3 is doc'ed in
# makeDanRer3.doc. Do what makeLoChain-split says to do next
# (start the blat alignment):
ssh kk
mkdir -p /cluster/data/danRer2/bed/liftOver
cd /cluster/data/danRer2/bed/liftOver
makeLoChain-align danRer2 /iscratch/i/danRer2/nib danRer3 \
    /iscratch/i/danRer3/split10k \
    /iscratch/i/danRer3/danRer3_11.ooc >&! align.log &
# Took about 4 minutes.
# Do what its output says to do next (start cluster job)
cd /cluster/data/danRer2/bed/blat.danRer3.2006-04-17/run
para try, check, push, check, ...
para time >&! run.time
# Completed: 784 of 784 jobs
# CPU time in finished jobs: 4022758s  67045.97m  1117.43h  46.56d  0.128 y
# IO & Wait Time:              71872s   1197.86m    19.96h   0.83d  0.002 y
# Average job time:             5223s     87.05m     1.45h   0.06d
# Longest running job:             0s      0.00m     0.00h   0.00d
# Longest finished job:        42935s    715.58m    11.93h   0.50d
# Submission to last job:      42935s    715.58m    11.93h   0.50d

# lift alignments
ssh kkr1u00
cd /cluster/data/danRer2/bed/liftOver
makeLoChain-lift danRer2 danRer3 >&! lift.log &
# Took about 10 minutes to run.

# chain alignments
ssh kki
cd /cluster/data/danRer2/bed/liftOver
makeLoChain-chain danRer2 /iscratch/i/danRer2/nib \
    danRer3 /iscratch/i/danRer3/nib >&! chain.log &
# Do what its output says to do next (start cluster job)
cd /cluster/data/danRer2/bed/blat.danRer3.2006-04-17/chainRun
para try, check, push ...
para time >&! run.time
# Completed: 28 of 28 jobs
# CPU time in finished jobs:    2697s    44.95m   0.75h   0.03d  0.000 y
# IO & Wait Time:                548s     9.13m   0.15h   0.01d  0.000 y
# Average job time:              116s     1.93m   0.03h   0.00d
# Longest running job:             0s     0.00m   0.00h   0.00d
# Longest finished job:          439s     7.32m   0.12h   0.01d
# Submission to last job:        439s     7.32m   0.12h   0.01d

# net alignment chains
ssh kkstore04
cd /cluster/data/danRer2/bed/liftOver
makeLoChain-net danRer2 danRer3 >&! net.log &
# Took about 20 minutes to run.

# load reference to over.chain into database table,
# and create symlinks in /gbdb and the download area
ssh hgwdev
cd /cluster/data/danRer2/bed/liftOver
makeLoChain-load danRer2 danRer3 >&! load.log &
# test by converting a region using the "convert" link on
# the browser, and comparing to blat of the same region
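# A command-line sketch of that spot check (the over.chain path assumes
# the usual makeLoChain-load naming convention; the test region is
# hypothetical):
awk 'BEGIN {OFS="\t"; print "chr1", 1000000, 1001000}' > test.bed
liftOver test.bed \
    /gbdb/danRer2/liftOver/danRer2ToDanRer3.over.chain.gz \
    test.danRer3.bed test.unmapped.bed
cat test.danRer3.bed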
###########################################################################
# SPLIT SEQUENCE FOR LIFTOVER CHAINS FROM DANRER3 (DONE, 2006-04-25, hartera)
# followed the instructions used in makePanTro2.doc
ssh kkr1u00
cd /cluster/data/danRer2/bed
mkdir -p liftOver
cd liftOver
makeLoChain-split danRer2 /cluster/data/danRer2/nib >&! split.log &
# Took about 30 minutes to run.

#########################################################################
## Reorder Fish organisms (DONE - 2006-12-22 - Hiram)
hgsql -h genome-testdb hgcentraltest \
    -e "update dbDb set orderKey = 452 where name = 'danRer2';"

##########################################################################
# GenBank gbMiscDiff table (markd 2007-01-10)
# Supports the `NCBI Clone Validation' section of the mgcGenes details page.
# GenBank release 157.0 now contains misc_diff fields for MGC clones;
# reloading the mRNAs results in the gbMiscDiff table being created.
./bin/gbDbLoadStep -reload -srcDb=genbank -type=mrna danRer2

##########################################################################
# BACENDS CLEANUP (DONE, 2007-03-27, hartera)
ssh kkstore04
cd /cluster/data/danRer2/bed/
du -sh ZonLab
# 150G    ZonLab/
cd ZonLab
# Remove old BAC ENDS directories
nice rm -r bacends2 bacends3
cd bacends
rm bacEndsCov* bacEndsMinCov* bl2seq.out blast.out diff bacEnds \
    bacEnds.names.uniq jobList
nice rm -r psl err minCov10MinAli92
# bacEnds.lifted.psl is the lifted version of bacEnds.psl, so remove the latter
rm bacEnds.psl batch batch.bak *.lst para.results names.test template *.log
cd bacends.1
rm accs* bacEndAccs.aliases *.log allBacEnds* zfishaccs* diffaccs bacEnds \
    BACEnd*
rm -r test test2
nice gzip *.psl
cd ../pairs
# bacEndSingles.bed is already in the singles directory
rm bacEnds.* bed.tab bacEndSingles.bed psl.tab bacEndSingles.sql *.bak
rm -r old test
cd ../singles
rm bacEnds.*
rm -r old
cd ../scores
rm *.counts *.count *.hits good.txt error.log bed.tab bad.txt
rm -r test
cd ../duplicates
rm -r old2 test test1
rm error* sing *.counts singles.log.may10 log* out* bed.tab err
rm list list.aligns singles* *.uniq *.tmp *.tmp2 100names* pairs.* \
    pairsTest*
rm -r badPrsTest pairsTest
cd ../cloneandStsAliases
rm *.bak dr1* dr2* xref.names.uniq zfishNew* zfishOld* *.tab accs.*
mv testTables/*.pl .
rm -r test testTables
cd /cluster/data/danRer2/bed/
du -sh ZonLab
# 46G     ZonLab/
# removed 104 GB
##########################################################################