# for emacs: -*- mode: sh; -*-

# Danio rerio (zebrafish) from Sanger, version Zv7 (released 07/13/07)

# Project website:
#   http://www.sanger.ac.uk/Projects/D_rerio/
# Assembly notes:
#   http://www.sanger.ac.uk/Projects/D_rerio/
#   ftp://ftp.ensembl.org/pub/assembly/zebrafish/Zv7release/Zv7_assembl_information.shmtl

# NOTE: this doc may have genePred loads that fail to include
# the bin column. Please correct that for the next build by adding
# a bin column when you make any of these tables:
#

###########################################################################
# DOWNLOAD SEQUENCE (DONE, 2007-07-16, hartera)
# MOVE FILES TO SEPARATE DIRECTORY (DONE, 2007-07-20, hartera)
    ssh kkstore06
    mkdir /cluster/store4/danRer5
    ln -s /cluster/store4/danRer5 /cluster/data
    cd /cluster/data/danRer5
    wget --timestamp \
      ftp://ftp.ensembl.org/pub/assembly/zebrafish/Zv7release/README
    wget --timestamp \
      ftp://ftp.ensembl.org/pub/assembly/zebrafish/Zv7release/Zv7_chr.agp
    wget --timestamp \
      ftp://ftp.ensembl.org/pub/assembly/zebrafish/Zv7release/Zv7_contigs.fa
    wget --timestamp \
      ftp://ftp.ensembl.org/pub/assembly/zebrafish/Zv7release/Zv7_scaffold.agp
    wget --timestamp \
      ftp://ftp.ensembl.org/pub/assembly/zebrafish/Zv7release/Zv7_scaffolds.fa
    # move assembly files to a separate directory
    mkdir Zv7release
    mv *.agp *.fa README ./Zv7release/

###########################################################################
# CREATE AGP FILES FOR RANDOMS (chrNA and chrUn)
# (2007-08-16 - 2007-08-18, hartera)
    ssh kkstore06
    mkdir /cluster/data/danRer5/Zv7release/randoms
    cd /cluster/data/danRer5/Zv7release/randoms
    # first find the contigs assigned to chromosomes:
    awk '{if ($5 !~ /N/) print $6}' ../Zv7_chr.agp | sort | uniq \
        > chromContigs.txt
    # get list of all contigs:
    awk '{if ($5 !~ /N/) print $6}' ../Zv7_scaffold.agp | sort | uniq \
        > allContigs.txt
    # find contigs not assigned to a chromosome
    comm -23 allContigs.txt chromContigs.txt > contigsRandomsAndAccs.txt
    # get all those that are in Zv7_scaffold.agp and get a list of
    # scaffold names.
    foreach c (`cat contigsRandomsAndAccs.txt`)
        grep -w $c ../Zv7_scaffold.agp >> contigsRandomsScafs.agp
    end
    # check that all the contig/clone names in contigsRandomsAndAccs.txt
    # are also in contigsRandomsScafs.agp
    awk '{if ($6 != 100) print $6}' contigsRandomsScafs.agp \
        | sort | uniq > names.sort
    wc -l names.sort contigsRandomsAndAccs.txt
    # 19845 names.sort
    # 19845 contigsRandomsAndAccs.txt
    comm -12 contigsRandomsAndAccs.txt names.sort | wc -l
    # 19845
    # get list of scaffold names
    awk '{print $1}' contigsRandomsScafs.agp | sort -k 1.7,1n | uniq \
        > scaffoldsRandoms.txt
    # make an AGP of the randoms scaffolds only
    grep -w -f scaffoldsRandoms.txt ../Zv7_scaffold.agp > randomsScaffold.agp
    # check that we have all the scaffolds to be selected
    awk '{print $1}' randomsScaffold.agp | sort | uniq > names2.sort
    sort scaffoldsRandoms.txt > scafsTmp.sort
    wc -l scafsTmp.sort names2.sort
    # 5010 scafsTmp.sort
    # 5010 names2.sort
    comm -12 scafsTmp.sort names2.sort | wc -l
    # 5010
    # all scaffolds that occur in scaffoldsRandoms.txt are in the new AGP file
    # get the list of contigs from the AGP
    awk '{if ($5 !~ /N/) print $6}' randomsScaffold.agp | sort | uniq \
        > randomContigs.list
    # extract the FASTA sequences for just these contigs
    faSomeRecords ../Zv7_contigs.fa randomContigs.list Zv7contigs_random.fa
    # remove the excess part of the names for the contigs with accessions
    perl -pi.bak -e 's/^(>[A-Z]{2}[0-9]+\.[0-9]+)\s.+$/$1/' Zv7contigs_random.fa
    # check that all the contigs from the list are in the FASTA file
    grep '>' Zv7contigs_random.fa | sed -e 's/^>//' | sort | uniq \
        > Zv7contigs.list
    wc -l *.list
    # 19736 Zv7contigs.list
    # 19845 randomContigs.list
    comm -13 Zv7contigs.list randomContigs.list > notFound
    wc -l notFound
    # 109 notFound
    # (2007-08-06, hartera)
    # RE-DONE (2007-08-18, hartera)
    # These are present in the original, but there are several sequences for
    # each accession since there are Ns in the sequence. Need to fix the names
    # in the AGP to match each part of the sequence in the FASTA file.
    # First, get the list of headers from the FASTA for all contigs
    grep '>' ../Zv7_contigs.fa > contigs.headers
    # remove ">"
    perl -pi.bak -e 's/>//' contigs.headers
    foreach a (`cat notFound`)
        grep $a contigs.headers >> contigsHeaders.notFound
    end
    awk '{print $1}' contigsHeaders.notFound > contigsHeaders.notFound.accs
    perl -pi.bak -e 's/_.+$//' contigsHeaders.notFound.accs
    sort contigsHeaders.notFound.accs | uniq \
        > contigsHeaders.notFound.accs.sort
    sort notFound | uniq > notFound.sort
    wc -l notFound.sort
    # 109 notFound.sort
    wc -l contigsHeaders.notFound.accs.sort
    # 109 contigsHeaders.notFound.accs.sort
    comm -12 notFound.sort contigsHeaders.notFound.accs.sort | wc -l
    # 109
    # So all the notFound accessions are in the list of contig header
    # accessions: contigsHeaders.notFound.accs.sort
    # Then extract the names for the accession parts
    # e.g. BX649254.13_01364  zK228P6.01364  BX649254  1  32480
    # and add them to the AGP file for the correct lines for these
    # components. The last 2 fields above are the start and end coordinates
    # relative to the accession (BX649254.13 in this case). Also, add
    # 50,000 Ns between scaffolds.
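    # The renaming logic can be sketched in perl (a hypothetical sketch
    # only: the real work was done by the custom C program described next,
    # which also inserts the 50,000 N gap rows between scaffolds; the
    # hypothetical name renameAgpFrags.pl and its keying scheme are
    # illustration, not the actual implementation):
    cat << '_EOF_' > renameAgpFrags.pl
#!/usr/bin/perl -w
# Sketch: key fragment names by versionless accession plus
# accession-relative coordinates, then rename matching AGP components.
use strict;

my ($headersFile, $agpFile) = @ARGV;
my %fragName;   # "accession:start:end" -> fragment name

open(HDRS, $headersFile) || die "Can not open $headersFile: $!\n";
while (<HDRS>) {
    # header fields: fragName cloneName accession start end
    # e.g. BX649254.13_01364 zK228P6.01364 BX649254 1 32480
    my ($frag, $clone, $acc, $start, $end) = split;
    $fragName{"$acc:$start:$end"} = $frag;
}
close HDRS;

open(AGP, $agpFile) || die "Can not open $agpFile: $!\n";
while (<AGP>) {
    my @f = split(/\t/);
    if ($f[4] ne "N") {
        # field 6 is the versioned accession; fields 7 and 8 are the
        # accession-relative start and end
        (my $acc = $f[5]) =~ s/\.\d+$//;
        my $key = "$acc:$f[6]:$f[7]";
        $f[5] = $fragName{$key} if (exists($fragName{$key}));
    }
    print join("\t", @f);
}
close AGP;
'_EOF_'
    # << happy emacs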
    # Wrote a program to do this: agpAddCtgNamesAndGaps.c
    /cluster/home/hartera/bin/x86_64/agpAddCtgNamesAndGaps \
        contigsHeaders.notFound randomsScaffold.agp \
        randomsScafFixed.agp >& agpScafs.log
    awk '{print $1}' randomsScaffold.agp | sort | uniq > scafs.lst.uniq
    wc -l scafs.lst.uniq
    # 5010 scafs.lst.uniq
    wc -l *.agp
    # 46688 contigsRandomsScafs.agp
    # 40641 randomsScafFixed.agp
    # 35633 randomsScaffold.agp
    # 35633 + 5010 = 40643 but there are 2 fewer gap rows since there are
    # none at the ends of the random chromosomes. So the number of lines in
    # the AGP file is correct: 40641.
    # get the list of contigs again for the randoms from the FASTA headers
    # and check all of these are present in the AGP file and vice versa.
    cp contigs.headers contigs.names
    # remove the excess part of the names for the contigs with accessions
    perl -pi.bak -e \
        's/^([A-Z]{2}[0-9]+\.[0-9]+_?[0-9]*\.?[0-9]*)\s.+$/$1/' \
        contigs.names
    sort contigs.names | uniq > contigs.names.sort
    awk '{print $6}' randomsScafFixed.agp | sort | uniq > contigsFromAgp.sort
    wc -l contigs*.sort
    # 60092 contigs.names.sort
    # 20351 contigsFromAgp.sort
    comm -13 contigs.names.sort contigsFromAgp.sort
    # CR293502.4
    # CR356227.33
    # CR388165.16
    # CR854948.10
    # CR931788.11
    # CR954226.7
    # CT573234.3
    # CT573263.4
    # CT737182.2
    # CT997808.4
    # These accessions are not matched from the headers to the AGP file.
    # e.g. CR293502.4
    # CR293502.4_00001    zK31A1.00001    CR293502  1       27131
    # CR293502.4_02478    zK31A1.02478    CR293502  27232   29631
    # CR293502.4_01210.0  zK31A1.01210.0  CR293502  119649  233137
    # The last 2 fields above are coordinates relative to the accession and
    # should match fields 7 and 8 in the relevant lines of the AGP file but,
    # in this case, they do not.
    # in the AGP file:
    # Zv7_scaffold2558  1944730  1960624  49  D  CR293502.4  11237   27131   +
    # Zv7_scaffold2558  1960625  1960724  50  N  100  fragment  no
    # Zv7_scaffold2558  1960725  1963124  51  D  CR293502.4  27232   29631   +
    # Zv7_scaffold2558  1963125  1963224  52  N  100  fragment  no
    # Zv7_scaffold2558  1963225  1995007  53  D  CR293502.4  201355  233137  -
    # Coordinates are relative to CR293502.4 in fields 7 and 8
    grep CR293502.4 randomsScaffold.agp > CR293502.4.agp
    # E-mailed Tina Eyre (te3@sanger.ac.uk) and Ian Sealy (is1@sanger.ac.uk)
    # at Sanger to ask them about these discrepancies and how to fix them
    # (2007-08-09).
    # Received the Zv7_all_scaffolds.agp file on 2007-08-14 from Ian Sealy
    # (is1@sanger.ac.uk). This contains the unfinished clones and should not
    # be shared with the public. The contig/clone names match up to those for
    # clone name fragments in the Zv7_contigs.fa file.
    # Received Zv7_all_scaffolds.agp:
    grep CR293502.4 ../Zv7_all_scaffolds.agp > CR293502.4.allScafs.agp
    # Zv7_scaffold2558  1944730  1960624  49  U  CR293502.4_00001
    #   11237  27131   +  zK31A1.00001    15895  27131
    # Zv7_scaffold2558  1960725  1963124  51  U  CR293502.4_02478
    #   1      2400    +  zK31A1.02478    2400   2400
    # Zv7_scaffold2558  1963225  1995007  53  U  CR293502.4_01210.0
    #   81707  113489  -  zK31A1.01210.0  31783  113489
    # Coordinates in fields 7 and 8 of this file are relative to the clone
    # fragment names in field 6.
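    # As a quick sanity check on Zv7_all_scaffolds.agp (a sketch; this
    # assumes every non-gap row has the 12-field layout shown above, with
    # field 11 holding the fragment span size): end - start + 1 in the
    # fragment-relative coordinates (fields 7 and 8) should equal field 11.
    awk '{if ($5 !~ /N/ && (($8 - $7 + 1) != $11)) print;}' \
        ../Zv7_all_scaffolds.agp | wc -l
    # expect 0 if the fragment coordinates are self-consistent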
    foreach f (CR293502*.agp)
        awk \
        '{if ($5 !~ /N/) print "faFrag ", $6".fa", $7-1, $8, $6$7"-"$8".fa";}' \
            $f >> faFrag${f}
    end
    chmod +x faFrag*
    awk '{print $6}' CR293502.4.allScafs.agp > ctgList.txt
    foreach f (`cat ctgList.txt`)
        echo $f > list
        faSomeRecords ../Zv7_contigs.fa list ${f}.fa
    end
    faFragCR293502.4.agp
    # Wrote 15895 bases to CR293502.411237-27131.fa
    # Wrote 2400 bases to CR293502.427232-29631.fa
    # Wrote 31783 bases to CR293502.4201355-233137.fa
    faFragCR293502.4.allScafs.agp
    # Wrote 15895 bases to CR293502.4_0000111237-27131.fa
    # Wrote 2400 bases to CR293502.4_024781-2400.fa
    # Wrote 31783 bases to CR293502.4_01210.081707-113489.fa
    # When diff is run on each pair of files of the same size, the sequence
    # is identical, only the headers are different.
    # Decided to base the assembly on scaffolds, not contigs (2007-08-22)

##########################################################################
# ALL CHROMS AGP (2007-08-18, hartera)
    ssh kkstore06
    cd /cluster/data/danRer5/Zv7release
    awk '{if ($5 !~ /N/) print $6;}' Zv7_all_scaffolds.agp | sort | uniq \
        > contigNamesFromAllScafs.sort
    # compare to contig names from the FASTA file
    comm -13 contigNamesFromAllScafs.sort ./randoms/contigs.names.sort
    # no difference: all contig names from the AGP file are in the FASTA file
    comm -23 contigNamesFromAllScafs.sort ./randoms/contigs.names.sort \
        > notInAllScafs
    wc -l notInAllScafs
    # 3924 notInAllScafs
    grep "Zv7_NA" notInAllScafs | wc -l
    # 3924
    # So the only ones not in the FASTA file are the 3924 Zv7_NA contigs
    # get clone names without underscore and extension
    # remove the excess part of the names for the contigs with accessions
    cp contigNamesFromAllScafs.sort contigNamesFromAllScafs2.sort
    perl -pi.bak -e \
        's/^([A-Z]{2}[0-9]+\.[0-9]+)_?[0-9]*\.?[0-9]*$/$1/' \
        contigNamesFromAllScafs2.sort
    grep -v "Zv7_NA" contigNamesFromAllScafs2.sort \
        | sort | uniq > contigNamesFromAllScafs2NoNAs.sort
    # get list of contigs and clones in randoms only
    awk '{if ($5 !~ /N/) print $6;}' ./randoms/randomsScaffold.agp \
        | sort | uniq > randomsContigsNames.sort
    # remove randoms scaffolds from the list of contigs/clones from
    # Zv7_all_scaffolds.agp
    comm -13 randomsContigsNames.sort contigNamesFromAllScafs2NoNAs.sort \
        > contigNamesFromAllScafs2NoRandoms.txt
    sort contigNamesFromAllScafs2NoRandoms.txt | uniq \
        > contigNamesFromAllScafs2NoRandoms.sort
    # then compare this list to a list of clones/contigs
    # from Zv7_chr.agp
    awk '{if ($5 !~ /N/) print $6;}' Zv7_chr.agp | sort | uniq \
        > chromsAgpContigs.sort
    comm -23 contigNamesFromAllScafs2NoRandoms.sort chromsAgpContigs.sort \
        | wc -l
    # 0
    # So there are no new contigs in the Zv7_all_scaffolds.agp file that
    # are not randoms or Zv7_NA.
    # Try agpAddCtgNamesAndGaps.c on Zv7_chr.agp and see if all
    # clone fragments can be found in the FASTA file.
    cp ./randoms/contigs.headers .
    # get the names from the headers
    awk '{print $1}' contigs.headers | sort | uniq > contigsNames.headers.sort
    # get the contig/clone names from the Zv7_chr.agp file: the sorted file
    # is chromContigs.txt
    comm -13 contigsNames.headers.sort ./randoms/chromContigs.txt \
        > contigsInChromAgpOnly.txt
    wc -l contigsInChromAgpOnly.txt
    # 575 contigsInChromAgpOnly.txt
    # Get FASTA file headers for just this set of contigs. These are the
    # ones with fragments that are named XXNNNN_NN e.g.
    # BX511136.3_00285.0
    grep -f contigsInChromAgpOnly.txt contigs.headers \
        > contigsInChromAgpOnly.headers
    /cluster/home/hartera/bin/x86_64/agpAddCtgNamesAndGaps \
        contigsInChromAgpOnly.headers Zv7_chr.agp \
        chrFixed.agp >& agpChroms.log
    # check if all the contig/clone names in the AGP file have now been
    # found in the FASTA file.
    sort contigs.names | uniq > contigs.names.sort
    # get contig/clone names from the fixed AGP file
    awk '{if ($5 !~ /N/) print $6}' chrFixed.agp | sort | uniq \
        > contigsFromChrFixedAgp.sort
    # get the list of names in the FASTA contig headers
    cp ./randoms/contigs.names.sort .
    wc -l contigs*.sort
    # 60092 contigs.names.sort
    # 39659 contigsFromChrFixedAgp.sort
    comm -13 contigs.names.sort contigsFromChrFixedAgp.sort \
        > notFoundInChrFixedAgp.txt
    wc -l notFoundInChrFixedAgp.txt
    # 334 notFoundInChrFixedAgp.txt
    echo BX005112.16 > list
    nice faSomeRecords Zv7_contigs.fa list BX005112.16_02613.fa
    faSize BX005112.16_02613.fa
    # 171673 bases (0 N's 171673 real 171673 upper 0 lower) in 1 sequences
    # in 1 files
    grep BX005112.16 Zv7_chr.agp
    # chr21  31909678  32080531  1296  D  BX005112.16  820     171673  +
    # chr21  32080632  32084318  1298  D  BX005112.16  171774  175460  +
    grep BX005112.16 chrFixed.agp
    # chr21  1104388161  1104559014  1296  D  BX005112.16        820  171673  +
    # chr21  1104559115  1104562801  1298  D  BX005112.16_02934  171774  175460  +
    grep BX005112.16 Zv7_all_scaffolds.agp
    # Zv7_scaffold2077  678529  849382  54  U  BX005112.16_02613
    #   820  171673  +  zK85G15.02613  170854  171673
    # Zv7_scaffold2077  849483  853169  56  U  BX005112.16_02934
    #   1    3687    +  zK85G15.02934  3687    3687
    grep BX005112.16 contigs.headers
    # BX005112.16_02613  zK85G15.02613  BX005112  1       171673
    # BX005112.16_02934  zK85G15.02934  BX005112  171774  175460
    echo BX005112.16 > list2
    nice faSomeRecords Zv7_scaffolds.fa list2 BX005112.fa
    # not found, these accessions are not in the FASTA file.
    # In order to create the chroms, we would need to extract the relevant
    # coords from the accessions to create the FASTA file.
    # Now basing the assembly on scaffolds instead of contigs, so create
    # AGP files for scaffolds.

##########################################################################
# CREATE A SCAFFOLDS AGP WITH SCAFFOLDS ONLY FOR RANDOMS
# (2007-08-22 and 2007-08-24, hartera)
    # Make an AGP using just the unmapped scaffolds, without making virtual
    # chromosomes for unmapped scaffolds, so that this is the same as the
    # way Ensembl handles these. Therefore there will be 25 chromosomes,
    # chrM, plus 5010 unmapped scaffolds.
    ssh kkstore06
    # make agps and fasta directories
    mkdir -p /cluster/data/danRer5/Zv7release/agps
    mkdir -p /cluster/data/danRer5/Zv7release/fasta
    # move assembly FASTA files to this directory
    cd /cluster/data/danRer5/Zv7release
    mv Zv7_*.fa ./fasta/
    cd /cluster/data/danRer5/Zv7release/randoms
    # get list of scaffolds for randoms - one for Un_random and one for
    # NA_random scaffolds.
    awk '{if ($1 ~ /Zv7_scaffold/) print $1;}' randomsScaffold.agp \
        | sort -k 1.7,1n | uniq > Un_randomScafs.list
    awk '{if ($1 ~ /Zv7_NA/) print $1;}' randomsScaffold.agp \
        | sort -k 1.7,1n | uniq > NA_randomScafs.list
    wc -l *randomScafs.list
    # 4844 NA_randomScafs.list
    #  166 Un_randomScafs.list
    # 5010 total
    # get sequences for just these scaffolds
    foreach f (NA Un)
        faSomeRecords ../fasta/Zv7_scaffolds.fa ${f}_randomScafs.list \
            Zv7${f}_randomScafs.fa
    end
    # check that they are all there
    foreach f (NA Un)
        grep '>' Zv7${f}_randomScafs.fa > ${f}_Random.headers
    end
    wc -l *Random.headers
    # 4844 NA_Random.headers
    #  166 Un_Random.headers
    # 5010 total
    perl -pi.bak -e 's/>//' *Random.headers
    foreach f (NA Un)
        sort -k 1.7,1n ${f}_Random.headers | uniq > ${f}_Random.headers.sort
    end
    comm -12 NA_Random.headers.sort NA_randomScafs.list | wc -l
    # 4844
    comm -12 Un_Random.headers.sort Un_randomScafs.list | wc -l
    # 166
    # Total is 4844 + 166 = 5010
    # so all the sequences in the scaffolds lists are in the FASTA files.
    # Make an AGP from these scaffolds FASTA sequences with 50000 Ns
    # inserted between scaffolds.
    foreach c (NA_random Un_random)
        scaffoldFaToAgp -scaffoldGapSize=0 Zv7${c}Scafs.fa
    end
    # scaffold gap size is 0, total scaffolds: 4844
    # chrom size is 117689868
    # writing Zv7NA_randomScafs.agp
    # writing Zv7NA_randomScafs.gap
    # writing Zv7NA_randomScafs.lft
    # scaffold gap size is 0, total scaffolds: 166
    # chrom size is 45800611
    # writing Zv7Un_randomScafs.agp
    # writing Zv7Un_randomScafs.gap
    # writing Zv7Un_randomScafs.lft
    # Create an AGP with just the scaffolds
    # sort NA by scaffold number:
    # first remove gap lines:
    grep -w -v "N" Zv7NA_randomScafs.agp > Zv7NA_randomScafsNoGaps.agp
    grep -w -v "N" Zv7Un_randomScafs.agp > Zv7Un_randomScafsNoGaps.agp
    sort -k 6.7n -k 6.8n -k 6.9,6.10n -k 6.10,6.11n \
        Zv7NA_randomScafsNoGaps.agp > Zv7NA_randomScafsSorted.agp
    foreach f (Zv7Un_randomScafsNoGaps.agp Zv7NA_randomScafsSorted.agp)
        set g = $f:r
        echo $g
        awk 'BEGIN {OFS="\t"} {print $6, $7, $8, "1", "W", $6, $7, $8, "+"}' \
            $f > ${g}2.agp
    end
    wc -l Zv7*_randomScafs*agp
    # 9687 Zv7NA_randomScafs.agp
    # 4844 Zv7NA_randomScafsNoGaps.agp
    # 4844 Zv7NA_randomScafsSorted.agp
    # 4844 Zv7NA_randomScafsSorted2.agp
    #  331 Zv7Un_randomScafs.agp
    #  166 Zv7Un_randomScafsNoGaps.agp
    #  166 Zv7Un_randomScafsNoGaps2.agp
    # 4844 + 166 = 5010 -> total unmapped scaffolds
    cat Zv7NA_randomScafsSorted2.agp Zv7Un_randomScafsNoGaps2.agp \
        > Zv7AllRandomScafs.agp
    wc -l Zv7AllRandomScafs.agp
    # 5010 Zv7AllRandomScafs.agp
    # move the scaffolds AGP to the agps directory:
    mv Zv7AllRandomScafs.agp /cluster/data/danRer5/Zv7release/agps/
    # move sequences to the FASTA directory
    mv Zv7*_randomScafs.fa \
        /cluster/data/danRer5/Zv7release/fasta/

#######################################################################
# PROCESS CHROMOSOME AGP AND SCAFFOLDS AGP TO CREATE A SCAFFOLDS TO
# CHROMOSOMES AGP FILE (DONE, 2007-08-24, hartera)
    # The Zv7_chr.agp file contains a mapping of contigs to chromosomes
    # and the Zv7_scaffold.agp file contains a mapping of contigs to
    # scaffolds. To build the Browser and assemble chromosomes, we need an
    # AGP file that maps scaffolds to chromosomes.
    # A program, createScaffoldsAgp.c, was written to create this AGP file,
    # in kent/src/hg/oneShot/createScaffoldsAgp/. A sketch of the idea
    # follows below.
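    # The idea behind createScaffoldsAgp, sketched in perl (a hypothetical
    # sketch, not the real C program: the real program also emits
    # scaffold-relative coordinates, strand and the intervening gap rows,
    # all of which are glossed over here):
    cat << '_EOF_' > scafToChromSketch.pl
#!/usr/bin/perl -w
# Sketch: map each chromosome component back to its scaffold via the
# contig name (from the contig-to-scaffold AGP), then merge runs of
# consecutive chromosome components that belong to the same scaffold
# into single scaffold-level rows.
use strict;

my ($scafAgp, $chrAgp) = @ARGV;
my %scafOf;   # contig name -> scaffold name

open(SCAF, $scafAgp) || die "Can not open $scafAgp: $!\n";
while (<SCAF>) {
    my @f = split(/\t/);
    $scafOf{$f[5]} = $f[0] if ($f[4] ne "N");
}
close SCAF;

my ($chrom, $scaf, $start, $end, $part) = ("", "", 0, 0, 0);
open(CHR, $chrAgp) || die "Can not open $chrAgp: $!\n";
while (<CHR>) {
    my @f = split(/\t/);
    next if ($f[4] eq "N");          # gap rows glossed over in this sketch
    my $s = $scafOf{$f[5]};
    next if (!defined($s));
    if ($f[0] eq $chrom && $s eq $scaf) {
        $end = $f[2];                # extend the current scaffold run
    } else {
        print join("\t", $chrom, $start, $end, ++$part, "W", $scaf), "\n"
            if ($scaf ne "");
        ($chrom, $scaf, $start, $end) = ($f[0], $s, $f[1], $f[2]);
    }
}
print join("\t", $chrom, $start, $end, ++$part, "W", $scaf), "\n"
    if ($scaf ne "");
close CHR;
'_EOF_'
    # << happy emacs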
    ssh kkstore06
    cd /cluster/data/danRer5/Zv7release/agps
    createScaffoldsAgp ../Zv7_scaffold.agp ../Zv7_chr.agp \
        Zv7ScafToChrom.agp >& createAgp.log
    wc -l *.agp
    # 5010 Zv7AllRandomScafs.agp
    # 4943 Zv7ScafToChrom.agp
    # 9953 total

###########################################################################
# CHECK AGP FILES AND FASTA CONTENTS AND SIZE CONSISTENCY
# (DONE, 2007-08-22, hartera)
#
    # Check that all the scaffolds in the AGPs are in the FASTA file and
    # vice versa.
    # get list of scaffolds in the AGP files: Zv7ScafToChrom.agp,
    # chrNA_random.scaffolds.agp and chrUn_random.scaffolds.agp
    ssh kkstore06
    cd /cluster/data/danRer5/Zv7release/assembly/agps
    foreach f (*.agp)
        awk '{if ($5 !~ /N/) print $6}' $f >> allScafsFromAgps.txt
    end
    sort allScafsFromAgps.txt | uniq > allScafsFromAgps.sort
    grep '>' ../fasta/Zv7_scaffolds.fa | sed -e 's/>//' | sort | uniq \
        > scafsHeaders.sort
    wc -l *.sort
    # 7494 allScafsFromAgps.sort
    # 7494 scafsHeaders.sort
    comm -12 allScafsFromAgps.sort scafsHeaders.sort | wc -l
    # 7494
    # 7494 are common to both files so the AGP files contain all the
    # scaffolds in the FASTA file and vice versa.
    # create one AGP file for the assembly
    cat Zv7ScafToChrom.agp Zv7AllRandomScafs.agp > Zv7ScafToChromAndRandom.agp
    cd /cluster/data/danRer5/Zv7release/fasta
    # make a scaffolds directory to work in
    mkdir scaffolds
    cd scaffolds
    faSize -detailed ../Zv7_scaffolds.fa > Zv7.scaffolds.sizes
    # Check that these sizes correspond to the sizes in the scaffolds AGP
    # file using the script compareSizes2.pl
    cat << '_EOF_' > compareSizes2.pl
#!/usr/bin/perl -w
use strict;

my ($file, $agp);
$file = $ARGV[0];
$agp = $ARGV[1];

open(FILE, $file) || die "Can not open $file: $!\n";
open(AGP, $agp) || die "Can not open $agp: $!\n";
open(OUT, ">log.txt") || die "Can not create log.txt: $!\n";

my ($l, @f, $name, $size, %scafsHash);
while (<FILE>) {
   $l = $_;
   @f = split(/\t/, $l);
   $name = $f[0];
   $size = $f[1];
   $scafsHash{$name} = $size;
}
close FILE;
while (<AGP>) {
   my ($line, @fi, $scaf, $end);
   $line = $_;
   if ($line =~ /Zv/) {
      @fi = split(/\t/, $line);
      $scaf = $fi[5];
      $end = $fi[7];
      if (exists($scafsHash{$scaf})) {
         if ($scafsHash{$scaf} == $end) {
            print OUT "$scaf - ok\n";
         }
         else {
            print OUT "$scaf - different size to sequence\n";
         }
      }
      else {
         print OUT "$scaf - does not exist in list of sizes\n";
      }
   }
}
close AGP;
close OUT;
'_EOF_'
    # << happy emacs
    chmod +x compareSizes2.pl
    perl compareSizes2.pl Zv7.scaffolds.sizes \
        ../../agps/Zv7ScafToChromAndRandom.agp
    grep different log.txt
    grep not log.txt
    # these are all consistent with the sequence sizes
    # check that the coordinates in the AGP files are consistent:
    # field 2 is the start position, field 3 is the end and field 8 is the
    # size, so check that this is consistent.
    cd /cluster/data/danRer5
    awk '{if ($5 !~ /N/ && (($3-$2+1) != $8)) print $6;}' \
        ../../agps/Zv7AllRandomScafs.agp > Zv7.scaffolds.coordCheck
    # this file is empty so they are ok. do the same for the contigs to
    # chromosomes AGP file
    awk '{if ($5 !~ /N/ && (($3-$2+1) != ($8-$7+1))) print $6;}' \
        ../../Zv7_chr.agp > Zv7.contigsToChroms.coordCheck
    # this file is empty so ok
    # in order to create the scaffolds to chroms AGP file with
    # createScaffoldsAgp, the coordinates were checked between the
    # Zv7_scaffold.agp and Zv7_chr.agp files.
    cd ..
    rm -r scaffolds

#######################################################################
# MAKE GENOME DB FROM CHROMOSOMES AND UNMAPPED SCAFFOLDS
# (DONE, 2007-08-24, hartera)
# CHANGE dbDb ENTRY MADE BY SCRIPT TO DISPLAY SAME DEFAULT POSITION
# AS FOR DANRER4, GENE vdac2 (DONE, 2007-09-10, hartera)
# CHANGE dbDb ENTRY SO THAT THE FULL NAME OF THE SANGER INSTITUTE IS GIVEN
# IN THE SOURCENAME FIELD (DONE, 2007-09-22, hartera)
# CHANGED FRAG IN GOLD TABLE FROM GI NUMBER TO ACCESSION FOR chrM
# (DONE, 2007-10-01, hartera)
    # Since there will be a mixture of chroms and unmapped scaffolds, the
    # chroms must be built first before inputting the assembly sequence to
    # makeGenomeDb.pl
    ssh kkstore06
    cd /cluster/data/danRer5/Zv7release/
    cd fasta
    agpToFa -simpleMultiMixed ../agps/Zv7ScafToChrom.agp all \
        Zv7_chromosomes.fa ./Zv7_scaffolds.fa
    # check the chroms are all there
    grep '>' Zv7_chromosomes.fa | sed -e 's/>//' > headers
    # all chroms 1-25 are there
    rm headers
    # check AGP and FASTA
    checkAgpAndFa ../agps/Zv7ScafToChrom.agp Zv7_chromosomes.fa
    # All AGP and FASTA entries agree - both files are valid
    # cat together the chromosomes FASTA and the scaffolds FASTA:
    cat Zv7_chromosomes.fa Zv7NA_randomScafs.fa Zv7Un_randomScafs.fa \
        > Zv7ChromsAndRandomScafs.fa
    # To find the gi number for chrM, go to http://www.ncbi.nih.gov/ and
    # search Nucleotide for "Danio mitochondrion genome".
    # That shows gi number: 15079186, accession: NC_002333.2, 16596 bp
    # also GI:8576324 and accession: AC024175
    cd /cluster/data/danRer5/
    cat << 'EOF' > danRer5.config.ra
db danRer5
clade vertebrate
scientificName Danio rerio
commonName Zebrafish
assemblyDate Jul. 2007
assemblyLabel Sanger Centre, Danio rerio Sequencing Project Zv7
orderKey 449
dbDbSpeciesDir zebrafish
# NC_002333.2
mitoAcc 15079186
agpFiles /cluster/data/danRer5/Zv7release/agps/Zv7ScafToChromAndRandom.agp
fastaFiles /cluster/data/danRer5/Zv7release/fasta/Zv7ChromsAndRandomScafs.fa
'EOF'
    # << keep emacs coloring happy
    # corrected the wget for the mitochondrion genome to:
    # http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nuccore&rettype=fasta&id=15079186&retmode=text
    # (see the sketch at the end of this section)
    # remove directories created
    rm -r TemporaryTrackDbCheckout bed html jkStuff
    rm dbDbInsert.sql
    ssh hgwdev
    cd /cluster/data/danRer5/
    hgsql -e 'delete from dbDb where name="danRer5";' hgcentraltest
    ssh kkstore06
    cd /cluster/data/danRer5
    # Run with -debug to see what it would do / check for issues:
    makeGenomeDb.pl danRer5.config.ra -debug -verbose=3
    makeGenomeDb.pl danRer5.config.ra >& makeGenomeDb.log &
    # Took about 10 minutes.
    tail -f makeGenomeDb.log
    # Follow the directions at the end of the log after
    # NOTES -- STUFF THAT YOU WILL HAVE TO DO --
    # Search for '***' notes in each file and make corrections (sometimes
    # the files used for a previous assembly might make a better template):
    #     description.html
    #     /cluster/data/danRer5/html/{trackDb.ra,gap.html,gold.html}
    # Then cd ../.. (to trackDb/) and
    #  - edit makefile to add danRer5 to DBS.
    #  - (if necessary) cvs add zebrafish
    #  - cvs add zebrafish/danRer5
    #  - cvs add zebrafish/danRer5/*.{ra,html}
    #  - cvs ci -m "Added danRer5 to DBS." makefile
    #  - cvs ci -m "Initial descriptions for danRer5." zebrafish/danRer5
    #  - (if necessary) cvs ci zebrafish
    #  - Run make update DBS=danRer5 and make alpha when done.
    #  - (optional) Clean up /cluster/data/danRer5/TemporaryTrackDbCheckout
    #  - cvsup your ~/kent/src/hg/makeDb/trackDb and make future edits there.
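    # For the record, fetching the chrM sequence by hand with the corrected
    # efetch URL noted above would look something like this (a sketch only;
    # the URL must be quoted so the shell does not interpret the '&'
    # characters):
    wget -O chrM.fa \
      'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nuccore&rettype=fasta&id=15079186&retmode=text'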
    # makeGenomeDb.pl creates the db for the genome, makes the hgcentraltest
    # entries and loads the gold (scaffolds) and gap tables. It creates the
    # GC percent, short match and restriction enzyme tracks. It also creates
    # a danRer5 directory in the trackDb/zebrafish directory with a
    # trackDb.ra file and templates for the assembly description and for the
    # gold and gap tracks.
    # Then need to load the fragment gaps manually into the gap table.
    # (2007-09-10, hartera) Change the default Browser position in dbDb
    # so that it displays vdac2 - chr13:14,786,820-14,801,993 - same as for
    # danRer4.
    ssh hgwdev
    hgsql -h genome-testdb -e \
        'update dbDb set defaultPos = "chr13:14,786,820-14,801,993" \
         where name = "danRer5";' hgcentraltest
    # (2007-09-22, hartera) Change the source name for danRer5 to give
    # Sanger's full name and use a similar format to other genome assembly
    # entries in dbDb.
    hgsql -h genome-testdb -e \
        'update dbDb set sourceName = "Wellcome Trust Sanger Institute, Zv7 assembly" where name = "danRer5";' \
        hgcentraltest
    # (2007-10-01, hartera) Added the accession instead of the gi number
    # for the frag field for chrM in the gold table. The gi number is
    # gi|15079186 and the accession is NC_002333.2. This is the same as
    # AC024175.3 which was used previously.
    ssh hgwdev
    hgsql -e \
        'update gold set frag = "NC_002333.2" where chrom = "chrM";' danRer5

##########################################################################
# REBUILD GAP TABLE WITH FRAGMENT GAPS AS WELL AS CONTIG GAPS
# (DONE, 2007-08-24, hartera)
    # It is confusing not to display the fragment gaps in the gap track -
    # these are gaps within contigs.
    ssh kkstore06
    cd /cluster/data/danRer5/Zv7release/
    # need to use the contigs to chroms AGP
    # also copy over the contigs to scaffolds AGP for the unmapped scaffolds
    cp /cluster/data/danRer5/Zv7release/randoms/randomsScaffold.agp \
        /cluster/data/danRer5/Zv7release/agps/
    mkdir /cluster/data/danRer5/gap
    awk '{if ($5 ~ /N/) print;}' Zv7_chr.agp \
        > /cluster/data/danRer5/gap/chroms.gap
    # 34449 gaps
    awk '{if ($5 ~ /N/) print;}' ./agps/randomsScaffold.agp \
        > /cluster/data/danRer5/gap/randomScafs.gap
    # 15278 gaps
    cat chroms.gap randomScafs.gap > danRer5.gap
    wc -l /cluster/data/danRer5/gap/*.gap
    # 34449 /cluster/data/danRer5/gap/chroms.gap
    # 15278 /cluster/data/danRer5/gap/randomScafs.gap
    # 49727 /cluster/data/danRer5/gap/danRer5.gap
    rm chroms.gap randomScafs.gap
    # 2459 is the current count in the gap table.
    ssh hgwdev
    cd /cluster/data/danRer5/
    hgLoadGap -unsplit danRer5 /cluster/data/danRer5/gap/danRer5.gap
    hgsql -e 'select count(*) from gap;' danRer5
    # 49727
    # So all the gaps were loaded.

###########################################################################
# REPEATMASKER RUN (DONE, 2007-08-25 - 2007-08-28, hartera)
# RE-RUN REPEATMASKER (DONE, 2007-08-30 - 2007-09-05, hartera)
# The sequence was under-RepeatMasked because the new zebrafish repeats
# library with the unclassified repeats must be distributed to all the
# nodes for pk.
# NESTED REPEATS TRACK RE-CREATED AFTER FIX TO extractNestedRepeats.pl
# SCRIPT (DONE, 2007-09-19, angie)
    # RepeatMasker,v 1.19 2007/05/23 (May 17 2007 (open-3-1-8) version of
    # RepeatMasker) with RepeatMasker library RELEASE 20061006.
    # Added zebunc.ref (Zebrafish Unclassified) repeats from RepBase12.07
    # from the Genetic Information Research Institute (GIRI,
    # http://www.girinst.org/server/RepBase/index.php). Some repeats were
    # removed from zebunc.ref because they mask out real genes (advised by
    # the Zebrafish bioinformatics group at Sanger) - more details below.
    # Download the zebunc.ref unclassified repeats file for zebrafish
    # from RepBase:
    ssh kkstore06
    cd /cluster/data/danRer5/
    gunzip repeatmaskerlibraries-20061006.tar.gz
    # no unclassified zebrafish repeats here
    # Download zebunc.ref from
    # http://www.girinst.org/server/RepBase/RepBase12.07.fasta/zebunc.ref
    # last updated 21-Aug-2007 16:21 451K
    # copy the one used for danRer4 and see if it is any different
    cp /cluster/bluearc/RepeatMasker060320/Libraries/zebunc.ref.txt \
        zebuncref2006.txt
    diff zebunc.ref zebuncref2006.txt
    # no difference, so check the format is still correct for the current
    # version of RepeatMasker on pk: /scratch/data/RepeatMasker/Libraries/
    # format still looks the same.
    ssh pk
    mkdir /tmp/danRer5
    cd /tmp/danRer5
    faOneRecord /cluster/data/danRer5/Zv7release/fasta/Zv7_contigs.fa \
        Zv7_scaffold1.1 > Zv7scaffold1.1.fa
    # Run RepeatMasker on this to get it to create the danio library
    /scratch/data/RepeatMasker/RepeatMasker -ali -s -species danio \
        Zv7scaffold1.1.fa
    cp /cluster/bluearc/RepeatMasker060320/Libraries/zebunc.ref.format \
        /scratch/data/RepeatMasker/Libraries/
    # Add this to the specieslib created for danio
    cd /scratch/data/RepeatMasker/Libraries/
    cat zebunc.ref.format \
        >> /scratch/data/RepeatMasker/Libraries/20061006/danio/specieslib
    grep '>' specieslib | wc -l
    # 1184
    # then run the RepeatMasker script
    # Need to re-do this and add the extra zebrafish repeats to the danio
    # libraries on /cluster/bluearc/scratch/data/RepeatMasker/
    # and get it rsynced to all the parasol hubs, nodes, workhorse machines
    # and fileservers.
    # An e-mail from Kerstin Howe about zebrafish repeats on Aug 31, 2007
    # states that some of these unknown repeats were removed because they
    # align to real genes. Save this e-mail as README.Repeats
    # (2007-09-03, hartera)
    ssh kkstore06
    cd /cluster/data/danRer5/
    # get list of zebunc.ref sequence names
    grep '>' zebunc.ref | sed -e 's/>//' > zebunc.ref.names
    # make a list of the repeats that Sanger removed: unknownRepsRemoved.txt
    # Sanger removed some of these repeats because they masked out real
    # genes. Get the list and remove them from the list of repeat names.
    grep -w -v -f unknownRepsRemoved.txt zebunc.ref.names \
        > zebuncFixed.ref.names
    wc -l *.names
    # 958 zebunc.ref.names
    # 943 zebuncFixed.ref.names
    # select just those remaining names from the zebunc.ref FASTA file
    faSomeRecords zebunc.ref zebuncFixed.ref.names zebuncFixed.ref.fa
    grep '>' zebuncFixed.ref.fa | wc -l
    # 943
    # then add these repeats to the library for zebrafish on bluearc
    ssh hgwdev
    # do a dummy run first to create the specieslib for zebrafish
    /cluster/bluearc/scratch/data/RepeatMasker/RepeatMasker \
        -species 'danio' /dev/null
    # then cat the new library to the specieslib for danio
    cat /cluster/data/danRer5/zebuncFixed.ref.fa >> \
      /cluster/bluearc/scratch/data/RepeatMasker/Libraries/20061006/danio/specieslib
    cd /cluster/bluearc/scratch/data/RepeatMasker/Libraries/20061006/danio
    grep '>' specieslib | grep "Dr" | wc -l
    # 943
    # (2007-09-04, hartera)
    # also copy to /cluster/bluearc/RepeatMasker
    # do a dummy run first to create the specieslib for zebrafish
    /cluster/bluearc/RepeatMasker/RepeatMasker -species 'danio' /dev/null
    cp -p \
      /cluster/bluearc/scratch/data/RepeatMasker/Libraries/20061006/danio/specieslib \
      /cluster/bluearc/RepeatMasker/Libraries/20061006/danio/
    # ask cluster-admins for an rsync from
    # /cluster/bluearc/scratch/data/RepeatMasker/
    # to /scratch/data/RepeatMasker/ on all machines (parasol hubs, nodes,
    # fileservers, workhorse machines)
    ssh hgwdev
    # cleanup
    rm /gbdb/danRer5/danRer5.2bit
    rm -r /san/sanvol1/scratch/danRer5/RMPart
    rm /san/sanvol1/scratch/danRer5/danRer5.2bit
    rm /san/sanvol1/scratch/danRer5/*.ooc
    hgsql -e 'drop table rmsk;' danRer5
    ssh kkstore06
    # clean up
    cd /cluster/data/danRer5
    rm danRer5.rmsk.2bit danRer5.rmskTrf.2bit danRer5.2bit
    mv RepeatMasker.2007-08-25/ OldRepeatMask
    ## use screen for this
    time nice doRepeatMasker.pl -bigClusterHub=pk -species danio \
        danRer5 >& repeatmask.log &
    tail -f repeatmask.log
    # 0.338u 0.267s 20:33:39.53 0.0% 0+0k 0+0io 5pf+0w
    # Checked part of danRer5.rmsk.2bit with twoBitToFa and it looks masked.
    ## From the previous run of RepeatMasker; this is fixed now:
    # Output is in
    # /scratch/tmp/doRepeatMasker.cluster.s15477/chr3:16000000-16500000.fa.out
    # Checked the danRer4 RM run files and some of them have "*" at the end
    # of a line e.g. /cluster/data/danRer4/1/chr1_11/chr1_11_10.fa.out
    # Also, the IDs in the current run do not run consecutively
    # e.g. 1,2,3,4,5,6,2,7,8 etc.
    # E-mailed Angie to ask for advice
    # (2007-08-27, hartera)
    # liftUp is fixed so it will skip this repeat in the liftUp step when
    # it finds a "*" but no ID.
    # After the repeat DNA11TA1_DR with ID 821, there is a line with a
    # repeat but no ID, just a "*" at the end, which confuses liftUp.
    # Angie sent this problem as an example to Robert Hubley to fix the bug.
    # Here is the offending line:
    # 294 9.8 0.0 0.0 chr3:16000000-16500000 372065 372105 (127895) +
    #   DNA11TA1_DR DNA/Tc1 164 204 (0) *
    # Angie fixed liftUp to just warn when it finds a "*" instead of an ID.
    # NOTE: the Nested Repeats track is created automatically by the
    # doRepeatMasker.pl script. When the script was run to create the
    # RepeatMasker track, it was found that extractNestedRepeats.pl
    # needed fixing, which was done by angie.
    # The Nested Repeats track was then re-created (angie, 2007-09-19)
    cd /cluster/data/danRer5/bed/RepeatMasker.2007-09-04
    extractNestedRepeats.pl danRer5.fa.out > danRer5.nestedRepeats.bed
    hgLoadBed danRer5 nestedRepeats danRer5.nestedRepeats.bed \
        -sqlTable=\$HOME/kent/src/hg/lib/nestedRepeats.sql

###########################################################################
# SIMPLE REPEAT (TRF) TRACK (DONE, 2007-08-25 - 2007-08-27, hartera)
    # TRF can be run in parallel with RepeatMasker on the file server
    # since it doesn't require masked input sequence.
    ssh kolossus
    ## use screen for this
    mkdir /cluster/data/danRer5/bed/simpleRepeat
    cd /cluster/data/danRer5/bed/simpleRepeat
    time nice twoBitToFa ../../danRer5.unmasked.2bit stdout \
        | trfBig -trf=/cluster/bin/i386/trf stdin /dev/null \
          -bedAt=simpleRepeat.bed -tempDir=/scratch/tmp \
        >& trf.log &
    # Took about 8 hours to run
    tail -f trf.log
    # Crashed on kolossus so re-run as before on kki:
    # Broken pipe    twoBitToFa ../../danRer5.unmasked.2bit stdout |
    # 12.260u 4.233s 8:05:02.78 0.0% 0+0k 0+0io 0pf+0w
    # Segmentation fault    trfBig -trf=/cluster/bin/i386/trf stdin
    #   /dev/null -bedAt=simpleRepeat.bed ...
    # This time split the sequence up and run on kki
    ssh kkstore06
    cd /cluster/data/danRer5/
    mkdir splitNoMask
    cd splitNoMask
    twoBitToFa ../danRer5.unmasked.2bit danRer5.unmasked.fa
    faSplit byname danRer5.unmasked.fa .
    rm danRer5.unmasked.fa
    # Copy the split sequences to the iscratch dir on kkr1u00
    ssh kkr1u00
    rm -r /iscratch/i/danRer5/splitNoMask
    mkdir -p /iscratch/i/danRer5/splitNoMask
    foreach s (/cluster/data/danRer5/splitNoMask/*.fa)
        echo "Copying $s ..."
        cp $s /iscratch/i/danRer5/splitNoMask/
    end
    # Count files transferred
    foreach f (/iscratch/i/danRer5/splitNoMask/*.fa)
        ls $f >> seq.lst
    end
    wc -l seq.lst
    # 5036 seq.lst
    # correct: 25 chroms, 1 chrM, and 5010 unmapped scaffolds.
    rm seq.lst
    # rsync to cluster machines
    foreach R (2 3 4 5 6 7 8)
        rsync -a --progress /iscratch/i/danRer5/ kkr${R}u00:/iscratch/i/danRer5/
    end
    ## use screen
    ssh kki
    mkdir -p /cluster/data/danRer5/bed/simpleRepeat
    cd /cluster/data/danRer5/bed/simpleRepeat
    mkdir trf
    cat << '_EOF_' > runTrf
#!/bin/csh -fe
#
set path1 = $1
set inputFN = $1:t
set outpath = $2
set outputFN = $2:t
mkdir -p /tmp/$outputFN
cp $path1 /tmp/$outputFN
pushd .
cd /tmp/$outputFN
/cluster/bin/i386/trfBig -trf=/cluster/bin/i386/trf $inputFN /dev/null -bedAt=$outputFN -tempDir=/tmp
popd
rm -f $outpath
cp -p /tmp/$outputFN/$outputFN $outpath
rm -fr /tmp/$outputFN/*
rmdir --ignore-fail-on-non-empty /tmp/$outputFN
'_EOF_'
    # << keep emacs coloring happy
    chmod +x runTrf
    cat << '_EOF_' > gsub
#LOOP
./runTrf {check in line+ $(path1)} {check out line trf/$(root1).bed}
#ENDLOOP
'_EOF_'
    # << keep emacs coloring happy
    foreach f (/iscratch/i/danRer5/splitNoMask/*.fa)
        ls -1S $f >> genome.lst
    end
    gensub2 genome.lst single gsub jobList
    /parasol/bin/para create jobList
    # 5036 jobs written to batch
    /parasol/bin/para try, check, push, check etc...
    /parasol/bin/para time
    # still crashing on one sequence:
    # sh: line 1: 23369 Segmentation fault /cluster/bin/i386/trf
    #   /tmp/Zv7_scaffold2487.tf 2 7 7 80 10 50 2000 -m -d
    # can't open /tmp/Zv7_scaffold2487.tf.2.7.7.80.10.50.2000.mask
    # Completed: 5035 of 5036 jobs
    # Crashed: 1 jobs
    # CPU time in finished jobs:   31429s   523.82m   8.73h   0.36d  0.001 y
    # IO & Wait Time:              12949s   215.81m   3.60h   0.15d  0.000 y
    # Average job time:                9s     0.15m   0.00h   0.00d
    # Longest running job:             0s     0.00m   0.00h   0.00d
    # Longest finished job:         3818s    63.63m   1.06h   0.04d
    # Submission to last job:       5013s    83.55m   1.39h   0.06d
    # test this sequence
    ssh kkstore06
    cd /cluster/data/danRer5/bed/simpleRepeat
    mkdir noMaskSplit
    mkdir run2
    cd noMaskSplit
    faSplit -minGapSize=100 -lift=scaf2487.lft gap \
        /cluster/data/danRer5/splitNoMask/Zv7_scaffold2487.fa \
        10000 scaf2487_
    ssh kkr1u00
    mkdir /iscratch/i/danRer5/splitScaf2487/
    cp /cluster/data/danRer5/bed/simpleRepeat/noMaskSplit/*.fa \
        /iscratch/i/danRer5/splitScaf2487/
    ls /iscratch/i/danRer5/splitScaf2487/*.fa | wc -l
    # 250
    # rsync to all iServers
    foreach R (2 3 4 5 6 7 8)
        rsync -a --progress /iscratch/i/danRer5/ kkr${R}u00:/iscratch/i/danRer5/
    end
    ssh kki
    cd /cluster/data/danRer5/bed/simpleRepeat/run2
    cp ../runTrf .
    cp ../gsub .
    mkdir trf
    ls -1S /iscratch/i/danRer5/splitScaf2487/*.fa > genome.lst
    gensub2 genome.lst single gsub jobList
    /parasol/bin/para create jobList
    # 250 jobs written to batch
    /parasol/bin/para try, check, push, check etc...
    /parasol/bin/para time
    # Completed: 249 of 250 jobs
    # Crashed: 1 jobs
    # CPU time in finished jobs:      73s    1.22m   0.02h   0.00d  0.000 y
    # IO & Wait Time:                622s   10.36m   0.17h   0.01d  0.000 y
    # Average job time:                3s    0.05m   0.00h   0.00d
    # Longest running job:             0s    0.00m   0.00h   0.00d
    # Longest finished job:           23s    0.38m   0.01h   0.00d
    # Submission to last job:         63s    1.05m   0.02h   0.00d
    /parasol/bin/para problems >& problems
    # trf/scaf2487_082.bed does not exist
    # stderr:
    # sh: line 1: 7469 Segmentation fault /cluster/bin/i386/trf
    #   /tmp/scaf2487_082.tf 2 7 7 80 10 50 2000 -m -d
    # can't open /tmp/scaf2487_082.tf.2.7.7.80.10.50.2000.mask
    # The failed sequence looks fine - just A,C,G,T or N.
    # Downloaded the latest version of trf (v4.0, previous was v3.21)
    # for both 64 bit and i386. Tried the 64 bit version on scaf2487_082.fa
    # and it works fine. So now run on kolossus.
    mv /cluster/data/danRer5/bed/simpleRepeat \
        /cluster/data/danRer5/bed/OldsimpleRepeat
    # run on the pk cluster instead - too slow on kolossus using the 2bit
    # sequence file.
    ssh pk
    mkdir -p /san/sanvol1/scratch/danRer5/splitNoMask
    # copy the split sequence files to the san - 1 for each chrom and 1
    # for each unmapped scaffold
    foreach f (/cluster/data/danRer5/splitNoMask/*.fa)
        cp $f /san/sanvol1/scratch/danRer5/splitNoMask
    end
    mkdir /cluster/data/danRer5/bed/simpleRepeat
    cd /cluster/data/danRer5/bed/simpleRepeat
    mkdir trf
    cat << '_EOF_' > runTrf
#!/bin/csh -fe
#
set path1 = $1
set inputFN = $1:t
set outpath = $2
set outputFN = $2:t
mkdir -p /tmp/$outputFN
cp $path1 /tmp/$outputFN
pushd .
cd /tmp/$outputFN
/cluster/bin/i386/trfBig -trf=/cluster/bin/x86_64/trf $inputFN /dev/null -bedAt=$outputFN -tempDir=/tmp
popd
rm -f $outpath
cp -p /tmp/$outputFN/$outputFN $outpath
rm -fr /tmp/$outputFN/*
rmdir --ignore-fail-on-non-empty /tmp/$outputFN
'_EOF_'
    # << keep emacs coloring happy
    chmod +x runTrf
    cat << '_EOF_' > gsub
#LOOP
./runTrf {check in line+ $(path1)} {check out line trf/$(root1).bed}
#ENDLOOP
'_EOF_'
    # << keep emacs coloring happy
    foreach f (/san/sanvol1/scratch/danRer5/splitNoMask/*.fa)
        ls -1S $f >> genome.lst
    end
    /parasol/bin/gensub2 genome.lst single gsub jobList
    /parasol/bin/para create jobList
    # 5036 jobs written to batch
    /parasol/bin/para try, check, push, check etc...
    /parasol/bin/para time
    # Completed: 5036 of 5036 jobs
    # CPU time in finished jobs:   45132s   752.20m  12.54h   0.52d  0.001 y
    # IO & Wait Time:              16660s   277.67m   4.63h   0.19d  0.001 y
    # Average job time:               12s     0.20m   0.00h   0.00d
    # Longest running job:             0s     0.00m   0.00h   0.00d
    # Longest finished job:         5804s    96.73m   1.61h   0.07d
    # Submission to last job:       6042s   100.70m   1.68h   0.07d
    # (2007-08-27, hartera)
    # the input to trf was the chromosomes and scaffolds so no liftUp needed.
    # cat all the files together
    cat ./trf/*.bed > simpleRepeat.bed
    # load table
    ssh hgwdev
    cd /cluster/data/danRer5/bed/simpleRepeat
    hgLoadBed danRer5 simpleRepeat simpleRepeat.bed \
        -sqlTable=$HOME/kent/src/hg/lib/simpleRepeat.sql
    # Loaded 966523 elements of size 16

###########################################################################
# PROCESS SIMPLE REPEATS INTO A MASK (DONE, 2007-08-28, hartera)
# RE-DONE AFTER RE-RUNNING REPEATMASKER (DONE, 2007-09-05, hartera)
    # The danRer5.rmsk.2bit is already masked with RepeatMasker output.
    # After the simpleRepeats track has been built, make a filtered
    # version of the trf output for masking: keep trfs with period <= 12.
    ssh kkstore06
    cd /cluster/data/danRer5/bed/simpleRepeat
    rm -r trfMask
    rm trfMask.bed
    mkdir trfMask
    foreach f (trf/*.bed)
        awk '{if ($5 <= 12) print;}' $f > trfMask/$f:t
    end
    cat ./trfMask/*.bed > trfMask.bed
    # don't need to lift up. these are already in chrom and unmapped
    # scaffold coordinates.
    ## Add trfMask to the repeat masked sequence
    ssh kkstore06
    cd /cluster/data/danRer5
    # cleanup old sequence:
    rm danRer5.rmskTrf.2bit danRer5.2bit
    cat << '_EOF_' > addTrf.csh
#!/bin/csh -efx
# This script will fail if any of its commands fail.
set DB = danRer5
set WORK_DIR = /cluster/data/${DB}
cd ${WORK_DIR}
set inputTwoBit = ${WORK_DIR}/${DB}.rmsk.2bit
set outputTwoBit = ${WORK_DIR}/${DB}.rmskTrf.2bit
cat /cluster/data/${DB}/bed/simpleRepeat/trfMask.bed \
    | twoBitMask -add -type=.bed ${inputTwoBit} stdin ${outputTwoBit}
twoBitToFa ${outputTwoBit} stdout | faSize stdin > faSize.${DB}.rmskTrf.txt
'_EOF_'
    # << happy emacs
    chmod +x ./addTrf.csh
    time ./addTrf.csh
    # Warning: BED file stdin has >=13 fields which means it might contain
    # block coordinates, but this program uses only the first three fields
    # (the entire span -- no support for blocks).
    # This is ok, there are no blocks, only the first 3 fields are
    # standard BED format.
    # 44.550u 6.030s 0:50.66 99.8% 0+0k 0+0io 0pf+0w
    cat faSize.danRer5.rmskTrf.txt
    # 1440582308 bases (4986636 N's 1435595672 real 724596899 upper
    # 710998773 lower) in 5036 sequences in 1 files
    # Total size: mean 286056.9 sd 3643515.2 min 2000 (Zv7_NA5415)
    # max 70371393 (chr5) median 7336
    # N count: mean 990.2 sd 9834.4
    # U count: mean 143883.4 sd 1862761.1
    # L count: mean 141183.2 sd 1772056.1
    # %49.35 masked total, %49.53 masked real
    # this is similar to the previous masking of the zebrafish assembly
    # sequence for danRer4: %44.10 masked total, %48.94 masked real
    ln -s danRer5.rmskTrf.2bit danRer5.2bit
    # add a symlink in /gbdb/danRer5 to the danRer5.2bit file
    ssh hgwdev
    rm /gbdb/danRer5/danRer5.2bit
    ln -s /cluster/data/danRer5/danRer5.rmskTrf.2bit \
        /gbdb/danRer5/danRer5.2bit
    # copy over to the san
    rm /san/sanvol1/scratch/danRer5/danRer5.2bit
    cp -p /cluster/data/danRer5/danRer5.rmskTrf.2bit \
        /san/sanvol1/scratch/danRer5/danRer5.2bit
    # also copy to iscratch
    ssh kkr1u00
    rm /iscratch/i/danRer5/danRer5.2bit
    cp -p /cluster/data/danRer5/danRer5.rmskTrf.2bit \
        /iscratch/i/danRer5/danRer5.2bit
    # rsync to iServers
    foreach R (2 3 4 5 6 7 8)
        rsync -a --progress /iscratch/i/danRer5/ \
            kkr${R}u00:/iscratch/i/danRer5/
    end

###########################################################################
# MAKE 10.OOC, 11.OOC FILES FOR BLAT (DONE, 2007-08-28, hartera)
# RE-RUN AFTER RE-RUNNING REPEATMASKER AND RE-MASKING SEQUENCE
# (DONE, 2007-09-05, hartera)
    # Use -repMatch=512 (based on size -- for human we use 1024, and
    # the zebrafish genome is ~50% of the size of the human genome:
    # zebrafish is about 1.44 Gb and human is 3.1 Gb).
    ssh kolossus
    rm -r /cluster/data/danRer5/bed/ooc
    mkdir /cluster/data/danRer5/bed/ooc
    cd /cluster/data/danRer5/bed/ooc
    time blat /cluster/data/danRer5/danRer5.2bit \
        /dev/null /dev/null -tileSize=11 -makeOoc=danRer5_11.ooc -repMatch=512
    # 75.647u 3.159s 1:21.04 97.2% 0+0k 0+0io 2pf+0w
    # Wrote 40297 overused 11-mers to danRer5_11.ooc
    # then make the 10.ooc file for overused 10-mers; repMatch = 4096
    # for human so use 2048 for danRer5
    time blat /cluster/data/danRer5/danRer5.2bit \
        /dev/null /dev/null -tileSize=10 -makeOoc=danRer5_10.ooc -repMatch=2048
    # 72.078u 2.587s 1:14.83 99.7% 0+0k 0+0io 0pf+0w
    # Wrote 9589 overused 10-mers to danRer5_10.ooc
    # copy the *.ooc files over to the san for cluster runs and the
    # GenBank run
    rm /san/sanvol1/scratch/danRer5/*.ooc
    cp -p *.ooc /san/sanvol1/scratch/danRer5/
    # also copy to iscratch
    ssh kkr1u00
    rm /iscratch/i/danRer5/*.ooc
    cp -p /cluster/data/danRer5/bed/ooc/*.ooc /iscratch/i/danRer5/
    # rsync to iServers
    foreach R (2 3 4 5 6 7 8)
        rsync -a --progress /iscratch/i/danRer5/*.ooc \
            kkr${R}u00:/iscratch/i/danRer5/
    end

###########################################################################
# CREATE LIFT FILE SEPARATED BY NON-BRIDGED GAPS
# (DONE, 2007-08-28, hartera)
    # This is needed for the GenBank run. Use Hiram's gapToLift program.
    ssh hgwdev
    cd /cluster/data/danRer5/jkStuff
    gapToLift danRer5 nonBridgedGap.lft
    # copy over to the san
    cp -p nonBridgedGap.lft /san/sanvol1/scratch/danRer5
    # there are 14973 rows in this file, only 370 for mouse -
    # that's ok.
###########################################################################
# AUTO UPDATE GENBANK MRNA AND EST AND ZGC GENES RUN
# (DONE, 2007-08-28, hartera)
# RE-RUN AFTER RE-DOING REPEATMASKING OF GENOME SEQUENCE
# (DONE, 2007-09-05, hartera)
    ssh hgwdev
    cd $HOME/kent/src/hg/makeDb/genbank
    cvs update -dP
    # edit etc/genbank.conf to add danRer5 and commit to CVS
    # done already for the first run so no need to update.
# danRer5 (zebrafish)
danRer5.serverGenome = /cluster/data/danRer5/danRer5.2bit
danRer5.clusterGenome = /iscratch/i/danRer5/danRer5.2bit
danRer5.ooc = /iscratch/i/danRer5/danRer5_11.ooc
danRer5.lift = /cluster/data/danRer5/jkStuff/nonBridgedGap.lft
danRer5.refseq.mrna.native.pslCDnaFilter = ${ordered.refseq.mrna.native.pslCDnaFilter}
danRer5.refseq.mrna.xeno.pslCDnaFilter = ${ordered.refseq.mrna.xeno.pslCDnaFilter}
danRer5.genbank.mrna.native.pslCDnaFilter = ${ordered.genbank.mrna.native.pslCDnaFilter}
danRer5.genbank.mrna.xeno.pslCDnaFilter = ${ordered.genbank.mrna.xeno.pslCDnaFilter}
danRer5.genbank.est.native.pslCDnaFilter = ${ordered.genbank.est.native.pslCDnaFilter}
danRer5.downloadDir = danRer5
danRer5.mgcTables.default = full
danRer5.mgcTables.mgc = all
danRer5.perChromTables = no
# end of section added to etc/genbank.conf
    cvs commit -m "Added danRer5." etc/genbank.conf
    # also added the perChromTables line afterwards. This means that there
    # will not be one table per chromosome. There would be too many tables
    # as there are 5010 scaffolds.
    # update /cluster/data/genbank/
    make etc-update
    # ~/kent/src/hg/makeDb/genbank/src/lib/gbGenome.c already contains
    # danRer genome information
    ssh kkstore06
    cd /cluster/data/genbank
    # for the re-run, remove everything under these directories:
    rm -r /cluster/data/genbank/data/aligned/genbank.161.0/danRer5/*
    rm -r /cluster/data/genbank/data/aligned/refseq.24/danRer5/*
    time nice bin/gbAlignStep -initial danRer5 &
    # 3691.965u 996.052s 4:08:37.03 31.4% 0+0k 0+0io 32pf+0w
    # logFile: var/build/logs/2007.09.05-10:04:19.danRer5.initalign.log
    # check log file - looks ok, it has finished (last line in log file):
    # kkstore06.cse.ucsc.edu 2007.09.05-14:12:56 danRer5.initalign: finish
    # then load the database
    ssh hgwdev
    cd /cluster/data/genbank
    # the -drop parameter will ensure that old tables are dropped before
    # loading
    time nice ./bin/gbDbLoadStep -drop -initialLoad danRer5 &
    # logFile: var/dbload/hgwdev/logs/2007.09.05-14:36:03.dbload.log
    # 1108.286u 310.389s 41:23.51 57.1% 0+0k 0+0io 16pf+0w
    # This time it ran smoothly and the only problem was an NFS issue with
    # removing directories at the end - the load went ok though.
    # The first attempt at loading the GenBank alignments failed because
    # there were too many open files when trying to load a table per chrom
    # and per scaffold, so the danRer5.perChromTables = no line was added
    # to the genbank.conf file (see above) so that it will all be loaded
    # into one table. There would be too many tables otherwise, as there
    # are 25 chroms, chrM and 5010 scaffolds.
    # Solution: remove the lock file:
    # rm /cluster/data/genbank/var/dbload/hgwdev/run/dbload.lock
    # then run again

###########################################################################
# ENSEMBL GENES TRACK FOR PROTEIN-CODING GENES FROM
# ENSEMBL VERSION 46 (AUG 2007) (DONE, 2007-09-05, markd)
# Compare peptides translated from the genome to those downloaded from
# BioMart for these transcripts and LOAD ensPep TABLE WITH PEPTIDE
# SEQUENCES FOR THE PROTEIN-CODING ENSEMBL GENES
# (DONE, 2007-09-24 - 2007-09-25, hartera)
# ADD AUTOSQL CODE FOR ensInfo TABLE AND ADD CODE TO HGC TO PRINT
# THIS ADDITIONAL INFORMATION ON THE DETAILS PAGE FOR EACH GENE
# (DONE, 2007-09-28 and 2007-10-03, hartera)
    # Used the programs created by Robert Baertsch to download Ensembl
    # genes from Ensembl's ftp site and load them into the danRer5 database.
    ssh hgwdev
    mkdir /cluster/data/danRer5/bed/ensembl46
    cd /cluster/data/danRer5/bed/ensembl46
    # Download Ensembl genes from the ftp site and create tracks in the
    # browser. The current version is danio_rerio core_46_7:
    hgLoadEnsembl danio_rerio core_46_7 danRer5
    # these problems were detected, add them to the problem.ids file:
    genePredCheck -db=danRer5 ensGene.gp
    # Error: ensGene.gp:2720: ENSDART00000019661 exonFrame on non-CDS exon 14
    # Error: ensGene.gp:3874: ENSDART00000027605 exonFrame on non-CDS exon 0
    # Error: ensGene.gp:9416: ENSDART00000062681 exonFrame on non-CDS exon 12
    # checked: 31743 failed: 3
    hgLoadEnsembl -f problem.ids danio_rerio core_46_7 danRer5
    # hgLoadEnsembl creates the ensGene, ensGeneXref, ensGtp and ensInfo
    # tables but no ensPep peptide sequence table.
    # Compare the peptides as translated from the genomic sequence of the
    # CDS for each transcript and the peptides for the transcripts
    # downloaded from BioMart (2007-09-23 - 2007-09-24, hartera)
    ssh kkstore06
    mkdir /cluster/data/danRer5/bed/ensembl46/peptides
    ssh hgwdev
    cd /cluster/data/danRer5/bed/ensembl46/peptides
    hgsql -N -e 'select distinct(name) from ensGene;' danRer5 | sort \
        > ensGene.txIds.sort
    # First get peptides from the genome translation of the genes defined in
    # ensGene. This does not work for the mitochondrial genome. The ability
    # to deal with the vertebrate mitochondrial genetic code for translation
    # of genes on chrM was added (markd).
    getRnaPred -peptides -genePredExt danRer5 ensGene all ensGeneTablePep.fa
    # Took about 10 minutes
    # Then download the peptide sequences from BioMart and compare;
    # differences may include selenocysteines and translational frameshifts.
    # Get the Ensembl peptide sequences from
    # http://www.ensembl.org/biomart/martview
    # Follow this sequence:
    # 1) Choose Ensembl Genes 46 as the database and then
    #    Danio rerio genes (ZFISH7) as the dataset.
    # 2) Click on the Attributes link in the left panel. Select sequences.
    # 3) Expand the SEQUENCES section and choose Peptide as the type of
    #    sequence to export, then expand the Header Information section and
    #    select Ensembl Gene ID from Gene Attributes and Ensembl Transcript
    #    ID and Ensembl Peptide ID from Transcript Attributes.
    # 4) Click on the Filters link in the left panel and expand the GENE
    #    section. Select the Gene type box and then select protein_coding
    #    as these are the only genes with an associated protein sequence.
    # 5) Click on the Results link in the top black menu bar and
    #    choose FASTA for the output and export all results to
    #    Compressed file (.gz). Select unique results only.
    # save the file as ensembl46Pep.fasta.gz and move it to
    # /cluster/data/danRer5/bed/ensembl46
    ssh kkstore06
    cd /cluster/data/danRer5/bed/ensembl46/peptides
    # unzip the Ensembl peptides file downloaded from BioMart
    gunzip ensembl46Pep.fasta.gz
    grep '>' ensembl46Pep.fasta | wc -l
    # 31743
    grep '>' ensembl46Pep.fasta > headers
    awk 'BEGIN {FS="|"} {print $1;}' headers | sort | uniq \
        > pepBioMart.txIds.sort
    perl -pi.bak -e 's/>//' pepBioMart.txIds.sort
    # compare the list of transcript IDs from the ensGene table to those
    # for the downloaded peptides from BioMart.
    comm -13 ensGene.txIds.sort pepBioMart.txIds.sort
    # from the BioMart peptide download there were 3 extra:
    # ENSDART00000019661
    # ENSDART00000027605
    # ENSDART00000062681
    # These were the problem IDs that were removed from the set - see above.
    comm -23 ensGene.txIds.sort pepBioMart.txIds.sort
    # no difference
    # change headers so that they only have the Ensembl transcript ID.
    awk 'BEGIN{FS="|"} {if ($1 ~ /ENSDART/) print $1; else print;}' \
        ensembl46Pep.fasta > ensembl46PepTxIdsOnly.fasta
    # use faCmp (~/kent/src/utils/faCmp/faCmp.c) to compare the two files.
    # it requires the same number of sequences in both files.
    # Use faFilter to remove the ones not in ensGeneTablePep.fa
    faFilter -v -name=ENSDART00000019661 ensembl46PepTxIdsOnly.fasta tmp1.fa
    faFilter -v -name=ENSDART00000027605 tmp1.fa tmp2.fa
    faFilter -v -name=ENSDART00000062681 \
        tmp2.fa ensembl46PepTxIdsOnlyFilt.fasta
    grep '>' ensembl46PepTxIdsOnlyFilt.fasta | wc -l
    # 31740
    rm tmp*
    # faCmp assumed that DNA sequences were being compared. An option was
    # added so that protein sequences could be compared (markd added this).
    faCmp -sortName -peptide \
        ensGeneTablePep.fa ensembl46PepTxIdsOnlyFilt.fasta \
        >& ensTablevsBioMartFasta.diff
    # ENSDART00000038434 in ensGeneTablePep.fa differs from
    #   ENSDART00000038434 at ensembl46PepTxIdsOnlyFilt.fasta at base 114 (X != V)
    # ENSDART00000046903 in ensGeneTablePep.fa differs from
    #   ENSDART00000046903 at ensembl46PepTxIdsOnlyFilt.fasta at base 71 (X != T)
    # ENSDART00000047515 in ensGeneTablePep.fa differs from
    #   ENSDART00000047515 at ensembl46PepTxIdsOnlyFilt.fasta at base 149 (X != L)
    # ENSDART00000049170 in ensGeneTablePep.fa differs from
    #   ENSDART00000049170 at ensembl46PepTxIdsOnlyFilt.fasta at base 172 (X != S)
    # ENSDART00000080260 in ensGeneTablePep.fa differs from
    #   ENSDART00000080260 at ensembl46PepTxIdsOnlyFilt.fasta at base 353 (X != G)
    # ENSDART00000081978 in ensGeneTablePep.fa differs from
    #   ENSDART00000081978 at ensembl46PepTxIdsOnlyFilt.fasta at base 21 (X != R)
    # ENSDART00000082954 in ensGeneTablePep.fa differs from
    #   ENSDART00000082954 at ensembl46PepTxIdsOnlyFilt.fasta at base 747 (X != A)
    # ENSDART00000083022 in ensGeneTablePep.fa differs from
    #   ENSDART00000083022 at ensembl46PepTxIdsOnlyFilt.fasta at base 344 (X != L)
    # ENSDART00000105255 in ensGeneTablePep.fa differs from
    #   ENSDART00000105255 at ensembl46PepTxIdsOnlyFilt.fasta at base 427 (X != A)
    # There are differences in just 9 of the protein sequences where we have
    # an "X" because there is an "N" in the genome sequence in the codon for
    # the amino acid, whereas Ensembl used transcript evidence to predict
    # the missing base. Since there are so few differences, the predicted
    # translation from the genome is good enough without loading an ensPep
    # table.
    # Load it anyway for completeness since there are some differences. Use
    # "generic" as the type for loading using hgPepPred, not "ensembl",
    # since all the IDs but the transcript ID were removed from the header
    # line.
# If type "ensembl" is defined, it expects at least 3 IDs separated by "|". ssh hgwdev cd /cluster/data/danRer5/bed/ensembl46/peptides # load the BioMart downloaded Ensembl 46 peptide sequences into database: hgPepPred danRer5 generic ensPep ensembl46PepTxIdsOnlyFilt.fasta hgsql -e 'select count(*) from ensPep;' danRer5 # 31740 # All the sequences were loaded. # Added a zebrafish/danRer5/trackDb.ra entry for ensGene with the # archive setting, aug2007, so that URLs for Ensembl Genes point to the # correct archive. Added the Esnembl version number to the longlabel and # as a dataVersion setting (Ensembl Relese 46). # clean up ssh kkstore06 cd /cluster/data/danRer5/bed/ensembl46/peptides rm tmp*.fa headers ensPep.tab ensembl46PepTxIdsOnly.fasta *.bak rm pepBioMart.txIds.sort ensGene.txIds.sort # (2007-09-28, hartera) # Add code to use ensInfo table, use autoSql. There is already a # ensInfo.sql table specified in ~/kent/src/hg/lib ssh hgwdev cat << 'EOF' > $HOME/kent/src/hg/lib/ensInfo.as table ensInfo "ensembl Genes track additional information" ( string name; "Ensembl transcript ID" string otherId; "other ID" string geneId; "Ensembl gene ID" string class; "Ensembl gene type" string geneDesc; "Ensembl gene description" string status; "Ensembl gene confidence" ) 'EOF' cd $HOME/kent/src/hg/lib/ensInfo.as # move the current ensInfo.sql out of the way mv ensInfo.sql tmp.sql autoSql ensInfo.as ensInfo # renmae autogenerated file and reinstate original file as ensInfo.sql # Difference is that this file has a longblob for description whereas # the description field in the autoSql generated code just has a # varchar[255] which may not be big enough in all cases. mv ensInfo.sql ensInfoTemp.sql mv tmp.sql ensInfo.sql mv ensInfo.h ../inc/ # commit ../inc/ensInfo.h, ensInfo.c and ensInfo.as to CVS # add ensInfo.o to makefile and commmit to CVS cd $HOME/kent/src/hg/ # add code to hgc.c to doEnsemblGene function to write information # from the ensInfo table on the details page for each gene. # (2007-10-03, hartera). Check whether the name2 field in ensGene is # always the same as otherId in the ensInfo table. ssh hgwdev cd /cluster/data/danRer5/bed/ensembl46 hgsql -N -e 'select name, name2 from ensGene;' danRer5 \ | sort | uniq > names.ensGene.sort hgsql -N -e 'select name, otherId from ensInfo;' danRer5 \ | sort | uniq > names.ensInfo.sort wc -l names*.sort # 31740 names.ensGene.sort # 31991 names.ensInfo.sort comm -12 names.ensGene.sort names.ensInfo.sort | wc -l # 31740 # So the name, name2 pair in ensGene is the same as the name, otherId # pair in ensInfo. ensInfo has more entries than ensGene because # it includes pseudogene and non-coding RNAs. hgc already prints the # Alternate Name using name2 from ensGene so do not need to print out # the otherId field from ensInfo. ########################################################################### # MAKE HGCENTRALTEST BLATSERVERS ENTRY FOR danRer5 (DONE, 2006-09-11, hartera) ssh hgwdev # DNA port is "0", trans prot port is "1" echo 'insert into blatServers values("danRer5", "blat8", "17784", "1", "0"); insert into blatServers values("danRer5", "blat8", "17785", "0", "1");' \ | hgsql hgcentraltest # this enables blat and isPcr, isPcr is enabled by loading blat server # with tilesize=5 (ask for this when request blat servers from # cluster admin). 
# if you need to delete those entries
echo 'delete from blatServers where db="danRer5";' | hgsql hgcentraltest
###########################################################################
# VEGA GENES (DONE, 2007-09-05 - 2007-09-12, hartera)
# Data provided by Tina Eyre from Sanger: te3@sanger.ac.uk
# GTF file sent on 2007-08-30, Vega v27
ssh kkstore06
mkdir -p /cluster/data/danRer5/bed/vega27/data/
cd /cluster/data/danRer5/bed/vega27/data
# 2 files: vegav27.gtf.zip vegav27_information.txt.zip
unzip vegav27.gtf.zip
mv ./lustre/work1/ensembl/te3/otter_vega_builds/vega/070523/vegav27.gtf .
rm -r lustre
unzip vegav27_information.txt.zip
mv \
./lustre/work1/ensembl/te3/otter_vega_builds/vega/070523/vegav27_information.txt \
    .
rm -r lustre
# asked for the vegav27_information.txt file to be in the format
# required to load the table directly.
# Check the format then prepare data files to load database tables.
# Can grep for "otter" to get only gene description lines from the GTF
# file - the rest is output and header from the program that generated it.
# 2007-09-06, hartera
ssh kkstore06
cd /cluster/data/danRer5/bed/vega27
# get just the lines describing genes and remove the "chr" prefix from
# scaffolds. NOTE: there are no NA scaffolds.
grep "otter" ./data/vegav27.gtf | sed -e 's/chrZv7_/Zv7_/' > vegav27Fixed.gtf
# vegaInfo is transcriptId, otterId, geneId, method and geneDesc
# Get otter transcript ID and otter gene ID:
awk 'BEGIN{FS=";"} {OFS="\t"} \
    {if (($5 ~ /otter_gene_id/) && ($6 ~ /otter_transcript_id/)) \
    print $5, $6;}' vegav27Fixed.gtf | sort | uniq > vega27OtterIds.txt
# remove the otter_gene_id and otter_transcript_id labels and the
# extra spaces and quotation marks.
perl -pi.bak -e 's/\sotter_gene_id\s\"//;' vega27OtterIds.txt
perl -pi.bak -e 's/"\t\sotter_transcript_id\s"(OTTDART[0-9]+)"/\t$1/' \
    vega27OtterIds.txt
wc -l vega27OtterIds.txt
# 12819 vega27OtterIds.txt
# get list of the otter gene IDs only
awk '{print $1}' vega27OtterIds.txt | sort | uniq > otterGeneIds.only
# then get list of otter gene IDs from the information file
awk '{if ($1 ~ /OTTDARG/) print $1}' data/vegav27_information.txt \
    | sort | uniq > infoOtterGeneIds.only
comm -12 otterGeneIds.only infoOtterGeneIds.only | wc -l
# 8649
wc -l *.only
# 10374 infoOtterGeneIds.only
# 8649 otterGeneIds.only
# All the gene IDs from the GTF file are in the information file, but the
# information file contains 1725 gene IDs that are not in the GTF file.
comm -13 otterGeneIds.only infoOtterGeneIds.only > geneIds.InfoOnly.txt
cat << 'EOF' > mergeTransIdAndInfo.pl
#!/usr/bin/perl -w
use strict;

my ($idsFile, $infoFile, %idsHash);
$idsFile = $ARGV[0];
$infoFile = $ARGV[1];
open (IDS, $idsFile) || die "Can not open $idsFile: $!\n";
open (INFO, $infoFile) || die "Can not open $infoFile: $!\n";
# Read in the otter gene IDs and transcript IDs
while (<IDS>) {
   my (@f, $line);
   chomp;
   $line = $_;
   # split line on tab
   @f = split(/\t/, $line);
   # hash keyed by gene ID, stores transcript ID
   # more than one ID so store as an array
   if ($f[0] =~ /OTTDARG/) {
      # add to array in hash if it is an otter gene ID
      push @{$idsHash{$f[0]}}, $f[1];
   }
}
close IDS;
while (<INFO>) {
   my ($line, @info, $geneId, @transIds);
   # read in information file and get the otter gene ID
   $line = $_;
   # split line on tab
   @info = split(/\t/, $line);
   if ($info[0] =~ /OTTDARG/) {
      $geneId = $info[0];
      # look up transcript ID in hash
      # if gene ID exists in the hash then print out the information
      # file line with the correct transcript ID
      if (exists($idsHash{$geneId})) {
         @transIds = @{$idsHash{$geneId}};
         # rewrite line with transcript ID
         foreach my $t (@transIds) {
            print "$t\t$line";
         }
      }
   }
}
close INFO;
'EOF'
# << happy emacs
chmod +x mergeTransIdAndInfo.pl
mergeTransIdAndInfo.pl vega27OtterIds.txt ./data/vegav27_information.txt \
    > vegav27InfoWithTransIds.txt
wc -l vega27OtterIds.txt
# 12819 vega27OtterIds.txt
wc -l vegav27InfoWithTransIds.txt
# 12820 vegav27InfoWithTransIds.txt
awk 'BEGIN {OFS="\t"} {print $2, $1}' vegav27InfoWithTransIds.txt \
    | sort | uniq > infoWithTrans.Ids
sort vega27OtterIds.txt | uniq > otterIds.sort
comm -13 otterIds.sort infoWithTrans.Ids
comm -23 otterIds.sort infoWithTrans.Ids
# No difference, so there must be an extra duplicated line
wc -l infoWithTrans.Ids
# 12819 infoWithTrans.Ids
awk 'BEGIN {OFS="\t"} {print $2, $1}' vegav27InfoWithTransIds.txt \
    | sort | uniq -c | sort -nr > count
# 2 OTTDARG00000014371 OTTDART00000027056
# this appears twice in the info file
grep OTTDARG00000014371 ./data/vegav27_information.txt
# OTTDARG00000014371 DKEY-26M21.1 ZDB-GENE-041210-211
#    si:ch211-162i19.2 protein_coding novel protein NOVEL
# OTTDARG00000014371 DKEY-26M21.1 ZDB-GENE-041210-211
#    si:ch211-162i19.2 protein_coding novel protein NOVEL
# remove one copy from the vegav27_information.txt file:
# just sort and uniq the resulting information file with transcript IDs
sort vegav27InfoWithTransIds.txt | uniq > vegav27InfoWithTransIdsUniq.txt
wc -l vegav27InfoWithTransIdsUniq.txt
# 12819 vegav27InfoWithTransIdsUniq.txt
# There are extra IDs in the information file as it includes all Vega
# genes (including clones not in Ensembl). Those actually in the Vega
# v27 GTF are those that are found on clones that are in Ensembl.
# Vega includes only clone sequence and Zv7_NA scaffolds are WGS so
# these are not annotated in Vega. This information is from
# an e-mail from Tina Eyre (te3@sanger.ac.uk) on 2007-09-07.
# then use the info file to grab those genes that are pseudogenes, get the
# transcript ID from the vegaIDs.txt file. Then grep out the pseudogenes
# to a separate file.
# Next step is to find which transcripts are pseudogenes.
grep pseudogene vegav27InfoWithTransIdsUniq.txt | sort | uniq | wc -l
# found 58 pseudogenes so need to create the pseudogene track
# Get transcript IDs for pseudogenes.
grep pseudogene vegav27InfoWithTransIdsUniq.txt | awk '{print $1}' \
    > pseudogenes.ids
grep -w -f pseudogenes.ids vegav27Fixed.gtf > vegav27Pseudogene.gtf
awk '{print $20}' vegav27Pseudogene.gtf | sort | uniq | wc -l
# 58
# Make the GTF file for the vegaGene table by subtracting pseudogenes:
grep -vw -f pseudogenes.ids vegav27Fixed.gtf > vegaGenev27.gtf
wc -l vega*gtf
# 168381 vegaGenev27.gtf
# 168602 vegav27Fixed.gtf
#    221 vegav27Pseudogene.gtf
# Need to relabel IDs to get the name to be the otter transcript ID
# and name2 to be the transcript_id (needs to be labeled as gene_id).
# Also, relabel the otter_transcript_id to be transcript_id as ldHgGene
# groups the rows by this ID.
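# Worked example of the relabeling done below (attribute values are
# illustrative, not from the real file):
#   before: gene_id "GENE"; transcript_id "TX"; otter_transcript_id "OTTDART..."
#   after:  tmp_id "GENE"; gene_id "TX"; transcript_id "OTTDART..."
# so gtfToGenePred groups rows by the otter transcript ID (name) and the
# original transcript_id ends up in name2 via the gene_id attribute.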
sed -e 's/gene_id/tmp_id/' vegaGenev27.gtf \
    | sed -e 's/transcript_id/gene_id/' \
    | sed -e 's/otter_transcript_id/transcript_id/' > vegaGenev27Format.gtf
# remove an extra line
grep -v "rachels_data" vegaGenev27Format.gtf > tmp
mv tmp vegaGenev27Format.gtf
# Do the same for the pseudogene GTF file:
sed -e 's/gene_id/tmp_id/' vegav27Pseudogene.gtf \
    | sed -e 's/transcript_id/gene_id/' \
    | sed -e 's/otter_transcript_id/transcript_id/' \
    > vegaPseudogenev27Format.gtf
gtfToGenePred -genePredExt -infoOut=tmpInfo.txt \
    vegaGenev27Format.gtf vegaGenev27.gp
genePredCheck vegaGenev27.gp
# genepred ok checked: 12761 failed: 0
gtfToGenePred -genePredExt -infoOut=tmpInfo.txt \
    vegaPseudogenev27Format.gtf vegaPseudoGenev27.gp
genePredCheck vegaPseudoGenev27.gp
# genepred ok checked: 58 failed: 0
# load GTF files for Vega genes and pseudogenes:
ssh hgwdev
cd /cluster/data/danRer5/bed/vega27
# load vegaGene table
hgLoadGenePred -genePredExt danRer5 vegaGene vegaGenev27.gp
# load vegaPseudoGene table
hgLoadGenePred -genePredExt danRer5 vegaPseudoGene vegaPseudoGenev27.gp
hgsql -N -e 'select distinct(chrom) from vegaGene;' danRer5
# There are 36: chr1-25 and these 11 annotated scaffolds:
# Zv7_scaffold2491
# Zv7_scaffold2498
# Zv7_scaffold2509
# Zv7_scaffold2516
# Zv7_scaffold2523
# Zv7_scaffold2533
# Zv7_scaffold2537
# Zv7_scaffold2551
# Zv7_scaffold2560
# Zv7_scaffold2633
# Zv7_scaffold2650
hgsql -N -e 'select distinct(chrom) from vegaPseudoGene;' danRer5
# 12 chroms and 1 scaffold have pseudogenes annotated:
# chr2, chr3, chr4, chr5, chr9, chr12, chr13, chr18, chr19, chr20, chr24,
# Zv7_scaffold2509
# Only finished sequence is annotated by Vega.
# Vega information tables:
# mySQL table definition and autoSql-generated files were created previously
# for the zebrafish-specific information table (vegaInfoZfish).
# Add clone_id to a separate table instead of this one. A tab-separated
# file of transcript ID and clone ID was provided by Tina Eyre
# (te3@sanger.ac.uk) at Sanger.
# Need to grep for the transcript IDs.
# Created a second table for the cloneId accessions since there
# are multiple IDs for some Vega genes. Otherwise, there would be
# a comma-separated list in this field or many rows repeated but just
# different in the cloneId field. Associate transcript ID to clone IDs.
# Table definitions are in danRer4.txt.
# first check that the clone IDs file is complete.
# this is from Tina Eyre, sent on 2007-09-07, and the file is
# vegav27_transcript_to_clone.txt.zip
ssh kkstore06
cd /cluster/data/danRer5/bed/vega27
unzip vegav27_transcript_to_clone.txt.zip
awk '{print $1}' vegav27_transcript_to_clone.txt | sort | uniq \
    > transWithCloneId.txt
wc -l transWithCloneId.txt
# 12819 transWithCloneId.txt
awk '{print $1}' vegav27InfoWithTransIdsUniq.txt | sort | uniq \
    > vegaInfoTransIds.txt
wc -l vegaInfoTransIds.txt
# 12819 vegaInfoTransIds.txt
# compare to vega27OtterIds.txt
comm -12 transWithCloneId.txt vegaInfoTransIds.txt | wc -l
# 12819
# so all the transcript IDs are in this list.
# sort this table:
sort vegav27_transcript_to_clone.txt | uniq > vegav27TransToClone.txt
# load these tables:
ssh hgwdev
cd /cluster/data/danRer5/bed/vega27
hgLoadSqlTab danRer5 vegaInfoZfish ~/kent/src/hg/lib/vegaInfoZfish.sql \
    vegav27InfoWithTransIdsUniq.txt
hgLoadSqlTab danRer5 vegaToCloneId ~/kent/src/hg/lib/vegaToCloneId.sql \
    vegav27TransToClone.txt
# Add trackDb.ra entry. Keep Vega Genes and Vega Pseudogenes separate as for
# human and add the version number to the description.
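# (Sketch, not part of the original build: list transcripts with more than
# one clone, the reason vegaToCloneId is a separate table. Column names
# transcriptId and cloneId are assumed from the descriptions above.)
hgsql -N -e 'select transcriptId, count(*) from vegaToCloneId group by transcriptId having count(*) > 1;' danRer5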
###########################################################################
# BLASTZ FOR MEDAKA (oryLat1) (DONE, 2007-09-10, hartera)
# CREATE CHAIN AND NET TRACKS, AXTNET, MAFNET AND ALIGNMENT DOWNLOADS
# No lineage-specific repeats for this species pair.
# Medaka also has no species-specific repeats in the RepeatMasker
# library so run this using dynamic masking.
ssh kkstore06
mkdir /cluster/data/danRer5/bed/blastz.oryLat1.2007-09-10
cd /cluster/data/danRer5/bed
ln -s blastz.oryLat1.2007-09-10 blastz.oryLat1
cd /cluster/data/danRer5/bed/blastz.oryLat1.2007-09-10
# Use the Blastz parameters used for tetNig1 in danRer4.txt. Using scaffolds
# makes this run slower so it is best to have the scaffolds in the query.
# Use the HoxD55.q matrix as medaka is quite distant from zebrafish. Blastz
# uses lineage-specific repeats but there are none for these two species.
# Use soft-masked scaffolds for oryLat1 (and dynamic masking) and also use
# WindowMasker plus SDust repeats since there are no species-specific
# RepeatMasker repeats for medaka. The random scaffolds were used in Blastz
# separately (oryLat1UnScaffolds.2bit) rather than as an artificial unordered
# chromosome to avoid getting Blastz alignments across non-contiguous
# scaffolds.
cat << '_EOF_' > DEF
# zebrafish (danRer5) vs.
Medaka (oryLat1) # BLASTZ=blastz.v7.x86_64 # BLASTZ_H=2000 # BLASTZ_Y=3400 # BLASTZ_L=6000 # BLASTZ_K=2200 # BLASTZ_M=50 # BLASTZ_Q=/cluster/data/blastz/HoxD55.q # Run 1: featureBits danRer5 refGene:cds chainOryLat1Run1Link -enrichment # refGene:cds 1.019%, chainOryLat1Run1Link 10.478%, both 0.895%, cover 87.79%, # enrich 8.38x # Run 2: featureBits danRer5 refGene:cds chainOryLat1Link -enrichment # refGene:cds 1.019%, chainOryLat1Link 11.164%, both 0.894%, cover 87.77%, # enrich 7.86x # Chains danRer4, parameters like Run 2: featureBits danRer4 refGene:cds chainOryLat1Link -enrichment # refGene:cds 0.952%, chainOryLat1Link 12.366%, both 0.813%, cover 85.43%, # enrich 6.91x # Run 1: hgsql -e 'select count(*) from chainOryLat1Run1Link;' danRer5 +----------+ | count(*) | +----------+ | 32786056 | +----------+ # Run 2: hgsql -e 'select count(*) from chainOryLat1Link;' danRer5 +----------+ | count(*) | +----------+ | 46341892 | +----------+ # Run 1: hgsql -e 'select count(*) from chainOryLat1Run1;' danRer5 +----------+ | count(*) | +----------+ | 2197571 | +----------+ # Run 2: hgsql -e 'select count(*) from chainOryLat1;' danRer5 +----------+ | count(*) | +----------+ | 2961333 | +----------+ # Nets Run 1: featureBits danRer5 refGene:cds netOryLat1Run1 -enrichment # refGene:cds 1.019%, netOryLat1Run1 65.343%, both 0.977%, cover 95.85%, # enrich 1.47x # Nets Run 2: featureBits danRer5 refGene:cds netOryLat1 -enrichment # refGene:cds 1.019%, netOryLat1 66.238%, both 0.974%, cover 95.63%, enrich # 1.44x # Nets danRer4, parameters like Run 2: featureBits danRer4 refGene:cds netOryLat1 -enrichment # refGene:cds 0.952%, netOryLat1 64.174%, both 0.888%, cover 93.29%, enrich # 1.45x # [1] Exit 25 # /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 -bigClusterHub=pk # 0.188u 0.132s 7:33:05.44 0.0% 0+0k 0+0io 3pf+0w # Command failed: # ssh -x hgwdev nice # /cluster/data/danRer5/bed/blastz.oryLat1.2007-09-10/axtChain/installDownloads.csh ssh hgwdev # remove downloads and run install downloads again cd /usr/local/apache/htdocs/goldenPath/danRer5/ rm -r vsOryLat1 rm -r liftOver ssh kkstore06 cd /cluster/data/danRer5/bed/blastz.oryLat1.2007-09-10 # Run script again starting at download step time nice /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \ -qRepeats=windowmaskerSdust -continue download \ -blastzOutRoot /san/sanvol1/scratch/danRer5OryLat1 \ `pwd`/DEF >& doBlastz2.log & # 0.095u 0.047s 17:35.63 0.0% 0+0k 0+0io 0pf+0w ## DO BLASTZ SWAP TO CREATE THE DANRER5 CHAINS, NETS, AXTNET, MAFNET AND # ALIGNMENT DOWNLOADS FOR DANRER5 ON ORYLAT1 - documented in oryLat1.txt ########################################################################### # SPLIT MASKED SEQUENCE INTO CHROM AND SCAFFOLD FASTA FILES # (DONE, 2007-09-10, hartera) ssh kkstore06 cd /cluster/data/danRer5/ mkdir splitSoftMask cd splitSoftMask twoBitToFa ../danRer5.2bit danRer5.softMasked.fa faSplit byname danRer5.softMasked.fa . rm danRer5.softMasked.fa # Copy the chrom and scaffold sequences to /san/sanvol1/scratch/danRer5 mkdir -p /san/sanvol1/scratch/danRer5/splitSoftMask foreach f (/cluster/data/danRer5/splitSoftMask/*.fa) cp -p $f /san/sanvol1/scratch/danRer5/splitSoftMask/ end foreach f (/san/sanvol1/scratch/danRer5/splitSoftMask/*.fa) ls $f >> seq.lst end wc -l seq.lst # 5036 seq.lst # correct: 25 chroms, 1 chrM, and 5010 unmapped scaffolds. 
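# (Equivalent quick check, sketch: the number of split FASTA files should
# match the assembly's sequence count.)
wc -l /cluster/data/danRer5/chrom.sizes
# expect 5036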
rm seq.lst
# Copy the split sequences to the iscratch dir on kkr1u00
ssh kkr1u00
rm -r /iscratch/i/danRer5/splitSoftMask
mkdir -p /iscratch/i/danRer5/splitSoftMask
foreach s (/cluster/data/danRer5/splitSoftMask/*.fa)
    echo "Copying $s ..."
    cp $s /iscratch/i/danRer5/splitSoftMask/
end
# Count files transferred
foreach f (/iscratch/i/danRer5/splitSoftMask/*.fa)
    ls $f >> seq.lst
end
wc -l seq.lst
# 5036 seq.lst
# correct: 25 chroms, 1 chrM, and 5010 unmapped scaffolds.
rm seq.lst
# rsync to cluster machines
foreach R (2 3 4 5 6 7 8)
    rsync -a --progress /iscratch/i/danRer5/ kkr${R}u00:/iscratch/i/danRer5/
end
###########################################################################
# AFFY ZEBRAFISH GENOME ARRAY CHIP TRACK (DONE, 2007-09-10 - 2007-09-12)
# With the first version of this track on danRer4, QA found a number of short
# alignments of <= 30 bp and there are a number in the <= 50 bp range.
# These do not seem to be meaningful so the filtering was changed to try to
# remove these alignments while retaining meaningful alignments.
# pslCDnaFilter was used with the same settings as used for the
# GenBank EST alignments for zebrafish.
# Also use -minIdentity=90 for Blat instead of -minIdentity=95 since the
# higher minIdentity causes alignments to be dropped that should not be.
# Blat's minIdentity seems to be more severe than that for pslReps or
# pslCDnaFilter as it takes insertions and deletions into account.
# These are Jim's recommendations.
# Load both consensus and target sequences to form a composite track. Users
# have asked for Affy target sequences in the past.
# Array chip consensus sequences were already downloaded for danRer1.
ssh hgwdev
cd /projects/compbio/data/microarray/affyZebrafish
mkdir -p /san/sanvol1/scratch/affy
# Affy Zebrafish consensus sequences already in this directory:
cp /projects/compbio/data/microarray/affyZebrafish/Zebrafish_consensus.fa \
    /san/sanvol1/scratch/affy/
# Set up cluster job to align Zebrafish consensus sequences to danRer5
mkdir -p /cluster/data/danRer5/bed/affyZebrafish.2007-09-10
ln -s /cluster/data/danRer5/bed/affyZebrafish.2007-09-10 \
    /cluster/data/danRer5/bed/affyZebrafish
ssh pk
cd /cluster/data/danRer5/bed/affyZebrafish
ls -1 /san/sanvol1/scratch/affy/Zebrafish_consensus.fa > affy.lst
foreach f (/san/sanvol1/scratch/danRer5/splitSoftMask/*.fa)
    ls -1 $f >> genome.lst
end
wc -l genome.lst
# 5036 genome.lst
# for output:
mkdir -p /san/sanvol1/scratch/danRer5/affy/psl
# use the -repeats option to report matches to repeat bases separately
# from other matches in the PSL output.
echo '#LOOP\n/cluster/bin/x86_64/blat -fine -repeats=lower -minIdentity=90 -ooc=/san/sanvol1/scratch/danRer5/danRer5_11.ooc $(path1) $(path2) {check out line+ /san/sanvol1/scratch/danRer5/affy/psl/$(root1)_$(root2).psl}\n#ENDLOOP' > template.sub
/parasol/bin/gensub2 genome.lst affy.lst template.sub para.spec
/parasol/bin/para create para.spec
# 5036 jobs written to batch
/parasol/bin/para try, check, push ... etc.
/parasol/bin/para time
# Completed: 5036 of 5036 jobs
#CPU time in finished jobs:   22117s   368.61m   6.14h   0.26d   0.001 y
#IO & Wait Time:              14290s   238.17m   3.97h   0.17d   0.000 y
#Average job time:                7s     0.12m   0.00h   0.00d
#Longest running job:             0s     0.00m   0.00h   0.00d
#Longest finished job:          840s    14.00m   0.23h   0.01d
#Submission to last job:        988s    16.47m   0.27h   0.01d
# need to do pslSort
ssh pk
cd /san/sanvol1/scratch/danRer5/affy
# Do sort and then best-in-genome filter.
# only use alignments that have at least
# 95% identity in the aligned region.
# Previously did not use minCover since a lot of sequence is in # Un and NA so genes may be split up so good to see all alignments. # However, found a number of short alignments of <= 50 bp. These are # not meaningful so maybe need to use minCover. If increased too much, # then hits on poor parts of the assembly will be missed. # use pslCDnaFilter with the same parameters as used for zebrafish # Genbank EST alignments. pslSort dirs raw.psl tmp psl pslCDnaFilter -localNearBest=0.005 -minQSize=20 -minNonRepSize=16 \ -ignoreNs -bestOverlap -minId=0.95 -minCover=0.15 raw.psl \ affyZebrafishConsensus.psl # seqs aligns # total: 15295 459469 # drop minNonRepSize: 2791 391655 # drop minIdent: 2590 30197 # drop minCover: 1712 7112 # weird over: 237 1038 # kept weird: 176 228 # drop localBest: 1393 13532 # kept: 14971 16973 # 96.6% of sequences aligned. There are 15502 Affy consensus sequences. # FOR DANRER4: # seqs aligns # total: 15272 828202 #drop minNonRepSize: 2763 741674 # drop minIdent: 2656 39188 # drop minCover: 2550 10784 # weird over: 359 1439 # kept weird: 277 347 # drop localBest: 2830 17737 # kept: 14952 18819 # Kept 97.9% of alignments. There are 15502 Affy sequences originally # aligned so there are now 96.5% remaining. # rsync these psl files rsync -a --progress /san/sanvol1/scratch/danRer5/affy/*.psl \ /cluster/data/danRer5/bed/affyZebrafish/ ssh kkstore06 cd /cluster/data/danRer5/bed/affyZebrafish # shorten names in psl file sed -e 's/Zebrafish://' affyZebrafishConsensus.psl > tmp.psl mv tmp.psl affyZebrafishConsensus.psl pslCheck affyZebrafishConsensus.psl # psl is good # load track into database ssh hgwdev cd /cluster/data/danRer5/bed/affyZebrafish hgLoadPsl danRer5 affyZebrafishConsensus.psl # Add consensus sequences for Zebrafish chip # Copy sequences to gbdb if they are not there already mkdir -p /gbdb/hgFixed/affyProbes ln -s \ /projects/compbio/data/microarray/affyZebrafish/Zebrafish_consensus.fa \ /gbdb/hgFixed/affyProbes # these sequences were loaded previously so no need to reload. hgLoadSeq -abbr=Zebrafish: danRer5 \ /gbdb/hgFixed/affyProbes/Zebrafish_consensus.fa # Clean up rm batch.bak raw.psl # check number of short alignments: hgsql -e \ 'select count(*) from affyZebrafishConsensus where (qEnd - qStart) <= 50;'\ danRer5 # 6 # There were 7 for danRer4 - this is ok. ### TARGET SEQUENCES SUBTRACK: (2007-09-11 - 2007-09-12) ### Create subtrack of target sequences, this is just the region of the # consensus sequence that is used for designing the probesets. 
# Array chip target sequences downloaded from
# http://www.affymetrix.com/support/technical/byproduct.affx?product=zebrafish
ssh hgwdev
# copy sequences to /projects/compbio/data/microarray/affyZebrafish
cd /projects/compbio/data/microarray/affyZebrafish
unzip Zebrafish_target.zip
grep '>' Zebrafish_target > target.headers
wc -l target.headers
# 15617 target.headers
# includes the control sequences - remove these
grep -v "AFFX" target.headers > targetNoControls.headers
wc -l targetNoControls.headers
# 15502 targetNoControls.headers
grep '>' Zebrafish_target | wc -l
# 15617 sequences
awk 'BEGIN {FS=";"} {print $1;}' targetNoControls.headers \
    | sed -e 's/>//' > targetNoControls.ids
wc -l *.ids
# 15502 targetNoControls.ids
# remove the semi-colon after the name in the target FASTA file and give
# the file a .fa extension (the unzipped file has none)
sed -e 's/;//' Zebrafish_target > Zebrafish_target.fa
faSomeRecords Zebrafish_target.fa targetNoControls.ids \
    Zebrafish_targetNoControls.fa
grep '>' Zebrafish_targetNoControls.fa | wc -l
# 15502
# Remove "Zebrafish:" from the name, leave in "tg:" to differentiate these
# sequences from the consensus ones.
perl -pi.bak -e 's/>target:Zebrafish:/>tg:/' Zebrafish_targetNoControls.fa
# make affy directory on the san if it does not exist already:
mkdir -p /san/sanvol1/scratch/affy
cp -p \
/projects/compbio/data/microarray/affyZebrafish/Zebrafish_targetNoControls.fa \
    /san/sanvol1/scratch/affy/
# Set up cluster job to align Zebrafish target sequences to danRer5
# Directories below were set up above:
# mkdir -p /cluster/data/danRer5/bed/affyZebrafish.2007-09-10
# ln -s /cluster/data/danRer5/bed/affyZebrafish.2007-09-10 \
#    /cluster/data/danRer5/bed/affyZebrafish
mkdir -p /cluster/data/danRer5/bed/affyZebrafish.2007-09-10/target
ssh pk
cd /cluster/data/danRer5/bed/affyZebrafish/target
ls -1 /san/sanvol1/scratch/affy/Zebrafish_targetNoControls.fa \
    > affyTarget.lst
foreach f (/san/sanvol1/scratch/danRer5/splitSoftMask/*.fa)
    ls -1 $f >> genome.lst
end
wc -l genome.lst
# 5036 genome.lst
# for output:
mkdir -p /san/sanvol1/scratch/danRer5/affy/pslTarget
# use the -repeats option to report matches to repeat bases separately
# from other matches in the PSL output.
echo '#LOOP\n/cluster/bin/x86_64/blat -fine -repeats=lower -minIdentity=90 -ooc=/san/sanvol1/scratch/danRer5/danRer5_11.ooc $(path1) $(path2) {check out line+ /san/sanvol1/scratch/danRer5/affy/pslTarget/$(root1)_$(root2).psl}\n#ENDLOOP' > template.sub
/parasol/bin/gensub2 genome.lst affyTarget.lst template.sub para.spec
/parasol/bin/para create para.spec
# 5036 jobs written to batch
/parasol/bin/para try, check, push ... etc.
# Keeps crashing - can't write to directory - permissions look ok.
# Tried again (2007-09-12, hartera) and it worked fine.
/parasol/bin/para time
# Completed: 5036 of 5036 jobs
#CPU time in finished jobs:   17604s   293.40m   4.89h   0.20d   0.001 y
#IO & Wait Time:              14325s   238.75m   3.98h   0.17d   0.000 y
#Average job time:                6s     0.11m   0.00h   0.00d
#Longest running job:             0s     0.00m   0.00h   0.00d
#Longest finished job:          767s    12.78m   0.21h   0.01d
#Submission to last job:        842s    14.03m   0.23h   0.01d
# need to do pslSort
ssh pk
cd /san/sanvol1/scratch/danRer5/affy
# Do sort and then best-in-genome filter.
pslSort dirs rawTarget.psl tmp pslTarget
# only use alignments that have at least 95% identity in the aligned region.
pslCDnaFilter -localNearBest=0.005 -minQSize=20 -minNonRepSize=16 \
    -ignoreNs -bestOverlap -minId=0.95 -minCover=0.15 rawTarget.psl \
    affyZebrafishTarget.psl
#                        seqs    aligns
#              total:   15216    308151
#       drop invalid:       1         1
# drop minNonRepSize:    2051    256676
#      drop minIdent:    1917     20458
#      drop minCover:     741      2951
#         weird over:     143       607
#         kept weird:      92       116
#     drop localBest:    1306     11562
#               kept:   14820     16503
# 95.6% of sequences aligned. There are 15502 Affy target sequences.
# rsync these psl files
rsync -a --progress /san/sanvol1/scratch/danRer5/affy/*Target.psl \
    /cluster/data/danRer5/bed/affyZebrafish/target/
ssh kkstore06
cd /cluster/data/danRer5/bed/affyZebrafish/target
# no need to shorten names in this psl file: the target names were already
# shortened to the "tg:" prefix before alignment.
pslCheck affyZebrafishTarget.psl
# psl is good
# load track into database
ssh hgwdev
cd /cluster/data/danRer5/bed/affyZebrafish/target
hgLoadPsl danRer5 affyZebrafishTarget.psl
# Add target sequences for the Zebrafish chip.
# Copy sequences to gbdb if they are not there already
mkdir -p /gbdb/hgFixed/affyProbes
ln -s \
/projects/compbio/data/microarray/affyZebrafish/Zebrafish_targetNoControls.fa \
    /gbdb/hgFixed/affyProbes
hgLoadSeq danRer5 \
    /gbdb/hgFixed/affyProbes/Zebrafish_targetNoControls.fa
# Clean up
rm batch.bak rawTarget.psl
# check number of short alignments:
hgsql -e \
'select count(*) from affyZebrafishTarget where (qEnd - qStart) <= 50;' \
    danRer5
# 33
# This is ok, these are shorter regions anyway.
# Added a trackDb.ra entry to zebrafish/danRer5/trackDb.ra. This is
# a composite track consisting of 2 subtracks for the consensus
# sequence Blat alignments and the target sequence Blat alignments.
# The composite track is called affyZebrafish. Added searches for both
# tracks.
###########################################################################
# MAKE DOWNLOADABLE / GOLDENPATH FILES (DONE, 2007-09-11, hartera)
# ADDED A CHROMOSOMES DOWNLOAD DIRECTORY (DONE, 2007-09-14, hartera)
ssh hgwdev
cd /cluster/data/danRer5
ln -s /cluster/data/danRer5/bed/RepeatMasker.2007-09-04/danRer5.fa.out \
    /cluster/data/danRer5
/cluster/bin/scripts/makeDownloads.pl danRer5 -verbose=2 \
    >& jkStuff/downloads.log &
tail -f jkStuff/downloads.log
# Edit README.txt files when done:
# *** All done!
# *** Please take a look at the downloads for danRer5 using a web browser.
# *** Edit each README.txt to resolve any notes marked with "***":
#     /cluster/data/danRer5/goldenPath/database/README.txt
#     /cluster/data/danRer5/goldenPath/bigZips/README.txt
# (The htdocs/goldenPath/danRer5/*/README.txt "files" are just links to
# those.)
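# (Optional verification sketch, not part of the original build: from the
# downloads directory, confirm the checksum file that makeDownloads.pl
# wrote matches the files on disk.)
cd /usr/local/apache/htdocs/goldenPath/danRer5/bigZips
md5sum -c md5sum.txt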
# ADD CHROMOSOMES DOWNLOADS DIRECTORY # Add chromosome and scaffolds downloads, these are already in # /cluster/data/danRer5/splitSoftMask (2007-09-14, hartera) # compress files: ssh kkstore06 mkdir /cluster/data/danRer5/goldenPath/chromosomes cd /cluster/data/danRer5/splitSoftMask foreach c (chr*.fa) gzip $c end # now scaffolds, compress together into one file foreach s (Zv7_*.fa) gzip -c $s >> unmappedScaffolds.fa.gz end mv *.gz /cluster/data/danRer5/goldenPath/chromosomes/ ssh hgwdev # make symbolic links to the goldenPath downloads directory for danRer5 mkdir /usr/local/apache/htdocs/goldenPath/danRer5/chromosomes ln -s /cluster/data/danRer5/goldenPath/chromosomes/*.gz \ /usr/local/apache/htdocs/goldenPath/danRer5/chromosomes/ cd /usr/local/apache/htdocs/goldenPath/danRer5/chromosomes md5sum *.gz > md5sum.txt cp /usr/local/apache/htdocs/goldenPath/danRer4/chromosomes/README.txt \ /cluster/data/danRer5/goldenPath/chromosomes/ # edit README.txt to make it specific for the danRer5 (Sanger Zv7) # assembly and then link it to the downloads directory. ln -s /cluster/data/danRer5/goldenPath/chromosomes/README.txt \ /usr/local/apache/htdocs/goldenPath/danRer5/chromosomes/ ######################################################################## # BLASTZ Swap Mouse mm9 (DONE - 2007-09-12 - Hiram) ssh kkstore06 # the original cd /cluster/data/mm9/bed/blastzDanRer5.2007-09-11 cat fb.mm9.chainDanRer5Link.txt # 48497464 bases of 2620346127 (1.851%) in intersection # the swap mkdir /cluster/data/danRer5/bed/blastz.mm9.swap cd /cluster/data/danRer5/bed/blastz.mm9.swap time nice -n +19 $HOME/kent/src/hg/utils/automation/doBlastzChainNet.pl \ -chainMinScore=5000 \ /cluster/data/mm9/bed/blastzDanRer5.2007-09-11/DEF \ -swap -chainLinearGap=loose -bigClusterHub=pk -verbose=2 \ > swap.log 2>&1 & # real 9m47.163s cat fb.danRer5.chainMm9Link.txt # 34017483 bases of 1435609608 (2.370%) in intersection ######################################################################## # ADD CONTIGS TRACK (DONE, 2007-09-14, hartera) # make ctgPos2 (contig name, size, chrom, chromStart, chromEnd) from # chunks (contigs) agp files. ssh kkstore06 mkdir -p /cluster/data/danRer5/bed/ctgPos2 # get chrM information from danRer4: ssh hgwdev cd /cluster/data/danRer5/bed/ctgPos2 hgsql -N -e 'select * from ctgPos2 where chrom = "chrM";' danRer4 \ > chrM.ctgPos2.txt ssh kkstore06 cd /cluster/data/danRer5/bed/ctgPos2 # update chrM accession to NC_002333.2 perl -pi.bak -e 's/AC024175\.3/NC_002333\.2/' chrM.ctgPos2.txt # ctgPos2 .sql .as .c and .h files exist - see makeDanRer1.doc # get the contigs for the chroms, chr1-25: awk 'BEGIN {OFS="\t"} \ {if ($5 != "N") print $6, $3-$2+1, $1, $2-1, $3, $5}' \ /cluster/data/danRer5/Zv7release/Zv7_chr.agp >> ctgPos2.tab # Add chrM to ctgPos2 file: cat chrM.ctgPos2.txt >> ctgPos2.tab # then do the same for the unmapped scaffolds: awk 'BEGIN {OFS="\t"} \ {if ($5 != "N") print $6, $3-$2+1, $1, $2-1, $3, $5}' \ /cluster/data/danRer5/Zv7release/randoms/randomsScaffold.agp \ >> ctgPos2.tab # load the ctgPos2 table ssh hgwdev cd /cluster/data/danRer5/bed/ctgPos2 # use hgLoadSqlTab as it gives more error messages than using # "load data local infile ...". 
hgLoadSqlTab danRer5 ctgPos2 \ ~/kent/src/hg/lib/ctgPos2.sql ctgPos2.tab # check that all chroms and scaffolds are there hgsql -N -e 'select distinct(chrom) from ctgPos2;' danRer5 \ | sort > chromsAndScafsCtgPos2.txt hgsql -N -e 'select distinct(chrom) from chromInfo;' danRer5 \ | sort > chromsAndScafsChromInfo.txt wc -l chroms* # 5036 chromsAndScafsChromInfo.txt # 5036 chromsAndScafsCtgPos2.txt comm -12 chromsAndScafsCtgPos2.txt chromsAndScafsChromInfo.txt | wc -l # 5036 # So all chroms and unmapped scaffolds are represented in the table. # cleanup ssh kkstore06 cd /cluster/data/danRer5/bed/ctgPos2 rm *.bak chromsAndScafs* # create trackDb.ra entry and html page for ctgPos2 track. # add search for the track and make sure the termRegex will handle # contigs named "Zv7_scaffoldN.N" where N is an integer and all the # accessions in the *.agp files. ########################################################################### # CREATE MICROARRAY DATA TRACK BY ADDING ZON LAB WILD TYPE MICROARRAY DATA TO # AFFY ZEBRAFISH ALIGNMENTS (DONE, 2007-09-16, hartera) # Array data is for whole embryos of five wild type zebrafish strains. # Data is in hgFixed (see hgFixed.doc) - from Len Zon's lab at Children's # Hospital Boston. Contact: adibiase@enders.tch.harvard.edu # see danRer4.txt and hgFixed.txt for the history of this track and the # expression data. ssh hgwdev mkdir -p /cluster/data/danRer5/bed/ZonLab/wtArray cd /cluster/data/danRer5/bed/ZonLab/wtArray # Map the data to the consensus sequence alignments first. # use AllRatio table for mapping. There are not many arrays in this # dataset so using AllRatio will allow the selection of All Arrays # from the track controls on the track description page. Also set up the # Zebrafish microarrayGroups.ra so that the Medians of replicates or # Means of replicates can also be selected for display. # Create mapped data in zebrafishZonWT.bed. hgMapMicroarray zebrafishZonWTAffyConsensus.bed \ hgFixed.zebrafishZonWTAllRatio \ /cluster/data/danRer5/bed/affyZebrafish/affyZebrafishConsensus.psl # Loaded 15617 rows of expression data from hgFixed.zebrafishZonWTAllRatio # Mapped 14971, multiply-mapped 2002, missed 0, unmapped 646 hgLoadBed danRer5 affyZonWildType zebrafishZonWTAffyConsensus.bed # Loaded 16973 elements of size 15 # then for the Affy Zebrafish chip target sequences, map these to the # Wild Type expression data hgMapMicroarray -pslPrefix="tg:" zebrafishZonWTAffyTarget.bed \ hgFixed.zebrafishZonWTAllRatio \ /cluster/data/danRer5/bed/affyZebrafish/target/affyZebrafishTarget.psl # Loaded 15617 rows of expression data from hgFixed.zebrafishZonWTAllRatio # Mapped 14820, multiply-mapped 1683, missed 0, unmapped 797 hgLoadBed danRer5 affyTargetZonWildType zebrafishZonWTAffyTarget.bed # Loaded 16503 elements of size 15 # add trackDb.ra entry at trackDb/zebrafish/danRer5 level to create # a composite affyZonWildType with alignments and array data for both # Affy Zebrafish chip consensus sequences and target sequences - see # danRer4.txt for more details of history of trackDb.ra and # microarrayGroups.ra entries for this data. ########################################################################### # BLASTZ FOR TETRAODON (tetNig1) (DONE, 2007-09-16, hartera) # CREATE CHAIN AND NET TRACKS, AXTNET, MAFNET AND ALIGNMENT DOWNLOADS # No lineage-specific repeats for this species pair. # Tetraodon also has no species-specific repeats in the RepeatMasker # library so run this using dynamic masking. 
# The tetraodon 2bit file of scaffolds was used
# (tetNig1.randomContigs.sdTrf.2bit) - this contains sequences for
# scaffolds from the randoms. These were used for Blastz rather than the
# random chroms so that alignments do not occur across non-contiguous
# scaffolds.
## use screen to run
ssh kkstore06
mkdir /cluster/data/danRer5/bed/blastz.tetNig1.2007-09-16
cd /cluster/data/danRer5/bed
ln -s blastz.tetNig1.2007-09-16 blastz.tetNig1
cd /cluster/data/danRer5/bed/blastz.tetNig1.2007-09-16
# Use the Blastz parameters for tetNig1 in danRer4.txt. Using scaffolds makes
# this run slower so it is best to have the scaffolds in the query. Use the
# HoxD55.q matrix as tetraodon is quite distant from zebrafish. Blastz uses
# lineage-specific repeats but there are none for these two species.
# Use soft-masked scaffolds for tetNig1 (and dynamic masking) and also use
# WindowMasker plus SDust repeats since there are no species-specific
# RepeatMasker repeats for tetraodon.
cat << '_EOF_' > DEF
# zebrafish (danRer5) vs. tetraodon (tetNig1)
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/cluster/home/angie/schwartzbin:/parasol/bin

BLASTZ=blastz.v7.x86_64
BLASTZ_H=2500
BLASTZ_M=50
BLASTZ_Q=/cluster/data/blastz/HoxD55.q

# TARGET - zebrafish (danRer5)
SEQ1_DIR=/san/sanvol1/scratch/danRer5/danRer5.2bit
SEQ1_LEN=/cluster/data/danRer5/chrom.sizes
SEQ1_LIMIT=30
SEQ1_CHUNK=10000000
SEQ1_LAP=10000

# QUERY - Tetraodon (tetNig1)
# soft-masked chroms and random scaffolds in 2bit format
SEQ2_DIR=/san/sanvol1/scratch/tetNig1/tetNig1.sdTrf.2bit
SEQ2_LEN=/san/sanvol1/scratch/tetNig1/chrom.sizes
SEQ2_CTGDIR=/san/sanvol1/scratch/tetNig1/tetNig1.randomContigs.sdTrf.2bit
SEQ2_CTGLEN=/san/sanvol1/scratch/tetNig1/tetNig1.randomContigs.sdTrf.sizes
SEQ2_LIFT=/san/sanvol1/scratch/tetNig1/tetNig1.randomContigs.lift
SEQ2_CHUNK=1000000000
SEQ2_LAP=0

BASE=/cluster/data/danRer5/bed/blastz.tetNig1.2007-09-16
TMPDIR=/scratch/tmp
'_EOF_'
# << this line keeps emacs coloring happy
chmod +x DEF
mkdir /san/sanvol1/scratch/danRer5TetNig1
time nice /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
    -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
    -qRepeats=windowmaskerSdust \
    -blastzOutRoot /san/sanvol1/scratch/danRer5TetNig1 \
    `pwd`/DEF >& doBlastz.log &
# Took about 5 hours 7 minutes to run.
featureBits danRer5 refGene:cds chainTetNig1Link -enrichment
# refGene:cds 1.019%, chainTetNig1Link 6.359%, both 0.866%, cover 84.94%,
# enrich 13.36x
featureBits danRer4 refGene:cds chainTetNig1Link -enrichment
# refGene:cds 0.953%, chainTetNig1Link 7.345%, both 0.836%, cover 87.72%,
# enrich 11.94x
featureBits danRer5 refGene:cds netTetNig1 -enrichment
# refGene:cds 1.019%, netTetNig1 63.685%, both 0.961%, cover 94.30%, enrich
# 1.48x
featureBits danRer4 refGene:cds netTetNig1 -enrichment
# refGene:cds 0.953%, netTetNig1 63.389%, both 0.909%, cover 95.41%, enrich
# 1.51x
## DO BLASTZ SWAP TO CREATE THE DANRER5 CHAINS, NETS, AXTNET, MAFNET AND
# ALIGNMENT DOWNLOAD FOR DANRER5 ON TETNIG1 - documented in tetNig1.txt
###########################################################################
# BLASTZ FOR FUGU (fr2) (DONE, 2007-09-18 - 2007-09-19, hartera)
# CREATE CHAIN AND NET TRACKS, AXTNET, MAFNET AND ALIGNMENT DOWNLOADS
# No lineage-specific repeats for this species pair.
# Fugu also has no species-specific repeats in the RepeatMasker
# library so run this using dynamic masking.
# Use the Fugu 2bit file of scaffolds (fr2.scaffolds.2bit) - this
# contains sequences for all the scaffolds of chrUn. The whole assembly
# is unmapped, unordered scaffolds. The individual scaffolds were
# used for Blastz rather than the random chroms so that alignments do not
# occur across non-contiguous scaffolds.
## use screen to run
ssh kkstore06
mkdir /cluster/data/danRer5/bed/blastz.fr2.2007-09-18
cd /cluster/data/danRer5/bed
ln -s blastz.fr2.2007-09-18 blastz.fr2
cd /cluster/data/danRer5/bed/blastz.fr2.2007-09-18
# Use the Blastz parameters for fr1 in danRer4.txt. Using scaffolds makes
# this run slower so it is best to have the scaffolds in the query. Use the
# HoxD55.q matrix as Fugu is quite distant from zebrafish. Blastz uses
# lineage-specific repeats but there are none for these two species.
# Use soft-masked scaffolds for fr2 (and dynamic masking) and also use
# WindowMasker plus SDust repeats since there are no species-specific
# RepeatMasker repeats for Fugu. Fugu is about 400.5 Mb of scaffolds plus
# chrM which is about 16.4 kb.
cat << '_EOF_' > DEF
# zebrafish (danRer5) vs. Fugu (fr2)
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/cluster/home/angie/schwartzbin:/parasol/bin

BLASTZ=blastz.v7.x86_64
BLASTZ_H=2500
BLASTZ_M=50
BLASTZ_Q=/cluster/data/blastz/HoxD55.q

# TARGET - zebrafish (danRer5)
SEQ1_DIR=/san/sanvol1/scratch/danRer5/danRer5.2bit
SEQ1_LEN=/cluster/data/danRer5/chrom.sizes
SEQ1_LIMIT=30
SEQ1_CHUNK=10000000
SEQ1_LAP=10000

# QUERY - Fugu (fr2)
# soft-masked chroms and random scaffolds in 2bit format
SEQ2_DIR=/san/sanvol1/scratch/fr2/fr2.2bit
SEQ2_LEN=/san/sanvol1/scratch/fr2/chrom.sizes
SEQ2_CTGDIR=/san/sanvol1/scratch/fr2/fr2.scaffolds.2bit
SEQ2_CTGLEN=/san/sanvol1/scratch/fr2/fr2.scaffolds.sizes
SEQ2_LIFT=/san/sanvol1/scratch/fr2/liftAll.lft
SEQ2_CHUNK=20000000
SEQ2_LIMIT=30
SEQ2_LAP=0

BASE=/cluster/data/danRer5/bed/blastz.fr2.2007-09-18
TMPDIR=/scratch/tmp
'_EOF_'
# << this line keeps emacs coloring happy
chmod +x DEF
mkdir /san/sanvol1/scratch/danRer5Fr2
time nice /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
    -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
    -qRepeats=windowmaskerSdust \
    -blastzOutRoot /san/sanvol1/scratch/danRer5Fr2 \
    `pwd`/DEF >& doBlastz.log &
# Took about 6 hours and 50 minutes to run.
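# (Note on reading the featureBits -enrichment output below: "cover" is the
# fraction of refGene:cds bases hit by the track, and "enrich" is that
# coverage divided by the fraction of the genome the track covers.)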
cat fb.danRer5.chainFr2Link.txt
# 104476108 bases of 1435609608 (7.277%) in intersection
# Compare coverage to refGene CDS:
featureBits danRer5 refGene:cds chainFr2Link -enrichment
# refGene:cds 1.019%, chainFr2Link 7.277%, both 0.886%, cover 86.93%,
# enrich 11.95x
featureBits danRer4 refGene:cds chainFr2Link -enrichment
# refGene:cds 0.955%, chainFr2Link 8.543%, both 0.810%, cover 84.73%,
# enrich 9.92x
featureBits danRer5 refGene:cds netFr2 -enrichment
# refGene:cds 1.019%, netFr2 64.210%, both 0.968%, cover 94.95%, enrich 1.48x
featureBits danRer4 refGene:cds netFr2 -enrichment
# refGene:cds 0.955%, netFr2 62.980%, both 0.885%, cover 92.68%, enrich 1.47x
## DO BLASTZ SWAP TO CREATE THE DANRER5 CHAINS, NETS, AXTNET, MAFNET AND
# ALIGNMENT DOWNLOAD FOR DANRER5 ON FR2 - documented in fr2.txt
###########################################################################
# CREATE LIFTOVER CHAINS FROM danRer5 TO danRer4
# (DONE 2007-09-19 AND 2007-09-21, hartera)
ssh kkstore06
mkdir /cluster/data/danRer5/bed/blat.danRer4
cd /cluster/data/danRer5/bed/blat.danRer4
time nice doSameSpeciesLiftOver.pl danRer5 danRer4 \
    -bigClusterHub pk \
    -ooc /san/sanvol1/scratch/danRer5/danRer5_11.ooc \
    -buildDir /cluster/data/danRer5/bed/blat.danRer4 >& do.log &
# 0.401u 0.253s 8:09:03.62 0.0% 0+0k 0+0io 11pf+0w
# Remove the symbolic link to the liftOver chains and copy over the file
rm ../liftOver/danRer5ToDanRer4.over.chain.gz
cp -p danRer5ToDanRer4.over.chain.gz ../liftOver
# a link in /usr/local/apache/htdocs/goldenPath/danRer5/liftOver has
# already been made to this file and md5sum.txt needs to be updated
ssh hgwdev
cd /usr/local/apache/htdocs/goldenPath/danRer5/liftOver
md5sum *.gz > md5sum.txt
###########################################################################
# VEGA GENES UPDATE TO v28 (DONE, 2007-09-28 - 2007-10-01, hartera)
# Data provided by Tina Eyre from Sanger: te3@sanger.ac.uk
# UPDATE SENT ON 2007-09-27, Vega v28. RE-LOAD TABLES TO UPDATE THE
# VEGA GENES TRACKS.
ssh kkstore06
mkdir -p /cluster/data/danRer5/bed/vega28/data/
cd /cluster/data/danRer5/bed/vega28/data
# 3 files: vegav28.gtf.zip vegav28_information.txt.zip
#          vegav28_transcript_to_clone.txt.zip
unzip vegav28.gtf.zip
unzip vegav28_information.txt.zip
unzip vegav28_transcript_to_clone.txt.zip
# asked for the vegav28_information.txt file to be in the format
# required to load the table directly.
# Check the format then prepare data files to load database tables.
cd /cluster/data/danRer5/bed/vega28
# this time all the lines describe genes and there are no extra header
# lines or output from the program that generated the file.
# NOTE: there are no NA scaffolds.
# Remove the "chr" prefix from scaffolds
sed -e 's/chrZv7_/Zv7_/' ./data/vegav28.gtf > vegav28Fixed.gtf
# vegaInfo is transcriptId, otterId, geneId, method and geneDesc
# Get otter transcript ID and otter gene ID:
awk 'BEGIN{FS=";"} {OFS="\t"} \
    {if (($5 ~ /otter_gene_id/) && ($6 ~ /otter_transcript_id/)) \
    print $5, $6;}' vegav28Fixed.gtf | sort | uniq > vega28OtterIds.txt
# remove the otter_gene_id and otter_transcript_id labels and the
# extra spaces and quotation marks.
perl -pi.bak -e 's/\sotter_gene_id\s\"//;' vega28OtterIds.txt
perl -pi.bak -e 's/"\t\sotter_transcript_id\s"(OTTDART[0-9]+)"/\t$1/' \
    vega28OtterIds.txt
wc -l vega28OtterIds.txt
# 14001 vega28OtterIds.txt
# there were 12819 in Vega v27
# get list of the otter gene IDs only
awk '{print $1}' vega28OtterIds.txt | sort | uniq > otterGeneIds.only
# then get list of otter gene IDs from the information file
awk '{if ($1 ~ /OTTDARG/) print $1}' data/vegav28_information.txt \
    | sort | uniq > infoOtterGeneIds.only
comm -12 otterGeneIds.only infoOtterGeneIds.only | wc -l
# 8489
wc -l *.only
# 10374 infoOtterGeneIds.only
# 9318 otterGeneIds.only
comm -23 otterGeneIds.only infoOtterGeneIds.only > geneIdsGtfNotInfo.txt
# also check that the transcripts in the transcript-to-clone file are
# all there.
awk 'BEGIN {FS="\t"} {print $2;}' vega28OtterIds.txt \
    | sort | uniq > otterTxIds.only
wc -l otterTxIds.only
# 14001 otterTxIds.only
awk 'BEGIN {FS="\t"} {print $1;}' ./data/vegav28_transcript_to_clone.txt \
    | sort | uniq > vegaCloneTxIds.sort
wc -l vegaCloneTxIds.sort
# 14001 vegaCloneTxIds.sort
comm -12 otterTxIds.only vegaCloneTxIds.sort | wc -l
# 14001
# All otter transcript IDs in the vegav28 GTF gene file are in the
# vegav28_transcript_to_clone.txt file as expected.
wc -l geneIdsGtfNotInfo.txt
# 829 geneIdsGtfNotInfo.txt
# E-mailed Tina Eyre at Sanger - on 2007-09-28 - to ask about
# these otter gene IDs that are in the GTF file but not in the
# vegav28_information.txt file.
# Received the correct file: 2007-10-01
# Check again
ssh kkstore06
cd /cluster/data/danRer5/bed/vega28/data
unzip vegav28_information.txt.zip
cd ..
# then get list of otter gene IDs from the information file
awk '{if ($1 ~ /OTTDARG/) print $1}' ./data/vegav28_information.txt \
    | sort | uniq > infoOtterGeneIds.only
comm -12 otterGeneIds.only infoOtterGeneIds.only | wc -l
# 9318
wc -l *.only
# 11115 infoOtterGeneIds.only
# 9318 otterGeneIds.only
# All the gene IDs from the GTF file are contained in the information file,
# but the information file contains 1797 gene IDs that are not in the GTF
# file. These extra genes are found on clones that do not map to Ensembl.
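# (Reminder: for sorted inputs, comm -12 prints lines common to both files,
# comm -23 lines only in the first, and comm -13 lines only in the second,
# as used above and below.)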
comm -13 otterGeneIds.only infoOtterGeneIds.only > geneIds.InfoOnly.txt
cat << 'EOF' > mergeTransIdAndInfo.pl
#!/usr/bin/perl -w
use strict;

my ($idsFile, $infoFile, %idsHash);
$idsFile = $ARGV[0];
$infoFile = $ARGV[1];
open (IDS, $idsFile) || die "Can not open $idsFile: $!\n";
open (INFO, $infoFile) || die "Can not open $infoFile: $!\n";
# Read in the otter gene IDs and transcript IDs
while (<IDS>) {
   my (@f, $line);
   chomp;
   $line = $_;
   # split line on tab
   @f = split(/\t/, $line);
   # hash keyed by gene ID, stores transcript ID
   # more than one ID so store as an array
   if ($f[0] =~ /OTTDARG/) {
      # add to array in hash if it is an otter gene ID
      push @{$idsHash{$f[0]}}, $f[1];
   }
}
close IDS;
while (<INFO>) {
   my ($line, @info, $geneId, @transIds);
   # read in information file and get the otter gene ID
   $line = $_;
   # split line on tab
   @info = split(/\t/, $line);
   if ($info[0] =~ /OTTDARG/) {
      $geneId = $info[0];
      # look up transcript ID in hash
      # if gene ID exists in the hash then print out the information
      # file line with the correct transcript ID
      if (exists($idsHash{$geneId})) {
         @transIds = @{$idsHash{$geneId}};
         # rewrite line with transcript ID
         foreach my $t (@transIds) {
            print "$t\t$line";
         }
      }
   }
}
close INFO;
'EOF'
# << happy emacs
chmod +x mergeTransIdAndInfo.pl
mergeTransIdAndInfo.pl vega28OtterIds.txt ./data/vegav28_information.txt \
    > vegav28InfoWithTransIds.txt
wc -l vega28OtterIds.txt
# 14001 vega28OtterIds.txt
wc -l vegav28InfoWithTransIds.txt
# 14001 vegav28InfoWithTransIds.txt
# so just sort and uniq the resulting information file with transcript IDs
sort vegav28InfoWithTransIds.txt | uniq > vegav28InfoWithTransIdsUniq.txt
wc -l vegav28InfoWithTransIdsUniq.txt
# 14001 vegav28InfoWithTransIdsUniq.txt
# So no duplicated lines.
# There are extra IDs in the information file as it includes all Vega
# genes (including clones not in Ensembl). Those actually in the Vega
# v28 GTF are those that are found on clones that are in Ensembl.
# Vega includes only clone sequence and Zv7_NA scaffolds are WGS so
# these are not annotated in Vega. This information is from
# an e-mail from Tina Eyre (te3@sanger.ac.uk) on 2007-09-07.
# then use the info file to grab those genes that are pseudogenes, get the
# transcript ID from the vegaIDs.txt file. Then grep out the pseudogenes
# to a separate file.
# Next step is to find which transcripts are pseudogenes.
grep pseudogene vegav28InfoWithTransIdsUniq.txt | sort | uniq | wc -l
# found 68 pseudogenes so need to create the pseudogene track
# Get transcript IDs for pseudogenes.
grep pseudogene vegav28InfoWithTransIdsUniq.txt | awk '{print $1}' \
    > pseudogenes.ids
grep -w -f pseudogenes.ids vegav28Fixed.gtf > vegav28Pseudogene.gtf
awk '{print $20}' vegav28Pseudogene.gtf | sort | uniq | wc -l
# 68
# Make the GTF file for the vegaGene table by subtracting pseudogenes:
grep -vw -f pseudogenes.ids vegav28Fixed.gtf > vegaGenev28.gtf
wc -l vega*gtf
# 178987 vegaGenev28.gtf
# 179249 vegav28Fixed.gtf
#    262 vegav28Pseudogene.gtf
# Need to relabel IDs to get the name to be the otter transcript ID
# and name2 to be the transcript_id (needs to be labeled as gene_id).
# Also, relabel the otter_transcript_id to be transcript_id as ldHgGene
# groups the rows by this ID.
sed -e 's/gene_id/tmp_id/' vegaGenev28.gtf \
    | sed -e 's/transcript_id/gene_id/' \
    | sed -e 's/otter_transcript_id/transcript_id/' > vegaGenev28Format.gtf
# Do the same for the pseudogene GTF file:
sed -e 's/gene_id/tmp_id/' vegav28Pseudogene.gtf \
    | sed -e 's/transcript_id/gene_id/' \
    | sed -e 's/otter_transcript_id/transcript_id/' \
    > vegaPseudogenev28Format.gtf
gtfToGenePred -genePredExt -infoOut=tmpInfo.txt \
    vegaGenev28Format.gtf vegaGenev28.gp
genePredCheck vegaGenev28.gp
# checked: 13933 failed: 0
gtfToGenePred -genePredExt -infoOut=tmpInfo.txt \
    vegaPseudogenev28Format.gtf vegaPseudogenev28.gp
genePredCheck vegaPseudogenev28.gp
# checked: 68 failed: 0
# load GTF files for Vega genes and pseudogenes:
ssh hgwdev
cd /cluster/data/danRer5/bed/vega28
hgsql -e 'drop table vegaGene;' danRer5
# load vegaGene table
hgLoadGenePred -genePredExt danRer5 vegaGene vegaGenev28.gp
hgsql -e 'drop table vegaPseudoGene;' danRer5
# load vegaPseudoGene table
hgLoadGenePred -genePredExt danRer5 vegaPseudoGene vegaPseudogenev28.gp
hgsql -N -e 'select distinct(chrom) from vegaGene;' danRer5
# For Vega v28 there are 29: chr1-25 plus 4 scaffolds:
# Zv7_scaffold2498
# Zv7_scaffold2509
# Zv7_scaffold2516
# Zv7_scaffold2572
# For Vega v27, more scaffolds were annotated, so e-mailed Tina Eyre
# to ask why this is the case.
# For v27 there were 36: chr1-25 and these 11 annotated scaffolds:
# Zv7_scaffold2491
# Zv7_scaffold2498
# Zv7_scaffold2509
# Zv7_scaffold2516
# Zv7_scaffold2523
# Zv7_scaffold2533
# Zv7_scaffold2537
# Zv7_scaffold2551
# Zv7_scaffold2560
# Zv7_scaffold2633
# Zv7_scaffold2650
hgsql -N -e 'select distinct(chrom) from vegaPseudoGene;' danRer5
# 13 chroms and 1 scaffold have pseudogenes annotated:
# chr2, chr3, chr4, chr5, chr9, chr12, chr13, chr18, chr19, chr20, chr22,
# chr24, Zv7_scaffold2509
# Only finished sequence is annotated by Vega.
# Vega information tables:
# mySQL table definition and autoSql-generated files were created previously
# for the zebrafish-specific information table (vegaInfoZfish).
# Add clone_id to a separate table instead of this one. A tab-separated
# file of transcript ID and clone ID was provided by Tina Eyre
# (te3@sanger.ac.uk) at Sanger.
# Need to grep for the transcript IDs.
# Created a second table for the cloneId accessions since there
# are multiple IDs for some Vega genes. Otherwise, there would be
# a comma-separated list in this field or many rows repeated but just
# different in the cloneId field. Associate transcript ID to clone IDs.
# Table definitions are in danRer4.txt.
# Load the vega to clone ID information:
ssh kkstore06
cd /cluster/data/danRer5/bed/vega28
sort ./data/vegav28_transcript_to_clone.txt | uniq \
    > vegav28TransToClone.txt
# load the vegaInfo and vegaToCloneId tables:
ssh hgwdev
cd /cluster/data/danRer5/bed/vega28
hgsql -e 'drop table vegaInfoZfish;' danRer5
hgLoadSqlTab danRer5 vegaInfoZfish ~/kent/src/hg/lib/vegaInfoZfish.sql \
    vegav28InfoWithTransIdsUniq.txt
hgsql -e 'drop table vegaToCloneId;' danRer5
hgLoadSqlTab danRer5 vegaToCloneId ~/kent/src/hg/lib/vegaToCloneId.sql \
    vegav28TransToClone.txt
# A peptide file was requested this time (vegav28_protein.fa) so load
# it into vegaPep:
ssh kkstore06
cd /cluster/data/danRer5/bed/vega28
# There are more peptides in the FASTA file than there are transcript IDs
# in the GTF file so get just the peptide sequences for transcripts in the
# GTF. otterTxIds.only is the list of transcript IDs (14,001 IDs).
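# (faSomeRecords usage, as also seen in the hg18 proteins section below:
# faSomeRecords in.fa listFile out.fa extracts just the records named in
# listFile, which is what the next command does.)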
faSomeRecords ./data/vegav28_protein.fa otterTxIds.only vega28Pep.fa grep '>' vega28Pep.fa | wc -l # 10110 grep '>' vega28Pep.fa | sed -e 's/>//' | sort | uniq > vega28PepTxIds.txt comm -12 vega28PepTxIds.txt otterTxIds.only | wc -l # 10110 comm -13 vega28PepTxIds.txt otterTxIds.only > otterTxNotInPeptides.txt wc -l otterTxNotInPeptides.txt # 3891 otterTxNotInPeptides.txt # Checked a few and these appear to be novel processed transcripts or novel # protein-coding so probably the novel ones do not have an associated # protein - check with Tina Eyre. ssh hgwdev cd /cluster/data/danRer5/bed/vega28 hgPepPred danRer5 generic vegaPep vega28Pep.fa # hgsql -e 'select count(*) from vegaPep;' danRer5 # 10110 # Check which type of transcripts do not have a peptide translation. hgsql -N -e 'select distinct(name) from vegaPep;' danRer5 | \ sort > vegaPep.names.sort hgsql -N -e 'select distinct(transcriptId) from vegaInfoZfish;' danRer5 \ | sort > vegaInfo.txId.sort comm -12 vegaPep.names.sort vegaInfo.txId.sort | wc -l # 10110 wc -l *.sort # 14001 vegaInfo.txId.sort # 10110 vegaPep.names.sort comm -13 vegaPep.names.sort vegaInfo.txId.sort > vegaTx.inInfoNotPep.txt wc -l vegaTx.inInfoNotPep.txt # 3891 vegaTx.inInfoNotPep.txt foreach t (`cat vegaTx.inInfoNotPep.txt`) hgsql -N -e "select transcriptId, method, confidence from vegaInfoZfish where transcriptId = '${t}';" danRer5 >> vegaTx.inInfoNotPep.type end awk 'BEGIN {OFS="\t"} {print $2, $3}' vegaTx.inInfoNotPep.type \ | sort | uniq -c | sort -nr > vegaTx.inInfoNotPep.counts # Add trackDb.ra entry for trackDb/zebrafish/danRer5/trackDb.ra if it is not # there already. Keep Vega Genes and Vega Pseudogenes separate as for # human and add version number to description. ########################################################################### # HUMAN (hg18) PROTEINS TRACK (DONE braney 2007-10-18) ssh kkstore06 # bash if not using bash shell already mkdir /cluster/data/danRer5/blastDb cd /cluster/data/danRer5 awk '{if ($2 > 1000000) print $1}' chrom.sizes > great1M.lst twoBitToFa danRer5.2bit stdout | faSomeRecords stdin great1M.lst temp.fa faSplit gap temp.fa 1000000 blastDb/x -lift=blastDb.lft rm temp.fa twoBitToFa danRer5.2bit stdout | faSomeRecords -exclude stdin great1M.lst temp.fa faSplit sequence temp.fa 156 blastDb/y cd blastDb for i in *.fa do /cluster/bluearc/blast229/formatdb -i $i -p F done rm *.fa mkdir -p /cluster/bluearc//danRer5/blastDb cd /cluster/data/danRer5/blastDb for i in nhr nin nsq; do echo $i cp *.$i /cluster/bluearc//danRer5/blastDb done mkdir -p /cluster/data/danRer5/bed/tblastn.hg18KG cd /cluster/data/danRer5/bed/tblastn.hg18KG echo /cluster/bluearc//danRer5/blastDb/*.nsq | xargs ls -S | sed "s/\.nsq//" > query.lst wc -l query.lst # 1456 query.lst # we want around 200000 jobs calc `wc /cluster/data/hg18/bed/blat.hg18KG/hg18KG.psl | awk "{print \\\$1}"`/\(200000/`wc query.lst | awk "{print \\\$1}"`\) # 36727/(200000/1456) = 267.372560 mkdir -p /cluster/bluearc/danRer5/bed/tblastn.hg18KG/kgfa split -l 267 /cluster/data/hg18/bed/blat.hg18KG/hg18KG.psl /cluster/bluearc/danRer5/bed/tblastn.hg18KG/kgfa/kg ln -s /cluster/bluearc/danRer5/bed/tblastn.hg18KG/kgfa kgfa cd kgfa for i in *; do nice pslxToFa $i $i.fa; rm $i; done cd .. 
ls -1S kgfa/*.fa > kg.lst mkdir -p /cluster/bluearc/danRer5/bed/tblastn.hg18KG/blastOut ln -s /cluster/bluearc/danRer5/bed/tblastn.hg18KG/blastOut for i in `cat kg.lst`; do mkdir blastOut/`basename $i .fa`; done tcsh cd /cluster/data/danRer5/bed/tblastn.hg18KG cat << '_EOF_' > blastGsub #LOOP blastSome $(path1) {check in line $(path2)} {check out exists blastOut/$(root2)/q.$(root1).psl } #ENDLOOP '_EOF_' cat << '_EOF_' > blastSome #!/bin/sh BLASTMAT=/cluster/bluearc/blast229/data export BLASTMAT g=`basename $2` f=/tmp/`basename $3`.$g for eVal in 0.01 0.001 0.0001 0.00001 0.000001 1E-09 1E-11 do if /cluster/bluearc/blast229/blastall -M BLOSUM80 -m 0 -F no -e $eVal -p tblastn -d $1 -i $2 -o $f.8 then mv $f.8 $f.1 break; fi done if test -f $f.1 then if /cluster/bin/i386/blastToPsl $f.1 $f.2 then liftUp -nosort -type=".psl" -nohead $f.3 /cluster/data/danRer5/blastDb.lft carry $f.2 liftUp -nosort -type=".psl" -pslQ -nohead $3.tmp /cluster/data/hg18/bed/blat.hg18KG/protein.lft warn $f.3 if pslCheck -prot $3.tmp then mv $3.tmp $3 rm -f $f.1 $f.2 $f.3 $f.4 fi exit 0 fi fi rm -f $f.1 $f.2 $3.tmp $f.8 $f.3 $f.4 exit 1 '_EOF_' # << happy emacs chmod +x blastSome gensub2 query.lst kg.lst blastGsub blastSpec exit ssh kk cd /cluster/data/danRer5/bed/tblastn.hg18KG para create blastSpec # para try, check, push, check etc. para time # Completed: 75786 of 75786 jobs # CPU time in finished jobs: 13289094s 221484.91m 3691.42h 153.81d 0.421 y # IO & Wait Time: 6534562s 108909.36m 1815.16h 75.63d 0.207 y # Average job time: 262s 4.36m 0.07h 0.00d # Longest finished job: 1158s 19.30m 0.32h 0.01d # Submission to last job: 37516s 625.27m 10.42h 0.43d ssh kkstore06 cd /cluster/data/danRer5/bed/tblastn.hg18KG mkdir chainRun cd chainRun tcsh cat << '_EOF_' > chainGsub #LOOP chainOne $(path1) #ENDLOOP '_EOF_' cat << '_EOF_' > chainOne (cd $1; cat q.*.psl | simpleChain -prot -outPsl -maxGap=100000 stdin /cluster/bluearc/danRer5/bed/tblastn.hg18KG/blastOut/c.`basename $1`.psl) '_EOF_' chmod +x chainOne ls -1dS /cluster/bluearc/danRer5/bed/tblastn.hg18KG/blastOut/kg?? > chain.lst gensub2 chain.lst single chainGsub chainSpec # do the cluster run for chaining ssh pk cd /cluster/data/danRer5/bed/tblastn.hg18KG/chainRun para create chainSpec para maxNode 30 para try, check, push, check etc. # Completed: 138 of 138 jobs # CPU time in finished jobs: 83388s 1389.80m 23.16h 0.97d 0.003 y # IO & Wait Time: 43355s 722.58m 12.04h 0.50d 0.001 y # Average job time: 918s 15.31m 0.26h 0.01d # Longest finished job: 21693s 361.55m 6.03h 0.25d # Submission to last job: 21713s 361.88m 6.03h 0.25d ssh kkstore06 cd /cluster/data/danRer5/bed/tblastn.hg18KG/blastOut for i in kg?? do cat c.$i.psl | awk "(\$13 - \$12)/\$11 > 0.6 {print}" > c60.$i.psl sort -rn c60.$i.psl | pslUniq stdin u.$i.psl awk "((\$1 / \$11) ) > 0.60 { print }" c60.$i.psl > m60.$i.psl echo $i done sort -T /tmp -k 14,14 -k 16,16n -k 17,17n u.*.psl m60* | uniq > /cluster/data/danRer5/bed/tblastn.hg18KG/blastHg18KG.psl cd .. 
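# (Filter rationale, sketch: in PSL, field 1 is matches, 11 is qSize, 12 is
# qStart and 13 is qEnd, so the c60 files keep alignments spanning more
# than 60% of the query and the m60 files keep those where more than 60%
# of the query bases match.)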
    pslCheck blastHg18KG.psl

    # load table
    ssh hgwdev
    cd /cluster/data/danRer5/bed/tblastn.hg18KG
    hgLoadPsl danRer5 blastHg18KG.psl

    # check coverage
    featureBits danRer5 blastHg18KG
    # 21056644 bases of 1435609608 (1.467%) in intersection
    featureBits danRer4 blastHg18KG
    # 21159392 bases of 1626093931 (1.301%) in intersection
    featureBits danRer5 ensGene:cds blastHg18KG -enrichment
    # ensGene:cds 2.114%, blastHg18KG 1.467%, both 1.077%, cover 50.92%,
    # enrich 34.72x
    featureBits danRer4 ensGene:cds blastHg18KG -enrichment
    # ensGene:cds 2.230%, blastHg18KG 1.301%, both 1.058%, cover 47.46%,
    # enrich 36.47x

    ssh kkstore06
    rm -rf /cluster/data/danRer5/bed/tblastn.hg18KG/blastOut
    rm -rf /cluster/bluearc/danRer5/bed/tblastn.hg18KG/blastOut
    # end tblastn
##########################################################################
############################################################################
# TRANSMAP vertebrate.2008-05-20 build (2008-05-24 markd)

    vertebrate-wide transMap alignments were built. Tracks are created and
    loaded by a single Makefile. This is available from:
       svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2008-05-20

    see doc/builds.txt for specific details.
############################################################################
############################################################################
# TRANSMAP vertebrate.2008-06-07 build (2008-06-30 markd)

    vertebrate-wide transMap alignments were built. Tracks are created and
    loaded by a single Makefile. This is available from:
       svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2008-06-30

    see doc/builds.txt for specific details.
############################################################################
# SWAP BLASTZ Medaka oryLat2 (DONE - 2008-08-28 - Hiram)
    cd /cluster/data/oryLat2/bed/blastzDanRer5.2008-08-27
    cat fb.oryLat2.chainDanRer5Link.txt
    # 138070427 bases of 700386597 (19.713%) in intersection

    # and for the swap
    mkdir /cluster/data/danRer5/bed/blastz.oryLat2.swap
    cd /cluster/data/danRer5/bed/blastz.oryLat2.swap
    time doBlastzChainNet.pl -chainMinScore=3000 -chainLinearGap=medium \
        /cluster/data/oryLat2/bed/blastzDanRer5.2008-08-27/DEF \
        -swap -tRepeats=windowmaskerSdust \
        -verbose=2 -smallClusterHub=pk -bigClusterHub=pk > swap.log 2>&1 &
    # real    58m15.303s
    # had to finish the load nets step manually; mistakenly included
    # -qRepeats=windowmaskerSdust when that is not valid for danRer5
    cat fb.danRer5.chainOryLat2Link.txt
    # 158709399 bases of 1435609608 (11.055%) in intersection
    # Then, continuing:
    doBlastzChainNet.pl -chainMinScore=3000 -chainLinearGap=medium \
        /cluster/data/oryLat2/bed/blastzDanRer5.2008-08-27/DEF \
        -continue=download -swap -tRepeats=windowmaskerSdust \
        -verbose=2 -smallClusterHub=pk -bigClusterHub=pk > download.log 2>&1 &
#########################################################################
################################################
# AUTOMATE UPSTREAM FILE CREATION (2008-10-15 markd)
    update genbank.conf:
       danRer5.upstreamGeneTbl = refGene
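    # For context, a hedged sketch of where this line sits in the danRer5
    # stanza of genbank.conf - the neighboring key shown is an illustrative
    # assumption, not copied from the actual file:
    #
    #   danRer5.serverGenome = /cluster/data/danRer5/danRer5.2bit
    #   danRer5.upstreamGeneTbl = refGene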
############################################################################
# TRANSMAP vertebrate.2009-07-01 build (2009-07-21 markd)

    vertebrate-wide transMap alignments were built. Tracks are created and
    loaded by a single Makefile. This is available from:
       svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2009-07-01

    see doc/builds.txt for specific details.
############################################################################
# SWAP BLASTZ FOR DOG (canFam2) (DONE, 2009-08-08, hartera)
# CREATE CHAIN AND NET TRACKS, AXTNET, MAFNET AND ALIGNMENT DOWNLOADS
# See canFam2.txt for creation of zebrafish chain and net tracks and downloads
# on the dog genome assembly.
    mkdir /hive/data/genomes/danRer5/bed/blastz.canFam2.swap
    cd /hive/data/genomes/danRer5/bed/blastz.canFam2.swap
    time nice doBlastzChainNet.pl -chainMinScore=5000 -chainLinearGap=loose \
        -swap /hive/data/genomes/canFam2/bed/blastz.danRer5.2009-08-07/DEF \
        -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=pk \
        >& doBlastz.swap.log &
    cat fb.danRer5.chainCanFam2Link.txt
    # 32053647 bases of 1435609608 (2.233%) in intersection
############################################################################
# TRANSMAP vertebrate.2009-09-13 build (2009-09-20 markd)

    vertebrate-wide transMap alignments were built. Tracks are created and
    loaded by a single Makefile. This is available from:
       svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2009-09-13

    see doc/builds.txt for specific details.
############################################################################
# CREATE LIFTOVER CHAINS FROM danRer5 TO danRer6 (DONE 10/27/09 angie)
    doSameSpeciesLiftOver.pl -debug danRer5 danRer6
    cd /hive/data/genomes/danRer5/bed/blat.danRer6.2009-10-27
    doSameSpeciesLiftOver.pl danRer5 danRer6 >& do.log &
    tail -f do.log
############################################################################
# BUILD U MASS ChIP-Seq TRACKS (DONE 3/28/11, Fan)
    ssh hgwdev
    mkdir -p /hive/data/genomes/danRer5/bed/H3K4me
    cd /hive/data/genomes/danRer5/bed/H3K4me
    # Download the following files from
    # http://lawsonlab.umassmed.edu/ChIP-SeqSupp.html :
    #   Supplementary File 1 REV 24h_input_signal.wig
    #   Supplementary File 2 REV 24h_ME1_signal.wig
    #   Supplementary File 3 REV 24h_ME3_signal.wig
    #   Supplementary File 4 REV 24h_ME1_hotspots.bed
    #   Supplementary File 5 REV 24h_ME1_peaks.WIG
    #   Supplementary File 6 REV 24h_ME3_hotspots.bed
    #   Supplementary File 7 REV 24h_ME3_peaks.WIG

    # build UMassInput track
    cat 'Supplementary File 1 REV 24h_input_signal.wig' \
        | sed -e 's/chrMT/chrM/g' > UMassInput.wig.input
    wigEncode UMassInput.wig.input UMassInput.wig UMassInput.wib
    # Converted UMassInput.wig.input, upper limit 500.00, lower limit 10.00
    ln -s /hive/data/genomes/danRer5/bed/H3K4me/UMassInput.wib \
        /gbdb/danRer5/wib/UMassInput.wib
    hgLoadWiggle danRer5 UMassInput UMassInput.wig
    #WARNING: Exceeded Zv7_NA1024 size 99420 > 99329. dropping 10 data point(s)
    #WARNING: Exceeded Zv7_NA134 size 97140 > 97109. dropping 4 data point(s)
    #WARNING: Exceeded Zv7_NA2952 size 14610 > 14546. dropping 7 data point(s)
    #WARNING: Exceeded Zv7_NA3149 size 24520 > 24469. dropping 6 data point(s)
    #WARNING: Exceeded Zv7_NA3766 size 23600 > 23589. dropping 2 data point(s)
    #WARNING: Exceeded Zv7_NA7305 size 3820 > 3819. dropping 1 data point(s)
    #WARNING: Exceeded chrM size 16690 > 16596. dropping 10 data point(s)
    #WARNING: Exceeded chrM size 16720 > 16596. dropping 13 data point(s)
    # These warnings just mean a few data points lie beyond the ends of the
    # scaffolds in this assembly and are discarded; a hedged pre-check
    # sketch follows below.
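    # A hedged pre-check one could run before wigEncode to spot data points
    # starting beyond the assembly's scaffold ends (assumes variableStep
    # records with a "chrom=" declaration; it only checks start positions,
    # so points whose span runs past the end would still be missed; file
    # names are just the ones used above):
    hgsql -N -e 'select chrom, size from chromInfo;' danRer5 > chrom.sizes.txt
    awk 'NR == FNR {size[$1] = $2; next}
         /^variableStep/ {
             match($0, /chrom=[^ \t]+/)
             c = substr($0, RSTART + 6, RLENGTH - 6)
             next
         }
         /^[0-9]/ && $1 + 0 > size[c] + 0 {print c, $1}' \
        chrom.sizes.txt UMassME1.wig.input | head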
    # Build UMassME1 track
    cat 'Supplementary File 2 REV 24h_ME1_signal.wig' \
        | sed -e 's/chrMT/chrM/g' > UMassME1.wig.input
    wigEncode UMassME1.wig.input UMassME1.wig UMassME1.wib
    # Converted UMassME1.wig.input, upper limit 404.00, lower limit 10.00
    ln -s /hive/data/genomes/danRer5/bed/H3K4me/UMassME1.wib \
        /gbdb/danRer5/wib/UMassME1.wib
    hgLoadWiggle danRer5 UMassME1 UMassME1.wig
    #WARNING: Exceeded Zv7_NA1266 size 52350 > 52297. dropping 6 data point(s)
    #WARNING: Exceeded Zv7_NA1308 size 18360 > 18317. dropping 5 data point(s)
    #WARNING: Exceeded Zv7_NA1320 size 40410 > 40361. dropping 5 data point(s)
    #WARNING: Exceeded Zv7_NA1723 size 44750 > 44727. dropping 3 data point(s)
    #WARNING: Exceeded Zv7_NA1741 size 44070 > 44004. dropping 7 data point(s)
    #WARNING: Exceeded Zv7_NA1868 size 34760 > 34733. dropping 3 data point(s)
    #WARNING: Exceeded Zv7_NA2098 size 13950 > 13871. dropping 8 data point(s)
    #WARNING: Exceeded Zv7_NA2952 size 14640 > 14546. dropping 10 data point(s)
    #WARNING: Exceeded Zv7_NA3149 size 24520 > 24469. dropping 6 data point(s)
    #WARNING: Exceeded Zv7_NA3282 size 15540 > 15514. dropping 3 data point(s)
    #WARNING: Exceeded Zv7_NA3685 size 5030 > 5011. dropping 2 data point(s)
    #WARNING: Exceeded Zv7_NA3793 size 3860 > 3857. dropping 1 data point(s)
    #WARNING: Exceeded Zv7_NA4436 size 7570 > 7547. dropping 3 data point(s)
    #WARNING: Exceeded Zv7_NA4449 size 2260 > 2210. dropping 6 data point(s)
    #WARNING: Exceeded Zv7_NA5160 size 4560 > 4535. dropping 3 data point(s)
    #WARNING: Exceeded Zv7_NA6710 size 11900 > 11826. dropping 8 data point(s)
    #WARNING: Exceeded Zv7_NA6758 size 4590 > 4582. dropping 1 data point(s)
    #WARNING: Exceeded Zv7_NA7355 size 2680 > 2667. dropping 2 data point(s)
    #WARNING: Exceeded Zv7_NA7403 size 4870 > 4815. dropping 6 data point(s)
    #WARNING: Exceeded Zv7_NA7408 size 4180 > 4175. dropping 1 data point(s)

    # Build UMassME1Peak track
    cat 'Supplementary File 5 REV 24h_ME1_peaks.WIG' \
        | sed -e 's/chrMT/chrM/g' > UMassME1Peak.wig.input
    wigEncode UMassME1Peak.wig.input UMassME1Peak.wig UMassME1Peak.wib
    # Converted UMassME1Peak.wig.input, upper limit 2510.00, lower limit 20.00
    ln -s /hive/data/genomes/danRer5/bed/H3K4me/UMassME1Peak.wib \
        /gbdb/danRer5/wib/UMassME1Peak.wib
    hgLoadWiggle danRer5 UMassME1Peak UMassME1Peak.wig
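    # A hedged way to double-check the value range wigEncode reported for
    # this peaks file (assumes the score is the last field on each data
    # line, which holds for variableStep, fixedStep, and bed-style input;
    # the printed min/max should agree with the lower/upper limits above,
    # 20 and 2510 here):
    awk '!/^track|^variableStep|^fixedStep|^#/ {
             v = $NF + 0
             if (min == "" || v < min) min = v
             if (v > max) max = v
         }
         END {print "min=" min, "max=" max}' UMassME1Peak.wig.input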
    # Build UMassME3 track
    cat 'Supplementary File 3 REV 24h_ME3_signal.wig' \
        | sed -e 's/chrMT/chrM/g' > UMassME3.wig.input
    wigEncode UMassME3.wig.input UMassME3.wig UMassME3.wib
    # Converted UMassME3.wig.input, upper limit 477.00, lower limit 10.00
    ln -s /hive/data/genomes/danRer5/bed/H3K4me/UMassME3.wib \
        /gbdb/danRer5/wib/UMassME3.wib
    hgLoadWiggle danRer5 UMassME3 UMassME3.wig
    #WARNING: Exceeded Zv7_NA1024 size 99410 > 99329. dropping 9 data point(s)
    #WARNING: Exceeded Zv7_NA1266 size 52370 > 52297. dropping 8 data point(s)
    #WARNING: Exceeded Zv7_NA1308 size 18380 > 18317. dropping 7 data point(s)
    #WARNING: Exceeded Zv7_NA1417 size 8250 > 8219. dropping 4 data point(s)
    #WARNING: Exceeded Zv7_NA153 size 43460 > 43438. dropping 3 data point(s)
    #WARNING: Exceeded Zv7_NA1741 size 44020 > 44004. dropping 2 data point(s)
    #WARNING: Exceeded Zv7_NA1913 size 14430 > 14424. dropping 1 data point(s)
    #WARNING: Exceeded Zv7_NA2378 size 10240 > 10147. dropping 10 data point(s)
    #WARNING: Exceeded Zv7_NA2552 size 2110 > 2041. dropping 7 data point(s)
    #WARNING: Exceeded Zv7_NA2952 size 14640 > 14546. dropping 10 data point(s)
    #WARNING: Exceeded Zv7_NA3149 size 24580 > 24469. dropping 12 data point(s)
    #WARNING: Exceeded Zv7_NA3282 size 15630 > 15514. dropping 12 data point(s)
    #WARNING: Exceeded Zv7_NA4966 size 4270 > 4220. dropping 6 data point(s)
    #WARNING: Exceeded Zv7_NA5160 size 4560 > 4535. dropping 3 data point(s)
    #WARNING: Exceeded Zv7_NA6710 size 11900 > 11826. dropping 8 data point(s)
    #WARNING: Exceeded Zv7_NA7305 size 3850 > 3819. dropping 4 data point(s)
    #WARNING: Exceeded Zv7_NA7355 size 2760 > 2667. dropping 10 data point(s)
    #WARNING: Exceeded Zv7_NA7403 size 4870 > 4815. dropping 6 data point(s)
    #WARNING: Exceeded Zv7_NA7450 size 5410 > 5333. dropping 8 data point(s)
    #WARNING: Exceeded Zv7_NA7545 size 12030 > 11945. dropping 9 data point(s)
    #WARNING: Exceeded Zv7_NA78 size 146010 > 145973. dropping 4 data point(s)

    # Build UMassME3Peak track
    cat 'Supplementary File 7 REV 24h_ME3_peaks.WIG' \
        | sed -e 's/chrMT/chrM/g' > UMassME3Peak.wig.input
    wigEncode UMassME3Peak.wig.input UMassME3Peak.wig UMassME3Peak.wib
    # Converted UMassME3Peak.wig.input, upper limit 6835.00, lower limit 16.00
    ln -s /hive/data/genomes/danRer5/bed/H3K4me/UMassME3Peak.wib \
        /gbdb/danRer5/wib/UMassME3Peak.wib
    hgLoadWiggle danRer5 UMassME3Peak UMassME3Peak.wig

    # Build UMassME1Hotspot track
    cat 'Supplementary File 4 REV 24h_ME1_hotspots.bed' | grep -v 'hotspot' \
        > UMassME1Hotspot.bed
    hgLoadBed danRer5 UMassME1Hotspot UMassME1Hotspot.bed
    # Loaded 41478 elements of size 5

    # Build UMassME3Hotspot track
    cat 'Supplementary File 6 REV 24h_ME3_hotspots.bed' | grep -v 'hotspot' \
        > UMassME3Hotspot.bed
    hgLoadBed danRer5 UMassME3Hotspot UMassME3Hotspot.bed
    # Loaded 29033 elements of size 5
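    # As a hedged final check (not part of the original build), the loaded
    # bed tables could be verified against chromInfo bounds:
    checkTableCoords danRer5 UMassME1Hotspot
    checkTableCoords danRer5 UMassME3Hotspot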