# for emacs: -*- mode: sh; -*-

# Bos taurus
# Btau_2.0 from Baylor
# March 2005
# contigs and chromosomes
# WGS, Atlas assembler, 6x coverage
# ftp://ftp.hgsc.bcm.tmc.edu/pub/data/Btaurus/fasta/Btau20050310-freeze/
#   anonymous ftp, no login/password needed
# http://www.hgsc.bcm.tmc.edu/projects/bovine/
# bovine chrom map: http://locus.jouy.inra.fr/fpc/cattle/WebAGCoL/WebChrom/index.html
# agp format: http://www.ncbi.nlm.nih.gov/Genbank/WGS.agpformat.html
# also old page here: http://bioinfo.hku.hk/ucsc/goldenPath/datorg.html

# NOTE: this doc may have genePred loads that fail to include
# the bin column.  Please correct that for the next build by adding
# a bin column when you make any of these tables:
#
# mysql> SELECT tableName, type FROM trackDb WHERE type LIKE "%Pred%";
# +-------------+---------------------------------+
# | tableName   | type                            |
# +-------------+---------------------------------+
# | refGene     | genePred refPep refMrna         |
# | xenoRefGene | genePred xenoRefPep xenoRefMrna |
# | sgpGene     | genePred sgpPep                 |
# | geneid      | genePred geneidPep              |
# | genscan     | genePred genscanPep             |
# +-------------+---------------------------------+

# CREATE STORAGE AREA ON STORE11 (DONE Jul. 18, 2005 galt)
    ssh kkstore02
    mkdir /cluster/store11/bosTau2
    cd /cluster/data
    ln -s /cluster/store11/bosTau2 bosTau2

# DOWNLOAD SEQUENCE (DONE Jul. 18, 2005 galt)
    cd /cluster/data/bosTau2
    mkdir downloads
    cd downloads
    # download all for completeness
    wget -r "ftp://ftp.hgsc.bcm.tmc.edu/pub/data/Btaurus/fasta/Btau20050310-freeze/"
    mv ftp.hgsc.bcm.tmc.edu/pub/data/Btaurus/fasta/Btau20050310-freeze/* .
    rm -fr ftp.hgsc.bcm.tmc.edu

# GUNZIP
    gunzip */*.gz
    # note: this turned up an error, which I sent to Kim Worley, in
    # Chr10.fa.qual.gz, Chr11.fa.qual.gz, and ChrUn.fa.qual.gz:
    #   "gunzip: linearScaffolds/Chr10.fa.qual.gz: invalid compressed data--format violated"
    # The next day she posted new ones, which downloaded and decompressed correctly.

# CHECK FILES
    # Check that IDs in .fa match IDs in .qual
    # (run in each of: Bin0, contigs, linearScaffolds, repeat-reads)
    ssh kkstore02
    cd /cluster/data/bosTau2/downloads
    grep -h ">" *.fa > fa.id
    grep -h ">" *.qual > qual.id
    diff fa.id qual.id
    # note: also re-did this for the 3 re-posted qual.gz files mentioned above,
    # and the ids did match the .fa ok

# I'm thinking really it would be better to keep
# the chromosomes with real numbers as chromosomes,
# and to keep chrUn as scaffolds.  These days we
# handle scaffolds pretty well.  This approach
# has the advantage of avoiding "false joins" when
# we align RNA to chrUn.  It also has the advantage
# of making us more interoperable with Ensembl.
#   -Jim

# For the bin0 stuff, how many are there?
# They're probably small enough, and there are enough of them,
# that it'd make the performance not so great.
# It'd be good then to put them in one pseudo-chromosome,
# perhaps one with not much space between them, so
# as to keep the whole thing less than 4 gig.
#
# The system handles chromosomes up to 512 MB.
# The whole genome needs to come in under 4 gig for BLAT.
# Each chromosome needs to be 512 meg or less for the
# binning scheme in the browser.
#   -Jim

# No chrom/scaffold is individually over 372M,
# so it's less than the 512M bin-scheme max size,
# and collectively, the cow2 genome is 3.4 GB,
# which is less than the 4GB BLAT limit.
#   -Galt

# Let's not split any table for the cow.
#   -Jim
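# A quick way to double-check both limits once the final assembly fasta
# exists (a sketch using just faSize/awk; UCSC_Cow2.0.fa is built in the
# agp/fasta section below):
    faSize -detailed UCSC_Cow2.0.fa \
      | awk '$2 > 536870912 {print "over 512MB bin-scheme limit:", $1, $2}'
    faSize -detailed UCSC_Cow2.0.fa \
      | awk '{t += $2} END {if (t > 4294967296) print "over 4GB BLAT limit:", t; else print "total ok:", t}'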
# DOWNLOAD MITOCHONDRION GENOME SEQUENCE (DONE 7/18/05 galt)
    cd /cluster/data/bosTau2/downloads
    # go to http://www.ncbi.nih.gov/ and search Nucleotide for
    # "bos taurus mitochondrion genome".  That shows the gi number:
    #   12800
    # Use that number in the entrez linking interface to get fasta:
    wget -O chrM.fa \
      'http://www.ncbi.nlm.nih.gov/entrez/query.fcgi?cmd=Text&db=Nucleotide&uid=12800&dopt=FASTA'
    # Edit chrM.fa: make sure the long fancy header line says it's the
    # Bos taurus mitochondrion complete genome, and then replace the
    # header line with just ">chrM".
    echo "chrM 1 16338 1 W chrM 1 16338 +" > chrM.agp

# RE-DID THIS WHOLE SECTION (DONE 7/23/05 galt)
    # Fix the chrom ids in contigs: "Chr" to "chr" and chr30 to chrX
    sed -e 's/^Chr/chr/' contigs/Btau20050310.agp | sed 's/^chr30/chrX/' > temp.agp

    # Create chrBin0.agp from bin0/
    cd /cluster/data/bosTau2/downloads/bin0
    scaffoldFaToAgp bin0-reads.fa
    cd ..

    # fix bin0-reads.agp, which has D instead of W and contig instead of clone,
    # is called chrUn instead of chrBin0,
    # and has ids like gnl|baylor|9999999 that are too long; shorten to gnl9999999
    sed -e 's/[\t]D[\t]/\tW\t/' bin0/bin0-reads.agp \
      | sed 's/[\t]contig[\t]/\tclone\t/' \
      | sed 's/chrUn[\t]/\tchrBin0\t/' \
      | sed 's/|baylor|//' \
      > tempBin0.agp

    # gaps of size 1000 lead to chrBin0 with positions > 512MB,
    # so use my agpTweak program to reduce the size of the clone gaps
    agpTweak tempBin0.agp tempBin0-500.agp -agpSeq=chrBin0 -cloneGapSize=500

    cat temp.agp tempBin0-500.agp chrM.agp > tempAll.agp

    # Split chrUn into scaffolds
    agpTweak tempAll.agp UCSC_Cow2.0.agp -agpSeq=chrUn -splitIntoScaffolds

    # Create chrom and scaffold lists
    awk '{print $1;}' UCSC_Cow2.0.agp | uniq | grep -v scaffold > ../chrom.lst
    awk '{print $1;}' UCSC_Cow2.0.agp | uniq | grep scaffold > ../scaffold.lst

    # make chrBin0.fa; also fix gnl ids here
    sed -e 's/|baylor|//' bin0/bin0-reads.fa > tempBin0.fa
    cat contigs/Btau20050310.contigs.fa tempBin0.fa chrM.fa > tempAll.fa

    # found out that agpAllToFaFile does not preserve order,
    # so I extended agpAllToFaFile with a -sort option to preserve agp chrom order
    agpAllToFaFile -sort UCSC_Cow2.0.agp tempAll.fa UCSC_Cow2.0.fa

    # checkAgpAndFa for scaffolds (only works if agp and fa are sorted in the same order)
    checkAgpAndFa UCSC_Cow2.0.agp UCSC_Cow2.0.fa | tail -1
    #All AGP and FASTA entries agree - both files are valid

    faSize UCSC_Cow2.0.fa
    #3525488607 bases (713506215 N's 2811982392 real 2811982392 upper 0 lower) in 98090 sequences in 1 files
    #Total size: mean 35941.4 sd 1613747.0 min 101 (scaffold97899) max 381246137 (chrBin0) median 1186
    #N count: mean 7274.0 sd 634636.2
    #U count: mean 28667.4 sd 1055740.1
    #L count: mean 0.0 sd 0.0

# SELECTED DOWNLOAD FILES (DONE 07/23/05, galt)
    ssh hgwdev
    cd /usr/local/apache/htdocs/goldenPath
    mkdir bosTau2
    cd bosTau2
    mkdir bigZips
    cd bigZips
    cp /cluster/data/bosTau2/downloads/UCSC_Cow2.0.agp .
    nice gzip UCSC_Cow2.0.agp
    cp /cluster/data/bosTau2/downloads/UCSC_Cow2.0.fa .
    nice gzip UCSC_Cow2.0.fa
    # takes 45 min.

# CREATING DATABASE (DONE Jul. 20, 2005 galt)
    hgsql mysql -e 'create database bosTau2'
    # Use df to make sure there is at least 75G free on hgwdev:/var/lib/mysql
    df -h /var/lib/mysql
    #Filesystem            Size  Used Avail Use% Mounted on
    #/dev/sdc1             1.8T  955G  706G  58% /var/lib/mysql

# CREATING GRP TABLE FOR TRACK GROUPING (DONE Jul. 20, 2005 galt)
    hgsql bosTau2 -e 'create table grp (PRIMARY KEY(NAME)) select * from hg17.grp'
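    # A minimal spot-check that the cloned grp rows came across (a sketch;
    # assumes the usual grp columns name/label/priority):
    hgsql bosTau2 -e 'select name, label, priority from grp order by priority'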
# MAKE CHROMINFO TABLE WITH (TEMPORARILY UNMASKED) 2BIT (DONE 07/21/05 galt)
    # Make .2bit, unmasked until the RepeatMasker and TRF steps are done.
    # Do this now so we can load up RepeatMasker and run featureBits;
    # we can also load up other tables that don't depend on masking.
    ssh kkstore02
    cd /cluster/data/bosTau2
    #OLDWAY nice faToTwoBit ?{,?}/chr*.fa Bin0/chrBin0.fa scaffolds/scaffolds.fa bosTau2.2bit
    nice faToTwoBit downloads/UCSC_Cow2.0.fa bosTau2.2bit

    # Make symbolic links from /gbdb/bosTau2/ to the real .2bit.
    ssh hgwdev
    mkdir /gbdb/bosTau2
    ln -s /cluster/data/bosTau2/bosTau2.2bit /gbdb/bosTau2/
    mkdir -p bed/chromInfo

    # Load /gbdb/bosTau2/bosTau2.2bit paths into database and save size info.
    twoBitInfo bosTau2.2bit stdout \
      | awk '{print $1 "\t" $2 "\t/gbdb/bosTau2/bosTau2.2bit";}' \
      > bed/chromInfo/chromInfo.tab
    hgsql bosTau2 < $HOME/kent/src/hg/lib/chromInfo.sql
    hgsql bosTau2 -e 'load data local infile \
      "/cluster/data/bosTau2/bed/chromInfo/chromInfo.tab" \
      into table chromInfo;'
    echo "select chrom,size from chromInfo" | hgsql -N bosTau2 > chrom.sizes
    # take a look at chrom.sizes -- the boilerplate expects ~41 lines, but
    # the count here is large because we keep scaffolds as well as chroms:
    wc chrom.sizes
    #  98090  196180 1833156 chrom.sizes

# GOLD AND GAP TRACKS (DONE 07/22/05 galt)
    ssh hgwdev
    cd /cluster/data/bosTau2
    # create chrN_gold and chrN_gap tables
    hgGoldGapGl -noGl bosTau2 downloads/UCSC_Cow2.0.agp
    # unexpected coords (10239932, 10239932) for frag Contig53769 in chrom chr21
    #   chr21 10239932 10239932 1248 W Contig53769 2146 2146 -
    # unexpected coords (30727, 30727) for frag Contig108449 in chrom scaffold4717
    # unexpected coords (1, 1) for frag Reptig1055060 in chrom scaffold19165
    # This warning is not an error: there are over 10000 records in the original agp
    # where fragStart is not 1.  This contig may not have helped much, but it did fill in
    # one base next to the succeeding adjoining contig.

    # Refresh index stats with "analyze table".
    hgsql bosTau2 -e 'analyze table gold; analyze table gap;'
    # check cardinalities:
    hgsql bosTau2 -e 'show index from gap'
    hgsql bosTau2 -e 'show index from gold'
    # cardinalities ok

    # The best index is chrom+bin, because chrom helps if you have many scaffolds
    # and bin helps when you are zoomed in to a particular position.
    # Having chrom+start and chrom+end indexes only confuses MySQL, which then uses
    # the wrong index.  So I am removing these moot indexes, and now it goes really fast
    # when zoomed in on a chrom or scaffold:
    hgsql bosTau2 -e 'drop index chrom_2 on gap'
    hgsql bosTau2 -e 'drop index chrom_3 on gap'
    hgsql bosTau2 -e 'drop index chrom_2 on gold'
    hgsql bosTau2 -e 'drop index chrom_3 on gold'

    cd /cluster/home/heather/jdbc
    source javaenv
    java AssemblyGapChain bosTau2
    # No anomalies detected except "Contiguous rows detected at gold pos ..."
    # There seem to be lots of these in the Baylor .agp.  This is ok.
    # Note that these seem to be the case where a reptig is right next to a contig.
    #
    # Heather's comment:
    # "It's fine about the contiguous gaps.  Those are from overlapping
    #  contigs; they have to truncate one of them."
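    # To confirm MySQL picks the intended index after the drops above, an
    # EXPLAIN spot-check (a sketch; the exact plan output will vary):
    hgsql bosTau2 -e 'explain select count(*) from gold where chrom="scaffold4717"'
    hgsql bosTau2 -e 'explain select count(*) from gap where chrom="chr1" and bin=585'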
2005", \ "/gbdb/bosTau2/nib", "Cow", "chr1:10666612-10673232", 1, \ 19, "Cow", "Bos taurus", \ "/gbdb/bosTau2/html/description.html", 0, 0, \ "Baylor Btau_2.0");' \ | hgsql -h genome-testdb hgcentraltest # add rows to hgcentraltest on hgwbeta in dbDb # set orderKey past cow1 temporarily # update orderKey to swap cow2, cow1 so cow2 comes first in dropdownlist echo 'update dbDb set orderKey=20 where name="bosTau1"' \ | hgsql -h genome-testdb hgcentraltest # update defaultDb to make cow2 the default # If there's another cow assembly, this will be an update not insert: echo 'update defaultDb set name="bosTau2" where name="bosTau1"' \ | hgsql -h genome-testdb hgcentraltest # clone the description.html page from bosTau1 cd /cluster/data/bosTau2 cp -r /cluster/data/bosTau1/html/ . ln -s /cluster/data/bosTau2/html /gbdb/bosTau2/ # make basic edits necessary vi html/description.html cd ${HOME}/kent/src/hg/makeDb/trackDb/cow/bosTau2/ cp /cluster/data/bosTau2/html/description.html . cvs add description.html cvs ci -m "added initial description page" description.html # CREATE CONTIGORIENT TRACK (DONE 08/03/05 galt) # This was done to provide the user with info in the agp comment field # that would otherwise be lost. # Some of the contigs say "#unorientedScaffold" in field 10. # This info is extracted as a bed 6 + and loaded as follows: ssh hgwdev cd /cluster/data/bosTau2/bed mkdir contigOrient cd contigOrient gawk '{if (($5 == "W") && ($1 != "chrUn")) {if ($10=="") score="1000"; else score="500";print $1 "\t" ($2 -1) "\t" $3 "\t" $6 "\t" score "\t" $9}}' ../../downloads/temp.agp > oriented.bed hgLoadBed bosTau2 contigOrient oriented.bed #Reading oriented.bed #Loaded 118581 elements of size 6 #Sorted #Creating table definition for contigOrient #Saving bed.tab #Loading bosTau2 cd ~/kent/src/hg/makeDb/trackDb/ vi cow/bosTau2/contigOrient.html #------------ #track contigOrient #shortLabel Contig Orient #longLabel Contig Orientation #priority 15 #group map #visibility hide #type bed 6 + #useScore 1 #chromosomes chr1,chr10,chr11,chr12,chr13,chr14,chr15,chr16,chr17,chr18,chr19,chr2,chr20,chr21,chr22,chr23,chr24,chr25,chr26,chr27,chr28,chr29,chr3,chr4,chr5,chr6,chr7,chr8,chr9,chrX make update DBS=bosTau2 ZOO_DBS= # SPLIT SCAFFOLDS (DONE 07/23/05 galt) ssh kkstore02 cd /cluster/data/bosTau2 mkdir scaffolds cd scaffolds /cluster/bin/i386/faSplit byname ../downloads/UCSC_Cow2.0.fa scaffolds -outDirDepth=3 # find big files over 500k to split du -ka | grep "[.]fa" | gawk '{if ($1 > 500) print $2}' > bigfiles #splitBig.csh #/bin/tcsh -ef foreach b (`cat bigfiles`) echo "splitting $b:h $b:t" pushd $b:h set contig = $b:t:r echo $contig faSplit gap $contig.fa 500000 ${contig}_ -lift=$contig.lft -minGapSize=100 popd end chmod +x splitBig.csh splitBig.csh # REPEATMASKER (DONE 07/23/05 galt) # using the split scaffold fa files generated earlier # note the version of RepeatMasker in use; will want this for download README cat /cluster/bluearc/RepeatMasker/Libraries/version # RepBase Update 9.11, RM database version 20050112 # do a trial run ssh hgwdev cd /cluster/data/bosTau2/scaffolds/0/0/0 /cluster/bluearc/RepeatMasker/RepeatMasker -ali -s -spec bos scaffold1000.fa #took about 40 min, no errors # configure ssh kkstore02 cd /cluster/data/bosTau2 mkdir jkStuff cp /cluster/data/anoGam1/jkStuff/RMAnopheles jkStuff/RMBosTaurus # change references anoGam1 --> bosTau2 including -spec bos mkdir RMRun #makeJoblist-RM.csh # /bin/tcsh -ef cd scaffolds foreach i (0 1 2 3 4 5 6 7 8 9) cd $i foreach j (0 1 2 3 4 5 6 7 8 9) cd $j 
# REPEATMASKER (DONE 07/23/05 galt)
    # using the split scaffold fa files generated earlier
    # note the version of RepeatMasker in use; will want this for the download README
    cat /cluster/bluearc/RepeatMasker/Libraries/version
    # RepBase Update 9.11, RM database version 20050112

    # do a trial run
    ssh hgwdev
    cd /cluster/data/bosTau2/scaffolds/0/0/0
    /cluster/bluearc/RepeatMasker/RepeatMasker -ali -s -spec bos scaffold1000.fa
    # took about 40 min, no errors

    # configure
    ssh kkstore02
    cd /cluster/data/bosTau2
    mkdir jkStuff
    cp /cluster/data/anoGam1/jkStuff/RMAnopheles jkStuff/RMBosTaurus
    # change references anoGam1 --> bosTau2, including -spec bos
    mkdir RMRun

    # makeJoblist-RM.csh:
#!/bin/tcsh -ef
cd scaffolds
foreach i (0 1 2 3 4 5 6 7 8 9)
  cd $i
  foreach j (0 1 2 3 4 5 6 7 8 9)
    cd $j
    foreach k (0 1 2 3 4 5 6 7 8 9)
      cd $k
      foreach f (*.fa)
        # big files already split have a .lft, so skip the big ones
        if (! -e $f:r.lft) then
          echo /cluster/data/bosTau2/jkStuff/RMBosTaurus \
            /cluster/data/bosTau2/scaffolds/$i/$j/$k $f \
            '{'check out line+ /cluster/data/bosTau2/scaffolds/$i/$j/$k/$f.out'}' \
            >> /cluster/data/bosTau2/RMRun/RMJobs
        endif
      end
      cd ..
    end
    cd ..
  end
  cd ..
end

    chmod +x makeJoblist-RM.csh
    makeJoblist-RM.csh

    # do the run
    ssh kk
    cd /cluster/data/bosTau2/RMRun
    para create RMJobs
    # Checking input files
    # 102932 jobs written to batch
    para try
    para push
    para check
    para time
    # etc.

# CHECK REPEATMASKER FINISHED, LIFT BIGFILES AND CONCATENATE OUTPUT (DONE 07/24/05 galt)
    #[kk:/cluster/data/bosTau2/RMRun> para time
    #102932 jobs in batch
    #3 jobs (including everybody's) in Parasol queue.
    #Checking finished jobs
    #Completed: 102932 of 102932 jobs
    #CPU time in finished jobs:   33107057s  551784.28m  9196.40h  383.18d  1.050 y
    #IO & Wait Time:               6194171s  103236.19m  1720.60h   71.69d  0.196 y
    #Average job time:                 382s       6.36m     0.11h    0.00d
    #Longest running job:                0s       0.00m     0.00h    0.00d
    #Longest finished job:            7956s     132.60m     2.21h    0.09d
    #Submission to last job:         77640s    1294.00m    21.57h    0.90d

    # Lift up the 500KB chunk .out's to scaffold level
    ssh kkstore02
    cd /cluster/data/bosTau2/scaffolds

    # liftBig.csh:
#!/bin/tcsh -ef
foreach b (`cat bigfiles`)
    echo "lifting $b:h $b:t"
    pushd $b:h
    set contig = $b:t:r
    echo $contig
    liftUp $contig.fa.out $contig.lft warn ${contig}_*.fa.out > /dev/null
    popd
end

    chmod +x liftBig.csh
    liftBig.csh

    # concatenate into one output file; took about 90 minutes
    ssh kkstore02
    cd /cluster/data/bosTau2
    mkdir repeats

    # concatRM.csh:
#!/bin/tcsh -ef
if (-e /cluster/data/bosTau2/repeats/repeats.all) then
    rm /cluster/data/bosTau2/repeats/repeats.all
endif
cd scaffolds
foreach i (0 1 2 3 4 5 6 7 8 9)
  cd $i
  foreach j (0 1 2 3 4 5 6 7 8 9)
    cd $j
    foreach k (0 1 2 3 4 5 6 7 8 9)
      cd $k
      foreach f (*.fa.out)
        # make sure we only get big lifted files, not small split files
        set pf = `echo $f | sed -e 's/[_][0-9][0-9]*[.]fa[.]out$/.fa.out/'`
        if ("$pf" == "$f") then
          echo $f
          cat $f >> /cluster/data/bosTau2/repeats/repeats.all
        endif
      end
      cd ..
    end
    cd ..
  end
  cd ..
end
cd ..

    chmod +x concatRM.csh
    concatRM.csh > concatRM.list
    wc concatRM.list
    #   98090   98090 2048519 concatRM.list
    cd repeats
    wc repeats.all
    # 6044161  89020033 723977658 repeats.all

    # use grep -v to get rid of all the headers,
    # then add back in a single initial header;
    # this is so hgLoadOut is happy
    head -3 repeats.all > repeats.header
    egrep "chr|scaffold" repeats.all > repeats.clean
    cat repeats.header repeats.clean > repeats.final
    grep "There were no repetitive sequences detected" repeats.final > repeats.notdetected
    wc repeats.notdetected
    #  5211  41688 333503 repeats.notdetected
    grep -v "There were no repetitive sequences detected" repeats.final > repeats.out
    wc repeats.out
    # 5760316  86470639 704384016 repeats.out
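    # hgLoadOut (next) warns about a handful of negative percentage fields
    # in this RepeatMasker output; a pre-scan sketch to eyeball them first
    # (columns 2-4 of .out format are perc div/del/ins; the first 3 lines
    # are the header):
    awk 'NR > 3 && ($2 < 0 || $3 < 0 || $4 < 0)' repeats.out | head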
    ssh hgwdev
    hgLoadOut bosTau2 /cluster/data/bosTau2/repeats/repeats.out
    #Processing /cluster/data/bosTau2/repeats/repeats.out
    #Strange perc. field -5.2 line 726783 of /cluster/data/bosTau2/repeats/repeats.out
    #Strange perc. field -0.3 line 1844558 of /cluster/data/bosTau2/repeats/repeats.out
    #Strange perc. field -24.1 line 2227209 of /cluster/data/bosTau2/repeats/repeats.out
    #Strange perc. field -396.9 line 2244938 of /cluster/data/bosTau2/repeats/repeats.out
    #Strange perc. field -763.4 line 2244938 of /cluster/data/bosTau2/repeats/repeats.out
    #Strange perc. field -10363.0 line 2434452 of /cluster/data/bosTau2/repeats/repeats.out
    #Strange perc. field -3147.2 line 2434452 of /cluster/data/bosTau2/repeats/repeats.out
    #Strange perc. field -1217.1 line 2434452 of /cluster/data/bosTau2/repeats/repeats.out
    #Strange perc. field -0.9 line 2613009 of /cluster/data/bosTau2/repeats/repeats.out
    #Strange perc. field -4.0 line 2745397 of /cluster/data/bosTau2/repeats/repeats.out
    #Strange perc. field -0.1 line 3534731 of /cluster/data/bosTau2/repeats/repeats.out
    #Strange perc. field -0.1 line 3726610 of /cluster/data/bosTau2/repeats/repeats.out
    #Strange perc. field -6355.5 line 4849142 of /cluster/data/bosTau2/repeats/repeats.out
    #Strange perc. field -1155.4 line 4849142 of /cluster/data/bosTau2/repeats/repeats.out
    #Strange perc. field -174.3 line 4849142 of /cluster/data/bosTau2/repeats/repeats.out
    #Strange perc. field -0.3 line 5117371 of /cluster/data/bosTau2/repeats/repeats.out
    #Strange perc. field -0.2 line 5117371 of /cluster/data/bosTau2/repeats/repeats.out
    #Strange perc. field -0.9 line 5365557 of /cluster/data/bosTau2/repeats/repeats.out
    #Strange perc. field -1.1 line 5365557 of /cluster/data/bosTau2/repeats/repeats.out
    #Strange perc. field -11.0 line 5561481 of /cluster/data/bosTau2/repeats/repeats.out
    #Loading up table repeats_rmsk
    #note: 116 records dropped due to repStart > repEnd
    #      run with -verbose=2 for details

    hgsql bosTau2 -e 'alter table repeats_rmsk rename rmsk'
    hgsql bosTau2 -e 'select count(*) from rmsk'
    # 5760197
    hgsql bosTau2 -e 'select count(*) from rmsk where repName like "Bov%"'
    # 2193098
    nice featureBits bosTau2 rmsk
    # 1291561904 bases of 2812203870 (45.927%) in intersection

    # The default indexes are slow and not working, because they were optimized
    # for assemblies with a small number of chroms (~30) and a fairly big max
    # position, but we have many small scaffolds, so we need to index on genoName;
    # bin is also useful when zoomed in.
    # I experimented and found that a single compound index
    # on genoName+bin solves this problem well (run in hgsql bosTau2):
    drop index bin on rmsk;
    drop index genoStart on rmsk;
    drop index genoEnd on rmsk;
    create index genoNameBin on rmsk(genoName,bin);
    # I tested this and it is very fast when zoomed in, or when on full scaffolds.
    # Even on the largest full-view for chr1 and chrBin0, the response time is ok.

# SIMPLE REPEATS (DONE 07/25/05 galt)
    # put the results throughout the scaffold 0/0/0 directories,
    # same as RepeatMasker, to avoid too many files in the same directory
    ssh kkstore02
    cd /cluster/data/bosTau2
    mkdir -p bed/simpleRepeat
    cd bed/simpleRepeat

    # makeJoblist-trf.csh:
#!/bin/tcsh -ef
if (-e /cluster/data/bosTau2/bed/simpleRepeat/trf-run.csh) then
    rm /cluster/data/bosTau2/bed/simpleRepeat/trf-run.csh
endif
pushd /cluster/data/bosTau2/scaffolds
foreach i (0 1 2 3 4 5 6 7 8 9)
  cd $i
  foreach j (0 1 2 3 4 5 6 7 8 9)
    cd $j
    foreach k (0 1 2 3 4 5 6 7 8 9)
      cd $k
      foreach f (*.fa)
        # make sure we only get big files, not small split files
        set pf = `echo $f | sed -e 's/[_][0-9][0-9]*[.]fa$/.fa/'`
        if ("$pf" == "$f") then
          set fout = $f:t:r.bed
          echo "/cluster/bin/i386/trfBig -trf=/cluster/bin/i386/trf /cluster/data/bosTau2/scaffolds/$i/$j/$k/$f /dev/null -bedAt=/cluster/data/bosTau2/scaffolds/$i/$j/$k/$fout -tempDir=/tmp" \
            >> /cluster/data/bosTau2/bed/simpleRepeat/trf-run.csh
        endif
      end
      cd ..
    end
    cd ..
  end
  cd ..
end
popd

    chmod +x makeJoblist-trf.csh
    makeJoblist-trf.csh
    wc trf-run.csh
    #   98090  588540 19399078 trf-run.csh
    # This is correct: 98058 scaffolds + chr1-29 + chrM,X,Bin0 = 98090
    chmod +x trf-run.csh
    # do the run
    trf-run.csh >&! trf.log &
    # did re-nice it; took about 4 hours
    grep "Done.faFile" trf.log | wc
    #  98090 1569440 11944238
    # count good.

    # concatenate into one output file; took about an hour
    # concatTRF.csh:
#!/bin/tcsh -ef
if (-e /cluster/data/bosTau2/bed/simpleRepeat/trf.bed) then
    rm -i /cluster/data/bosTau2/bed/simpleRepeat/trf.bed
endif
cd ../../scaffolds
foreach i (0 1 2 3 4 5 6 7 8 9)
  cd $i
  foreach j (0 1 2 3 4 5 6 7 8 9)
    cd $j
    foreach k (0 1 2 3 4 5 6 7 8 9)
      cd $k
      foreach f (*.bed)
        echo $f
        cat $f >> /cluster/data/bosTau2/bed/simpleRepeat/trf.bed
      end
      cd ..
    end
    cd ..
  end
  cd ..
end

    chmod +x concatTRF.csh
    concatTRF.csh > concatTRF.log
    wc concatTRF.log
    #   98090   98090 1754249 concatTRF.log
    # good.

    # load
    ssh hgwdev
    cd /cluster/data/bosTau2/bed/simpleRepeat
    hgLoadBed bosTau2 simpleRepeat trf.bed -sqlTable=${HOME}/kent/src/hg/lib/simpleRepeat.sql
    # Reading trf.bed
    # Loaded 417896 elements of size 16
    # Sorted
    # Saving bed.tab
    # Loading bosTau2

    hgsql bosTau2 -e 'select count(distinct chrom) from simpleRepeat'
    # +-----------------------+
    # | count(distinct chrom) |
    # +-----------------------+
    # |                 37407 |
    # +-----------------------+
    # Many small scaffolds didn't have tandem repeats.
    hgsql bosTau2 -e 'select count(distinct chrom) from simpleRepeat where chrom like "chr%"'
    # +-----------------------+
    # | count(distinct chrom) |
    # +-----------------------+
    # |                    31 |
    # +-----------------------+
    # All the larger chroms had tandem repeats, as expected.

    # Get rid of these indexes that just slow MySQL down and confuse it;
    # as far as I can tell we don't need them.
    hgsql bosTau2 -e 'drop index chrom_2 on simpleRepeat'
    hgsql bosTau2 -e 'drop index chrom_3 on simpleRepeat'

# CREATE MASKED FA USING REPEATMASKER AND FILTERED TRF FILES (DONE 07/26/05 galt)
    ssh kkstore02
    cd /cluster/data/bosTau2
    maskOutFa -soft downloads/UCSC_Cow2.0.fa repeats/repeats.out bosTau2.softmask.fa
    # 117 warnings about negative rEnd; matches this query:
    hgsql bosTau2 -e 'select count(*) from rmsk where repEnd < 0'
    maskOutFa -softAdd bosTau2.softmask.fa bed/simpleRepeat/trf.bed bosTau2.softmask2.fa
    # hard masking (N's instead of lower case) for download files,
    # split for use by genscan
    maskOutFa bosTau2.softmask2.fa hard bosTau2.hardmask.fa
    mkdir hardmask-split
    cd hardmask-split
    faSplit about ../bosTau2.hardmask.fa 2000000 hardmask-split
    # made 687 files, useful with genscan later
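    # Soft-masked bases show up as the "lower" count in faSize; a rough
    # cross-check against the rmsk featureBits number above (a sketch --
    # rmsk and trf regions overlap, so expect ballpark agreement, not equality):
    faSize /cluster/data/bosTau2/bosTau2.softmask2.fa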
# DOWNLOAD FILES (DONE 07/26/05 galt)
    ssh hgwdev
    cd /usr/local/apache/htdocs/goldenPath/bosTau2/bigZips
    cp /cluster/data/bosTau2/bosTau2.softmask.fa .
    cp /cluster/data/bosTau2/bosTau2.softmask2.fa .
    cp /cluster/data/bosTau2/bosTau2.hardmask.fa .
    nice gzip bosTau2.softmask.fa
    nice gzip bosTau2.softmask2.fa
    nice gzip bosTau2.hardmask.fa

# REGENERATE 2BIT (DONE 07/26/05 galt)
    ssh kkstore02
    cd /cluster/data/bosTau2
    mv bosTau2.2bit bosTau2.2bit.unmasked
    faToTwoBit bosTau2.softmask2.fa bosTau2.2bit
    twoBitToFa bosTau2.2bit check.fa
    diff -q bosTau2.softmask2.fa check.fa
    # note: sizes match
    # note: you can also use faCmp

    # Now check in the browser:
    # RepeatMasker (rmsk) and trf (simpleRepeat)
    # should show lower case when the masking option is used.
    # Be sure to use Extended DNA Case/Color Options
    # for simpleRepeat, because it doesn't have strand info,
    # and otherwise lower-casing does not occur with standard getDna.
    # Result: bosTau2 looks good for both rmsk and simpleRepeat.

# GC5BASE (DONE 07/26/05 galt)
    ssh kolossus
    mkdir /cluster/data/bosTau2/bed/gc5Base
    cd /cluster/data/bosTau2/bed/gc5Base
    hgGcPercent -wigOut -doGaps -file=stdout -win=5 bosTau2 \
      /cluster/data/bosTau2 | wigEncode stdin gc5Base.wig gc5Base.wib
    # Calculating gcPercent with window size 5
    # Using twoBit: /cluster/data/bosTau2/bosTau2.2bit
    # File stdout created
    # Converted stdin, upper limit 100.00, lower limit 0.00
    # about 10 minutes

    # Now load those results into the database
    ssh hgwdev
    cd /cluster/data/bosTau2/bed/gc5Base
    mkdir /gbdb/bosTau2/wib
    ln -s `pwd`/gc5Base.wib /gbdb/bosTau2/wib
    hgLoadWiggle -pathPrefix=/gbdb/bosTau2/wib bosTau2 gc5Base gc5Base.wig
    # The default index is no good for this organism due to the scaffold
    # naming scheme; drop the index and create one that works:
    hgsql bosTau2 -e 'drop index chrom on gc5Base'
    hgsql bosTau2 -e 'create index chromBin on gc5Base (chrom,bin)'
    # This speeds up the whole-genome statistics display in hgTables
    # by orders of magnitude.

# PUT MASKED SEQUENCE OUT FOR CLUSTER RUNS (DONE 07/26/05 galt)
    ssh kkr1u00
    mkdir -p /iscratch/i/bosTau2
    cp -p /cluster/data/bosTau2/bosTau2.2bit /iscratch/i/bosTau2/
    iSync
    # if you get error "Permission denied" it means you have to add a ~/.rhosts file,
    # and be sure to chmod 600 ~/.rhosts
    mkdir -p /cluster/bluearc/bosTau2
    cp -p /cluster/data/bosTau2/bosTau2.2bit /cluster/bluearc/bosTau2/

# MAKE 11.OOC FILE FOR BLAT (DONE 07/26/05 galt)
    # Use -repMatch=1005 (based on size -- for human we use 1024, and
    # cow size is 2812203870 / 2866216770 = ~98.1% of human, judging by
    # gapless genome size from featureBits)
    ssh kkr1u00
    mkdir -p /cluster/data/bosTau2/bed/ooc
    cd /cluster/data/bosTau2/bed/ooc
    ls -1 /cluster/data/bosTau2/bosTau2.2bit > nib.lst
    blat nib.lst /dev/null /dev/null -tileSize=11 \
      -makeOoc=/cluster/bluearc/bosTau2/11.ooc -repMatch=1005
    # Wrote 31532 overused 11-mers to /cluster/bluearc/bosTau2/11.ooc
    cp -p /cluster/bluearc/bosTau2/11.ooc /iscratch/i/bosTau2/
    cp -p /cluster/bluearc/bosTau2/11.ooc .
    iSync

    #> My question is, can I still use just a .2bit and no lift?
    #
    # With the largest chr being 97M, no need for a lift, so don't
    # bother.  You actually get better results without a lift.
    #
    # You probably want to use a smaller value for bosTau2.mondoTwoBitParts,
    # I would try 1000.
    #   -mark
    #
    # oops, chrBin0 is 372M.
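    # For the record, the -repMatch=1005 above is just the human default
    # scaled linearly by gapless genome size:
    calc 1024 \* 2812203870 / 2866216770
    # comes out to ~1004.7, rounded to 1005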
# GENBANK (NOTE: All of this was redone by MarkD to test his new process;
# see the next section below)
    ssh hgwdev
    cd ${HOME}/kent/src/hg/makeDb/genbank
    # check for missing commits
    diff /cluster/data/genbank/etc/genbank.conf etc/genbank.conf
    # (there were none)
    # edit etc/genbank.conf and add these lines, starting with the comment:
    vi etc/genbank.conf

# bosTau2 (B. taurus) 30 chromosomes plus 98058 scaffolds
# This uses the mondo 2bit file functionality, where multiple
# sequences from a two bit are aligned in a single job.
# Genbank B. taurus: (these are the old numbers from bosTau1)
#    3720 mRNAs
#  493370 ESTs
# partition 2bit into 1000 parts for initial build
bosTau2.genome = /iscratch/i/bosTau2/bosTau2.2bit
bosTau2.lift = no
bosTau2.refseq.mrna.native.load = no
bosTau2.genbank.mrna.xeno.load = no
bosTau2.genbank.est.xeno.load = no
bosTau2.downloadDir = bosTau2
bosTau2.perChromTables = no
bosTau2.mondoTwoBitParts = 1000

    # note: MarkD later decided he thought 2000 is better for mondo,
    # so we will use that for the ESTs below
    cvs commit -m "added bosTau2" etc/genbank.conf
    # revision 1.116,1.117

    # edit src/lib/gbGenome.c
    vi src/lib/gbGenome.c
    # (no changes were necessary)
    # edit src/align/gbBlat
    vi src/align/gbBlat
    make
    cvs commit -m "added bosTau2" src/align/gbBlat
    # revision 1.61
    # actually I just changed the .ooc path from bosTau1 to bosTau2
    make install-server

    # I tried doing this from kkstore02 but it didn't work, so went back to eieio
    ssh eieio
    cd /cluster/data/genbank
    nice bin/gbAlignStep -srcDb=genbank -type=mrna -initial -verbose=1 bosTau2 &
    # logFile: var/build/logs/2005.08.02-12:00:54.bosTau2.initalign.log
    # /panasas/store/genbank/work/initial.bosTau2/
    # and now it is re-running again on eieio.  Started around 12:15pm on Tues. Aug 2nd.
    # This died by 3pm, with an error.
    # I did a para push again, removed a few dead machines, and it finally finished;
    # para check says all ran ok.
    # instructions: repeat with -continue=finish, which must go before the database argument.
    # note: screen is handy for long-running operations
    screen
    nice bin/gbAlignStep -srcDb=genbank -type=mrna -initial -verbose=1 -continue=finish bosTau2
    # logFile: var/build/logs/2005.08.03-00:07:50.bosTau2.initalign.log
    # ctrl-a ctrl-d to release the terminal.
    # screen -r -d to recapture the terminal.
    # exit to close screen when done.

    # load
    ssh hgwdev
    cd /cluster/data/genbank
    nice bin/gbDbLoadStep -verbose=1 -drop -initialLoad bosTau2
    # logFile: var/dbload/eieio/logs/2005.08.03-11:07:08.dbload.log
    # finished, no errors
    # checked out the new tables list, which seems right
    # added the all_mrna clause to bosTau2/trackDb.ra, copied from cow1
    # did make update on bosTau2
    # looked at the new Cow mRNAs track
    # one problem is that some mRNAs are split up, with parts on different scaffolds,
    # just because the assembly is an early draft and we still have scaffolds.
    # another problem is that some are split on opposite strands of, say, chr1,
    # probably because some of the contigs in the chroms are unoriented.
    # So now I will have to create a track to show which regions are
    # oriented and which are not.

    # MarkD thinks maybe the mondoTwoBitParts should be increased to 2000.
    ssh hgwdev
    cd ${HOME}/kent/src/hg/makeDb/genbank
    diff /cluster/data/genbank/etc/genbank.conf etc/genbank.conf
    vi etc/genbank.conf
    # change to bosTau2.mondoTwoBitParts = 2000
    cvs commit -m "MarkD advises changing to bosTau2.mondoTwoBitParts = 2000" etc/genbank.conf
    # revision 1.119
    make
    make install-server
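    # For eyeballing the split-alignment problem noted above, a hypothetical
    # query (a sketch; assumes the standard psl qName/tName columns in all_mrna):
    hgsql bosTau2 -N -e 'select qName, count(distinct tName) as n \
      from all_mrna group by qName having n > 1' | wc -l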
# GENBANK ESTs -- with a huge number of scaffolds.
    ssh eieio
    cd /cluster/data/genbank
    screen
    bin/gbAlignStep -initial -srcDb=genbank -type=est bosTau2
    # logFile: var/build/logs/2005.08.05-02:12:43.bosTau2.initalign.log
    # ctrl-a,d to detach
    tail -f var/build/logs/2005.08.05-02:12:43.bosTau2.initalign.log
    # to check progress, on kk in /cluster/data/genbank/work/initial.bosTau2/align:
    para check

    # MarkD:
    # Yea, chrBin0.psl is too big for pslIntronsOnly (the intronEst builder).
    # A short-term fix is to finish it on the kki minicluster.
    # So please do the following:
    # on kk, kill the para make if one is still running, then:
    cd /cluster/store3/genbank/work/initial.bosTau2/align/
    para stop
    # use parasol list batches to determine if the jobs have really stopped,
    # then do a para recover:
    para recover align.jobs align2.jobs
    # Now create a directory under the align directory to save the parasol bits
    mkdir firstRun
    mv batch firstRun/
    mv batch.bak firstRun/
    mv err firstRun/
    mv para.results firstRun/
    # Check the align2.jobs file (it should have just the ~21 jobs that
    # were still running and any that had failed 4 times).
    ssh kki
    cd /cluster/store3/genbank/work/initial.bosTau2/align/
    para create align2.jobs
    para push
    # etc.
    # once this has finished, rerun the gbAlignStep command,
    # only adding the -continue=finish option (before the database).
    ssh eieio
    cd /cluster/data/genbank
    screen
    nice bin/gbAlignStep -srcDb=genbank -type=est -initial -verbose=1 -continue=finish bosTau2

    # MarkD wanted a lift file for the big artificial chrBin0
    ssh hgwdev
    cd /cluster/data/bosTau2/downloads
    # need the size of chrBin0:
    hgsql bosTau2 -e 'select size from chromInfo where chrom="chrBin0"'
    # 381246137
    # now use gawk to make the lift file
    gawk '{ \
      if ($5=="W") { \
        print ($2 - 1) "\t" $6 "\t" ($3 - $2 + 1) "\t" "chrBin0" "\t" "381246137" \
      } else { \
        print ($2 - 1) "\t" "gap" "\t" ($3 - $2 + 1) "\t" "chrBin0" "\t" "381246137" \
      } \
    }' tempBin0-500.agp > chrBin0.lft

    # load the ESTs, reloading mRNAs as well
    ssh hgwdev
    cd /cluster/data/genbank
    nice bin/gbDbLoadStep -drop -initialLoad bosTau2
    # add bosTau2 to the list of databases to align in
    # /cluster/data/genbank/etc/align-genbank

# GENBANK: check Non-Cow Refseqs
    # Jim says he doesn't care about these?
    # add to kent/src/hg/makeDb/genbank/etc/genbank.conf and commit:
    #   bosTau2.refseq.mrna.xeno.load = yes
    cd /cluster/data/genbank/etc
    cvs update
    # new tables: xenoRefGene, xenoRefFlat, xenoRefSeqAli, refLink, refSeqStatus,
    # refSeqSummary
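    # For reference, chrBin0.lft above is the standard five-column liftUp format:
    #   offset-in-new  oldName  oldSize  newName  newSize
    # e.g. (values illustrative, not copied from the real file):
    #   0  gnl1234567  812  chrBin0  381246137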
# MARKD re-does it again to test his new genbank process:
# AUTO UPDATE GENBANK MRNA AND EST AND MGC GENES RUN (DONE, 2005-09-05, markd)
    # redo genbank with the revised alignment procedure once again, to
    # pick up the local near-best pslCDnaFilter

    # align with the revised genbank process
    cd ~kent/src/makeDb/genbank
    cvs update -d etc
    # edit etc/genbank.conf to add bosTau2:

# bosTau2 (B. taurus) 30 chromosomes plus 98058 scaffolds
# Lift file partitions unplaced sequence pseudo-chroms
bosTau2.serverGenome = /cluster/data/bosTau2/bosTau2.2bit
bosTau2.clusterGenome = /iscratch/i/bosTau2/bosTau2.2bit
bosTau2.ooc = /iscratch/i/bosTau2/11.ooc
bosTau2.align.unplacedChroms = chrBin0
bosTau2.lift = /cluster/data/bosTau2/downloads/chrBin0.lft
bosTau2.refseq.mrna.native.pslCDnaFilter = ${lowCover.refseq.mrna.native.pslCDnaFilter}
bosTau2.refseq.mrna.xeno.pslCDnaFilter = ${lowCover.refseq.mrna.xeno.pslCDnaFilter}
bosTau2.genbank.mrna.native.pslCDnaFilter = ${lowCover.genbank.mrna.native.pslCDnaFilter}
bosTau2.genbank.mrna.xeno.pslCDnaFilter = ${lowCover.genbank.mrna.xeno.pslCDnaFilter}
bosTau2.genbank.est.native.pslCDnaFilter = ${lowCover.genbank.est.native.pslCDnaFilter}
bosTau2.genbank.est.xeno.pslCDnaFilter = ${lowCover.genbank.est.xeno.pslCDnaFilter}
bosTau2.refseq.mrna.native.load = no
bosTau2.refseq.mrna.xeno.load = yes
bosTau2.genbank.mrna.xeno.load = no
bosTau2.genbank.est.xeno.load = no
bosTau2.downloadDir = bosTau2
bosTau2.perChromTables = no

    # N.B. the above was changed later to include refseqs; don't just copy this,
    # or a spell will be cast on your descendants.

    # update /cluster/data/genbank/
    make etc-update

    ssh kkstore02
    cd /cluster/data/genbank
    nice bin/gbAlignStep -initial bosTau2 &

    # load database when finished
    ssh hgwdev
    cd /cluster/data/genbank
    nice ./bin/gbDbLoadStep -drop -initialLoad bosTau2 &

    # enable daily alignment and update of hgwdev
    cd ~kent/src/makeDb/genbank
    cvs update -d etc
    # add bosTau2 to: etc/align.dbs etc/hgwdev.dbs
    cvs commit
    make etc-update

    # enabled native refSeq: 2006-05-11 markd
    # set this in genbank.conf:
    #   bosTau2.refseq.mrna.native.load = yes
    # kick off an alignment to verify
    ssh kkstore02
    cd /cluster/data/genbank
    (nice ./bin/gbAlignStep bosTau2) |& mail markd &

# GENBANK DOWNLOAD FILES
    # Nope, no need.  It will now be done automatically on hgdownload.
    #   -MarkD
    # output goes to /genbank/download/bosTau2/bigZips
    # these are automatically rsynced to hgnfs1 (which hgdownload reads)

# DROP INDEXES FROM GENBANK TABLES (galt)
    # (these would just slow down most operations)
    ssh hgwdev
    hgsql bosTau2 -e 'drop index tName_2 on all_est'
    hgsql bosTau2 -e 'drop index tname_3 on all_est'
    hgsql bosTau2 -e 'drop index tName_2 on intronEst'
    hgsql bosTau2 -e 'drop index tname_3 on intronEst'
    hgsql bosTau2 -e 'drop index tName_2 on all_mrna'
    hgsql bosTau2 -e 'drop index tname_3 on all_mrna'

# REQUESTED BLAT SERVER 2005-08-07
    # blat server working.
# GENSCAN (DONE 2005-08-12 galt)
    # uses the hard-masked sequence files
    ssh hgwdev
    mkdir /cluster/data/bosTau2/bed/genscan
    cd /cluster/data/bosTau2/bed/genscan
    cvs co hg3rdParty/genscanlinux
    mkdir gtf pep subopt

    # Run on the small cluster (more memory than the big cluster).
    ssh kki
    cd /cluster/data/bosTau2/bed/genscan
    # generate a list of hard-masked scaffolds that are not all-N's
    # (genscan crashes on all-N's) (this is slow; started 6:45pm)
    foreach f ( `ls -1S /cluster/data/bosTau2/hardmask-split/*` )
      egrep '[ACGT]' $f > /dev/null
      if ($status == 0) echo $f >> genome.list
    end
    wc -l genome.list
    # 779 -- should be 689; sorting shows some duplicates crept in,
    # so remove them:
    mv genome.list temp.list
    sort -u temp.list > genome.list
    rm temp.list
    wc -l genome.list
    # 689

    # Create gsub (I changed the method to make it more readable)
    # move this block to the left by putting your cursor on the
    # next line and pressing <%, then paste into window, then press >% to restore
    {
    echo "#LOOP" > gsub
    echo '/cluster/bin/x86_64/gsBig \
    {check in line+ $(path1)} \
    {check out line gtf/$(root1).gtf} \
    -trans={check out line pep/$(root1).pep} \
    -subopt={check out line subopt/$(root1).bed} \
    -exe=hg3rdParty/genscanlinux/genscan \
    -par=hg3rdParty/genscanlinux/HumanIso.smat \
    -tmp=/tmp \
    -window=2400000' \
    | tr -d '\n' >> gsub
    echo "\n#ENDLOOP" >> gsub
    }

    /cluster/bin/i386/gensub2 genome.list single gsub jobList
    para create jobList
    para try, check, push, check, ...
    ls -1 gtf | wc -l
    # 689
    endsInLf gtf/*
    # no messages, so ok

    # Concatenate
    ssh kkstore02
    cd /cluster/data/bosTau2/bed/genscan
    cat gtf/*.gtf > genscan.gtf
    cat pep/*.pep > genscan.pep
    cat subopt/*.bed > genscanSubopt.bed

    # Load
    ssh hgwdev
    cd /cluster/data/bosTau2/bed/genscan
    ldHgGene -gtf bosTau2 genscan genscan.gtf
    #Reading genscan.gtf
    #Read 61380 transcripts in 358757 lines in 1 files
    #  61380 groups 18721 seqs 1 sources 1 feature types
    #61380 gene predictions
    hgPepPred bosTau2 generic genscanPep genscan.pep
    hgLoadBed bosTau2 genscanSubopt genscanSubopt.bed
    #Reading genscanSubopt.bed
    #Loaded 589698 elements of size 6
    #Sorted
    #Creating table definition for genscanSubopt
    #Saving bed.tab
    #Loading bosTau2
    featureBits bosTau2 genscan
    # 58980161 bases of 2812203870 (2.097%) in intersection
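    # Per the NOTE at the top of this doc, ldHgGene as run here leaves the
    # genscan table without a bin column.  For the next build, one way to
    # add it (a sketch, not run here; assumes gtfToGenePred and hgLoadGenePred
    # from the kent tree):
    gtfToGenePred genscan.gtf genscan.gp
    hgLoadGenePred bosTau2 genscan genscan.gp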
# QUALITY TRACK (DONE 2005-08-17 galt)
    # The challenge was a sorting problem:
    # the quality file from Baylor is ordered by contig,
    # but we need quality data by scaffold, so the gold table was used.
    # It won't have any data for chrM or for chrBin0, which is fine.
    ssh hgwdev
    cd /cluster/data/bosTau2/bed
    mkdir quality
    cd quality
    # I wrote a C program, /cluster/store11/bosTau2/bed/quality/quality.c,
    # to process the gold table together with
    # the contigs.fa.qual data from Baylor.
    # Because the qual file is so big, I do not load
    # it all at once; instead I read it once and
    # create an index of where each section lies, e.g. '>Contig999',
    # and then, using the gold resultset, I fetch each
    # section from the qual file as needed.
    # This creates the simple output format needed by wigEncode.
    # I later broke the program into 3 stages for convenience and speed.

    # run on hgwdev for the sql connection; this just dumps a simple query of db.gold.
    # it is not super fast because I removed some indexes,
    # but it's still less than a minute
    loadQuality getGold bosTau2 gold.tab

    ssh kkstore02
    cd /cluster/data/bosTau2/bed/quality
    # run this on kkstore02 where it is fast, and we won't ever have to run the
    # index step again; takes about 3 minutes to read and index a 7GB quality file
    loadQuality indexQuality ../../downloads/contigs/Btau20050310.contigs.fa.qual qual.index
    # run this on kkstore02 where it is fast (less than an hour!)
    loadQuality build gold.tab qual.index ../../downloads/contigs/Btau20050310.contigs.fa.qual quality.out > quality.log
    # Inspect quality.log: only expected things like chrM and chrBin0
    # should have a message saying they are missing.

    # wigEncode
    wigEncode quality.out quality.wig quality.wib
    # Converted quality.out, upper limit 90.00, lower limit 0.00

    # load
    ssh hgwdev
    cd /cluster/data/bosTau2/bed/quality
    ln -s /cluster/data/bosTau2/bed/quality/quality.wib /gbdb/bosTau2/wib/
    hgLoadWiggle -pathPrefix=/gbdb/bosTau2/wib bosTau2 quality quality.wig
    # loaded with no error messages

    # Hiram and I spot-checked several contigs for the qual data,
    # both full contig forward and reverse strand, and one reptig that
    # had fragStart=781 on the - strand, and they all were perfect matches
    # for the track data using hgTables.  Looks good!

# MAKE Human Proteins track (hg17 DONE 2005-09-08 galt)
    ssh kkstore02
    mkdir /cluster/data/bosTau2/blastDb
    cd /cluster/data/bosTau2/blastDb
    #old: faSplit sequence ../downloads/linearScaffolds/Btau20040927-freeze-assembly.fa 500 x
    faSplit sequence ../downloads/UCSC_Cow2.0.fa 500 x

    # find big files over 10M to split
    du -ka | grep '[.]fa$' | gawk '{if ($1 > 10000) print $2}' > bigfiles

    # splitBig.csh:
#!/bin/tcsh -ef
foreach b (`cat bigfiles`)
    echo "splitting $b"
    set contig = $b:t:r
    echo $contig
    faSplit gap $contig.fa 10000000 ${contig}_ -lift=$contig.lft -minGapSize=100
end

    chmod +x splitBig.csh
    splitBig.csh
    # I will use the scaffolds/ dir, which has this stuff already split up for me.

    # use formatdb to prepare for blastall
    # prepForBlast.csh:
#!/bin/tcsh -ef
foreach f (*.fa)
    # big files already split have a .lft, so skip the big ones
    if (! -e $f:r.lft) then
        /projects/compbio/bin/i686/formatdb -i $f -p F
    endif
end

    chmod +x prepForBlast.csh
    prepForBlast.csh
    # takes about 5 min
    #[kkstore02:/cluster/data/bosTau2/blastDb> ls | wc -l
    #2627
    #[kkstore02:/cluster/data/bosTau2/blastDb> ls | grep '[.]fa[.]nsq$' | wc -l
    #640
    #[kkstore02:/cluster/data/bosTau2/blastDb> ls | grep '[.]fa[.]nhr$' | wc -l
    #640
    #[kkstore02:/cluster/data/bosTau2/blastDb> ls | grep '[.]fa[.]nin$' | wc -l
    #640
    rm *.log
    rm *.fa

    mkdir /cluster/data/bosTau2/bed/tblastn.hg17KG
    ssh kkr1u00
    cp -r /cluster/data/bosTau2/blastDb/ /iscratch/i/bosTau2/
    ls /iscratch/i/bosTau2/blastDb/*.nsq | sed "s/\.nsq//" > /cluster/data/bosTau2/bed/tblastn.hg17KG/query.lst
    (iSync) > ~/bosTau2sync.out
    # took a few hours
    # Hiram now says not to bother with iSync, it's too slow.
    # Better to just run rsync on the directory you need, 8x.
    # See one of his more recent makedocs for an example.
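    # A sketch of the rsync alternative Hiram mentions (assuming the eight
    # iscratch hosts are kkr1u00 through kkr8u00):
    foreach n (1 2 3 4 5 6 7 8)
      rsync -a /cluster/data/bosTau2/blastDb/ kkr${n}u00:/iscratch/i/bosTau2/blastDb/
    end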
    # back to kkstore02
    exit

    # we want around 200,000 jobs
    cd /cluster/data/bosTau2/bed/tblastn.hg17KG
    calc `wc /cluster/data/hg17/bed/blat.hg17KG/hg17KG.psl | awk '{print $1}'`/\(200000/`wc query.lst | awk '{print $1}'`\)
    # 37365/(200000/640) = 119.6
    mkdir -p /cluster/bluearc/bosTau2/bed/tblastn.hg17KG/kgfa
    split -l 120 /cluster/data/hg17/bed/blat.hg17KG/hg17KG.psl /cluster/bluearc/bosTau2/bed/tblastn.hg17KG/kgfa/kg
    # split -l 120 created 312 psl files
    ln -s /cluster/bluearc/bosTau2/bed/tblastn.hg17KG/kgfa kgfa
    pushd kgfa
    # go compile kent/src/hg/pslxToFa on kolossus for a 64-bit util
    # this runs fast
    foreach i (*)
      pslxToFa $i $i.fa
      rm $i
    end
    popd
    ls -1S kgfa/*.fa > kg.lst
    mkdir -p /cluster/bluearc/bosTau2/bed/tblastn.hg17KG/blastOut
    ln -s /cluster/bluearc/bosTau2/bed/tblastn.hg17KG/blastOut
    #old: for i in (`cat kg.lst`) do; mkdir blastOut/`basename $i`; end
    foreach i (`cat kg.lst`)
      mkdir blastOut/$i:t:r
    end

    cat << '_EOF_' > blastGsub
#LOOP
blastSome $(path1) {check in line $(path2)} {check out exists blastOut/$(root2)/q.$(root1).psl }
#ENDLOOP
'_EOF_'

    cat << '_EOF_' > blastSome
#!/bin/sh
BLASTMAT=/iscratch/i/blast/data
export BLASTMAT
g=`basename $2`
f=/tmp/`basename $3`.$g
for eVal in 0.01 0.001 0.0001 0.00001 0.000001 1E-09 1E-11
do
    if /scratch/blast/blastall -M BLOSUM80 -m 0 -F no -e $eVal -p tblastn -d $1 -i $2 -o $f.8
    then
        mv $f.8 $f.1
        break;
    fi
done
if test -f $f.1
then
    if /cluster/bin/i386/blastToPsl $f.1 $f.2
    then
        liftUp -nosort -type=".psl" -pslQ -nohead $3.tmp /cluster/data/hg17/bed/blat.hg17KG/protein.lft warn $f.2
        if pslCheck -prot $3.tmp
        then
            mv $3.tmp $3
            rm -f $f.1 $f.2
        fi
        exit 0
    fi
fi
rm -f $f.1 $f.2 $3.tmp $f.8
exit 1
'_EOF_'

    chmod +x blastSome
    /cluster/bin/i386/gensub2 query.lst kg.lst blastGsub blastSpec
    wc -l blastSpec
    # 199680 blastSpec

    ssh kk
    cd /cluster/data/bosTau2/bed/tblastn.hg17KG
    para create blastSpec
    # from previous failed attempts, counts are:
    # galt 9 0 238 10 10 -1 /cluster/store11/bosTau2/bed/tblastn.hg17KG/
    para try
    para push
    # There were major power and cluster problems that delayed this batch.
    #[kk:/cluster/data/bosTau2/bed/tblastn.hg17KG> para check
    #199680 jobs in batch
    #0 jobs (including everybody's) in Parasol queue.
    #Checking finished jobs
    #ranOk: 199680
    #total jobs in batch: 199680
    #[kk:/cluster/data/bosTau2/bed/tblastn.hg17KG> para time
    #199680 jobs in batch
    #0 jobs (including everybody's) in Parasol queue.
    #Checking finished jobs
    #Completed: 199680 of 199680 jobs
    #CPU time in finished jobs:   95239579s 1587326.31m 26455.44h 1102.31d  3.020 y
    #IO & Wait Time:               4986306s   83105.10m  1385.09h   57.71d  0.158 y
    #Average job time:                 502s       8.37m     0.14h    0.01d
    #Longest running job:                0s       0.00m     0.00h    0.00d
    #Longest finished job:            4991s      83.18m     1.39h    0.06d
    #Submission to last job:        560248s    9337.47m   155.62h    6.48d

    # I have to do the lift now!
    # I also accidentally moved files with the same name together into one directory
    # while trying to do the liftUp, which wiped out one-third of my files;
    # those were redone with para recover blastSpec blastSpec2, etc.
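    # For the record, the recovery idiom used above (a sketch of the para
    # commands; the second batch name is arbitrary):
    para recover blastSpec blastSpec2   # write the unfinished jobs to blastSpec2
    para create blastSpec2
    para push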
    # Lift up the 10MB chunk psl's to chrom/scaffold level
    ssh kkstore02
    cd /cluster/data/bosTau2/bed/tblastn.hg17KG/blastOut

    # liftBig.csh:
#!/bin/tcsh -ef
set nonomatch = 1
foreach d ( */ )
    pushd $d
    set flist = ( *_*.psl )
    if ( "$flist" != "*_*.psl" ) then
        foreach f ( $flist )
            mv $f $f:r.qsl
        end
    endif
    foreach l ( /cluster/data/bosTau2/blastDb/*.lft )
        set contig = $l:t:r
        echo "lifting $d $contig"
        liftUp -nohead q.${contig}.psl $l warn q.${contig}_*.qsl >> lift.log
    end
    popd
end

    chmod +x liftBig.csh
    liftBig.csh >& liftBig.log
    # started at 5:30 pm
    # ended ?

    cat << '_EOF_' > chainGsub
#LOOP
chainSome $(path1)
#ENDLOOP
'_EOF_'

    cat << '_EOF_' > chainSome
(cd $1; cat q.*.psl | simpleChain -prot -outPsl -maxGap=150000 stdin ../c.`basename $1`.psl)
'_EOF_'

    chmod +x chainSome
    ls -1dS `pwd`/blastOut/kg?? > chain.lst
    gensub2 chain.lst single chainGsub chainSpec

    ssh kki
    cd /cluster/data/bosTau2/bed/tblastn.hg17KG
    para create chainSpec
    para push
    # Completed: 312 of 312 jobs
    # CPU time in finished jobs:     198486s    3308.09m    55.13h    2.30d  0.006 y
    # IO & Wait Time:                 20318s     338.64m     5.64h    0.24d  0.001 y
    # Average job time:                 701s      11.69m     0.19h    0.01d
    # Longest running job:                0s       0.00m     0.00h    0.00d
    # Longest finished job:            6778s     112.97m     1.88h    0.08d
    # Submission to last job:         99549s    1659.15m    27.65h    1.15d

    # actually, I had run into errors where there weren't 21 words per line in the psl files:
    #foreach d ( */ )
    #  cd $d
    #  ls -1 *.[pq]sl | xargs -iX awk '{if (NF != 21) print ENVIRON["PWD"] " " FILENAME " " NR " " NF}' X >> ../badPslList2
    #  cd ..
    #end
    #
    # and also problems with strange things that look like unlifted psl but are just junk,
    # due to cluster power outage crashes, I guess -- I found those and removed them:
    #   grep -c 'chr[0-9][0-9]_' *.psl > problems
    #
    # I deleted the problem files, did a para recover, and re-ran the missing bits.
    # Finally I got it to complete with no errors.

    exit    # back to kkstore02
    pushd /cluster/data/bosTau2/bed/tblastn.hg17KG/blastOut
    foreach i ( kg?? )
      awk '($13 - $12)/$11 > 0.60 {print}' c.$i.psl > c60.$i.psl
      sort -rn c60.$i.psl | pslUniq stdin u.$i.psl
      awk '$1 / $11 > 0.60 {print}' c60.$i.psl > m60.$i.psl
      echo $i
    end
    cat u.*.psl m60* | sort -T /tmp -k 14,14 -k 16,16n -k 17,17n | uniq \
      > /cluster/data/bosTau2/bed/tblastn.hg17KG/blastHg17KG.psl
    popd

    ssh hgwdev
    cd /cluster/data/bosTau2/bed/tblastn.hg17KG
    hgLoadPsl bosTau2 blastHg17KG.psl

    # back to kksilo
    rm -rf blastOut
    # TODO: this is not done yet; waiting to see if it is needed for debugging

    # search not working for all expected terms; sent a question to braney
    #On Fri, 9 Sep 2005, Brian Raney wrote:
    #
    #> Hey Galt,
    #>
    #> I checked in some search rules for blastHg17KG in
    #> trackDb/cow/bosTau2/trackDb.ra .
    #>
    #> brian
    #
    # the main difference seems to be:
    #   xrefTable hg17.blastKGRef01 (in the main trackDb.ra)
    # and
    #   xrefTable hg17.blastKGRef02 (in the bosTau2 trackDb.ra that you added)
    #
    # I see however that searches are now working much better.
    #   -Galt

# End tblastn
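# For reference, the awk filters above use standard psl columns
# ($1=matches, $11=qSize, $12=qStart, $13=qEnd), so:
#   ($13 - $12)/$11 > 0.60  keeps alignments spanning >60% of the query protein
#   $1/$11 > 0.60           keeps alignments whose matches cover >60% of the query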
# MAKE SPLIT MASK2 FA (DONE 2005-09-08 galt) (for CpG Islands)
    ssh kkstore02
    cd /cluster/data/bosTau2
    mkdir softmask-split
    cd softmask-split
    faSplit sequence ../bosTau2.softmask2.fa 100 x

# LOAD CPGISLANDS (DONE, 2005-09-08, galt)
    ssh hgwdev
    mkdir -p /cluster/data/bosTau2/bed/cpgIsland
    cd /cluster/data/bosTau2/bed/cpgIsland
    # Build software from Asif Chinwalla (achinwal@watson.wustl.edu)
    cvs co hg3rdParty/cpgIslands
    cd hg3rdParty/cpgIslands
    make
    mv cpglh.exe /cluster/data/bosTau2/bed/cpgIsland/
    ssh kkstore02
    cd /cluster/data/bosTau2/bed/cpgIsland
    foreach f (../../softmask-split/*.fa)
      set fout=$f:t:r:r.cpg
      echo running cpglh on $f to $fout
      ./cpglh.exe $f > $fout
    end

    # Transform cpglh output to bed +
    cat << '_EOF_' > filter.awk
/* Input columns: */
/* chrom, start, end, len, CpG: cpgNum, perGc, cpg:gpc, observed:expected */
/* chr1\t 41776\t 42129\t 259\t CpG: 34\t 65.8\t 0.92\t 0.94 */
/* Output columns: */
/* chrom, start, end, name, length, cpgNum, gcNum, perCpg, perGc, obsExp */
/* chr1\t41775\t42129\tCpG: 34\t354\t34\t233\t19.2\t65.8\t0.94 */
{
$2 = $2 - 1;
width = $3 - $2;
printf("%s\t%d\t%s\t%s %s\t%s\t%s\t%0.0f\t%0.1f\t%s\t%s\n",
       $1, $2, $3, $5, $6, width,
       $6, width*$7*0.01, 100.0*2*$6/width, $7, $9);
}
'_EOF_'
    # << this line makes emacs coloring happy
    awk -f filter.awk x*.cpg > cpgIsland.bed

    # load into database:
    ssh hgwdev
    cd /cluster/data/bosTau2/bed/cpgIsland
    hgLoadBed bosTau2 cpgIslandExt -tab -noBin \
      -sqlTable=$HOME/kent/src/hg/lib/cpgIslandExt.sql cpgIsland.bed
    #Loaded 108010 elements of size 10
    wc -l cpgIsland.bed
    # 108010 cpgIsland.bed
    featureBits bosTau2 cpgIslandExt
    # 69207600 bases of 2812203870 (2.461%) in intersection

# CHROMOSOMES DOWNLOAD DIRECTORY (DONE 2005-09-26 galt)
    ssh kkstore02
    cd /cluster/data/bosTau2
    mkdir chromosomes
    cd chromosomes
    faFilter -name="scaffold*" ../bosTau2.softmask2.fa scaffolds.fa
    /cluster/home/galt/bin/i386/faFilter -v -name="scaffold*" ../bosTau2.softmask2.fa nonscaffolds.fa
    faSplit byname nonscaffolds.fa ./
    rm nonscaffolds.fa
    gzip *.fa
    md5sum *.gz > md5sum.txt
    ssh hgwdev
    cd /usr/local/apache/htdocs/goldenPath/bosTau2/
    mv /cluster/data/bosTau2/chromosomes/ .
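    # A quick post-move integrity check on the download files (a sketch):
    cd /usr/local/apache/htdocs/goldenPath/bosTau2/chromosomes
    md5sum -c md5sum.txt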
# LOAD GENEID PREDICTIONS (DONE galt 2005-10-19)
    ssh kksilo
    mkdir /cluster/data/bosTau2/bed/geneid
    cd /cluster/data/bosTau2/bed/geneid
    {
    rm -f bosTau2.gtf
    rm -f bosTau2.prot
    set base = 'http://genome.imim.es/genepredictions/B.taurus/golden_path_200503/geneidv1.2/'
    foreach f ( `htmlCheck getLinks $base | grep '[.]gtf$'` )
      echo $f
      set url = "$base$f"
      echo "$url"
      wget $url
      set url = "$base$f:r.prot"
      echo "$url"
      wget $url
    end
    cat {c,s}*.gtf > bosTau2.gtf
    cat {c,s}*.prot > bosTau2.prot
    }
    # Add a ".1" suffix to each item in the .prot's, to match the transcript_id's in the gtf
    perl -wpe 's/^(>scaffold\S+|>chr\S+)/$1.1/' bosTau2.prot > geneid.fa

    ssh hgwdev
    cd /cluster/data/bosTau2/bed/geneid
    ldHgGene -gtf -genePredExt bosTau2 geneid bosTau2.gtf
    #Reading bosTau2.gtf
    #Read 98183 transcripts in 399106 lines in 1 files
    #  98183 groups 65717 seqs 1 sources 3 feature types
    #98183 gene predictions
    hgPepPred bosTau2 generic geneidPep geneid.fa
    #Processing geneid.fa

# LOAD SGPGENE PREDICTIONS (DONE galt 2005-12-14)
    ssh kksilo
    mkdir /cluster/data/bosTau2/bed/sgpGene
    cd /cluster/data/bosTau2/bed/sgpGene
    {
    rm -f bosTau2.gtf
    rm -f bosTau2.prot
    set base = 'http://genome.imim.es/genepredictions/B.taurus/golden_path_200503/SGP/mm6/'
    foreach f ( `htmlCheck getLinks $base | grep '[.]gtf$'` )
      echo $f
      set url = "$base$f"
      echo "$url"
      wget $url
      set url = "$base$f:r.prot"
      echo "$url"
      wget $url
    end
    cat {c,s}*.gtf > bosTau2.gtf
    cat {c,s}*.prot > bosTau2.prot
    }
    # Add a ".1" suffix to each item in the .prot's, to match the transcript_id's in the gtf
    perl -wpe 's/^(>scaffold\S+|>chr\S+)/$1.1/' bosTau2.prot > sgpGene.fa

    ssh hgwdev
    cd /cluster/data/bosTau2/bed/sgpGene
    ldHgGene -gtf -genePredExt bosTau2 sgpGene bosTau2.gtf
    #Reading bosTau2.gtf
    #Read 44919 transcripts in 284636 lines in 1 files
    #  44919 groups 14813 seqs 1 sources 3 feature types
    #44919 gene predictions
    hgPepPred bosTau2 generic sgpGenePep sgpGene.fa
    #Processing sgpGene.fa

# SGP GENES (UPDATE 1/18/2006)
    # sgpPep table dropped, replaced by hgc-generated protein sequence in the browser

############################################################################
## BLASTZ swap from mm8 alignments (DONE - 2006-02-18 - Hiram)
    ssh pk
    cd /cluster/data/mm8/bed/blastzBosTau.2006-02-18
    time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
      -swap -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
      `pwd`/DEF > swap.out 2>&1 &
    time nice -n +19 featureBits bosTau2 chainMm8Link
    # 683178156 bases of 2812203870 (24.293%) in intersection

############################################################################
# SWAP CHAINS/NET RN4 (DONE 4/2/06 angie)
    ssh kkstore02
    mkdir /cluster/data/bosTau2/bed/blastz.rn4.swap
    cd /cluster/data/bosTau2/bed/blastz.rn4.swap
    doBlastzChainNet.pl -swap /cluster/data/rn4/bed/blastz.bosTau2/DEF \
      -workhorse kkr8u00 >& do.log &
    tail -f do.log
    ln -s blastz.rn4.swap /cluster/data/bosTau2/bed/blastz.rn4
############################################################################
## BLASTZ swap from hg17 alignments (DONE - 2006-03-30 - 2006-04-04 - Hiram)
# LIFTOVER BOSTAU2HG17
    # swapping to get the liftOver file in the other direction
    ssh pk
    mkdir /cluster/data/bosTau2/bed/blastz.hg17.swap
    cd /cluster/data/bosTau2/bed
    ln -s blastz.hg17.swap blastz.hg17
    cd blastz.hg17.swap
    time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
      -swap -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
      /cluster/data/hg17/bed/blastz.bosTau2.2005-12-19/DEF > swap.out 2>&1 &
    # manually cleaned this up, since the run failed during the MySQL
    # load due to out-of-space problems.  These tables do not need to
    # be loaded anyway.
    ssh kkstore02
    cd /cluster/data/bosTau2/bed/blastz.hg17.swap
    rm -fr psl/
    rm -fr axtChain/run/chain/
    rm -f axtChain/noClass.net
    rm -fr axtChain/net/
    rm -fr axtChain/chain/

############################################################################
## BLASTZ swap from canFam2 alignments (DONE 2006-05-04 markd)
    ssh pk
    # not sure why this was done
    mkdir /cluster/data/bosTau2/bed/blastz.canFam2.swap
    cd /cluster/data/bosTau2/bed
    ln -s blastz.canFam2.swap blastz.canFam2
    cd blastz.canFam2.swap
    time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
      -swap -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
      /cluster/data/canFam2/bed/blastz.bosTau2/DEF >& swap.out
    # should have specified -stop=net, so kill hgLoadChain
    # create the net files
    ssh kolossus
    ./netChains.csh
    ssh hgwdev
    cd /cluster/data/bosTau2/bed/blastz.canFam2.swap/axtChain
    nice netClass -verbose=0 -noAr noClass.net bosTau2 canFam2 bosTau2.canFam2.net
    nice gzip bosTau2.canFam2.net

##########################################################################
# GenBank gbMiscDiff table (markd 2007-01-10)
    # Supports the `NCBI Clone Validation' section of the mgcGenes details page.
    # genbank release 157.0 now contains misc_diff fields for MGC clones;
    # reloading the mRNAs results in the gbMiscDiff table being created.
    ./bin/gbDbLoadStep -reload -srcDb=genbank -type=mrna bosTau2

###########################################################################
# HUMAN (hg18) PROTEINS TRACK (DONE braney 2006-06-16)
    ssh kkstore02
    bash    # if not using bash shell already
    # we can use the blastDb directory set up for hg17 proteins
    cd /cluster/data/bosTau2/blastDb
    mkdir -p /san/sanvol1/scratch/bosTau2/blastDb
    cd /cluster/data/bosTau2/blastDb
    for i in nhr nin nsq; do
        echo $i
        cp *.$i /san/sanvol1/scratch/bosTau2/blastDb
    done

    mkdir -p /cluster/data/bosTau2/bed/tblastn.hg18KG
    cd /cluster/data/bosTau2/bed/tblastn.hg18KG
    echo /san/sanvol1/scratch/bosTau2/blastDb/*.nsq | xargs ls -S | sed "s/\.nsq//" > query.lst
    wc -l query.lst
    # 640 query.lst

    # we want around 350000 jobs
    calc `wc /cluster/data/hg18/bed/blat.hg18KG/hg18KG.psl | awk "{print \\\$1}"`/\(350000/`wc query.lst | awk "{print \\\$1}"`\)
    # 36727/(350000/640) = 67.157943
    mkdir -p /cluster/bluearc/bosTau2/bed/tblastn.hg18KG/kgfa
    split -l 67 /cluster/data/hg18/bed/blat.hg18KG/hg18KG.psl /cluster/bluearc/bosTau2/bed/tblastn.hg18KG/kgfa/kg
    ln -s /cluster/bluearc/bosTau2/bed/tblastn.hg18KG/kgfa kgfa
    cd kgfa
    for i in *; do nice pslxToFa $i $i.fa; rm $i; done
    cd ..
    ls -1S kgfa/*.fa > kg.lst
    mkdir -p /cluster/bluearc/bosTau2/bed/tblastn.hg18KG/blastOut
    ln -s /cluster/bluearc/bosTau2/bed/tblastn.hg18KG/blastOut
    for i in `cat kg.lst`; do mkdir blastOut/`basename $i .fa`; done

    tcsh
    cd /cluster/data/bosTau2/bed/tblastn.hg18KG
    cat << '_EOF_' > blastGsub
#LOOP
blastSome $(path1) {check in line $(path2)} {check out exists blastOut/$(root2)/q.$(root1).psl }
#ENDLOOP
'_EOF_'

    cat << '_EOF_' > blastSome
#!/bin/sh
BLASTMAT=/cluster/bluearc/blast229/data
export BLASTMAT
g=`basename $2`
f=/tmp/`basename $3`.$g
for eVal in 0.01 0.001 0.0001 0.00001 0.000001 1E-09 1E-11
do
    if /cluster/bluearc/blast229/blastall -M BLOSUM80 -m 0 -F no -e $eVal -p tblastn -d $1 -i $2 -o $f.8
    then
        mv $f.8 $f.1
        break;
    fi
done
if test -f $f.1
then
    if /cluster/bin/i386/blastToPsl $f.1 $f.2
    then
        liftUp -nosort -type=".psl" -nohead $f.4 /cluster/data/bosTau2/blastDb.lft carry $f.2
        liftUp -nosort -type=".psl" -pslQ -nohead $3.tmp /cluster/data/hg18/bed/blat.hg18KG/protein.lft warn $f.4
        if pslCheck -prot $3.tmp
        then
            mv $3.tmp $3
            rm -f $f.1 $f.2 $f.3 $f.4
        fi
        exit 0
    fi
fi
rm -f $f.1 $f.2 $3.tmp $f.8 $f.3 $f.4
exit 1
'_EOF_'
    # << happy emacs

    chmod +x blastSome
    gensub2 query.lst kg.lst blastGsub blastSpec
    exit    # back to bash

    ssh pk
    cd /cluster/data/bosTau2/bed/tblastn.hg18KG
    para create blastSpec
    # para try, check, push, check etc.
    para time
    # Completed: 290218 of 351360 jobs
    # Crashed: 61142 jobs
    # CPU time in finished jobs:   25182187s  419703.12m  6995.05h  291.46d  0.799 y
    # IO & Wait Time:               2666154s   44435.89m   740.60h   30.86d  0.085 y
    # Average job time:                  96s       1.60m     0.03h    0.00d
    # Longest finished job:            1106s      18.43m     0.31h    0.01d
    # Submission to last job:        296737s    4945.62m    82.43h    3.43d

    # (second para time, after the crashed jobs were recovered and re-pushed:)
    # Completed: 61142 of 61142 jobs
    # CPU time in finished jobs:    4075252s   67920.86m  1132.01h   47.17d  0.129 y
    # IO & Wait Time:                383494s    6391.57m   106.53h    4.44d  0.012 y
    # Average job time:                  73s       1.22m     0.02h    0.00d
    # Longest finished job:             389s       6.48m     0.11h    0.00d
    # Submission to last job:         27257s     454.28m     7.57h    0.32d

    ssh kkstore02
    cd /cluster/data/bosTau2/bed/tblastn.hg18KG
    mkdir chainRun
    cd chainRun
    tcsh
    cat << '_EOF_' > chainGsub
#LOOP
chainOne $(path1)
#ENDLOOP
'_EOF_'

    cat << '_EOF_' > chainOne
(cd $1; cat q.*.psl | simpleChain -prot -outPsl -maxGap=150000 stdin /cluster/bluearc/bosTau2/bed/tblastn.hg18KG/blastOut/c.`basename $1`.psl)
'_EOF_'
    exit
    chmod +x chainOne
    ls -1dS /cluster/bluearc/bosTau2/bed/tblastn.hg18KG/blastOut/kg?? > chain.lst
    gensub2 chain.lst single chainGsub chainSpec

    # do the cluster run for chaining
    ssh kk
    cd /cluster/data/bosTau2/bed/tblastn.hg18KG/chainRun
    para create chainSpec
    para maxNode 30
    # para try, check, push, check etc.
    # Completed: 549 of 549 jobs
    # CPU time in finished jobs:     923373s   15389.55m   256.49h   10.69d  0.029 y
    # IO & Wait Time:                 35341s     589.01m     9.82h    0.41d  0.001 y
    # Average job time:                1746s      29.10m     0.49h    0.02d
    # Longest finished job:           49684s     828.07m    13.80h    0.58d
    # Submission to last job:         57041s     950.68m    15.84h    0.66d

    ssh kkstore02
    cd /cluster/data/bosTau2/bed/tblastn.hg18KG/blastOut
    bash    # if using another shell
    for i in kg??
    do
        cat c.$i.psl | awk "(\$13 - \$12)/\$11 > 0.6 {print}" > c60.$i.psl
        sort -rn c60.$i.psl | pslUniq stdin u.$i.psl
        awk "((\$1 / \$11) ) > 0.60 { print }" c60.$i.psl > m60.$i.psl
        echo $i
    done
    sort -T /tmp -k 14,14 -k 16,16n -k 17,17n u.*.psl m60* | uniq > /cluster/data/bosTau2/bed/tblastn.hg18KG/blastHg18KG.psl
    cd ..
    pslCheck blastHg18KG.psl
    # this is ok.
    # load table
    ssh hgwdev
    cd /cluster/data/bosTau2/bed/tblastn.hg18KG
    hgLoadPsl bosTau2 blastHg18KG.psl

    # check coverage
    featureBits bosTau2 refGene:cds blastHg18KG -enrichment
    # refGene:cds 0.274%, blastHg18KG 0.966%, both 0.230%, cover 83.83%, enrich 86.80x

    ssh kkstore02
    rm -rf /cluster/data/bosTau2/bed/tblastn.hg18KG/blastOut
    rm -rf /cluster/bluearc/bosTau2/bed/tblastn.hg18KG/blastOut
# end tblastn

##########################################################################
# LIFTOVER TO BOSTAU3 (DONE 8/17/07 angie)
    ssh kolossus
    # machine on which script is run doesn't really matter
    mkdir /cluster/data/bosTau2/bed/blat.bosTau3.2007-08-16
    cd /cluster/data/bosTau2/bed/blat.bosTau3.2007-08-16
    doSameSpeciesLiftOver.pl bosTau2 bosTau3 >& do.log &
    tail -f do.log
##########################################################################
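    # Downstream users lift annotations with the standard liftOver idiom
    # (a sketch; the myAnnots files are hypothetical, and the over.chain path
    # is the conventional location, not verified here):
    liftOver myAnnots.bosTau2.bed \
      /gbdb/bosTau2/liftOver/bosTau2ToBosTau3.over.chain.gz \
      myAnnots.bosTau3.bed unMapped.bed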