# for emacs: -*- mode: sh; -*- # Bos Taurus -- Baylor Release 3.1 (August 2006) ######################################################################### # DOWNLOAD SEQUENCE (DONE Sept. 21, 2006 heather) ssh kkstore05 mkdir /cluster/store12/bosTau3 ln -s /cluster/store12/bosTau3 /cluster/data/bosTau3 mkdir /cluster/data/bosTau3/baylor cd /cluster/data/bosTau3/baylor wget ftp://ftp.hgsc.bcm.tmc.edu/pub/data/Btaurus/fasta/Btau20060815-freeze/ReadMeBovine.3.1.txt wget ftp://ftp.hgsc.bcm.tmc.edu/pub/data/Btaurus/fasta/Btau20060815-freeze/contigs/* wget ftp://ftp.hgsc.bcm.tmc.edu/pub/data/Btaurus/fasta/Btau20060815-freeze/markers/* mkdir chroms cd chroms wget ftp://ftp.hgsc.bcm.tmc.edu/pub/data/Btaurus/fasta/Btau20060815-freeze/linearScaffolds/* NOTE: chrM is available from NCBI nucleotides NC_006853. I didn't check for it soon enough to include it here, but it should be included in bosTau4. ######################################################################### # MAKE GENOME DB (September 2006 Heather) # I got this right after a few attempts. # There were 3 things I needed to change in the files from Baylor: # I put the edits into /cluster/data/bosTau3/fixup. # # 1) Their AGP file used capitalized "Chr". This is not recommended for us. ssh kkstore05 cd /cluster/data/bosTau3 sed -e 's/Chr/chr/' baylor/Btau20060815.agp > fixup/UCSC.agp # 2) Their quality scores were reported by scaffold, not by chrom. # Convert to qac (UCSC compressed qual score format) and lift from AGP. zcat baylor/Btau20060815.contigs.fa.qual.gz | qaToQac stdin fixup/scaffolds.qac qacAgpLift fixup/UCSC.agp fixup/scaffolds.qac fixup/chrom.qac # 3) They used a ".1" suffix in the contig and quality files. # An example line from the contig.fa file: # >gi|112123023|gb|AAFC03055291.1| Bos taurus Ctg58.CH240-395G16, whole genome shotgun sequence # We need just # >AAFC03055291 # Fix this with variations on a simple C program: #include "common.h" #include "linefile.h" void doTrimHeader(char *inputFileName) { FILE *outputFileHandle = mustOpen("trimHeader.out", "w"); struct lineFile *lf = lineFileOpen(inputFileName, TRUE); char *line; int lineSize; char *row[5], *contigId[2]; while (lineFileNext(lf, &line, &lineSize)) { if (line[0] != '>') { fprintf(outputFileHandle, "%s\n", line); continue; } if (fileType == "contig") { chopString(line, "|", row, ArraySize(row)); chopString(row[3], ".", contigId, ArraySize(contigId)); fprintf(outputFileHandle, ">%s\n", contigId[0]); } else // qual scores { chopString(line, ".", contigId, ArraySize(row)); fprintf(outputFileHandle, "%s\n", contigId[0]); } } carefulClose(&outputFileHandle); lineFileClose(&lf); } # At this point, attempt to run makeGenomeDb.pl: cat > bosTau3.config.ra <& makeGenomeDb.log & tail -f makeGenomeDb.log # This fails as described in the log file: # Error: IDs in fastaFiles # do not perfectly match IDs in either the first or sixth columns of agpFiles # Please examine and then remove these temporary files: # /tmp/makeGenomeDb.fastaIds.W10474 -- fasta IDs # /tmp/makeGenomeDb.agpIds.E10485 -- AGP first column IDs ('big' sequences) # /tmp/makeGenomeDb.agpIds.T10489 -- AGP sixth column IDs ('little' sequences) # Command failed: # ssh -x kkr6u00 nice /cluster/data/bosTau3/jkStuff/makeUnmasked2bit.csh # Move tmp files to kkr6u00:/scratch/heather/bosTau3 # Create diff.clean which has 108 IDs # Write to Kim Worley (kworley@bcm.tmc.edu) # Response from Kim on Sept. 22, 2006: # Those are sequences that were intentionally omitted from the AGP, because they # were deemed to be contaminants after they were accessioned. # Please omit them from your assembly. # We can use faSomeRecords to get rid of the bad IDs. # First generate a sorted list of just the IDs in fixup/3.agp cd /cluster/data/bosTau3/fixup zcat trimHeader.contigs.fa.gz | faSomeRecords stdin 3.agp UCSC.bosTau3.fa gzip fixup/UCSC.bosTau3.fa zcat trimHeader.qual.gz | faSomeRecords stdin 3.agp UCSC.qual gzip fixup/UCSC.qual # Edit bosTau3.config.ra # fastaFiles /cluster/data/bosTau3/fixup/UCSC.bosTau3.fa.gz # qualFiles /cluster/data/bosTau3/fixup/UCSC.qual.gz # Drop database ssh hgwdev hgsql -e 'drop database bosTau3' mysql # Backup files and rerun ssh kkstore02 cd /cluster/data/bosTau3 mv chrom.sizes chrom.sizes.1 mv bosTau3.agp bosTau3.agp.1 mv makeGenomeDb.log makeGenomeDb.log.1 /cluster/home/angie/kent/src/utils/makeGenomeDb.pl bosTau3.config.ra > & makeGenomeDb.log & ######################################################################### # REPEATMASKER (DONE Oct 5, REDONE Oct 16, 2006 heather) ssh kkstore05 cd /cluster/data/bosTau3/bed # Run -debug to create the dir structure and preview the scripts: ~/kent/src/utils/doRepeatMasker.pl bosTau3 -verbose 3 -debug # Run it for real and tail the log: ~/kent/src/utils/doRepeatMasker.pl bosTau3 -verbose 3 \ >& RepeatMasker.2006-10-05/do.log & tail -f RepeatMasker.2006-10-05/do.log # RepeatMasker and lib version from do.log: # March 20 2006 (open-3-1-5) version of RepeatMasker #CC RELEASE 20060315; * # Compare coverage to previous assembly: featureBits bosTau3 rmsk # 1200525422 bases of 2731807384 (43.946%) in intersection featureBits bosTau2 rmsk # 1291561904 bases of 2812203870 (45.927%) in intersection # Oct 16 # October 6 2006 (open-3-1-6) version of RepeatMasker #CC RELEASE 20061006; * featureBits bosTau3 rmsk # 1270629763 bases of 2731807384 (46.512%) in intersection ######################################################################### # SIMPLE REPEATS (TRF) (Oct 2006/Feb 2007 heather) # look for an idle small cluster machine ssh -x kki parasol list machines | grep idle ssh kkr1u00 nice tcsh mkdir /cluster/data/bosTau3/bed/simpleRepeat cd /cluster/data/bosTau3/bed/simpleRepeat twoBitToFa ../../bosTau3.unmasked.2bit stdout \ | trfBig -trf=/cluster/bin/i386/trf stdin /dev/null \ -bedAt=simpleRepeat.bed -tempDir=/tmp \ >& trf.log & tail -f trf.log # Make a filtered version for sequence masking: awk '{if ($5 <= 12) print;}' simpleRepeat.bed > trfMask.bed splitFileByColumn trfMask.bed trfMaskChrom # Load unfiltered repeats into the database: ssh hgwdev hgLoadBed bosTau3 simpleRepeat \ /cluster/data/bosTau3/bed/simpleRepeat/simpleRepeat.bed \ -sqlTable=$HOME/kent/src/hg/lib/simpleRepeat.sql # 878 warnings # Compare coverage to previous assembly: featureBits bosTau3 simpleRepeat # 22475721 bases of 2731807384 (0.823%) in intersection featureBits bosTau2 simpleRepeat # 43628019 bases of 2812203870 (1.551%) in intersection # try again, spliting the fa files first this time # 13k files in one directory, seems okay... ssh kkstore05 cd /cluster/data/bosTau3 mkdir split cd split twoBitToFa bosTau3.unmasked.2bit bosTau3.unmasked.fa faSplit byname bosTau3.unmasked.fa . rm bosTau3.unmasked.fa cd /cluster/data/bosTau3/bed mkdir simpleRepeat4 cd simpleRepeat4 ./makeJobList-trf.csh # split into regular chroms and chrUn # regular chroms took about 90 minutes ./trf-run.csh > & ! trf.log & # chrUn took about 2 hours ./trf-run-chrUn.csh > & ! trfUn.log & ./concat.csh ./concatUn.csh # non chrUn 235926 # chrUn 54713 cp trf.bed all.bed cat trfUn.bed >> all.bed hgLoadBed bosTau3 simpleRepeat all.bed -sqlTable=$HOME/kent/src/hg/lib/simpleRepeat.sql featureBits bosTau3 simpleRepeat # 22475721 bases of 2731807384 (0.823%) in intersection # Same result as first run! Must be right # Not sure why it is less than bosTau2, maybe just the assembly is better # Make a filtered version for sequence masking: awk '{if ($5 <= 12) print;}' all.bed > trfMask.bed splitFileByColumn trfMask.bed trfMaskChrom # cleanup cd /cluster/data/bosTau3/bed mv simpleRepeat simpleRepeatOct13 mv simpleRepeat2 simpleRepeatOct14 mv simpleRepeat3 simpleRepeatFeb23 mv simpleRepeat4 simpleRepeat ######################################################################### # WINDOWMASKER (TESTING Nov 16, 2006 Andy) ssh hgwdev screen time ~/kent/src/hg/utils/automation/doWindowMasker.pl \ -workhorse kolossus bosTau3 >& bosTau3.wmsk.log # Note: windowmasker uses the internet so -workhorse is basically a # required option. # Cntrl+A+D then tail -f on the bosTau3.wmsk.log and keep an eye on it. # Log into kolossus and run top. Again, keep an eye on it. ######################################################################### # ooc file # use same repMatch value as bosTau2 ssh kkr3u00 cd /cluster/data/bosTau3 mkdir bed/ooc cd bed/ooc ls -1 /cluster/data/bosTau3/bosTau3.2bit > nib.lst blat nib.lst /dev/null /dev/null -tileSize=11 -makeOoc=/cluster/bluearc/bosTau3/11.ooc -repMatch=1005 ssh kkr1u00 cd /iscratch/i/bosTau3 cp -p /cluster/bluearc/bosTau3/11.ooc . # alternate to iSync suggested by Hiram /bin/bash for R in 2 3 4 5 6 7 do rsync -a --progress ./ kkr${R}u00:/iscratch/i/bosTau3 done ######################################################################### # MASK SEQUENCE WITH FILTERED TRF IN ADDITION TO RM (Heather, Feb. 2007) ssh kolossus cd /cluster/data/bosTau3 time twoBitMask bosTau3.rmsk.2bit -add bed/simpleRepeat/trfMask.bed bosTau3.2bit # Warning: BED file bed/simpleRepeat/trfMask.bed has >=13 fields which # means it might contain block coordinates, but this program uses only the # first three fields (the entire span -- no support for blocks). # 2.870u 6.542s 0:21.16 44.4% 0+0k 0+0io 5pf+0w # Link to it from /gbdb: ln -s /cluster/data/bosTau3/bosTau3.2bit /gbdb/bosTau3/bosTau3.2bit # Ask cluster-admin to bring up BLAT server echo 'INSERT INTO blatServers (db, host, port, isTrans, canPcr) VALUES ("bosTau3", "blat11", 17782, 1, 0); \ INSERT INTO blatServers (db, host, port, isTrans, canPcr) VALUES ("bosTau3", "blat11", 17783, 0, 1);' \ | hgsql -h genome-testdb hgcentraltest ######################################################################### # MAKE DOWNLOADABLE / GOLDENPATH FILES (Heather March 2007) cd /cluster/data/bosTau3 ln -s /cluster/data/bosTau3/bed/RepeatMasker.2006-10-16/bosTau3.fa.out /cluster/data/bosTau3 makeDownloads.pl bosTau3 -verbose=2 >& jkStuff/downloads.log & ######################################################################### # PUT MASKED SEQUENCE OUT FOR CLUSTER RUNS (Heather, Feb. 2007) # pitakluster: ssh pk mkdir /san/sanvol1/scratch/bosTau3 cp -p /cluster/data/bosTau3/bosTau3.2bit /san/sanvol1/scratch/bosTau3 cp -p /cluster/data/bosTau3/chrom.sizes /san/sanvol1/scratch/bosTau3 mkdir /san/sanvol1/scratch/bosTau3/rmsk cp -p /cluster/data/bosTau3/bosTau3.fa.out /san/sanvol1/scratch/bosTau3/rmsk # small cluster: ssh kkr1u00 mkdir -p /iscratch/i/bosTau3 cd /iscratch/i/bosTau3 cp -p /cluster/data/bosTau3/bosTau3.2bit . cp -p /cluster/data/bosTau3/chrom.sizes . cp -p /cluster/data/bosTau3/11.ooc . # alternate to iSync suggested by Hiram /bin/bash for R in 2 3 4 5 6 7 do rsync -a --progress ./ kkr${R}u00:/iscratch/i/bosTau3 done ######################################################################### # BLASTZ/CHAIN/NET HG18 (Done Feb 2007 heather) # Do target = hg18 first because bosTau3 is 13k scaffolds (documented in hg18.txt) # Then do swap mkdir /cluster/data/bosTau3/bed/blastz.hg18.2007-02-25 ln -s /cluster/data/bosTau3/bed/blastz.hg18.2007-02-25 /cluster/data/bosTau3/bed/blastz.hg18 cd /cluster/data/bosTau3/bed/blastz.hg18 doBlastzChainNet.pl /cluster/data/hg18/bed/blastz.bosTau3/DEF -swap >& do.log & featureBits bosTau3 chainHg18Link # 1378293354 bases of 2731807384 (50.454%) in intersection ######################################################################### # BLASTZ/CHAIN/NET MM8 (Done March 2007 heather) # Do target = mm8 first because bosTau3 is 13k scaffolds (documented in mm8.txt) # Then do swap mkdir /cluster/data/bosTau3/bed/blastz.mm8.2007-03-23 ln -s /cluster/data/bosTau3/bed/blastz.mm8.2007-03-23 /cluster/data/bosTau3/bed/blastz.mm8 cd /cluster/data/bosTau3/bed/blastz.mm8 doBlastzChainNet.pl /cluster/data/mm8/bed/blastz.bosTau3/DEF -swap >& do.log & featureBits bosTau3 chainMm8Link # 707594985 bases of 2731807384 (25.902%) in intersection ######################################################################### # BLASTZ/CHAIN/NET RN4 (Done March 2007 heather) # Do target = rn4 first because bosTau3 is 13k scaffolds (documented in rn4.txt) # Then do swap mkdir /cluster/data/bosTau3/bed/blastz.rn4.2007-03-26 ln -s /cluster/data/bosTau3/bed/blastz.rn4.2007-03-26 /cluster/data/bosTau3/bed/blastz.rn4 cd /cluster/data/bosTau3/bed/blastz.rn4 doBlastzChainNet.pl /cluster/data/rn4/bed/blastz.bosTau3/DEF -swap >& do.log & featureBits bosTau3 chainRn4Link # 666169091 bases of 2731807384 (24.386%) in intersection ########################################################################### # HUMAN (hg18) PROTEINS TRACK (DONE braney 2007-04-24) ssh kkstore05 # bash if not using bash shell already mkdir /cluster/data/bosTau3/blastDb cd /cluster/data/bosTau3 find split -name "c*" | xargs cat > temp.fa faSplit gap temp.fa 1000000 blastDb/x -lift=blastDb.lft rm temp.fa cd blastDb for i in *.fa do /cluster/bluearc/blast229/formatdb -i $i -p F done echo *.fa | xargs rm mkdir -p /san/sanvol1/scratch/bosTau3/blastDb cd /cluster/data/bosTau3/blastDb for i in nhr nin nsq; do echo $i for j in *.$i do cp $j /san/sanvol1/scratch/bosTau3/blastDb done done mkdir -p /cluster/data/bosTau3/bed/tblastn.hg18KG cd /cluster/data/bosTau3/bed/tblastn.hg18KG echo /san/sanvol1/scratch/bosTau3/blastDb/*.nsq | xargs ls -S | sed "s/\.nsq//" > query.lst wc -l query.lst # 16114 query.lst # we want around 350000 jobs calc `wc /cluster/data/hg18/bed/blat.hg18KG/hg18KG.psl | awk "{print \\\$1}"`/\(350000/`wc query.lst | awk "{print \\\$1}"`\) # 36727/(350000/16114) = 1690.911080 # 36727/(250000/16114) = 2367.275512 mkdir -p /cluster/bluearc/bosTau3/bed/tblastn.hg18KG/kgfa split -l 1690 /cluster/data/hg18/bed/blat.hg18KG/hg18KG.psl /cluster/bluearc/bosTau3/bed/tblastn.hg18KG/kgfa/kg ln -s /cluster/bluearc/bosTau3/bed/tblastn.hg18KG/kgfa kgfa cd kgfa for i in *; do nice pslxToFa $i $i.fa; rm $i; done cd .. ls -1S kgfa/*.fa > kg.lst mkdir -p /cluster/bluearc/bosTau3/bed/tblastn.hg18KG/blastOut ln -s /cluster/bluearc/bosTau3/bed/tblastn.hg18KG/blastOut for i in `cat kg.lst`; do mkdir blastOut/`basename $i .fa`; done tcsh cd /cluster/data/bosTau3/bed/tblastn.hg18KG cat << '_EOF_' > blastGsub #LOOP blastSome $(path1) {check in line $(path2)} {check out exists blastOut/$(root2)/q.$(root1).psl } #ENDLOOP '_EOF_' cat << '_EOF_' > blastSome #!/bin/sh BLASTMAT=/cluster/bluearc/blast229/data export BLASTMAT g=`basename $2` f=/tmp/`basename $3`.$g for eVal in 0.01 0.001 0.0001 0.00001 0.000001 1E-09 1E-11 do if /cluster/bluearc/blast229/blastall -M BLOSUM80 -m 0 -F no -e $eVal -p tblastn -d $1 -i $2 -o $f.8 then mv $f.8 $f.1 break; fi done if test -f $f.1 then if /cluster/bin/i386/blastToPsl $f.1 $f.2 then liftUp -nosort -type=".psl" -nohead $f.3 /cluster/data/bosTau3/blastDb.lft carry $f.2 liftUp -nosort -type=".psl" -pslQ -nohead $3.tmp /cluster/data/hg18/bed/blat.hg18KG/protein.lft warn $f.3 if pslCheck -prot $3.tmp then mv $3.tmp $3 rm -f $f.1 $f.2 $f.3 $f.4 fi exit 0 fi fi rm -f $f.1 $f.2 $3.tmp $f.8 $f.3 $f.4 exit 1 '_EOF_' # << happy emacs chmod +x blastSome gensub2 query.lst kg.lst blastGsub blastSpec exit ssh pk cd /cluster/data/bosTau3/bed/tblastn.hg18KG para create blastSpec # para try, check, push, check etc. # Completed: 354508 of 354508 jobs # CPU time in finished jobs: 43134111s 718901.85m 11981.70h 499.24d 1.368 y # IO & Wait Time: 23636059s 393934.32m 6565.57h 273.57d 0.749 y # Average job time: 188s 3.14m 0.05h 0.00d # Longest finished job: 1330s 22.17m 0.37h 0.02d # Submission to last job: 247330s 4122.17m 68.70h 2.86d ssh kkstore05 cd /cluster/data/bosTau3/bed/tblastn.hg18KG for i in blastOut/* do echo "cd $i; echo *.psl | xargs cat | pslSortAcc nohead chrom /tmp/ stdin ; cd ../.." done > sort.jobs sh -x sort.jobs ssh kkstore05 cd /cluster/data/bosTau3/bed/tblastn.hg18KG mkdir chainRun cd chainRun tcsh cat << '_EOF_' > chainOne simpleChain -prot -outPsl -maxGap=150000 $1 `dirname $1`/c.`basename $1`.psl '_EOF_' cat << '_EOF_' > chainGsub #LOOP chainOne $(path1) #ENDLOOP '_EOF_' chmod +x chainOne ls ../blastOut/*/chrom/*.psl > chain.lst gensub2 chain.lst single chainGsub chainSpec # do the cluster run for chaining ssh kk cd /cluster/data/bosTau3/bed/tblastn.hg18KG/chainRun para create chainSpec para maxNode 30 para try, check, push, check etc. # Completed: 660 of 682 jobs # Crashed: 22 jobs # CPU time in finished jobs: 249233s 4153.89m 69.23h 2.88d 0.008 y # IO & Wait Time: 4468s 74.46m 1.24h 0.05d 0.000 y # Average job time: 384s 6.41m 0.11h 0.00d # Longest finished job: 2190s 36.50m 0.61h 0.03d # Submission to last job: 9537s 158.95m 2.65h 0.11d # all the chrUn jobs crashed on kk, re-do on kolossus ssh kkstore05 cd /cluster/data/bosTau3/bed/tblastn.hg18KG/blastOut for i in kg?? do cat $i/chrom/c.*.psl|awk "(\$13 - \$12)/\$11 > 0.6 {print}" > c60.$i.psl sort -rn c60.$i.psl | pslUniq stdin u.$i.psl awk "((\$1 / \$11) ) > 0.60 { print }" c60.$i.psl > m60.$i.psl echo $i done sort -T /tmp -k 14,14 -k 16,16n -k 17,17n u.*.psl m60* | uniq > /cluster/data/bosTau3/bed/tblastn.hg18KG/blastHg18KG.psl cd .. pslCheck blastHg18KG.psl # load table ssh hgwdev cd /cluster/data/bosTau3/bed/tblastn.hg18KG hgLoadPsl bosTau3 blastHg18KG.psl # check coverage featureBits bosTau3 blastHg18KG # 35001855 bases of 2731807384 (1.281%) in intersection featureBits bosTau3 refGene:cds blastHg18KG -enrichment # refGene:cds 0.292%, blastHg18KG 1.281%, both 0.258%, cover 88.42%, enrich 69.01x ssh kkstore05 rm -rf /cluster/data/bosTau3/bed/tblastn.hg18KG/blastOut rm -rf /cluster/bluearc/bosTau3/bed/tblastn.hg18KG/blastOut #end tblastn ########################################################################## # CpG Islands (Done, June 27 2007, Heather) ssh kkstore05 cd /cluster/data/bosTau3/bed mkdir cpgIsland cd cpgIsland zcat ../../goldenPath/bigZips/bosTau3.fa.masked.gz | faSplit about stdin 2000000 split ssh hgwdev cd /cluster/data/bosTau3/bed/cpgIsland # Build software from Asif Chinwalla (achinwal@watson.wustl.edu) cvs co hg3rdParty/cpgIslands cd hg3rdParty/cpgIslands make # gcc readseq.c cpg_lh.c -o cpglh.exe # cpg_lh.c: In function `main': # cpg_lh.c:74: warning: conflicting types for built-in function 'malloc' mv cpglh.exe /cluster/data/bosTau3/bed/cpgIsland/ ssh kkstore05 # could also have used kolossus cd /cluster/data/bosTau3/bed/cpgIsland cat << '_EOF_' > run.csh #!/bin/csh -ef foreach f (split*) set fout=$f:t:r:r.cpg echo $fout ./cpglh.exe $f > $fout end '_EOF_' # << this line makes emacs coloring happy run.csh # Transform cpglh output to bed + cat << '_EOF_' > filter.awk { $2 = $2 - 1; width = $3 - $2; printf("%s\t%d\t%s\t%s %s\t%s\t%s\t%0.0f\t%0.1f\t%s\t%s\n", $1, $2, $3, $5,$6, width, $6, width*$7*0.01, 100.0*2*$6/width, $7, $9); } '_EOF_' # << this line makes emacs coloring happy awk -f filter.awk *.cpg > cpgIsland.bed # load into database ssh hgwdev cd /cluster/data/bosTau3/bed/cpgIsland hgLoadBed bosTau3 cpgIslandExt -tab \ -sqlTable=$HOME/kent/src/hg/lib/cpgIslandExt.sql cpgIsland.bed wc -l cpgIsland.bed # 38189 cpgIsland.bed featureBits bosTau3 cpgIslandExt # 24374280 bases of 2731807384 (0.892%) in intersection ########################################################################## # GENSCAN (June 2007 Heather) # Assume none of the 200K scaffolds are all Ns # (These cause Genscan to run forever) ssh hgwdev mkdir /cluster/data/bosTau3/bed/genscan cd /cluster/data/bosTau3/bed/genscan # Make 3 subdirectories for genscan to put their output files in mkdir gtf pep subopt cvs co hg3rdParty/genscanlinux # generate hard-masked sequence ssh kkstore05 cd /cluster/data/bosTau3/bed/genscan zcat ../../goldenPath/bigZips/bosTau3.fa.gz | maskOutFa stdin hard bosTau3.hardmask.fa mkdir split cd split faSplit about ../bosTau3.hardmask.fa 2000000 split # Generate file list and check that no files are completed masked cd .. cat << '_EOF_' > makeList.csh foreach f ( /cluster/data/bosTau3/bed/genscan/split/*.fa ) egrep '[ACGT]' $f > /dev/null if ($status == 0) echo $f >> genome.list end '_EOF_' wc -l genome.list # 252 genome.list # Log into kki (not kk!). kki is the driver node for the small # cluster (kkr1u00-kkr8u00). Genscan has a problem running on the # big cluster, due to limited memory and swap space on each # processing node. ssh kki cd /cluster/data/bosTau3/bed/genscan # Create template file, gsub, for gensub2. For example (3-line file): cat << '_EOF_' > gsub #LOOP /cluster/bin/x86_64/gsBig {check in line+ $(path1)} {check out line gtf/$(root1).gtf} -trans={check out line pep/$(root1).pep} -subopt={check out line subopt/$(root1).bed} -exe=hg3rdParty/genscanlinux/genscan -par=hg3rdParty/genscanlinux/HumanIso.smat -tmp=/tmp -window=2400000 #ENDLOOP '_EOF_' # << this line makes emacs coloring happy /parasol/bin/gensub2 genome.list single gsub jobList para create jobList para try para check para push # chr9 (split08.fa) failed due to insufficient memory # Tried again on kki, still failed # Tried on kolossus, still failed # Tried on kolossus with -window=1000000, that worked # Concatenate ssh kkstore05 cd /cluster/data/bosTau3/bed/genscan cat gtf/*.gtf > genscan.gtf cat pep/*.pep > genscan.pep cat subopt/*.bed > genscanSubopt.bed # Load into the database ssh hgwdev cd /cluster/data/bosTau3/bed/genscan ldHgGene -gtf bosTau3 genscan genscan.gtf hgPepPred bosTau3 generic genscanPep genscan.pep hgLoadBed bosTau3 genscanSubopt genscanSubopt.bed featureBits bosTau3 genscan # 59251085 bases of 2731807384 (2.169%) in intersection featureBits bosTau3 genscanSubopt # 58996872 bases of 2731807384 (2.160%) in intersection # Should be zero intersection with rmsk featureBits bosTau3 genscan rmsk ########################################################################## # MAKE SCAFFOLD LIFT FILE (DONE 11/14/07 angie) # Bovine consortium is doing analysis in scaffold coords. Make a liftUp # for viewing in GB. cd /cluster/data/bosTau3/baylor wget ftp://ftp.hgsc.bcm.tmc.edu/pub/data/Btaurus/fasta/Btau20060815-freeze/individualScaffolds/Btau3.1.scaffolds.chr # Exclude chrUn from the lift file for two bosTau3-specific reasons: # chrUn scaffolds were kept separate, and they used a different prefix # (bosTau3 prefix is "chrUn.003" while Baylor scaffold prefix is just # ChrUn) sed -e '/ChrUn/ d; s/Chr30/ChrX/' Btau3.1.scaffolds.chr \ | ~/kent/src/utils/baylorScaffoldChrToAgp.pl \ ../chrom.sizes \ | agpToLift -revStrand \ > ../jkStuff/scaffolds.lft # Actually, we can add on some name-translating .lft lines for chrUn.*: perl -wpe '@w=split; if ($w[0] !~ /ChrUn/) { s/^.*\n$//; next; } \ $w[0] =~ s/^.*Chr/Chr/; \ $chr = $w[0]; $chr =~ s/ChrUn/chrUn.003/; \ $size = $w[2] - $w[1] + 1;\ s/^.*/0\t$w[0]\t$size\t$chr\t$size\t$w[3]/; \ ' Btau3.1.scaffolds.chr \ > /tmp/chrUn.lft # Make sure sizes agree: grep ^chrUn ../chrom.sizes > /tmp/1 awk '{print $4 "\t" $5;}' /tmp/chrUn.lft > /tmp/2 cmp /tmp/1 /tmp/2 # Add chrUn lines to main .lft: cat /tmp/chrUn.lft >> ../jkStuff/scaffolds.lft ########################################################################## # MAKE SCAFFOLDS TRACK (DONE 11/14/07 angie) ssh hgwdev mkdir /cluster/data/bosTau3/bed/scaffolds cd /cluster/data/bosTau3/bed/scaffolds # bosTau3's chrUn.* scaffolds are separate, so make this track only # for the assembled chroms. perl -wpe 'if (/ChrUn/) {s/^.*\n$//; next;} \ m/(Chr\w+)\.(\d+)\t(\d+)\t(\d+)\t([+N-])/ || die; \ ($chr, $scafNum, $start, $end, $strand) = ($1, $2, $3, $4, $5); \ $scafName = "$chr.$scafNum"; \ $chr =~ s/^C/c/; $chr =~ s/chr30/chrX/; \ $start--; $strand =~ s/N/+/; \ s/^.*$/$chr\t$start\t$end\t$scafName\t0\t$strand/;' \ ../../baylor/Btau3.1.scaffolds.chr \ > scaffolds.bed hgLoadBed bosTau3 scaffold scaffolds.bed ########################################################################## # MAKE PER-CHROM FASTA DOWNLOADS (DONE 12/7/07 angie) ssh kkstore05 mkdir /cluster/data/bosTau3/goldenPath/chromosomes cd /cluster/data/bosTau3/goldenPath/chromosomes awk '$1 ~ /^chrUn/ {print $1;}' ../../chrom.sizes > scafs.lst awk '$1 !~ /^chrUn/ {print $1;}' ../../chrom.sizes > chroms.lst foreach chr (`cat chroms.lst`) twoBitToFa ../../bosTau3.2bit stdout -seq=$chr \ | gzip -c > $chr.fa.gz end twoBitToFa ../../bosTau3.2bit stdout -seqList=scafs.lst \ | gzip -c > chrUn.scaffolds.fa.gz rm chroms.lst scafs.lst md5sum *.gz > md5sum.txt # Make README.txt ssh hgwdev ln -s /cluster/data/bosTau3/goldenPath/chromosomes \ /usr/local/apache/htdocs/goldenPath/bosTau3/chromosomes ############################################################################ # SGP GENES (DONE - 2007-12-20 - Hiram) ssh kkstore05 mkdir /cluster/data/bosTau3/bed/sgp cd /cluster/data/bosTau3/bed/sgp mkdir gtf # chrUn is done in a different manner for C in `grep -v chrUn /cluster/data/bosTau3/chrom.sizes | awk '{print $1}'` do wget --timestamping \ "http://genome.imim.es/genepredictions/B.taurus/hgsc_Btau20060815/SGP/mm9/${C}.gtf" \ -O "gtf/${C}.gtf" done # a single chrUn.gtf file has all the scaffolds wget --timestamping \ 'http://genome.imim.es/genepredictions/B.taurus/hgsc_Btau20060815/SGP/mm9/chrUn.gtf' \ -O "gtf/chrUn.gtf" ssh hgwdev cd /cluster/data/bosTau3/bed/sgp ldHgGene -gtf -genePredExt bosTau3 sgpGene gtf/chr*.gtf # Read 41496 transcripts in 308500 lines in 31 files # 41496 groups 4767 seqs 1 sources 3 feature types # 41496 gene predictions featureBits bosTau3 sgpGene # 40658543 bases of 2731807384 (1.488%) in intersection featureBits bosTau3 -enrichment refGene:CDS sgpGene # refGene:CDS 0.426%, sgpGene 1.488%, both 0.384%, cover 90.22%, # enrich 60.62x ##################################################################### # LOAD GENEID GENES (DONE - 2007-12-20, 2008-03-27 - Hiram) ssh kkstore05 mkdir -p /cluster/data/bosTau3/bed/geneid cd /cluster/data/bosTau3/bed/geneid mkdir gtf prot # The data load in December 2007 was broken at their end. # download new data 27 March 2008 and reload this track for C in `grep -v chrUn /cluster/data/bosTau3/chrom.sizes | awk '{print $1}'` do echo $C wget --timestamping \ "http://genome.imim.es/genepredictions/B.taurus/hgsc_Btau20060815/geneidv1.2/${C}.gtf" \ -O "gtf/${C}.gtf" wget --timestamping \ "http://genome.imim.es/genepredictions/B.taurus/hgsc_Btau20060815/geneidv1.2/${C}.prot" \ -O "prot/${C}.prot" done wget --timestamping \ "http://genome.imim.es/genepredictions/B.taurus/hgsc_Btau20060815/geneidv1.2/chrUn.gtf" \ -O "gtf/chrUn.gtf" wget --timestamping \ "http://genome.imim.es/genepredictions/B.taurus/hgsc_Btau20060815/geneidv1.2/chrUn.prot" \ -O "prot/chrUn.prot" # Add missing .1 to protein id's cd prot foreach f (*.prot) perl -wpe 's/^(>chr\S+)$/$1.1/' $f > $f:r-fixed.prot end ssh hgwdev cd /cluster/data/bosTau3/bed/geneid ldHgGene -genePredExt -gtf bosTau3 geneid gtf/*.gtf # Read 50657 transcripts in 325618 lines in 31 files # 50657 groups 12601 seqs 1 sources 3 feature types # 50657 gene predictions hgPepPred bosTau3 generic geneidPep prot/chr*-fixed.prot featureBits bosTau3 geneid # 44623578 bases of 2731807384 (1.633%) in intersection featureBits bosTau3 -enrichment refGene:CDS geneid # refGene:CDS 0.429%, geneid 1.633%, both 0.364%, cover 84.64%, enrich 51.82x ############################################################################# # LIFTOVER TO bosTau4 (WORKING - 2008-04-02 - Hiram) ssh kkstore05 screen -r -d # use screen to control this job # -debug run to create run dir, preview scripts... doSameSpeciesLiftOver.pl -debug bosTau3 bosTau4 \ -ooc /cluster/bluearc/bosTau3/11.ooc # Real run: cd /cluster/data/bosTau3/bed/blat.bosTau4.2008-04-02 time nice -n +19 doSameSpeciesLiftOver.pl bosTau3 bosTau4 \ -ooc /cluster/bluearc/bosTau3/11.ooc > do.log 2>&1 & # real 587m45.013s # this broke down on kolossus due to out of disk space on # /scratch/tmp/ - so, go to one of the memk nodes, and finish # the doNet.csh script ssh mkr0u0 cd /cluster/data/bosTau3/bed/blat.bosTau4.2008-04-02/run.chain time ./doNet.csh # real 81m17.919s # then, continuing: ssh kkstore05 cd /cluster/data/bosTau3/bed/blat.bosTau4.2008-04-02 time nice -n +19 doSameSpeciesLiftOver.pl -continue=load bosTau3 bosTau4 \ -buildDir=/cluster/data/bosTau3/bed/blat.bosTau4.2008-04-02 \ -ooc /cluster/bluearc/bosTau3/11.ooc > load.log 2>&1 & # real 1m17.778s ############################################################################ # bosTau3 - Cow - Ensembl Genes (DONE - 2008-04-22 - hiram) # This one had to be done manually. There was a chrUn lift file that # needed to be made to turn the Ensembl chrUn coordinates into # the UCSC chrUn.003.N contig coordinates # It was run with a geneScaffolds yes to fetch the MySQL tables, # Then a script was run: ssh hgwdev cd /cluster/data/bosTau3 cat << '_EOF_' > jkStuff/chrUnLiftAcross.pl #!/usr/bin/env perl use strict; use warnings; open (FH,") { chomp $line; my ($region_id, $name, $type, $size) = split('\s+', $line); $end = $start + $size; printf "chrUn\t%d\t%d\t%s\t%d\t%d\t+\n", $start, $end, $name, 0, $size; $start += $size + $gap; } close (FH); '_EOF_' # << happy emacs chmod +x jkStuff/chrUnLiftAcross.pl cd bed/ensGene.49/download ../../../jkStuff/chrUnLiftAcross.pl > ../../../jkStuff/chrUn.liftAcross.txt # then use that chrUn.liftAcross.txt in the process script procedure # after that, comment out the gene Scaffolds and -continue=load ssh kkstore05 cd /cluster/data/bosTau3 cat << '_EOF_' > bosTau3.ensGene.ra # required db variable db bosTau3 # optional nameTranslation, the sed command that will transform # Ensemble names to UCSC names. With quotes just to make sure. nameTranslation "s/^\([0-9UX][0-9n]*\)/chr\1/; /^MT/d" # cause SQL tables to be fetched to see if chrUn can be fixed up # geneScaffolds yes '_EOF_' # << happy emacs $HOME/kent/src/hg/utils/automation/doEnsGeneUpdate.pl \ -ensVersion=49 -stop=process bosTau3.ensGene.ra # do the manual fixups as described above, then continue doEnsGeneUpdate.pl -continue=load -ensVersion=49 bosTau3.ensGene.ra ssh hgwdev cd /cluster/data/bosTau3/bed/ensGene.49 featureBits bosTau3 ensGene # 39278215 bases of 2731807384 (1.438%) in intersection ############################################################################ # SELF BLASTZ (DONE - 2008-06-30 - Hiram) # do not want the noise from the contigs ssh kkstore05 screen # use a screen to manage this multi-day job mkdir /cluster/data/bosTau3/noContigs cd /cluster/data/bosTau3/noContigs cut -f1 ../chrom.sizes | grep -v chrUn | while read C do echo "twoBitToFa -seq=${C} ../bosTau3.2bit ${C}.fa" twoBitToFa -seq=${C} ../bosTau3.2bit ${C}.fa done faToTwoBit chr*.fa bosTau3.noContigs.2bit twoBitToFa bosTau3.noContigs.2bit stdout | faSize stdin # 2434234369 bases (141856351 N's 2292378018 real # 1246068866 upper 1046309152 lower) in 30 sequences in 1 files # %42.98 masked total, %45.64 masked real grep -v chrUn ../chrom.sizes | ave -col=2 stdin # count 30 # total 2434234369.000000 twoBitInfo bosTau3.noContigs.2bit stdout | sort -k2nr \ > bosTau3.noContigs.chrom.sizes cp -p bosTau3.noContigs.2bit /cluster/bluearc/scratch/data/bosTau3 cp -p bosTau3.noContigs.chrom.sizes /cluster/bluearc/scratch/data/bosTau3 mkdir /cluster/data/bosTau3/bed/blastzSelf.2008-08-30 cd /cluster/data/bosTau3/bed ln -s blastzSelf.2008-08-30 blastzSelf cd blastzSelf.2008-08-30 cat << '_EOF_' > DEF BLASTZ_M=400 # TARGET: Cow bosTau3 SEQ1_DIR=/cluster/bluearc/scratch/data/bosTau3/bosTau3.noContigs.2bit SEQ1_LEN=/cluster/bluearc/scratch/data/bosTau3/bosTau3.noContigs.chrom.sizes SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY: Cow bosTau3 SEQ2_DIR=/cluster/bluearc/scratch/data/bosTau3/bosTau3.noContigs.2bit SEQ2_LEN=/cluster/bluearc/scratch/data/bosTau3/bosTau3.noContigs.chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 BASE=/cluster/data/bosTau3/bed/blastzSelf.2008-08-30 TMPDIR=/scratch/tmp '_EOF_' # << this line keeps emacs coloring happy cd /cluster/data/bosTau3/bed/blastzSelf.2008-08-30 time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -chainMinScore=10000 -chainLinearGap=medium -bigClusterHub=pk \ `pwd`/DEF > blastz.out 2>&1 & # failed during loading XXXX - running 2008-06-30 11:49 # real 3537m49.719s ############################################################################