# for emacs: -*- mode: sh; -*- # Bos taurus # Btau_1.0 from Baylor # September 2004 # contigs and scaffolds, no chromosomes # WGS, Atlas assembler, 3x coverage # ftp://ftp.hgsc.bcm.tmc.edu/pub/data/Btaurus/fasta/Btau20040927-freeze # anonymous ftp, no login/password needed # DOWNLOAD SEQUENCE (DONE Sept. 30, 2004 Heather) ssh kksilo mkdir /cluster/store8/bosTau1 cd /cluster/data ln -s /cluster/store8/bosTau1 bosTau1 cd /cluster/data/bosTau1 mkdir downloads cd downloads # we won't use these, but download them for completeness mkdir Bin0 cd Bin0 wget ftp://ftp.hgsc.bcm.tmc.edu/pub/data/Btaurus/fasta/Btau20040927-freeze/Bin0/Btau20040927-freeze-bin0.fa.gz wget ftp://ftp.hgsc.bcm.tmc.edu/pub/data/Btaurus/fasta/Btau20040927-freeze/Bin0/Btau20040927-freeze-bin0.fa.qual.gz wget ftp://ftp.hgsc.bcm.tmc.edu/pub/data/Btaurus/fasta/Btau20040927-freeze/Bin0/Btau_lib_insert_size_20040927.tbl.gz cd .. mkdir contigs cd contigs wget ftp://ftp.hgsc.bcm.tmc.edu/pub/data/Btaurus/fasta/Btau20040927-freeze/contigs/Btau20040927-freeze-contigs.fa.gz wget ftp://ftp.hgsc.bcm.tmc.edu/pub/data/Btaurus/fasta/Btau20040927-freeze/contigs/Btau20040927-freeze-contigs.fa.qual.gz wget ftp://ftp.hgsc.bcm.tmc.edu/pub/data/Btaurus/fasta/Btau20040927-freeze/contigs/Btau_v1.0_20040927.agp.gz wget ftp://ftp.hgsc.bcm.tmc.edu/pub/data/Btaurus/fasta/Btau20040927-freeze/contigs/AAFC01_accs.gz cd .. mkdir linearScaffolds cd linearScaffolds wget ftp://ftp.hgsc.bcm.tmc.edu/pub/data/Btaurus/fasta/Btau20040927-freeze/linearScaffolds/Btau20040927-freeze-assembly.fa.gz wget ftp://ftp.hgsc.bcm.tmc.edu/pub/data/Btaurus/fasta/Btau20040927-freeze/linearScaffolds/Btau20040927-freeze-assembly.fa.qual.gz cd .. mkdir repeat-reads cd repeat-reads wget ftp://ftp.hgsc.bcm.tmc.edu/pub/data/Btaurus/fasta/Btau20040927-freeze/repeat-reads/Btau20040927-freeze-repeat-reads.fa.gz wget ftp://ftp.hgsc.bcm.tmc.edu/pub/data/Btaurus/fasta/Btau20040927-freeze/repeat-reads/Btau20040927-freeze-repeat-reads.fa.qual.gz cd .. 
# GUNZIP
gunzip */*.gz
# CHECK FILES
# Check that IDs in .fa match IDs in .qual
ssh kksilo
cd /cluster/data/bosTau1/downloads
# foreach Bin0 contigs linearScaffolds repeat-reads
grep ">" *.fa > id.fa
grep ">" *.qual > id.qual
diff id.fa id.qual
# this takes about 5-10 minutes
/cluster/bin/i386/checkAgpAndFa contigs/Btau_v1.0_20040927.agp linearScaffolds/Btau20040927-freeze-assembly.fa >& check.out
/cluster/bin/i386/checkAgpAndFa contigs/Btau_v1.0_20040927.agp linearScaffolds/Btau20040927-freeze-assembly.fa > check.err
# SELECTED DOWNLOAD FILES (DONE, Feb. 9, 2005, Heather)
ssh hgwdev
cd /cluster/data/bosTau1/downloads
cd contigs
cp *agp /usr/local/apache/htdocs/goldenPath/bosTau1/bigZips/scaffoldAgp
cd /usr/local/apache/htdocs/goldenPath/bosTau1/bigZips
nice gzip scaffoldAgp
# takes about 30 minutes for gzip
cd /cluster/data/bosTau1/downloads
cd linearScaffolds
cp Btau20040927-freeze-assembly.fa /usr/local/apache/htdocs/goldenPath/bosTau1/bigZips/scaffoldFa
cd /usr/local/apache/htdocs/goldenPath/bosTau1/bigZips
nice gzip scaffoldFa
# SPLIT SCAFFOLDS (DONE Sep. 30, 2004, Heather)
ssh eieio
cd /cluster/data/bosTau1
mkdir scaffolds
/cluster/bin/i386/faSplit byname downloads/linearScaffolds/Btau20040927-freeze-assembly.fa scaffolds -outDirDepth=3
# not sure why these didn't get put under scaffolds directory
mv 1 scaffolds
mv 2 scaffolds
mv 3 scaffolds
mv 4 scaffolds
etc.
# generate list
find scaffolds -print > scaffolds.list.0
grep fa scaffolds.list.0 > scaffolds.list
# MAKE 2BIT NIB FILE (DONE Oct.1, 2004, Heather)
ssh kksilo
cd /cluster/data/bosTau1/downloads/linearScaffolds
/cluster/bin/i386/faToTwoBit Btau20040927-freeze-assembly.fa bosTau1.2bit
/cluster/bin/i386/twoBitToFa bosTau1.2bit check.fa
mv bosTau1.2bit ../..
diff -q Btau20040927-freeze-assembly.fa check.fa # could also use cmp # note: size match ssh hgwdev mkdir /gbdb/bosTau1 mkdir /gbdb/bosTau1/nib # could get rid of the nib in this path, 2bit isn't a nib (4bit) ln -s /cluster/data/bosTau1/bosTau1.2bit /gbdb/bosTau1/nib # CREATE DATABASE ssh hgwdev hgsql hg17 create database bosTau1; use bosTau1; create table grp (PRIMARY KEY(NAME)) select * from hg17.grp; # add rows to hgcentraltest on hgwbeta in dbDb and defaultDb # set orderKey past dog, before mouse # CREATE GAP TABLE ssh hgwdev cd kent/src/hg/lib # remove bin column for now hgsql bosTau1 < gap2.sql cd /cluster/data/bosTau1/downloads/contigs grep fragment Btau_v1.0_20040927.agp > fragment.txt hgsql bosTau1 load data local infile 'fragment.txt' into table gap # create trackDb/cow/bosTau1/gap.html # CREATE CONTIG TABLE (DONE, Oct. 5, 2004, Heather) ssh hgwdev cd kent/src/hg/lib # this doesn't include strand or contig start/end hgsql bosTau1 < ctgPos2.sql cd /cluster/data/bosTau1/downloads/contigs grep Contig Btau_v1.0_20040927.agp > Contig.out ssh kksilo cd /cluster/data/bosTau1/downloads/contigs getContig.pl < Contig.out > ctgPos2 ssh hgwdev load data local infile 'ctgPos2' into table ctgPos2 # 9 skipped # Contig24811 # Contig368174 # Contig44331 # Contig55201 # Contig59195 # Contig64664 # Contig67471 # Contig88377 # Contig89200 # 9 with size 0 # CREATE CHROMINFO TABLE (DONE Oct. 5, 2004, Heather) # an improvement here would be to have getMaxCoord output the 2bit file name ssh hgwdev cd /cluster/data/bosTau1 # get coordinates from agp and put in database table getcoords.pl < downloads/contigs/Btau_v1.0_20040927.agp > scaffolds.coords hgsql bosTau1 < coords.sql load data local infile 'scaffolds.coords' into table scaffoldCoords # calculate maximum coords; check that start coord is always 1 /cluster/home/heather/bin/i386/GetMaxCoord > getMaxCoord.bosTau1 # edit chromInfo.sql; allow fileName to be null cp ~heather/kent/src/hg/lib/chromInfo.sql . 
hgsql bosTau1 < chromInfo.sql load data local infile 'getMaxCoord.bosTau1' into table chromInfo update chromInfo set fileName = "/gbdb/bosTau1/nib/bosTau1.2bit" # REPEATMASKER (DONE, Oct. 5, 2004, Heather) # using the split scaffold fa files generated earlier # note the version of RepeatMasker in use; will want this for download README cat /cluster/bluearc/RepeatMasker/Libraries/version # RepBase Update 9.04, RM database version 20040702 # do a trial run ssh hgwdev cd /cluster/data/bosTau1/scaffolds/0/0/0 /cluster/bluearc/RepeatMasker/RepeatMasker -ali -s -spec bos SCAFFOLD1000.fa # configure cd /cluster/data/bosTau1 mkdir jkStuff cp /cluster/data/anoGam1/jkStuff/RMAnopheles jkStuff/RMBosTaurus # change references anoGam1 --> bosTau1 mkdir RMRun # /bin/csh makeJoblist-RM.csh cd scaffolds foreach i (0 1 2 3 4 5 6 7 8 9) cd $i foreach j (0 1 2 3 4 5 6 7 8 9) cd $j foreach k (0 1 2 3 4 5 6 7 8 9) cd $k foreach f (*.fa) echo /cluster/data/bosTau1/jkStuff/RMBosTaurus \ /cluster/data/bosTau1/scaffolds/$i/$j/$k $f \ '{'check out line+ /cluster/data/bosTau1/$i/$j/$k/$f.out'}' \ >> /cluster/data/bosTau1/RMRun/RMJobs end cd .. end cd .. end cd .. end # do the run ssh kk cd /cluster/data/bosTau1/RMRun para create RMJobs para try para push para check para time etc. # concatenate into one output file; took about 90 minutes ssh kksilo cd /cluster/data/bosTau1 mkdir repeats /bin/tcsh concatRM.tcsh cd repeats # use grep -v to get rid of all the headers # then, add back in an initial header # this is so hgLoadOut is happy grep SCAFFOLD repeats.all > repeats.clean cat repeats.header repeats.clean > repeats.final grep "There were no repetitive sequences found" repeats.final > repeats.notfound grep -v "There were no repetitive sequences found" repeats.final > repeats.out ssh hgwdev hgLoadOut bosTau1 repeats.out # Strange perc. 
# fields:
# -2.3 line 925441
# -18.2 line 1644720
# -18.2 line 1644722
hgsql bosTau1
rename table repeats_rmsk to rmsk
# select count(*) from rmsk;
# 4,244,045
# select count(*) from rmsk where repName like "Bov%";
# 1,445,972
# SIMPLE REPEATS (DONE, Oct. 6, 2004, Heather)
# put the results throughout the scaffold 0/0/0 directories,
# same as RepeatMasker, to avoid too many files in the same directory
ssh kksilo
cd /cluster/data/bosTau1
mkdir bed
cd bed
mkdir simpleRepeat
cd simpleRepeat
/bin/csh makeJoblist-trf.csh
# do the run; took about 12 hours
tcsh trf-run.csh >&! trf.log &
# concatenate into one output file; took about an hour
/bin/tcsh concatTRF.csh
# load
/cluster/bin/i386/hgLoadBed bosTau1 simpleRepeat trf.bed \
    -sqlTable=/cluster/home/heather/kent/src/hg/lib/simpleRepeat.sql
# Reading trf.all
# Loaded 248313 elements of size 16
# Sorted
# Saving bed.tab
# Loading bosTau1
create index chrom_2 on simpleRepeat (chrom(16), chromStart);
create index chrom_3 on simpleRepeat (chrom(16), chromEnd);
# CREATE MASKED FA USING REPEATMASKER AND FILTERED TRF FILES (DONE Oct. 10, 2004, Heather)
ssh kksilo
cd /cluster/data/bosTau1
/cluster/bin/i386/maskOutFa -soft downloads/linearScaffolds/Btau20040927-freeze-assembly.fa repeats/repeats.out bosTau1.softmask.fa
# 71 warnings about negative rEnd
# matches select count(*) from rmsk where repEnd < 0;
/cluster/bin/i386/maskOutFa -softAdd bosTau1.softmask.fa bed/simpleRepeat/trf.bed bosTau1.softmask2.fa
# hard masking (Ns instead of lower case) for download files
# split for use by genscan
/cluster/bin/i386/maskOutFa bosTau1.softmask2.fa hard bosTau1.hardmask.fa
mkdir hardmask-split
/cluster/bin/i386/faSplit about bosTau1.hardmask.fa 2000000 hardmask-split
# DOWNLOAD FILES (DONE, Feb.
# 9, 2005, Heather)
ssh hgwdev
cd /cluster/data/bosTau1
cp bosTau1.softmask.fa /usr/local/apache/htdocs/goldenPath/bosTau1/bigZips
cp bosTau1.softmask2.fa /usr/local/apache/htdocs/goldenPath/bosTau1/bigZips
cp bosTau1.hardmask.fa /usr/local/apache/htdocs/goldenPath/bosTau1/bigZips
cd /usr/local/apache/htdocs/goldenPath/bosTau1/bigZips
nice gzip bosTau1.softmask.fa
nice gzip bosTau1.softmask2.fa
nice gzip bosTau1.hardmask.fa
# REGENERATE 2BIT NIB (DONE, Oct. 10, 2004, Heather)
ssh kksilo
cd /cluster/data/bosTau1
/cluster/bin/i386/faToTwoBit bosTau1.softmask2.fa bosTau1.softmask.2bit
/cluster/bin/i386/twoBitToFa bosTau1.softmask.2bit check.softmask.fa
diff -q bosTau1.softmask2.fa check.softmask.fa
mv bosTau1.2bit bosTau1.2bit.unmasked
mv bosTau1.softmask.2bit bosTau1.2bit
# note: size match
# PUT MASKED SEQUENCE OUT FOR CLUSTER RUNS (DONE, Oct. 10, 2004, Heather)
# could get rid of the nib in this path, 2bit isn't a nib (4bit)
ssh kkr1u00
mkdir -p /iscratch/i/bosTau1/nib
cp -p /cluster/data/bosTau1/bosTau1.2bit /iscratch/i/bosTau1/nib
iSync
ssh kksilo
mkdir -p /cluster/bluearc/bosTau1/nib
cp -p /cluster/data/bosTau1/bosTau1.2bit /cluster/bluearc/bosTau1/nib
# GENERATE split masked fas for blastz (DONE, Oct. 19, 2004, Heather)
ssh kkr1u00
mkdir -p /iscratch/i/bosTau1/splitFas
cd /cluster/data/bosTau1
/cluster/bin/i386/faSplit sequence /cluster/data/bosTau1/bosTau1.softmask2.fa 350 \
    /iscratch/i/bosTau1/splitFas
# slight problem with directory name
cd /iscratch/i/bosTau1
mkdir splitDir
mv splitFas* splitDir
rmdir splitFas
# check that I got all scaffolds
grep SCAFFOLD splitDir/*.fa > SCAFFOLD.list
# sync
/cluster/bin/scripts/iSync
# MAKE 11.OOC FILE FOR BLAT (DONE, Oct.
10, 2004, Heather) ssh kkr1u00 mkdir /cluster/data/bosTau1/bed/ooc cd /cluster/data/bosTau1/bed/ooc ls -1 /cluster/data/bosTau1/bosTau1.2bit > nib.lst /cluster/bin/i386/blat nib.lst /dev/null /dev/null -tileSize=11 \ -makeOoc=/cluster/bluearc/bosTau1/11.ooc -repMatch=100 # Wrote 1327650 overused 11-mers to /cluster/bluearc/bosTau1/11.ooc cp -p /cluster/bluearc/bosTau1/11.ooc /iscratch/i/bosTau1/ iSync # GENBANK ssh hgwdev cd /cluster/home/heather/kent/src/hg/makeDb/genbank # check for missing commits diff /cluster/data/genbank/etc/genbank.conf etc/genbank.conf # edit etc/genbank.conf and add these lines, starting with the comment: # bosTau1 (B. taurus) # could get rid of the nib in this path, 2bit isn't a nib (4bit) bosTau1.genome = /iscratch/i/bosTau1/nib/bosTau1.2bit bosTau1.lift = no bosTau1.refseq.mrna.native.load = no # bosTau1.genbank.mrna.xeno.load = yes bosTau1.genbank.mrna.xeno.load = no bosTau1.genbank.est.xeno.load = no bosTau1.downloadDir = bosTau1 bosTau1.perChromTables = no cvs commit -m "added cow" etc/genbank.conf #revision 1.56 # edit src/lib/gbGenome.c make cvs commit -m "added cow" src/lib/gbGenome.c # revision 1.19 # edit src/align/gbBlat make cvs commit -m "added cow" src/align/gbBlat # revision 1.29 make install-server ssh eieio cd /cluster/data/genbank nice bin/gbAlignStep -srcDb=genbank -type=mrna -initial -verbose=1 bosTau1 & # logged to # /cluster/data/genbank/var/build/logs/YYMMDDHHMMSS.bosTau1.initalign.log # load ssh hgwdev cd /cluster/data/genbank nice bin/gbDbLoadStep -verbose=1 -drop -initialLoad bosTau1 # logged to /cluster/data/genbank/var/dbload/hgwdev/logs/YYMMDDHHMMSS.dbload.log # ESTs required modification to the build process to deal with the huge # number of scaffolds. 
# Once completed, the following line was
# added to genbank.conf to enable its use:
bosTau1.mondoTwoBitParts = 5000
ssh eieio
cd /cluster/data/genbank
bin/gbAlignStep -initial -srcDb=genbank -type=est bosTau1
# load the ESTs, reloading mRNAs as well
ssh hgwdev
cd /cluster/data/genbank
nice bin/gbDbLoadStep -drop -initialLoad bosTau1
# add bosTau1 to list of databases to align in
# /cluster/data/genbank/etc/align-genbank
# GENBANK: add Non-Cow Refseqs (DONE May 9, 2005, Heather)
# add to kent/src/hg/makeDb/genbank/etc/genbank.conf and commit:
# bosTau1.refseq.mrna.xeno.load = yes
cd /cluster/data/genbank/etc
cvs update
# new tables: xenoRefGene, xenoRefFlat, xenoRefSeqAli, refLink, refSeqStatus
# refSeqSummary
# GC PERCENT (DONE Oct. 17, 2004, Heather)
# use old-style (non-wiggle)
# run on kolossus (serialized)
# use default window size (20K); only applies to 25k scaffolds
# ran from Friday night 4pm to Sunday morning 7am
# dominated one CPU
ssh kolossus
mkdir /cluster/data/bosTau1/bed/gcPercent
cd /cluster/data/bosTau1/bed/gcPercent
# gcPercent modified to recognize 2bit file
/cluster/bin/x86_64/hgGcPercent -noLoad bosTau1 /gbdb/bosTau1/nib
ssh hgwdev
cd /cluster/home/heather/kent/src/hg/lib
# change index from 12 to 14 characters
hgsql bosTau1 < gcPercent.sql
load data local infile 'gcPercent.bed' into table gcPercent
# 54007 rows
# GC5BASE - (DONE 2004-10-27 - Hiram)
ssh kksilo
mkdir /cluster/data/bosTau1/bed/gc5Base
cd /cluster/data/bosTau1/bed/gc5Base
hgGcPercent -wigOut -doGaps -file=stdout -win=5 bosTau1 \
    /cluster/data/bosTau1 | wigBedToBinary stdin gc5Base.wig gc5Base.wib
# Calculating gcPercent with window size 5
# Using twoBit: /cluster/data/bosTau1/bosTau1.2bit
# File stdout created
# Converted stdin, upper limit 100.00, lower limit 0.00
# real 254m41.199s
# user 139m32.740s
# sys 85m1.450s
# Now load those results into the database
ssh hgwdev
cd /cluster/data/bosTau1/bed/gc5Base
mkdir /gbdb/bosTau1/wib
ln -s `pwd`/gc5Base.wib /gbdb/bosTau1/wib
hgLoadWiggle -pathPrefix=/gbdb/bosTau1/wib bosTau1 gc5Base gc5Base.wig # The index is no good for this organism due to the scaffold # naming scheme, drop the index and create one that works: hgsql -e "drop index chrom on gc5Base;" bosTau1 hgsql -e "create index chrom on gc5Base (chrom(16));" bosTau1 # The speeds up an operation on the hgTables browser by orders of # magnitude - whole genome statistics display # LOAD GAP & GOLD TABLES FROM AGP (DONE 11/10/04 angie) ssh hgwdev # There is one line of AGP containing a 0-length contig segment, and # this causes a problem for findBin in hgGoldGapGl. awk that out: awk '$3 != 0 {print;}' \ /cluster/data/bosTau1/downloads/contigs/Btau_v1.0_20040927.agp \ | hgGoldGapGl -noGl bosTau1 stdin # For some reason, the indices did not get built correctly -- # "show index from gap/gold" shows NULL cardinalities for chrom. # Rebuild indices with "analyze table". hgsql bosTau1 -e 'analyze table gold; analyze table gap;' # FIX AGP FILE # Ali detected some truncated scaffolds # I added agpFragValidate to kent/src/hg/lib/agpFrag.c (revision 1.9) # and added a call to it from kent/src/hg/makeDb/hgGoldGapGl/hgGoldGapGl.c (revision 1.20) ssh hgwdev cd /cluster/data/bosTau1/downloads/contigs cp Btau_v1.0_20040927.agp copy.agp # edit copy.agp: # delete 0 length contig that Angie found (Contig92264 in SCAFFOLD230837) # delete -1 length contigs found by hgGoldGapGl: # SCAFFOLD36273 9126 9125 3 W Contig55199 1 0 + # SCAFFOLD120295 9153 9152 4 W Contig64664 1 0 + # SCAFFOLD130877 2412 2411 3 W Contig89199 1656 1655 - # SCAFFOLD165124 13935 13934 11 W Contig67469 2691 2690 - # SCAFFOLD195025 37500 37499 17 W Contig59193 1196 1195 - # SCAFFOLD215637 2564 2563 3 W Contig24808 1 0 + # SCAFFOLD225287 10250 10249 7 W Contig88376 998 997 - # SCAFFOLD300287 13363 13362 7 W Contig44330 1417 1416 - # drop gold and gap tables; rerun /cluster/home/heather/bin/i386/hgGoldGapGl -noGl bosTau1 copy.agp # check cd /cluster/home/heather/jdbc source javaenv 
ScaffoldGapChain bosTau1
# No anomalies detected
# Took about 3 hours to run
# QUALITY TRACK (DONE April 2005 Heather)
# The challenge was a sorting problem.
# The quality file from Baylor was ordered by contig.
# We need quality data by scaffold.
# Solution was to load quality data into database,
# then extract it by scaffold.
ssh kolossus
cd /cluster/data/bosTau1/bed
mkdir quality
cd quality
# load quality data into database table
hgsql mysql -e 'create database bosTau1;'
hgsql bosTau1 < contigQuality.sql
source javaenv
# took about 2 hours to load
/usr/java/jdk1.5.0_01/bin/java LoadQuality ../../downloads/contigs/Btau20040927-freeze-contigs.fa.qual bosTau1 \
    > LoadQuality.log.mar9
# create test table of quality data
hgsql bosTau1 < contigQualityTest.sql
source javaenv
/usr/java/jdk1.5.0_01/bin/java LoadQualityTest quality.test bosTau1
# transfer gold table from hgwdev to kolossus
ssh hgwdev
cd /cluster/data/bosTau1/bed/quality
hgsqldump bosTau1 gold > gold.dump
ssh kolossus
cd /cluster/data/bosTau1/bed/quality
hgsql bosTau1 < gold.dump
# read
# run a test case
hgsql bosTau1 < goldClean.dump
source javaenv
/usr/java/jdk1.5.0_01/bin/java ScaffoldQualityTest bosTau1 goldClean > goldClean.run
# run for real; took about 7 hours (3pm - 10pm)
# should have written to local scratch on kolossus
/usr/java/jdk1.5.0_01/bin/java ScaffoldQuality bosTau1 gold > gold.run
# check
grep fixedStep gold.run > fixedStep.out
# wigEncode (actually copied data to eieio; no logins to kksilo; copied back to kksilo)
# took about 20 minutes
wigEncode gold.run quality.wig quality.wib
# load
# took about 10 minutes
ssh hgwdev
ln -s /cluster/data/bosTau1/bed/quality/quality.wib /gbdb/bosTau1/wib
hgLoadWiggle -pathPrefix=/gbdb/bosTau1/wib bosTau1 quality quality.wig
# Connected to database bosTau1 for track quality
# Creating table definition with 13 columns in bosTau1.quality
# Saving wiggle.tab
# WARNING: Exceeded SCAFFOLD195025 size 51200 > 50221.
dropping 980 data point(s) # WARNING: Exceeded SCAFFOLD195025 size 51416 > 50221. dropping 1196 data point(s) # WARNING: Exceeded SCAFFOLD215637 size 11264 > 10768. dropping 497 data point(s) # WARNING: Exceeded SCAFFOLD215637 size 12288 > 10768. dropping 1521 data point(s) # WARNING: Exceeded SCAFFOLD215637 size 13159 > 10768. dropping 2392 data point(s) # Loading bosTau1 # Rerun Quality with new gold table # transfer gold table from hgwdev to kolossus ssh hgwdev cd /cluster/data/bosTau1/bed/quality hgsqldump bosTau1 gold > gold.dump.2 ssh kolossus cd /cluster/data/bosTau1/bed/quality # mysql: rename table gold to goldBackup hgsql bosTau1 < gold.dump.2 source javaenv # should have written to local scratch on kolossus /usr/java/jdk1.5.0_01/bin/java ScaffoldQuality bosTau1 gold > gold.run.2 # check ssh kkstore01 cd /cluster/data/bosTau1/bed/quality grep fixedStep gold.run.2 > fixedStep.out.2 # fixedStep.out.2 matches fixedStep.out # wigEncode ssh kkstore01 cd /cluster/data/bosTau1/bed/quality wigEncode gold.run.2 quality.wig.2 quality.wib.2 # load ssh hgwdev ln -s /cluster/data/bosTau1/bed/quality/quality.wib.2 /gbdb/bosTau1/wib cd /cluster/data/bosTau1/bed/quality hgLoadWiggle -pathPrefix=/gbdb/bosTau1/wib bosTau1 quality quality.wig.2 # Rerun without using zeroes for gaps (new version of ScaffoldQuality) # (Heather, May 24, 2005) ssh kkstore01 cd /cluster/data/bosTau1/bed/quality source javaenv # should have written to local scratch on kolossus /usr/java/jdk1.5.0_01/bin/java ScaffoldQuality bosTau1 gold > gold.run.3 wigEncode gold.run.3 quality.wig.3 quality.wib.3 # upper limit 90.0, lower limit 0.0 ssh hgwdev ln -s /cluster/data/bosTau1/bed/quality/quality.wib.3 /gbdb/bosTau1/wib cd /cluster/data/bosTau1/bed/quality hgLoadWiggle -pathPrefix=/gbdb/bosTau1/wib bosTau1 quality quality.wig.3 # GENSCAN (DONE Nov. 
# 17, 2004 Heather)
# uses hard-masked sequence files
ssh hgwdev
mkdir /cluster/data/bosTau1/bed/genscan
cd /cluster/data/bosTau1/bed/genscan
cvs co hg3rdParty/genscanlinux
mkdir gtf pep subopt
# Run on small cluster (more memory than big cluster).
ssh kki
# generate list of hard-masked scaffolds that are not all-N's
# (genscan crashes if all-N's)
foreach f ( `ls -1S /cluster/data/bosTau1/hardmask-split/*` )
    egrep '[ACGT]' $f > /dev/null
    if ($status == 0) echo $f >> genome.list
end
wc -l genome.list
# Create gsub
cat << '_EOF_' > gsub
#LOOP
/cluster/bin/x86_64/gsBig {check in line+ $(path1)} {check out line gtf/$(root1).gtf} -trans={check out line pep/$(root1).pep} -subopt={check out line subopt/$(root1).bed} -exe=hg3rdParty/genscanlinux/genscan -par=hg3rdParty/genscanlinux/HumanIso.smat -tmp=/tmp -window=2400000
#ENDLOOP
'_EOF_'
# << this line makes emacs coloring happy
/cluster/bin/i386/gensub2 genome.list single gsub jobList
para create jobList
para try, check, push, check, ...
ls -1 gtf | wc -l
endsInLf gtf/*
# Concatenate
ssh kksilo
cd /cluster/data/bosTau1/bed/genscan
cat gtf/*.gtf > genscan.gtf
cat pep/*.pep > genscan.pep
cat subopt/*.bed > genscanSubopt.bed
# Load
ssh hgwdev
cd /cluster/data/bosTau1/bed/genscan
ldHgGene -gtf bosTau1 genscan genscan.gtf
hgPepPred bosTau1 generic genscanPep genscan.pep
hgLoadBed bosTau1 genscanSubopt genscanSubopt.bed
featureBits bosTau1 genscan
# 49346362 bases of 2261116798 (2.182%) in intersection
# QA NOTE [ASZ: 9-11-2006]: used mytouch on genscanPep.
# sudo mytouch bosTau1 genscanPep 200504081400.00
# RE-INDEX TABLES TO ACCOMMODATE LONG SCAFFOLD NAMES
ssh hgwdev
# Note: next time around rmsk index redo won't be necessary,
# because of modifications to hgLoadOut. Note we're just
# doing an extremely simplified (chrom-only) index in most
# of these cases because the scaffolds are so small.
hgsql bosTau1 -e 'drop index bin on rmsk' hgsql bosTau1 -e 'create index genoName on rmsk(genoName(16))' hgsql bosTau1 -e 'drop index genoStart on rmsk' hgsql bosTau1 -e 'drop index genoEnd on rmsk' # Next time around simpleRepeat index redo also not necessary, # because of changes to simpleRepeat.sql hgsql bosTau1 -e 'drop index chrom on simpleRepeat' hgsql bosTau1 -e 'drop index chrom_2 on simpleRepeat' hgsql bosTau1 -e 'drop index chrom_3 on simpleRepeat' hgsql bosTau1 -e 'create index chrom on simpleRepeat(chrom(16),bin)' # I'm not sure if the all_est/intronEst/all_mrna will need # redoing next time or not. Please check with show index from table. hgsql bosTau1 -e 'drop index tName on all_est' hgsql bosTau1 -e 'drop index tName_2 on all_est' hgsql bosTau1 -e 'drop index tname_3 on all_est' hgsql bosTau1 -e 'create index tName on all_est(tName(16))' hgsql bosTau1 -e 'drop index tName on intronEst' hgsql bosTau1 -e 'drop index tName_2 on intronEst' hgsql bosTau1 -e 'drop index tname_3 on intronEst' hgsql bosTau1 -e 'create index tName on intronEst(tName(16))' hgsql bosTau1 -e 'drop index tName on all_mrna' hgsql bosTau1 -e 'drop index tName_2 on all_mrna' hgsql bosTau1 -e 'drop index tname_3 on all_mrna' hgsql bosTau1 -e 'create index tName on all_mrna(tName(16))' # MAKE Human Proteins track (hg17 DONE 2004-12-20 braney) ssh kksilo mkdir -p /cluster/data/bosTau1/blastDb cd /cluster/data/bosTau1/blastDb faSplit sequence ../downloads/linearScaffolds/Btau20040927-freeze-assembly.fa 500 x for i in x* do formatdb -i $i -p F done rm *.log *.fa ssh kkr1u00 mkdir -p /iscratch/i/bosTau1/blastDb cp /cluster/data/bosTau1/blastDb/* /iscratch/i/bosTau1/blastDb (iSync) > sync.out mkdir -p /cluster/data/bosTau1/bed/tblastn.hg17KG cd /cluster/data/bosTau1/bed/tblastn.hg17KG ls -1S /iscratch/i/bosTau1/blastDb/*.nsq | sed "s/\.nsq//" > query.lst # back to kksilo exit # we want around 200000 jobs cd /cluster/data/bosTau1/bed/tblastn.hg17KG calc `wc 
/cluster/data/hg17/bed/blat.hg17KG/hg17KG.psl | awk "{print \\\$1}"`/\(200000/`wc query.lst | awk "{print \\\$1}"`\) # 42156/(200000/494) = 104.125320 mkdir -p /cluster/bluearc/bosTau1/bed/tblastn.hg17KG/kgfa split -l 104 /cluster/data/hg17/bed/blat.hg17KG/hg17KG.psl /cluster/bluearc/bosTau1/bed/tblastn.hg17KG/kgfa/kg ln -s /cluster/bluearc/bosTau1/bed/tblastn.hg17KG/kgfa kgfa cd kgfa for i in *; do pslxToFa $i $i.fa; rm $i; done cd .. ls -1S kgfa/*.fa > kg.lst mkdir -p /cluster/bluearc/bosTau1/bed/tblastn.hg17KG/blastOut ln -s /cluster/bluearc/bosTau1/bed/tblastn.hg17KG/blastOut for i in `cat kg.lst`; do mkdir blastOut/`basename $i .fa`; done tcsh cat << '_EOF_' > blastGsub #LOOP blastSome $(path1) {check in line $(path2)} {check out exists blastOut/$(root2)/q.$(root1).psl } #ENDLOOP '_EOF_' cat << '_EOF_' > blastSome #!/bin/sh BLASTMAT=/iscratch/i/blast/data export BLASTMAT g=`basename $2` f=/tmp/`basename $3`.$g for eVal in 0.01 0.001 0.0001 0.00001 0.000001 1E-09 1E-11 do if /scratch/blast/blastall -M BLOSUM80 -m 0 -F no -e $eVal -p tblastn -d $1 -i $2 -o $f.8 then mv $f.8 $f.1 break; fi done if test -f $f.1 then if /cluster/bin/i386/blastToPsl $f.1 $f.2 then liftUp -nosort -type=".psl" -pslQ -nohead $3.tmp /cluster/data/hg17/bed/blat.hg17KG/protein.lft warn $f.2 if pslCheck -prot $3.tmp then mv $3.tmp $3 rm -f $f.1 $f.2 fi exit 0 fi fi rm -f $f.1 $f.2 $3.tmp $f.8 exit 1 '_EOF_' chmod +x blastSome gensub2 query.lst kg.lst blastGsub blastSpec ssh kk cd /cluster/data/bosTau1/bed/tblastn.hg17KG para create blastSpec para push # Completed: 200564 of 200564 jobs # CPU time in finished jobs: 44956749s 749279.15m 12487.99h 520.33d 1.426 y # IO & Wait Time: 2159792s 35996.53m 599.94h 25.00d 0.068 y # Average job time: 235s 3.92m 0.07h 0.00d # Longest job: 843s 14.05m 0.23h 0.01d # Submission to last job: 102195s 1703.25m 28.39h 1.18d cat << '_EOF_' > chainGsub #LOOP chainSome $(path1) #ENDLOOP '_EOF_' cat << '_EOF_' > chainSome (cd $1; cat q.*.psl | simpleChain -prot 
-outPsl -maxGap=150000 stdin ../c.`basename $1`.psl) '_EOF_' chmod +x chainSome ls -1dS `pwd`/blastOut/kg?? > chain.lst gensub2 chain.lst single chainGsub chainSpec ssh kki cd /cluster/data/bosTau1/bed/tblastn.hg17KG para create chainSpec para push # Completed: 406 of 406 jobs # CPU time in finished jobs: 1354s 22.57m 0.38h 0.02d 0.000 y # IO & Wait Time: 18021s 300.35m 5.01h 0.21d 0.001 y # Average job time: 48s 0.80m 0.01h 0.00d # Longest job: 116s 1.93m 0.03h 0.00d exit # back to kksilo cd /cluster/data/bosTau1/bed/tblastn.hg17KG/blastOut for i in kg?? do awk "(\$13 - \$12)/\$11 > 0.6 {print}" c.$i.psl > c60.$i.psl sort -rn c60.$i.psl | pslUniq stdin u.$i.psl awk "((\$1 / \$11) ) > 0.60 { print }" c60.$i.psl > m60.$i.psl echo $i done cat u.*.psl m60* | sort -T /tmp -k 14,14 -k 17,17n -k 17,17n | uniq > /cluster/data/bosTau1/bed/tblastn.hg17KG/blastHg17KG.psl cd .. ssh hgwdev cd /cluster/data/bosTau1/bed/tblastn.hg17KG hgLoadPsl bosTau1 blastHg17KG.psl # back to kksilo rm -rf blastOut # End tblastn # Pick up photo from NHGRI (DONE - 2004-12-22 - Hiram) ssh hgwdev cd /tmp wget \ 'http://www.genome.gov/Pages/News/Photos/Science/bovine_image.jpg' -O bovine_image.jpg # view that image in 'display' to determine crop edges, then: convert -crop 1280x800+280+240 -quality 80 -sharpen 0 \ -normalize bovine_image.jpg bbt.jpg convert -geometry 300x200 -quality 80 bbt.jpg Bos_taurus.jpg rm -f bbt.jpg bovine_image.jpg cp -p Bos_taurus.jpg /usr/local/apache/htdocs/images # add links to this image in the description.html page, request push # MAKE 11.OOC FILE FOR BLAT (REDONE, Jan. 
18, 2004, Heather) ssh kkr1u00 mkdir /cluster/data/bosTau1/bed/ooc cd /cluster/data/bosTau1/bed/ooc ls -1 /cluster/data/bosTau1/bosTau1.2bit > nib.lst /cluster/bin/i386/blat nib.lst /dev/null /dev/null -tileSize=11 \ -makeOoc=/cluster/bluearc/bosTau1/11.ooc -repMatch=800 # Wrote 1327650 overused 11-mers to /cluster/bluearc/bosTau1/11.ooc cp -p /cluster/bluearc/bosTau1/11.ooc /iscratch/i/bosTau1/ iSync # GENBANK # gbAlign for mrna ssh eieio cd /cluster/data/genbank nice bin/gbAlignStep -srcDb=genbank -type=mrna -initial -verbose=1 bosTau1 & # logged to # /cluster/data/genbank/var/build/logs/YYMMDDHHMMSS.bosTau1.initalign.log # load mrna ssh hgwdev cd /cluster/data/genbank nice bin/gbDbLoadStep -verbose=1 -drop -initialLoad bosTau1 # logged to /cluster/data/genbank/var/dbload/hgwdev/logs/YYMMDDHHMMSS.dbload.log # gbAlign for est ssh eieio cd /cluster/data/genbank nice bin/gbAlignStep -initial -srcDb=genbank -type=est bosTau1 # load the ESTs, reloading mRNAs as well ssh hgwdev cd /cluster/data/genbank nice bin/gbDbLoadStep -drop -initialLoad bosTau1 # add bosTau1 to list of databases to align in # /cluster/data/genbank/etc/align-genbank # GENBANK DOWNLOAD FILES (DONE, Jan. 
# 26 Heather)
ssh hgwbeta
cd /genbank
./bin/gbDbLoadStep -getDownloadSeqs=1 bosTau1
# output goes to /genbank/download/bosTau1/bigZips
# these are automatically rsynced to hgnfs1 (which hgdownload reads)
# SWAP human(target)/cow(query) alignments to make bosTau1.chainHg17
ssh kksilo
cd /cluster/data/bosTau1/bed
mkdir blastz
mkdir blastz/axtChain
/cluster/bin/i386/chainSwap zb.hg17/axtChain/all.chain.jan5.filtered blastz/axtChain/all.chain
cp all.chain all.chain.backup
# out of memory loading in one piece
# can't split by scaffold, too many files
# solution: split into 2 pieces
cd blastz/axtChain
mkdir split-2
/cluster/bin/i386/chainSplit -lump=2 split-2 all.chain
ssh hgwdev
hgsql bosTau1
drop table chainHg17
drop table chainHg17Link
cd split-2
# each of these takes about 4 hours
/cluster/bin/i386/hgLoadChain -tIndex bosTau1 chainHg17 000.chain
/cluster/bin/i386/hgLoadChain -tIndex -oldTable bosTau1 chainHg17 001.chain
# cleanup
ssh kksilo
cd /cluster/data/bosTau1/bed/blastz/axtChain
rm all.chain.backup
# bosTau1.netHg17
ssh kolossus
cd /cluster/data/bosTau1/bed/blastz/axtChain
# PRE
/cluster/bin/x86_64/chainPreNet all.chain ../../zb.hg17/S2.len ../../zb.hg17/S1.len chainPreNet.out
# chainNet
/cluster/bin/x86_64/chainNet chainPreNet.out -minSpace=1 ../../zb.hg17/S2.len ../../zb.hg17/S1.len hg17.net.raw /dev/null
# syntenic (using revision 1.7)
/cluster/home/heather/bin/x86_64/netSyntenic hg17.net.raw hg17.net.syn
# memory usage 2733064192, utime 5681 s/100, stime 617
# gzip
ssh kksilo
cd /cluster/data/bosTau1/bed/blastz/axtChain
gzip chainPreNet.out
gzip hg17.net.raw
# netClass
# lots of feedback as each scaffold is processed; should have redirected output
# takes about an hour
ssh hgwdev
cd /cluster/data/bosTau1/bed/blastz/axtChain
/cluster/bin/i386/netClass -noAr hg17.net.syn bosTau1 hg17 hg17.net
# load
# again, lots of feedback as each scaffold is processed
ssh hgwdev
cd /cluster/data/bosTau1/bed/blastz/axtChain
/cluster/bin/i386/netFilter -minGap=10 hg17.net |
/cluster/bin/i386/hgLoadNet bosTau1 netHg17 stdin # generate axts # don't need to split? # kksilo or kolossus? # yet again, lots of feedback as each scaffold is processed # WORKING HERE ssh kksilo mkdir ../axtNet /cluster/bin/i386/netToAxt hg17.net.syn all.chain /cluster/data/bosTau1/bosTau1.2bit /cluster/data/hg17/nib ../axtNet/all.axt # sort? # cleanup, compress, etc. ssh kksilo cd /cluster/data/bosTau1/bed/blastz/axtChain gzip all.chain gzip hg17.net.syn rm chain.tab link.tab cd /cluster/data/bosTau1/bed/blastz/axtNet cp all.axt all.axt.backup # MAKE VSHG17 DOWNLOADABLES (DONE Feb. 15, 2005 Heather) ssh hgwdev cd /usr/local/apache/htdocs/goldenPath/bosTau1 mkdir vsHg17 cd /cluster/data/bosTau1/bed/blastz/axtNet cp -p all.axt.gz /usr/local/apache/htdocs/goldenPath/bosTau1/vsHg17/axtNet.gz cd ../axtChain cp -p all.chain.gz /usr/local/apache/htdocs/goldenPath/bosTau1/vsHg17/human.chain.gz cp -p hg17.net.gz /usr/local/apache/htdocs/goldenPath/bosTau1/vsHg17/human.net.gz cd /usr/local/apache/htdocs/goldenPath/bosTau1/vsHg17 # Make a README.txt which explains the files & formats. md5sum *.gz > md5sum.txt # SELF BLASTZ / BLASTZ-RUN-UCSC EXPERIMENT (DONE 1/27/05 angie) # DELETED 9/20/05 angie (never used) # Put chrom.sizes in /iscratch/ since blastz-run-ucsc reads it when # lifting, and bosTau1 chrom.sizes is large. ssh kkr1u00 cp /cluster/data/bosTau1/chrom.sizes /iscratch/i/bosTau1/ iSync ssh kk mkdir /cluster/data/bosTau1/bed/blastz.bosTau1.2005-01-26 cd /cluster/data/bosTau1/bed/blastz.bosTau1.2005-01-26 ln -s blastz.bosTau1.2005-01-26 /cluster/data/bosTau1/bed/blastz.bosTau1 # 10M chunks led to hour-long average job time, so try 5M chunks: cat << '_EOF_' > DEF # Cow vs. 
self export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/i386:/cluster/home/angie/schwartzbin ALIGN=blastz-run BLASTZ=blastz BLASTZ_H=2000 BLASTZ_ABRIDGE_REPEATS=0 # TARGET - Cow SEQ1_DIR=/iscratch/i/bosTau1/nib/bosTau1.2bit SEQ1_IN_CONTIGS=0 SEQ1_CHUNK=5000000 SEQ1_LAP=10000 # QUERY - Cow SEQ2_DIR=/iscratch/i/bosTau1/nib/bosTau1.2bit SEQ2_IN_CONTIGS=0 SEQ2_CHUNK=5000000 SEQ2_LAP=10000 BASE=/cluster/data/bosTau1/bed/blastz.bosTau1.2005-01-26 DEF=$BASE/DEF RAW=$BASE/raw CDBDIR=$BASE SEQ1_LEN=/iscratch/i/bosTau1/chrom.sizes SEQ2_LEN=/iscratch/i/bosTau1/chrom.sizes '_EOF_' # << this line keeps emacs coloring happy mkdir run cd run mkdir lstFiles partitionSequence.pl 5000000 10000 /iscratch/i/bosTau1/nib/bosTau1.2bit \ /iscratch/i/bosTau1/chrom.sizes \ -xdir xdir.sh -rawDir ../axtParts -lstDir lstFiles \ > bosTau1.lst csh -ef xdir.sh cat << '_EOF_' > gsub #LOOP /cluster/home/angie/kent/src/utils/blastz-run-ucsc -outFormat axt -dropSelf $(path1) $(path2) ../DEF {check out exists ../axtParts/$(file1)/$(file1)_$(file2).axt} #ENDLOOP '_EOF_' # << this line keeps emacs coloring happy gensub2 bosTau1.lst bosTau1.lst gsub jobList para create jobList para try, check, push, check, ... 
# Here's the time for the first 10 jobs, to .axt: #Completed: 10 of 220900 jobs #CPU time in finished jobs: 9177s 152.95m 2.55h 0.11d 0.000 y #IO & Wait Time: 85s 1.41m 0.02h 0.00d 0.000 y #Average job time: 926s 15.44m 0.26h 0.01d #Longest job: 1047s 17.45m 0.29h 0.01d #Submission to last job: 1047s 17.45m 0.29h 0.01d # Here's the time for the first 10 jobs, to .psl.gz (should we believe the I/O?): #Completed: 10 of 220900 jobs #CPU time in finished jobs: 10709s 178.48m 2.97h 0.12d 0.000 y #IO & Wait Time: 0s 0.00m 0.00h 0.00d 0.000 y #Average job time: 1008s 16.79m 0.28h 0.01d #Longest job: 1238s 20.63m 0.34h 0.01d #Submission to last job: 1238s 20.63m 0.34h 0.01d # And from a previous run of the first 10 jobs, to lav: #Completed: 10 of 220900 jobs #CPU time in finished jobs: 9273s 154.55m 2.58h 0.11d 0.000 y #IO & Wait Time: 84s 1.40m 0.02h 0.00d 0.000 y #Average job time: 936s 15.60m 0.26h 0.01d #Longest job: 1060s 17.67m 0.29h 0.01d #Submission to last job: 1062s 17.70m 0.29h 0.01d # Jim said we don't really want this track, so no need to carry through # with the whole thing. But I did go through the steps below with # partial results to make sure that they work. # Combine the many small .psl.gz files into per-target-partition files # for chaining. #NOT DONE ssh kksilo cd /cluster/data/bosTau1/bed/blastz.bosTau1 mkdir psl foreach t (pslParts/*) zcat $t/*.psl.gz \ | sort -k 14,14 -k 16n,17n \ | gzip -c \ > psl/$t:t:r.psl.gz end # chain psl/*.psl.gz # LOAD CPGISSLANDS (DONE, Nov. 
27, 2004, Heather) ssh hgwdev mkdir -p /cluster/data/bosTau1/bed/cpgIsland cd /cluster/data/bosTau1/bed/cpgIsland # Build software from Asif Chinwalla (achinwal@watson.wustl.edu) cvs co hg3rdParty/cpgIslands cd hg3rdParty/cpgIslands make mv cpglh.exe /cluster/data/bosTau1/bed/cpgIsland/ ssh kksilo cd /cluster/data/bosTau1/bed/cpgIsland foreach f (../../softMasked2/*.fa) set fout=$f:t:r:r.cpg echo running cpglh on $f to $fout ./cpglh.exe $f > $fout end # Transform cpglh output to bed + cat << '_EOF_' > filter.awk /* Input columns: */ /* chrom, start, end, len, CpG: cpgNum, perGc, cpg:gpc, observed:expected */ /* chr1\t 41776\t 42129\t 259\t CpG: 34\t 65.8\t 0.92\t 0.94 */ /* Output columns: */ /* chrom, start, end, name, length, cpgNum, gcNum, perCpg, perGc, obsExp */ /* chr1\t41775\t42129\tCpG: 34\t354\t34\t233\t19.2\t65.8\t0.94 */ { $2 = $2 - 1; width = $3 - $2; printf("%s\t%d\t%s\t%s %s\t%s\t%s\t%0.0f\t%0.1f\t%s\t%s\n", $1, $2, $3, $5,$6, width, $6, width*$7*0.01, 100.0*2*$6/width, $7, $9); } '_EOF_' # << this line makes emacs coloring happy awk -f filter.awk chr*.cpg > cpgIsland.bed # load into database: ssh hgwdev cd /cluster/data/bosTau1/bed/cpgIsland hgLoadBed bosTau1 cpgIslandExt -tab -noBin \ -sqlTable=$HOME/kent/src/hg/lib/cpgIslandExt.sql cpgIsland.bed wc -l cpgIsland.bed # 44091 cpgIsland.bed featureBits bosTau1 cpgIslandExt # LOAD GENEID PREDICTIONS (DONE 2/25/05) ssh kksilo mkdir /cluster/data/bosTau1/bed/geneid cd /cluster/data/bosTau1/bed/geneid wget http://genome.imim.es/genepredictions/B.taurus/golden_path_200409/geneidv1.2/bosTau1.gtf wget http://genome.imim.es/genepredictions/B.taurus/golden_path_200409/geneidv1.2/bosTau1.prot # Add ".1" suffix to each item in .prot's, to match transcript_id's in gtf perl -wpe 's/^(>SCAFFOLD\S+)/$1.1/' bosTau1.prot > geneid.fa ssh hgwdev cd /cluster/data/bosTau1/bed/geneid ldHgGene -gtf -genePredExt bosTau1 geneid bosTau1.gtf hgPepPred bosTau1 generic geneidPep geneid.fa # QA NOTE [ASZ: 9-11-2006]: mytouch for 
geneidPep # sudo mytouch bosTau1 geneidPep 200504081400.00 # SWAP CHAINS FROM MM5, BUILD NETS ETC. (DONE 2/26/05 angie) ssh kksilo mkdir /cluster/data/bosTau1/bed/blastz.mm5.swap cd /cluster/data/bosTau1/bed/blastz.mm5.swap # Small cluster is quiet, workhorse needs to see cluster /scratch (and # kolossus has its own local /scratch/), so use kkr7u00: doBlastzChainNet.pl -swap /cluster/data/mm5/bed/blastz.bosTau1/DEF \ -workhorse=kkr7u00 >& do.log & tail -f do.log # Add {chain,net}Mm5 to trackDb.ra if necessary. # Add /usr/local/apache/htdocs/goldenPath/bosTau1/vsMm5/README.txt # MAKE COW TO HUMAN LIFTOVER (DONE Mar. 4, 2005, Heather) ssh kkr1u00 cd /cluster/data/bosTau1/bed/blastz/axtChain /cluster/bin/x86_64/netChainSubset hg17.net.gz all.chain.gz stdout | gzip -c > bosTau1.hg17.over.chain.gz cp bosTau1.hg17.over.chain.gz /cluster/data/bosTau1/bed/bedOver ssh hgwdev cd /gbdb/bosTau1 mkdir liftOver cd liftOver cp /cluster/data/bosTau1/bed/bedOver/bosTau1.hg17.over.chain.gz . cd /usr/local/apache/htdocs/goldenPath/bosTau1/liftOver cp /cluster/data/bosTau1/bed/bedOver/bosTau1.hg17.over.chain.gz . # hgsql hgcentraltest # insert into liftOverChain # (fromDb, toDb, path, minMatch, minSizeT, minSizeQ, multiple, minBlocks, fudgeThick) # values ("bosTau1", "hg17", "/gbdb/bosTau1/liftOver/bosTau1.hg17.over.chain", 0.95, 0, 0, "Y", 1, "N"); # ACCESSIONS FOR CONTIGS (DONE 2005-03-22 kate) # Used for Assembly track details, and also ENCODE AGP's cd /cluster/data/bosTau1/bed mkdir -p contigAcc cd contigAcc # extract WGS contig to accession mapping from Entrez Nucleotide summaries # To get the summary file, access the Genbank page for the project # by searching: # genus[ORGN] AND WGS[KYWD] # At the end of the page, click on the list of accessions for the contigs. # Select summary format, and send to file. 
# output to file .acc # The cow WGS project accession is: AAFC01000000 grep Bos cow.acc | wc -l # 795203 # edit hg/encode/regionAgp/contigAccession.pl to recognize cow format # and install in /cluster/bin/scripts contigAccession cow.acc > contigAcc.tab wc -l contigAcc.tab # 795203 hgsql bosTau1 < $HOME/kent/src/hg/lib/contigAcc.sql echo "LOAD DATA LOCAL INFILE 'contigAcc.tab' INTO TABLE contigAcc" | \ hgsql bosTau1 hgsql bosTau1 -e "SELECT COUNT(*) FROM contigAcc" # ######################################################################### # MOUSE NET/CHAINS MM6 - Info contained in makeMm6.doc (200503 Hiram) TODO: # LIFTOVER CHAIN TO BOSTAU2 (DONE 2005-09-08 galt) ssh kkstore02 cd /cluster/data/bosTau2 mkdir liftSplits/ mkdir liftSplits/lift mkdir liftSplits/split cat << _EOF_ > split.csh #!/bin/tcsh set liftDir = /cluster/data/bosTau2/liftSplits cd /cluster/data/bosTau2/softmask-split foreach n (*.fa) echo \$n faSplit -lift=\$liftDir/lift/\$n:r.lft size \$n -oneFile 3000 \$liftDir/split/\$n:r end _EOF_ chmod +x split.csh ./split.csh mkdir /san/sanvol1/scratch/bosTau2 cp -r liftSplits/ /san/sanvol1/scratch/bosTau2 cd /cluster/data/bosTau1 mkdir softTwoBit cd softTwoBit faSplit sequence ../bosTau1.softmask2.fa 33 x # 33 files avg size 75 MB foreach f ( *.fa ) faToTwoBit $f $f:r.2bit rm $f end # 33 files avg size 20MB cd .. mkdir /san/sanvol1/scratch/bosTau1 cp -r softTwoBit/ /san/sanvol1/scratch/bosTau1 cd /cluster/data/bosTau1 cp `which makeLoChain-align` . 
vi makeLoChain-align cp /cluster/bluearc/bosTau1/11.ooc /san/sanvol1/scratch/bosTau1/ # change so that it accepts 2bit files, not just nibs # Jim says using ooc file on target bosTau1 is important ssh pk ./makeLoChain-align bosTau1 /san/sanvol1/scratch/bosTau1/softTwoBit \ bosTau2 /san/sanvol1/scratch/bosTau2/liftSplits/split \ /san/sanvol1/scratch/bosTau1/11.ooc #Setting up blat in /cluster/data/bosTau1/bed/blat.bosTau2.2005-09-20 #Checking input files #3003 jobs written to batch # #First two lines of para spec: #blat /san/sanvol1/scratch/bosTau1/softTwoBit/x31.2bit /san/sanvol1/scratch/bosTau2/liftSplits/split/x089.fa {check out line+ ../raw/x31_x089.psl} -tileSize=11 -ooc=/san/sanvol1/scratch/bosTau1/11.ooc -minScore=100 -minIdentity=98 -fastMap #blat /san/sanvol1/scratch/bosTau1/softTwoBit/x09.2bit /san/sanvol1/scratch/bosTau2/liftSplits/split/x089.fa {check out line+ ../raw/x09_x089.psl} -tileSize=11 -ooc=/san/sanvol1/scratch/bosTau1/11.ooc -minScore=100 -minIdentity=98 -fastMap # #DO THIS NEXT: # cd /cluster/data/bosTau1/bed/blat.bosTau2.2005-09-20/run # para try, check, push, check, ... # ssh kkstore02 # ./makeLoChain-lift.csh bosTau1 bosTau2 cd /cluster/data/bosTau1/bed/blat.bosTau2.2005-09-20/run /parasol/bin/para push #3003 jobs in batch #1525 jobs (including everybody's) in Parasol queue. #Checking finished jobs #Completed: 3003 of 3003 jobs #CPU time in finished jobs: 150921s 2515.34m 41.92h 1.75d 0.005 y #IO & Wait Time: 12002s 200.04m 3.33h 0.14d 0.000 y #Average job time: 54s 0.90m 0.02h 0.00d #Longest running job: 0s 0.00m 0.00h 0.00d #Longest finished job: 651s 10.85m 0.18h 0.01d #Submission to last job: 3472s 57.87m 0.96h 0.04d # Lifting ssh kkstore02 cd /cluster/data/bosTau1/ cp `which makeLoChain-lift` . 
vi makeLoChain-lift # changed so uses softmask-split/ instead of nib/ # and uses x??_$c.psl instead of chr*_$c.psl # use kolossus because we lost our mapping to /san/sanvol1/ again ssh kolossus cd /cluster/data/bosTau1/bed/blat.bosTau2.2005-09-20/ ../../makeLoChain-lift bosTau1 bosTau2 /san/sanvol1/scratch/bosTau2/liftSplits/lift > lift.log & tail -f lift.log ssh kki cd /cluster/data/bosTau1 cp `which makeLoChain-chain` . vi makeLoChain-chain # fix to tolerate 2bit instead of only nib cd /cluster/data/bosTau1/bed/blat.bosTau2.2005-09-20/ ../../makeLoChain-chain bosTau1 /cluster/data/bosTau1/bosTau1.2bit \ bosTau2 /cluster/data/bosTau2/bosTau2.2bit cd /cluster/data/bosTau1/bed/blat.bosTau2.2005-09-20/chainRun para push para check para time #[kki:/cluster/data/bosTau1/bed/blat.bosTau2.2005-09-20/chainRun> para time #91 jobs in batch #0 jobs (including everybody's) in Parasol queue. #Checking finished jobs #Completed: 91 of 91 jobs #CPU time in finished jobs: 560372s 9339.53m 155.66h 6.49d 0.018 y #IO & Wait Time: 14128s 235.47m 3.92h 0.16d 0.000 y #Average job time: 6313s 105.22m 1.75h 0.07d #Longest running job: 0s 0.00m 0.00h 0.00d #Longest finished job: 220728s 3678.80m 61.31h 2.55d #Submission to last job: 220728s 3678.80m 61.31h 2.55d #------ Optional Speedup ------------------------------- # Because I am having trouble with the bin0 x089.psl 1.4G 12million lines, # I am going to split it carefully into a multiple cluster job, since this # hippo has been running for days. I checked out the structure of the # psl and chain files and the order that everything is sorted in. # I believe this will work correctly without side-effects. 
ssh kkstore02 cd /cluster/data/bosTau1/bed/blat.bosTau2.2005-09-20/psl/ ~/bin/i386/pslSplitOnTarget -maxTargetCount=100 -lump x089.psl bin0/ # made 100 psl files 000.psl to 099.psl in bin0/ from 7MB to 32MB in size ls -1S *.psl > pslList wc -l pslList # Create gsub { cat << '_EOF_' > gsub #LOOP axtChain -psl {check in line+ $(path1)} /cluster/data/bosTau1/bosTau1.2bit /cluster/data/bosTau2/bosTau2.2bit {check out line $(root1).chain} #ENDLOOP '_EOF_' # << this line makes emacs coloring happy } /cluster/bin/i386/gensub2 pslList single gsub jobList ssh kk cd /cluster/data/bosTau1/bed/blat.bosTau2.2005-09-20/psl/bin0 para create jobList para try, check, push, check, ... #re-merge using chainMergeSort to x089.chain: chainMergeSort *.chain > ../../chainRaw/x089.chain #This approach seems to work well for this situation. #Feel free to use it if you have # one big artificial chrom query against many small scaffolds target #------------------------------------- ssh kkstore02 cd /cluster/data/bosTau1/ cp `which makeLoChain-net` . vi makeLoChain-net # added option -lump=33 to chainSplit command # (because otherwise it says "too many files open" since cow1 has many SCAFFOLDS) # chainSplit splits by target by default (cow1) # script expects directory with current date cd bed ln -s blat.bosTau2.2005-09-20 blat.bosTau2.2005-09-23 cd .. # do it ./makeLoChain-net bosTau1 bosTau2 >& mklc-net.log ssh hgwdev cd /cluster/data/bosTau1/ cp /cluster/scripts/makeLoChain-load . # Make sure your .hg.conf central and backupcentral # are pointing to the right thing for hgAddLiftOverChain makeLoChain-load bosTau1 bosTau2 >& mklc-load.log #Connected to central database #Inserting record in liftOverChain #Finished loading bosTau1ToBosTau2.over.chain #Now, add download link to hgLiftOver for #/usr/local/apache/htdocs/goldenPath/bosTau1/liftOver/bosTau1ToBosTau2.over.chain.gz