# for emacs: -*- mode: sh; -*- # http://www.hgsc.bcm.tmc.edu/project-species-p-Rhesus%20Macaque.hgsc?pageLocation=Rhesus%20Macaque # http://www.ncbi.nlm.nih.gov/Traces/wgs/?val=AANU00 # Macaca Mulatta (rheMac2) # Venter Institute final split merge assembly using washU and baylor # NOTE: this doc may have genePred loads that fail to include # the bin column. Please correct that for the next build by adding # a bin column when you make any of these tables: # # mysql> SELECT tableName, type FROM trackDb WHERE type LIKE "%Pred%"; # +-------------+---------------------------------+ # | tableName | type | # +-------------+---------------------------------+ # | xenoRefGene | genePred xenoRefPep xenoRefMrna | # | nscanGene | genePred nscanPep | # +-------------+---------------------------------+ # DOWNLOAD SEQUENCE (DONE jan27 2006 Robert) ssh kkstore01 mkdir /cluster/store2/rheMac2 mkdir /cluster/store2/rheMac2/final cd /cluster/data ln -s /cluster/store2/rheMac2/final rheMac2 cd /cluster/data/rheMac2 mkdir downloads cd downloads The macaque genome assembly (v. 1.0, Mmul051212) has been posted to the macaque genome ftp site: ftp://macftp@ftp.macaquegenome.hgsc.bcm.tmc.edu/ The pw is analysis . mkdir final cd final mkdir download cd download # A. The assembly itself is represented in three groups, 1) the main assembly, 2) the Chr#_random and ChrU set, and 3) the ChrUr set (a late addition to the ChrU set). #AGP and Fasta chromosome files for these three sets are: # 1) Main Assembly - these are scaffolds, not chromosomes. wget ftp://macftp:analysis@ftp.macaquegenome.hgsc.bcm.tmc.edu/Assemblies/VI/FinalMergeSplit/v1-edit4.trimNoContam.agp.bz2 wget ftp://macftp:analysis@ftp.macaquegenome.hgsc.bcm.tmc.edu/Assemblies/VI/FinalMergeSplit/v1_edit4.scf.fasta.bz2 # 2) Mapped to chromosomes but unplaced Chr#_random, and unmapped contigs ChrU wget ftp://macftp:analysis@ftp.macaquegenome.hgsc.bcm.tmc.edu/Assemblies/VI/FinalMergeSplit/v1.edit4.random.ctgs.agp.bz2 wget ftp://macftp:analysis@ftp.macaquegenome.hgsc.bcm.tmc.edu/Assemblies/VI/FinalMergeSplit/v1.edit4.random.scfs.agp.bz2 wget ftp://macftp:analysis@ftp.macaquegenome.hgsc.bcm.tmc.edu/Assemblies/VI/FinalMergeSplit/v1_edit4.random.chrm.fasta.bz2 # 3) Late addition of 353 contigs to ChrU wget ftp://macftp:analysis@ftp.macaquegenome.hgsc.bcm.tmc.edu/Assemblies/VI/FinalMergeSplit/v1.edit4.ChrUr.chrm.fasta.bz2 wget ftp://macftp:analysis@ftp.macaquegenome.hgsc.bcm.tmc.edu/Assemblies/VI/FinalMergeSplit/v1.edit4.ChrUr.ctgs.agp.bz2 wget ftp://macftp:analysis@ftp.macaquegenome.hgsc.bcm.tmc.edu/Assemblies/VI/FinalMergeSplit/v1.edit4.ChrUr.scfs.agp.bz2 # B. The contigs fasta and quality files that go into these are in two sets, the bulk of the files (those in parts 1 and 2 of the assembly above) and the few that were added back at the last minute (part 3 above). # 1) wget ftp://macftp:analysis@ftp.macaquegenome.hgsc.bcm.tmc.edu/Assemblies/VI/NewMergeSplit/v1.edit3.noContam.ctg.fasta.bz2 # 2) wget ftp://macftp:analysis@ftp.macaquegenome.hgsc.bcm.tmc.edu/Assemblies/VI/NewMergeSplit/v1.edit3.noContam.ctg.qv.bz2 # 3) The 353 contigs that were added back at the last minute are included in the following files (other contigs in these files are not included in the assembly) wget ftp://macftp:analysis@ftp.macaquegenome.hgsc.bcm.tmc.edu/Assemblies/VI/NewMergeSplit/v1.edit3.degen.noContam.fasta.bz2 wget ftp://macftp:analysis@ftp.macaquegenome.hgsc.bcm.tmc.edu/Assemblies/VI/NewMergeSplit/v1.edit3.degen.noContam.qv.bz2 # C. The final assembly file in the VI internal assembly format is: wget ftp://macftp:analysis@ftp.macaquegenome.hgsc.bcm.tmc.edu/Assemblies/VI/FinalMergeSplit/v1.edit4.asm.bz2 #D. The chromosome files are below, note, all gaps are listed as "fragment yes" in the agp files, although some are spanned by clones and others are not (making use of human synteny or macaque map information instead). These files will be corrected to list the uncaptured gaps as "clone no", but the intervals and other information will not change, so use these for other information now. wget -q ftp://macftp:analysis@ftp.macaquegenome.hgsc.bcm.tmc.edu/Assemblies/VI/FinalMergeSplit/v1.edit4.chrm.fasta.corr.bz2 wget -q ftp://macftp:analysis@ftp.macaquegenome.hgsc.bcm.tmc.edu/Assemblies/VI/FinalMergeSplit/v1.edit4.chrome.ctgs.agp.corr.bz2 wget -q ftp://macftp:analysis@ftp.macaquegenome.hgsc.bcm.tmc.edu/Assemblies/VI/FinalMergeSplit/v1.edit4.chrome.scfs.agp.corr.bz2 bunzip2 *.bz2 #wrap extra long lines in fasta fold v1_edit4.random.chrm.fasta |sed -e 's/ChrUr/chrUr/' > rheMac2_random.fa fold v1.edit4.chrm.fasta.corr |sed -e 's/ChrUr/chrUr/' >rheMac2.fa fold v1_edit4.scf.fasta |sed -e 's/ChrUr/chrUr/' > rheMac2.scf.fa mkdir ../Ur fold v1.edit4.ChrUr.chrm.fasta |sed -e 's/ChrUr/chrUr/' > ../Ur/chrUr.fa mv v1.edit4.chrome.ctgs.agp.corr v1.edit4.chrome.ctgs.corr.agp mv v1.edit4.chrome.scfs.agp.corr v1.edit4.chrome.scfs.corr.agp #*** NOTE FOR NEXT TIME *** use makeGenomeDb.pl instead of the following # several sections: # Check that IDs in .fa match IDs in .qual ssh kkstore01 cd /cluster/data/rheMac2/downloads # foreach Bin0 contigs linearScaffolds reads grep ">" *.fasta > id.fa grep ">" *.qv > id.qual diff id.fa id.qual # change chromosomes to lower case sed -e 's/Chr/chr/' v1.edit4.chrome.ctgs.agp.final > v1.edit4.chrome.ctgs.final.fix2.agp cat v1.edit4.chrome.scfs.agp.final |sed -e 's/Chr/chr/' > v1.edit4.final.fix2.agp hgGoldGapGl -noGl rheMac2 /cluster/data/rheMac2/downloads/v1.edit4.final.fix2.agp /cluster/bin/i386/checkAgpAndFa v1.edit4.final.fix.agp rheMac2.fa > check.out # SPLIT UP chromosome and make 500k chunks for RM ssh kkstore02 cd /cluster/data/rheMac2 faSplit byname downloads/rheMac2.fa chrom for i in `awk '{print $1}' chrom.sizes` ; do echo $i ; mkdir -p ${i##chr} ; mv $i.fa ${i##chr} ; pushd ${i##chr} ; faSplit -lift=$i.lft size $i.fa 500000 split ; popd; done # generate list find scaffolds -print > scaffolds.list.0 grep fa scaffolds.list.0 > scaffolds.list #### MAKE 2BIT NIB FILE ssh kkstore01 nice /cluster/bin/i386/faToTwoBit rheMac2.fa rheMac2.2bit nice /cluster/bin/i386/twoBitToFa rheMac2.2bit check.fa mv rheMac2.2bit .. # could also use cmp # note: size match faSize -detailed=on rheMac2.fa >mac.len faSize -detailed=on check.fa >check.len diff mac.len check.len wc -l *.len ssh hgwdev mkdir /gbdb/rheMac2 ln -s /cluster/data/rheMac2/rheMac2.2bit /gbdb/rheMac2/rheMac2.2bit #### CREATE DATABASE (DONE Jan 27 , 2006 Robert) ssh hgwdev hgsql hg17 create database rheMac2; use rheMac2; create table grp (PRIMARY KEY(NAME)) select * from hg17.grp; # add rows to hgcentraltest on hgwbeta in dbDb and defaultDb # set orderKey past dog, before chimp # LOAD GAP & GOLD TABLES FROM AGP ssh hgwdev hgGoldGapGl -noGl rheMac2 /cluster/data/rheMac2/downloads/v1.edit4.chrome.scfs.final.fix.agp # CREATE GAP TABLE from v1.edit4.chrome.ctgs.final.fix.agp to get gaps ssh hgwdev cd kent/src/hg/lib # remove bin column for now hgsql rheMac2 < gap2.sql cd /cluster/data/rheMac2/downloads grep fragment v1.edit4.chrome.ctgs.final.fix.agp > fragment.txt grep clone v1.edit4.chrome.ctgs.final.fix.agp >> fragment.txt hgsql rheMac2 load data local infile 'fragment.txt' into table gap # create trackDb/monkey/rheMac2/gap.html # CREATE CONTIG TABLE (DONE Feb 15,2006 Robert) ssh hgwdev cd kent/src/hg/lib cp ~/kent/src/hg/lib/ctgPos.sql . # edit ctgPos.sql to increase size of primary key to 20 # this doesn't include strand or contig start/end hgsql rheMac2 < ctgPos.sql cd /cluster/data/rheMac2/downloads grep -v clone v1.edit4.chrome.ctgs.final.fix.agp |grep -v fragment > Contig.out ssh kkstore02 cd /cluster/data/rheMac2/downloads getContig.pl < Contig.out > ctgPos ssh hgwdev load data local infile 'ctgPos' into table ctgPos #convert to half open and fix bug in agp file hgsql rheMac2 -e "update ctgPos set chromStart = chromStart -1 " hgsql rheMac2 -e "update ctgPos set size = size -1 " # CREATE CHROMINFO TABLE # an improvement here would be to have getMaxCoord output the 2bit file name ssh hgwdev cd /cluster/data/rheMac2 # get coordinates from agp and put in database table getcoords.pl < downloads/v1.edit4.chrome.scfs.final.fix.agp > scaffolds.coords hgsql rheMac2 < /cluster/data/bosTau1/coords.sql load data local infile 'scaffolds.coords' into table scaffoldCoords faSize rheMac2.fa -detailed=on > chrom.sizes # calculate maximum coords; check that start coord is always 1 ## /cluster/home/heather/bin/i386/GetMaxCoord > getMaxCoord.rheMac2 edit chromInfo.sql; allow fileName to be null cp ~baertsch/kent/src/hg/lib/chromInfo.sql . hgsql rheMac2 < chromInfo.sql load data local infile 'chrom.sizes' into table chromInfo update chromInfo set fileName = "/gbdb/rheMac2/rheMac2.2bit" # REPEATMASKER # using the split scaffold fa files generated earlier #*** NOTE FOR NEXT TIME *** use doRepeatMasker.pl instead. # note the version of RepeatMasker in use; will want this for download README cat /cluster/bluearc/RepeatMasker/Libraries/version # RM database version 20060120 # do a trial run ssh hgwdev cd /cluster/data/rheMac2/scaffolds/0/0/0 /cluster/bluearc/RepeatMasker/RepeatMasker -ali -s -spec macaca Contig1000.fa # configure cd /cluster/data/rheMac2 mkdir jkStuff cp /cluster/data/anoGam1/jkStuff/RMAnopheles jkStuff/RMRhesus # change references anoGam1 --> rheMac2 mkdir RMRun # /bin/csh makeJoblist-RM.csh # do the run (DONE Jan 30, 2006 Robert) ssh pk cd /cluster/data/rheMac2/RMRun para create RMJobs.2 para try para push # concatenate into one output file; took about 90 minutes ssh kkstore01 cd /cluster/data/rheMac2 mkdir repeats /bin/tcsh concatRM.csh cd repeats # use grep -v to get rid of all the headers # then, add back in an initial header # this is so hgLoadOut is happy grep chr repeats.all > repeats.clean cat repeats.header repeats.clean > repeats.final grep "There were no repetitive sequences " repeats.final > repeats.notfound grep -v "There were no repetitive sequences " repeats.final > repeat.out ssh hgwdev hgLoadOut rheMac2 repeat.out 2>errors # Strange perc. fields: Processing repeats.out Strange perc. field -6.4 line 652513 of repeat.out Strange perc. field -2236.0 line 749163 of repeat.out Strange perc. field -2637.9 line 749163 of repeat.out Strange perc. field -99.3 line 749163 of repeat.out Strange perc. field -0.3 line 1386112 of repeat.out Strange perc. field -0.2 line 2045133 of repeat.out Strange perc. field -0.4 line 2429164 of repeat.out Strange perc. field -4.8 line 2778536 of repeat.out Strange perc. field -0.4 line 2912076 of repeat.out Strange perc. field -5.5 line 3498109 of repeat.out Strange perc. field -1.0 line 3642974 of repeat.out Strange perc. field -2.7 line 3642974 of repeat.out Strange perc. field -263.9 line 3711627 of repeat.out Strange perc. field -187.0 line 3711627 of repeat.out Strange perc. field -3.4 line 3795344 of repeat.out Strange perc. field -0.1 line 3932239 of repeat.out Strange perc. field -0.2 line 4092279 of repeat.out Strange perc. field -0.1 line 4240026 of repeat.out Strange perc. field -0.1 line 4440809 of repeat.out Strange perc. field -7915.8 line 4506386 of repeat.out Strange perc. field -0.1 line 4514494 of repeat.out Strange perc. field -4361.2 line 4667281 of repeat.out Strange perc. field -840.4 line 4667281 of repeat.out Loading up table repeats_rmsk note: 997 records dropped due to repStart > repEnd hgsql rheMac2 rename table repeats_rmsk to rmsk # select count(*) from rmsk; # SIMPLE REPEATS (DONE Jan 30, 2006 Robert) # put the results throughout the scaffold 0/0/0 directories, # same as RepeatMasker, to avoid too many files in the same directory # *** BUG: FILTERING WAS NOT DONE --> OVERMASKING *** #*** NOTE FOR NEXT TIME *** use doSimpleRepeat.pl instead. ssh kkstore01 cd /cluster/data/rheMac2 mkdir bed cd bed mkdir simpleRepeat cd simpleRepeat /bin/csh makeJoblist-trf.csh tcsh trf-run.csh > & ! trf.log & # concatenate into one output file; took about an hour cat chr*.bed > simpleRepeat.bed # load /cluster/bin/i386/hgLoadBed rheMac2 simpleRepeat simpleRepeat.bed \ -sqlTable = /cluster/home/heather/kent/src/hg/lib/simpleRepeat.sql #Reading trf.bed #Loaded 628275 elements of size 16 #Sorted #Saving bed.tab #Loading rheMac2 create index chrom_2 on simpleRepeat (chrom(16), chromStart); create index chrom_3 on simpleRepeat (chrom(16), chromEnd); # CREATE MASKED FA USING REPEATMASKER AND FILTERED TRF FILES ssh kkstore02 cd /cluster/data/rheMac2 /cluster/bin/i386/maskOutFa -soft downloads/rheMac2.fa repeats/repeat.out rheMac2.softmask.fa WARNING: negative rEnd: -11 chr1:17040504-17040634 MLT2B4 WARNING: negative rEnd: -15 chr1:39400221-39400370 MLT2B4 WARNING: negative rEnd: -28 chr1:75866521-75866637 MLT2B4 WARNING: negative rEnd: -104 chr1:102866722-102866763 MLT2B4 WARNING: negative rEnd: -52 chr1:132930605-132930673 MLT2B4 WARNING: negative rEnd: -686 chr1:133186227-133186372 L1MCc WARNING: negative rEnd: -203 chr1:133186415-133186749 L1MCc WARNING: negative rEnd: -455 chr1:171379505-171379760 L1M3e WARNING: negative rEnd: -17 chr1:216018086-216018189 L1MCb WARNING: negative rEnd: -198 chr1:226920721-226920777 L1M4 WARNING: negative rEnd: -131 chr1:226921141-226921200 L1M4 WARNING: negative rEnd: -3 chr1:226921979-226922094 L1M4 WARNING: negative rEnd: -335 chr1:227071405-227071650 L1MCc WARNING: negative rEnd: -278 chr2:16216076-16216253 L1P4b WARNING: negative rEnd: -72 chr2:18621605-18622180 L1PBa WARNING: negative rEnd: -285 chr2:24878971-24879214 L1MCc WARNING: negative rEnd: -419 chr2:25206834-25207017 L1MCc WARNING: negative rEnd: -183 chr2:25207033-25207203 L1MCc WARNING: negative rEnd: -3 chr2:48529930-48529991 MER6 WARNING: negative rEnd: -426 chr2:80341345-80341639 L1M2c WARNING: negative rEnd: -314 chr2:82400541-82400763 L1MDa WARNING: negative rEnd: -1015 chr2:82731009-82731348 L1ME1 WARNING: negative rEnd: -282 chr2:82731554-82731973 L1ME1 WARNING: negative rEnd: -129 chr2:113802567-113802946 L1MCc WARNING: negative rEnd: -528 chr2:129549508-129549693 L1M3e WARNING: negative rEnd: -437 chr2:160075806-160076108 L1M2 WARNING: negative rEnd: -21 chr3:39033062-39033110 MER6 WARNING: negative rEnd: -271 chr3:58615645-58615852 L1MEb WARNING: negative rEnd: -202 chr3:70911642-70911963 L1M5 WARNING: negative rEnd: -33 chr3:71817441-71818111 L1M3e WARNING: negative rEnd: -316 chr3:112797698-112798023 L1PB3 WARNING: negative rEnd: -17 chr3:113476319-113476407 FRAM WARNING: negative rEnd: -27 chr3:116802770-116802992 L1MB7 WARNING: negative rEnd: -4 chr3:126585635-126585722 FRAM WARNING: negative rEnd: -632 chr3:183268961-183269045 L1M3b WARNING: negative rEnd: -1132 chr4:16324742-16324819 L1MEb WARNING: negative rEnd: -930 chr4:38188841-38189000 L1MEd WARNING: negative rEnd: -448 chr4:122440460-122440750 L1M3e WARNING: negative rEnd: -39 chr5:7580689-7581026 L1MEe WARNING: negative rEnd: -1956 chr5:27414234-27414442 L1MB8 WARNING: negative rEnd: -1682 chr5:27414453-27414671 L1MB8 WARNING: negative rEnd: -513 chr5:66093155-66093311 L1M4b WARNING: negative rEnd: -35 chr5:97364737-97364827 L1MCa WARNING: negative rEnd: -1077 chr5:109355971-109356130 L1MEb WARNING: negative rEnd: -20 chr5:109357351-109357727 L1MEb WARNING: negative rEnd: -102 chr5:119072794-119072945 L1MB4 WARNING: negative rEnd: -9 chr5:144123391-144123455 L1M3de WARNING: negative rEnd: -59 chr5:150681576-150681827 L1M4c WARNING: negative rEnd: -461 chr5:170348753-170349060 L1M3e WARNING: negative rEnd: -138 chr5:170614438-170615156 L1M1 WARNING: negative rEnd: -103 chr5:180911655-180912016 L1MEd WARNING: negative rEnd: -38 chr5:181275921-181276086 L1M4 WARNING: negative rEnd: -219 chr6:326885-327048 L1MCc WARNING: negative rEnd: -243 chr6:18237822-18238057 L1MC WARNING: negative rEnd: -458 chr6:24563135-24563511 L1M4b WARNING: negative rEnd: -14 chr6:27838867-27838984 MLT2B4 WARNING: negative rEnd: -92 chr6:33082868-33082962 L1M4 WARNING: negative rEnd: -505 chr6:60951240-60951389 L1PB3 WARNING: negative rEnd: -9 chr6:76795983-76796095 MLT2B4 WARNING: negative rEnd: -1039 chr6:153171051-153171255 L1MEb WARNING: negative rEnd: -448 chr6:156038512-156038675 L1M4b WARNING: negative rEnd: -1797 chr6:172500974-172501240 L1M4b WARNING: negative rEnd: -717 chr7:6530435-6530561 L1M4b WARNING: negative rEnd: -180 chr7:6530567-6530816 L1M4b WARNING: negative rEnd: -395 chr7:17573119-17573355 L1M4b WARNING: negative rEnd: -713 chr7:34841140-34841243 L1PA12 WARNING: negative rEnd: -917 chr7:36602478-36602794 L1MEd WARNING: negative rEnd: -993 chr7:49724690-49725009 L1MEc WARNING: negative rEnd: -519 chr7:64924143-64924287 L1PA15-16 WARNING: negative rEnd: -170 chr7:64924684-64924986 L1PA15-16 WARNING: negative rEnd: -1526 chr7:130257854-130258027 L1MDa WARNING: negative rEnd: -78 chr8:18037533-18037781 L1M3e WARNING: negative rEnd: -643 chr8:25868692-25868877 L1MCc WARNING: negative rEnd: -5 chr8:37458013-37458057 FRAM WARNING: negative rEnd: -420 chr8:40430422-40430587 L1MCc WARNING: negative rEnd: -73 chr8:53713425-53713503 MLT2B4 WARNING: negative rEnd: -38 chr8:58045714-58046896 L1MEd WARNING: negative rEnd: -46 chr8:70060490-70060930 L1M4 WARNING: negative rEnd: -3 chr8:71535410-71535460 FRAM WARNING: negative rEnd: -781 chr8:71840470-71840637 L1MCc WARNING: negative rEnd: -613 chr8:71841566-71841717 L1MCc WARNING: negative rEnd: -17 chr9:17870182-17870240 FRAM WARNING: negative rEnd: -1077 chr9:36754551-36754681 L1MEb WARNING: negative rEnd: -354 chr9:36754801-36755003 L1MEb WARNING: negative rEnd: -189 chr9:53706555-53706958 L1MCc WARNING: negative rEnd: -28 chr9:55631220-55631257 MER6 WARNING: negative rEnd: -259 chr9:83699632-83700159 L1MA4A WARNING: negative rEnd: -412 chr10:93305328-93305380 L1MDb WARNING: negative rEnd: -290 chr10:93305684-93305806 L1MDb WARNING: negative rEnd: -232 chr11:93725539-93725691 L1MCc WARNING: negative rEnd: -3 chr11:102814963-102815011 FRAM WARNING: negative rEnd: -614 chr12:24319914-24321380 L1M4b WARNING: negative rEnd: -557 chr12:24323027-24323094 L1M4b WARNING: negative rEnd: -259 chr12:100959333-100959494 L1M1 WARNING: negative rEnd: -529 chr12:102350218-102350466 L1M2 WARNING: negative rEnd: -654 chr13:53814747-53815048 L1M4b WARNING: negative rEnd: -7 chr13:58060931-58061578 L1M4c WARNING: negative rEnd: -1125 chr13:72166501-72166711 L1ME1 WARNING: negative rEnd: -6 chr13:72540001-72540050 FRAM WARNING: negative rEnd: -397 chr13:82523197-82523664 L1M3e WARNING: negative rEnd: -119 chr13:120528068-120528107 MLT2B3 WARNING: negative rEnd: -1934 chr14:27100172-27100323 L1MCc WARNING: negative rEnd: -158 chr14:27101192-27101535 L1MCc WARNING: negative rEnd: -281 chr14:60812899-60813046 L1MDa WARNING: negative rEnd: -376 chr15:22274805-22275047 L1M4b WARNING: negative rEnd: -669 chr15:23733251-23733307 L1M3e WARNING: negative rEnd: -422 chr15:24246692-24246910 L1MCc WARNING: negative rEnd: -290 chr15:24247095-24247225 L1MCc WARNING: negative rEnd: -252 chr15:24247564-24247617 L1MCc WARNING: negative rEnd: -11 chr15:24247667-24247964 L1MCc WARNING: negative rEnd: -261 chr15:77304256-77304526 L1M4c WARNING: negative rEnd: -121 chr15:80339141-80339347 L1M4b WARNING: negative rEnd: -488 chr15:83773572-83773877 L1MEd WARNING: negative rEnd: -101 chr15:83773914-83774074 L1MEd WARNING: negative rEnd: -92 chr16:40096072-40096144 MLT2B4 WARNING: negative rEnd: -76 chr17:44749449-44749518 MLT2B4 WARNING: negative rEnd: -97 chr17:85835577-85836019 L1MCc WARNING: negative rEnd: -112 chr18:12965293-12965685 L1MCc WARNING: negative rEnd: -879 chr18:41292895-41293389 L1MEc WARNING: negative rEnd: -322 chr18:41294544-41295033 L1MEc WARNING: negative rEnd: -1137 chr18:59335523-59335630 L1MEb WARNING: negative rEnd: -102 chr19:5934741-5934887 L1M4 WARNING: negative rEnd: -6 chr19:11007926-11008075 FRAM WARNING: negative rEnd: -24 chr19:44501149-44501305 L1M3b WARNING: negative rEnd: -145 chr20:45957350-45957401 L1MDa WARNING: negative rEnd: -1106 chr20:74584188-74584318 L1ME1 WARNING: negative rEnd: -348 chr20:74584451-74584806 L1ME1 WARNING: negative rEnd: -711 chr20:80285832-80285915 L1MCc WARNING: negative rEnd: -37 chrX:4743643-4743679 MER6 WARNING: negative rEnd: -1941 chrX:48147060-48147184 L1M4b WARNING: negative rEnd: -99 chrX:54535186-54535349 L1M4 WARNING: negative rEnd: -606 chrX:80315181-80315657 L1P4b WARNING: negative rEnd: -65 chrX:97720783-97720963 L1M4 WARNING: negative rEnd: -24 chrX:120523759-120523982 L1ME1 WARNING: negative rEnd: -13 chrX:121193467-121193615 MLT2B4 WARNING: negative rEnd: -939 chrX:153527918-153528288 L1M4 # matches select count(*) from rmsk where repEnd < 0; +----------+ | count(*) | +----------+ | 131 | +----------+ # DOH -- THIS MASKS TOO MUCH! SHOULD HAVE USED FILTERED TRF! /cluster/bin/i386/maskOutFa -softAdd rheMac2.softmask.fa bed/simpleRepeat/simpleRepeat.bed rheMac2.softmask2.fa # hard masking (Ns instead of lower case) for download files # split for use by genscan /cluster/bin/i386/maskOutFa rheMac2.softmask2.fa hard rheMac2.hardmask.fa mkdir hardmask-split /cluster/bin/i386/faSplit about rheMac2.hardmask.fa 2000000 hardmask-split # DOWNLOAD FILES # REMAKE SOME BIGZIPS DOWNLOADS IN SAME FORMAT AS FOR OTHER ASSEMBLIES # WITH ONE FILE PER CHROMOSOME (DONE, 2006-06-12, hartera) #*** NOTE FOR NEXT TIME *** use makeDownloads.pl instead. ssh hgwdev cd /cluster/data/rheMac2 cp rheMac2.softmask.fa /usr/local/apache/htdocs/goldenPath/rheMac2/bigZips cp rheMac2.softmask2.fa /usr/local/apache/htdocs/goldenPath/rheMac2/bigZips cp rheMac2.hardmask.fa /usr/local/apache/htdocs/goldenPath/rheMac2/bigZips cd /usr/local/apache/htdocs/goldenPath/rheMac2/bigZips nice gzip rheMac2.softmask.fa nice gzip rheMac2.softmask2.fa nice gzip rheMac2.hardmask.fa # Remake some of the bigZips downlods in the same format as for # other assemblies with one file per chrom (2006-06-12, hartera) ssh kkstore01 cd /cluster/data/rheMac2 # use rheMac2.softmask2.fa as they contain masking from RepeatMasker # and from trf. cat << '_EOF_' > jkStuff/zipAll.csh mkdir -p bigZips # RepeatMasker files tar cvzf bigZips/chromOut.tar.gz ?{,?}/chr*.fa.out tar cvzf bigZips/chromTrf.tar.gz bed/simpleRepeat/chr*.bed # get soft masked and hard masked sequences, one per chrom mkdir softMask hardMask foreach c (`awk '{print$1}' /cluster/data/rheMac2/chrom.sizes`) faOneRecord /cluster/data/rheMac2/rheMac2.softmask2.fa \ $c > softMask/${c}.fa faOneRecord /cluster/data/rheMac2/rheMac2.hardmask.fa \ $c > hardMask/${c}.fa.masked end tar cvzf bigZips/chromFa.tar.gz softMask/chr*.fa tar cvzf bigZips/chromFaMasked.tar.gz hardMask/chr*.fa.masked '_EOF_' # << this line makes emacs coloring happy chmod +x jkStuff/zipAll.csh csh -ef ./jkStuff/zipAll.csh >& zipAll.log & # Took about 50 minutes. rm repeats.all.gz rheMac2.hardmask.fa.gz rheMac2.softmask.fa.gz \ rheMac2.chrom.fa.gz md5sum *.gz > md5sum.txt # link the *.gz and *.txt files to hgwdev:/usr/local/apache/... ssh hgwdev set gp = /usr/local/apache/htdocs/goldenPath/rheMac2 mkdir -p $gp/bigZips rm repeats.all.gz rheMac2.hardmask.fa.gz rheMac2.softmask.fa.gz \ rheMac2.chrom.fa.gz md5sum.txt ln -s /cluster/data/rheMac2/bigZips/{chrom*.gz,*.txt} $gp/bigZips # Update README.txt to give correct file names. # REGENERATE 2BIT NIB (DONE Jan 30, 2006 Robert) ssh kkstore02 cd /cluster/data/rheMac2 /cluster/bin/i386/faToTwoBit rheMac2.softmask2.fa rheMac2.softmask.2bit /cluster/bin/i386/twoBitToFa rheMac2.softmask.2bit check.softmask.fa diff -q rheMac2.softmask2.fa check.softmask.fa mv rheMac2.2bit rheMac2.2bit.unmasked mv rheMac2.softmask.2bit rheMac2.2bit # note: size match # GENERATE 2bit for blastz mkdir -p /san/sanvol1/scratch/rheMac2/ cd /cluster/data/rheMac2 cp rheMac2.2bit /san/sanvol1/scratch/rheMac2/ -p ssh kkstore02 cd /cluster/data/hg17/bed/blastz.rheMac2 # edit doBlastzChainNet.pl to add lines for hg17 glitch # in function getDbFromPath # if ($db eq "hg") { # $db = "hg17"; # } #doBlastzChainNet.pl DEF -blastzOutRoot /san/sanvol1/scratch/rheMac2/rheMacOut/ >& do.log & ############################################################################ ## BLASTZ swap from mm8 alignments (DONE - 2006-02-18 - Hiram) ssh pk cd /cluster/data/mm8/bed/blastz.rheMac2.2006-02-17 time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -swap -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \ `pwd`/DEF > swap.out 2>&1 & time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \ -swap -continue=net `pwd`/DEF > swap.net.out 2>&1 & # failed during a san hiccup, finish that off, then: time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \ -swap -continue=load `pwd`/DEF > swap.load.out 2>&1 & time nice -n +19 featureBits rheMac2 chainMm8Link # 877906099 bases of 2646704109 (33.170%) in intersection # This was done before the loadUp script was improved to load # empty tables on chroms that have no chains, so sometime later: ssh hgwdev hgLoadChain rheMac2 chrUr_chainMm8 /dev/null # Loading 0 chains into rheMac2.chrUr_chainMm8 # 2006-04-24 ############################################################################ # QUALITY SCORES (Done March 9, 2006 Robert) ssh kkstore01 wget -q ftp://macftp:analysis@ftp.macaquegenome.hgsc.bcm.tmc.edu:21/Assemblies/VI/FinalMergeSplit/v1.edit4.chrm.qv.bz2 cd /cluster/data/rheMac2/downloads/ bunzip2 v1.edit4.chrm.qv.bz2 & bunzip2 v1.edit4.chrUr.bz2 & nice fold --spaces v1.edit4.chrm.qv | sed -e 's/Chr/chr/' > rheMac2.chrm.qual.qv nice fold --spaces v1.edit4.chrUr.qv | sed -e 's/Chr/chr/' > rheMac2.chrUr.qv cat rheMac2.chrm.qual.qv rheMac2.chrUr.qv > rheMac2.qual.qv qaToQac rheMac2.qual.qv temp.qac gzip rheMac2.qual.qv qacToWig -fixed temp.qac rheMac2.wig rm temp.qac wigEncode rheMac2.wig rheMac2.qual.wig rheMac2.qual.wib mv all.qv rheMac2.qual.qv qaToQac rheMac2.qual.qv temp.qac qacToWig -fixed temp.qac rheMac2.wig wigEncode rheMac2.wig rheMac2.qual.wig rheMac2.qual.wib ssh hgwdev hgLoadWiggle rheMac2 quality rheMac2.qual.wig cp -p rheMac2.qual.qv.gz /usr/local/apache/htdocs/rheMac2/bigZips # RUN DATEREPEATS TO GET LINEAGE-SPECIFIC (DONE 3/22/06 angie) ssh kkstore01 cd /cluster/data/rheMac2 # First make per-chrom .fa.out files as should have been done: tail +4 repeats/repeat.out \ | perl -we '$prevChr = ""; \ while (<>) { \ @w = split; $chr = $w[4]; $c = $chr; $c =~ s/^chr//; \ if ($chr ne $prevChr) { \ print "creating $c/$chr.fa.out\n"; \ system("cat repeats/repeats.header > $c/$chr.fa.out"); \ close(OUT); open(OUT, ">>$c/$chr.fa.out") || die; \ } \ print OUT; \ $prevChr = $chr; \ }' # Run DateRepeats to get outputs on rodent, non-rodent mammal: mkdir linSpecRep cd linSpecRep ln -s ../*/chr*.fa.out . foreach f (*.fa.out) echo $f:r:r /cluster/bluearc/RepeatMasker/DateRepeats $f -query human \ -comp mouse -comp dog end mkdir /cluster/bluearc/rheMac2/linSpecRep mkdir /cluster/bluearc/rheMac2/linSpecRep/notIn{Rodent,NonRodentMammal} foreach f (*.out_mus*) set chr = $f:r:r echo $chr /cluster/bin/scripts/extractRepeats 1 $f \ > /cluster/bluearc/rheMac2/linSpecRep/notInRodent/$chr.out.spec /cluster/bin/scripts/extractRepeats 2 $f \ > /cluster/bluearc/rheMac2/linSpecRep/notInNonRodentMammal/$chr.out.spec end # BLASTZ SWAP FOR HUMAN (hg17) (DONE, 2006-03-22, hartera) # CREATE CHAIN AND NET TRACKS, AXTNET, MAFNET, LIFTOVER AND ALIGNMENT DOWNLOADS # Documented in makeHg17.doc in the section: # SWAP CHAIN AND NET ALIGNMENTS OVER TO RHESUS (rheMac2) # alignments are in: /cluster/data/rheMac2/bed/blastz.hg17.swap # SWAP CHAINS/NET RN4 (DONE 3/23/06 angie) ssh kkstore01 mkdir /cluster/data/rheMac2/bed/blastz.rn4.swap cd /cluster/data/rheMac2/bed/blastz.rn4.swap doBlastzChainNet.pl -swap /cluster/data/rn4/bed/blastz.rheMac2/DEF \ >& do.log & tail -f do.log ln -s blastz.rn4.swap /cluster/data/rheMac2/bed/blastz.rn4 ########################################################################### # 5 way multiz WITH RheMac2 (robert, 5/3/06) -- MM8, RN4, CANFAM2, HG18 # rebuild frames to get bug fix, using 1-pass maf methodology (2006-06-09 markd) # first build 5-way multiz alignment from syntenic nets (helps reduce # false positive predictions due to paralogous alignments) # (prepare mafNet files from syntenic nets and copy to # /san/sanvol1/scratch/rheMac2/mafNetSyn; do this for mm8, rn4, canFam2, rheMac2 ssh kkstore02 cd /cluster/data/rheMac2/bed mkdir exoniphy cd exoniphy cat > makeSynMaf.sh << 'EOF' #!/bin/bash fromdb=$1 todb=$2 DIR=$3 outdir=$4/$2 base=`pwd` cut -f1 /cluster/data/$fromdb/chrom.sizes > chrom.lst echo making syntenic mafs for $todb in $DIR from $base cd $DIR cd axtChain nice netFilter -syn $fromdb.$todb.net.gz > $fromdb.$todb.syn.net mkdir -p netSynSplit echo splitting nets nice netSplit $fromdb.$todb.syn.net netSynSplit mkdir -p chain echo splitting chains $fromdb.$todb.all.chain.gz nice chainSplit chain $fromdb.$todb.all.chain.gz echo making mafs mkdir -p $outdir for i in `cat $base/chrom.lst` ; do echo "netToAxt netSynSplit/$i.net chain/$i.chain /cluster/data/$fromdb/nib /cluster/data/$todb/nib stdout | axtToMaf stdin -tPrefix=$fromdb. -qPrefix=$todb. /cluster/data/$fromdb/chrom.sizes /cluster/data/$todb/chrom.sizes $outdir/$i.maf " ; nice netToAxt netSynSplit/$i.net chain/$i.chain /cluster/data/$fromdb/nib /cluster/data/$todb/nib stdout | axtToMaf stdin -tPrefix=$fromdb. -qPrefix=$todb. /cluster/data/$fromdb/chrom.sizes /cluster/data/$todb/chrom.sizes $outdir/$i.maf ; done echo removing temporary files rm -rf axtChain/chain rm -rf axtChain/netSynSplit/ # makes syntenic net from net # with $1 as reference and $2 as query # uses blastz alignment in $3 then converts to maf written to $4 # sample usage # makeSynMaf.sh rheMac2 canFam2 /cluster/data/rheMac2/bed/blastz.canFam2 /san/sanvol1/scratch/rheMac2/mafNetSyn 'EOF' # << for emacs makeSynMaf.sh rheMac2 canFam2 /cluster/data/rheMac2/bed/blastz.canFam2 /san/sanvol1/scratch/rheMac2/mafNetSyn makeSynMaf.sh rheMac2 hg18 /cluster/data/rheMac2/bed/blastz.hg18 /san/sanvol1/scratch/rheMac2/mafNetSyn makeSynMaf.sh rheMac2 mm8 /cluster/data/rheMac2/bed/blastz.mm8 /san/sanvol1/scratch/rheMac2/mafNetSyn makeSynMaf.sh rheMac2 rn4 /cluster/data/rheMac2/bed/blastz.rn4 /san/sanvol1/scratch/rheMac2/mafNetSyn # make output dir and run dir ssh pk mkdir -p /cluster/data/rheMac2/bed/multiz.syn.rheMac2Hg18Mm8Rn4CanFam2 cd /cluster/data/rheMac2/bed/multiz.syn.rheMac2Hg18Mm8Rn4CanFam2 mkdir -p mafSyn runSyn cd runSyn # create scripts to run multiz on cluster cat > autoMultiz.csh << 'EOF' #!/bin/csh -ef set path=(/cluster/bin/penn/multiz.v11.x86_64/ $path ) set db = rheMac2 set c = $1 set maf = $2 set run = /cluster/bin/penn/multiz.v11.x86_64/multiz-tba/ set tmp = /scratch/tmp/$db/multiz.syn.$c set pairs = /san/sanvol1/scratch/$db/mafNetSyn rm -fr $tmp mkdir -p $tmp cp ../{tree.nh,species.lst} $tmp pushd $tmp foreach s (`cat species.lst`) set in = $pairs/$s/$c.maf set out = $db.$s.sing.maf if ($s == rheMac2) then continue endif if (-e $in.gz) then zcat $in.gz > $out else if (-e $in) then cp $in $out else echo "##maf version=1 scoring=autoMZ" > $out endif end set path = ($run $path); rehash $run/autoMZ + T=$tmp E=$db "`cat tree.nh`" $db.*.sing.maf $c.maf popd cp $tmp/$c.maf $maf rm -fr $tmp 'EOF' # << for emacs chmod +x autoMultiz.csh cat > gsub << 'EOF' #LOOP ./autoMultiz.csh $(root1) {check out line+ /cluster/data/rheMac2/bed/multiz.rheMac2Hg18Mm8Rn4CanFam2/mafSyn/$(root1).maf} #ENDLOOP 'EOF' # << for emacs cut -f 1 /cluster/data/rheMac2/chrom.sizes > chrom.lst set path = (/parasol/bin $path);rehash gensub2 chrom.lst single gsub jobList para create jobList # 46 jobs para try; para check para push # Load into database ssh hgwdev cd /cluster/data/rheMac2/bed/multiz5waySyn/mafSyn mkdir -p /gbdb/rheMac2/multiz5waySyn/mafSyn ln -s /cluster/data/rheMac2/bed/multiz5waySyn/mafSyn/*.maf \ /gbdb/rheMac2/multiz5waySyn/maf cat > loadMaf.csh << 'EOF' time hgLoadMaf -pathPrefix=/gbdb/rheMac2/multiz5waySyn/maf rheMac2 multiz5waySyn cat *.maf | \ nice hgLoadMafSummary rheMac2 -minSize=30000 -mergeGap=1500 -maxSize=200000 multiz5waySummary stdin 'EOF' #<< happy emacs # Dropped unused indexes (2006-05-09 kate) # NOTE: this is not required in the future, as the loader # has been fixed to not generate these indexes hgsql rheMac2 -e "alter table multiz5waySummary drop index chrom_2" hgsql rheMac2 -e "alter table multiz5waySummary drop index chrom_3" # expect lengthy load time for this -- a few hours ? csh loadMaf.csh >&! loadMaf.log & #Loading multiz5waySyn into database #Loaded 6207706 mafs in 21 files from /gbdb/rheMac2/multiz5waySyn/maf #Advisory lock created #Advisory lock has been released #455.360u 156.210s 13:51.54 73.5% 0+0k 0+0io 237pf+0w #Indexing and tabulating stdin #Created 795772 summary blocks from 15566976 components and 6207706 mafs from stdin #Loading into rheMac2 table multiz5waySummary... #Loading completeAdvisory lock has been released #drop index to improve performance hgsql rheMac2 -e " alter table multiz5waySummary drop index chrom_3" # build chromosome-by-chromosome SS files cd /cluster/data/rheMac2/bed/multiz.rheMac2Hg18Mm8Rn4CanFam2 mkdir run-ss-syn cd run-ss-syn mkdir -p /san/sanvol1/scratch/rheMac2/multiz.rheMac2Hg18Mm8Rn4CanFam2/ssSyn cat > makeSS.csh << 'EOF' #!/bin/csh -fe set c = $1 /cluster/bin/phast/msa_view -i MAF -o SS /cluster/data/rheMac2/bed/multiz.rheMac2Hg18Mm8Rn4CanFam2/mafSyn/$c.maf --refseq /cluster/bluearc/rheMac2/chrom/$c.fa | gzip -c > /san/sanvol1/scratch/rheMac2/multiz.rheMac2Hg18Mm8Rn4CanFam2/ssSyn/$c.ss.gz 'EOF' # << for emacs chmod +x makeSS.csh foreach chr (`cut -f 1 /cluster/data/rheMac2/chrom.sizes`) echo "makeSS.csh $chr" >> jobList.ss end para create jobList.ss # 20 jobs para try; para check para push # gzip mafs for downloads area ssh kkstore01 cd cluster/data/hg17/bed/multiz.syn.rheMac2Hg18Mm8Rn4CanFam2/mafDownloads cat > downloads.csh << 'EOF' date foreach f (../mafSyn/chr*.maf) set c = $f:t:r echo $c nice gzip -c $f > $c.maf.gz end md5sum *.gz > md5sum.txt date 'EOF' time csh downloads.csh >&! downloads.log # ~2 hours ssh hgwdev set dir = /usr/local/apache/htdocs/goldenPath/rheMac2/multiz5waySyn mkdir $dir ln -s /cluster/data/rheMac2/bed/multiz5waySyn/mafDownloads/{*.gz,md5sum.txt} $dir cp README.txt $dir #make multi5wayFrames table for browser track ssh kkstore01 cd /cluster/data/rheMac2/bed/multiz.syn.rheMac2Hg18Mm8Rn4CanFam2/ set sanDir = /san/sanvol1/scratch/rheMac2/multiz5waySyn/frames mkdir -p $sanDir/maf cp -rp maf/* $sanDir/maf mkdir frames cd frames cp /cluster/data/mm7/bed/multiz17wayFrames/mkMafFrames . cp /cluster/data/mm7/bed/multiz17wayFrames/Makefile . #edit Makefile to correct species names and set and sanDir ssh hgwdev cd /cluster/data/rheMac2/bed/multiz.syn.rheMac2Hg18Mm8Rn4CanFam2/frames make getGenes >&! getGenes.log & # ~1 minute make getFrames >&! getFrames.log & # NOTE: if jobs get hung up (e.g. running for hours, when # they should run for minutes, do 'para stop' so that # the 'para make' can restart the job make loadDb >&! loadDb.log & #trackDb entry #track multiz5waySyn #shortLabel Conservation #longLabel Human/Mouse/Rat/Dog Multiz Alignment & Conservation using syntenic alignments #group compGeno #priority 104.1 #visibility pack #color 0, 10, 100 #altColor 0,90,10 #type wigMaf 0.0 1.0 #maxHeightPixels 100:40:11 #wiggle phastCons5way #pairwiseHeight 12 #spanList 1 #yLineOnOff Off #autoScale Off #summary multiz5waySummary #frames multiz5wayFrames #irows on #speciesGroups mammal #sGroup_mammal hg18 mm8 rn4 canFam2 #speciesDefaultOff # request push to hgwdev ### # rebuild frames to get bug fix, using 1-pass maf methodology # (2006-06-09 markd) ssh kkstore01 cd /cluster/data/rheMac2/bed/multiz.syn.rheMac2Hg18Mm8Rn4CanFam2/frames mv mafFrames/ mafFrames.old nice tcsh # easy way to get process niced (cat ../mafSyn/*.maf | time genePredToMafFrames rheMac2 stdin stdout canFam2 genes/canFam2.gp.gz hg18 genes/hg18.gp.gz mm8 genes/mm8.gp.gz rheMac2 genes/rheMac2.gp.gz rn4 genes/rn4.gp.gz | gzip >multiz5wayFrames.mafFrames.gz)>&log ssh hgwdev cd /cluster/data/rheMac2/bed/multiz.syn.rheMac2Hg18Mm8Rn4CanFam2/frames hgLoadMafFrames rheMac2 multiz5wayFrames multiz5wayFrames.mafFrames.gz >&log ############################################################### # PHASTCONS CONSERVATION (NOT DONE yet, 2006-05-06 Robert) # coverage is only 3% and needs more tweaking # These parameters are: # --rho # --expected-length # --target-coverage # Also, instead of generating cons and noncons tree models, # we use a single, pre-existing tree model -- Elliot Margulies' model # from the (37-way) ENCODE alignments. # NOTE: reusing cluster-friendly chrom fasta files created earlier ssh kkstore01 mkdir /cluster/bluearc/rheMac2/chrom cd /cluster/data/rheMac2 foreach f (`cat chrom.lst`) echo $f cp $f/*.fa /cluster/bluearc/rheMac2/chrom end # Split chromosome MAF's into windows and use to generate # "sufficient statistics" (ss) files for phastCons input # NOTE: as the SAN fs has lotsa space, we're leaving these # big (temp) files unzipped, to save time during phastCons run. # Note also the larger chunk sizes from previous runs -- this # reduces run-time on the split, slows down the actual phastCons # enough so jobs don't crash (jobs are very quick, just a minute # or so), and according to Adam, will produce better results. # The previous small chunks were probably required by # the phyloFit step, which we are no longer using for the # human alignments. ssh pk mkdir /cluster/data/rheMac2/bed/multiz5waySyn/cons cd /cluster/data/rheMac2/bed/multiz5waySyn/cons cp /cluster/store5/gs.18/build35/bed/multiz17way.2005-12-20/cons/elliotsEncode.mod . # edit, change to hg18, mm8, and rn4. mkdir run.split cd run.split set WINDOWS = /san/sanvol1/scratch/rheMac2/multiz5waySyn/cons/ss rm -fr $WINDOWS mkdir -p $WINDOWS cat << 'EOF' > doSplit.csh #!/bin/csh -ef set MAFS = /cluster/data/rheMac2/bed/multiz5waySyn/mafSyn set WINDOWS = /san/sanvol1/scratch/rheMac2/multiz5waySyn/cons/ss cd $WINDOWS set c = $1 echo $c rm -fr $c mkdir $c /cluster/bin/phast/$MACHTYPE/msa_split $MAFS/$c.maf -i MAF \ -M /cluster/bluearc/rheMac2/chrom/$c.fa \ -o SS -r $c/$c -w 10000000,0 -I 1000 -B 5000 echo "Done" >> $c.done 'EOF' # << happy emacs chmod +x doSplit.csh rm -f jobList foreach f (../../mafSyn/*.maf) set c = $f:t:r echo "doSplit.csh $c {check out line+ $WINDOWS/$c.done}" >> jobList end para create jobList # 49 jobs para try para check para push # estimate model parameters # estimate avg. cross-species avg. GC content from chr1 maf's ssh kolossus set path = ($path /cluster/bin/phast); rehash cd /cluster/data/rheMac2/bed/multiz5waySyn/cons msa_view --aggregate rheMac2,hg18,rn4,mm8,canFam2 \ -i MAF \ --summary-only /cluster/data/rheMac2/bed/multiz.syn.rheMac2Hg18Mm8Rn4CanFam2/mafSyn/chr1.maf \ > maf_summary.txt awk '$1 == "[aggregate]" {printf "%0.3f\n", $3 + $4}' maf_summary.txt # 0.424 # Be sure to substitute in the right G+C content. Also, notice the # target coverage of 0.17. We actually want 5% coverage here but # the final (posterior) coverage is only indirectly related to the # expected (prior) coverage. One thing to consider is that we # only have about 40% alignment coverage (excluding chimp, which # doesn't help us much in identifying conserved regions). As far # as phastCons is concerned, we want to aim for about 0.05 / 0.4 = # 0.125 coverage. In this case, though, --target-coverage # 0.125 resulted in only about 4.1% coverage. I had to iterate # a couple of times (using only chromosome 1) to find a value that # got me close to the target of 5% ssh pk mkdir -p /cluster/data/rheMac2/bed/multiz5waySyn/cons/run.elements cd /cluster/data/rheMac2/bed/multiz5waySyn/cons/run.elements mkdir -p TREES mkdir -p LOG ls /san/sanvol1/scratch/rheMac2/multiz5waySyn/cons/ss/chr*/*ss > all.windows for f in `cat all.windows` ; do root=`basename $f .ss.gz` ; echo doEstimate.sh $f LOG/$root.log TREES/$root >> jobs.lst ; done para create jobs.lst para push # check tree model on 5MB chunk, using params recommended by Adam, # he ok'ed the results -- not necessary for next human run ssh kolossus cd /cluster/data/rheMac2/bed/multiz5waySyn/cons /cluster/bin/phast/$MACHTYPE/phyloFit -i SS -E -p MED -s HKY85 \ --tree "`cat ../tree-commas.nh`" \ /san/sanvol1/scratch/rheMac2/multiz5waySyn/cons/ss/chr7/chr7.110000001-120000000.ss \ -o phyloFit.tree # Run phastCons # This job is I/O intensive in its output files, thus it is all # working over in /scratch/tmp/ # cd .. mkdir run.cons cd run.cons cat > doPhast.csh << 'EOF' #!/bin/csh -fe set c = $1 set f = $2 set len = $3 set cov = $4 set rho = $5 set tmp = /scratch/tmp/$f mkdir -p $tmp set san = /san/sanvol1/scratch/rheMac2/multiz5waySyn/cons cp -p $san/ss/$c/$f.ss ../elliotsEncode.mod $tmp pushd $tmp > /dev/null /cluster/bin/phast/$MACHTYPE/phastCons $f.ss elliotsEncode.mod \ --rho $rho --expected-length $len --target-coverage $cov --quiet \ --not-informative panTro1,rheMac2 \ --seqname $c --idpref $c --viterbi $f.bed --score > $f.pp popd > /dev/null mkdir -p $san/pp/$c $san/bed/$c sleep 1 mv $tmp/$f.pp $san/pp/$c mv $tmp/$f.bed $san/bed/$c rm -fr $tmp 'EOF' # emacs happy chmod a+x doPhast.csh # root1 == chrom name, file1 == ss file name without .ss suffix # Create gsub file cat > template << 'EOF' #LOOP doPhast.csh $(root1) $(file1) 14 .008 .28 #ENDLOOP 'EOF' # happy emacs # Create parasol batch and run it pushd /san/sanvol1/scratch/rheMac2/multiz5waySyn/cons # mkdir /cluster/data/rheMac2/bed/multiz5waySyn/cons/run.cons ls -1 ss/chr*/chr*.ss | sed 's/.ss$//' > \ /cluster/data/rheMac2/bed/multiz5waySyn/cons/run.cons/in.list ssh pk cd /cluster/store11/gs.19/build36/bed/multiz5waySyn/cons/run.cons gensub2 in.list single template jobList para create jobList # 295 jobs para try para check para push #Completed: 295 of 295 jobs #CPU time in finished jobs: 9160s 152.67m 2.54h 0.11d 0.000 y #IO & Wait Time: 2969s 49.48m 0.82h 0.03d 0.000 y #Average job time: 41s 0.69m 0.01h 0.00d #Longest running job: 0s 0.00m 0.00h 0.00d #Longest finished job: 78s 1.30m 0.02h 0.00d #Submission to last job: 276s 4.60m 0.08h 0.00d # create Most Conserved track ssh kolossus cd /san/sanvol1/scratch/rheMac2/multiz5waySyn/cons # The sed's and the sort get the file names in chrom,start order # (Hiram tricks -- split into columns on [.-/] with # identifying x,y,z, to allow column sorting and # restoring the filename. Warning: the sort column # will depend on how deep you are in the dir find ./bed -name "chr*.bed" | \ sed -e "s/\// x /g" -e "s/\./ y /g" -e "s/-/ z /g" | \ sort -k7,7 -k9,9n | \ sed -e "s/ x /\//g" -e "s/ y /\./g" -e "s/ z /-/g" | \ xargs cat | \ awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", $1, $2, $3, $5, $5;}' | \ /cluster/bin/scripts/lodToBedScore /dev/stdin > mostConserved.bed # ~ 1 minute cp -p mostConserved.bed /cluster/data/rheMac2/bed/multiz5waySyn/cons # load into database ssh hgwdev cd /cluster/data/rheMac2/bed/multiz5waySyn/cons hgLoadBed -strict rheMac2 phastConsElements5way mostConserved.bed #Loaded 602351 elements of size 5 # compare with previous tracks hgsql rheMac2 -e "select count(*) from phastConsElements5way" # 2260575 # hgsql hg18 -e "select count(*) from phastConsElements" # hg18 does not have phastConsElements table # 1601903 # Try for 5% overall cov, and 70% CDS cov (used elen=13, tcov=.007, rho=.27) featureBits hg18 -enrichment refGene:cds phastConsElements17way # refGene:cds 1.072%, phastConsElements17way 5.510%, both 0.759%, cover 70.83%, enrich 12.86x featureBits hg17 -enrichment refGene:cds phastConsElements17way # refGene:cds 1.064%, phastConsElements17way 5.104%, both 0.748%, cover 70.29%, enrich 13.77x # compare with previous tracks featureBits hg18 -enrichment refGene:cds phastConsElements10way # refGene:cds 1.062%, phastConsElements10way 5.003%, both 0.734%, cover 69.18%, enrich 13.83x featureBits hg18 -enrichment refGene:cds phastConsElements # refGene:cds 1.062%, phastConsElements 4.810%, both 0.771%, cover 72.65%, enrich 15.11x # Create merged posterier probability file and wiggle track data files # pk is currently closer to the san than any other machine ssh pk cd /san/sanvol1/scratch/rheMac2/multiz5waySyn/cons/ # sort by chromName, chromStart so that items are in numerical order # for wigEncode find ./pp -name "chr*.pp" | \ sed -e "s/\// x /g" -e "s/\./ y /g" -e "s/-/ z /g" | \ sort -k7,7 -k9,9n | \ sed -e "s/ x /\//g" -e "s/ y /\./g" -e "s/ z /-/g" | \ xargs cat | \ nice wigEncode stdin phastCons5way.wig phastCons5way.wib # about 23 minutes for above cp -p phastCons17way.wi? /cluster/data/hg18/bed/multiz17way/cons # Load gbdb and database with wiggle. ssh hgwdev cd /cluster/data/hg18/bed/multiz17way/cons ln -s `pwd`/phastCons17way.wib /gbdb/hg18/multiz17way/phastCons17way.wib hgLoadWiggle -pathPrefix=/gbdb/hg18/multiz17way hg18 \ phastCons17way phastCons17way.wig # ~ 3 minute load # Downloads (2006-02-22 Fan) ssh hgwdev cd /cluster/data/hg18/bed/multiz17way mkdir mafDownloads cd mafDownloads # upstream mafs (mafFrags takes a while) cat > mafFrags.csh << 'EOF' date foreach i (1000 2000 5000) echo "making upstream$i.maf" nice featureBits hg18 refGene:upstream:$i -fa=/dev/null -bed=up.bad awk -F '\t' '{printf("%s\t%s\t%s\t%s\t%s\t%s\n", $1, $2, $3, substr($4, 0, 9), 0, $5)}' up.bad > up.bed rm up.bad nice mafFrags hg18 multiz17way up.bed upstream$i.maf \ -orgs=../species.lst rm up.bed end date 'EOF' time csh mafFrags.csh > mafFrags.log nice gzip up*.maf ssh kkstore02 cd /cluster/data/hg18/bed/multiz17way/mafDownloads cat > downloads.csh << 'EOF' date foreach f (../maf/chr*.maf) set c = $f:t:r echo $c nice gzip -c $f > $c.maf.gz end md5sum *.gz > md5sum.txt date 'EOF' time csh downloads.csh > downloads.log ssh hgwdev set dir = /usr/local/apache/htdocs/goldenPath/hg18/multiz17way mkdir $dir ln -s /cluster/data/hg18/bed/multiz17way/mafDownloads/{*.gz,md5sum.txt} $dir ##################################################################### # SEGMENTAL DUPLICATIONS (DONE 6/30/06 angie) # File emailed from Xinwei She mkdir /cluster/data/rheMac2/bed/genomicSuperDups cd /cluster/data/rheMac2/bed/genomicSuperDups sed -e 's/\t_\t/\t-\t/' rheMac2_genomicSuperDup.tab \ | awk '($3 - $2) >= 1000 && ($9 - $8) >= 1000 {print;}' \ | hgLoadBed rheMac2 genomicSuperDups stdin\ -sqlTable=$HOME/kent/src/hg/lib/genomicSuperDups.sql ###### # ENSEMBL (done 6/20/06 robert) wget http://www.sanger.ac.uk/Users/lec/macaque_data/gtf.tar.gz wget http://www.sanger.ac.uk/Users/lec/macaque_data/macaque_ensembl_cdnas.fa.gz wget -q http://www.sanger.ac.uk/Users/lec/macaque_data/macaque_ensembl_peptides.fa.gz gunzip *.gz for i in `ls ensembl_macaque_gtf_data/` ; do echo ${i%%.gtf} ; awk '{print "chr"$0}' ensembl_macaque_gtf_data/$i > chr$i ; done rm -f chrscaffold.gtf cp ensembl_macaque_gtf_data/scaffold.gtf scaffold.gtf cat chr*.gtf > ensGene.gtf grep -v pseudo ensAll.gtf > ensGene.gtf ldHgGene rheMac2 ensGene ensGene.gtf -gtf -genePredExt Reading ensGene.gtf Read 38561 transcripts in 741623 lines in 1 files 38561 groups 22 seqs 8 sources 4 feature types 38561 gene predictions hgsql rheMac2 < ~/kent/src/hg/lib/ensGtp.sql awk '{print $10,$12,$16}' ensGene.gtf |grep ENSMMUP|awk 'NF==3{print $0}'|sed -e 's/\"//g' | sed -e 's/\;//g'|sed -e 's/ /\t/g' |sort -u > ensGtp.tab echo "load data local infile 'ensGtp.tab' into table ensGtp ignore 1 lines" | hgsql -N rheMac2 sed -e 's/_/ /' macaque_ensembl_peptides.fa > ensPep.fa hgPepPred rheMac2 ensembl ensPep.fa grep pseudogene ensAll.gtf > ensPseudo.gtf grep -v protein_coding ensAll.gtf |grep -v pseudo > ensRna.gtf ########################################################################## # N-SCAN gene predictions (nscanGene) - (2006-08-04 markd) cd /cluster/data/rheMac2/bed/nscan/ # obtained NSCAN predictions from michael brent's group # at WUSTL wget -nv -r -np http://ardor.wustl.edu/jeltje/rheMac2/chr_gtf wget -nv -r -np http://ardor.wustl.edu/jeltje/rheMac2/proteins # clean up and rename downloaded directorys: mv ardor.wustl.edu/jeltje/rheMac2/chr_gtf . mv ardor.wustl.edu/jeltje/rheMac2/proteins chr_ptx rm -rf ardor.wustl.edu rm chr_*/index.html* gzip chr_*/* chmod a-w chr_*/*.gz # load tracks. Note that these have *utr features, rather than # exon features. currently ldHgGene creates separate genePred exons # for these. ldHgGene -bin -gtf -genePredExt rheMac2 nscanGene chr_gtf/chr*.gtf.gz # add .a suffix to match transcript id # add .a suffix to match transcript id hgPepPred -suffix=.a rheMac2 generic nscanPep chr_ptx/chr*.fa.gz rm *.tab # update trackDb; need a rheMac2-specific page to describe informants rhesus/rheMac2/nscanGene.html (copy from panTro2 and edit) rhesus/rheMac2/trackDb.ra # QA found: Checking keys on database rheMac2 rheMac2.nscanPep.name - hits 22192 of 24608 Error: 2416 of 24608 elements of rheMac2.nscanPep.name are not in key nscanGene.name line 524 of all.joiner Example miss: CHR1.5.052.A # due to WUStL giving us a mismatched protein file. # 2006-08-11 markd - obtained new protein sequences wget -nv -r -np http://ardor.wustl.edu/jeltje/rheMac2/proteins hgPepPred -suffix=.a rheMac2 generic nscanPep proteins/chr*.fa.gz joinerCheck -keys -database=rheMac2 -identifier=nscanName ~/compbio/kent/src/hg/makeDb/schema/all.joiner Checking keys on database rheMac2 rheMac2.nscanPep.name - hits 22003 of 22003 ok # 2006-08-11 markd - N-SCAN track has UTR only predictions, reload discarding # these ldHgGene -bin -gtf -requireCDS -genePredExt rheMac2 nscanGene chr_gtf/chr*.gtf.gz # CPGISLANDS (DONE - 2006-09-12 - Robert) ssh hgwdev mkdir -p /cluster/data/rheMac2/bed/cpgIsland cd /cluster/data/rheMac2/bed/cpgIsland # Build software from Asif Chinwalla (achinwal@watson.wustl.edu) cvs co hg3rdParty/cpgIslands cd hg3rdParty/cpgIslands make # gcc readseq.c cpg_lh.c -o cpglh.exe mv cpglh.exe /cluster/data/rheMac2/bed/cpgIsland/ # cpglh.exe requires hard-masked (N) .fa's. # There may be warnings about "bad character" for IUPAC ambiguous # characters like R, S, etc. Ignore the warnings. ssh kkstore02 cd /cluster/data/rheMac2/bed/cpgIsland foreach f (../../*/chr*.fa.masked) set fout=$f:t:r:r.cpg echo running cpglh on $f to $fout ./cpglh.exe $f > $fout end # Transform cpglh output to bed + cat << '_EOF_' > filter.awk /* Input columns: */ /* chrom, start, end, len, CpG: cpgNum, perGc, cpg:gpc, observed:expected */ /* chr1\t 41776\t 42129\t 259\t CpG: 34\t 65.8\t 0.92\t 0.94 */ /* Output columns: */ /* chrom, start, end, name, length, cpgNum, gcNum, perCpg, perGc, obsExp */ /* chr1\t41775\t42129\tCpG: 34\t354\t34\t233\t19.2\t65.8\to0.94 */ { $2 = $2 - 1; width = $3 - $2; printf("%s\t%d\t%s\t%s %s\t%s\t%s\t%0.0f\t%0.1f\t%s\t%s\n", $1, $2, $3, $5,$6, width, $6, width*$7*0.01, 100.0*2*$6/width, $7, $9); } '_EOF_' # << this line makes emacs coloring happy awk -f filter.awk chr*.cpg > cpgIsland.bed ssh hgwdev cd /cluster/data/rheMac2/bed/cpgIsland hgLoadBed rheMac2 cpgIslandExt -tab -noBin \ -sqlTable=$HOME/kent/src/hg/lib/cpgIslandExt.sql cpgIsland.bed #Loaded 24226 elements of size 10 #Sorted #Saving bed.tab #Loading rheMac2 ########################################################################## # Hg 18 Reciprocal Best Nets (8-13-07, kpollard) ssh kolossus cd /cluster/data/hg18/bed/blastz.rheMac2/axtChain mkdir rBestNet cp /cluster/data/rheMac2/chrom.sizes /cluster/bluearc/rheMac2/. cat > rbest.csh << 'EOF' set swapChain = rheMac2ToHg18.swap.chain echo "merge and swap $swapChain" zcat /cluster/data/rheMac2/bed/liftOver/rheMac2ToHg18.over.chain.gz \ | chainSwap stdin stdout | chainMergeSort stdin > $swapChain echo "split chain" chainSplit swapChain/ $swapChain echo "run netter" chainPreNet $swapChain /cluster/bluearc/hg18/chrom.sizes \ /cluster/bluearc/rheMac2/chrom.sizes stdout | \ chainNet stdin -minSpace=1 /cluster/bluearc/hg18/chrom.sizes \ /cluster/bluearc/rheMac2/chrom.sizes stdout /dev/null | \ netSyntenic stdin rbest.noClass.net echo "split net" netSplit rbest.noClass.net rBestNet echo "extract chains from net" netChainSubset -verbose=0 rbest.noClass.net $swapChain stdout | \ chainMergeSort stdin | gzip -c > hg18.rheMac2.rbest.chain.gz echo "split extracted chain" chainSplit rBestChain/ hg18.rheMac2.rbest.chain.gz cd .. mkdir axtRBestNet echo "extract axts" foreach f (axtChain/rBestNet/*.net) echo $f:t:r netToAxt $f axtChain/swapChain/$f:t:r.chain \ /san/sanvol1/scratch/hg18/hg18.2bit \ /cluster/bluearc/rheMac2/rheMac2.2bit stdout | \ axtSort stdin stdout | gzip -c \ > axtRBestNet/$f:t:r.hg18.rheMac2.net.axt.gz end 'EOF' csh rbest.csh >&! rbest.log & #Not yet loaded ############################################################################ ## BLASTZ swap from monDom4 alignments (2007-11-11 - markd) ssh hgwdev mkdir /cluster/data/rheMac2/bed/blastz.monDom4.swap cd /cluster/data/rheMac2/bed/blastz.monDom4.swap ln -s blastz.monDom4.swap ../blastz.monDom4 /cluster/bin/scripts/doBlastzChainNet.pl \ -swap /cluster/data/monDom4/bed/blastz.rheMac2/DEF >& swap.out& # fb.rheMac2.chainMonDom4Link.txt: # 611785347 bases of 2646704109 (23.115%) in intersection ######################################################################### # Blastz Marmoset calJac1 (DONE - 2007-11-15 - Hiram) ssh kkstore01 screen # use screen to control this job mkdir /cluster/data/rheMac2/bed/blastzCalJac1.2007-11-16 cd /cluster/data/rheMac2/bed/blastzCalJac1.2007-11-16 cat << '_EOF_' > DEF # Rhesus vs marmoset BLASTZ_M=50 # TARGET: Rhesus RheMac2 SEQ1_DIR=/scratch/hg/rheMac2/rheMac2.2bit SEQ1_LEN=/cluster/data/rheMac2/chrom.sizes SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY: Marmoset calJac1 SEQ2_DIR=/cluster/bluearc/scratch/data/calJac1/calJac1.2bit SEQ2_LEN=/cluster/data/calJac1/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 BASE=/cluster/data/rheMac2/bed/blastzCalJac1.2007-11-16 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs time nice -n +19 doBlastzChainNet.pl -verbose=2 `pwd`/DEF \ -chainMinScore=3000 -chainLinearGap=medium \ -bigClusterHub=kk > blastz.log 2>&1 & # real 2115m6.616s # failed during the load due to MySQL problems on hgwdev # run loadUp.csh manually on hgwdev # real 28m35.926s cat fb.rheMac2.chainCalJac1Link.txt # 2055107003 bases of 2646704109 (77.648%) in intersection time nice -n +19 doBlastzChainNet.pl -verbose=2 `pwd`/DEF \ -continue=download -chainMinScore=3000 -chainLinearGap=medium \ -bigClusterHub=kk > download.log 2>&1 & mkdir /cluster/data/calJac1/bed/blastz.rheMac2.swap cd /cluster/data/calJac1/bed/blastz.rheMac2.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /cluster/data/rheMac2/bed/blastzCalJac1.2007-11-16/DEF \ -swap -chainMinScore=3000 -chainLinearGap=medium \ -bigClusterHub=kk > swap.log 2>&1 & # real 349m36.073s cat fb.calJac1.chainRheMac2Link.txt # 2191300051 bases of 2929139385 (74.810%) in intersection ########################################################################### # Blastz Orangutan ponAbe2 (DONE - 2007-11-16 - Hiram) ssh kkstore01 screen # use screen to control this job mkdir /cluster/data/rheMac2/bed/blastzPonAbe2.2007-11-16 cd /cluster/data/rheMac2/bed/blastzPonAbe2.2007-11-16 cat << '_EOF_' > DEF # Rhesus vs marmoset BLASTZ_M=50 # TARGET: Rhesus RheMac2 SEQ1_DIR=/scratch/hg/rheMac2/rheMac2.2bit SEQ1_LEN=/cluster/data/rheMac2/chrom.sizes SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY: Orangutan ponAbe2 SEQ2_DIR=/cluster/bluearc/scratch/data/ponAbe2/ponAbe2.2bit SEQ2_LEN=/cluster/data/ponAbe2/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 BASE=/cluster/data/rheMac2/bed/blastzPonAbe2.2007-11-16 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs time nice -n +19 doBlastzChainNet.pl -verbose=2 `pwd`/DEF \ -chainMinScore=3000 -chainLinearGap=medium \ -bigClusterHub=kk > blastz.log 2>&1 & # something got lost during the para make. A couple of jobs couldn't # finish, due to memory limits. Finish them off on hgwdev # real 1911m11.906s # Completed: 108924 of 108928 jobs # Crashed: 4 jobs # CPU time in finished jobs: 20658708s 344311.80m 5738.53h 239.11d 0.655 y # IO & Wait Time: 1801215s 30020.25m 500.34h 20.85d 0.057 y # Average job time: 206s 3.44m 0.06h 0.00d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 5401s 90.02m 1.50h 0.06d # Submission to last job: 111924s 1865.40m 31.09h 1.30d # continuing after those were done: time nice -n +19 doBlastzChainNet.pl -verbose=2 `pwd`/DEF \ -continue=cat -chainMinScore=3000 -chainLinearGap=medium \ -bigClusterHub=kk > cat.log 2>&1 & # some jobs failed due to memory limits on kk nodes # finished them manually on kolossus and hgwdev, then continuing time nice -n +19 doBlastzChainNet.pl -verbose=2 `pwd`/DEF \ -chainMinScore=3000 -chainLinearGap=medium \ -continue=cat -bigClusterHub=kk > cat.log 2>&1 & # real 322m54.734s # failed during load due to MySQL problems on hgwdev # run loadUp.csh manually on hgwdev # real 22m22.225s cat fb.rheMac2.chainPonAbe2Link.txt # 2333264093 bases of 2646704109 (88.157%) in intersection time nice -n +19 doBlastzChainNet.pl -verbose=2 `pwd`/DEF \ -chainMinScore=3000 -chainLinearGap=medium \ -continue=download -bigClusterHub=kk > download.log 2>&1 & # and, for the swap mkdir /cluster/data/ponAbe2/bed/blastz.rheMac2.swap cd /cluster/data/ponAbe2/bed/blastz.rheMac2.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /cluster/data/rheMac2/bed/blastzPonAbe2.2007-11-16/DEF \ -swap -chainMinScore=3000 -chainLinearGap=medium \ -bigClusterHub=kk > swap.log 2>&1 & # real 289m9.479s cat fb.ponAbe2.chainRheMac2Link.txt # 2556010573 bases of 3093572278 (82.623%) in intersection ############################################################################ # SGP GENES (DONE - 2007-12-20 - Hiram) ssh kkstore01 mkdir /cluster/data/rheMac2/bed/sgp cd /cluster/data/rheMac2/bed/sgp mkdir gtf for C in `awk '{print $1}' /cluster/data/rheMac2/chrom.sizes` do wget --timestamping \ "http://genome.imim.es/genepredictions/M.mulatta/rmJan2006/SGP/200603mm8/${C}.gtf" \ -O "gtf/${C}.gtf" done ssh hgwdev cd /cluster/data/rheMac2/bed/sgp ldHgGene -gtf -genePredExt rheMac2 sgpGene gtf/chr*.gtf # Read 31416 transcripts in 264479 lines in 22 files # 31416 groups 22 seqs 1 sources 3 feature types # 31416 gene predictions featureBits rheMac2 sgpGene # 32556064 bases of 2646704109 (1.230%) in intersection featureBits rheMac2 -enrichment refGene:CDS sgpGene # refGene:CDS 0.017%, sgpGene 1.230%, both 0.014%, cover 83.53%, enrich 67.91x ##################################################################### # LOAD GENEID GENES (DONE - 2007-12-20 - Hiram) ssh kkstore01 mkdir -p /cluster/data/rheMac2/bed/geneid cd /cluster/data/rheMac2/bed/geneid mkdir gtf prot for C in `awk '{print $1}' /cluster/data/rheMac2/chrom.sizes` do echo $C wget --timestamping \ "http://genome.imim.es/genepredictions/M.mulatta/rmJan2006/geneid_v1.2/${C}.gtf" \ -O "gtf/${C}.gtf" wget --timestamping \ "http://genome.imim.es/genepredictions/M.mulatta/rmJan2006/geneid_v1.2/${C}.prot" \ -O "prot/${C}.prot" done # Add missing .1 to protein id's cd prot foreach f (*.prot) perl -wpe 's/^(>chr\S+)$/$1.1/' $f > $f:r-fixed.prot end ssh hgwdev cd /cluster/data/rheMac2/bed/geneid ldHgGene -genePredExt -gtf rheMac2 geneid gtf/*.gtf # Read 30202 transcripts in 248094 lines in 22 files # 30202 groups 22 seqs 1 sources 3 feature types # 30202 gene predictions hgPepPred rheMac2 generic geneidPep prot/chr*-fixed.prot featureBits rheMac2 geneid # 33978840 bases of 2646704109 (1.284%) in intersection featureBits rheMac2 -enrichment refGene:CDS geneid # refGene:CDS 0.017%, geneid 1.284%, both 0.013%, cover 77.14%, enrich 60.09x ########################################################################## ############################################################################ # TRANSMAP vertebrate.2008-05-20 build (2008-05-24 markd) vertebrate-wide transMap alignments were built Tracks are created and loaded by a single Makefile. This is available from: svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2008-05-20 see doc/builds.txt for specific details. ############################################################################ ############################################################################ # TRANSMAP vertebrate.2008-06-07 build (2008-06-30 markd) vertebrate-wide transMap alignments were built Tracks are created and loaded by a single Makefile. This is available from: svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2008-06-30 see doc/builds.txt for specific details. ############################################################################ ################################################ # AUTOMATE UPSTREAM FILE CREATION (2008-10-15 markd) update genbank.conf: rheMac2.upstreamGeneTbl = refGene ############################################################################ # TRANSMAP vertebrate.2009-07-01 build (2009-07-21 markd) vertebrate-wide transMap alignments were built Tracks are created and loaded by a single Makefile. This is available from: svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2009-07-01 see doc/builds.txt for specific details. ############################################################################ ############################################################################ # TRANSMAP vertebrate.2009-09-13 build (2009-09-20 markd) vertebrate-wide transMap alignments were built Tracks are created and loaded by a single Makefile. This is available from: svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2009-09-13 see doc/builds.txt for specific details. ############################################################################ # calJac3 Marmoset BLASTZ/CHAIN/NET (DONE - 2010-02-11 - Hiram) screen # use a screen to manage this multi-day job mkdir /hive/data/genomes/rheMac2/bed/lastzCalJac3.2010-02-11 cd /hive/data/genomes/rheMac2/bed/lastzCalJac3.2010-02-11 # same kind of parameters as used in human<->marmoset cat << '_EOF_' > DEF # Rhesus vs. marmoset BLASTZ=lastz # maximum M allowed with lastz is only 254 BLASTZ_M=254 BLASTZ_Q=/scratch/data/blastz/human_chimp.v2.q # and place those items here BLASTZ_O=600 BLASTZ_E=150 BLASTZ_K=4500 BLASTZ_Y=15000 BLASTZ_T=2 # TARGET: Rhesus RheMac2 SEQ1_DIR=/scratch/data/rheMac2/rheMac2.2bit SEQ1_LEN=/scratch/data/rheMac2/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 SEQ1_LIMIT=5 # QUERY: Marmoset (calJac3) SEQ2_DIR=/scratch/data/calJac3/calJac3.2bit SEQ2_LEN=/scratch/data/calJac3/chrom.sizes SEQ2_LIMIT=50 SEQ2_CHUNK=10000000 SEQ2_LAP=0 BASE=/hive/data/genomes/rheMac2/bed/lastzCalJac3.2010-02-11 TMPDIR=/scratch/tmp '_EOF_' # << this line keeps emacs coloring happy time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -syntenicNet \ -chainMinScore=5000 -chainLinearGap=medium \ -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \ > do.log 2>&1 & # real 248m5.651s cat fb.rheMac2.chainCalJac3Link.txt # 1871513554 bases of 2646704109 (70.711%) in intersection mkdir /hive/data/genomes/calJac3/bed/blastz.rheMac2.swap cd /hive/data/genomes/calJac3/bed/blastz.rheMac2.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/rheMac2/bed/lastzCalJac3.2010-02-11/DEF \ -swap -syntenicNet \ -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \ -chainMinScore=5000 -chainLinearGap=medium > swap.log 2>&1 & # real 142m34.894s cat fb.calJac3.chainHg19Link.txt # 1916431926 bases of 2752505800 (69.625%) in intersection ############################################################################ # refSeqAnno - RefSeq genomic annotations at the request of # Kim Pruit for use by the Rhesus community. # (2010-10-14 markd) checkout markds python's tools in some directory: svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/code/pycbio/trunk and run make at top mkdir /hive/data/genomes/rheMac2/bed/refSeqAnno cd /hive/data/genomes/rheMac2/bed/refSeqAnno # get NCBI genome annotation: wget ftp://ftp.ncbi.nih.gov/genomes/Macaca_mulatta/README_CURRENT_BUILD wget -nv -r -A 'mmu_ref_*.gbs.gz' 'ftp://ftp.ncbi.nih.gov/genomes/Macaca_mulatta/CHR*' mkdir gbs mv ftp.ncbi.nih.gov/genomes/Macaca_mulatta/*/*.gbs.gz gbs/ rm -rf ftp.ncbi.nih.gov # get mRNAs to create mRNA <-> protein mappings wget ftp://ftp.ncbi.nih.gov/genomes/Macaca_mulatta/RNA/rna.gbk.gz zcat rna.gbk.gz |awk '/^VERSION/{mrna=$2} /\/protein_id=/{print mrna"\t"gensub("^.*\"(.*)\"","\\1","g",$1)}' >mrna2prot.tbl zcat gbs/*.gz |~/compbio/code/pycbio/bin/gbffGenesToGenePred --mrnaProtMap=mrna2prot.tbl --refSeqOnly --ext --allGenes --other /dev/stdin refSeqAnno.contig.gp >&cnv.probs # no lift file was made, so must build one ftp://ftp.ncbi.nih.gov/genomes/Macaca_mulatta/seq_contig.md.gz mdToNcbiLift seq_contig.md.gz ncbi.lft # need to fix various things in lift: # - drop /start and /end entries # - we don't have chromsome chrUn # - mappings to weird chome names like: chrX|NW_001219121.1 tawk '$2~/\/start/ || $2~/\/end/ || $4 ~/chrUn/{next} {$4=gensub("[|].*", "", "g",$4); print $0}' ncbi.lft >ncbi.fix.lft # lift to chromosomes liftUp -type=.genepred -extGenePred refSeqAnno.gp ncbi.fix.lft warn refSeqAnno.contig.gp >&lift.probs # 635 drop due to not having chrUn genePredCheck -db=rheMac2 refSeqAnno.gp hgLoadGenePred -genePredExt rheMac2 refSeqAnno refSeqAnno.gp # add to trackDb/rhesus/trackDb.ra trackDb/rhesus/rheMac2/trackDb.ra # create trackDb/rhesus/refSeqAnno.html ############################################################################ # LASTZ Gorilla GorGor3 (DONE - 2011-10-21,23 - Hiram) mkdir /hive/data/genomes/rheMac2/bed/lastzGorGor3.2011-10-21 cd /hive/data/genomes/rheMac2/bed/lastzGorGor3.2011-10-21 cat << '_EOF_' > DEF # Gorilla vs Rhesus # maximum M allowed with lastz is only 254 BLASTZ_M=254 BLASTZ_Q=/scratch/data/blastz/human_chimp.v2.q BLASTZ_O=600 BLASTZ_E=150 # other parameters on advice from Webb BLASTZ_K=4500 BLASTZ_Y=15000 BLASTZ_T=2 # TARGET: Gorilla rheMac2 SEQ1_DIR=/scratch/data/rheMac2/rheMac2.2bit SEQ1_LEN=/scratch/data/rheMac2/chrom.sizes SEQ1_CHUNK=10000000 SEQ1_LAP=10000 SEQ1_LIMIT=30 # QUERY: Gibbon gorGor3 SEQ2_DIR=/scratch/data/gorGor3/gorGor3.2bit SEQ2_LEN=/scratch/data/gorGor3/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 SEQ2_LIMIT=100 BASE=/hive/data/genomes/rheMac2/bed/lastzGorGor3.2011-10-21 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs # establish a screen to control this job screen time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -syntenicNet \ -noLoadChainSplit -chainMinScore=5000 -chainLinearGap=medium \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ > do.log 2>&1 & # 1729m35s cat fb.rheMac2.chainGorGor3Link.txt # 2175264883 bases of 2646704109 (82.188%) in intersection cd /hive/data/genomes/rheMac2/bed ln -s lastzGorGor3.2011-10-21 lastz.gorGor3 # running the swap - DONE - 2011-09-21 mkdir /hive/data/genomes/gorGor3/bed/blastz.rheMac2.swap cd /hive/data/genomes/gorGor3/bed/blastz.rheMac2.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/rheMac2/bed/lastzGorGor3.2011-10-21/DEF \ -swap -syntenicNet \ -noLoadChainSplit -chainMinScore=5000 -chainLinearGap=medium \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ > swap.log 2>&1 & # real 85m16.618s cat fb.gorGor3.chainRheMac2Link.txt # 2236867094 bases of 2822760080 (79.244%) in intersection ######################################################################### # POLYA-SEQ TRACK (from Adnan Derti, Merck) (DONE, Andy 2012-02-06) # (see hg19.txt for multi-species build notes) ############################################################################ # LIFTOVER TO rheMac3 (DONE - 2012-07-24 - Hiram ) # need a .ooc file for this # hg19 is 2897316137, calculate the ratio factor for 1024: # the 2646668809 is the 'real' size from faSize on the genome calc \( 2646668809 / 2897316137 \) \* 1024 # ( 2646668809 / 2897316137 ) * 1024 = 935.413580 # round down to 900 as was in rheMac3: cd /hive/data/genomes/rheMac2 blat rheMac2.2bit /dev/null /dev/null -tileSize=11 \ -makeOoc=jkStuff/rheMac2.11.ooc -repMatch=900 # Wrote 31602 overused 11-mers to jkStuff/rheMac2.11.ooc mkdir /hive/data/genomes/rheMac2/bed/blat.rheMac3.2012-07-24 cd /hive/data/genomes/rheMac2/bed/blat.rheMac3.2012-07-24 # -debug run to create run dir, preview scripts... doSameSpeciesLiftOver.pl -debug \ -ooc /hive/data/genomes/rheMac2/jkStuff/rheMac2.11.ooc rheMac2 rheMac3 # Real run: time nice -n +19 doSameSpeciesLiftOver.pl -verbose=2 \ -ooc /hive/data/genomes/rheMac2/jkStuff/rheMac2.11.ooc \ -bigClusterHub=swarm -dbHost=hgwdev -workhorse=hgwdev \ rheMac2 rheMac3 > do.log 2>&1 # real 24m28.376s #############################################################################