# for emacs: -*- mode: sh; -*- # This file describes how we made the browser database on # the Patch 1 release for the GRC build 38 (August 2012) aka: # GRCm38.p1 - Genome Reference Consortium Mouse Reference 38 patch 1 ############################################################################ # gather sequence and AGP definitions (DONE - 2012-09-21 - Hiram) mkdir -p /hive/data/genomes/mm10Patch1/sequence cd /hive/data/genomes/mm10Patch1/sequence # a round about way here since patch1 sequence was already assembled. # there are perl and shell scripts in # ../../mm10/bed/patch1 # which created the fasta file with UCSC names # see also in mm10.txt: # GRCm38.p1 patch 1 (DONE - 2012-09-21 - Hiram) ln -s ../../mm10/bed/patch1/mm10.patch1.fa . # check what might be different from previous (there is no previous) faCount mm10.patch1.fa | grep -v total | grep -v seq \ | sort > patch1.faCount.txt faCount ../some/previous/mm10.patchN.fa | grep -v total | grep -v seq | sort > patchN.faCount.txt comm -12 patchN.faCount.txt patch1.faCount.txt | wc # xx xxx xxxx # it appears that seven have been removed since patch N: comm -23 patchN.faCount.txt patch1.faCount.txt | wc # 7 56 385 # and 54 added: comm -13 patchN.faCount.txt patch1.faCount.txt | wc # 54 432 2960 # reuse the script from hg19 patch9 sed -e "s/atch5/atch1/g" ../../hg19Patch9/sequence/patch9Agp.pl \ > ./patch1Agp.pl # one little addition to this script since then: diff ./patch1Agp.pl ../../mm10Patch5/sequence/patch5Agp.pl # 41d40 # < $faName =~ s/.*gb.JH/JH/; ./patch1Agp.pl \ ../../mm10/bed/patch1/patches.chrom.sizes \ ../../mm10/bed/patch1/ucscNames.patch1.txt \ ../../mm10/bed/patch1/genbank/PATCHES/alt_scaffolds/AGP/alt.scaf.agp.gz \ > mm10Patch1.agp # verify we have correct sequence and AGP file: faToTwoBit *.fa patch1.2bit checkAgpAndFa mm10Patch1.agp patch1.2bit 2>&1 | tail -3 # All AGP and FASTA entries agree - both files are valid # there is no previous patch yet, do this next time # compare the two chrom.sizes to see what is missing or has been added twoBitInfo patch1.2bit stdout | sort > patch1.chrom.sizes twoBitInfo ../../mm10PatchN/sequence/patchN.2bit stdout \ | sort > patchN.chrom.sizes # 108 identical: comm -12 patchN.chrom.sizes patch1.chrom.sizes | wc # 108 216 2674 # 8 have disappeared: comm -23 patchN.chrom.sizes patch1.chrom.sizes | wc # 7 14 170 # and 54 new ones: comm -13 patchN.chrom.sizes patch1.chrom.sizes | wc # 54 108 1205 ########################################################################### # Build the browser (DONE - 2012-09-21 - Hiram) cd /hive/data/genomes/mm10Patch1 cat << '_EOF_' > mm10Patch1.config.ra # Config parameters for makeGenomeDb.pl: db mm10Patch1 clade haplotypes genomeCladePriority 199 scientificName Mus musculus commonName GRCm38.p1 assemblyDate Aug. 2012 assemblyLabel GRCm38 Patch 1 Genome Reference Consortium Mouse Reference 38 assemblyShortLabel GRCm38.p1 orderKey 1299 mitoAcc none fastaFiles /hive/data/genomes/mm10Patch1/sequence/mm10.patch1.fa agpFiles /hive/data/genomes/mm10Patch1/sequence/mm10Patch1.agp # qualFiles /dev/null dbDbSpeciesDir mouse photoCreditURL http://www.jax.org/ photoCreditName Photo courtesy The Jackson Laboratory ncbiGenomeId 52 ncbiAssemblyId 416688 ncbiAssemblyName GRCm38.p1 ncbiBioProject 20689 genBankAccessionID GCA_000001635.3 taxId 10090 '_EOF_' # << happy emacs # you need to have the clade and genomeCladePriority since this unique # db name mm10Patch1 is always a 'new' genome # stop after agp to verify agp and fasta agree properly makeGenomeDb.pl -dbHost=hgwdev -fileServer=hgwdev -workhorse=hgwdev \ -stop=agp mm10Patch1.config.ra > makeGenomeDb.log 2>&1 makeGenomeDb.pl -dbHost=hgwdev -fileServer=hgwdev -workhorse=hgwdev \ -continue=db mm10Patch1.config.ra > makeGenomeDb.db.log 2>&1 featureBits -countGaps mm10Patch1 gap # 0 bases of 2926456 (0.000%) in intersection ########################################################################### # RepeatMasker (WORKING - 2012-07-17 - Hiram) mkdir /hive/data/genomes/mm10Patch1/bed/repeatMasker cd /hive/data/genomes/mm10Patch1/bed/repeatMasker time doRepeatMasker.pl mm10Patch1 -buildDir=`pwd` -noSplit \ -bigClusterHub=swarm \ -dbHost=hgwdev -workhorse=hgwdev > do.log 2>&1 & # real 82m56.099s # temporary broken twoBitMask command time doRepeatMasker.pl mm10Patch1 -buildDir=`pwd` -noSplit \ -continue=install -bigClusterHub=swarm \ -dbHost=hgwdev -workhorse=hgwdev > install.log 2>&1 & # real 0m15.910s cat faSize.rmsk.txt # 2926456 bases (0 N's 2926456 real 1369863 upper 1556593 lower) # in 9 sequences in 1 files # Total size: mean 325161.8 sd 124628.6 min 182256 (chrX_jh792831) # max 544189 (chrY_jh792832) median 331480 # %53.19 masked total, %53.19 masked real ########################################################################### # TRF simple repeats (WORKING - 2012-07-17 - Hiram) mkdir /hive/data/genomes/mm10Patch1/bed/simpleRepeat cd /hive/data/genomes/mm10Patch1/bed/simpleRepeat time doSimpleRepeat.pl mm10Patch1 -buildDir=`pwd` -dbHost=hgwdev \ -smallClusterHub=swarm -workhorse=hgwdev > do.log 2>&1 & # real 1m4.143s cat fb.simpleRepeat # 135001 bases of 2926456 (4.613%) in intersection cd /hive/data/genomes/mm10Patch1 twoBitMask mm10Patch1.rmsk.2bit \ -add bed/simpleRepeat/trfMask.bed mm10Patch1.2bit # safe to ignore warning: has >=13 fields twoBitToFa mm10Patch1.2bit stdout | faSize stdin \ > faSize.mm10Patch1.2bit.txt # 2926456 bases (0 N's 2926456 real 1367108 upper 1559348 lower) # in 9 sequences in 1 files # Total size: mean 325161.8 sd 124628.6 min 182256 (chrX_jh792831) # max 544189 (chrY_jh792832) median 331480 # %53.28 masked total, %53.28 masked real time blat mm10Patch1.2bit \ /dev/null /dev/null -tileSize=11 -makeOoc=jkStuff/mm10Patch1.11.ooc \ -repMatch=1024 # Wrote 0 overused 11-mers to jkStuff/mm10Patch1.11.ooc # real 0m0.163s rm /gbdb/mm10Patch1/mm10Patch1.2bit ln -s `pwd`/mm10Patch1.2bit /gbdb/mm10Patch1/ # the makeGenomeDb.pl script changed the case of the genome name: hgsql -e 'update dbDb set genome="GRCm38.p1" where name="mm10Patch1";' \ hgcentraltest ########################################################################### # ctgPos track (DONE - 2012-09-25 - Hiram) mkdir /hive/data/genomes/mm10Patch1/bed/ctgPos cd /hive/data/genomes/mm10Patch1/bed/ctgPos # in hg19, this used to look for positions in ctgPos, there # isn't any ctgPos in mm10, there will be errors from that # select statement here, but this works: for C in `cut -f1 ../../chrom.sizes` do ctgPos=`hgsql -N -e 'select * from ctgPos where chrom="'$C'";' mm10` if [ "x${ctgPos}y" = "xy" ]; then GL=`echo $C | sed -e "s/.*_gl//; s/.*_jh//"` glAcc=`grep -i ${GL} ../../../mm10/bed/patch1/genbank/PATCHES/scaffold_localID2acc | cut -f2` glSize=`grep -i ${GL} ../../chrom.sizes | cut -f2` echo -e "$glAcc\t$glSize\t${C}\t0\t$glSize" else echo "$ctgPos" fi done > ctgPos.txt # check length of ctg names: cut -f 1 ctgPos.txt | awk '{print length($0)}' | sort -n | tail -1 # 10 # and length of chrom names: cut -f 3 ctgPos.txt | awk '{print length($0)}' | sort -n | tail -1 # 14 # set those lengths in the indexes for the SQL create: sed -e "s/14/10/; s/16/14/" $HOME/kent/src/hg/lib/ctgPos.sql > ctgPos.sql hgLoadSqlTab mm10Patch1 ctgPos ctgPos.sql ctgPos.txt # should be %100 with gaps: featureBits -countGaps mm10Patch1 ctgPos # 2926456 bases of 2926456 (100.000%) in intersection ########################################################################### # ctgPos2 track (DONE - 2012-09-25 - Hiram) mkdir /hive/data/genomes/mm10Patch1/bed/ctgPos2 cd /hive/data/genomes/mm10Patch1/bed/ctgPos2 for C in `cut -f1 ../../chrom.sizes` do GL=`echo $C | sed -e "s/chr.*_//; s/jh/JH/"` glSize=`grep ${C} ../../chrom.sizes | cut -f2` cmName=`grep ${GL} ../../../mm10/bed/patch1/genbank/PATCHES/alt_scaffolds/alt_scaffold_placement.txt | cut -f3` echo -e "$cmName\t$glSize\t${C}\t0\t$glSize\tF" done > ctgPos2.tab # check length of ctg names: cut -f 1 ctgPos2.tab | awk '{print length($0)}' | sort -n | tail -1 # 12 # and length of chrom names: cut -f 3 ctgPos2.tab | awk '{print length($0)}' | sort -n | tail -1 # 14 sed -e "s/20/12/; s/16/14/" $HOME/kent/src/hg/lib/ctgPos2.sql \ > ctgPos2.sql hgLoadSqlTab mm10Patch1 ctgPos2 ctgPos2.sql ctgPos2.tab # should be %100 with gaps featureBits -countGaps mm10Patch1 ctgPos2 # 2926456 bases of 2926456 (100.000%) in intersection ########################################################################### # altSequence track (DONE - 2012-09-25 - Hiram) # provide links to locations on reference genome where these patches and # haplotypes belong mkdir /hive/data/genomes/mm10Patch1/bed/altSequence cd /hive/data/genomes/mm10Patch1/bed/altSequence ln -s ../../../mm10/bed/patch1/altSequence.bed \ altSeq.bed.0 cat altSeq.bed.0 | while read L do C=`echo "${L}" | awk '{print $4}'` mm10C=`echo "${L}" | awk '{print $1}'` mm10S=`echo "${L}" | awk '{print $2}'` mm10E=`echo "${L}" | awk '{print $3}'` S=`grep "^${C}" ../../chrom.sizes | cut -f2` echo $C $S $mm10C $mm10S $mm10E | awk '{printf "%s\t0\t%d\t%s:%d-%d\t", $1, $2, $3, $4, $5}' echo "${L}" | awk '{printf "%d\t%s\t%d\t%d\t%s\n", $5,$6,$7,$8,$9}' done > altSequence.tab hgLoadBed mm10Patch1 altSequence altSequence.tab # Loaded 162 elements of size 9 from altSequence.tab featureBits -countGaps mm10Patch1 altSequence # 2926456 bases of 2926456 (100.000%) in intersection ############################################################################ # lastz alignment to mm10 (WORKING - 2012-09-25 - Hiram) mkdir /hive/data/genomes/mm10Patch1/bed/lastzMm10.2012-09-25 cd /hive/data/genomes/mm10Patch1/bed/lastzMm10.2012-09-25 # construct a 2bit file of just the mm10 reference sequences # and all the business to run lastz on each haplotype with its # corresponding target sequence in mm10 rm -fr mm10Bits run.blastz mm10Bits.lift mkdir mm10Bits mkdir -p run.blastz/tParts mkdir -p run.blastz/qParts awk '{print $1}' ../altSequence/altSequence.tab | sort -u | while read H do P=`grep "^${H}" ../altSequence/altSequence.tab | head -1 | awk '{print $4}'` HE=`grep "^${H}" ../altSequence/altSequence.tab | head -1 | awk '{print $3}'` C=`echo ${P} | sed -e "s/:.*//"` CE=`grep "^${C}" /hive/data/genomes/mm10/chrom.sizes | cut -f2 | head -1` SE=`echo ${P} | sed -e "s/.*://"` S=`echo ${SE} | sed -e "s/-.*//" | awk '{printf "%d", $1-1}'` if [ "${S}" -lt 0 ]; then S=0 fi E=`echo ${SE} | sed -e "s/.*-//"` size=`echo $E $S | awk '{printf "%d", $1-$2}'` echo -e "$S\t$C.$S-$E\t$size\t$C\t$CE" echo mm10.2bit:${C}:$S-$E 1>&2 if [ ! -s mm10Bits/$C.$S-$E.fa ]; then echo ">$C.$S-$E" > mm10Bits/$C.$S-$E.fa twoBitToFa /gbdb/mm10/mm10.2bit:${C}:$S-$E stdout \ | grep -v "^>" >> mm10Bits/$C.$S-$E.fa fi echo -e "/hive/data/genomes/mm10Patch1/mm10Patch1.2bit:$H:0-$HE" \ > run.blastz/tParts/$H.lst echo -e "/hive/data/genomes/mm10Patch1/bed/lastzMm10.2012-09-25/mm10Bits.2bit:$C.$S-$E:0-$size" \ > run.blastz/qParts/$H.lst echo -e "/cluster/bin/scripts/blastz-run-ucsc -outFormat psl tParts/$H.lst qParts/$H.lst ../DEF {check out exists ../psl/$H.psl}" \ >> run.blastz/jobList done | sort -u > mm10Bits.lift faToTwoBit mm10Bits/chr*.fa mm10Bits.2bit twoBitInfo mm10Bits.2bit stdout | sort -k2nr > mm10Bits.chrom.sizes cat << '_EOF_' > DEF # mouse vs mouse BLASTZ=lastz # maximum M allowed with lastz is only 254 BLASTZ_M=254 # lastz does not like the O= and E= lines in the matrix file BLASTZ_Q=/scratch/data/blastz/human_chimp.v2.q BLASTZ_O=600 BLASTZ_E=150 # other parameters from hg18 vs venter1 lastz on advice from Webb BLASTZ_K=10000 BLASTZ_Y=15000 BLASTZ_T=2 # TARGET: Mouse Mm10Patch1 SEQ1_DIR=/hive/data/genomes/mm10Patch1/mm10Patch1.2bit SEQ1_LEN=/hive/data/genomes/mm10Patch1/chrom.sizes SEQ1_CHUNK=5000000 SEQ1_LAP=10000 SEQ1_IN_CONTIGS=0 SEQ1_LIMIT=2 # QUERY: Mouse Mm10 SEQ2_DIR=/scratch/data/mm10/mm10.2bit SEQ2_LEN=/scratch/data/mm10/chrom.sizes SEQ2_CTGDIR=/hive/data/genomes/mm10Patch1/bed/lastzMm10.2012-09-25/mm10Bits.2bit SEQ2_CTGLEN=/hive/data/genomes/mm10Patch1/bed/lastzMm10.2012-09-25/mm10Bits.chrom.sizes SEQ2_LIFT=/hive/data/genomes/mm10Patch1/bed/lastzMm10.2012-09-25/mm10Bits.lift SEQ2_CHUNK=5000000 SEQ2_LAP=0 SEQ2_IN_CONTIGS=0 SEQ2_LIMIT=2 BASE=/hive/data/genomes/mm10Patch1/bed/lastzMm10.2012-09-25 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs ssh swarm cd /hive/data/genomes/mm10Patch1/bed/lastzMm10.2012-09-25/run.blastz mkdir ../psl para create jobList para try ... check ... push para time # Completed: 9 of 9 jobs # CPU time in finished jobs: 11s 0.18m 0.00h 0.00d 0.000 y # IO & Wait Time: 21s 0.35m 0.01h 0.00d 0.000 y # Average job time: 4s 0.06m 0.00h 0.00d # Longest finished job: 5s 0.08m 0.00h 0.00d # Submission to last job: 17s 0.28m 0.00h 0.00d # put together the individual results: cd /hive/data/genomes/mm10Patch1/bed/lastzMm10.2012-09-25 mkdir pslParts cat psl/chr*.psl | gzip -c > pslParts/mm10Patch1.mm10.psl.gz # constructing a chain from those results mkdir -p /hive/data/genomes/mm10Patch1/bed/lastzMm10.2012-09-25/axtChain/run cd /hive/data/genomes/mm10Patch1/bed/lastzMm10.2012-09-25/axtChain/run time zcat ../../pslParts/mm10Patch1.mm10.psl.gz \ | axtChain -psl -verbose=0 -scoreScheme=/scratch/data/blastz/human_chimp.v2.q -minScore=2000 -linearGap=medium stdin \ /hive/data/genomes/mm10Patch1/mm10Patch1.2bit \ /hive/data/genomes/mm10Patch1/bed/lastzMm10.2012-09-25/mm10Bits.2bit \ stdout \ | chainAntiRepeat /hive/data/genomes/mm10Patch1/mm10Patch1.2bit \ /hive/data/genomes/mm10Patch1/bed/lastzMm10.2012-09-25/mm10Bits.2bit \ stdin mm10Patch1.mm10.preLift.chain # real 0m48.446s liftUp -chainQ mm10Patch1.mm10.lifted.chain \ ../../mm10Bits.lift carry mm10Patch1.mm10.preLift.chain # constructing the net files: cd /hive/data/genomes/mm10Patch1/bed/lastzMm10.2012-09-25/axtChain chainMergeSort run/mm10Patch1.mm10.lifted.chain \ | nice gzip -c > mm10Patch1.mm10.all.chain.gz chainSplit chain mm10Patch1.mm10.all.chain.gz # Make nets ("noClass", i.e. without rmsk/class stats which are added later): time chainPreNet mm10Patch1.mm10.all.chain.gz \ /hive/data/genomes/mm10Patch1/chrom.sizes \ /scratch/data/mm10/chrom.sizes stdout \ | chainNet stdin -minSpace=1 /hive/data/genomes/mm10Patch1/chrom.sizes \ /scratch/data/mm10/chrom.sizes stdout /dev/null \ | netSyntenic stdin noClass.net # Make liftOver chains: netChainSubset -verbose=0 noClass.net mm10Patch1.mm10.all.chain.gz stdout \ | chainStitchId stdin stdout | gzip -c > mm10Patch1.mm10.over.chain.gz # Make axtNet for download: one .axt per mm10Patch1 seq. netSplit noClass.net net cd .. mkdir -p axtNet foreach f (axtChain/net/*.net) netToAxt $f axtChain/chain/$f:t:r.chain \ /hive/data/genomes/mm10Patch1/mm10Patch1.2bit \ /scratch/data/mm10/mm10.2bit stdout \ | axtSort stdin stdout \ | gzip -c > axtNet/$f:t:r.mm10Patch1.mm10.net.axt.gz end # Make mafNet for multiz: one .maf per mm10Patch1 seq. mkdir -p mafNet foreach f (axtNet/*.mm10Patch1.mm10.net.axt.gz) axtToMaf -tPrefix=mm10Patch1. -qPrefix=mm10. $f \ /hive/data/genomes/mm10Patch1/chrom.sizes \ /scratch/data/mm10/chrom.sizes \ stdout \ | gzip -c > mafNet/$f:t:r:r:r:r:r.maf.gz end # swap that business to mm10 mkdir /hive/data/genomes/mm10/bed/blastz.mm10Patch1.swap cd /hive/data/genomes/mm10/bed/blastz.mm10Patch1.swap time doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10Patch1/bed/lastzMm10.2012-09-25/DEF \ -swap -noDbNameCheck -stop=load \ -noLoadChainSplit -chainMinScore=2000 \ -chainLinearGap=medium -workhorse=hgwdev \ -smallClusterHub=encodek -bigClusterHub=swarm > swap.load.log 2>&1 # real 0m43.451s cat fb.mm10.chainMm10Patch1Link.txt # 2510337 bases of 2652783500 (0.095%) in intersection # and then fixup the chains to include the haplotypes cd /hive/data/genomes/mm10/bed/blastz.mm10Patch1.swap/axtChain # split up each chain by the mm10Patch1 query sequences mkdir -p queryChains chainSplit -q queryChains mm10.mm10Patch1.all.chain.gz # then run a 'lift over' chain/net on each single one mkdir -p singleLiftOver for F in queryChains/*.chain do C=`basename ${F}` B=`echo ${C} | sed -e "s/.chain//"` chainPreNet -inclHap ${F} /scratch/data/mm10/chrom.sizes \ /hive/data/genomes/mm10Patch1/chrom.sizes stdout \ | chainNet -inclHap stdin -minSpace=1 /scratch/data/mm10/chrom.sizes \ /hive/data/genomes/mm10Patch1/chrom.sizes singleLiftOver/${B}.raw.net \ /dev/null netSyntenic singleLiftOver/${B}.raw.net singleLiftOver/${B}.noClass.net netFilter -chimpSyn singleLiftOver/${B}.noClass.net > singleLiftOver/${B}.chimpSyn.net netChainSubset -verbose=0 singleLiftOver/${B}.noClass.net \ ${F} stdout \ | chainStitchId stdin stdout > singleLiftOver/${C} echo "${F} -> singleLiftOver/${C}" done # put the chains together into one file chainMergeSort singleLiftOver/chr*.chain | gzip -c \ > mm10.mm10Patch1.single.over.chain.gz # construct psl files from those chains chainToPsl mm10.mm10Patch1.single.over.chain.gz \ /hive/data/genomes/mm10/chrom.sizes \ /hive/data/genomes/mm10Patch1/chrom.sizes \ /hive/data/genomes/mm10/mm10.2bit \ /hive/data/genomes/mm10Patch1/mm10Patch1.2bit \ mm10.mm10Patch1.over.psl # chainToPsl appears to have a problem, note errors from pslCheck: pslCheck -db=mm10 mm10.mm10Patch1.over.psl # checked: 10 failed: 0 errors: 0 # despite no errors, this appears to change the psl: pslRecalcMatch mm10.mm10Patch1.over.psl \ /hive/data/genomes/mm10/mm10.2bit \ /hive/data/genomes/mm10Patch1/mm10Patch1.2bit \ fixup.mm10.mm10Patch1.over.psl pslCheck -db=mm10 fixup.mm10.mm10Patch1.over.psl # checked: 10 failed: 0 errors: 0 # load this PSL track hgLoadPsl mm10 -table=altSeqLiftOverPslP1 fixup.mm10.mm10Patch1.over.psl ############################################################################ # Add this sequence to mm10 (DONE - 2012-09-25 - Hiram) mkdir /hive/data/genomes/mm10Patch1/bed/altSequence/seqExt cd /hive/data/genomes/mm10Patch1/bed/altSequence/seqExt twoBitToFa ../../../mm10Patch1.2bit mm10Patch1.fa mkdir -p /gbdb/mm10/mm10Patch1 mm10Patch1 faSplit byname mm10Patch1.fa ./mm10Patch1/ ln -s `pwd`/mm10Patch1/*.fa /gbdb/mm10/mm10Patch1 hgLoadSeq -drop -seqTbl=seqMm10Patch1 -extFileTbl=extMm10Patch1 mm10 \ /gbdb/mm10/mm10Patch1/*.fa ############################################################################