# for emacs: -*- mode: sh; -*- # This file describes how we made the browser database on # the Patch 9 release for the NCBI build 37 (February 2009 freeze) aka: # GRCh37.p9 - Genome Reference Consortium Human Reference 37 ############################################################################ # gather sequence and AGP definitions (DONE - 2012-07-17 - Hiram) mkdir -p /hive/data/genomes/hg19Patch9/sequence cd /hive/data/genomes/hg19Patch9/sequence # a round about way here since patch9 sequence was already assembled. # there are perl and shell scripts in # ../../hg19/bed/additionalSequence/patch9 # which created the fasta file with UCSC names # see also in hg19.txt: # NCBI patch 9 (DONE - 2012-07-16 - Hiram) ln -s ../../hg19/bed/additionalSequence/patch9/hg19.patch9.fa . # check what might be different from previous faCount hg19.patch9.fa | grep -v total | grep -v seq \ | sort > patch9.faCount.txt faCount ../../hg19/bed/additionalSequence/patch5/hg19.patch5.fa \ | grep -v total | grep -v seq | sort > patch5.faCount.txt comm -12 patch5.faCount.txt patch9.faCount.txt | wc # 98 784 5609 # it appears that seven have been removed since patch 5: comm -23 patch5.faCount.txt patch9.faCount.txt | wc # 7 56 385 # and 54 added: comm -13 patch5.faCount.txt patch9.faCount.txt | wc # 54 432 2960 # reuse the script from patch5 sed -e "s/atch5/atch9/g" ../../hg19Patch5/sequence/patch5Agp.pl \ > ./patch9Agp.pl # one little addition to this script since then: diff ./patch9Agp.pl ../../hg19Patch5/sequence/patch5Agp.pl # 41d40 # < $faName =~ s/.*gb.JH/JH/; ./patch9Agp.pl \ ../../hg19/bed/additionalSequence/patch9/patches.chrom.sizes \ ../../hg19/bed/additionalSequence/patch9/ucscNames.patch9.txt \ ../../hg19/bed/additionalSequence/patch9/PATCHES/alt_scaffolds/AGP/alt.scaf.agp.gz \ > hg19Patch9.agp # add in the haplotypes from hg19 for H in chr17_ctg5_hap1 chr4_ctg9_hap1 chr6_apd_hap1 chr6_cox_hap2 \ chr6_dbb_hap3 chr6_mann_hap4 chr6_mcf_hap5 chr6_qbl_hap6 \ chr6_ssto_hap7 do grep "^${H}" /hive/data/genomes/hg19/hg19.agp twoBitToFa ../../hg19/hg19.2bit:${H} ${H}.fa done >> hg19Patch9.agp # and the chrM_rCRS echo -e "chrM_rCRS\t1\t16569\t1\tF\tNC_012920\t1\t16569\t+" \ >> hg19Patch9.agp sed -e "s/^>.*/>chrM_rCRS/" \ ../../hg19/bed/additionalSequence/chrM/NC_012920.1.fa > chrM_rCRS.fa # verify we have correct sequence and AGP file: faToTwoBit *.fa patch9.2bit checkAgpAndFa hg19Patch9.agp patch9.2bit 2>&1 | tail -3 # All AGP and FASTA entries agree - both files are valid # compare the two chrom.sizes to see what is missing or has been added twoBitInfo patch9.2bit stdout | sort > patch9.chrom.sizes twoBitInfo ../../hg19Patch5/sequence/patch5.2bit stdout \ | sort > patch5.chrom.sizes # 108 identical: comm -12 patch5.chrom.sizes patch9.chrom.sizes | wc # 108 216 2674 # 8 have disappeared: comm -23 patch5.chrom.sizes patch9.chrom.sizes | wc # 7 14 170 # and 54 new ones: comm -13 patch5.chrom.sizes patch9.chrom.sizes | wc # 54 108 1205 ########################################################################### # Build the browser (DONE - 2012-07-17 - Hiram) cd /hive/data/genomes/hg19Patch9 cat << '_EOF_' > hg19Patch9.config.ra # Config parameters for makeGenomeDb.pl: db hg19Patch9 clade haplotypes genomeCladePriority 138 scientificName Homo sapiens commonName GRCh37.p9 assemblyDate Jul. 2012 assemblyLabel GRCh37 Patch 9 Genome Reference Consortium Human Reference 37 assemblyShortLabel GRCh37.p9 orderKey 144 mitoAcc none fastaFiles /hive/data/genomes/hg19Patch9/sequence/*.fa agpFiles /hive/data/genomes/hg19Patch9/sequence/hg19Patch9.agp # qualFiles /dev/null dbDbSpeciesDir human photoCreditURL http://www.cbse.ucsc.edu/ photoCreditName CBSE ncbiGenomeId 51 # this is patch8: 368578 can't find the patch 9 id right now ncbiAssemblyId 368578 ncbiAssemblyName GRCh37.p9 ncbiBioProject 31257 genBankAccessionID GCA_000001405.10 taxId 9606 '_EOF_' # << happy emacs # you need to have the clade and genomeCladePriority since this unique # db name hg19Patch9 is always a 'new' genome # stop after agp to verify agp and fasta agree properly makeGenomeDb.pl -dbHost=hgwdev -fileServer=hgwdev -workhorse=hgwdev \ -stop=agp hg19Patch9.config.ra > makeGenomeDb.log 2>&1 makeGenomeDb.pl -dbHost=hgwdev -fileServer=hgwdev -workhorse=hgwdev \ -continue=db hg19Patch9.config.ra > makeGenomeDb.db.log 2>&1 featureBits -countGaps hg19Patch9 gap # 8743321 bases of 93963414 (9.305%) in intersection ########################################################################### # RepeatMasker (WORKING - 2012-07-17 - Hiram) mkdir /hive/data/genomes/hg19Patch9/bed/repeatMasker cd /hive/data/genomes/hg19Patch9/bed/repeatMasker time doRepeatMasker.pl hg19Patch9 -buildDir=`pwd` -noSplit \ -bigClusterHub=encodek \ -dbHost=hgwdev -workhorse=hgwdev > do.log 2>&1 & # about 601m == 10h 10m cat faSize.rmsk.txt # 93963414 bases (8743672 N's 85219742 real 40393959 upper 44825783 lower) # in 162 sequences in 1 files # Total size: mean 580021.1 sd 1120614.0 min 16569 (chrM_rCRS) # max 7107865 (chr1_jh636052) median 201198 # %47.71 masked total, %52.60 masked real ########################################################################### # TRF simple repeats (WORKING - 2012-07-17 - Hiram) mkdir /hive/data/genomes/hg19Patch9/bed/simpleRepeat cd /hive/data/genomes/hg19Patch9/bed/simpleRepeat time doSimpleRepeat.pl hg19Patch9 -buildDir=`pwd` -dbHost=hgwdev \ -smallClusterHub=encodek -workhorse=hgwdev > do.log 2>&1 & # real 7m58.604s cat fb.simpleRepeat # 2867144 bases of 85220093 (3.364%) in intersection cd /hive/data/genomes/hg19Patch9 twoBitMask hg19Patch9.rmsk.2bit \ -add bed/simpleRepeat/trfMask.bed hg19Patch9.2bit # safe to ignore warning: has >=13 fields twoBitToFa hg19Patch9.2bit stdout | faSize stdin \ > faSize.hg19Patch9.2bit.txt # 93963414 bases (8743672 N's 85219742 real 40352594 upper 44867148 lower) # in 162 sequences in 1 files # Total size: mean 580021.1 sd 1120614.0 min 16569 (chrM_rCRS) # max 7107865 (chr1_jh636052) median 201198 # %47.75 masked total, %52.65 masked real time blat hg19Patch9.2bit \ /dev/null /dev/null -tileSize=11 -makeOoc=jkStuff/hg19Patch9.11.ooc \ -repMatch=1024 # Wrote 212 overused 11-mers to jkStuff/hg19Patch9.11.ooc rm /gbdb/hg19Patch9/hg19Patch9.2bit ln -s `pwd`/hg19Patch9.2bit /gbdb/hg19Patch9/ # the makeGenomeDb.pl script changed the case of the genome name: hgsql -e 'update dbDb set genome="GRCh37.p9" where name="hg19Patch9";' \ hgcentraltest ########################################################################### # ctgPos track (DONE - 2012-07-17 - Hiram) mkdir /hive/data/genomes/hg19Patch9/bed/ctgPos cd /hive/data/genomes/hg19Patch9/bed/ctgPos for C in `cut -f1 ../../chrom.sizes | grep -v chrM_rCRS` do ctgPos=`hgsql -N -e 'select * from ctgPos where chrom="'$C'";' hg19` if [ "x${ctgPos}y" = "xy" ]; then GL=`echo $C | sed -e "s/.*_gl//; s/.*_jh//"` glAcc=`grep -i ${GL} ../../../hg19/bed/additionalSequence/patch9/PATCHES/scaffold_localID2acc | cut -f2` glSize=`grep -i ${GL} ../../chrom.sizes | cut -f2` echo -e "$glAcc\t$glSize\t${C}\t0\t$glSize" else echo "$ctgPos" fi done > ctgPos.txt echo -e "NC_012920.1\t16569\tchrM_rCRS\t0\t16569" >> ctgPos.txt # check length of ctg names: cut -f 1 ctgPos.txt | awk '{print length($0)}' | sort -n | tail -1 # 11 # and length of chrom names: cut -f 3 ctgPos.txt | awk '{print length($0)}' | sort -n | tail -1 # 25 # set those lengths in the indexes for the SQL create: sed -e "s/14/11/; s/16/25/" $HOME/kent/src/hg/lib/ctgPos.sql > ctgPos.sql hgLoadSqlTab hg19Patch9 ctgPos ctgPos.sql ctgPos.txt # should be %100 with gaps: featureBits -countGaps hg19Patch9 ctgPos # 93963414 bases of 93963414 (100.000%) in intersection ########################################################################### # ctgPos2 track (DONE - 2012-07-17 - Hiram) mkdir /hive/data/genomes/hg19Patch9/bed/ctgPos2 cd /hive/data/genomes/hg19Patch9/bed/ctgPos2 for C in `cut -f1 ../../chrom.sizes | grep -v chrM_rCRS` do ctgPos2=`hgsql -N -e 'select * from ctgPos2 where chrom="'$C'";' hg19` if [ "x${ctgPos}y" = "xy" ]; then GL=`echo $C | sed -e "s/.*_gl//; s/.*_jh//"` glSize=`grep ${GL} /hive/data/genomes/hg19Patch9/chrom.sizes | cut -f2` ncbiChrName=`grep ${GL} ../../../hg19/bed/additionalSequence/patch9/PATCHES/scaffold_localID2acc | cut -f1` if [ "x${ncbiChrName}y" = "xy" ]; then GL=`echo $C | sed -e "s/_hap.*//" | sed -e "s/chr.*_/_/" | tr '[a-z]' '[A-Z]'` ncbiChrName=`grep -h ${GL} /hive/data/genomes/hg19/download/alternate_loci/ALT_REF_LOCI_?/localID2acc | cut -f1` fi echo -e "$ncbiChrName\t$glSize\t${C}\t0\t$glSize\tF" else echo -e "$ctgPos2\tF" fi done > ctgPos2.tab echo -e "NC_012920.1\t16569\tchrM_rCRS\t0\t16569\tF" >> ctgPos2.tab # check length of ctg names: cut -f 1 ctgPos2.tab | awk '{print length($0)}' | sort -n | tail -1 # 24 # and length of chrom names: cut -f 3 ctgPos2.tab | awk '{print length($0)}' | sort -n | tail -1 # 25 sed -e "s/20/24/; s/16/25/" $HOME/kent/src/hg/lib/ctgPos2.sql \ > ctgPos2.sql hgLoadSqlTab hg19Patch9 ctgPos2 ctgPos2.sql ctgPos2.tab # should be %100 with gaps featureBits -countGaps hg19Patch9 ctgPos2 # 93963414 bases of 93963414 (100.000%) in intersection ########################################################################### # altSequence track (DONE - 2012-07-17 - Hiram) # provide links to locations on reference genome where these patches and # haplotypes belong mkdir /hive/data/genomes/hg19Patch9/bed/altSequence cd /hive/data/genomes/hg19Patch9/bed/altSequence ln -s ../../../hg19/bed/additionalSequence/patch9/altSequence.bed \ altSeq.bed.0 cat altSeq.bed.0 | while read L do C=`echo "${L}" | awk '{print $4}'` hg19C=`echo "${L}" | awk '{print $1}'` hg19S=`echo "${L}" | awk '{print $2}'` hg19E=`echo "${L}" | awk '{print $3}'` S=`grep "^${C}" ../../chrom.sizes | cut -f2` echo $C $S $hg19C $hg19S $hg19E | awk '{printf "%s\t0\t%d\t%s:%d-%d\t", $1, $2, $3, $4, $5}' echo "${L}" | awk '{printf "%d\t%s\t%d\t%d\t%s\n", $5,$6,$7,$8,$9}' done | grep -v "chrM_rCRS:" > altSequence.tab hgLoadBed hg19Patch9 altSequence altSequence.tab # Loaded 162 elements of size 9 from altSequence.tab featureBits -countGaps hg19Patch9 altSequence # 93963414 bases of 93963414 (100.000%) in intersection ############################################################################ # create lift file on unBridged gaps for genbank splits (2012-07-17 - Hiram) mkdir /hive/data/genomes/hg19Patch9/bed/gap cd /hive/data/genomes/hg19Patch9/bed/gap # verify all gaps are properly in the gap table: time nice -n +19 findMotif -motif=gattaca -verbose=4 \ -strand=+ ../../hg19Patch9.2bit > findMotif.txt 2>&1 # real 0m0.370s grep "^#GAP " findMotif.txt | sed -e "s/^#GAP //" > allGaps.bed featureBits hg19Patch9 -not gap -bed=notGap.bed # 85220093 bases of 85220093 (100.000%) in intersection featureBits hg19Patch9 allGaps.bed notGap.bed -bed=new.gaps.bed # 0 bases of 85220093 (0.000%) in intersectio # there are no bases missing being counted as gaps # construct an unBridged gap file for genbank (there are no unbridged gaps) gapToLift hg19Patch9 hg19Patch9.unBridged.lift -bedFile=unBridged.lift.bed cp -p hg19Patch9.unBridged.lift ../../jkStuff ########################################################################### # AUTO UPDATE GENBANK RUN (NOT DONE - 2012-07-17 - Hiram) # we no longer run this. # align with latest genbank process. cd ~/kent/src/hg/makeDb/genbank git pull # edit etc/genbank.conf to add hg19Patch9 just before hg19Patch2 # hg19Patch9 - GRCh37.p9 - Genome Reference Consortium Human Reference 37 hg19Patch9.serverGenome = /hive/data/genomes/hg19Patch9/hg19Patch9.2bit hg19Patch9.clusterGenome = /hive/data/genomes/hg19Patch9/hg19Patch9.2bit hg19Patch9.ooc = /hive/data/genomes/hg19Patch9/hg19Patch9.11.ooc hg19Patch9.lift = /hive/data/genomes/hg19Patch9/jkStuff/hg19Patch9.unBridged.lift hg19Patch9.refseq.mrna.native.pslCDnaFilter = ${finished.refseq.mrna.native.pslCDnaFilter} hg19Patch9.refseq.mrna.xeno.pslCDnaFilter = ${finished.refseq.mrna.xeno.pslCDnaFilter} hg19Patch9.genbank.mrna.native.pslCDnaFilter = ${finished.genbank.mrna.native.pslCDnaFilter} hg19Patch9.genbank.mrna.xeno.pslCDnaFilter = ${finished.genbank.mrna.xeno.pslCDnaFilter} hg19Patch9.genbank.est.native.pslCDnaFilter = ${finished.genbank.est.native.pslCDnaFilter} hg19Patch9.genbank.est.xeno.pslCDnaFilter = ${finished.genbank.est.xeno.pslCDnaFilter} hg19Patch9.genbank.est.xeno.load = no hg19Patch9.genbank.est.xeno.loadDesc = no hg19Patch9.genbank.mrna.xeno.load = no hg19Patch9.genbank.mrna.xeno.loadDesc = no hg19Patch9.refseq.mrna.xeno.load = no hg19Patch9.refseq.mrna.xeno.loadDesc = no hg19Patch9.mgc = yes hg19Patch9.orfeome = yes hg19Patch9.downloadDir = hg19Patch9 hg19Patch9.genbank.mrna.blatTargetDb = yes hg19Patch9.perChromTables = no git commit -m "adding hg19Patch9" etc/genbank.conf git push # update /cluster/data/genbank/: make etc-update ssh hgwdev # genbank procedure only functions on hgwdev screen # use a screen to manage this job cd /cluster/data/genbank time nice -n +19 bin/gbAlignStep -initial hg19Patch9 & # logFile: var/build/logs/2011.07.12-13:04:17.hg19Patch9.initalign.log # real 667m37.192s # load database when finished ssh hgwdev screen # use screen to manage this long running command cd /cluster/data/genbank time nice -n +19 ./bin/gbDbLoadStep -drop -initialLoad hg19Patch9 & # logFile: var/dbload/hgwdev/logs/2011.07.13-09:59:27.dbload.log # real 51m17.530s # the following has not been done, XXX - 2011-07-13 - Hiram # enable daily alignment and update of hgwdev cd ~/kent/src/hg/makeDb/genbank git pull # add hg19Patch9 to: etc/align.dbs etc/hgwdev.dbs git commit -m "Added hg19Patch9 - Human - GRCh37.p9" etc/align.dbs etc/hgwdev.dbs git push make etc-update ############################################################################ # new blat server for the hg19.patch9 sequence (WORKING - 2011-07-13 - Hiram) hgsql -e 'INSERT INTO blatServers (db, host, port, isTrans, canPcr) \ VALUES ("hg19Patch9", "blatx", "17838", "1", "0"); \ INSERT INTO blatServers (db, host, port, isTrans, canPcr) \ VALUES ("hg19Patch9", "blatx", "17839", "0", "1");' \ hgcentraltest ############################################################################ # lastz alignment to hg19 (WORKING - 2012-07-18 - Hiram) mkdir /hive/data/genomes/hg19Patch9/bed/lastzHg19.2012-07-18 cd /hive/data/genomes/hg19Patch9/bed/lastzHg19.2012-07-18 # construct a 2bit file of just the hg19 reference sequences # and all the business to run lastz on each haplotype with its # corresponding target sequence in hg19 rm -fr hg19Bits run.blastz hg19Bits.lift mkdir hg19Bits mkdir -p run.blastz/tParts mkdir -p run.blastz/qParts awk '{print $1}' ../altSequence/altSequence.tab | sort -u | while read H do P=`grep "^${H}" ../altSequence/altSequence.tab | head -1 | awk '{print $4}'` HE=`grep "^${H}" ../altSequence/altSequence.tab | head -1 | awk '{print $3}'` C=`echo ${P} | sed -e "s/:.*//"` CE=`grep "^${C}" /hive/data/genomes/hg19/chrom.sizes | cut -f2 | head -1` SE=`echo ${P} | sed -e "s/.*://"` S=`echo ${SE} | sed -e "s/-.*//" | awk '{printf "%d", $1-1}'` if [ "${S}" -lt 0 ]; then S=0 fi E=`echo ${SE} | sed -e "s/.*-//"` size=`echo $E $S | awk '{printf "%d", $1-$2}'` echo -e "$S\t$C.$S-$E\t$size\t$C\t$CE" echo hg19.2bit:${C}:$S-$E 1>&2 if [ ! -s hg19Bits/$C.$S-$E.fa ]; then echo ">$C.$S-$E" > hg19Bits/$C.$S-$E.fa twoBitToFa /gbdb/hg19/hg19.2bit:${C}:$S-$E stdout \ | grep -v "^>" >> hg19Bits/$C.$S-$E.fa fi echo -e "/hive/data/genomes/hg19Patch9/hg19Patch9.2bit:$H:0-$HE" \ > run.blastz/tParts/$H.lst echo -e "/hive/data/genomes/hg19Patch9/bed/lastzHg19.2012-07-18/hg19Bits.2bit:$C.$S-$E:0-$size" \ > run.blastz/qParts/$H.lst echo -e "/cluster/bin/scripts/blastz-run-ucsc -outFormat psl tParts/$H.lst qParts/$H.lst ../DEF {check out exists ../psl/$H.psl}" \ >> run.blastz/jobList done | sort -u > hg19Bits.lift faToTwoBit hg19Bits/chr*.fa hg19Bits.2bit twoBitInfo hg19Bits.2bit stdout | sort -k2nr > hg19Bits.chrom.sizes cat << '_EOF_' > DEF # human vs human BLASTZ=lastz # maximum M allowed with lastz is only 254 BLASTZ_M=254 # lastz does not like the O= and E= lines in the matrix file BLASTZ_Q=/scratch/data/blastz/human_chimp.v2.q BLASTZ_O=600 BLASTZ_E=150 # other parameters from hg18 vs venter1 lastz on advice from Webb BLASTZ_K=10000 BLASTZ_Y=15000 BLASTZ_T=2 # TARGET: Human Hg19Patch9 SEQ1_DIR=/hive/data/genomes/hg19Patch9/hg19Patch9.2bit SEQ1_LEN=/hive/data/genomes/hg19Patch9/chrom.sizes SEQ1_CHUNK=5000000 SEQ1_LAP=10000 SEQ1_IN_CONTIGS=0 SEQ1_LIMIT=2 # QUERY: Human Hg19 SEQ2_DIR=/scratch/data/hg19/hg19.2bit SEQ2_LEN=/scratch/data/hg19/chrom.sizes SEQ2_CTGDIR=/hive/data/genomes/hg19Patch9/bed/lastzHg19.2012-07-18/hg19Bits.2bit SEQ2_CTGLEN=/hive/data/genomes/hg19Patch9/bed/lastzHg19.2012-07-18/hg19Bits.chrom.sizes SEQ2_LIFT=/hive/data/genomes/hg19Patch9/bed/lastzHg19.2012-07-18/hg19Bits.lift SEQ2_CHUNK=5000000 SEQ2_LAP=0 SEQ2_IN_CONTIGS=0 SEQ2_LIMIT=2 BASE=/hive/data/genomes/hg19Patch9/bed/lastzHg19.2012-07-18 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs ssh swarm cd /hive/data/genomes/hg19Patch9/bed/lastzHg19.2012-07-18/run.blastz mkdir ../psl para create jobList para try ... check ... push para time # Completed: 162 of 162 jobs # CPU time in finished jobs: 429s 7.14m 0.12h 0.00d 0.000 y # IO & Wait Time: 572s 9.54m 0.16h 0.01d 0.000 y # Average job time: 6s 0.10m 0.00h 0.00d # Longest finished job: 153s 2.55m 0.04h 0.00d # Submission to last job: 225s 3.75m 0.06h 0.00d # put together the individual results: cd /hive/data/genomes/hg19Patch9/bed/lastzHg19.2012-07-18 mkdir pslParts cat psl/chr*.psl | gzip -c > pslParts/hg19Patch9.hg19.psl.gz # constructing a chain from those results mkdir -p /hive/data/genomes/hg19Patch9/bed/lastzHg19.2012-07-18/axtChain/run cd /hive/data/genomes/hg19Patch9/bed/lastzHg19.2012-07-18/axtChain/run time zcat ../../pslParts/hg19Patch9.hg19.psl.gz \ | axtChain -psl -verbose=0 -scoreScheme=/scratch/data/blastz/human_chimp.v2.q -minScore=2000 -linearGap=medium stdin \ /hive/data/genomes/hg19Patch9/hg19Patch9.2bit \ /hive/data/genomes/hg19Patch9/bed/lastzHg19.2012-07-18/hg19Bits.2bit \ stdout \ | chainAntiRepeat /hive/data/genomes/hg19Patch9/hg19Patch9.2bit \ /hive/data/genomes/hg19Patch9/bed/lastzHg19.2012-07-18/hg19Bits.2bit \ stdin hg19Patch9.hg19.preLift.chain # real 0m48.446s liftUp -chainQ hg19Patch9.hg19.lifted.chain \ ../../hg19Bits.lift carry hg19Patch9.hg19.preLift.chain # constructing the net files: cd /hive/data/genomes/hg19Patch9/bed/lastzHg19.2012-07-18/axtChain chainMergeSort run/hg19Patch9.hg19.lifted.chain \ | nice gzip -c > hg19Patch9.hg19.all.chain.gz chainSplit chain hg19Patch9.hg19.all.chain.gz # Make nets ("noClass", i.e. without rmsk/class stats which are added later): time chainPreNet hg19Patch9.hg19.all.chain.gz \ /hive/data/genomes/hg19Patch9/chrom.sizes \ /scratch/data/hg19/chrom.sizes stdout \ | chainNet stdin -minSpace=1 /hive/data/genomes/hg19Patch9/chrom.sizes \ /scratch/data/hg19/chrom.sizes stdout /dev/null \ | netSyntenic stdin noClass.net # Make liftOver chains: netChainSubset -verbose=0 noClass.net hg19Patch9.hg19.all.chain.gz stdout \ | chainStitchId stdin stdout | gzip -c > hg19Patch9.hg19.over.chain.gz # Make axtNet for download: one .axt per hg19Patch9 seq. netSplit noClass.net net cd .. mkdir -p axtNet foreach f (axtChain/net/*.net) netToAxt $f axtChain/chain/$f:t:r.chain \ /hive/data/genomes/hg19Patch9/hg19Patch9.2bit \ /scratch/data/hg19/hg19.2bit stdout \ | axtSort stdin stdout \ | gzip -c > axtNet/$f:t:r.hg19Patch9.hg19.net.axt.gz end # Make mafNet for multiz: one .maf per hg19Patch9 seq. mkdir -p mafNet foreach f (axtNet/*.hg19Patch9.hg19.net.axt.gz) axtToMaf -tPrefix=hg19Patch9. -qPrefix=hg19. $f \ /hive/data/genomes/hg19Patch9/chrom.sizes \ /scratch/data/hg19/chrom.sizes \ stdout \ | gzip -c > mafNet/$f:t:r:r:r:r:r.maf.gz end # swap that business to hg19 mkdir /hive/data/genomes/hg19/bed/blastz.hg19Patch9.swap cd /hive/data/genomes/hg19/bed/blastz.hg19Patch9.swap time doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/hg19Patch9/bed/lastzHg19.2012-07-18/DEF \ -swap -noDbNameCheck -stop=load \ -noLoadChainSplit -chainMinScore=2000 \ -chainLinearGap=medium -workhorse=hgwdev \ -smallClusterHub=encodek -bigClusterHub=swarm > swap.load.log 2>&1 # real 2m49.392s cat fb.hg19.chainHg19Patch9Link.txt # 55576147 bases of 2897316137 (1.918%) in intersection # and then fixup the chains to include the haplotypes cd /hive/data/genomes/hg19/bed/blastz.hg19Patch9.swap/axtChain # split up each chain by the hg19Patch9 query sequences mkdir -p queryChains chainSplit -q queryChains hg19.hg19Patch9.all.chain.gz # then run a 'lift over' chain/net on each single one mkdir -p singleLiftOver for F in queryChains/*.chain do C=`basename ${F}` B=`echo ${C} | sed -e "s/.chain//"` chainPreNet -inclHap ${F} /scratch/data/hg19/chrom.sizes \ /hive/data/genomes/hg19Patch9/chrom.sizes stdout \ | chainNet -inclHap stdin -minSpace=1 /scratch/data/hg19/chrom.sizes \ /hive/data/genomes/hg19Patch9/chrom.sizes singleLiftOver/${B}.raw.net \ /dev/null netSyntenic singleLiftOver/${B}.raw.net singleLiftOver/${B}.noClass.net netFilter -chimpSyn singleLiftOver/${B}.noClass.net > singleLiftOver/${B}.chimpSyn.net netChainSubset -verbose=0 singleLiftOver/${B}.noClass.net \ ${F} stdout \ | chainStitchId stdin stdout > singleLiftOver/${C} echo "${F} -> singleLiftOver/${C}" done # put the chains together into one file chainMergeSort singleLiftOver/chr*.chain | gzip -c \ > hg19.hg19Patch9.single.over.chain.gz # construct psl files from those chains chainToPsl hg19.hg19Patch9.single.over.chain.gz \ /hive/data/genomes/hg19/chrom.sizes \ /hive/data/genomes/hg19Patch9/chrom.sizes \ /hive/data/genomes/hg19/hg19.2bit \ /hive/data/genomes/hg19Patch9/hg19Patch9.2bit \ hg19.hg19Patch9.over.psl # chainToPsl appears to have a problem, note errors from pslCheck: pslCheck -db=hg19 hg19.hg19Patch9.over.psl # checked: 884 failed: 45 errors: 45 pslRecalcMatch hg19.hg19Patch9.over.psl \ /hive/data/genomes/hg19/hg19.2bit \ /hive/data/genomes/hg19Patch9/hg19Patch9.2bit \ fixup.hg19.hg19Patch9.over.psl pslCheck -db=hg19 fixup.hg19.hg19Patch9.over.psl checked: 884 failed: 0 errors: 0 # load this PSL track hgLoadPsl hg19 -table=altSeqLiftOverPslP9 fixup.hg19.hg19Patch9.over.psl # to replace this table in the current track: hgLoadPsl hg19 -table=altSeqLiftOverPsl fixup.hg19.hg19Patch9.over.psl ############################################################################ # Add this sequence to hg19 (DONE - 2012-07-18 - Hiram) mkdir /hive/data/genomes/hg19Patch9/bed/altSequence/seqExt cd /hive/data/genomes/hg19Patch9/bed/altSequence/seqExt twoBitToFa ../../../hg19Patch9.2bit hg19Patch9.fa mkdir -p /gbdb/hg19/hg19Patch9 hg19Patch9 faSplit byname hg19Patch9.fa ./hg19Patch9/ ln -s `pwd`/hg19Patch9/*.fa /gbdb/hg19/hg19Patch9 hgLoadSeq -drop -seqTbl=seqHg19Patch9 -extFileTbl=extHg19Patch9 hg19 \ /gbdb/hg19/hg19Patch9/*.fa ############################################################################