# for emacs: -*- mode: sh; -*- # This file describes how we made the browser database on # the Patch 2 release for the NCBI build 37 (February 2009 freeze) aka: # GRCh37.p2 - Genome Reference Consortium Human Reference 37 ############################################################################ # gather sequence and AGP definitions (DONE - 2010-08-26 - Hiram) mkdir -p /hive/data/genomes/hg19Patch2/sequence cd /hive/data/genomes/hg19Patch2/sequence # a round about way here since patch2 sequence was already assembled. # there are perl and shell scripts in # ../../hg19/bed/additionalSequence/patch2 # which created the fasta file with UCSC names # see also in hg19.txt: # Updating to patch2 sequence (DONE - 2010-08-18 - Hiram) ln -s ../../hg19/bed/additionalSequence/patch2/patch2.ucsc.fa . ln -s ../../hg19/17_ctg5_hap1/chr17_ctg5_hap1.fa . ln -s ../../hg19/4_ctg9_hap1/chr4_ctg9_hap1.fa . # the fasta files in hg19 have incorrect headers: for C in 6_apd_hap1 6_cox_hap2 6_dbb_hap3 6_mann_hap4 6_mcf_hap5 \ 6_qbl_hap6 6_ssto_hap7 do rm -f chr${C}.fa echo ">chr${C}" > chr${C}.fa grep -v "^>" ../../hg19/${C}/*.fa >> chr${C}.fa done cat << '_EOF_' > patch2Agp.pl #!/usr/bin/env perl use strict; use warnings; sub usage() { printf STDERR "usage: ./patch2Agp.pl ../../hg19/bed/additionalSequence/patch2/patches.chrom.sizes \\\n"; printf STDERR " ../../hg19/bed/additionalSequence/patches/ucscNames.patch2.txt \\\n"; printf STDERR " ../../hg19/bed/additionalSequence/patches/patch_release_2/PATCHES/alt_scaffolds/AGP/alt.scaf.agp.gz\n"; } my $argc = scalar(@ARGV); if ($argc < 3) { usage; exit 255; } my %skipSequence; $skipSequence{"GL339449.1"} = 1; $skipSequence{"GL339450.1"} = 1; my $sizes = shift; # patches.chrom.sizes my $names = shift; # patches/ucscNames.txt my $agpFile = shift; # alt.scaf.agp.gz my %glToChr; my %chrToCtg; my %fastaToChr; my %chrToSize; open(FH, "<$sizes") or die "can not read $sizes"; while (my $line = ) { chomp $line; my ($chr, $size) = split('\s+', $line); $chrToSize{$chr} = $size; } close (FH); open(FH, "<$names"); while (my $line = ) { chomp $line; my ($faName, $ctg, $cmName, $chr) = split('\s+', $line); $faName =~ s/.*gb.GL/GL/; next if (exists($skipSequence{$faName})); my $size = $chrToSize{$chr}; if (exists($glToChr{$faName})) { if ($glToChr{$faName} ne $chr) { printf STDERR "ERROR: contig name: $faName was chr name: $glToChr{$faName}\n"; printf STDERR " now claiming to be chr name: $chr\n"; exit 255; } } else { $glToChr{$faName} = $chr; } die "not defined faName" if (!defined($faName)); die "not defined $faName $chr size" if (!defined($size)); } close (FH); my $prevObj = ""; my $newIx = 1; open (FH,"zcat $agpFile|") or die "can not read $agpFile"; while (my $line = ) { next if ($line =~ m/^\s*#/); chomp $line; my ($object, $objStart, $objEnd, $ix, $type, $frag, $fragStart, $fragEnd, $strand) = split('\s+', $line); next if (exists($skipSequence{$object})); die "ERROR: can not find contig $object to chr name" if (!exists($glToChr{$object})); $newIx = 1 if ($prevObj ne $object); my $chr = $glToChr{$object}; if ($type eq "N") { # frag is size, fragStart is type of gap, and fragEnd is bridged y/n printf "%s\t%d\t%d\t%d\t%s\t%d\t%s\t%s\n", $chr, $objStart, $objEnd, $newIx, $type, $frag, $fragStart, $fragEnd; } else { printf "%s\t%d\t%d\t%d\t%s\t%s\t%d\t%d\t%s\n", $chr, $objStart, $objEnd, $newIx, $type, $frag, $fragStart, $fragEnd, $strand; } ++$newIx; $prevObj = $object; # printf "%s\n", $line; } close (FH); '_EOF_' # << happy emacs chmod +x patch2Agp.pl zcat \ ../../hg19/bed/additionalSequence/patches/patch_release_1/AGP/alt.scaf.agp.gz \ | grep "^GL" | sed -e "s/GL339449.1/chr5_ctg1_gl339449/; s/GL339450.1/chr9_gl339450/" > hg19Patch2.agp ./patch2Agp.pl \ ../../hg19/bed/additionalSequence/patch2/patches.chrom.sizes \ ../../hg19/bed/additionalSequence/patches/ucscNames.patch2.txt \ ../../hg19/bed/additionalSequence/patches/patch_release_2/PATCHES/alt_scaffolds/AGP/alt.scaf.agp.gz \ >> hg19Patch2.agp for H in chr17_ctg5_hap1 chr4_ctg9_hap1 chr6_apd_hap1 chr6_cox_hap2 \ chr6_dbb_hap3 chr6_mann_hap4 chr6_mcf_hap5 chr6_qbl_hap6 \ chr6_ssto_hap7 chrM_rCRS do grep "^${H}" /hive/data/genomes/hg19/hg19.agp done >> hg19Patch2.agp echo -e "chrM_rCRS\t1\t16569\t1\tF\tNC_012920\t1\t16569\t+" >> hg19Patch2.agp # verify we have correct sequence and AGP file: faToTwoBit *.fa patch2.2bit checkAgpAndFa hg19Patch2.agp patch2.2bit # All AGP and FASTA entries agree - both files are valid ########################################################################### # Build the browser (DONE - 2010-08-26 - Hiram) cd /hive/data/genomes/hg19Patch2 cat << '_EOF_' > hg19Patch2.config.ra # Config parameters for makeGenomeDb.pl: db hg19Patch2 scientificName Homo sapiens commonName GRCh37.p2 assemblyDate Aug. 2009 assemblyLabel GRCh37 Patch 2 Genome Reference Consortium Human Reference 37 (GCA_000001405.1) orderKey 14 mitoAcc none fastaFiles /hive/data/genomes/hg19Patch2/sequence/*.fa agpFiles /hive/data/genomes/hg19Patch2/sequence/hg19Patch2.agp # qualFiles /dev/null dbDbSpeciesDir human taxId 9606 clade haplotypes genomeCladePriority 139 assemblyShortLabel GRCh37.p2 '_EOF_' # << happy emacs makeGenomeDb.pl -dbHost=hgwdev -fileServer=hgwdev -workhorse=hgwdev \ -noGoldGapSplit hg19Patch2.config.ra > makeGenomeDb.log 2>&1 featureBits -countGaps hg19Patch2 gap # 5650321 bases of 49709657 (11.367%) in intersection ########################################################################### # RepeatMasker (DONE - 2010-08-26 - Hiram) mkdir /hive/data/genomes/hg19Patch2/bed/repeatMasker cd /hive/data/genomes/hg19Patch2/bed/repeatMasker doRepeatMasker.pl hg19Patch2 -buildDir=`pwd` -noSplit -bigClusterHub=pk \ -dbHost=hgwdev -workhorse=hgwdev > do.log 2>&1 cat faSize.rmsk.txt # 49709657 bases (5650322 N's 44059335 real 21233685 upper 22825650 lower) # in 80 sequences in 1 files # %45.92 masked total, %51.81 masked real ########################################################################### # TRF simple repeats (DONE - 2010-08-26 - Hiram) mkdir /hive/data/genomes/hg19Patch2/bed/simpleRepeat cd /hive/data/genomes/hg19Patch2/bed/simpleRepeat doSimpleRepeat.pl hg19Patch2 -buildDir=`pwd` -dbHost=hgwdev \ -smallClusterHub=pk -workhorse=hgwdev > do.log 2>&1 cat fb.simpleRepeat # 1328856 bases of 44059336 (3.016%) in intersection twoBitMask hg19Patch2.rmsk.2bit \ -add bed/simpleRepeat/trfMask.bed hg19Patch2.2bit twoBitToFa hg19Patch2.2bit stdout | faSize stdin \ > faSize.hg19Patch2.2bit.txt # 49709657 bases (5650322 N's 44059335 real 21214276 upper 22845059 lower) # in 80 sequences in 1 files # %45.96 masked total, %51.85 masked real time blat hg19Patch2.2bit \ /dev/null /dev/null -tileSize=11 -makeOoc=hg19Patch2.11.ooc \ -repMatch=1024 # Wrote 41 overused 11-mers to hg19Patch2.11.ooc mkdir /hive/data/staging/data/hg19Patch2 cp -p hg19Patch2.2bit /hive/data/staging/data/hg19Patch2 cp -p hg19Patch2.11.ooc /hive/data/staging/data/hg19Patch2 cp -p chrom.sizes /hive/data/staging/data/hg19Patch2 ########################################################################### # ctgPos track (DONE - 2010-08-26 - Hiram) mkdir /hive/data/genomes/hg19Patch2/bed/ctgPos cd /hive/data/genomes/hg19Patch2/bed/ctgPos for C in `cut -f1 ../../chrom.sizes` do ctgPos=`hgsql -N -e 'select * from ctgPos where chrom="'$C'";' hg19` if [ "x${ctgPos}y" = "xy" ]; then GL=`echo $C | sed -e "s/.*_gl//"` glAcc=`grep ${GL} /hive/data/genomes/hg19/bed/additionalSequence/patches/patch_release_2/PATCHES/localID2acc | cut -f2` glSize=`grep ${GL} /hive/data/genomes/hg19Patch2/chrom.sizes | cut -f2` echo -e "$glAcc\t$glSize\t${C}\t0\t$glSize" else echo "$ctgPos" fi done > ctgPos.txt hgLoadSqlTab hg19Patch2 ctgPos ctgPos.sql ctgPos.txt featureBits -countGaps hg19Patch2 ctgPos # 49709657 bases of 49709657 (100.000%) in intersection ########################################################################### # ctgPos2 track (DONE - 2010-08-26 - Hiram) mkdir /hive/data/genomes/hg19Patch2/bed/ctgPos2 cd /hive/data/genomes/hg19Patch2/bed/ctgPos2 for C in `cut -f1 ../../chrom.sizes` do ctgPos2=`hgsql -N -e 'select * from ctgPos2 where chrom="'$C'";' hg19` if [ "x${ctgPos}y" = "xy" ]; then GL=`echo $C | sed -e "s/.*_gl//"` glSize=`grep ${GL} /hive/data/genomes/hg19Patch2/chrom.sizes | cut -f2` glAcc=`grep ${GL} /hive/data/genomes/hg19/bed/additionalSequence/patches/patch_release_2/PATCHES/localID2acc | cut -f1` if [ "x${glAcc}y" = "xy" ]; then GL=`echo $C | sed -e "s/_hap.*//" | sed -e "s/chr.*_/_/" | tr '[a-z]' '[A-Z]'` glAcc=`grep -h ${GL} /hive/data/genomes/hg19/download/alternate_loci/ALT_REF_LOCI_?/localID2acc | cut -f1` fi echo -e "$glAcc\t$glSize\t${C}\t0\t$glSize\tF" else echo -e "$ctgPos2\tF" fi done | grep -v chrM > ctgPos2.tab echo -e "NC_012920.1\t16569\tchrM_rCRS\t0\t16569\tF" >> ctgPos2.tab sed -e "s/chrom(16)/chrom(19)/" $HOME/kent/src/hg/lib/ctgPos2.sql \ > ctgPos2.sql hgLoadSqlTab hg19Patch2 ctgPos2 ctgPos2.sql ctgPos2.tab featureBits -countGaps hg19Patch2 ctgPos2 # 49709657 bases of 49709657 (100.000%) in intersection ########################################################################### # altSequence track (DONE - 2010-08-26 - Hiram) # provide links to locations on reference genome where these patches and # haplotypes belong mkdir /hive/data/genomes/hg19Patch2/bed/altSequence cd /hive/data/genomes/hg19Patch2/bed/altSequence ln -s ../../../hg19/bed/additionalSequence/patch2/altSequence.bed \ altSeq.bed.0 cat altSeq.bed.0 | while read L do C=`echo "${L}" | awk '{print $4}'` hg19C=`echo "${L}" | awk '{print $1}'` hg19S=`echo "${L}" | awk '{print $2}'` hg19E=`echo "${L}" | awk '{print $3}'` S=`grep "^${C}" ../../chrom.sizes | cut -f2` echo $C $S $hg19C $hg19S $hg19E | awk '{printf "%s\t0\t%d\t%s:%d-%d\t", $1, $2, $3, $4, $5}' echo "${L}" | awk '{printf "%d\t%s\t%d\t%d\t%s\n", $5,$6,$7,$8,$9}' done | grep -v "chrM_rCRS:" > altSequence.tab hgLoadBed hg19Patch2 altSequence altSequence.tab featureBits -countGaps hg19Patch2 altSequence # 49709657 bases of 49709657 (100.000%) in intersection ############################################################################ # create lift file on unBridged gaps for genbank splits (2009-03-09 - Hiram) mkdir /hive/data/genomes/hg19Patch2/bed/gap cd /hive/data/genomes/hg19Patch2/bed/gap gapToLift hg19Patch2 hg19Patch2.unBridged.lift -bedFile=unBridged.lift.bed cp -p hg19Patch2.unBridged.lift ../../jkStuff cp -p hg19Patch2.unBridged.lift /hive/data/staging/data/hg19Patch2 ########################################################################### # AUTO UPDATE GENBANK RUN (DONE - 2010-08-27 - Hiram) # align with latest genbank process. cd ~/kent/src/hg/makeDb/genbank cvsup # edit etc/genbank.conf to add hg19Patch2 just after hg19 # hg19Patch2 - GRCh37.p2 - Genome Reference Consortium Human Reference 37 hg19Patch2.serverGenome = /hive/data/genomes/hg19Patch2/hg19Patch2.2bit hg19Patch2.clusterGenome = /scratch/data/hg19Patch2/hg19Patch2.2bit hg19Patch2.ooc = /scratch/data/hg19Patch2/hg19Patch2.11.ooc hg19Patch2.lift = /hive/data/genomes/hg19/jkStuff/hg19Patch2.unBridged.lift hg19Patch2.refseq.mrna.native.pslCDnaFilter = ${finished.refseq.mrna.native.pslCDnaFilter} hg19Patch2.refseq.mrna.xeno.pslCDnaFilter = ${finished.refseq.mrna.xeno.pslCDnaFilter} hg19Patch2.genbank.mrna.native.pslCDnaFilter = ${finished.genbank.mrna.native.pslCDnaFilter} hg19Patch2.genbank.mrna.xeno.pslCDnaFilter = ${finished.genbank.mrna.xeno.pslCDnaFilter} hg19Patch2.genbank.est.native.pslCDnaFilter = ${finished.genbank.est.native.pslCDnaFilter} hg19Patch2.genbank.est.xeno.pslCDnaFilter = ${finished.genbank.est.xeno.pslCDnaFilter} hg19Patch2.genbank.est.xeno.load = no hg19Patch2.genbank.est.xeno.loadDesc = no hg19Patch2.genbank.mrna.xeno.load = no hg19Patch2.genbank.mrna.xeno.loadDesc = no hg19Patch2.refseq.mrna.xeno.load = no hg19Patch2.refseq.mrna.xeno.loadDesc = no hg19Patch2.mgc = yes hg19Patch2.orfeome = yes hg19Patch2.downloadDir = hg19Patch2 hg19Patch2.ccds.ncbiBuild = 37.1 hg19Patch2.genbank.mrna.blatTargetDb = yes hg19Patch2.perChromTables = no git commit -m "adding hg19Patch2" genbank.conf git push # update /cluster/data/genbank/: make etc-update ssh genbank screen # use a screen to manage this job cd /cluster/data/genbank time nice -n +19 bin/gbAlignStep -initial hg19Patch2 & # logFile: var/build/logs/2010.08.27-12:00:05.hg19Patch2.initalign.log # real 663m19.732s # load database when finished ssh hgwdev screen # use screen to manage this long running command cd /cluster/data/genbank time nice -n +19 ./bin/gbDbLoadStep -drop -initialLoad hg19Patch2 & # logFile: var/dbload/hgwdev/logs/2010.08.28-08:46:41.dbload.log # real 60m50.989s # enable daily alignment and update of hgwdev (DONE - 2009-02-24 - Hiram) cd ~/kent/src/hg/makeDb/genbank git pull # add hg19Patch2 to: etc/align.dbs etc/hgwdev.dbs git commit -m "Added hg19Patch2 - Human - GRCh37.p2" etc/align.dbs etc/hgwdev.dbs git push make etc-update ############################################################################ # new blat server for the hg19.patch2 sequence (DONE - 2010-08-27 - Hiram) hgsql -e 'INSERT INTO blatServers (db, host, port, isTrans, canPcr) \ VALUES ("hg19Patch2", "blat4", "17792", "1", "0"); \ INSERT INTO blatServers (db, host, port, isTrans, canPcr) \ VALUES ("hg19Patch2", "blat4", "17793", "0", "1");' \ hgcentraltest ############################################################################ # lastz alignment to hg19 (DONE - 2010-08-30 - Hiram) mkdir /hive/data/genomes/hg19Patch2/bed/lastzHg19.2010-08-30 cd /hive/data/genomes/hg19Patch2/bed/lastzHg19.2010-08-30 # construct a 2bit file of just the hg19 reference sequences: mkdir -p hg19Bits export A=-1 awk '{print $4}' ../altSequence/altSequence.tab | while read P do H=`grep "${P}" ../altSequence/altSequence.tab | awk '{print $1}'` HE=`grep "${P}" ../altSequence/altSequence.tab | awk '{print $3}'` C=`echo ${P} | sed -e "s/:.*//"` CE=`grep "^${C}" /hive/data/genomes/hg19/chrom.sizes | cut -f2 | head -1` SE=`echo ${P} | sed -e "s/.*://"` S=`echo ${SE} | sed -e "s/-.*//" | awk '{printf "%d", $1-1}'` E=`echo ${SE} | sed -e "s/.*-//"` size=`echo $E $S | awk '{printf "%d", $1-$2}'` A=`echo ${A} | awk '{printf "%02d", $1+1}'` echo -e "$S\t$C.$A\t$size\t$C\t$CE" echo ">$C.$A" > hg19Bits/$C.$A.fa twoBitToFa /gbdb/hg19/hg19.2bit:${C}:$S-$E stdout \ | grep -v "^>" >> hg19Bits/$C.$A.fa done > hg19Bits.lift faToTwoBit hg19Bits/chr*.fa hg19Bits.2bit twoBitInfo hg19Bits.2bit stdout | sort -k2nr > hg19Bits.chrom.sizes cat << '_EOF_' > DEF # human vs human BLASTZ=lastz # maximum M allowed with lastz is only 254 BLASTZ_M=254 # lastz does not like the O= and E= lines in the matrix file # this copy has that removed from /scratch/data/scratch/human_chimp.v2.q BLASTZ_Q=/hive/data/genomes/hg19/bed/lastzHg19Haps.2009-03-09/human_chimp.v2.q # and place those items here BLASTZ_O=600 BLASTZ_E=150 # other parameters from hg18 vs venter1 lastz on advice from Webb BLASTZ_K=10000 BLASTZ_Y=15000 BLASTZ_T=2 # TARGET: Human Hg19Patch2 SEQ1_DIR=/scratch/data/hg19Patch2/hg19Patch2.2bit SEQ1_LEN=/scratch/data/hg19Patch2/chrom.sizes SEQ1_CHUNK=5000000 SEQ1_LAP=10000 SEQ1_IN_CONTIGS=0 SEQ1_LIMIT=2 # QUERY: Human Hg19 SEQ2_DIR=/scratch/data/hg19/hg19.2bit SEQ2_LEN=/scratch/data/hg19/chrom.sizes SEQ2_CTGDIR=/hive/data/genomes/hg19Patch2/bed/lastzHg19.2010-08-30/hg19Bits.2bit SEQ2_CTGLEN=/hive/data/genomes/hg19Patch2/bed/lastzHg19.2010-08-30/hg19Bits.chrom.sizes SEQ2_CHUNK=5000000 SEQ2_LAP=0 SEQ2_IN_CONTIGS=0 SEQ2_LIMIT=2 BASE=/hive/data/genomes/hg19Patch2/bed/lastzHg19.2010-08-30 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs time doBlastzChainNet.pl `pwd`/DEF -verbose=2 \ -noDbNameCheck -stop=net \ -noLoadChainSplit -chainMinScore=2000 \ -chainLinearGap=medium -workhorse=hgwdev \ -smallClusterHub=encodek -bigClusterHub=swarm > net.log 2>&1 # Elapsed time: 3m38s time doBlastzChainNet.pl `pwd`/DEF -verbose=2 \ -noDbNameCheck -continue=load -stop=load \ -noLoadChainSplit -chainMinScore=2000 \ -chainLinearGap=medium -workhorse=hgwdev \ -smallClusterHub=encodek -bigClusterHub=swarm > load.log 2>&1 # real 0m54.258s cat fb.hg19Patch2.chainHg19Link.txt # 42748025 bases of 44059336 (97.024%) in intersection # rework that job list to run these guys only against their bit of # the reference sequence. cd /hive/data/genomes/hg19Patch2/bed/lastzHg19.2010-08-30 # move all of these out of the way until the next one is done mv psl psl.0 mv run.blastz run.blastz.0 mv pslParts pslParts.0 mv axtNet axtNet.0 mv mafNet mafNet.0 # start a new lastz run with a specific jobList mkdir -p run.blastz/tParts run.blastz/qParts export A=-1 awk '{print $4}' ../altSequence/altSequence.tab | while read P do H=`grep "${P}" ../altSequence/altSequence.tab | awk '{print $1}'` HE=`grep "${P}" ../altSequence/altSequence.tab | awk '{print $3}'` C=`echo ${P} | sed -e "s/:.*//"` CE=`grep "^${C}" /hive/data/genomes/hg19/chrom.sizes | cut -f2 | head -1` SE=`echo ${P} | sed -e "s/.*://"` S=`echo ${SE} | sed -e "s/-.*//" | awk '{printf "%d", $1-1}'` E=`echo ${SE} | sed -e "s/.*-//"` size=`echo $E $S | awk '{printf "%d", $1-$2}'` A=`echo ${A} | awk '{printf "%02d", $1+1}'` echo -e "/hive/data/staging/data/hg19Patch2/hg19Patch2.2bit:$H:0-$HE" \ > run.blastz/tParts/$A.lst echo -e "/hive/data/genomes/hg19Patch2/bed/lastzHg19.2010-08-30/hg19Bits.2bit:$C.$A:0-$size" \ > run.blastz/qParts/$A.lst echo -e "/cluster/bin/scripts/blastz-run-ucsc -outFormat psl tParts/$A.lst qParts/$A.lst ../DEF {check out exists ../psl/$H.$C.$A.psl }" done > run.blastz/jobList ssh swarm cd /hive/data/genomes/hg19Patch2/bed/lastzHg19.2010-08-30/run.blastz para create jobList para try para check ... push ... check para time # Completed: 80 of 80 jobs # CPU time in finished jobs: 161s 2.69m 0.04h 0.00d 0.000 y # IO & Wait Time: 238s 3.96m 0.07h 0.00d 0.000 y # Average job time: 5s 0.08m 0.00h 0.00d # Longest finished job: 20s 0.33m 0.01h 0.00d # Submission to last job: 37s 0.62m 0.01h 0.00d # put together the individual results: cd /hive/data/genomes/hg19Patch2/bed/lastzHg19.2010-08-30 cat psl/chr*.psl | gzip -c > pslParts/hg19Patch2.hg19.psl.gz # constructing a chain from those results cd /hive/data/genomes/hg19Patch2/bed/lastzHg19.2010-08-30/axtChain/run zcat ../../pslParts/hg19Patch2.hg19.psl.gz \ | axtChain -psl -verbose=0 -scoreScheme=/hive/data/genomes/hg19/bed/lastzHg19Haps.2009-03-09/human_chimp.v2.q -minScore=2000 -linearGap=medium stdin \ /scratch/data/hg19Patch2/hg19Patch2.2bit \ /hive/data/genomes/hg19Patch2/bed/lastzHg19.2010-08-30/hg19Bits.2bit \ stdout \ | chainAntiRepeat /scratch/data/hg19Patch2/hg19Patch2.2bit \ /hive/data/genomes/hg19Patch2/bed/lastzHg19.2010-08-30/hg19Bits.2bit \ stdin hg19Patch2.hg19.preLift.chain liftUp -chainQ hg19Patch2.hg19.lifted.chain \ ../../hg19Bits.lift carry hg19Patch2.hg19.preLift.chain # constructing the net files: cd /hive/data/genomes/hg19Patch2/bed/lastzHg19.2010-08-30/axtChain chainMergeSort run/hg19Patch2.hg19.lifted.chain | nice gzip -c > hg19Patch2.hg19.all.chain.gz chainSplit chain hg19Patch2.hg19.all.chain.gz # Make nets ("noClass", i.e. without rmsk/class stats which are added later): chainPreNet hg19Patch2.hg19.all.chain.gz /scratch/data/hg19Patch2/chrom.sizes /scratch/data/hg19/chrom.sizes stdout \ | chainNet stdin -minSpace=1 /scratch/data/hg19Patch2/chrom.sizes /scratch/data/hg19/chrom.sizes stdout /dev/null \ | netSyntenic stdin noClass.net # Make liftOver chains: netChainSubset -verbose=0 noClass.net hg19Patch2.hg19.all.chain.gz stdout \ | chainStitchId stdin stdout | gzip -c > hg19Patch2.hg19.over.chain.gz # Make axtNet for download: one .axt per hg19Patch2 seq. netSplit noClass.net net cd .. mkdir -p axtNet foreach f (axtChain/net/*.net) netToAxt $f axtChain/chain/$f:t:r.chain \ /scratch/data/hg19Patch2/hg19Patch2.2bit /scratch/data/hg19/hg19.2bit stdout \ | axtSort stdin stdout \ | gzip -c > axtNet/$f:t:r.hg19Patch2.hg19.net.axt.gz end # Make mafNet for multiz: one .maf per hg19Patch2 seq. mkdir -p mafNet foreach f (axtNet/*.hg19Patch2.hg19.net.axt.gz) axtToMaf -tPrefix=hg19Patch2. -qPrefix=hg19. $f \ /scratch/data/hg19Patch2/chrom.sizes /scratch/data/hg19/chrom.sizes \ stdout \ | gzip -c > mafNet/$f:t:r:r:r:r:r.maf.gz end # no need for these as reference rm -fr psl.0 run.blastz.0 pslParts.0 axtNet.0 mafNet.0 # swap that business to hg19 mkdir /hive/data/genomes/hg19/bed/blastz.hg19Patch2.swap cd /hive/data/genomes/hg19/bed/blastz.hg19Patch2.swap time doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/hg19Patch2/bed/lastzHg19.2010-08-30/DEF \ -swap -noDbNameCheck -stop=load \ -noLoadChainSplit -chainMinScore=2000 \ -chainLinearGap=medium -workhorse=hgwdev \ -smallClusterHub=encodek -bigClusterHub=swarm > swap.load.log 2>&1 # and then fixup the chains to include the haplotypes cd /hive/data/genomes/hg19/bed/blastz.hg19Patch2.swap/axtChain # split up each chain by the hg19Patch2 query sequences mkdir -p queryChains chainSplit -q queryChains hg19.hg19Patch2.all.chain.gz # then run a 'lift over' chain/net on each single one mkdir -p singleLiftOver for F in queryChains/*.chain do C=`basename ${F}` B=`echo ${C} | sed -e "s/.chain//"` chainPreNet -inclHap ${F} /scratch/data/hg19/chrom.sizes \ /scratch/data/hg19Patch2/chrom.sizes stdout \ | chainNet -inclHap stdin -minSpace=1 /scratch/data/hg19/chrom.sizes \ /scratch/data/hg19Patch2/chrom.sizes singleLiftOver/${B}.raw.net \ /dev/null netSyntenic singleLiftOver/${B}.raw.net singleLiftOver/${B}.noClass.net netFilter -chimpSyn singleLiftOver/${B}.noClass.net > singleLiftOver/${B}.chimpSyn.net netChainSubset -verbose=0 singleLiftOver/${B}.noClass.net \ ${F} stdout \ | chainStitchId stdin stdout > singleLiftOver/${C} echo "${F} -> singleLiftOver/${C}" done # put the chains together into one file chainMergeSort singleLiftOver/chr*.chain | gzip -c \ > hg19.hg19Patch2.single.over.chain.gz # construct psl files from those chains chainToPsl hg19.hg19Patch2.single.over.chain.gz \ /hive/data/genomes/hg19/chrom.sizes \ /hive/data/genomes/hg19Patch2/chrom.sizes \ /hive/data/genomes/hg19/hg19.2bit \ /hive/data/genomes/hg19Patch2/hg19Patch2.2bit \ hg19.hg19Patch2.over.psl # chainToPsl appears to have a problem, note errors from pslCheck: pslCheck -db=hg19 hg19.hg19Patch2.over.psl # Error: invalid PSL: chr6_ssto_hap7:3797750-3798078 chr6:32538701-32539032 + hg19.hg19Patch2.over.psl:362 # alignment size (328) doesn't match counts (0) pslRecalcMatch hg19.hg19Patch2.over.psl \ /hive/data/genomes/hg19/hg19.2bit \ /hive/data/genomes/hg19Patch2/hg19Patch2.2bit \ fixup.hg19.hg19Patch2.over.psl pslCheck -db=hg19 fixup.hg19.hg19Patch2.over.psl checked: 391 failed: 0 errors: 0 # load this PSL track hgLoadPsl hg19 -table=altSeqLiftOverPsl fixup.hg19.hg19Patch2.over.psl ############################################################################