# for emacs: -*- mode: sh; -*- # This file describes the construction of the browser database on # the mouse strains from NCBI build 38 (Dec 2011 freeze) aka: # GRCm38 - Genome Reference Consortium Mouse Reference 38 ############################################################################ # gather sequence and AGP definitions (DONE - 2012-09-28 - Hiram) mkdir -p /hive/data/genomes/mm10Strains1/genbank cd /hive/data/genomes/mm10Strains1/genbank # modified script from hg19Patch10 build, scans the FASTA and AGP # files from mm10/genbank/ for all the alternate sequence # to generate a fasta file and AGP file here with chr prefixes # for chrom names as is typical for UCSC browsers ./getSequence.pl ../../mm10/genbank -rw-rw-r-- 1 18386569 Sep 28 11:01 ucsc.fa.gz -rw-rw-r-- 1 27454 Sep 28 11:01 ucsc.agp # the script is: cat << '_EOF_' > getSequence.pl #!/usr/bin/env perl use strict; use warnings; my $debug = 1; my $patchName = "patch1"; sub usage() { print STDERR "usage: ./gatherNames.pl ../../mm10/genbank > ucscNames.${patchName}.txt\n"; print STDERR "expecting the Mus_musculus/GRCm38.p1/ hierarchy in ./genbank from NCBI\n"; exit 255; } my $argc = scalar(@ARGV); if ($argc != 1) { usage; } my $patchDir = shift; if ( ! -d $patchDir ) { print STDERR "ERROR: given directory $patchDir is not a directory or does not exist"; usage; } open (FA, "|gzip -c > ucsc.fa.gz") or die "can not write to ucsc.fa"; my %glSize; my %ctgToChr; my %ctgToFastaName; # my $fasta = "$patchDir/PATCHES/alt_scaffolds/FASTA/alt.scaf.fa.gz"; my @fastaList = split('\n',`find $patchDir -type f | grep FASTA | grep alt_scaffolds | grep -v UNKNOWN | grep alt.scaf.fa`); my @agpList = split('\n',`find $patchDir -type f | grep AGP | grep alt_scaffolds | grep -v UNKNOWN | grep alt.scaf.agp`); for (my $i = 0; $i < scalar(@fastaList); ++$i) { my $fasta = $fastaList[$i]; # get lengths of alternate scaffolds printf STDERR "# reading $fasta\n" if ($debug ); open (FH, "faCount $fasta|") or die "can not faCount $fasta"; while (my $line = ) { next if ($line =~ m/^#/); next if ($line =~ m/^total/); my ($gi, $len, $rest) = split('\s+', $line, 3); my ($x, $acc, $y, $gl, $z) = split('\|', $gi); die "ERROR: already exists glSize for {$gl}" if (exists($glSize{$gl})); $glSize{$gl} = $len; printf STDERR "# glSize{%s} <- %d\n", $gl, $len if ($debug); } close (FH); open (FH, "zcat $fasta|") or die "ERROR: can not read $fasta"; while ( my $line = ) { if ($line !~ m/^>/) { printf FA "%s", $line; } else { chomp $line; printf STDERR "# '%s'\n", $line; my ($gi, $rest) = split('\s+',$line,2); my ($x, $acc, $y, $gl, $z) = split('\|', $gi); my $chr = $rest; if ($chr =~ m/ unplaced /) { printf STDERR "# WARN: unplaced: %s\n", $line; $chr = "Un"; } $chr =~ s/Mus musculus strain .* chromosome //; $chr =~ s/Mus musculus chromosome //; $chr =~ s/ .*//; die "ERROR: already exists ctgToChr for {$gl} <- $chr" if (exists($ctgToChr{$gl})); $ctgToChr{$gl} = $chr; printf STDERR "# ctgToChr{%s} <- %s\n", $gl, $chr; die "ERROR: already exists ctgToFastaName for {$gl} <- $chr" if (exists($ctgToFastaName{$gl})); $ctgToFastaName{$gl} = $gi; $ctgToFastaName{$gl} =~ s/\|$//; $ctgToFastaName{$gl} =~ s/^>//; printf STDERR "# ctgToFastaName{%s} <- %s\n", $gl, $ctgToFastaName{$gl} if ($debug); # fix the suffix .N to be _N $gl =~ s/\./_/; printf FA ">chr%s_%s\n", $chr, $gl; } } close (FH); } # for (my $i = 0; $i < scalar(@fastaList); ++$i) close (FA); my $agpSequenceCount = 0; open (AG, ">ucsc.agp") or die "can not write to ucsc.agp"; for (my $i = 0; $i < scalar(@agpList); ++$i) { my $agp = $agpList[$i]; printf STDERR "# reading $agp\n" if ($debug ); open (FH, "zcat $agp|") or die "can not read $agp"; while (my $line = ) { next if ($line =~ m/^#/); chomp $line; my @a = split('\s+', $line); my $gl = $a[0]; $a[3] = ++$agpSequenceCount; if (exists($ctgToChr{$gl})) { my $chr = $ctgToChr{$gl}; $gl =~ s/\./_/; printf AG "chr%s_%s", $chr, $gl; } else { printf STDERR "WARN: can not find gl: $gl in ctgToChr\n"; $gl =~ s/\./_/; printf AG "chrUn_%s", $gl; } for (my $i = 1; $i < scalar(@a); ++$i) { printf AG "\t%s", $a[$i]; } printf AG "\n"; } close (FH); } close (AG); '_EOF_' # << happy emacs # verify OK: checkAgpAndFa ucsc.agp ucsc.fa.gz 2>&1 | tail -3 # FASTA sequence entry # Valid Fasta file entry # All AGP and FASTA entries agree - both files are valid ############################################################################ # initial database build (DONE - 2012-09-28 - Hiram) cd /hive/data/genomes/mm10Strains1 cat << '_EOF_' > mm10Strains1.config.ra # Config parameters for makeGenomeDb.pl: db mm10Strains1 clade haplotypes genomeCladePriority 200 scientificName Mus musculus commonName mm10Haps assemblyDate Dec. 2011 assemblyLabel GRCm38 Genome Reference Consortium Mouse Reference 38, alternate strains assemblyShortLabel Mouse strains orderKey 1300 mitoAcc none fastaFiles /hive/data/genomes/mm10Strains1/genbank/ucsc.fa.gz agpFiles /hive/data/genomes/mm10Strains1/genbank/ucsc.agp # qualFiles /dev/null dbDbSpeciesDir mouse photoCreditURL http://www.jax.org/ photoCreditName Photo courtesy The Jackson Laboratory ncbiGenomeId 52 ncbiAssemblyId 416688 ncbiAssemblyName GRCm38.p1 ncbiBioProject 51977 genBankAccessionID GCA_000001635.3 taxId 10090 '_EOF_' # << happy emacs # verify fasta and AGP makeGenomeDb.pl -stop=agp mm10Strains1.config.ra > agp.log 2>&1 # when OK, continue with database construction makeGenomeDb.pl -continue=db mm10Strains1.config.ra > db.log 2>&1 ############################################################################ # RepeatMasker (DONE - 2012-10-04 - Hiram) mkdir /hive/data/genomes/mm10Strains1/bed/repeatMasker cd /hive/data/genomes/mm10Strains1/bed/repeatMasker time doRepeatMasker.pl mm10Strains1 -buildDir=`pwd` -noSplit \ -bigClusterHub=swarm \ -dbHost=hgwdev -workhorse=hgwdev > do.log 2>&1 & # real 94m17.323s cat faSize.rmsk.txt # 62219607 bases (1203474 N's 61016133 real 34371016 upper 26645117 lower) # in 94 sequences in 1 files # Total size: mean 661910.7 sd 1106026.7 min 28130 (chr17_GL456021_2) # max 5956088 (chr6_GL456054_2) median 215601 # %42.82 masked total, %43.67 masked real ########################################################################### # TRF simple repeats (DONE - 2012-10-04 - Hiram) mkdir /hive/data/genomes/mm10Strains1/bed/simpleRepeat cd /hive/data/genomes/mm10Strains1/bed/simpleRepeat time doSimpleRepeat.pl mm10Strains1 -buildDir=`pwd` -dbHost=hgwdev \ -smallClusterHub=encodek -workhorse=hgwdev > do.log 2>&1 & # real 4m57.822s cat fb.simpleRepeat # 2190891 bases of 61018268 (3.591%) in intersection # when RM is done, add this mask: cd /hive/data/genomes/mm10Strains1 twoBitMask mm10Strains1.rmsk.2bit \ -add bed/simpleRepeat/trfMask.bed mm10Strains1.2bit # safe to ignore warning: has >=13 fields twoBitToFa mm10Strains1.2bit stdout | faSize stdin \ > faSize.mm10Strains1.2bit.txt # 62219607 bases (1203474 N's 61016133 real 34322554 upper 26693579 lower) # in 94 sequences in 1 files # Total size: mean 661910.7 sd 1106026.7 min 28130 (chr17_GL456021_2) # max 5956088 (chr6_GL456054_2) median 215601 # %42.90 masked total, %43.75 masked real rm /gbdb/mm10Strains1/mm10Strains1.2bit ln -s `pwd`/mm10Strains1.2bit /gbdb/mm10Strains1/ ########################################################################### # altSequence track patch10 (DONE - 2012-10-05 - Hiram) # provide links to locations on reference genome where these # haplotypes map to mkdir /hive/data/genomes/mm10Strains1/bed/altSequence cd /hive/data/genomes/mm10Strains1/bed/altSequence cat << '_EOF_' > hapLocate.pl #!/usr/bin/env perl use strict; use warnings; sub usage() { print STDERR "usage: ./gatherNames.pl ../../../mm10/genbank | sort -k1,1 -k2,2n > strains.to.mm10.bed\n"; print STDERR "expecting the Mus_musculus/GRCm38.p1/ hierarchy in ../../../mm10/genbank from NCBI\n"; print STDERR "also writes to the file: mm10.to.strains.bed\n"; exit 255; } my $argc = scalar(@ARGV); if ($argc != 1) { usage; } my $genbankDir = shift; if ( ! -d $genbankDir ) { print STDERR "ERROR: given directory $genbankDir is not a directory or does not exist"; usage; } my @placeList = split('\n',`find $genbankDir -type f | grep alt_scaffold_placement.txt | grep -v UNKNOWN`); open (MM, "|sort -k1,1 -k2,2n > mm10.to.strains.bed") or die "can not write to mm10.to.strains.bed"; for (my $i = 0; $i < scalar(@placeList); ++$i) { my $placement = $placeList[$i]; open (FH, "sort -t'\t' -k6,6n $placement|") or die "ERROR: can not read $placement"; while (my $line = ) { chomp $line; next if ($line =~ m/^\s*#/); my ($altAsmName, $primAsmName, $altScafName, $altScafAcc, $parentType, $parentName, $parentAcc, $regionName, $ori, $altScafStart, $altScafStop, $parentStart, $parentStop, $altStartTail, $altStopTail) = split('\t',$line); if ($parentStart eq "na") { printf STDERR "# WARN: skipping no parentStart: %s\n", $line; next; } if ($parentName eq "na") { printf STDERR "# WARN: skipping no parentName: %s\n", $line; next; } my $altSize = $altScafStop - $altScafStart; my $ucscChrName = lc($altScafName); my @fields = split('_', $ucscChrName); my $ctgIndex = scalar(@fields) - 1; $ucscChrName = sprintf("chr%s_%s", $parentName, $altScafAcc); $ucscChrName =~ s/\./_/; $ori = "+" if ($ori eq "b"); printf MM "chr%s\t%d\t%d\t%s:%d-%d\n", $parentName, $parentStart-1, $parentStop, $ucscChrName, $altScafStart, $altScafStop; printf "%s\t%d\t%d\tchr%s:%d-%d\n", $ucscChrName, $altScafStart-1, $altScafStop, $parentName, $parentStart, $parentStop; } close (FH); } close (MM); '_EOF_' # << happy emacs chmod +x hapLocate.pl ./hapLocate.pl ../../../mm10/genbank \ | sort -k1,1 -k2,2n > strains.to.mm10.bed hgLoadBed mm10Strains1 mm10Locations strains.to.mm10.bed # Read 87 elements of size 4 from strains.to.mm10.bed featureBits -countGaps mm10Strains1 mm10Locations # 60596145 bases of 62219607 (97.391%) in intersection ############################################################################ # lastz alignment to mm10 (DONE - 2012-11-08 - Hiram) mkdir /hive/data/genomes/mm10Strains1/bed/lastzMm10.2012-10-05 cd /hive/data/genomes/mm10Strains1/bed/lastzMm10.2012-10-05 # construct a 2bit file of just the mm10 reference sequences # and all the business to run lastz on each haplotype with its # corresponding target sequence in mm10 rm -fr mm10Bits run.blastz mm10Bits.lift mkdir mm10Bits mkdir -p run.blastz/tParts mkdir -p run.blastz/qParts # typical line in ../altSequence/mm10.to.strains.bed # location in mm10 location in the altStrain # chr1 60640808 63588262 chr1_JH584320_1:1-1003859 awk '{print $1,$2,$3,$4}' ../altSequence/mm10.to.strains.bed | while read L do C=`echo $L | awk '{print $1}'` S=`echo $L | awk '{print $2}'` E=`echo $L | awk '{print $3}'` H=`echo $L | awk '{print $4}' | sed -e "s/:.*//"` HS=`echo $L | awk '{print $4}' | sed -e "s/-.*//"` HS=`echo $HS | sed -e "s/.*://" | awk '{print $1-1}'` HE=`echo $L | awk '{print $4}' | sed -e "s/.*-//"` CE=`grep "^${C}" /hive/data/genomes/mm10/chrom.sizes | cut -f2 | head -1` size=`echo $E $S | awk '{printf "%d", $1-$2}'` echo -e "$S\t$C.$S-$E\t$size\t$C\t$CE" echo mm10.2bit:${C}:$S-$E 1>&2 if [ ! -s mm10Bits/$C.$S-$E.fa ]; then echo ">$C.$S-$E" > mm10Bits/$C.$S-$E.fa twoBitToFa /gbdb/mm10/mm10.2bit:${C}:$S-$E stdout \ | grep -v "^>" >> mm10Bits/$C.$S-$E.fa fi echo -e "/hive/data/genomes/mm10Strains1/mm10Strains1.2bit:$H:$HS-$HE" \ > run.blastz/tParts/$H.lst echo -e "/hive/data/genomes/mm10Strains1/bed/lastzMm10.2012-10-05/mm10Bits.2bit:$C.$S-$E:$HS-$size" \ > run.blastz/qParts/$H.lst echo -e "/cluster/bin/scripts/blastz-run-ucsc -outFormat psl tParts/$H.lst qParts/$H.lst ../DEF {check out exists ../psl/$H.psl}" \ >> run.blastz/jobList done | sort -u > mm10Bits.lift faToTwoBit mm10Bits/chr*.fa mm10Bits.2bit twoBitInfo mm10Bits.2bit stdout | sort -k2nr > mm10Bits.chrom.sizes # these lastz parameters are from the human-human self lastz cat << '_EOF_' > DEF # mouse vs mouse BLASTZ=lastz # maximum M allowed with lastz is only 254 BLASTZ_M=254 BLASTZ_Q=/scratch/data/blastz/human_chimp.v2.q BLASTZ_O=600 BLASTZ_E=150 BLASTZ_K=10000 BLASTZ_Y=15000 BLASTZ_T=2 # TARGET: Mouse Mm10Strains1 SEQ1_DIR=/hive/data/genomes/mm10Strains1/mm10Strains1.2bit SEQ1_LEN=/hive/data/genomes/mm10Strains1/chrom.sizes SEQ1_CHUNK=5000000 SEQ1_LAP=10000 SEQ1_IN_CONTIGS=0 SEQ1_LIMIT=2 # QUERY: Mouse Mm10 SEQ2_DIR=/scratch/data/mm10/mm10.2bit SEQ2_LEN=/scratch/data/mm10/chrom.sizes SEQ2_CTGDIR=/hive/data/genomes/mm10Strains1/bed/lastzMm10.2012-10-05/mm10Bits.2bit SEQ2_CTGLEN=/hive/data/genomes/mm10Strains1/bed/lastzMm10.2012-10-05/mm10Bits.chrom.sizes SEQ2_LIFT=/hive/data/genomes/mm10Strains1/bed/lastzMm10.2012-10-05/mm10Bits.lift SEQ2_CHUNK=5000000 SEQ2_LAP=0 SEQ2_IN_CONTIGS=0 SEQ2_LIMIT=2 BASE=/hive/data/genomes/mm10Strains1/bed/lastzMm10.2012-10-05 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs ssh swarm cd /hive/data/genomes/mm10Strains1/bed/lastzMm10.2012-10-05/run.blastz mkdir ../psl para create jobList para try ... check ... push para time # Completed: 87 of 87 jobs # CPU time in finished jobs: 237s 3.95m 0.07h 0.00d 0.000 y # IO & Wait Time: 422s 7.04m 0.12h 0.00d 0.000 y # Average job time: 8s 0.13m 0.00h 0.00d # Longest finished job: 72s 1.20m 0.02h 0.00d # Submission to last job: 3359s 55.98m 0.93h 0.04d # put together the individual results cd /hive/data/genomes/mm10Strains1/bed/lastzMm10.2012-10-05 mkdir pslParts cat psl/chr*.psl | gzip -c > pslParts/mm10Strains1.mm10.psl.gz # lift the query bits to mm10 coordinates to properly show on this # browser if the raw psl bits are to be seen cat psl/chr*.psl \ | liftUp -pslQ -type=.psl stdout mm10Bits.lift error stdin \ | gzip -c > mm10Strains1.liftedTo_mm10.psl.gz # constructing a chain from those results mkdir -p /hive/data/genomes/mm10Strains1/bed/lastzMm10.2012-10-05/axtChain/run cd /hive/data/genomes/mm10Strains1/bed/lastzMm10.2012-10-05/axtChain/run time zcat ../../pslParts/mm10Strains1.mm10.psl.gz \ | axtChain -psl -verbose=0 -scoreScheme=/scratch/data/blastz/human_chimp.v2.q -minScore=2000 -linearGap=medium stdin \ /hive/data/genomes/mm10Strains1/mm10Strains1.2bit \ /hive/data/genomes/mm10Strains1/bed/lastzMm10.2012-10-05/mm10Bits.2bit \ stdout \ | chainAntiRepeat /hive/data/genomes/mm10Strains1/mm10Strains1.2bit \ /hive/data/genomes/mm10Strains1/bed/lastzMm10.2012-10-05/mm10Bits.2bit \ stdin mm10Strains1.mm10.preLift.chain # real 0m44.175s liftUp -chainQ mm10Strains1.mm10.lifted.chain \ ../../mm10Bits.lift carry mm10Strains1.mm10.preLift.chain # constructing the net files: cd /hive/data/genomes/mm10Strains1/bed/lastzMm10.2012-10-05/axtChain chainMergeSort run/mm10Strains1.mm10.lifted.chain \ | nice gzip -c > mm10Strains1.mm10.all.chain.gz chainSplit chain mm10Strains1.mm10.all.chain.gz # Make nets ("noClass", i.e. without rmsk/class stats which are added later): time chainPreNet mm10Strains1.mm10.all.chain.gz \ /hive/data/genomes/mm10Strains1/chrom.sizes \ /scratch/data/mm10/chrom.sizes stdout \ | chainNet stdin -minSpace=1 /hive/data/genomes/mm10Strains1/chrom.sizes \ /scratch/data/mm10/chrom.sizes stdout /dev/null \ | netSyntenic stdin noClass.net # real 0m1.338s # Make liftOver chains: netChainSubset -verbose=0 noClass.net mm10Strains1.mm10.all.chain.gz stdout \ | chainStitchId stdin stdout | gzip -c > mm10Strains1.mm10.over.chain.gz # Make axtNet for download: one .axt per mm10Strains1 seq. netSplit noClass.net net cd .. mkdir -p axtNet foreach f (axtChain/net/*.net) netToAxt $f axtChain/chain/$f:t:r.chain \ /hive/data/genomes/mm10Strains1/mm10Strains1.2bit \ /scratch/data/mm10/mm10.2bit stdout \ | axtSort stdin stdout \ | gzip -c > axtNet/$f:t:r.mm10Strains1.mm10.net.axt.gz end # Make mafNet for multiz: one .maf per mm10Strains1 seq. mkdir -p mafNet foreach f (axtNet/*.mm10Strains1.mm10.net.axt.gz) axtToMaf -tPrefix=mm10Strains1. -qPrefix=mm10. $f \ /hive/data/genomes/mm10Strains1/chrom.sizes \ /scratch/data/mm10/chrom.sizes \ stdout \ | gzip -c > mafNet/$f:t:r:r:r:r:r.maf.gz end # swap that business to mm10 mkdir /hive/data/genomes/mm10/bed/blastz.mm10Strains1.swap cd /hive/data/genomes/mm10/bed/blastz.mm10Strains1.swap time doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10Strains1/bed/lastzMm10.2012-10-05/DEF \ -swap -noDbNameCheck -stop=load \ -noLoadChainSplit -chainMinScore=2000 \ -chainLinearGap=medium -workhorse=hgwdev \ -smallClusterHub=encodek -bigClusterHub=swarm > swap.load.log 2>&1 # real 2m51.872s cat fb.mm10.chainMm10Strains1Link.txt # 48691854 bases of 2652783500 (1.836%) in intersection # construct psl files from the liftOver chain: cd /hive/data/genomes/mm10/bed/blastz.mm10Strains1.swap/axtChain chainToPsl mm10.mm10Strains1.over.chain.gz \ /hive/data/genomes/mm10/chrom.sizes \ /hive/data/genomes/mm10Strains1/chrom.sizes \ /hive/data/genomes/mm10/mm10.2bit \ /hive/data/genomes/mm10Strains1/mm10Strains1.2bit \ mm10.mm10Strains1.over.psl # chainToPsl appears to have a problem, note errors from pslCheck: pslCheck -db=mm10 mm10.mm10Strains1.over.psl # checked: 578 failed: 51 errors: 51 pslRecalcMatch mm10.mm10Strains1.over.psl \ /hive/data/genomes/mm10/mm10.2bit \ /hive/data/genomes/mm10Strains1/mm10Strains1.2bit \ fixup.mm10.mm10Strains1.over.psl pslCheck -db=mm10 fixup.mm10.mm10Strains1.over.psl # checked: 946 failed: 0 errors: 0 # load this PSL track hgLoadPsl mm10 -table=altSeqLiftOverPslStrains1 fixup.mm10.mm10Strains1.over.psl # using this name on this table since hgc code triggers off of # the beginning: "altSeqLiftOverPsl" to display the sequence # from the seqMm10Strains1 and extMm10Strains1 tables loaded below ############################################################################ # Add this sequence to mm10 (DONE - 2012-11-08 - Hiram) mkdir /hive/data/genomes/mm10Strains1/bed/altSequence/seqExt cd /hive/data/genomes/mm10Strains1/bed/altSequence/seqExt twoBitToFa ../../../mm10Strains1.2bit mm10Strains1.fa mkdir -p /gbdb/mm10/mm10Strains1 mm10Strains1 faSplit byname mm10Strains1.fa ./mm10Strains1/ ln -s `pwd`/mm10Strains1/*.fa /gbdb/mm10/mm10Strains1 hgLoadSeq -drop -seqTbl=seqMm10Strains1 -extFileTbl=extMm10Strains1 mm10 \ /gbdb/mm10/mm10Strains1/*.fa ############################################################################