# for emacs: -*- mode: sh; -*-
# Creating the assembly for Monodelphis domestica
# South American, Short-tailed Opossum
# http://www.genome.gov/11510687
# http://www.genome.gov/12512285
# DOWNLOAD SEQUENCE (DONE - 2004-12-01 - Hiram)
ssh kksilo
mkdir -p /cluster/store8/monDom1/broad.mit.edu
ln -s /cluster/store8/monDom1 /cluster/data/monDom1
cd /cluster/data/monDom1/broad.mit.edu
# Set user name and password in your $HOME/.netrc
wget --timestamping "ftp://ftp.broad.mit.edu/*"
# takes a couple hours
# What we have:
ls -ogrt
# total 4071636
# -rw-rw-r-- 1 3992 Oct 18 16:45 assembly.format
# -rw-rw-r-- 1 1196876365 Oct 18 16:52 contigs.bases.gz
# -rw-rw-r-- 1 539154819 Oct 18 17:24 contigs.quals.gz
# -rwxrwxr-x 1 4418102 Oct 18 17:52 supercontigs.summary
# -rwxrwxrwx 1 3910827 Oct 18 17:52 supercontigs
# -rw-rw-r-- 1 23299859 Oct 18 17:52 reads.unplaced.gz
# -rw-rw-r-- 1 645129395 Oct 18 17:52 reads.placed.gz
# -rw-rw-r-- 1 10757748 Oct 18 17:52 assembly.agp
# -rw-rw-r-- 1 1199683639 Oct 19 17:15 assembly.agp.fasta.gz
# -rw-rw-r-- 1 541975023 Oct 19 17:34 assembly.agp.qual.gz
time gunzip *.gz
# That took almost 2 hours on a busy kksilo
ls -ogrt
# total 29983148
# -rw-rw-r-- 1 3992 Oct 18 16:45 assembly.format
# -rw-rw-r-- 1 3537338236 Oct 18 16:52 contigs.bases
# -rw-rw-r-- 1 10472323694 Oct 18 17:24 contigs.quals
# -rwxrwxr-x 1 4418102 Oct 18 17:52 supercontigs.summary
# -rwxrwxrwx 1 3910827 Oct 18 17:52 supercontigs
# -rw-rw-r-- 1 146964359 Oct 18 17:52 reads.unplaced
# -rw-rw-r-- 1 2273830873 Oct 18 17:52 reads.placed
# -rw-rw-r-- 1 10757748 Oct 18 17:52 assembly.agp
# -rw-rw-r-- 1 3608989613 Oct 19 17:15 assembly.agp.fasta
# -rw-rw-r-- 1 10614160078 Oct 19 17:34 assembly.agp.qual
# DATA INTEGRITY VERIFICATION (DONE - Hiram - 2004-12-02)
ssh kksilo
mkdir /cluster/data/monDom1/dataCheck
cd /cluster/data/monDom1/dataCheck
grep "^>" ../broad.mit.edu/contigs.bases | sed -e "s/^>//" > contigs.names
# 2 minutes
# Check for duplicates, should be same line count
sort contigs.names | wc
# 109065 109065 1415800
sort -u contigs.names | wc
# 109065 109065 1415800
sort contigs.names | head -3
# contig_0
# contig_1
# contig_10
sort contigs.names | tail -3
# contig_99997
# contig_99998
# contig_99999
wc contigs.names
# 109065 109065 1415800 contigs.names
grep "^>" ../broad.mit.edu/contigs.quals \
| sed -e "s/^>//" > contigs.quals.names
# 6 minutes
sum -r contigs*
# 35350 1383 contigs.names
# 35350 1383 contigs.quals.names
awk '{print $6}' ../broad.mit.edu/assembly.agp \
| grep contig_ > assembly.agp.contig.names
# The assembly agp contig list appears to be the same contig list
sort -u contigs.names | sum -r
# 18868 1383
sort -u assembly.agp.contig.names | sum -r
# 18868 1383
awk '{print $1}' ../broad.mit.edu/assembly.agp \
| sort -u > assembly.agp.scaffold.names
wc assembly.agp.scaffold.names
# 19348 19348 279110 assembly.agp.scaffold.names
head -3 assembly.agp.scaffold.names
# scaffold_0
# scaffold_1
# scaffold_10
tail -3 assembly.agp.scaffold.names
# scaffold_9997
# scaffold_9998
# scaffold_9999
grep "^>" ../broad.mit.edu/assembly.agp.fasta | \
sed -e "s/^>//; s/\..*//" > assembly.fasta.scaffold.names
# 2 minutes
# There are some duplicates in this fasta file:
sort -u assembly.fasta.scaffold.names | wc
# 19348 19348 279110
wc assembly.fasta.scaffold.names
# 19576 19576 282523 assembly.fasta.scaffold.names
# For example:
grep "^>scaffold_13303" ../broad.mit.edu/assembly.agp.fasta
# >scaffold_13303.1-5000000 (MonodelphisPreliminaryAssemblyOct04)
# >scaffold_13303.5000001-10000000 (MonodelphisPreliminaryAssemblyOct04)
# >scaffold_13303.10000001-15000000 (MonodelphisPreliminaryAssemblyOct04)
# >scaffold_13303.15000001-20000000 (MonodelphisPreliminaryAssemblyOct04)
# >scaffold_13303.20000001-22286839 (MonodelphisPreliminaryAssemblyOct04)
faCount ../broad.mit.edu/assembly.agp.fasta > faCount.assembly.agp.fasta
# seq len A C G
# total 3563247205 1085390300 660684418 660336426
# T N cpg
# 1085697086 71138975 16821709
grep scaffold_ faCount.assembly.agp.fasta > scaffolds.faCount
# Taking a look at those counts, the smallest scaffold is 1000
# bases. The largest ones are broken into 5,000,000 base chunks
# as evidenced by the duplicate example above.
faCount ../broad.mit.edu/contigs.bases > faCount.contigs
# seq len A C G
# total 3492108230 1085390300 660684418 660336426
# T N cpg
# 1085697086 0 16821709
grep contig_ faCount.contigs > contigs.faCount
# The contigs range in size from 85 bases to 658,080
# The bulk of them start at a size of 1000
# COMBINE scaffold fasta pieces into single chunks, remove the extra
# base-range info from the names. (DONE - Hiram - 2004-12-02)
ssh kksilo
cd /cluster/data/monDom1/broad.mit.edu
cat << '_EOF_' collapseScaffolds.sh
#!/bin/sh
#
awk '
BEGIN { name="" }
/^>/ {
id=$1
sub(">","",id)
sub("\\..*","",id)
if (!match(name,id)) {
printf ">%s\n", id
name=id
}
}
/^[^>]/ { print }
' assembly.agp.fasta
'_EOF_'
# << this line keeps emacs coloring happy
chmod +x collapseScaffolds.sh
time ./collapseScaffolds.sh > scaffolds.fa
# 11 minutes
# make sure we have not damaged the sequence:
time faCount scaffolds.fa | tail -1
# 4 minutes
# total 3563247205 1085390300 660684418 660336426
# 1085697086 71138975 16821709
# Those are the same numbers as above
# MAKE 2BIT NIB FILE (DONE - Hiram - 2004-12-02)
ssh kksilo
cd /cluster/data/monDom1
faToTwoBit broad.mit.edu/scaffolds.fa monDom1.2bit
# check that the sequence hasn't been damaged
twoBitToFa monDom1.2bit stdout | faCount stdin | tail -1
# total 3563247205 1085390300 660684418 660336426
# 1085697086 71138975 16821709
# still the same numbers
# CREATE DATABASE (DONE - Hiram - 2004-12-02)
ssh hgwdev
cd /cluster/data/monDom1
mkdir /gbdb/monDom1
ln -s /cluster/data/monDom1/monDom1.2bit /gbdb/monDom1
twoBitInfo monDom1.2bit stdout |
awk '{printf "%s\t%s\t/gbdb/monDom1/monDom1.2bit\n", $1,$2}' \
> chromInfo.tab
hgsql -e "create database monDom1;" mysql
hgsql -e "create table grp (PRIMARY KEY(NAME)) select * from hg17.grp;" \
monDom1
hgsql monDom1 < $HOME/kent/src/hg/lib/chromInfo.sql
hgsql -e 'load data local infile "chromInfo.tab" into table chromInfo;' \
monDom1
# generate chrom.sizes list in order by size, biggest first
# This listing is going to be used in a variety of procedures below
twoBitInfo monDom1.2bit stdout | sort -rn +1 > chrom.sizes
hgsql monDom1 -N -e 'select chrom from chromInfo' > chrom.lst
# Enter monDom1 into dbDb and defaultDb so test browser knows about it:
hgsql -e 'INSERT INTO dbDb (name, description, nibPath, organism, \
defaultPos, active, orderKey, genome, scientificName, \
htmlPath, hgNearOk, hgPbOk, sourceName) \
VALUES("monDom1", "Oct 2004", "/gbdb/monDom1", "M. domestica", \
"scaffold_13303:1000000-11000000", 1, 33, "Opossum", \
"Monodelphis domestica", \
"/gbdb/monDom1/html/description.html", 0, 0, \
"Broad Inst. Prelim Oct04");' \
-h localhost hgcentraltest
hgsql -e 'INSERT INTO defaultDb (name, genome) \
VALUES("monDom1", "Opossum")' \
-h localhost hgcentraltest
# Make trackDb table so browser knows what tracks to expect (DONE - Hiram)
cd $HOME/kent/src/hg/makeDb/trackDb
mkdir /cluster/data/monDom1/html
ln -s /cluster/data/monDom1/html /gbdb/monDom1/html
mkdir opossum
cvs add opossum
cd opossum
mkdir monDom1
cvs add monDom1
cd monDom1
cat << '_EOF_' > description.html
Monodelphis domestica
Description page to be done for Monodelphis domestica
'_EOF_'
# << this line keeps emacs coloring happy
cvs add description.html
cvs commit
cd ../..
make DBS="monDom1" ZOO_DBS=""
# CHUNK up scaffolds for repeat masking (DONE - Hiram - 2004-12-03)
# See also: comments about this in: makeApiMel1.doc
ssh kksilo
cd /cluster/data/monDom1
mkdir -p jkStuff
faSplit size broad.mit.edu/scaffolds.fa 500000 -oneFile \
scaffoldsSplit -lift=jkStuff/scaffoldsSplit.lft
time faSplit about scaffoldsSplit.fa 200000 chunks500k/chunk_
# about 9 minutes for each of those two faSplits
mkdir -p /cluster/bluearc/monDom1
# copy to bluearc for cluster run
cp -Rp ./chunks500k /cluster/bluearc/monDom1
# this business in /cluster/bluearc/monDom1/ was later removed
# and to iservers:
ssh kkr1u00
mkdir /iscratch/i/monDom1
cd /iscratch/i/monDom1
cp -Rp /cluster/bluearc/monDom1/chunks500k .
/cluster/bin/iSync
# LOAD GAP & GOLD TABLES FROM AGP (DONE - 2004-12-03 - Hiram)
ssh hgwdev
cd /cluster/data/monDom1
hgGoldGapGl -noGl monDom1 broad.mit.edu/assembly.agp
# Verify sanity of indexes
hgsql -e "show index from gold;" monDom1
hgsql -e "show index from gap;" monDom1
# Look at the Cardinality column, it should have some healthy
# numbers for all indices
# For some reason, the indices do not get built correctly --
# "show index from gap/gold" shows NULL cardinalities for chrom.
# Rebuild indices with "analyze table".
hgsql -e "analyze table gold; analyze table gap;" monDom1
# RUN REPEAT MASKER (DONE - 2004-12-06 - Hiram)
# Received a library from Michele Clamp who says Damian Keefe
# at Ensembl created to use in addition to the generic mammal library.
# Will run both -mammal and this one, and combine the results.
# have placed the library in monDom1/jkStuff/monDom1.novel.RM.lib
# DIFFERENT fileserver, new disk space
ssh kksilo
mkdir /cluster/store1/monDom1
mkdir /cluster/store1/monDom1/RMOut
ln -s /cluster/store1/monDom1/RMOut /cluster/store8/monDom1
ln -s /cluster/store8/monDom1/chunks500k /cluster/store1/monDom1
ln -s /cluster/store8/monDom1/jkStuff /cluster/store1/monDom1
#
cd /cluster/data/monDom1
cat << '_EOF_' > jkStuff/RMOpossum
#!/bin/csh -fe
cd $1
/bin/mkdir -p /tmp/monDom1/$2
/bin/cp ../chunks500k/$2 /tmp/monDom1/$2/
pushd /tmp/monDom1/$2
/cluster/bluearc/RepeatMasker/RepeatMasker -s -lib /iscratch/i/monDom1/monDom1.novel.RM.lib $2
mv $2.out $2.novel.out
/cluster/bluearc/RepeatMasker/RepeatMasker -s -species "Monodelphis domestica" $2
tail +4 $2.novel.out >> $2.out
popd
/bin/cp -p /tmp/monDom1/$2/$2.out ./
/bin/rm -fr /tmp/monDom1/$2/*
/bin/rmdir --ignore-fail-on-non-empty /tmp/monDom1/$2
/bin/rmdir --ignore-fail-on-non-empty /tmp/monDom1
'_EOF_'
# << this line keeps emacs coloring happy
chmod +x jkStuff/RMOpossum
mkdir RMRun
bash # if you are csh
ls chunks500k | while read chunk
do
echo ../jkStuff/RMOpossum /cluster/data/monDom1/RMOut $chunk \
'{'check in line+ /iscratch/i/monDom1/chunks500k/$chunk'}' \
'{'check out line+ /cluster/data/monDom1/RMOut/$chunk.out'}'
done > RMRun/RMJobs
exit # out of bash back to csh
# do the run
ssh kk
mkdir /cluster/data/monDom1/RMRun
cd /cluster/data/monDom1/RMRun
para create RMJobs
para try, check, push, check,...
# This takes extra long compared to normal because of the two
# passes being run on the data
Completed: 7627 of 7627 jobs
CPU time in finished jobs: 117842813s 1964046.88m 32734.11h 1363.92d 3.737 y
IO & Wait Time: 704870s 11747.84m 195.80h 8.16d 0.022 y
Average job time: 15543s 259.05m 4.32h 0.18d
Longest job: 24045s 400.75m 6.68h 0.28d
Submission to last job: 185727s 3095.45m 51.59h 2.15d
# Lift up the split-scaffold .out's to scaffold .out's
ssh eieio
cd /cluster/store1/monDom1
mkdir RMOutLifted
ln -s `pwd`/RMOutLifted /cluster/data/monDom1
bash # if you are csh
find ./RMOut -type f | while read F
do
B=${F/.\/RMOut\//}
echo "$F -> RMOutLifted/$B"
liftUp RMOutLifted/$B jkStuff/scaffoldsSplit.lft warn $F
done
exit # the bash session if your are csh
# Cat all those lifted .out files together into one single file
head -3 RMOutLifted/chunk_00.fa.out > RMRun/scaffolds.fa.RM.out
find ./RMOutLifted -type f | while read f
do
tail +4 $f >> RMRun/scaffolds.fa.RM.out
done
# I tried the following sort in the above loop to see if
# the sort mentioned below could be avoided:
# tail +4 $f | sort -u >> scaffolds.unique.fa.RM.out
# but this sort was not thorough enough. It ended up with more
# records loaded into the database, but with the same amount
# of featureBits coverage.
# Eliminate the duplicates from the double RepeatMasker run.
# When run twice, it finds the Simple and Low Complexity repeats
# twice.
hgLoadOut -tabFile=hgLoadOut.tab monDom1 -nosplit scaffolds.fa.RM.out
# A few "Strange perc. field" outputs, which always happens, but
# not too many. e.g.:
# Strange perc. field -1.4 line 1939539 of scaffolds.fa.RM.out
# Strange perc. field -2.5 line 3078381 of scaffolds.fa.RM.out
# Strange perc. field -2.1 line 3078381 of scaffolds.fa.RM.out
# Strange perc. field -40.9 line 3442991 of scaffolds.fa.RM.out
# Strange perc. field -3.9 line 3442991 of scaffolds.fa.RM.out
# ... etc ...
# note: 3 records dropped due to repStart > repEnd
ls -og scaffolds.fa.RM.out hgLoadOut.tab
# -rw-rw-r-- 1 2553485808 Dec 6 00:05 scaffolds.fa.RM.out
# -rw-rw-r-- 1 1935604506 Dec 6 00:16 hgLoadOut.tab
sort -k 6,6 -k7n,7n hgLoadOut.tab | uniq > hgLoadOutUniq.tab
# takes almost an hour
ssh hgwdev
cd /cluster/store1/monDom1
# Get the structure of the table started
hgLoadOut -nosplit -table=rmsk monDom1 RMOutLifted/chunk_1000.fa.out
# Now, reload it fully with the unique set
hgsql -e 'delete from rmsk;' monDom1
hgsql -e 'load data local infile "hgLoadOutUniq.tab" into table rmsk;' \
monDom1
# This load takes an hour, 12 minutes for a featureBits:
featureBits monDom1 rmsk
# 1775646140 bases of 3492108230 (50.847%) in intersection
featureBits hg17 rmsk
# 1390952984 bases of 2866216770 (48.529%) in intersection
featureBits canFam1 rmsk
# 896773874 bases of 2359845093 (38.001%) in intersection
featureBits mm5 rmsk
# 1137310280 bases of 2615483787 (43.484%) in intersection
featureBits rn3 rmsk
# 1117483165 bases of 2571104688 (43.463%) in intersection
featureBits panTro1 rmsk
# 1311281819 bases of 2733948177 (47.963%) in intersection
# Verify the indexes are correct
hgsql -e "show index from rmsk;" monDom1
# Should be a good cardinality count for each one
# It does not, it shows None for genoName (genoName(14), bin)
# and 18744904 for both (genomeName(14), genoStart) and (...genoEnd)
hgsql monDom1 -e 'drop index genoName on rmsk; \
drop index genoName_2 on rmsk; \
drop index genoName_3 on rmsk; \
create index bin on rmsk (genoName(14), bin); \
create index genoStart on rmsk (genoName(14), genoStart); \
create index genoEnd on rmsk (genoName(14), genoEnd);'
# This index creation took 76 minutes
# After that, the "bin" index has a cardinality of 48,561
# genoStart and genoEnd are still at 18,744,904
# SIMPLE REPEAT [TRF] TRACK (DONE - 2004-12-03 - Hiram)
# Run this on the rack 9 cluster
ssh kk9
mkdir /cluster/data/monDom1/bed/simpleRepeat
cd /cluster/data/monDom1/bed/simpleRepeat
mkdir trf
cat << '_EOF_' > runTrf
#!/bin/csh -fe
#
set path1 = $1
set inputFN = $1:t
set outpath = $2
set outputFN = $2:t
mkdir -p /tmp/$outputFN
cp $path1 /tmp/$outputFN
pushd .
cd /tmp/$outputFN
/cluster/bin/i386/trfBig -trf=/cluster/bin/i386/trf $inputFN /dev/null -bedAt=$outputFN -tempDir=/tmp
popd
rm -f $outpath
cp -p /tmp/$outputFN/$outputFN $outpath
rm -fr /tmp/$outputFN/*
rmdir --ignore-fail-on-non-empty /tmp/$outputFN
'_EOF_'
# << this line makes emacs coloring happy
chmod +x runTrf
cat << '_EOF_' > gsub
#LOOP
./runTrf {check in line+ $(path1)} {check out line trf/$(root1).bed}
#ENDLOOP
'_EOF_'
# << this line makes emacs coloring happy
ls -1S /iscratch/i/monDom1/chunks500k | sed -e "s#^#/iscratch/i/monDom1/chunks500k/#" > genome.lst
gensub2 genome.lst single gsub jobList
para create jobList
para try
para check
para push
Completed: 7627 of 7627 jobs
CPU time in finished jobs: 24845s 414.08m 6.90h 0.29d 0.001 y
IO & Wait Time: 19407s 323.45m 5.39h 0.22d 0.001 y
Average job time: 6s 0.10m 0.00h 0.00d
Longest job: 49s 0.82m 0.01h 0.00d
Submission to last job: 1309s 21.82m 0.36h 0.02d
find ./trf -type f | xargs cat | \
liftUp simpleRepeat.bed \
/cluster/data/monDom1/jkStuff/scaffoldsSplit.lft \
warn stdin > lu.out 2>&1
# Load into the database:
ssh hgwdev
cd /cluster/data/monDom1/bed/simpleRepeat
hgLoadBed monDom1 simpleRepeat simpleRepeat.bed \
-sqlTable=$HOME/kent/src/hg/lib/simpleRepeat.sql
# Loaded 697668 elements of size 16
# Compare with some other assemblies
featureBits monDom1 simpleRepeat
# 55000238 bases of 3492108230 (1.575%) in intersection
featureBits rn3 simpleRepeat
# 70073656 bases of 2571104688 (2.725%) in intersection
featureBits mm5 simpleRepeat
# 81414259 bases of 2615483787 (3.113%) in intersection
featureBits monDom1 simpleRepeat
# 54952425 bases of 2866216770 (1.917%) in intersection
# CREATE MICROSAT TRACK (done 2006-7-5 JK)
ssh hgwdev
cd /cluster/data/monDom1/bed
mkdir microsat
cd microsat
awk '($5==2 || $5==3) && $6 >= 15 && $8 == 100 && $9 == 0 {printf("%s\t%s\t%s\t%dx%s\n", $1, $2, $3, $6, $16);}' ../simpleRepeat/simpleRepeat.bed > microsat.bed
/cluster/bin/i386/hgLoadBed monDom1 microsat microsat.bed
# PROCESS SIMPLE REPEATS INTO MASK (DONE - 2004-12-06 - Hiram)
# After the simpleRepeats track has been built, make a filtered version
# of the trf output: keep trf's with period <= 12:
ssh eieio
cd /cluster/data/monDom1/bed/simpleRepeat
awk '{if ($5 <= 12) print;}' simpleRepeat.bed > trfMask.bed
# MASK FA USING REPEATMASKER AND FILTERED TRF FILES (DONE - 2004-12-06 - Hiram)
# maskOutFa is trying to read in the entire input sequence to get
# it ready for masking. This doesn't work on the ordinary memory
# machines since scaffolds.fa is the whole thing.
ssh kolossus
mkdir /scratch/monDom1
cd /scratch/monDom1
cp -p /cluster/store8/monDom1/broad.mit.edu/scaffolds.fa .
cp -p /cluster/store1/monDom1/bed/simpleRepeat/trfMask.bed .
cp -p /cluster/store1/monDom1/RMRun/scaffolds.fa.RM.out .
maskOutFa -soft scaffolds.fa trfMask.bed maskedScaffolds.fa
maskOutFa -clip -softAdd maskedScaffolds.fa scaffolds.fa.RM.out \
maskedScaffolds.fa
# those two mask operations each take 3 to 4 minutes.
# There were a number of negative rEnd warnings on scaffold_13261,
# all on type L1Md_T, one warning each on
# scaffold_13244, scaffold_13338 for type L1M4c,
# and one on scaffold_13498 for type monodelphis_domestica_23_1
# and one warning:
# Mask record out of range line 6686033 of scaffolds.fa.RM.out
# scaffold_13485 size is 3678616, range is 3678568-3678618
# This last error makes it stop without doing anything. Need to
# fix this. Use the -clip option. That helps it past that and
# now more warnings are seen, on scaffold_16808 for type Lx5,
# and one warning on scaffold_14879 for type monodelphis_domestica_48
# May as well create a 2bit while we are here:
time faToTwoBit maskedScaffolds.fa masked.2bit
# 3 minutes
# Then, check it:
twoBitToFa masked.2bit stdout | faCount stdin | tail -1
# 4 minutes, seems to have lost 2A's and 1 C:
# total 3563247202 1085390298 660684418 660336425
# 1085697086 71138975 16821709
# previously:
# total 3563247205 1085390300 660684418 660336426
# 1085697086 71138975 16821709
# I went back through this sequence and could not duplicate this
# failure of missing three bases ...
# Copy the masked 2bit file back to kksilo
ssh kksilo
cd /cluster/data/monDom1
scp -p kolossus:/scratch/hiram/monDom1/masked.2bit .
rm monDom1.2bit
mv masked.2bit monDom1.2bit
# Ask for the blat servers to be restarted with this new masked
# sequence
# Now clean up the unmasked split scaffolds to avoid confusion later.
rm -r chunks500k scaffoldsSplit.fa jkStuff/scaffoldsSplit.lft
# MAKE 11.OOC FILE FOR BLAT (TBD)
ssh kolossus
cd /cluster/bluearc/monDom1
# use repMatch of 1228 as this genome is ~ %20 larger than human
# 1024 + (1024 * 0.2) = 1228
blat /cluster/data/monDom1/monDom1.2bit \
/dev/null /dev/null -tileSize=11 -makeOoc=11.ooc -repMatch=1228
# Wrote 43126 overused 11-mers to 11.ooc
# -rw-rw-r-- 1 172512 Dec 7 17:22 11.ooc
ssh kkr1u00
cd /iscratch/i/monDom1
cp -p /cluster/bluearc/monDom1/11.ooc .
/cluster/bin/iSync
# at repMatch=1024
# Wrote 61187 overused 11-mers to 11.ooc
# at repMatch=540
# Wrote 207412 overused 11-mers to 11.ooc
# For comparison, Hg17 has:
# Wrote 30077 overused 11-mers to 11.ooc
# PUT SEQUENCE ON SCRATCH FOR BLASTZ (DONE - 2004-12-07 - Hiram)
# These chunks do not need to correspond to the 500,000 chunks
# used in the repeat mask run because we are not going to use the
# repeat mask .out files at lineage specific repeats.
ssh kksilo
cd /tmp
# Make sure you have enough room for this: (~ 4 Gb)
df -h .
# Filesystem Size Used Avail Use% Mounted on
# /dev/sda2 32G 23G 8.0G 74% /
# split scaffolds that are over 2,000,000
faSplit size /cluster/data/monDom1/maskedScaffolds.fa 2000000 -oneFile \
scaf_ \
-lift=/cluster/data/monDom1/jkStuff/chunkedScaffolds.lft
mkdir /cluster/bluearc/scratch/monDom1
cd /cluster/bluearc/scratch/monDom1
mkdir chunks
# Collect the split scaffolds into reasonably sized chunks
# This produces over 5000 files of size 500,000
# a few smaller, a few larger, 7627 total files
faSplit about /tmp/scaf_.fa 10000000 chunks/chunk_
# done with this temp file
rm /tmp/scaf_.fa
# a list for convenience
ls -1S chunks > chunk.list
cp -p /cluster/data/monDom1/jkStuff/chunkedScaffolds.lft .
cp -p /cluster/data/monDom1/monDom1.2bit .
# And create a chunks.2bit for the lavToAxt operations
ssh kkr1u00
cd /iscratch/i/monDom1/chunks
faToTwoBit chunks*.fa ../chunks.2bit
cd ..
twoBitInfo chunks.2bit chunks.info
/cluster/bin/iSync
# request rsync of scratch to the cluster
# PRODUCING GENSCAN PREDICTIONS (DONE - 2004-12-07 - Hiram)
# PRODUCING GENSCAN PREDICTIONS (REDONE - 2005-01-07 - Angie
# note added by kuhn)
# genscanPep table touched by alisq to satisfy joinerCheck 2005-03-32 - kuhn)
ssh kolossus
cd /scratch/monDom1
time maskOutFa maskedScaffolds.fa hard hardMaskedScaffolds.fa
# Clean out the /cluster/bluearc/monDom1/ files that were used
# during repeat masking, then, this copy
cp -p hardMaskedScaffolds.fa /cluster/bluearc/monDom1
cd /cluster/bluearc/monDom1
mkdir hardChunks
faSplit about hardMaskedScaffolds.fa 200000 hardChunks/chunk_
# This leaves some rather large pieces, will have to see if
# genscan can handle it. (2086 files)
ssh hgwdev
mkdir /cluster/data/monDom1/bed/genscan
cd /cluster/data/monDom1/bed/genscan
# Check out hg3rdParty/genscanlinux to get latest genscan:
cvs co hg3rdParty/genscanlinux
# Make 3 subdirectories for genscan to place output files
mkdir gtf pep subopt
ssh kk9
cd /cluster/data/monDom1/bed/genscan
ls -1S /cluster/bluearc/monDom1/hardChunks/chunk*.fa > chunks.list
cat << '_EOF_' > gsub
#LOOP
gsBig {check in line+ $(path1)} {check out line gtf/$(root1).gtf} -trans={check out line pep/$(root1).pep} -subopt={check out line subopt/$(root1).bed} -exe=hg3rdParty/genscanlinux/genscan -par=hg3rdParty/genscanlinux/HumanIso.smat -tmp=/tmp -window=2400000
#ENDLOOP
'_EOF_'
# << this line keeps emacs coloring happy
gensub2 chunks.list single gsub jobList
para create jobList
para try, check, push, check, ...
# Completed: 2086 of 2086 jobs
# CPU time in finished jobs: 130316s 2171.94m 36.20h 1.51d 0.004 y
# IO & Wait Time: 17821s 297.01m 4.95h 0.21d 0.001 y
# Average job time: 71s 1.18m 0.02h 0.00d
# Longest job: 2509s 41.82m 0.70h 0.03d
# Submission to last job: 2573s 42.88m 0.71h 0.03d
# Concatenate scaffold-level results:
ssh eieio
cd /cluster/data/monDom1/bed/genscan
cat gtf/*.gtf > genscan.gtf
cat subopt/*.bed > genscanSubopt.bed
cat pep/*.pep > genscan.pep
# Load into the database as so:
ssh hgwdev
cd /cluster/data/monDom1/bed/genscan
# Reloaded without -genePredExt 1/6/05:
ldHgGene -gtf monDom1 genscan genscan.gtf
hgPepPred monDom1 generic genscanPep genscan.pep
hgLoadBed monDom1 genscanSubopt genscanSubopt.bed
# 41090 sequences added
# Compare with other assemblies:
featureBits monDom1 genscan
# 47170795 bases of 3492108230 (1.351%) in intersection
featureBits monDom1 genscanSubopt
# 51805835 bases of 3492108230 (1.484%) in intersection
featureBits hg17 genscan
# 55323340 bases of 2866216770 (1.930%) in intersection
featureBits hg17 genscanSubopt
# 55986178 bases of 2866216770 (1.953%) in intersection
# Clean up
# Save a copy of the hardMask for later .zip prep:
ssh kksilo
cp -p /cluster/bluearc/monDom1/hardMaskedScaffolds.fa .
ssh eieio
cd /cluster/bluearc/monDom1
rm -fr hardChunks hardMaskedScaffolds.fa
# This kolossus clean up was actually left until much later.
# I found these copies here to be useful later on.
ssh kolossus
cd /scratch/monDom1
rm -f *
# BLAT SERVER SETUP (DONE - 2004-12-07 - Hiram)
hgsql -e 'INSERT INTO blatServers (db, host, port, isTrans, canPcr) \
VALUES("monDom1", "blat14", 17786, 1, 0);' \
-h localhost hgcentraltest
hgsql -e 'INSERT INTO blatServers (db, host, port, isTrans, canPcr) \
VALUES("monDom1", "blat14", 17787, 0, 0);' \
-h localhost hgcentraltest
# Make trackDb table so browser knows what tracks to expect (DONE - Hiram)
# AUTO UPDATE GENBANK MRNA RUN (DONE - 2004-12-15 - Hiram)
ssh hgwdev
# In the source tree, since this is a new organism, go to:
# kent/src/hg/makeDb/genbank/src/lib
# And edit gbGenome.c to add monDom in two places for two lists:
# static char *monDomNames[] = {"Monodelphis domestica", NULL};
# {"monDom", monDomNames, NULL},
# Genbank species list is kept in:
# /cluster/data/genbank/data/species.lst
# This one is already on there.
# And add an entry for the ooc file in:
# hg/makeDb/genbank/src/align/gbBlat
# In a clean source tree, go to hg/makeDb/genbank
# and run a 'make install-server'
cd /cluster/data/genbank
# This is a new organism, edit the etc/genbank.conf file and add:
# monDom1
monDom1.genome = /scratch/monDom1/monDom1.2bit
monDom1.refseq.mrna.native.load = no
monDom1.mondoTwoBitParts = 7000
monDom1.downloadDir = monDom1
monDom1.lift = no
monDom1.perChromTables = no
# Do the refseq's first, they are the quick ones
ssh eieio
cd /cluster/data/genbank
nice bin/gbAlignStep -srcDb=refseq -type=mrna -verbose=1 -initial monDom1
# var/build/logs/2004.12.08-19:09:59.monDom1.initalign.log
# it said: "nothing to align" - i.e. there are no native refseq
# mRNA's for this organism, add the line:
# monDom1.refseq.mrna.native.load = no
# to the configuration file
# We were having some trouble with eieio, running these on kksilo
# this time. This is best done on eieio.
# perform the genbank mrna run, this does only native and xeno mRNA's
ssh kksilo
nice bin/gbAlignStep -srcDb=genbank -type=mrna -verbose=1 -initial monDom1
# This run had non-linear behavior, in the early stages it
# appeared that it was going to take 4 days of run time, then in
# the last 7 hours it processed over 250,000 jobs:
# Completed: 357938 of 357938 jobs
# CPU time in finished jobs: 51924178s 865402.97m 14423.38h 600.97d 1.647 y
# IO & Wait Time: 3461243s 57687.38m 961.46h 40.06d 0.110 y
# Average job time: 155s 2.58m 0.04h 0.00d
# Longest job: 3885s 64.75m 1.08h 0.04d
# Submission to last job: 101926s 1698.77m 28.31h 1.18d
# Due to various interruptions, the cluster job was completed
# manually by going to the directory and pushing the jobs. To
# clean up after this, and finish the alignment steps:
ssh kksilo
cd /cluster/data/genbank
nice bin/gbAlignStep -continue=finish -srcDb=genbank -type=mrna \
-verbose=1 -initial monDom1
# 2 hour finish time
# Load the results from the above
ssh hgwdev
cd /cluster/data/genbank
nice bin/gbDbLoadStep -verbose=1 -drop -initialLoad monDom1
# 2 minute load time
# And finally, the long-lived EST alignments. Several cluster days
ssh kksilo
cd /cluster/data/genbank/work
mv initial.monDom1 initial.monDom1.genbank.mrna
cd /cluster/data/genbank
nice bin/gbAlignStep -srcDb=genbank -type=est -verbose=1 -initial monDom1
# logFile: var/build/logs/2004.12.15-13:16:59.monDom1.initalign.log
# It said there was nothing to align, must be no native est's for
# the Opossum, add this line to the genbank.conf file:
# monDom1.genbank.est.native.load = no
# not useful to do xeno ESTs, they are off by default
# GC5BASE (DONE - 2004-12-08 - Hiram)
ssh eieio
mkdir /cluster/data/monDom1/bed/gc5Base
cd /cluster/data/monDom1/bed/gc5Base
hgGcPercent -wigOut -doGaps -file=stdout -win=5 monDom1 \
/cluster/data/monDom1 | wigEncode stdin gc5Base.wig gc5Base.wib
# takes about 4 hours, need to improve hgGcPercent and how it reads
# five bases at a time from the sequence.
ssh hgwdev
cd /cluster/data/monDom1/bed/gc5Base
mkdir /gbdb/monDom1/wib
ln -s `pwd`/gc5Base.wib /gbdb/monDom1/wib
hgLoadWiggle -pathPrefix=/gbdb/monDom1/wib monDom1 gc5Base gc5Base.wig
# need to fix-up hgLoadWiggle so it will make a proper index on
# these scaffold assemblies:
hgsql monDom1 -e "drop index chrom on gc5Base;"
hgsql monDom1 -e "create index bin on gc5Base (chrom(14), bin);"
# this is fixed in hgLoadWiggle as of 2004-12-09
# QUALITY (DONE - 2004-12-14 - Hiram)
ssh eieio
mkdir /cluster/data/monDom1/bed/quality
cd /cluster/data/monDom1/bed/quality
# Create perl script to read the quality file and pipe to
# wigEncode:
./qaToWigEncode.pl ../../broad.mit.edu/assembly.agp.qual \
| wigEncode stdin quality.wig quality.wib
# Converted stdin, upper limit 68.00, lower limit 0.00
# Took 6 hours 40 minutes of processing time.
# Resulting files:
ls -og quality.wi?
# -rw-rw-r-- 1 377775898 Dec 13 21:40 quality.wig
# -rw-rw-r-- 1 3492108230 Dec 13 21:40 quality.wib
ssh hgwdev
cd /cluster/data/monDom1/bed/quality
ln -s `pwd`/quality.wib /gbdb/monDom1/wib
hgLoadWiggle -pathPrefix=/gbdb/monDom1/wib monDom1 quality quality.wig
# fix the index (this was supposed to be fixed, hgLoadWiggle
# needs to be looked at)
hgsql monDom1 -e "drop index chrom on quality;"
hgsql monDom1 -e "create index bin on quality (chrom(14), bin);"
# ALIGN TO HUMAN (DONE - 2004-12-12 - Hiram)
# NAMING CONVENTION: the zb.hg17 directory will be the Opossum
# aligned to the Human blastz result.
# The bz.hg17 directory will the swapped result, Human aligned
# to Opossum
ssh kk
mkdir -p /cluster/data/monDom1/bed/zb.hg17
cd /cluster/data/monDom1/bed/zb.hg17
ln -s /cluster/data/monDom1/bed/zb.hg17 /cluster/data/hg17/bed/blastz.monDom1
# Create DEF file for run of scaffolds vs. human chromosomes
cat << '_EOF_' > DEF
# human vs. M.domestica
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/i386:/cluster/home/angie/schwartzbin
ALIGN=blastz-run
BLASTZ=blastz
# Set up blastz parameters using parameters between (chicken and fish ?)
# but not abridging repeats since can't do that with scaffolds, and
# it's not very relevant at this evolutionary distance.
# Specific settings for chicken (per Webb email to Brian Raney)
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=10000
BLASTZ_K=2200
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
BLASTZ_ABRIDGE_REPEATS=0
# TARGET: Human
SEQ1_DIR=/scratch/hg/gs.18/build35/bothMaskedNibs
SEQ1_RMSK=
SEQ1_FLAG=
SEQ1_SMSK=
SEQ1_IN_CONTIGS=0
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Opossum
SEQ2_DIR=/iscratch/i/monDom1/chunks
SEQ2_RMSK=
SEQ2_FLAG=
SEQ2_SMSK=
SEQ2_IN_CONTIGS=1
SEQ2_CHUNK=10000000
SEQ2_LAP=0
BASE=/cluster/data/monDom1/bed/zb.hg17
DEF=$BASE/DEF
RAW=$BASE/raw
CDBDIR=$BASE
SEQ1_LEN=$BASE/S1.len
SEQ2_LEN=$BASE/S2.len
'_EOF_'
# << this line keeps emacs coloring happy
cp -p /cluster/data/hg17/chrom.sizes S1.len
cp -p /cluster/data/monDom1/chrom.sizes S2.len
# script copied over from /cluster/data/hg17/jkStuff/BlastZ_run0.sh
/cluster/data/monDom1/jkStuff/BlastZ_run0.sh
cd run.0
para try, push check, para push, para check....
# Completed: 112871 of 112871 jobs
# CPU time in finished jobs: 42061387s 701023.12m 11683.72h 486.82d 1.334 y
# IO & Wait Time: 578862s 9647.70m 160.80h 6.70d 0.018 y
# Average job time: 378s 6.30m 0.10h 0.00d
# Longest job: 3157s 52.62m 0.88h 0.04d
# Submission to last job: 81243s 1354.05m 22.57h 0.94d
# Second cluster run to convert the .out's to .lav's
# You do NOT want to run this on the big cluster. It brings
# the file server to its knees. Run this on the small cluster.
ssh kki
cd /cluster/data/monDom1/bed/zb.hg17
# script copied over from /cluster/data/hg17/jkStuff/BlastZ_run1.sh
/cluster/data/monDom1/jkStuff/BlastZ_run1.sh
cd run.1
para try, check, push, etc ...
# Completed: 341 of 341 jobs
# CPU time in finished jobs: 3525s 58.75m 0.98h 0.04d 0.000 y
# IO & Wait Time: 11315s 188.58m 3.14h 0.13d 0.000 y
# Average job time: 44s 0.73m 0.01h 0.00d
# Longest job: 181s 3.02m 0.05h 0.00d
# Submission to last job: 1524s 25.40m 0.42h 0.02d
# third run: lav -> axt (an I/O bound process, run only on fileserver)
# Use the chunks.2bit so lavToAxt will work. Then lift the resulting axt
ssh eieio
cd /cluster/data/monDom1/bed/zb.hg17
mkdir axtChrom pslChrom
cat << '_EOF_' > lav2axt.sh
#!/bin/sh
for L in lav/chr*
do
chr=${L/lav\//}
cd ${L}
echo -n "${L} -> ${chr}.axt working ... "
cat `ls -1 *.lav | sort -g` \
| lavToAxt stdin \
/iscratch/i/hg17/bothMaskedNibs \
/iscratch/i/monDom1/chunks.2bit stdout \
| axtSort stdin stdout | liftUp -axtQ ../../axtChrom/${chr}.axt \
/iscratch/i/monDom1/chunkedScaffolds.lft warn stdin
cd ../..
echo -n "${chr}.axt -> ${chr}.psl working ..."
axtToPsl axtChrom/${chr}.axt S1.len S2.len pslChrom/${chr}.psl
echo "DONE"
done
'_EOF_'
# << this line keeps emacs coloring happy
chmod +x lav2axt.sh
./lav2axt.sh
# load this blast result just to see how it looks
# It is not needed in the browser after the chains and nets are
# made, drop it later.
ssh hgwdev
cd /cluster/data/monDom1/bed/zb.hg17/pslChrom
hgLoadPsl -table=blastzMonDom1 -noTNameIx hg17 chrM.psl
bash # if you are normally csh
for I in `ls *.psl | grep -v chrM.psl`
do
/cluster/bin/i386/hgLoadPsl -table=blastzMonDom1 -append \
-noTNameIx hg17 ${I}
echo "done: ${I}"
done
exit # exit bash if you are normally csh
# Let's see what featureBits has to say about this data
featureBits hg17 blastzMonDom1
# 486216407 bases of 2866216770 (16.964%) in intersection
featureBits hg17 blastzMm5
# 1052077141 bases of 2866216770 (36.706%) in intersection
featureBits hg17 blastzRn3
# 1013003285 bases of 2866216770 (35.343%) in intersection
# CHAIN HUMAN BLASTZ (DONE - 2004-12-12 - Hiram)
# Run axtChain on file server, job is mostly I/O, cluster does not help
ssh eieio
cd /cluster/data/monDom1/bed/zb.hg17
mkdir -p axtChain/run1
cd axtChain/run1
mkdir out chainRaw
ls -1S /cluster/data/monDom1/bed/zb.hg17/axtChrom/*.axt > input.lst
cat << '_EOF_' > doChain.sh
#!/bin/sh
for chrAxtFile in `cat input.lst`
do
chr=`basename ${chrAxtFile}`
axtChain -scoreScheme=/cluster/data/blastz/HoxD55.q \
-linearGap=../../opossumHumanTuned.gap \
-minScore=5000 ${chrAxtFile} \
/cluster/data/hg17/nib \
/cluster/data/monDom1/monDom1.2bit chainRaw/${chr}.chain > out/${chr}.out
done
'_EOF_'
# << this line keeps emacs coloring happy
# Reuse gap penalties from chicken run.
cat << '_EOF_' | sed 's/ */\t/g' > ../../opossumHumanTuned.gap
tablesize 11
smallSize 111
position 1 2 3 11 111 2111 12111 32111 72111 152111 252111
qGap 325 360 400 450 600 1100 3600 7600 15600 31600 56600
tGap 325 360 400 450 600 1100 3600 7600 15600 31600 56600
bothGap 625 660 700 750 900 1400 4000 8000 16000 32000 57000
'_EOF_'
# << this line keeps emacs coloring happy
chmod +x doChain
./doChain
# On the file server weed the chains for repeats
ssh eieio
cd /cluster/data/monDom1/bed/zb.hg17/axtChain
mkdir chain
cd chainRaw
bash # if you are normally csh
for i in *.chain
do
chainAntiRepeat /cluster/data/hg17/nib \
/cluster/data/monDom1/monDom1.2bit $i ../chain/$i
done
exit # exit bash if you are normally csh
# now on the file server, sort chains. The chainMergeSort and chainSplit
# steps both take about 10 minutes
ssh kksilo
cd /cluster/data/monDom1/bed/zb.hg17/axtChain
chainMergeSort chain/*.chain > all.chain
chainSplit chain all.chain
rm run1/chain/*.chain
# Load chains into database. This takes an hour
ssh hgwdev
cd /cluster/data/monDom1/bed/zb.hg17/axtChain/chain
for i in *.chain
do
c=${i/.chain/}
echo "loading ${c}"
hgLoadChain hg17 ${c}_chainMonDom1 $i
done
featureBits hg17 chainMonDom1
# 2618859142 bases of 2866216770 (91.370%) in intersection
featureBits hg17 chainMonDom1Link
# 456069062 bases of 2866216770 (15.912%) in intersection
# create a copy of these things for hgdownload (2005-01-12 - Hiram)
ssh hgwdev
mkdir -p /usr/local/apache/htdocs/goldenPath/hg17/vsMonDom1
cd /cluster/data/monDom1/bed/zb.hg17/axtChain/chain
tar cvzf \
/usr/local/apache/htdocs/goldenPath/hg17/vsMonDom1/opossum.chain.tgz \
./chr*.chain
# NET OPOSSUM TO HUMAN BLASTZ (DONE - 2004-12-13 - Hiram)
ssh eieio
cd /cluster/data/monDom1/bed/zb.hg17/axtChain
chainPreNet all.chain ../S1.len ../S2.len stdout \
| chainNet stdin -minSpace=1 ../S1.len ../S2.len stdout /dev/null \
| netSyntenic stdin noClass.net
# 11 minutes
# Create axt files for the net as so:
netSplit noClass.net noClass
# 2 minutes
cd noClass
mkdir ../../axtNet
bash # if you are csh normally
for i in *.net
do
r=${i/.net/}
netToAxt $i ../chain/${r}.chain /cluster/data/hg17/nib \
/cluster/data/monDom1/monDom1.2bit ../../axtNet/${r}.axt
done
# 18 minutes
exit # back to csh if you are csh normally
cd ..
rm -r noClass
# Add classification info using db tables:
ssh hgwdev
cd /cluster/data/monDom1/bed/zb.hg17/axtChain
netClass -noAr noClass.net hg17 monDom1 human.net
# 41 minutes
# Load the nets into database
ssh hgwdev
cd /cluster/data/monDom1/bed/zb.hg17/axtChain
netFilter -minGap=10 human.net | hgLoadNet hg17 netMonDom1 stdin
# 4 minutes
# compare featureBits with some other assemblies
featureBits hg17 netMonDom1
# 2595812636 bases of 2866216770 (90.566%) in intersection
featureBits hg17 netBosTau1
# 2261085097 bases of 2866216770 (78.887%) in intersection
featureBits hg17 netRn3
# 2817656275 bases of 2866216770 (98.306%) in intersection
# create a copy of these things for hgdownload (2005-01-12 - Hiram)
ssh hgwdev
mkdir -p /usr/local/apache/htdocs/goldenPath/hg17/vsMonDom1
cd /cluster/data/monDom1/bed/zb.hg17/axtChain
cp -p human.net \
/usr/local/apache/htdocs/goldenPath/hg17/vsMonDom1/opossum.net
cd /usr/local/apache/htdocs/goldenPath/hg17/vsMonDom1
gzip opossum.net
# The axtNet files need to go too (2005-02-24 - Hiram)
ssh hgwdev
mkdir /usr/local/apache/htdocs/goldenPath/hg17/vsMonDom1/axtNet
cd /usr/local/apache/htdocs/goldenPath/hg17/vsMonDom1/axtNet
cp -p /cluster/data/monDom1/bed/zb.hg17/axtNet/*.axt .
gzip *
# index had NULL cardinality; analyze to fix (2005-01-18 - Heather)
hgsql hg17 -e "analyze table netMonDom1;"
# MAKE NET MAFS FOR MULTIZ (2004-12-24 kate)
ssh kksilo
cd /cluster/data/monDom1/bed/zb.hg17/axtNet
mkdir -p ../mafNet
cat *.axt | axtSort stdin stdout | axtToMaf -tSplit stdin \
-tPrefix=hg17. /cluster/data/hg17/chrom.sizes \
-qPrefix=monDom1. /cluster/data/monDom1/chrom.sizes ../mafNet
# SWAP EVERYTHING AND REMAKE NET FROM OPOSSUM REFERENCE
# Create HUMAN TO OPOSSUM chain, net
ssh eieio
cd /cluster/data/monDom1/bed
mkdir -p bz.hg17/axtChain
chainSwap zb.hg17/axtChain/all.chain bz.hg17/axtChain/all.chain
# 4 minutes
cd bz.hg17/axtChain
chainPreNet all.chain ../../zb.hg17/S2.len ../../zb.hg17/S1.len stdout \
| chainNet stdin -minSpace=1 ../../zb.hg17/S2.len \
../../zb.hg17/S1.len stdout /dev/null \
| netSyntenic stdin noClass.net
# 10 minutes
mkdir ../axtNet
netToAxt noClass.net all.chain /cluster/data/monDom1/monDom1.2bit \
/cluster/data/hg17/nib ../axtNet/all.axt
# 23 minutes
# Classify net and load into database
ssh hgwdev
cd /cluster/data/monDom1/bed/bz.hg17/axtChain
netClass -noAr noClass.net monDom1 hg17 monDom1.net
# 16 minutes
netFilter -minGap=10 monDom1.net | hgLoadNet monDom1 netHg17 stdin
# 6 minutes
# Load chains into database (takes about 4 hours).
hgLoadChain -tIndex monDom1 chainHg17 all.chain
# if you do a 'show index' SQL reports Cardinality NULL for the tName
# indexes although they are actually working. To make the
# cardinality show up:
hgsql monDom1 -e "analyze table chainHg17;"
hgsql monDom1 -e "analyze table chainHg17Link;"
# check featureBits (~ 30 minute commands)
featureBits monDom1 chainHg17Link
# 459086956 bases of 3492108230 (13.146%) in intersection
featureBits monDom1 chainHg17
# 3246158618 bases of 3492108230 (92.957%) in intersection
featureBits monDom1 netHg17
# 3223616544 bases of 3492108230 (92.311%) in intersection
# create a copy of for hgdownload (2005-01-13 - Hiram)
ssh hgwdev
mkdir -p /usr/local/apache/htdocs/goldenPath/monDom1/vsHg17
cd /cluster/data/monDom1/bed/bz.hg17/axtChain
gzip --stdout all.chain > \
/usr/local/apache/htdocs/goldenPath/monDom1/vsHg17/human.chain.gz
gzip --stdout monDom1.net > \
/usr/local/apache/htdocs/goldenPath/monDom1/vsHg17/human.net.gz
# BLASTZ HUMAN CLEAN UP (DONE - 2005-01-12 - Hiram)
ssh eieio
cd /cluster/data/monDom1/bed/zb.hg17/axtChain
nice rm -rf noClass.net chain/chain.tab chain/link.tab chain.run1 \
chainRaw out &
nice gzip chain/* &
nice gzip human.net &
nice gzip all.chain &
cd /cluster/data/monDom1/bed/zb.hg17
nice rm -rf raw &
nice gzip axtChrom/maf/* &
nice gzip axtChrom/*.axt &
nice gzip pslChrom/* lav/*/* &
cd /cluster/data/monDom1/bed/bz.hg17/axtChain
nice rm -f noClass.net link.tab chain.tab &
nice gzip all.chain monDom1.net &
cd ../axtNet
nice gzip all.axt &
# LOAD CPGISSLANDS (DONE - 2004-12-13 - Hiram)
ssh hgwdev
mkdir -p /cluster/data/monDom1/bed/cpgIsland
cd /cluster/data/monDom1/bed/cpgIsland
# Build software from Asif Chinwalla (achinwal@watson.wustl.edu)
cvs co hg3rdParty/cpgIslands
cd hg3rdParty/cpgIslands
make
mv cpglh.exe /cluster/data/monDom1/bed/cpgIsland/
ssh eieio
cd /cluster/data/monDom1/bed/cpgIsland
# AWK filter to transform cpglh output to bed +
cat << '_EOF_' > filter.awk
/* Input columns: */
/* chrom, start, end, len, CpG: cpgNum, perGc, cpg:gpc, observed:expected */
/* chr1\t 41776\t 42129\t 259\t CpG: 34\t 65.8\t 0.92\t 0.94 */
/* Output columns: */
/* chrom, start, end, name, length, cpgNum, gcNum, perCpg, perGc, obsExp */
/* chr1\t41775\t42129\tCpG: 34\t354\t34\t233\t19.2\t65.8\to0.94 */
{
$2 = $2 - 1;
width = $3 - $2;
printf("%s\t%d\t%s\t%s %s\t%s\t%s\t%0.0f\t%0.1f\t%s\t%s\n",
$1, $2, $3, $5,$6, width,
$6, width*$7*0.01, 100.0*2*$6/width, $7, $9);
}
'_EOF_'
# << this line keeps emacs coloring happy
rm -f cpgIsland.bed
cat /dev/null > cpgIsland.bed
bash # if you are normally csh
for scaffold in `awk '{print $1}' /cluster/data/monDom1/chrom.sizes`
do
cpgOutput=${scaffold}.cpg
echo "${scaffold}"
rm -f /tmp/${scaffold}.fa
twoBitToFa /cluster/data/monDom1/monDom1.2bit:${scaffold} \
/tmp/${scaffold}.fa
./cpglh.exe /tmp/${scaffold}.fa | awk -f filter.awk >> cpgIsland.bed
rm -f /tmp/${scaffold}.fa
done
exit # exit bash if you are normally csh
# load into database:
ssh hgwdev
cd /cluster/data/monDom1/bed/cpgIsland
hgLoadBed monDom1 cpgIslandExt -tab -noBin \
-sqlTable=$HOME/kent/src/hg/lib/cpgIslandExt.sql cpgIsland.bed
# Loaded 27004 elements of size 10
# compare to some other assemblies
featureBits monDom1 cpgIslandExt
# 18445119 bases of 3492108230 (0.528%) in intersection
featureBits hg17 cpgIslandExt
# 21245283 bases of 2866216770 (0.741%) in intersection
featureBits panTro1 cpgIsland
# 17540676 bases of 2733948177 (0.642%) in intersection
featureBits rn3 cpgIsland
# 9679881 bases of 2571104688 (0.376%) in intersection
featureBits xenTro1 cpgIslandExt
# 19279778 bases of 1381238994 (1.396%) in intersection
featureBits fr1 cpgIsland
# 25518433 bases of 315518167 (8.088%) in intersection
# ALIGN TO CHICKEN (DONE - 2004-12-14 - Hiram)
# NAMING CONVENTION: the zb.galGal2 directory will be the Opossum
# aligned to the Chicken blastz result.
# The bz.galGal2 directory will the swapped result, Chicken aligned
# to Opossum
ssh kk
mkdir -p /cluster/data/monDom1/bed/zb.galGal2
cd /cluster/data/monDom1/bed/zb.galGal2
# Create DEF file for run of scaffolds vs. chicken chromosomes
cat << '_EOF_' > DEF
# chicken vs. M.domestica
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/i386:/cluster/home/angie/schwartzbin
ALIGN=blastz-run
BLASTZ=blastz
# Set up blastz parameters using parameters between (chicken and fish ?)
# but not abridging repeats since can't do that with scaffolds, and
# it's not very relevant at this evolutionary distance.
# Specific settings for chicken (per Webb email to Brian Raney)
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=10000
BLASTZ_K=2200
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
BLASTZ_ABRIDGE_REPEATS=0
# TARGET: chicken
SEQ1_DIR=/iscratch/i/galGal2/nib
SEQ1_RMSK=
SEQ1_FLAG=
SEQ1_SMSK=
SEQ1_IN_CONTIGS=0
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Opossum
SEQ2_DIR=/iscratch/i/monDom1/chunks
SEQ2_RMSK=
SEQ2_FLAG=
SEQ2_SMSK=
SEQ2_IN_CONTIGS=1
SEQ2_CHUNK=10000000
SEQ2_LAP=0
BASE=/cluster/data/monDom1/bed/zb.galGal2
DEF=$BASE/DEF
RAW=$BASE/raw
CDBDIR=$BASE
SEQ1_LEN=$BASE/S1.len
SEQ2_LEN=$BASE/S2.len
'_EOF_'
# << this line keeps emacs coloring happy
cp -p /cluster/data/galGal2/chrom.sizes S1.len
cp -p /cluster/data/monDom1/chrom.sizes S2.len
# script copied over from /cluster/data/galGal2/jkStuff/BlastZ_run0.sh
/cluster/data/monDom1/jkStuff/BlastZ_run0.sh
cd run.0
para try, push check, para push, para check....
# Completed: 49981 of 49981 jobs
# CPU time in finished jobs: 24745519s 412425.31m 6873.76h 286.41d 0.785 y
# IO & Wait Time: 204758s 3412.64m 56.88h 2.37d 0.006 y
# Average job time: 499s 8.32m 0.14h 0.01d
# Longest job: 2857s 47.62m 0.79h 0.03d
# Submission to last job: 85888s 1431.47m 23.86h 0.99d
# Second cluster run to convert the .out's to .lav's
# You do NOT want to run this on the big cluster. It brings
# the file server to its knees. Run this on the small cluster.
ssh kki
cd /cluster/data/monDom1/bed/zb.galGal2
# script copied over from /cluster/data/galGal2/jkStuff/BlastZ_run1.sh
/cluster/data/monDom1/jkStuff/BlastZ_run1.sh
cd run.1
para try, check, push, etc ...
# Completed: 151 of 151 jobs
# CPU time in finished jobs: 1131s 18.85m 0.31h 0.01d 0.000 y
# IO & Wait Time: 2130s 35.50m 0.59h 0.02d 0.000 y
# Average job time: 22s 0.36m 0.01h 0.00d
# Longest job: 95s 1.58m 0.03h 0.00d
# Submission to last job: 490s 8.17m 0.14h 0.01d
# third run: lav -> axt (an I/O bound process, run only on fileserver)
# Use the chunks.2bit so lavToAxt will work. Then lift the resulting axt
ssh eieio
cd /cluster/data/monDom1/bed/zb.galGal2
mkdir axtChrom pslChrom
cat << '_EOF_' > lav2axt.sh
#!/bin/sh
for L in lav/chr*
do
chr=${L/lav\//}
cd ${L}
echo -n "${L} -> ${chr}.axt working ... "
cat `ls -1 *.lav | sort -g` \
| lavToAxt stdin \
/iscratch/i/galGal2/nib \
/iscratch/i/monDom1/chunks.2bit stdout \
| axtSort stdin stdout | liftUp -axtQ ../../axtChrom/${chr}.axt \
/iscratch/i/monDom1/chunkedScaffolds.lft warn stdin
cd ../..
echo -n "${chr}.axt -> ${chr}.psl working ..."
axtToPsl axtChrom/${chr}.axt S1.len S2.len pslChrom/${chr}.psl
echo "DONE"
done
'_EOF_'
# << this line keeps emacs coloring happy
chmod +x lav2axt.sh
./lav2axt.sh
# 12 minutes
# load this blast result just to see how it looks
# It is not needed in the browser after the chains and nets are
# made, drop it later.
ssh hgwdev
cd /cluster/data/monDom1/bed/zb.galGal2/pslChrom
hgLoadPsl -table=blastzMonDom1 -noTNameIx galGal2 chrM.psl
bash # if you are normally csh
for I in `ls *.psl | grep -v chrM.psl`
do
/cluster/bin/i386/hgLoadPsl -table=blastzMonDom1 -append \
-noTNameIx galGal2 ${I}
echo "done: ${I}"
done
exit # exit bash if you are normally csh
# 7 minutes
# Let's see what featureBits has to say about this data
featureBits galGal2 blastzMonDom1
102241492 bases of 1054197620 (9.699%) in intersection
featureBits galGal2 blastzBestHg16
97272784 bases of 1054197620 (9.227%) in intersection
# CHAIN CHICKEN BLASTZ (DONE - 2004-12-12 - Hiram)
# Run axtChain on file server, job is mostly I/O, cluster does not help
ssh eieio
cd /cluster/data/monDom1/bed/zb.galGal2
mkdir -p axtChain/run1
cd axtChain/run1
mkdir out chainRaw
ls -1S /cluster/data/monDom1/bed/zb.galGal2/axtChrom/*.axt > input.lst
cat << '_EOF_' > doChain.sh
#!/bin/sh
for chrAxtFile in `cat input.lst`
do
chr=`basename ${chrAxtFile}`
axtChain -scoreScheme=/cluster/data/blastz/HoxD55.q \
-linearGap=../../opossumChickenTuned.gap \
-minScore=5000 ${chrAxtFile} \
/cluster/data/galGal2/nib \
/cluster/data/monDom1/monDom1.2bit chainRaw/${chr}.chain > out/${chr}.out
done
'_EOF_'
# << this line keeps emacs coloring happy
# Reuse gap penalties from chicken run.
cat << '_EOF_' | sed 's/ */\t/g' > ../../opossumChickenTuned.gap
tablesize 11
smallSize 111
position 1 2 3 11 111 2111 12111 32111 72111 152111 252111
qGap 325 360 400 450 600 1100 3600 7600 15600 31600 56600
tGap 325 360 400 450 600 1100 3600 7600 15600 31600 56600
bothGap 625 660 700 750 900 1400 4000 8000 16000 32000 57000
'_EOF_'
# << this line keeps emacs coloring happy
chmod +x doChain
time ./doChain
# 42 minutes
# On the file server weed the chains for repeats
ssh eieio
cd /cluster/data/monDom1/bed/zb.galGal2/axtChain
mkdir chain
cd run1/chainRaw
bash # if you are normally csh
for i in *.chain
do
chainAntiRepeat /cluster/data/galGal2/nib \
/cluster/data/monDom1/monDom1.2bit $i ../../chain/$i
echo "$i done"
done
# 7 minutes
exit # exit bash if you are normally csh
# now on the file server, sort chains. The chainMergeSort and
# chainSplit steps each take less than a minute
ssh eieio
cd /cluster/data/monDom1/bed/zb.galGal2/axtChain
chainMergeSort chain/*.chain > all.chain
mv chain chainBeforeMerge
chainSplit chain all.chain
rm run1/chain/*.chain
# Load chains into database. This takes an hour
ssh hgwdev
cd /cluster/data/monDom1/bed/zb.galGal2/axtChain/chain
bash # if you are csh normally
for i in *.chain
do
c=${i/.chain/}
echo "loading ${c}"
hgLoadChain galGal2 ${c}_chainMonDom1 $i
done
# 4 minutes
exit # exit bash if you are csh normally
# Compare to some other organisms
featureBits galGal2 chainMonDom1
# 782212657 bases of 1054197620 (74.200%) in intersection
featureBits galGal2 chainHg17
# 921410328 bases of 1054197620 (87.404%) in intersection
featureBits galGal2 chainXenTro1
# 691628520 bases of 1054197620 (65.607%) in intersection
featureBits galGal2 chainMm5
# 846905330 bases of 1054197620 (80.336%) in intersection
# create a copy of these things for hgdownload (2005-01-12 - Hiram)
ssh hgwdev
mkdir -p /usr/local/apache/htdocs/goldenPath/galGal2/vsMonDom1
cd /cluster/data/monDom1/bed/zb.galGal2/axtChain/chain
tar cvzf \
/usr/local/apache/htdocs/goldenPath/galGal2/vsMonDom1/opossum.chain.tgz \
./chr*.chain
# NET OPOSSUM TO CHICKEN BLASTZ (DONE - 2004-12-13 - Hiram)
ssh eieio
cd /cluster/data/monDom1/bed/zb.galGal2/axtChain
chainPreNet all.chain ../S1.len ../S2.len stdout \
| chainNet stdin -minSpace=1 ../S1.len ../S2.len stdout /dev/null \
| netSyntenic stdin noClass.net
# 2 minutes
# Create axt files for the net as so:
netSplit noClass.net noClass
# 1 minute
cd noClass
mkdir ../../axtNet
bash # if you are csh normally
for i in *.net
do
r=${i/.net/}
netToAxt $i ../chain/${r}.chain /cluster/data/galGal2/nib \
/cluster/data/monDom1/monDom1.2bit ../../axtNet/${r}.axt
done
# 6 minutes
exit # back to csh if you are csh normally
cd ..
# Add classification info using db tables:
ssh hgwdev
cd /cluster/data/monDom1/bed/zb.galGal2/axtChain
netClass -noAr noClass.net galGal2 monDom1 chicken.net
# 56 minutes
# Load the nets into database
ssh hgwdev
cd /cluster/data/monDom1/bed/zb.galGal2/axtChain
netFilter -minGap=10 chicken.net | hgLoadNet galGal2 netMonDom1 stdin
# 4 minutes
# compare featureBits with some other assemblies
featureBits galGal2 netMonDom1
# 771045333 bases of 1054197620 (73.140%) in intersection
featureBits galGal2 netHg17
# 908340945 bases of 1054197620 (86.164%) in intersection
featureBits galGal2 netMm5
# 835277984 bases of 1054197620 (79.234%) in intersection
featureBits galGal2 netXenTro1
# 674438463 bases of 1054197620 (63.976%) in intersection
# create a copy of these things for hgdownload (2005-01-12 - Hiram)
ssh hgwdev
mkdir -p /usr/local/apache/htdocs/goldenPath/galGal2/vsMonDom1
cd /cluster/data/monDom1/bed/zb.galGal2/axtChain
cp -p chicken.net \
/usr/local/apache/htdocs/goldenPath/galGal2/vsMonDom1/opossum.net
cd /usr/local/apache/htdocs/goldenPath/galGal2/vsMonDom1
gzip opossum.net
# SWAP EVERYTHING AND REMAKE NET FROM OPOSSUM REFERENCE
# Create CHICKEN TO OPOSSUM chain, net
ssh eieio
cd /cluster/data/monDom1/bed
mkdir -p bz.galGal2/axtChain
chainSwap zb.galGal2/axtChain/all.chain bz.galGal2/axtChain/all.chain
# 1 minute
cd bz.galGal2/axtChain
chainPreNet all.chain ../../zb.galGal2/S2.len \
../../zb.galGal2/S1.len stdout \
| chainNet stdin -minSpace=1 ../../zb.galGal2/S2.len \
../../zb.galGal2/S1.len stdout /dev/null \
| netSyntenic stdin noClass.net
# 2 minutes
mkdir ../axtNet
netToAxt noClass.net all.chain /cluster/data/monDom1/monDom1.2bit \
/cluster/data/galGal2/nib ../axtNet/all.axt
# 6 minutes
# Classify net and load into database
ssh hgwdev
cd /cluster/data/monDom1/bed/bz.galGal2/axtChain
netClass -noAr noClass.net monDom1 galGal2 monDom1.net
# 31 minutes
netFilter -minGap=10 monDom1.net | hgLoadNet monDom1 netGalGal2 stdin
# 4 minutes
# Load chains into database (takes about 4 hours).
hgLoadChain -tIndex monDom1 chainGalGal2 all.chain
# if you do a 'show index' SQL reports Cardinality NULL for the tName
# indexes although they are actually working. To make the
# cardinality show up:
hgsql monDom1 -e "analyze table chainGalGal2;"
hgsql monDom1 -e "analyze table chainGalGal2Link;"
featureBits monDom1 chainGalGal2Link
# 110933245 bases of 3492108230 (3.177%) in intersection
featureBits monDom1 chainGalGal2
# 2437546469 bases of 3492108230 (69.802%) in intersection
# create a copy of for hgdownload (2005-01-13 - Hiram)
ssh hgwdev
mkdir -p /usr/local/apache/htdocs/goldenPath/monDom1/vsGalGal2
cd /cluster/data/monDom1/bed/bz.galGal2/axtChain
gzip --stdout all.chain > \
/usr/local/apache/htdocs/goldenPath/monDom1/vsGalGal2/chicken.chain.gz
gzip --stdout monDom1.net > \
/usr/local/apache/htdocs/goldenPath/monDom1/vsGalGal2/chicken.net.gz
# BLASTZ CHICKEN CLEAN UP (DONE - 2005-01-12 - Hiram)
ssh eieio
cd /cluster/data/monDom1/bed/zb.galGal2/axtChain
nice rm -rf run1/out run1/chainRaw chain/chain.tab chain/link.tab \
noClass.net chainBeforeMerge noClass &
nice gzip chain/* &
nice gzip chicken.net &
nice gzip all.chain &
cd /cluster/data/monDom1/bed/zb.galGal2
nice rm -rf raw &
nice gzip axtNet/*.axt &
nice gzip axtChrom/*.axt &
nice gzip pslChrom/* lav/*/* &
cd /cluster/data/monDom1/bed/bz.galGal2/axtChain
nice rm -f noClass.net link.tab chain.tab &
nice gzip all.chain monDom1.net &
cd ../axtNet
nice gzip all.axt &
# ALIGN TO MOUSE (DONE - 2004-12-16 - Hiram)
# NAMING CONVENTION: the zb.mm5 directory will be the Opossum
# aligned to the Mouse blastz result.
# The bz.mm5 directory will the swapped result, Mouse aligned
# to Opossum
ssh kk
mkdir -p /cluster/data/monDom1/bed/zb.mm5
cd /cluster/data/monDom1/bed/zb.mm5
# Create DEF file for run of scaffolds vs. mouse chromosomes
cat << '_EOF_' > DEF
# mouse vs. M.domestica
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/i386:/cluster/home/angie/schwartzbin
ALIGN=blastz-run
BLASTZ=blastz
# Set up blastz parameters using parameters between (chicken and fish ?)
# but not abridging repeats since can't do that with scaffolds, and
# it's not very relevant at this evolutionary distance.
# Specific settings for chicken (per Webb email to Brian Raney)
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=10000
BLASTZ_K=2200
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
BLASTZ_ABRIDGE_REPEATS=0
# TARGET: mouse
SEQ1_DIR=/scratch/mus/mm5/softNib
SEQ1_RMSK=
SEQ1_FLAG=
SEQ1_SMSK=
SEQ1_IN_CONTIGS=0
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Opossum
SEQ2_DIR=/iscratch/i/monDom1/chunks
SEQ2_RMSK=
SEQ2_FLAG=
SEQ2_SMSK=
SEQ2_IN_CONTIGS=1
SEQ2_CHUNK=10000000
SEQ2_LAP=0
BASE=/cluster/data/monDom1/bed/zb.mm5
DEF=$BASE/DEF
RAW=$BASE/raw
CDBDIR=$BASE
SEQ1_LEN=$BASE/S1.len
SEQ2_LEN=$BASE/S2.len
'_EOF_'
# << this line keeps emacs coloring happy
cp -p /cluster/data/mm5/chrom.sizes S1.len
cp -p /cluster/data/monDom1/chrom.sizes S2.len
# script copied over from /cluster/data/mm5/jkStuff/BlastZ_run0.sh
/cluster/data/monDom1/jkStuff/BlastZ_run0.sh
cd run.0
para try, push check, para push, para check....
# Completed: 112871 of 112871 jobs
# CPU time in finished jobs: 36169619s 602826.99m 10047.12h 418.63d 1.147 y
# IO & Wait Time: 452242s 7537.36m 125.62h 5.23d 0.014 y
# Average job time: 324s 5.41m 0.09h 0.00d
# Longest job: 2755s 45.92m 0.77h 0.03d
# Submission to last job: 153519s 2558.65m 42.64h 1.78d
# extra delay in this run due to disk problems in the middle and
# thus being interrupted
# Second cluster run to convert the .out's to .lav's
# You do NOT want to run this on the big cluster. It brings
# the file server to its knees. Run this on the small cluster.
ssh kki
cd /cluster/data/monDom1/bed/zb.mm5
# script copied over from /cluster/data/mm5/jkStuff/BlastZ_run1.sh
/cluster/data/monDom1/jkStuff/BlastZ_run1.sh
cd run.1
para try, check, push, etc ...
# Completed: 341 of 341 jobs
# CPU time in finished jobs: 3131s 52.19m 0.87h 0.04d 0.000 y
# IO & Wait Time: 9815s 163.58m 2.73h 0.11d 0.000 y
# Average job time: 38s 0.63m 0.01h 0.00d
# Longest job: 223s 3.72m 0.06h 0.00d
# Submission to last job: 1267s 21.12m 0.35h 0.01d
# third run: lav -> axt (an I/O bound process, run only on fileserver)
# Use the chunks.2bit so lavToAxt will work. Then lift the resulting axt
ssh eieio
cd /cluster/data/monDom1/bed/zb.mm5
mkdir axtChrom pslChrom
cat << '_EOF_' > lav2axt.sh
#!/bin/sh
for L in lav/chr*
do
chr=${L/lav\//}
cd ${L}
echo -n "${L} -> ${chr}.axt working ... "
cat `ls -1 *.lav | sort -g` \
| lavToAxt stdin \
/scratch/mus/mm5/softNib \
/iscratch/i/monDom1/chunks.2bit stdout \
| axtSort stdin stdout | liftUp -axtQ ../../axtChrom/${chr}.axt \
/iscratch/i/monDom1/chunkedScaffolds.lft warn stdin
cd ../..
echo -n "${chr}.axt -> ${chr}.psl working ..."
axtToPsl axtChrom/${chr}.axt S1.len S2.len pslChrom/${chr}.psl
echo "DONE"
done
'_EOF_'
# << this line keeps emacs coloring happy
chmod +x lav2axt.sh
./lav2axt.sh
# 38 minutes
# load this blast result just to see how it looks
# It is not needed in the browser after the chains and nets are
# made, drop it later.
ssh hgwdev
cd /cluster/data/monDom1/bed/zb.mm5/pslChrom
hgLoadPsl -table=blastzMonDom1 -noTNameIx mm5 chrM.psl
bash # if you are csh normally
for I in `ls *.psl | grep -v chrM.psl`
do
/cluster/bin/i386/hgLoadPsl -table=blastzMonDom1 -append \
-noTNameIx mm5 ${I}
echo "done: ${I}"
done
# 12 minutes
exit # back to csh if you are csh normally
# Let's see what featureBits has to say about this data
featureBits mm5 blastzMonDom1
263272041 bases of 2615483787 (10.066%) in intersection
# CHAIN MOUSE BLASTZ (TBD)
# Run axtChain on file server, job is mostly I/O, cluster does not help
ssh eieio
cd /cluster/data/monDom1/bed/zb.mm5
mkdir -p axtChain/run1
cd axtChain/run1
mkdir out chainRaw
ls -1S /cluster/data/monDom1/bed/zb.mm5/axtChrom/*.axt > input.lst
cat << '_EOF_' > doChain.sh
#!/bin/sh
for chrAxtFile in `cat input.lst`
do
chr=`basename ${chrAxtFile}`
axtChain -scoreScheme=/cluster/data/blastz/HoxD55.q \
-linearGap=../../opossumMouseTuned.gap \
-minScore=5000 ${chrAxtFile} \
/scratch/mus/mm5/softNib \
/cluster/data/monDom1/monDom1.2bit chainRaw/${chr}.chain > out/${chr}.out
done
'_EOF_'
# << this line keeps emacs coloring happy
# Reuse gap penalties from mouse run.
cat << '_EOF_' | sed 's/ */\t/g' > ../../opossumMouseTuned.gap
tablesize 11
smallSize 111
position 1 2 3 11 111 2111 12111 32111 72111 152111 252111
qGap 325 360 400 450 600 1100 3600 7600 15600 31600 56600
tGap 325 360 400 450 600 1100 3600 7600 15600 31600 56600
bothGap 625 660 700 750 900 1400 4000 8000 16000 32000 57000
'_EOF_'
# << this line keeps emacs coloring happy
chmod +x doChain
time ./doChain
# 95 minutes
# On the file server weed the chains for repeats
ssh eieio
cd /cluster/data/monDom1/bed/zb.mm5/axtChain
mkdir chain
cd run1/chainRaw
bash # if you are normally csh
for i in *.chain
do
chainAntiRepeat /scratch/mus/mm5/softNib \
/cluster/data/monDom1/monDom1.2bit $i ../../chain/$i
echo "$i done"
done
# 34 minutes
exit # exit bash if you are normally csh
# now on the file server, sort chains.
# chainSplit steps each take less than a minute
ssh eieio
cd /cluster/data/monDom1/bed/zb.mm5/axtChain
chainMergeSort chain/*.chain > all.chain
# 3 minutes
mv chain chainBeforeMerge
chainSplit chain all.chain
# 3 minutes
rm run1/chain/*.chain
# Load chains into database. This takes an hour
ssh hgwdev
cd /cluster/data/monDom1/bed/zb.mm5/axtChain/chain
bash # if you are csh normally
for i in *.chain
do
c=${i/.chain/}
echo "loading ${c}"
hgLoadChain mm5 ${c}_chainMonDom1 $i
done
# 20 minutes
exit # exit bash if you are csh normally
# Compare to some other organisms
featureBits mm5 chainMonDom1
# 2121448151 bases of 2615483787 (81.111%) in intersection
featureBits -countGaps mm5 chainRn3
# 2646682349 bases of 3164952073 (83.625%) in intersection
featureBits mm5 chainRn3
# 2646682349 bases of 2615483787 (101.193%) in intersection
featureBits mm5 chainHg17
# 2507720521 bases of 2615483787 (95.880%) in intersection
featureBits mm5 chainXenTro1
# 1078618413 bases of 2615483787 (41.240%) in intersection
# create a copy of these things for hgdownload (2005-01-12 - Hiram)
ssh hgwdev
mkdir -p /usr/local/apache/htdocs/goldenPath/mm5/vsMonDom1
cd /cluster/data/monDom1/bed/zb.mm5/axtChain/chain
tar cvzf \
/usr/local/apache/htdocs/goldenPath/mm5/vsMonDom1/opossum.chain.tgz \
./chr*.chain
# NET OPOSSUM TO MOUSE BLASTZ (TBD)
ssh eieio
cd /cluster/data/monDom1/bed/zb.mm5/axtChain
chainPreNet all.chain ../S1.len ../S2.len stdout \
| chainNet stdin -minSpace=1 ../S1.len ../S2.len stdout /dev/null \
| netSyntenic stdin noClass.net
# 6 minutes
# Create axt files for the net as so:
netSplit noClass.net noClass
# 1 minute
cd noClass
mkdir ../../axtNet
bash # if you are csh normally
for i in *.net
do
r=${i/.net/}
netToAxt $i ../chain/${r}.chain /scratch/mus/mm5/softNib \
/cluster/data/monDom1/monDom1.2bit ../../axtNet/${r}.axt
done
# 10 minutes
exit # back to csh if you are csh normally
cd ..
rm -r noClass
# Add classification info using db tables:
ssh hgwdev
cd /cluster/data/monDom1/bed/zb.mm5/axtChain
netClass -noAr noClass.net mm5 monDom1 mouse.net
# 41 minutes
# Load the nets into database
ssh hgwdev
cd /cluster/data/monDom1/bed/zb.mm5/axtChain
netFilter -minGap=10 mouse.net | hgLoadNet mm5 netMonDom1 stdin
# 3 minutes
# compare featureBits with some other assemblies
featureBits mm5 netMonDom1
# 2094316044 bases of 2615483787 (80.074%) in intersection
featureBits mm5 netRn3
# 2638255333 bases of 2615483787 (100.871%) in intersection
featureBits -countgaps mm5 netRn3
# 2638255333 bases of 3164952073 (83.358%) in intersection
featureBits mm5 netHg17
# 2504056038 bases of 2615483787 (95.740%) in intersection
featureBits mm5 netCanFam1
# 2456773441 bases of 2615483787 (93.932%) in intersection
featureBits mm5 netGalGal2
# 1958796258 bases of 2615483787 (74.892%) in intersection
featureBits mm5 netXenTro1
# 1042210258 bases of 2615483787 (39.848%) in intersection
featureBits mm5 netTetNig1
# 618111072 bases of 2615483787 (23.633%) in intersection
featureBits mm5 netDanRer2
# 560950601 bases of 2615483787 (21.447%) in intersection
# create a copy of these things for hgdownload (2005-01-12 - Hiram)
ssh hgwdev
mkdir -p /usr/local/apache/htdocs/goldenPath/mm5/vsMonDom1
cd /cluster/data/monDom1/bed/zb.mm5/axtChain
cp -p mouse.net \
/usr/local/apache/htdocs/goldenPath/mm5/vsMonDom1/opossum.net
cd /usr/local/apache/htdocs/goldenPath/mm5/vsMonDom1
gzip opossum.net
# SWAP EVERYTHING AND REMAKE NET FROM OPOSSUM REFERENCE
# Create MOUSE TO OPOSSUM chain, net
ssh eieio
cd /cluster/data/monDom1/bed
mkdir -p bz.mm5/axtChain
chainSwap zb.mm5/axtChain/all.chain bz.mm5/axtChain/all.chain
# 2 minutes
cd bz.mm5/axtChain
chainPreNet all.chain ../../zb.mm5/S2.len \
../../zb.mm5/S1.len stdout \
| chainNet stdin -minSpace=1 ../../zb.mm5/S2.len \
../../zb.mm5/S1.len stdout /dev/null \
| netSyntenic stdin noClass.net
# 6 minutes
mkdir ../axtNet
netToAxt noClass.net all.chain /cluster/data/monDom1/monDom1.2bit \
/scratch/mus/mm5/softNib ../axtNet/all.axt
# 9 minutes
# Classify net and load into database
ssh hgwdev
cd /cluster/data/monDom1/bed/bz.mm5/axtChain
netClass -noAr noClass.net monDom1 mm5 monDom1.net
# 13 minutes
netFilter -minGap=10 monDom1.net | hgLoadNet monDom1 netMm5 stdin
# 3 minutes
# Load chains into database (takes about 4 hours).
hgLoadChain -tIndex monDom1 chainMm5 all.chain
# if you do a 'show index' SQL reports Cardinality NULL for the tName
# indexes although they are actually working. To make the
# cardinality show up:
hgsql monDom1 -e "analyze table chainMm5;"
hgsql monDom1 -e "analyze table chainMm5Link;"
featureBits monDom1 chainMm5Link
# 249594220 bases of 3492108230 (7.147%) in intersection
featureBits monDom1 chainMm5
# 2913812625 bases of 3492108230 (83.440%) in intersection
featureBits monDom1 netMm5
# 2889580530 bases of 3492108230 (82.746%) in intersection
# create a copy of for hgdownload (2005-01-13 - Hiram)
ssh hgwdev
mkdir -p /usr/local/apache/htdocs/goldenPath/monDom1/vsMm5
cd /cluster/data/monDom1/bed/bz.mm5/axtChain
gzip --stdout all.chain > \
/usr/local/apache/htdocs/goldenPath/monDom1/vsMm5/mouse.chain.gz
gzip --stdout monDom1.net > \
/usr/local/apache/htdocs/goldenPath/monDom1/vsMm5/mouse.net.gz
# BLASTZ MOUSE CLEAN UP (DONE - 2005-01-12 - Hiram)
ssh eieio
cd /cluster/data/monDom1/bed/zb.mm5/axtChain
nice rm -rf run1/out run1/chainRaw chain/chain.tab chain/link.tab \
noClass.net chainBeforeMerge noClass &
nice gzip chain/* &
nice gzip mouse.net &
nice gzip all.chain &
cd /cluster/data/monDom1/bed/zb.mm5
nice rm -rf raw &
nice gzip axtNet/*.axt &
nice gzip axtChrom/*.axt &
nice gzip pslChrom/* lav/*/* &
cd /cluster/data/monDom1/bed/bz.mm5/axtChain
nice rm -f noClass.net link.tab chain.tab &
nice gzip all.chain monDom1.net &
cd ../axtNet
nice gzip all.axt &
# MAKE Human Proteins track (hg17 DONE 2004-12-15 braney)
ssh eieio
mkdir -p /cluster/data/monDom1/blastDb
cd /cluster/data/monDom1/blastDb
faSplit sequence ../broad.mit.edu/scaffolds.fa.unmasked 500 x
for i in x*
do
formatdb -i $i -p F
done
rm *.log *.fa
ssh kkr1u00
mkdir -p /iscratch/i/monDom1/blastDb
cp /cluster/data/monDom1/blastDb/* /iscratch/i/monDom1/blastDb
(iSync) > sync.out
exit # back to eieio
mkdir -p /cluster/data/monDom1/bed/tblastn.hg17KG
cd /cluster/data/monDom1/bed/tblastn.hg17KG
ls -1S /iscratch/i/monDom1/blastDb/*.nsq | sed "s/\.nsq//" > query.lst
calc `wc /cluster/data/hg17/bed/blat.hg17KG/hg17KG.psl | awk "{print \\\$1}"`/\(350000/`wc query.lst | awk "{print \\\$1}"`\)
# 42156/(350000/499) = 60.102411
mkdir -p /cluster/bluearc/monDom1/bed/tblastn.hg17KG/kgfa
split -l 65 /cluster/data/hg17/bed/blat.hg17KG/hg17KG.psl /cluster/bluearc/monDom1/bed/tblastn.hg17KG/kgfa/kg
ln -s /cluster/bluearc/monDom1/bed/tblastn.hg17KG/kgfa kgfa
cd kgfa
for i in *; do pslxToFa $i $i.fa; rm $i; done
cd ..
ls -1S kgfa/*.fa > kg.lst
mkdir blastOut
for i in `cat kg.lst`; do mkdir blastOut/`basename $i .fa`; done
tcsh
cat << '_EOF_' > blastGsub
#LOOP
blastSome $(path1) {check in line $(path2)} {check out exists blastOut/$(root2)/q.$(root1).psl }
#ENDLOOP
'_EOF_'
cat << '_EOF_' > blastSome
#!/bin/sh
BLASTMAT=/iscratch/i/blast/data
export BLASTMAT
g=`basename $2`
f=/tmp/`basename $3`.$g
for eVal in 0.01 0.001 0.0001 0.00001 0.000001 1E-09 1E-11
do
if /scratch/blast/blastall -M BLOSUM80 -m 0 -F no -e $eVal -p tblastn -d $1 -i $2 -o $f.8
then
mv $f.8 $f.1
break;
fi
done
if test -f $f.1
then
if /cluster/bin/i386/blastToPsl $f.1 $f.2
then
liftUp -nosort -type=".psl" -pslQ -nohead $3.tmp /cluster/data/hg17/bed/blat.hg17KG/protein.lft warn $f.2
if pslCheck -prot $3.tmp
then
mv $3.tmp $3
rm -f $f.1 $f.2
fi
exit 0
fi
fi
rm -f $f.1 $f.2 $3.tmp $f.8
exit 1
'_EOF_'
chmod +x blastSome
gensub2 query.lst kg.lst blastGsub blastSpec
ssh kk
cd /cluster/data/monDom1/bed/tblastn.hg17KG
para create blastSpec
para push
# Completed: 323851 of 323851 jobs
# CPU time in finished jobs: 113015986s 1883599.76m 31393.33h 1308.06d 3.584 y
# IO & Wait Time: 2732896s 45548.27m 759.14h 31.63d 0.087 y
# Average job time: 357s 5.96m 0.10h 0.00d
# Longest job: 4911s 81.85m 1.36h 0.06d
# Submission to last job: 274144s 4569.07m 76.15h 3.17d
cat << '_EOF_' > chainGsub
#LOOP
chainSome $(path1)
#ENDLOOP
'_EOF_'
cat << '_EOF_' > chainSome
(cd $1; cat q.*.psl | simpleChain -prot -outPsl -maxGap=150000 stdin ../c.`basename $1`.psl)
'_EOF_'
chmod +x chainSome
ls -1dS `pwd`/blastOut/kg?? > chain.lst
gensub2 chain.lst single chainGsub chainSpec
ssh kki
cd /cluster/data/monDom1/bed/tblastn.hg17KG
para create chainSpec
para push
# Completed: 649 of 649 jobs
# CPU time in finished jobs: 4726s 78.76m 1.31h 0.05d 0.000 y
# IO & Wait Time: 76824s 1280.41m 21.34h 0.89d 0.002 y
# Average job time: 126s 2.09m 0.03h 0.00d
# Longest job: 312s 5.20m 0.09h 0.00d
# Submission to last job: 7487s 124.78m 2.08h 0.09d
exit
# back to eieio
cd /cluster/data/monDom1/bed/tblastn.hg17KG/blastOut
for i in kg??
do
awk "(\$13 - \$12)/\$11 > 0.6 {print}" c.$i.psl > c60.$i.psl
sort -rn c60.$i.psl | pslUniq stdin u.$i.psl
awk "((\$1 / \$11) ) > 0.60 { print }" c60.$i.psl > m60.$i.psl
echo $i
done
cat u.*.psl m60* | sort -T /tmp -k 14,14 -k 17,17n -k 17,17n | uniq > /cluster/data/monDom1/bed/tblastn.hg17KG/blastHg17KG.psl
cd ..
ssh hgwdev
cd /cluster/data/monDom1/bed/tblastn.hg17KG
hgLoadPsl monDom1 blastHg17KG.psl
# back to eieio
rm -rf blastOut
# End tblastn
# MAKE Human Known Gene mRNAs Mapped by BLATZ track (hg17 DONE 2005-1-14 JK)
# Create working directory and extract known gene mrna from database
ssh hgwdev
cd /cluster/data/monDom1/bed
mkdir blatz.hg17KG
cd blatz.hg17KG
pepPredToFa hg17 knownGeneMrna known.fa
# Split known genes into pieces and load on /iscratch/i temp device
ssh kkr1u00
cd /iscratch/i/hg17
mkdir knownRna
cd knownRna
faSplit sequence /cluster/data/monDom1/bed/blatz.hg17KG/known.fa 9 known
iSync
rm /cluster/data/monDom1/bed/blatz.hg17KG/known.fa
# Do parasol job to align via blatz
ssh kk
cd /cluster/data/monDom1/bed/blatz.hg17KG
mkdir run0
cd run0
mkdir psl
awk '{print $1;}' /cluster/data/monDom1/chrom.sizes > genome.lst
ls -1 /iscratch/i/hg17/knownRna/*.fa > mrna.lst
cat << '_EOF_' > gsub
#LOOP
blatz -rna -minScore=3000 /scratch/monDom1/monDom1.2bit:$(file1) $(path2) -out=psl {check out line psl/$(root1)_$(root2).psl}
#ENDLOOP
'_EOF_'
# << this line keeps emacs coloring happy
# Note, I recommend -minScore=6000 next time.
gensub2 genome.lst mrna.lst gsub spec
para create spec
# Then do usual para try/para push/para time
Completed: 174132 of 174132 jobs
CPU time in finished jobs: 9399737s 156662.29m 2611.04h 108.79d 0.298 y
IO & Wait Time: 2143618s 35726.96m 595.45h 24.81d 0.068 y
Average job time: 66s 1.10m 0.02h 0.00d
Longest job: 2437s 40.62m 0.68h 0.03d
Submission to last job: 13892s 231.53m 3.86h 0.16d
# Do sort and near-besting on file server
ssh eieio
cd /cluster/data/monDom1/bed/blatz.hg17KG/run0
catDir psl | pslFilter -minScore=100 -minAli=255 -noHead stdin stdout | sort -k 10 > ../filtered.psl
cd ..
pslReps filtered.psl nearBest.psl /dev/null -nohead -minAli=0.5 -nearTop=0.01 -minCover=0.255
sort -k 14 nearBest.psl > blatzHg17KG.psl
# Clean up
rm -r run0/psl
# Load into database
ssh hgwdev
cd /cluster/data/monDom1/bed/blatz.hg17KG
hgLoadPsl monDom1 blatzHg17KG.psl
# ALIGN TO ZEBRAFISH (WORKING - 2004-12-17 - Hiram)
# NAMING CONVENTION: the zb.danRer2 directory will be the Opossum
# aligned to the Zebrafish blastz result.
# The bz.danRer2 directory will the swapped result, Zebrafish aligned
# to Opossum
ssh kk
mkdir -p /cluster/data/monDom1/bed/zb.danRer2
cd /cluster/data/monDom1/bed/zb.danRer2
# Create DEF file for run of scaffolds vs. zebrafish chromosomes
cat << '_EOF_' > DEF
# zebrafish vs. M.domestica
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/i386:/cluster/home/angie/schwartzbin
ALIGN=blastz-run
BLASTZ=blastz
# Set up blastz parameters using parameters between (chicken and fish ?)
# but not abridging repeats since can't do that with scaffolds, and
# it's not very relevant at this evolutionary distance.
# Specific settings for chicken (per Webb email to Brian Raney)
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=10000
BLASTZ_K=2200
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
BLASTZ_ABRIDGE_REPEATS=0
# TARGET: zebrafish
SEQ1_DIR=/iscratch/i/danRer2/nib
SEQ1_RMSK=
SEQ1_FLAG=
SEQ1_SMSK=
SEQ1_IN_CONTIGS=0
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Opossum
SEQ2_DIR=/iscratch/i/monDom1/chunks
SEQ2_RMSK=
SEQ2_FLAG=
SEQ2_SMSK=
SEQ2_IN_CONTIGS=1
SEQ2_CHUNK=10000000
SEQ2_LAP=0
BASE=/cluster/data/monDom1/bed/zb.danRer2
DEF=$BASE/DEF
RAW=$BASE/raw
CDBDIR=$BASE
SEQ1_LEN=$BASE/S1.len
SEQ2_LEN=$BASE/S2.len
'_EOF_'
# << this line keeps emacs coloring happy
cp -p /cluster/data/danRer2/chrom.sizes S1.len
cp -p /cluster/data/monDom1/chrom.sizes S2.len
# script copied over from /cluster/data/danRer2/jkStuff/BlastZ_run0.sh
/cluster/data/monDom1/jkStuff/BlastZ_run0.sh
cd run.0
para try, push check, para push, para check....
# Completed: 57263 of 57263 jobs
# CPU time in finished jobs: 20015224s 333587.07m 5559.78h 231.66d 0.635 y
# IO & Wait Time: 364057s 6067.61m 101.13h 4.21d 0.012 y
# Average job time: 356s 5.93m 0.10h 0.00d
# Longest job: 2681s 44.68m 0.74h 0.03d
# Submission to last job: 117505s 1958.42m 32.64h 1.36d
# Second cluster run to convert the .out's to .lav's
# You do NOT want to run this on the big cluster. It brings
# the file server to its knees. Run this on the small cluster.
ssh kki
cd /cluster/data/monDom1/bed/zb.danRer2
# script copied over from /cluster/data/danRer2/jkStuff/BlastZ_run1.sh
/cluster/data/monDom1/jkStuff/BlastZ_run1.sh
cd run.1
para try, check, push, etc ...
Completed: 173 of 173 jobs
CPU time in finished jobs: 1228s 20.47m 0.34h 0.01d 0.000 y
IO & Wait Time: 4828s 80.46m 1.34h 0.06d 0.000 y
Average job time: 35s 0.58m 0.01h 0.00d
Longest job: 49s 0.82m 0.01h 0.00d
Submission to last job: 393s 6.55m 0.11h 0.00d
# third run: lav -> axt (an I/O bound process, run only on fileserver)
# Use the chunks.2bit so lavToAxt will work. Then lift the resulting axt
ssh eieio
cd /cluster/data/monDom1/bed/zb.danRer2
mkdir axtChrom pslChrom
cat << '_EOF_' > lav2axt.sh
#!/bin/sh
for L in lav/chr*
do
chr=${L/lav\//}
cd ${L}
echo -n "${L} -> ${chr}.axt working ... "
cat `ls -1 *.lav | sort -g` \
| lavToAxt stdin \
/iscratch/i/danRer2/nib \
/iscratch/i/monDom1/chunks.2bit stdout \
| axtSort stdin stdout | liftUp -axtQ ../../axtChrom/${chr}.axt \
/iscratch/i/monDom1/chunkedScaffolds.lft warn stdin
cd ../..
echo -n "${chr}.axt -> ${chr}.psl working ..."
axtToPsl axtChrom/${chr}.axt S1.len S2.len pslChrom/${chr}.psl
echo "DONE"
done
'_EOF_'
# << this line keeps emacs coloring happy
chmod +x lav2axt.sh
./lav2axt.sh
# 6 minutes
# load this blast result just to see how it looks
# It is not needed in the browser after the chains and nets are
# made, drop it later.
ssh hgwdev
cd /cluster/data/monDom1/bed/zb.danRer2/pslChrom
hgLoadPsl -table=blastzMonDom1 -noTNameIx danRer2 chrM.psl
bash # if you are csh normally
for I in `ls *.psl | grep -v chrM.psl`
do
/cluster/bin/i386/hgLoadPsl -table=blastzMonDom1 -append \
-noTNameIx danRer2 ${I}
echo "done: ${I}"
done
# 2 minutes
exit # back to csh if you are csh normally
# Let's see what featureBits has to say about this data
featureBits danRer2 blastzMonDom1
# 56615212 bases of 1560497282 (3.628%) in intersection
# CHAIN ZEBRAFISH BLASTZ (DONE - 2004-12-19 - Hiram))
# Run axtChain on file server, job is mostly I/O, cluster does not help
ssh eieio
cd /cluster/data/monDom1/bed/zb.danRer2
mkdir -p axtChain/run1
cd axtChain/run1
mkdir out chainRaw
ls -1S /cluster/data/monDom1/bed/zb.danRer2/axtChrom/*.axt > input.lst
cat << '_EOF_' > doChain.sh
#!/bin/sh
for chrAxtFile in `cat input.lst`
do
chr=`basename ${chrAxtFile}`
axtChain -scoreScheme=/cluster/data/blastz/HoxD55.q \
-linearGap=../../opossumZFishTuned.gap \
-minScore=5000 ${chrAxtFile} \
/iscratch/i/danRer2/nib \
/cluster/data/monDom1/monDom1.2bit chainRaw/${chr}.chain > out/${chr}.out
done
'_EOF_'
# << this line keeps emacs coloring happy
# Reuse gap penalties from mouse run.
cat << '_EOF_' | sed 's/ */\t/g' > ../../opossumZFishTuned.gap
tablesize 11
smallSize 111
position 1 2 3 11 111 2111 12111 32111 72111 152111 252111
qGap 325 360 400 450 600 1100 3600 7600 15600 31600 56600
tGap 325 360 400 450 600 1100 3600 7600 15600 31600 56600
bothGap 625 660 700 750 900 1400 4000 8000 16000 32000 57000
'_EOF_'
# << this line keeps emacs coloring happy
chmod +x doChain.sh
time ./doChain.sh
# 57 minutes
# On the file server weed the chains for repeats
ssh eieio
cd /cluster/data/monDom1/bed/zb.danRer2/axtChain
mkdir chain
cd run1/chainRaw
bash # if you are normally csh
for i in *.chain
do
chainAntiRepeat /iscratch/i/danRer2/nib \
/cluster/data/monDom1/monDom1.2bit $i ../../chain/$i
echo "$i done"
done
# 4 minutes
exit # exit bash if you are normally csh
# now on the file server, sort chains.
# chainSplit steps each take less than a minute
ssh eieio
cd /cluster/data/monDom1/bed/zb.danRer2/axtChain
chainMergeSort chain/*.chain > all.chain
# 1 minute
mv chain chainBeforeMerge
chainSplit chain all.chain
# 1 minute
rm run1/chain/*.chain
# Load chains into database. This takes an hour
ssh hgwdev
cd /cluster/data/monDom1/bed/zb.danRer2/axtChain/chain
bash # if you are csh normally
for i in *.chain
do
c=${i/.chain/}
echo "loading ${c}"
hgLoadChain danRer2 ${c}_chainMonDom1 $i
done
# 3 minutes
exit # exit bash if you are csh normally
# Compare to some other organisms
featureBits danRer2 chainMonDom1
# 267969653 bases of 1560497282 (17.172%) in intersection
featureBits danRer2 chainHg17
# 510842993 bases of 1560497282 (32.736%) in intersection
featureBits danRer2 chainMm5
# 507912858 bases of 1560497282 (32.548%) in intersection
featureBits danRer2 chainFr1
# 309784483 bases of 1560497282 (19.852%) in intersection
# create a copy of these things for hgdownload (2005-01-12 - Hiram)
ssh hgwdev
mkdir -p /usr/local/apache/htdocs/goldenPath/danRer2/vsMonDom1
cd /cluster/data/monDom1/bed/zb.danRer2/axtChain/chain
tar cvzf \
/usr/local/apache/htdocs/goldenPath/danRer2/vsMonDom1/opossum.chain.tgz \
./chr*.chain
# NET OPOSSUM TO ZEBRAFISH BLASTZ (TBD)
ssh eieio
cd /cluster/data/monDom1/bed/zb.danRer2/axtChain
chainPreNet all.chain ../S1.len ../S2.len stdout \
| chainNet stdin -minSpace=1 ../S1.len ../S2.len stdout /dev/null \
| netSyntenic stdin noClass.net
# 1 minute
# Create axt files for the net as so:
netSplit noClass.net noClass
# 1 minute
cd noClass
mkdir ../../axtNet
bash # if you are csh normally
for i in *.net
do
r=${i/.net/}
netToAxt $i ../chain/${r}.chain /iscratch/i/danRer2/nib \
/cluster/data/monDom1/monDom1.2bit ../../axtNet/${r}.axt
done
# 4 minutes
exit # back to csh if you are csh normally
cd ..
rm -r noClass
# Add classification info using db tables:
ssh hgwdev
cd /cluster/data/monDom1/bed/zb.danRer2/axtChain
netClass -noAr noClass.net danRer2 monDom1 zfish.net
# 39 minutes
# Load the nets into database
ssh hgwdev
cd /cluster/data/monDom1/bed/zb.danRer2/axtChain
netFilter -minGap=10 zfish.net | hgLoadNet danRer2 netMonDom1 stdin
# 1 minute
# compare featureBits with some other assemblies
featureBits danRer2 netMonDom1
# 251706312 bases of 1560497282 (16.130%) in intersection
featureBits danRer2 netRn3
XXX - TBD - 2004-12-19 20:15
# 2638255333 bases of 2615483787 (100.871%) in intersection
featureBits -countgaps danRer2 netRn3
# 2638255333 bases of 3164952073 (83.358%) in intersection
featureBits danRer2 netHg17
# 2504056038 bases of 2615483787 (95.740%) in intersection
featureBits danRer2 netCanFam1
# 2456773441 bases of 2615483787 (93.932%) in intersection
featureBits danRer2 netGalGal2
# 1958796258 bases of 2615483787 (74.892%) in intersection
featureBits danRer2 netXenTro1
# 1042210258 bases of 2615483787 (39.848%) in intersection
featureBits danRer2 netTetNig1
# 618111072 bases of 2615483787 (23.633%) in intersection
featureBits danRer2 netDanRer2
# 560950601 bases of 2615483787 (21.447%) in intersection
# create a copy of these things for hgdownload (2005-01-12 - Hiram)
ssh hgwdev
mkdir -p /usr/local/apache/htdocs/goldenPath/danRer2/vsMonDom1
cd /cluster/data/monDom1/bed/zb.danRer2/axtChain
cp -p zfish.net \
/usr/local/apache/htdocs/goldenPath/danRer2/vsMonDom1/opossum.net
cd /usr/local/apache/htdocs/goldenPath/danRer2/vsMonDom1
gzip opossum.net
# SWAP EVERYTHING AND REMAKE NET FROM OPOSSUM REFERENCE
# Create ZEBRAFISH TO OPOSSUM chain, net
ssh eieio
cd /cluster/data/monDom1/bed
mkdir -p bz.danRer2/axtChain
chainSwap zb.danRer2/axtChain/all.chain bz.danRer2/axtChain/all.chain
# 1 minutes
cd bz.danRer2/axtChain
chainPreNet all.chain ../../zb.danRer2/S2.len \
../../zb.danRer2/S1.len stdout \
| chainNet stdin -minSpace=1 ../../zb.danRer2/S2.len \
../../zb.danRer2/S1.len stdout /dev/null \
| netSyntenic stdin noClass.net
# 6 minutes
mkdir ../axtNet
netToAxt noClass.net all.chain /cluster/data/monDom1/monDom1.2bit \
/iscratch/i/danRer2/nib ../axtNet/all.axt
# 5 minutes
# Classify net and load into database
ssh hgwdev
cd /cluster/data/monDom1/bed/bz.danRer2/axtChain
netClass -noAr noClass.net monDom1 danRer2 ZFish.net
# 12 minutes
netFilter -minGap=10 ZFish.net | hgLoadNet monDom1 netDanRer2 stdin
# 1 minute
# Load chains into database
hgLoadChain -tIndex monDom1 chainDanRer2 all.chain
# 10 minutes
# if you do a 'show index' SQL reports Cardinality NULL for the tName
# indexes although they are actually working. To make the
# cardinality show up:
hgsql monDom1 -e "analyze table chainDanRer2;"
hgsql monDom1 -e "analyze table chainDanRer2Link;"
featureBits monDom1 chainDanRer2Link
# 66821827 bases of 3492108230 (1.914%) in intersection
featureBits monDom1 chainDanRer2
# 397007345 bases of 3492108230 (11.369%) in intersection
featureBits monDom1 chainGalGal2Link
# 110933245 bases of 3492108230 (3.177%) in intersection
featureBits monDom1 chainGalGal2
# 2437546469 bases of 3492108230 (69.802%) in intersection
featureBits monDom1 chainMm5Link
# create a copy of for hgdownload (2005-01-13 - Hiram)
ssh hgwdev
mkdir -p /usr/local/apache/htdocs/goldenPath/monDom1/vsDanRer2
cd /cluster/data/monDom1/bed/bz.danRer2/axtChain
gzip --stdout all.chain > \
/usr/local/apache/htdocs/goldenPath/monDom1/vsDanRer2/zebrafish.chain.gz
gzip --stdout ZFish.net > \
/usr/local/apache/htdocs/goldenPath/monDom1/vsDanRer2/zebrafish.net.gz
# BLASTZ ZEBRAFISH CLEAN UP (DONE - 2005-01-12 - Hiram)
ssh eieio
cd /cluster/data/monDom1/bed/zb.danRer2/axtChain
nice rm -rf run1/out run1/chainRaw chain/chain.tab chain/link.tab \
noClass.net chainBeforeMerge noClass &
nice gzip chain/* &
nice gzip zfish.net &
nice gzip all.chain &
cd /cluster/data/monDom1/bed/zb.danRer2
nice rm -rf raw &
nice gzip axtNet/*.axt &
nice gzip axtChrom/*.axt &
nice gzip pslChrom/* lav/*/* &
cd /cluster/data/monDom1/bed/bz.danRer2/axtChain
nice rm -f noClass.net link.tab chain.tab &
nice gzip all.chain ZFish.net &
cd ../axtNet
nice gzip all.axt &
# 5-WAY VAR_MULTIZ - ALIGNMENTS Hg17, Mm5, MonDom1, GalGal2, DanRer2
# DONE - 2005-01-04 - Hiram
ssh eieio
cd /cluster/data/monDom1/bed
mkdir multiz
bash # if you are csh normally
# if all.axt has been gzipped, simply use all.axt.gz
# It will work the same way
for A in danRer2 galGal2 mm5 hg17
do
axtSort bz.${A}/axtNet/all.axt stdout | axtToMaf -tSplit stdin \
-tPrefix=monDom1. /cluster/data/monDom1/chrom.sizes \
-qPrefix=${A}. /cluster/data/${A}/chrom.sizes \
multiz/${A}
done
# 6 minutes
exit # back to csh if you are csh normally
# Copy MAFs to Iservers for kluster run
ssh kkr1u00
mkdir /iscratch/i/monDom1/multiz
cd /iscratch/i/monDom1/multiz
rsync -a /cluster/data/monDom1/bed/multiz/ .
mkdir penn
cp -p /cluster/bin/penn/psuCVS/multiz-tba/multiz penn
cp -p /cluster/bin/penn/maf_project penn
/cluster/bin/iSync
# Progressive alignment up the tree w/o stager,
# using multiz.v10 (var_multiz)
# Method: align internal subtrees (using 0 flag to var_multiz)
# Then, align these to human (using 1 flag to var_multiz)
# NOTE: must use maf_project after each multiz run, in order
# to order output. Single-cov guaranteed by use of net MAF's,
# so it is not necessary to run single_cov2.
ssh eieio
cd /cluster/data/monDom1/bed/multiz
# Not sure what this is used for. Appears to be from something
# that is no longer used. Can find no reference to it in the
# procedures. Below in the parameter estimation for phastCons
# there is a tree specified:
# ((((hg17,mm5),galGal2),monDom1),danRer2)
cat << '_EOF_' > tree.nh
(((hg17,mm5),galGal2),danRer2)
'_EOF_'
# << this line keeps emacs coloring happy
# make output dir and run dir
ssh kk9
cd /cluster/data/monDom1/bed/multiz
mkdir -p maf
mkdir -p run
cd run
# create scripts to run var_multiz on cluster
cat > oneMultiz.csh << 'EOF'
#!/bin/csh -fe
set c = $1
set multi = /scratch/monDom1/multiz5way.$c
set pairs = /iscratch/i/monDom1/multiz
# special mode --
# with 1 arg, cleanup
if ($#argv == 1) then
rm -fr $multi
exit
endif
# special mode --
# with 3 args, saves an alignment file
if ($#argv == 3) then
cp $multi/$2/$c.maf $3
exit
endif
set s1 = $2
set s2 = $3
set flag = $4
# locate input files -- in pairwise dir, or multiple dir
set d1 = $multi
set d2 = $multi
if (-d $pairs/$s1) then
set d1 = $pairs
endif
if (-d $pairs/$s2) then
set d2 = $pairs
endif
set f1 = $d1/$s1/$c.maf
set f2 = $d2/$s2/$c.maf
# write to output dir
set out = $multi/${s1}${s2}
mkdir -p $out
# check for empty input file
if (-s $f1 && -s $f2) then
echo "Aligning $f1 $f2 $flag"
/iscratch/i/monDom1/multiz/penn/multiz.v10 $f1 $f2 $flag $out/$c.unused1.maf $out/$c.unused2.maf > $out/$c.full.maf
cat $out/$c.full.maf $out/$c.unused1.maf $out/$c.unused2.maf > \
$out/$c.tmp.maf
echo "Ordering $c.maf"
/iscratch/i/monDom1/multiz/penn/maf_project $out/$c.tmp.maf monDom1.$c > $out/$c.maf
else if (-s $f1) then
cp $f1 $out
else if (-s $f2) then
cp $f2 $out
endif
'EOF'
# << keep emacs coloring happy
chmod +x oneMultiz.csh
# Copy this script to iscratch
ssh kkr1u00
cd /iscratch/i/monDom1/multiz/penn
cp -p /cluster/data/monDom1/bed/multiz/run/oneMultiz.csh .
/cluster/bin/iSync
# back to run the job
ssh kki
cd /cluster/data/monDom1/bed/multiz/run
cat > allMultiz.csh << 'EOF'
#!/bin/csh -fe
# multiple alignment steps:
# multiz.v10 human.maf mouse.maf 1
# multiz.v10 human-mouse.maf chicken.maf 1
# multiz.v10 human-mouse-chicken.maf zfish.maf 1
set c = $1
/iscratch/i/monDom1/multiz/penn/oneMultiz.csh $c hg17 mm5 1
/iscratch/i/monDom1/multiz/penn/oneMultiz.csh $c galGal2 hg17mm5 1
/iscratch/i/monDom1/multiz/penn/oneMultiz.csh $c danRer2 galGal2hg17mm5 1
# get final alignment file
/iscratch/i/monDom1/multiz/penn/oneMultiz.csh $c danRer2galGal2hg17mm5 /cluster/data/monDom1/bed/multiz/maf/$c.maf
#cleanup
/iscratch/i/monDom1/multiz/penn/oneMultiz.csh $c
'EOF'
# << keep emacs coloring happy
chmod +x allMultiz.csh
cat << 'EOF' > template
#LOOP
./allMultiz.csh $(root1) {check out line+ /cluster/data/monDom1/bed/multiz/maf/$(root1).maf}
#ENDLOOP
'EOF'
# make a list of scaffolds that successfully had actual alignments
# at least once somewhere. Not every scaffold has an alignment.
cd /iscratch/i/monDom1/multiz
for D in danRer2 galGal2 hg17 mm5
do
cd $D
ls | sed -e "s/.maf//"
cd ..
done | sort -u > /cluster/data/monDom1/bed/multiz/run/chrom.lst
cd /cluster/data/monDom1/bed/multiz/run
gensub2 chrom.lst single template jobList
para create jobList
para try; para check
para push
# Completed: 5517 of 5517 jobs
# CPU time in finished jobs: 8337s 138.95m 2.32h 0.10d 0.000 y
# IO & Wait Time: 15166s 252.77m 4.21h 0.18d 0.000 y
# Average job time: 4s 0.07m 0.00h 0.00d
# Longest job: 65s 1.08m 0.02h 0.00d
# Submission to last job: 1495s 24.92m 0.42h 0.02d
# combine results into a single file for loading
ssh eieio
cd /cluster/data/monDom1/bed/multiz
catDir maf | mafFilter stdin -minScore=500 > multiz5way.maf
# Load into database (DONE - 2005-01-04 - Hiram)
ssh hgwdev
cd /cluster/data/monDom1/bed/multiz
mkdir -p /gbdb/monDom1/multiz5way
ln -s /cluster/data/monDom1/bed/multiz/multiz5way.maf /gbdb/monDom1/multiz5way
hgLoadMaf monDom1 multiz5way
# Loaded 988922 mafs in 1 files from /gbdb/monDom1/multiz5way
# 4 minutes to load
# Create pairwise maf files (no longer split across scaffolds)
ssh eieio
cd /cluster/data/monDom1/bed/multiz
mkdir -p pairwise
catDir galGal2 > pairwise/galGal2.maf
catDir hg17 > pairwise/hg17.maf
catDir mm5 > pairwise/mm5.maf
catDir danRer2 > pairwise/danRer2.maf
# Load pairwise mafs into database (DONE - 2005-01-04 - Hiram)
ssh hgwdev
set h = /cluster/data/monDom1/bed/multiz/pairwise
cd $h
foreach i (*.maf)
set o = $i:r
set t = ${o}_netBlastz
mkdir -p /gbdb/monDom1/multiz5way/$t
ln -s $h/$i /gbdb/monDom1/multiz5way/$t
hgLoadMaf monDom1 $t -pathPrefix=/gbdb/monDom1/multiz5way/$t
end
# CREATE CONSERVATION WIGGLE WITH PHASTCONS (WORKING - 2005-01-04 - Hiram)
# Estimate phastCons parameters
ssh eieio
mkdir /cluster/data/monDom1/bed/multiz/cons
cd /cluster/data/monDom1/bed/multiz/cons
# Create a starting-tree.mod based on scaffold_13303. (the largest one)
twoBitToFa ../../../monDom1.2bit -seq=scaffold_13303 scaffold_13303.fa
/cluster/bin/phast/msa_split ../maf/scaffold_13303.maf --refseq scaffold_13303.fa --in-format MAF --windows 100000000,1000 --out-format SS --between-blocks 5000 --out-root s1
(((hg17,mm5),galGal2),danRer2)
/cluster/bin/phast/phyloFit -i SS s1.*.ss --tree "((((hg17,mm5),galGal2),monDom1),danRer2)" --out-root starting-tree
rm scaffold_13303.fa s1.*.ss
# Create directory full of sequence in individual scaffold files on bluearc.
ssh kksilo
mkdir -p /cluster/bluearc/monDom1/scaffoldFa
cd /cluster/bluearc/monDom1/scaffoldFa
twoBitToFa /cluster/data/monDom1/monDom1.2bit stdout | faSplit byname stdin foo
# Create big bad bloated SS files in bluearc (takes ~45 minutes)
mkdir -p /cluster/bluearc/monDom1/cons/ss
cd /cluster/bluearc/monDom1/cons/ss
foreach i (`awk '{print $1}' /cluster/data/monDom1/chrom.sizes`)
if (-e /cluster/data/monDom1/bed/multiz/maf/$i.maf) then
echo msa_split $i
/cluster/bin/phast/msa_split /cluster/data/monDom1/bed/multiz/maf/$i.maf \
--refseq /cluster/bluearc/monDom1/scaffoldFa/$i.fa \
--in-format MAF --windows 1000000,0 --between-blocks 5000 \
--out-format SS --out-root $i
endif
end
# Create a random list of 50 1 mb regions
ls -l | awk '$5 > 4000000 {print $9;}' | randomLines stdin 50 ../randomSs
# Set up parasol directory to calculate trees on these 50 regions
ssh kki
mkdir /cluster/bluearc/monDom1/cons/treeRun1
cd /cluster/bluearc/monDom1/cons/treeRun1
mkdir tree log
# Create little script that calls phastCons with right arguments
# I wonder if target-coverage here has to go along with the
# readjusted target-coverage below as it is tuned up ?
# In that case, this would be 0.50 here.
cat > makeTree << '_EOF_'
/cluster/bin/phast/phastCons ../ss/$1.ss \
/cluster/data/monDom1/bed/multiz/cons/starting-tree.mod \
--gc 0.410 --nrates 1,1 --no-post-probs --ignore-missing \
--expected-lengths 12 --target-coverage 0.35 \
--quiet --log log/$1 --estimate-trees tree/$1
'_EOF_'
chmod a+x makeTree
# Create gensub file
cat > template << '_EOF_'
#LOOP
makeTree $(root1)
#ENDLOOP
'_EOF_'
# Make cluster job and run it
gensub2 ../randomSs single template jobList
para create jobList
para try/push/check/etc
# Completed: 50 of 50 jobs
# CPU time in finished jobs: 1526s 25.43m 0.42h 0.02d 0.000 y
# IO & Wait Time: 650s 10.84m 0.18h 0.01d 0.000 y
# Average job time: 44s 0.73m 0.01h 0.00d
# Longest job: 84s 1.40m 0.02h 0.00d
# Submission to last job: 168s 2.80m 0.05h 0.00d
# Now combine parameter estimates. We can average the .mod files
# using phyloBoot. This must be done separately for the conserved
# and nonconserved models
ls tree/*.cons.mod > cons.txt
/cluster/bin/phast/phyloBoot --read-mods '*cons.txt' --output-average ../ave.cons.mod > cons_summary.txt
ls tree/*.noncons.mod > noncons.txt
/cluster/bin/phast/phyloBoot --read-mods '*noncons.txt' --output-average ../ave.noncons.mod > noncons_summary.txt
cd ..
cp -p ave.*.mod /cluster/data/monDom1/bed/multiz/cons
ssh kk9
# Create cluster dir to do main phastCons run
mkdir /cluster/bluearc/monDom1/cons/consRun1
cd /cluster/bluearc/monDom1/cons/consRun1
mkdir ppRaw bed
# Create script to run phastCons with right parameters
cat > doPhast << '_EOF_'
mkdir -p ppRaw/$2
/cluster/bin/phast/phastCons ../ss/$1.ss ../ave.cons.mod,../ave.noncons.mod \
--expected-lengths 12 --target-coverage 0.50 --quiet --seqname $2 \
--idpref $2 --viterbi bed/$1.bed --score --require-informative 0 > \
ppRaw/$2/$1.pp
'_EOF_'
chmod a+x doPhast
# Create gsub file
cat > template << '_EOF_'
#LOOP
doPhast $(file1) $(root1)
#ENDLOOP
'_EOF_'
# Create parasol batch and run it
ls -1 ../ss | sed 's/.ss//' > in.lst
gensub2 in.lst single template jobList
para create jobList
para try/check/push/etc.
# Completed: 8171 of 8171 jobs
# CPU time in finished jobs: 24641s 410.68m 6.84h 0.29d 0.001 y
# IO & Wait Time: 81696s 1361.60m 22.69h 0.95d 0.003 y
# Average job time: 13s 0.22m 0.00h 0.00d
# Longest job: 39s 0.65m 0.01h 0.00d
# Submission to last job: 1128s 18.80m 0.31h 0.01d
# combine predictions and transform scores to be in 0-1000 interval
catDir bed | awk \
'{printf "%s\t%d\t%d\tlod=%d\t%s\n", $1, $2, $3, $5, $5;}' | \
/cluster/bin/scripts/lodToBedScore /dev/stdin > \
/cluster/data/monDom1/bed/multiz/mostConserved.bed
# Figure out how much is actually covered by the bed files as so:
awk '{print $3 - $2;}' /cluster/data/monDom1/bed/multiz/mostConserved.bed | addCols stdin
at 0.35: 99792852.00
at 0.55: 169027762
at 0.50: 140230712.00
at 0.35: 99792852/3492108230 = .028576
at 0.55: 169027762/3492108230 = .048402 with two points claiming to
to be negative at 0.00000 changed to 0
at 0.50: 140230712.00/3492108230 = .040156 - no unusual points noted
# If the results of the this divided by the non-n genome size (1.5G) aren't
# around 4%, then do it again, adjusting the target-coverage phastCons
# parameter. Beware of negative scores when too high. The logToBedScore
# will output an error on any negative scores.
# Load most conserved track into database
ssh hgwdev
cd /cluster/data/monDom1/bed/multiz
hgLoadBed monDom1 mostConserved mostConserved.bed
# Create merged posterier probability file and wiggle track data files
ssh eieio
cd /cluster/bluearc/monDom1/cons/consRun1
# interesting sort here on scaffold number and position.
# first convert the names to have a consistent field separator
# of . then do the sort on fields 6 and 7 (both numerically)
# then convert those . field separators back to what they were
find ./ppRaw -type f | \
sed -e "s#/scaffold_#.scaffold.#g; s#-#._.#" | \
sort -t\. -k6,6n -k7,7n | \
sed -e "s#.scaffold.#/scaffold_#g; s#\._\.#-#" | xargs cat | \
wigEncode stdin phastCons5.wig phastCons5.wib
# 30 minutes for above
cp -p phastCons5.wi? /cluster/data/monDom1/bed/multiz/cons
# 1 minute copy
# Load gbdb and database with wiggle.
ssh hgwdev
cd /cluster/data/monDom1/bed/multiz/cons
ln -s `pwd`/phastCons5.wib /gbdb/monDom1/wib/phastCons5.wib
hgLoadWiggle monDom1 phastCons5 phastCons5.wig
# 2 minute load
# Create histogram to get an overview of all the data
ssh hgwdev
cd /cluster/data/monDom1/bed/multiz/cons
time hgWiggle -doHistogram \
-hBinSize=0.001 -hBinCount=1000 -hMinVal=0.0 -verbose=2 \
-db=monDom1 phastCons5 > histogram.data 2>&1
# about 15 minutes scan all data
cat << '_EOF_' | gnuplot > histo.png
set terminal png small color \
x000000 xffffff xc000ff x66ff66 xffff00 xff0000 xff0000
set size 1.4, 0.8
set key left box
set grid noxtics
set grid ytics
set title " Opossum Histogram phastCons5 track"
set xlabel " phastCons5 score"
set ylabel "p-Value"
set y2label "Cumulative Probability Distribution"
set y2range [0:1]
set y2tics
plot "histogram.data" using 2:5 title " pValue" with impulses, \
"histogram.data" using 2:7 axes x1y2 title " CPD" with lines
'_EOF_'
display histo.png &
# Pick up photo from NHGRI (DONE - 2004-12-22 - Hiram)
ssh hgwdev
cd /tmp
wget \
'http://www.genome.gov/Pages/News/Photos/Science/opossum_image.jpg'
-O opossum_image.jpg
# view that image in 'display' to determine crop edges, then:
convert -crop 1712x1240+560+360 -quality 80 -sharpen 0 \
-normalize opossum_image.jpg mmd.jpg
convert -geometry 300x200 -quality 80 mmd.jpg Monodelphis_domestica.jpg
rm -f mmd.jpg opossum_image.jpg
cp -p Monodelphis_domestica.jpg /usr/local/apache/htdocs/images
# add links to this image in the description.html page, request push
# MAKE DOWNLOADABLE SEQUENCE FILES (WORKING, 2005-01-06, Hiram)
ssh kksilo
cd /cluster/data/monDom1
#- Build the .zip files
rm -rf zip
mkdir zip
# ! 2005-02-08 - Should not be .zip files, should be .gz files
#zip -j zip/softMask.fa.zip maskedScaffolds.fa &
#zip -j zip/hardMask.fa.zip hardMaskedScaffolds.fa &
#zip -j zip/novel.RM.lib.fa.zip jkStuff/monDom1.novel.RM.lib &
#zip ../../zip/trf.bed.zip trfMask.bed &
# RepeatMasker library
gzip -c jkStuff/monDom1.novel.RM.lib > zip/novel.RM.lib.fa.gz
touch -r jkStuff/monDom1.novel.RM.lib zip/novel.RM.lib.fa.gz
# soft masked scaffold fasta
gzip -c maskedScaffolds.fa > zip/softMask.fa.gz
touch -r maskedScaffolds.fa zip/softMask.fa.gz
# hard masked scaffold fasta
gzip -c hardMaskedScaffolds.fa > zip/hardMask.fa.gz
touch -r hardMaskedScaffolds.fa zip/hardMask.fa.gz
# TRF output files
gzip -c bed/simpleRepeat/trfMask.bed > zip/trf.bed.gz
touch -r bed/simpleRepeat/trfMask.bed zip/trf.bed.gz
# get GenBank native mRNAs
cd /cluster/data/genbank
./bin/i386/gbGetSeqs -db=monDom1 -native GenBank mrna \
/cluster/data/monDom1/zip/mrna.fa
# get GenBank xeno mRNAs
./bin/i386/gbGetSeqs -db=monDom1 -xeno GenBank mrna \
/cluster/data/monDom1/zip/xenoMrna.fa &
# THERE ARE NO native ESTs - skip this - get native GenBank ESTs
# ./bin/i386/gbGetSeqs -db=monDom1 -native GenBank est \
# /cluster/data/monDom1/zip/est.fa &
cd /cluster/data/monDom1/zip
# zip GenBank native and xeno mRNAs and native ESTs,
# after the above gbGetSeqs jobs are done.
# zip -j mrna.fa.zip mrna.fa
# zip -j xenoMrna.fa.zip xenoMrna.fa
# zip -j est.fa.zip est.fa
#rm -f mrna.fa xenoMrna.fa est.fa
gzip mrna.fa xenoMrna.fa
# RepeatMasker out files, we don't have this as traditionally delivered
# since the process involved turning the data into the database
# representation so they could be sorted and uniqued. So instead
# of delivering the usual files, simply dump the database
ssh hgwdev
mkdir -p /usr/local/apache/htdocs/goldenPath/monDom1/bigZips
cd /usr/local/apache/htdocs/goldenPath/monDom1/bigZips
hgsqldump monDom1 rmsk -T .
#zip -j rmsk.txt.zip rmsk.txt &
#rm rmsk.txt
gzip rmsk.txt &
ssh hgwdev
cd /usr/local/apache/htdocs/goldenPath/monDom1
rsync -a --progress /cluster/data/monDom1/zip/ .
for i in *.gz
do
/bin/echo -n "$i - "
zcat $i | sum -r
done
# Check zip file integrity
#for i in *.zip
#do
# unzip -t $i | tail -1
#done
# No errors detected in compressed data of hardMask.fa.zip.
# No errors detected in compressed data of mrna.fa.zip.
# No errors detected in compressed data of novel.RM.lib.fa.zip.
# No errors detected in compressed data of rmsk.txt.zip.
# No errors detected in compressed data of softMask.fa.zip.
# No errors detected in compressed data of trf.bed.zip.
# No errors detected in compressed data of xenoMrna.fa.zip.
# md5sum *.zip > md5sum.txt
md5sum *.gz > md5sum.txt
# update README.txt.
# ACCESSIONS FOR CONTIGS (DONE 2005-03-16 kate)
# Used for Assembly track details, and also ENCODE AGP's
cd /cluster/data/monDom1/bed
mkdir -p contigAcc
cd contigAcc
# extract WGS contig to accession mapping from Entrez Nucleotide summaries
# To get the summary file, access the Genbank page for the project
# by searching:
# genus[ORGN] AND WGS[KYWD]
# At the end of the page, click on the list of accessions for the contigs.
# Select summary format, and send to file.
# output to file .acc
contigAccession opossum.acc > contigAcc.tab
wc -l contigAcc.tab
# 109065 contigAcc.tab
grep contig /cluster/data/monDom1/broad.mit.edu/assembly.agp | wc -l
# 109065
hgsql monDom1 < $HOME/kent/src/hg/lib/contigAcc.sql
echo "LOAD DATA LOCAL INFILE 'contigAcc.tab' INTO TABLE contigAcc" | \
hgsql monDom1
hgsql monDom1 -e "SELECT COUNT(*) FROM contigAcc"
# 109065 contigAcc.tab
#########################################################################
# MOUSE NET/CHAINS MM6 - Info contained in makeMm6.doc (200503 Hiram)
# EXTRACT AXTs and MAFs FROM OPOSSUM (monDom1) NET FOR ZEBRAFISH (danRer2)
# zb.danRer2 are the blastz alignments, chains and nets for
# danRer2 as target and monDom1 as query. (DONE, 2005-05-13, hartera)
# GZIP AXTs AND MAFs (DONE, 2005-05-15, hartera)
ssh kkstore01
# create axts
cd /cluster/data/monDom1/bed/blastz.monDom1.to.danRer2/axtChain
gunzip zfish.net.gz
gunzip chain/*.chain.gz
netSplit zfish.net zfishNet
mkdir -p ../axtNet
cat > axtNet.csh << 'EOF'
foreach f (zfishNet/chr*.net)
set c = $f:t:r
echo "axtNet on $c"
netToAxt zfishNet/$c.net chain/$c.chain \
/cluster/data/danRer2/nib \
/cluster/data/monDom1/monDom1.2bit ../axtNet/$c.axt
echo "Complete: $c.net -> $c.axt"
end
'EOF'
chmod +x axtNet.csh
csh axtNet.csh >&! axtNet.log &
tail -100f axtNet.log
# sort axts before making mafs - must be sorted for multiz
cd /cluster/data/monDom1/bed/blastz.monDom1.to.danRer2
mv axtNet axtNet.unsorted
mkdir axtNet
foreach f (axtNet.unsorted/*.axt)
set c = $f:t:r
echo "Sorting $c"
axtSort $f axtNet/$c.axt
end
# create maf
ssh kkstore01
cd /cluster/data/monDom1/bed/blastz.monDom1.to.danRer2/axtNet
mkdir ../mafNet
cat > makeMaf.csh << 'EOF'
foreach f (chr*.axt)
set maf = $f:t:r.monDom1.maf
echo translating $f to $maf
axtToMaf $f \
/cluster/data/danRer2/chrom.sizes /cluster/data/monDom1/chrom.sizes \
../mafNet/$maf -tPrefix=danRer2. -qPrefix=monDom1.
end
'EOF'
chmod +x makeMaf.csh
csh makeMaf.csh >&! makeMaf.log &
tail -100f makeMaf.log
cd /cluster/data/monDom1/bed/blastz.monDom1.to.danRer2/axtChain
nice gzip zfish.net chain/*.chain &
cd /cluster/data/monDom1/bed/blastz.monDom1.to.danRer2
nice gzip axtNet/*.axt mafNet/*.maf &
# LIFTOVER CHAINS TO MONDOM2 (WORKING 2006-02-07 kate)
# NOTE: split monDom2 is doc'ed in makeMonDom2.doc
ssh kkstore01
cd /cluster/data/monDom1
# split scaffolds and convert to 2bit, as done for cow
mkdir split2bit
cd split2bit
faSplit sequence ../maskedScaffolds.fa 50 x.
foreach f (*.fa)
echo $f
faToTwoBit $f $f:r.2bit
rm $f
end
cd ..
mkdir -p /san/sanvol1/scratch/monDom1
cp -r split2bit /san/sanvol1/scratch/monDom1
rm -fr split2bit
cp /cluster/bluearc/monDom1/11.ooc /san/sanvol1/scratch/monDom1/
# run alignment
ssh pk
cd /cluster/data/monDom1
makeLoChain-align monDom1 /san/sanvol1/scratch/monDom1/split2bit \
monDom2 /san/sanvol1/scratch/monDom2/liftSplits/split \
/san/sanvol1/scratch/monDom1/11.ooc
cd bed
rm blat.monDom2
ln -s blat.monDom2.2006-02-07 blat.monDom2
cd blat.monDom2/run
para try
para check
para push
# 2500 jobs
# All jobs crash when run via cluster, though a test
# manual job on a cluster job ran successfully:
# blat /san/sanvol1/scratch/monDom1/split2bit/x18.2bit /san/sanvol1/scratch/monDom2/liftSplits/split/x19.fa ../raw/x18_x19.psl -tileSize=11 -ooc=/san/sanvol1/scratch/monDom1/11.ooc -minScore=100 -minIdentity=98 -fastMap
# Loaded 83749395 letters in 35 sequences
# Searched 71987986 bases in 7199 sequences
# the job took ~7 hours --
# this needs to be broken up into much smaller jobs.