# for emacs: -*- mode: sh; -*- # Tetraodon Nigirividis from Genoscope, version v6 (released 5/6/02) # Project website: # http://www.genoscope.cns.fr/externe/tetraodon/sequencing.html # NOTE: this doc may have genePred loads that fail to include # the bin column. Please correct that for the next build by adding # a bin column when you make any of these tables: # # mysql> SELECT tableName, type FROM trackDb WHERE type LIKE "%Pred%"; # +------------+--------------------+ # | tableName | type | # +------------+--------------------+ # | gaze | genePred | # | geneid | genePred geneidPep | # | genscan | genePred | # | hoxGenes | genePred | # | cytokines | genePred | # | ecoresHg17 | genePred | # +------------+--------------------+ # DOWNLOAD SEQUENCE ssh kksilo mkdir /cluster/store7/tetNig1 ln -s /cluster/store7/tetNig1 /cluster/data cd /cluster/data/tetNig1 wget http://www.genoscope.cns.fr/externe/tetraodon/Data/Reads/tetraodon6.gz # DOWNLOAD SEQUENCE BY CHROMOSOME, GFF AND AGP FILES (DONE, 2004-08-13, hartera) # Genoscope provided the url, username and password - this is V7 # download agp, gff and accompanying FASTA files by chromosome ssh kksilo cd /cluster/data/tetNig1 wget --timestamp --http-user= --http-passwd= \ http://www.genoscope.cns.fr/nda/tetraodon/GFF/tetraodon.all.gff.* wget --timestamp --http-user= --http-passwd= \ http://www.genoscope.cns.fr/nda/tetraodon/GP/chr.agp wget --timestamp --http-user= --http-passwd= \ http://www.genoscope.cns.fr/nda/tetraodon/GP/chr.agp.info wget --timestamp -r -l1 --http-user= --http-passwd= \ http://www.genoscope.cns.fr/nda/tetraodon/GP/unmasked/ mv ./www.genoscope.cns.fr/secure-nda/tetraodon/GP/unmasked/* . rm -r www.genoscope.cns.fr rm index* # unzip FASTA files bzip2 -d chr*.bz2 bzip2 -d tetraodon.all.gff.bz2 # no complete mitochondrial sequence in Genbank # Create list of chromosomes (DONE, 2004-08-13, hartera) ssh hgwdev cd /cluster/data/tetNig1 foreach f (*.fa) set chr = `echo $f:h | sed -e 's/^chr//' | sed -e 's/\.fa$//'` echo $chr >> chrom end sort -n chrom > chrom.lst rm chrom # SPLIT AGP FILES BY CHROMOSOME (DONE, 2004-08-16, hartera) ssh kksilo cd /cluster/data/tetNig1 foreach c (`cat chrom.lst`) mkdir $c mv chr${c}.fa ./$c perl -we "while(<>){if (/^chr$c\t/) {print;}}" \ ./chr.agp > $c/chr$c.agp end # agp has either gaps (N) or scaffolds (S), change S to W for WGS # agp files are not standard formats so need to reformat for # processing programs to run - missing field 4 which is just line number # and need to reformat gap line to: # chr chr_start chr_end line type length gap type linkage foreach c (`cat chrom.lst`) cp $c/chr${c}.agp $c/chr${c}.agp.bak echo "Processing chr${c}.agp ..." awk 'BEGIN {OFS="\t"} \ { if ($4 == "S") print $1, $2, $3, NR, "W", $5, $6, $7, $8, $9; \ if ($4 == "N" && $5 == "GAP_IS") \ print $1, $2, $3, NR, $4, $9, "fragment", "yes"; \ if ($4 == "N" && $5 == "GAP_UC") \ print $1, $2, $3, NR, $4, $9, "contig", "yes"; \ if ($4 == "N" && $5 == "GAP_UN") \ print $1, $2, $3, NR, $4, $9, "contig", "no"; \ if ($4 == "N" && $5 == "CENT") \ print $1, $2, $3, NR, $4, $9, "centromere", "no" }' \ $c/chr${c}.agp >> $c/chr${c}.new.agp mv $c/chr${c}.new.agp $c/chr{$c}.agp end # check agp files and then remove backup files foreach c (`cat chrom.lst`) rm $c/chr${c}.agp.bak end # CHECK FASTA AND AGP FILES (DONE, 2004-08-16, hartera) ssh kksilo cd /cluster/data/tetNig1 foreach c (`cat chrom.lst`) foreach f ( $c/chr$c.agp) set agpLen = `tail -1 $f | awk '{print $3;}'` set h = $f:r set g = $h:r echo "Getting size of $g.fa" set faLen = `faSize $g.fa | awk '{print $1;}'` if ($agpLen == $faLen) then echo " OK: $f length = $g length = $faLen" else echo "ERROR: $f length = $agpLen, but $g length = $faLen" endif end end # all are OK so FASTA files sizes are in agreement with agp files # BREAK UP SEQUENCE INTO 5MB CHUNKS AT CONTIGS/GAPS FOR CLUSTER RUNS # (DONE, 2004-08-16, hartera) ssh kksilo cd /cluster/data/tetNig1 foreach c (`cat chrom.lst`) foreach agp ($c/chr$c.agp) if (-e $agp) then set fa = $c/chr$c.fa echo splitting $agp and $fa cp -p $agp $agp.bak cp -p $fa $fa.bak splitFaIntoContigs $agp $fa . -nSize=5000000 endif end end # MAKE JKSTUFF AND BED DIRECTORIES (DONE, 2004-08-16, hartera) # This used to hold scripts -- better to keep them inline here # Now it should just hold lift file(s) and # temporary scripts made by copy-paste from this file. ssh hgwdev mkdir /cluster/data/tetNig1/jkStuff # This is where most tracks will be built: mkdir /cluster/data/tetNig1/bed # CREATING DATABASE (DONE, 2004-08-16, hartera) # Create the database. # next machine ssh hgwdev echo 'create database tetNig1' | hgsql '' # if you need to delete that database: !!! WILL DELETE EVERYTHING !!! echo 'drop database tetNig1' | hgsql tetNig1 # Use df to make sure there is at least 5 gig free on df -h /var/lib/mysql # Before loading data: # Filesystem Size Used Avail Use% Mounted on # /dev/sdc1 1.8T 525G 1.2T 32% /var/lib/mysql # CREATING GRP TABLE FOR TRACK GROUPING (DONE, 2004-08-16, hartera) # next machine ssh hgwdev # the following command copies all the data from the table # grp in the database galGal2 to our new database ce2 echo "create table grp (PRIMARY KEY(NAME)) select * from galGal2.grp" \ | hgsql tetNig1 # if you need to delete that table: !!! WILL DELETE ALL grp data !!! echo 'drop table grp;' | hgsql tetNig1 # REPEAT MASKING - Run RepeatMasker on chroms (DONE, 2004-08-17, hartera) #- Split contigs into 500kb chunks, at gaps if possible: ssh kksilo cd /cluster/data/tetNig1 foreach c (`cat chrom.lst`) foreach d ($c/chr${c}*_?{,?}) cd $d echo "splitting $d" set contig = $d:t ~/bin/i386/faSplit gap $contig.fa 500000 ${contig}_ -lift=$contig.lft \ -minGapSize=100 cd ../.. end end #- Make the run directory and job list: cd /cluster/data/tetNig1 cat << '_EOF_' > jkStuff/RMTetraodon #!/bin/csh -fe cd $1 pushd . /bin/mkdir -p /tmp/tetNig1/$2 /bin/cp $2 /tmp/tetNig1/$2/ cd /tmp/tetNig1/$2 /cluster/bluearc/RepeatMasker/RepeatMasker -ali -s $2 popd /bin/cp /tmp/tetNig1/$2/$2.out ./ if (-e /tmp/tetNig1/$2/$2.align) /bin/cp /tmp/tetNig1/$2/$2.align ./ if (-e /tmp/tetNig1/$2/$2.tbl) /bin/cp /tmp/tetNig1/$2/$2.tbl ./ if (-e /tmp/tetNig1/$2/$2.cat) /bin/cp /tmp/tetNig1/$2/$2.cat ./ /bin/rm -fr /tmp/tetNig1/$2/* /bin/rmdir --ignore-fail-on-non-empty /tmp/tetNig1/$2 /bin/rmdir --ignore-fail-on-non-empty /tmp/tetNig1 '_EOF_' # << this line makes emacs coloring happy chmod +x jkStuff/RMTetraodon mkdir RMRun cp /dev/null RMRun/RMJobs foreach c (`cat chrom.lst`) foreach d ($c/chr${c}_?{,?}) set ctg = $d:t foreach f ( $d/${ctg}_?{,?}.fa ) set f = $f:t echo /cluster/data/tetNig1/jkStuff/RMTetraodon \ /cluster/data/tetNig1/$d $f \ '{'check out line+ /cluster/data/tetNig1/$d/$f.out'}' \ >> RMRun/RMJobs end end end #- Do the run ssh kk cd /cluster/data/tetNig1/RMRun para create RMJobs para try, para check, para check, para push, para check,... # para time # Completed: 940 of 940 jobs # CPU time in finished jobs: 6654614s 110910.24m 1848.50h 77.02d 0.211 y # IO & Wait Time: 54843s 914.05m 15.23h 0.63d 0.002 y # Average job time: 7138s 118.96m 1.98h 0.08d # Longest job: 11024s 183.73m 3.06h 0.13d # Submission to last job: 34509s 575.15m 9.59h 0.40d #- Lift up the 500KB chunk .out's to 5MB ("pseudo-contig") level ssh kksilo cd /cluster/data/tetNig1 foreach d (*/chr*_?{,?}) set contig = $d:t echo $contig liftUp $d/$contig.fa.out $d/$contig.lft warn $d/${contig}_*.fa.out \ > /dev/null end #- Lift pseudo-contigs to chromosome level foreach c (`cat chrom.lst`) echo lifting $c cd $c if (-e lift/ordered.lft && ! -z lift/ordered.lft) then liftUp chr$c.fa.out lift/ordered.lft warn `cat lift/oOut.lst` \ > /dev/null endif cd .. end #- Load the .out files into the database with: ssh hgwdev cd /cluster/data/tetNig1 hgLoadOut tetNig1 */chr*.fa.out # MAKE LIFTALL.LFT (DONE, 2004-08-16, hartera) ssh kksilo cd /cluster/data/tetNig1 cat */lift/ordered.lft > jkStuff/liftAll.lft # SIMPLE REPEATS TRACK (DONE, 2004-08-17, hartera) # TRF runs pretty quickly now... it takes a few hours total runtime, # so instead of binrsyncing and para-running, just do this on the # local fileserver ssh kksilo mkdir -p /cluster/data/tetNig1/bed/simpleRepeat cd /cluster/data/tetNig1/bed/simpleRepeat mkdir trf cp /dev/null jobs.csh foreach d (/cluster/data/tetNig1/*/chr*_?{,?}) set ctg = $d:t foreach f ($d/${ctg}.fa) set fout = $f:t:r.bed echo $fout echo "/cluster/bin/i386/trfBig -trf=/cluster/bin/i386/trf $f /dev/null -bedAt=trf/$fout -tempDir=/tmp" \ >> jobs.csh end end chmod a+x jobs.csh csh -ef jobs.csh >&! jobs.log & # check on this with tail -f jobs.log wc -l jobs.csh ls -1 trf | wc -l endsInLf trf/* liftUp simpleRepeat.bed /cluster/data/tetNig1/jkStuff/liftAll.lft warn \ trf/*.bed # Load into the database: ssh hgwdev cd /cluster/data/tetNig1/bed/simpleRepeat hgLoadBed tetNig1 simpleRepeat \ /cluster/data/tetNig1/bed/simpleRepeat/simpleRepeat.bed \ -sqlTable=$HOME/src/hg/lib/simpleRepeat.sql # PROCESS SIMPLE REPEATS INTO MASK (DONE, 2004-08-17, hartera) # After the simpleRepeats track has been built, make a filtered version # of the trf output: keep trf's with period <= 12: ssh kksilo cd /cluster/data/tetNig1/bed/simpleRepeat mkdir -p trfMask foreach f (trf/chr*.bed) awk '{if ($5 <= 12) print;}' $f > trfMask/$f:t end # Lift up filtered trf output to chrom coords as well: cd /cluster/data/tetNig1 mkdir bed/simpleRepeat/trfMaskChrom foreach c (`cat chrom.lst`) if (-e $c/lift/ordered.lst) then perl -wpe 's@(\S+)@bed/simpleRepeat/trfMask/$1.bed@' \ $c/lift/ordered.lst > $c/lift/oTrf.lst liftUp bed/simpleRepeat/trfMaskChrom/chr$c.bed \ jkStuff/liftAll.lft warn `cat $c/lift/oTrf.lst` endif if (-e $c/lift/random.lst) then perl -wpe 's@(\S+)@bed/simpleRepeat/trfMask/$1.bed@' \ $c/lift/random.lst > $c/lift/rTrf.lst liftUp bed/simpleRepeat/trfMaskChrom/chr${c}_random.bed \ jkStuff/liftAll.lft warn `cat $c/lift/rTrf.lst` endif end # MASK SEQUENCE WITH REPEATMASKER AND SIMPLE REPEAT/TRF # (DONE, 2004-08-17, hartera) ssh kksilo cd /cluster/data/tetNig1 # Soft-mask (lower-case) the contig and chr .fa's, # then make hard-masked versions from the soft-masked. set trfCtg=bed/simpleRepeat/trfMask set trfChr=bed/simpleRepeat/trfMaskChrom foreach f (*/chr*.fa) echo "repeat- and trf-masking $f" maskOutFa -soft $f $f.out $f set chr = $f:t:r maskOutFa -softAdd $f $trfChr/$chr.bed $f echo "hard-masking $f" maskOutFa $f hard $f.masked end foreach c (`cat chrom.lst`) echo "repeat- and trf-masking contigs of chr$c" foreach d ($c/chr*_?{,?}) set ctg=$d:t set f=$d/$ctg.fa maskOutFa -soft $f $f.out $f maskOutFa -softAdd $f $trfCtg/$ctg.bed $f maskOutFa $f hard $f.masked end end # Build nib files, using the soft masking in the fa mkdir nib foreach f (*/chr*.fa) faToNib -softMask $f nib/$f:t:r.nib end # STORING O+O SEQUENCE AND ASSEMBLY INFORMATION (DONE, 2004-08-17, hartera) # Make symbolic links from /gbdb/tetNig1/nib to the real nibs. ssh hgwdev cd /cluster/data/tetNig1 mkdir -p /gbdb/tetNig1/nib foreach f (/cluster/data/tetNig1/nib/chr*.nib) ln -s $f /gbdb/tetNig1/nib end # Load /gbdb/tetNig1/nib paths into database and save size info # hgNibSeq creates chromInfo table hgNibSeq -preMadeNib tetNig1 /gbdb/tetNig1/nib */chr*.fa echo "select chrom,size from chromInfo" | hgsql -N tetNig1 > chrom.sizes # take a look at chrom.sizes, should be 27 lines wc chrom.sizes # Make one big 2bit file as well, and make a link to it in # /gbdb/tetNig1/nib because hgBlat looks there: faToTwoBit */chr*.fa tetNig1.2bit ln -s /cluster/data/tetNig1/tetNig1.2bit /gbdb/tetNig1/nib/ # MAKE GOLD AND GAP TRACKS (DONE, 2004-08-17, hartera) ssh hgwdev cd /cluster/data/tetNig1 # the gold and gap tracks are created from the chrN.agp file hgGoldGapGl -noGl -chromLst=chrom.lst tetNig1 /cluster/data/tetNig1 . # MAKE HGCENTRALTEST ENTRY AND TRACKDB TABLE FOR TETNIG1 # (DONE, 2004-06-18, hartera) # UPDATED SOURCE AND ASSEMBLY DATE IN dbDb (DONE, 2004-09-09, hartera) # Make trackDb table so browser knows what tracks to expect: ssh hgwdev mkdir -p ~/kent/src/hg/makeDb/trackDb/tetraodon/tetNig1 cd $HOME/kent/src/hg/makeDb/trackDb cvs up -d -P # Edit that makefile to add tetNig1 in all the right places and do make update make alpha cvs commit -m "Added tetNig1." makefile # Add dbDb and defaultDb entries: echo 'insert into dbDb (name, description, nibPath, organism, \ defaultPos, active, orderKey, genome, scientificName, \ htmlPath, hgNearOk, hgPbOk, sourceName) \ values("tetNig1", "May 2002", \ "/gbdb/tetNig1/nib", "Tetraodon", "chr1:1000-4000", 1, \ 39, "Tetraodon", "Tetraodon nigroviridis", \ "/gbdb/tetNig1/html/description.html", 0, 0, \ "Genoscope and Whitehead Institute, V6");' \ | hgsql -h genome-testdb hgcentraltest echo 'insert into defaultDb (genome, name) \ values ("Tetraodon", "tetNig1");' \ | hgsql -h genome-testdb hgcentraltest # add correct assembly date and source (2004-09-08, hartera) echo 'update dbDb set description = "Feb 2004" where name = "tetNig1";' \ | hgsql -h genome-testdb hgcentraltest echo 'update dbDb set sourceName = "Genoscope and Broad Institute, V7" \ where name = "tetNig1";' | hgsql -h genome-testdb hgcentraltest # correction to assembly date format (2004-09-09) echo 'update dbDb set description = "Feb. 2004" where name = "tetNig1";' \ | hgsql -h genome-testdb hgcentraltest # MAKE DESCRIPTION/SAMPLE POSITION HTML PAGE (DONE, 2004-08-23, hartera) ssh hgwdev mkdir /cluster/data/tetNig1/html cd /cluster/data/tetNig1/html # make a symbolic link from /gbdb/tetNig1/html to /cluster/data/tetNig1/html mkdir /gbdb/tetNig1 ln -s /cluster/data/tetNig1/html /gbdb/tetNig1/html # Add a description page for zebrafish cd /cluster/data/tetNig1/html cp /cluster/data/fr1/html/*.html . # Edit this for zebrafish # create a description.html page here mkdir -p ~/kent/src/hg/makeDb/trackDb/tetraodon/tetNig1 cd ~/kent/src/hg/makeDb/trackDb/ cvs add tetraodon cd tetraodon cvs add tetNig1 # Add description page here too cp /cluster/data/tetNig1/html/description.html \ $HOME/kent/src/hg/makeDb/trackDb/tetraodon/tetNig1/ chmod a+r \ $HOME/kent/src/hg/makeDb/trackDb/tetraodon/tetNig1/description.html cd $HOME/kent/src/hg/makeDb/trackDb/tetraodon/tetNig1/ # Check it in and copy (ideally using "make alpha" in trackDb) to # /gbdb/tetNig1/html cvs add description.html cvs commit description.html # PUT MASKED SEQUENCE OUT FOR CLUSTER RUNS (DONE, 2004-08-18, hartera) ssh kkr1u00 # Chrom-level mixed nibs that have been repeat- and trf-masked: rm -rf /iscratch/i/tetNig1/nib mkdir -p /iscratch/i/tetNig1/nib cp -p /cluster/data/tetNig1/nib/chr*.nib /iscratch/i/tetNig1/nib # Pseudo-contig fa that have been repeat- and trf-masked: rm -rf /iscratch/i/tetNig1/trfFa mkdir /iscratch/i/tetNig1/trfFa foreach d (/cluster/data/tetNig1/*/chr*_?{,?}) cp $d/$d:t.fa /iscratch/i/tetNig1/trfFa end rm -rf /iscratch/i/tetNig1/rmsk mkdir -p /iscratch/i/tetNig1/rmsk cp -p /cluster/data/tetNig1/*/chr*.fa.out /iscratch/i/tetNig1/rmsk cp -p /cluster/data/tetNig1/tetNig1.2bit /iscratch/i/tetNig1/ iSync # add to /cluster/bluearc too ssh kksilo # masked contigs rm -fr /cluster/bluearc/scratch/tetra/tetNig1/trfFa mkdir -p /cluster/bluearc/scratch/tetra/tetNig1/trfFa foreach d (/cluster/data/tetNig1/*/chr*_?{,?}) cp $d/$d:t.fa /cluster/bluearc/scratch/tetra/tetNig1/trfFa end # masked chrom nibs rm -fr /cluster/bluearc/scratch/tetra/tetNig1/nib mkdir -p /cluster/bluearc/scratch/tetra/tetNig1/nib cp -p /cluster/data/tetNig1/nib/chr*.nib /cluster/bluearc/scratch/tetra/tetNig1/nib # fasta files - CHECK FOR RANDOMS cd /cluster/data/tetNig1 rm -fr /cluster/bluearc/scratch/tetra/tetNig1/fasta mkdir -p /cluster/bluearc/scratch/tetra/tetNig1/fasta cp -p */*.fa /cluster/bluearc/scratch/tetra/tetNig1/fasta # RepeatMasker *.out files - CHECK FOR RANDOMS rm -rf /cluster/bluearc/scratch/tetra/tetNig1/rmsk mkdir -p /cluster/bluearc/scratch/tetra/tetNig1/rmsk cp -p ./*/chr*.fa.out /cluster/bluearc/scratch/tetra/tetNig1/rmsk # CREATE gc5Base wiggle TRACK (DONE, 2004-08-19, hartera) ssh kki mkdir /cluster/data/tetNig1/bed/gc5Base cd /cluster/data/tetNig1/bed/gc5Base # in the script below, the 'grep -w GC' selects the lines of # output from hgGcPercent that are real data and not just some # information from hgGcPercent. The awk computes the number # of bases that hgGcPercent claimed it measured, which is not # necessarily always 5 if it ran into gaps, and then the division # by 10.0 scales down the numbers from hgGcPercent to the range # [0-100]. Two columns come out of the awk print statement: # and which are fed into wigAsciiToBinary through # the pipe. It is set at a dataSpan of 5 because each value # represents the measurement over five bases beginning with # . The result files end up in ./wigData5. # A new script is used (from makeHg17.doc) which gets around the # problem that wigAsciiToBinary was calculating chromEnd to be # beyond the real chromosome end mkdir wigData5 dataLimits5 cat << '_EOF_' > kkRun.sh #!/bin/sh NIB=$1 chr=${NIB/.nib/} chrom=${chr#chr} hgGcPercent -chr=${chr} -doGaps -file=stdout -win=5 tetNig1 \ /iscratch/i/tetNig1/nib | \ grep -w GC | \ awk '{if (($3-$2) >= 5) {printf "%d\t%.1f\n", $2+1, $5/10.0} }' | \ wigAsciiToBinary -dataSpan=5 -chrom=${chr} \ -wibFile=wigData5/gc5Base_${chrom} \ -name=${chrom} stdin 2> dataLimits5/${chr} '_EOF_' # << this line makes emacs coloring happy chmod +x kkRun.sh ls /iscratch/i/tetNig1/nib > nibList cat << '_EOF_' > gsub #LOOP ./kkRun.sh $(path1) #ENDLOOP '_EOF_' # << this line makes emacs coloring happy gensub2 nibList single gsub jobList para create jobList para try, check, push ... etc # para time # Completed: 27 of 27 jobs # CPU time in finished jobs: 587s 9.79m 0.16h 0.01d 0.000 y # IO & Wait Time: 16s 0.26m 0.00h 0.00d 0.000 y # Average job time: 22s 0.37m 0.01h 0.00d # Longest job: 187s 3.12m 0.05h 0.00d # Submission to last job: 623s 10.38m 0.17h 0.01d # load the .wig files back on hgwdev: ssh hgwdev cd /cluster/data/tetNig1/bed/gc5Base # up to here hgLoadWiggle -pathPrefix=/gbdb/tetNig1/wib/gc5Base \ tetNig1 gc5Base wigData5/*.wig # and symlink the .wib files into /gbdb mkdir -p /gbdb/tetNig1/wib/gc5Base ln -s `pwd`/wigData5/*.wib /gbdb/tetNig1/wib/gc5Base # And then the zoomed data view ssh kki cd /cluster/data/tetNig1/bed/gc5Base mkdir wigData5_1K dataLimits5_1K cat << '_EOF_' > kkRunZoom.sh #!/bin/sh NIB=$1 chr=${NIB/.nib/} chrom=${chr#chr} hgGcPercent -chr=${chr} -doGaps -file=stdout -win=5 tetNig1 \ /iscratch/i/tetNig1/nib | \ grep -w GC | \ awk '{if (($3-$2) >= 5) {printf "%d\t%.1f\n", $2+1, $5/10.0} }' | \ wigZoom -dataSpan=1000 stdin | wigAsciiToBinary -dataSpan=1000 \ -chrom=${chr} -wibFile=wigData5_1K/gc5Base_${chrom}_1K \ -name=${chrom} stdin 2> dataLimits5_1K/${chr} '_EOF_' # << this line makes emacs coloring happy chmod +x kkRunZoom.sh cat << '_EOF_' > gsubZoom #LOOP ./kkRunZoom.sh $(path1) #ENDLOOP '_EOF_' # << this line makes emacs coloring happy gensub2 nibList single gsubZoom jobListZoom para create jobListZoom para try, check, push, ... etc. # para time # Completed: 27 of 27 jobs # CPU time in finished jobs: 571s 9.51m 0.16h 0.01d 0.000 y # IO & Wait Time: 40s 0.67m 0.01h 0.00d 0.000 y # Average job time: 23s 0.38m 0.01h 0.00d # Longest job: 223s 3.72m 0.06h 0.00d # Submission to last job: 986s 16.43m 0.27h 0.01d # Then load these .wig files into the same database as above ssh hgwdev cd /cluster/data/tetNig1/bed/gc5Base hgLoadWiggle -pathPrefix=/gbdb/tetNig1/wib/gc5Base \ -oldTable tetNig1 gc5Base wigData5_1K/*.wig # and symlink these .wib files into /gbdb mkdir -p /gbdb/tetNig1/wib/gc5Base ln -s `pwd`/wigData5_1K/*.wib /gbdb/tetNig1/wib/gc5Base # CREATE 10.OOC, 11.OOC FILES FOR BLAT (DONE, 2004-08-20, hartera) # Use -repMatch=130 (based on size - for human 1024 was used, Tetraodon # is about 380 Mb in length and it is 12.7% the size of the human genome) ssh kkr1u00 mkdir /cluster/data/tetNig1/bed/ooc cd /cluster/data/tetNig1/bed/ooc mkdir -p /cluster/bluearc/tetNig1 ls -1 /cluster/data/tetNig1/nib/chr*.nib > nib.lst blat nib.lst /dev/null /dev/null -tileSize=11 \ -makeOoc=/cluster/bluearc/tetNig1/11.ooc -repMatch=130 # Wrote 8776 overused 11-mers to /cluster/bluearc/tetNig1/11.ooc # For 10.ooc, repMatch = 4096 for human, so use 520 blat nib.lst /dev/null /dev/null -tileSize=10 \ -makeOoc=/cluster/bluearc/tetNig1/10.ooc -repMatch=520 # Wrote 1976 overused 11-mers to /cluster/bluearc/tetNig1/11.ooc # keep copies of ooc files in this directory and copy to iscratch cp /cluster/bluearc/tetNig1/*.ooc . cp -p /cluster/bluearc/tetNig1/*.ooc /iscratch/i/tetNig1/ iSync # AUTO UPDATE GENBANK MRNA AND EST RUN (DONE, 2004-08-24, hartera and markd) ssh eieio cd /cluster/data/genbank # This is a new organism, edit the etc/genbank.conf file and add: # tetNig1 (Tetraodon) tetNig1.genome = /iscratch/i/tetNig1/nib/chr*.nib tetNig1.lift = /cluster/data/tetNig1/jkStuff/liftAll.lft tetNig1.downloadDir = tetNig1 # Default includes native genbank mRNAs and ESTs, # genbank xeno mRNAs but no xenoESTs, native RefSeq mRNAs but # not xeno RefSeq cvs commit -m "Added Tetraodon." etc/genbank.conf # Edit src/align/gbBlat to add /iscratch/i/tetNig1/11.ooc cd ~/kent/src/hg/makeDb/genbank # Add line: TETNIG_OOC=/iscratch/i/tetNig1/11.ooc # Add line: tetNig*) oocOpt="-ooc=${TETNIG_OOC}" ;; cvs diff src/align/gbBlat make cvs update src/align/gbBlat cvs commit -m "Added 11.ooc for tetNig1." src/align/gbBlat # Edit src/lib/gbGenome.c to add new genome # ADD: static char *tetNigNames[] = {"Tetraodon nigroviridis", NULL}; # ADD: {"tetNig", tetNigNames, NULL}, # to static struct dbToSpecies dbToSpeciesMap[] cvs diff src/lib/gbGenome.c make cvs update src/lib/gbGenome.c cvs commit -m "Added Tetraodon, tetNig." src/lib/gbGenome.c # Install to /cluster/data/genbank: make install-server ssh eieio cd /cluster/data/genbank # This is an -initial run, RefSeq mRNA only: nice bin/gbAlignStep -srcDb=refseq -type=mrna -verbose=1 -initial tetNig1 # Load results for RefSeq mRNAs: ssh hgwdev cd /cluster/data/genbank # There are no RefSeq mRNAs for Tetraodon # -initial for GenBank mRNAs nice ./bin/gbAlignStep -initial -srcDb=genbank -type=mrna tetNig1 # Load results for GenBank mRNAs ssh hgwdev cd /cluster/data/genbank nice bin/gbDbLoadStep -verbose=1 tetNig1 cd /cluster/data/genbank/work mv initial.tetNig1 initial.tetNig1.genbank.mrna # -initial for ESTs ssh eieio cd /cluster/data/genbank nice bin/gbAlignStep -srcDb=genbank -type=est -verbose=1 -initial tetNig1 # Load results for GenBank ESTs ssh hgwdev cd /cluster/data/genbank nice bin/gbDbLoadStep -verbose=1 tetNig1 cd /cluster/data/genbank/work mv initial.tetNig1 initial.tetNig1.genbank.est # Everything loaded ok, so clean up rm -r /cluster/data/genbank/work/initial.tetNig1* # BLASTZ SWAP FOR hg17 vs tetNig1 BLASTZ TO CREATE tetNig1 vs hg17 BLASTZ # (DONE, 2004-08-26, hartera) ssh kolossus mkdir -p /cluster/data/tetNig1/bed/blastz.hg17.swap cd /cluster/data/tetNig1/bed/blastz.hg17.swap # use axtChrom from blastzTetNig1 on hg17 set aliDir = /cluster/data/hg17/bed/blastz.tetNig1 cp $aliDir/S1.len S2.len cp $aliDir/S2.len S1.len mkdir unsorted axtChrom cat $aliDir/axtChrom/chr*.axt \ | axtSwap stdin $aliDir/S1.len $aliDir/S2.len stdout \ | axtSplitByTarget stdin unsorted # Sort the shuffled .axt files. foreach f (unsorted/*.axt) echo sorting $f:t:r axtSort $f axtChrom/$f:t end du -sh $aliDir/axtChrom unsorted axtChrom # 709M /cluster/data/hg17/bed/blastz.tetNig1/axtChrom # 710M unsorted # 709M axtChrom rm -r unsorted # translate sorted axt files into psl ssh kolossus cd /cluster/data/tetNig1/bed/blastz.hg17.swap mkdir -p pslChrom set tbl = "blastzHg17" foreach f (axtChrom/chr*.axt) set c=$f:t:r echo "Processing chr $c" /cluster/bin/i386/axtToPsl $f S1.len S2.len pslChrom/${c}_${tbl}.psl end # Load database tables ssh hgwdev cd /cluster/data/tetNig1/bed/blastz.hg17.swap/pslChrom foreach f (./*.psl) /cluster/bin/i386/hgLoadPsl tetNig1 $f echo "$f Done" end # CHAIN HUMAN (hg17) BLASTZ (DONE, 2004-08-27, hartera) # Run axtChain on little cluster ssh kki cd /cluster/data/tetNig1/bed/blastz.hg17.swap mkdir -p axtChain/run1 cd axtChain/run1 mkdir out chain ls -1S /cluster/data/tetNig1/bed/blastz.hg17.swap/axtChrom/*.axt \ > input.lst cat << '_EOF_' > gsub #LOOP doChain {check in exists $(path1)} {check out line+ chain/$(root1).chain} {check out line+ out/$(root1).out} #ENDLOOP '_EOF_' # << this line makes emacs coloring happy # Make our own linear gap file with reduced gap penalties, # in hopes of getting longer chains - works well for species at # chicken-human distance or greater cat << '_EOF_' > ../../chickenHumanTuned.gap tablesize^V 11 smallSize^V 111 position^V 1^V 2^V 3^V 11^V 111^V 2111^V 12111^V 32111^V 72111^V 152111^V 252111 qGap^V 325^V 360^V 400^V 450^V 600^V 1100^V 3600^V 7600^V 15600^V 31600^V 56600 tGap^V 325^V 360^V 400^V 450^V 600^V 1100^V 3600^V 7600^V 15600^V 31600^V 56600 bothGap^V 625^V 660^V 700^V 750^V 900^V 1400^V 4000^V 8000^V 16000^V 32000^V 57000 '_EOF_' # << this line makes emacs coloring happy cat << '_EOF_' > doChain #!/bin/csh axtChain -linearGap=../../chickenHumanTuned.gap $1 \ /iscratch/i/tetNig1/nib \ /iscratch/i/gs.18/build35/bothMaskedNibs $2 >& $3 '_EOF_' # << this line makes emacs coloring happy chmod a+x doChain gensub2 input.lst single gsub jobList para create jobList para try, check, push, check... # para time # Completed: 27 of 27 jobs # CPU time in finished jobs: 2207s 36.78m 0.61h 0.03d 0.000 y # IO & Wait Time: 124s 2.07m 0.03h 0.00d 0.000 y # Average job time: 86s 1.44m 0.02h 0.00d # Longest job: 160s 2.67m 0.04h 0.00d # Submission to last job: 334s 5.57m 0.09h 0.00d # now on the cluster server, sort chains ssh kksilo cd /cluster/data/tetNig1/bed/blastz.hg17.swap/axtChain chainMergeSort run1/chain/*.chain > all.chain chainSplit chain all.chain # take a look at score distr's foreach f (chain/*.chain) grep chain $f | awk '{print $2;}' | sort -nr > /tmp/score.$f:t:r echo $f:t:r >> hist5000.out textHistogram -binSize=5000 /tmp/score.$f:t:r >> hist5000.out echo "" end # apart from chrUn_random, not too many chains with score < 5000 # load chr 1 into table ssh hgwdev cd /cluster/data/tetNig1/bed/blastz.hg17.swap/axtChain/chain hgLoadChain tetNig1 chr1_chainHg17 chr1.chain # no RefSeqs for Tetraodon so use mrna table # featureBits -chrom=chr1 tetNig1 all_mrna chainHg17Link -enrichment # all_mrna 2.637%, chainHg17Link 9.262%, both 0.772%, cover 29.28%, enrich 3.16x # try filtering with minScore < 5000 ssh kksilo cd /cluster/data/tetNig1/bed/blastz.hg17.swap/axtChain mv all.chain all.chain.unfiltered chainFilter -minScore=5000 all.chain.unfiltered > all.chain chainSplit chainFilt5k all.chain ssh hgwdev cd /cluster/data/tetNig1/bed/blastz.hg17.swap/axtChain/chainFilt5k hgLoadChain tetNig1 chr1_chainHg17Filt5k chr1.chain # featureBits -chrom=chr1 tetNig1 all_mrna chainHg17Filt5kLink -enrichment # all_mrna 2.637%, chainHg17Filt5kLink 8.746%, both 0.746%, cover 28.29%, # enrich 3.23x # add all chains for minScore=5000 filtered chains # remove test chain tables for chr1 ssh hgwdev cd /cluster/data/tetNig1/bed/blastz.hg17.swap/axtChain hgsql -e "drop table chr1_chainHg17;" tetNig1 hgsql -e "drop table chr1_chainHg17Link;" tetNig1 hgsql -e "drop table chr1_chainHg17Filt5k;" tetNig1 hgsql -e "drop table chr1_chainHg17Filt5kLink;" tetNig1 mv chainFilt5k chain cd /cluster/data/tetNig1/bed/blastz.hg17.swap/axtChain/chain foreach i (*.chain) set c = $i:r hgLoadChain tetNig1 ${c}_chainHg17 $i echo done $c end # NET HUMAN (hg17) BLASTZ (DONE, 2004-08-27, hartera) ssh kksilo cd /cluster/data/tetNig1/bed/blastz.hg17.swap/axtChain mkdir preNet cd chain foreach i (*.chain) echo preNetting $i /cluster/bin/i386/chainPreNet $i ../../S1.len ../../S2.len \ ../preNet/$i end cd .. mkdir n1 cd preNet foreach i (*.chain) set n = $i:r.net echo primary netting $i /cluster/bin/i386/chainNet $i -minSpace=1 ../../S1.len ../../S2.len \ ../n1/$n /dev/null end cd .. cat n1/*.net | /cluster/bin/i386/netSyntenic stdin noClass.net # memory usage 55447552, utime 259 s/100, stime 33 # Add classification info using db tables: cd /cluster/data/tetNig1/bed/blastz.hg17.swap/axtChain # netClass looks for ancient repeats in one of the databases # hg17 has this table - hand-curated by Arian but this is for # human-rodent comparisons so do not use here, use -noAr option mkdir -p /cluster/bluearc/hg17/linSpecRep.notInTetraodon mkdir -p /cluster/bluearc/tetNig1/linSpecRep.notInHuman cp -p /iscratch/i/hg17/linSpecRep.notInTetraodon/* \ /cluster/bluearc/hg17/linSpecRep.notInTetraodon cp -p /iscratch/i/tetNig1/linSpecRep.notInHuman/* \ /cluster/bluearc/tetNig1/linSpecRep.notInHuman ssh hgwdev cd /cluster/data/tetNig1/bed/blastz.hg17.swap/axtChain time netClass noClass.net tetNig1 hg17 hg17.net \ -tNewR=/cluster/bluearc/tetNig1/linSpecRep.notInHuman \ -qNewR=/cluster/bluearc/hg17/linSpecRep.notInTetraodon -noAr # 47.430u 31.160s 2:15.46 58.0% 0+0k 0+0io 197pf+0w netFilter -minGap=10 hg17.net | hgLoadNet tetNig1 netHg17 stdin # featureBits tetNig1 all_mrna netHg17 -enrichment # all_mrna 3.204%, netHg17 42.716%, both 1.698%, cover 53.00%, enrich 1.24x # create symlinks to make consistent with newer builds 2006-04-09 markd ln -s all.chain.gz /cluster/data/tetNig1/bed/blastz.hg17/axtChain/tetNig1.hg17.all.chain.gz ln -s hg17.net.gz /cluster/data/tetNig1/bed/blastz.hg17/axtChain/tetNig1.hg17.net.gz # BLASTZ FOR FR1 (FUGU) (DONE, 2004-08-31, hartera) ssh kkr1u00 # blastz requires lineage-specific repeats but there are none # available between these two fish species to make empty files mkdir -p /iscratch/i/tetNig1/linSpecRep.notInFugu mkdir -p /iscratch/i/fugu/linSpecRep.notInZebrafish cd /cluster/data/tetNig1 # for Tetraodon foreach c (`cat chrom.lst`) cp /dev/null /iscratch/i/tetNig1/linSpecRep.notInFugu/chr${c}.out.spec end # for Fugu cp /dev/null /iscratch/i/fugu/linSpecRep.notInZebrafish/chrUn.out.spec iSync ssh kk mkdir -p /cluster/data/tetNig1/bed/blastz.fr1.2004-08-31 ln -s /cluster/data/tetNig1/bed/blastz.fr1.2004-08-31 \ /cluster/data/tetNig1/bed/blastz.fr1 cd /cluster/data/tetNig1/bed/blastz.fr1 cat << '_EOF_' > DEF # Tetraodon (tetNig1) vs. Fugu (fr1) export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/i386:/cluster/home/angie/schwartzbin ALIGN=blastz-run BLASTZ=blastz BLASTZ_L=8000 BLASTZ_H=2200 #BLASTZ_ABRIDGE_REPEATS=1 if SMSK is specified BLASTZ_ABRIDGE_REPEATS=0 # TARGET - Tetraodon (tetNig1) SEQ1_DIR=/iscratch/i/tetNig1/nib SEQ1_RMSK= # lineage-specific repeats # we don't have that information for these species so these files are empty SEQ1_SMSK=/iscratch/i/tetNig1/linSpecRep.notInFugu SEQ1_FLAG= SEQ1_IN_CONTIGS=0 # 10 MB chunk for target SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY - Fugu (fr1) # soft-masked chrom nib SEQ2_DIR=/cluster/bluearc/fugu/fr1/chromNib SEQ2_RMSK= SEQ2_SMSK=/iscratch/i/fugu/linSpecRep.notInZebrafish SEQ2_FLAG= SEQ2_IN_CONTIGS=0 # 10 Mbase for query SEQ2_CHUNK=10000000 SEQ2_LAP=0 BASE=/cluster/data/tetNig1/bed/blastz.fr1 DEF=$BASE/DEF RAW=$BASE/raw CDBDIR=$BASE SEQ1_LEN=$BASE/S1.len SEQ2_LEN=$BASE/S2.len #DEBUG=1 '_EOF_' # << this line keeps emacs coloring happy # save the DEF file in the current standard place chmod +x DEF cp DEF ~angie/hummus/DEF.tetNig1-fr1.2004-08-31 # setup cluster run # source the DEF file bash . ./DEF /cluster/data/tetNig1/jkStuff/BlastZ_run0.sh cd run.0 # check batch looks ok then para try, check, push, check, .... # para time # Completed: 1995 of 1995 jobs # CPU time in finished jobs: 502856s 8380.93m 139.68h 5.82d 0.016 y # IO & Wait Time: 1190285s 19838.08m 330.63h 13.78d 0.038 y # Average job time: 849s 14.14m 0.24h 0.01d # Longest job: 19130s 318.83m 5.31h 0.22d # Submission to last job: 19415s 323.58m 5.39h 0.22d # second cluster run: lift raw alignments -> lav dir cd /cluster/data/tetNig1/bed/blastz.fr1 bash # if a csh/tcsh user . ./DEF /cluster/data/tetNig1/jkStuff/BlastZ_run1.sh cd run.1 para try, check, push, check etc. # para time # Completed: 57 of 57 jobs # IO & Wait Time: 511s 8.52m 0.14h 0.01d 0.000 y # Average job time: 29s 0.48m 0.01h 0.00d # Longest job: 58s 0.97m 0.02h 0.00d # Submission to last job: 853s 14.22m 0.24h 0.01d # third run: lav -> axt ssh kki cd /cluster/data/tetNig1/bed/blastz.fr1 mkdir axtChrom run.2 cd run.2 cat << '_EOF_' > do.csh #!/bin/csh -ef cd $1 cat `ls -1 *.lav | sort -g` \ | lavToAxt stdin /iscratch/i/tetNig1/nib \ /cluster/bluearc/fugu/fr1/chromNib stdout \ | axtSort stdin $2 '_EOF_' # << this line makes emacs coloring happy chmod a+x do.csh cat << '_EOF_' > gsub #LOOP ./do.csh {check in exists $(path1)} {check out line+ /cluster/data/tetNig1/bed/blastz.fr1/axtChrom/$(root1).axt} #ENDLOOP '_EOF_' # << this line makes emacs coloring happy \ls -1Sd ../lav/chr* > chrom.list gensub2 chrom.list single gsub jobList wc -l jobList head jobList para create jobList para try, check, push, check,... # para time # Completed: 27 of 27 jobs # CPU time in finished jobs: 73s 1.22m 0.02h 0.00d 0.000 y # IO & Wait Time: 495s 8.25m 0.14h 0.01d 0.000 y # Average job time: 21s 0.35m 0.01h 0.00d # Longest job: 77s 1.28m 0.02h 0.00d # Submission to last job: 318s 5.30m 0.09h 0.00d # translate sorted axt files into psl ssh kolossus cd /cluster/data/tetNig1/bed/blastz.fr1 mkdir -p pslChrom set tbl = "blastzFr1" foreach f (axtChrom/chr*.axt) set c=$f:t:r echo "Processing chr $c" /cluster/bin/i386/axtToPsl $f S1.len S2.len pslChrom/${c}_${tbl}.psl end # Load database tables ssh hgwdev cd /cluster/data/tetNig1/bed/blastz.fr1/pslChrom foreach f (./*.psl) /cluster/bin/i386/hgLoadPsl -noTNameIx tetNig1 $f echo "$f Done" end # default parameters and H=2000 # featureBits -chrom=chr1 tetNig1 all_mrna blastzFr1 -enrichment # all_mrna 2.637%, blastzFr1 78.364%, both 2.361%, cover 89.53%, enrich 1.14x # repeat with H=2000 and L=6000 - blastzL6k # featureBits -chrom=chr1 tetNig1 all_mrna blastzFr1L6k -enrichment # all_mrna 2.637%, blastzFr1L6k 77.955%, both 2.348%, cover 89.04%, enrich 1.14x# very little difference # featureBits -chrom=chr1 danRer1 all_mrna blastzFr1 -enrichment # all_mrna 0.981%, blastzFr1 5.542%, both 0.592%, cover 60.32%, enrich 10.88x # blastzFr1L6k # BLASTZ_H=2000 # BLASTZ_Y=3400 # BLASTZ_L=6000 # BLASTZ_K=2200 # BLASTZ_Q=/cluster/data/blastz/HoxD55.q # featureBits -chrom=chr1 tetNig1 all_mrna blastzFr1L6k2 -enrichment # all_mrna 2.637%, blastzFr1L6k2 79.917%, both 2.403%, cover 91.12%, # enrich 1.14x # H=2200, default parameters # featureBits -chrom=chr1 tetNig1 all_mrna blastzFr1H2200 -enrichment # all_mrna 2.637%, blastzFr1H2200 78.225%, both 2.359%, cover 89.46%, # enrich 1.14x # rows in chr1_blastzFr1* tables: # blastzFr1 47079 # blastzFr1L6k 26408 # blastzFr1L6k2 52340 # blastzFr1H2200 44240 # blastzFr1H2200L8k 20502 # blastzFr1H2200L10k 17355 # try increasing L to 8000 # H=2200, L=8000 # featureBits -chrom=chr1 tetNig1 all_mrna blastzFr1H2200L8k -enrichment # all_mrna 2.637%, blastzFr1H2200L8k 77.714%, both 2.345%, cover 88.94%, # enrich 1.14x # H=2200 L=10000 # featureBits -chrom=chr1 tetNig1 all_mrna blastzFr1H2200L10k -enrichment # all_mrna 2.637%, blastzFr1H2200L10k 77.568%, both 2.340%, cover 88.72%, # enrich 1.14x # 6 jobs took a very long time and did not finish # looking at some alignments in these tracks with the mRNA track on there is not # much difference between most of them but blastzFr1L6k2 often has more # alignments and a different strucutre. Use blastzFr1H2200L8k as this has # fewer alingments with very little loss of coverage and more lower scoring # alignments can be removed at the chaining step. # CHAIN FR1 BLASTZ (DONE, 2004-09-01, hartera) # Run axtChain on little cluster ssh kki cd /cluster/data/tetNig1/bed/blastz.fr1 mkdir -p axtChain/run1 cd axtChain/run1 mkdir out chain ls -1S /cluster/data/tetNig1/bed/blastz.fr1/axtChrom/*.axt \ > input.lst cat << '_EOF_' > gsub #LOOP doChain {check in exists $(path1)} {check out line+ chain/$(root1).chain} {check out line+ out/$(root1).out} #ENDLOOP '_EOF_' cat << '_EOF_' > doChain #!/bin/csh axtChain $1 \ /iscratch/i/tetNig1/nib \ /cluster/bluearc/fugu/fr1/chromNib $2 >& $3 '_EOF_' # << this line makes emacs coloring happy chmod a+x doChain gensub2 input.lst single gsub jobList para create jobList para try, check, push, check... # Completed: 27 of 27 jobs # CPU time in finished jobs: 414s 6.90m 0.11h 0.00d 0.000 y # IO & Wait Time: 252s 4.20m 0.07h 0.00d 0.000 y # Average job time: 25s 0.41m 0.01h 0.00d # Longest job: 118s 1.97m 0.03h 0.00d # Submission to last job: 972s 16.20m 0.27h 0.01d # now on the cluster server, sort chains ssh kksilo cd /cluster/data/tetNig1/bed/blastz.fr1/axtChain chainMergeSort run1/chain/*.chain > all.chain chainSplit chain all.chain # take a look at score distr's,try also with larger bin size. foreach f (chain/*.chain) grep chain $f | awk '{print $2;}' | sort -nr > /tmp/score.$f:t:r echo $f:t:r >> hist5000.out textHistogram -binSize=5000 /tmp/score.$f:t:r >> hist5000.out echo "" end ssh hgwdev cd /cluster/data/tetNig1/bed/blastz.fr1/axtChain/chain hgLoadChain tetNig1 chr1_chainFr1 chr1.chain # no RefSeqs for Tetraodon so use mrna table # featureBits -chrom=chr1 tetNig1 all_mrna chainFr1Link -enrichment # all_mrna 2.637%, chainFr1Link 76.099%, both 2.320%, cover 87.97%, enrich 1.16x # try filtering with minScore < 5000 ssh kksilo cd /cluster/data/tetNig1/bed/blastz.fr1/axtChain mv all.chain all.chain.unfiltered chainFilter -minScore=5000 all.chain.unfiltered > all.chain chainSplit chainFilt5k all.chain ssh hgwdev cd /cluster/data/tetNig1/bed/blastz.fr1/axtChain/chainFilt5k hgLoadChain tetNig1 chr1_chainFr1Filt5k chr1.chain # featureBits -chrom=chr1 tetNig1 all_mrna chainFr1Filt5kLink -enrichment # all_mrna 2.637%, chainFr1Filt5kLink 75.999%, both 2.319%, cover 87.95%, # enrich 1.16x # very little difference in the coverage so use the filtered version ssh hgwdev rm -r chain mv chainFilt5k chain cd /cluster/data/tetNig1/bed/blastz.fr1/axtChain/chain foreach i (*.chain) set c = $i:r hgLoadChain tetNig1 ${c}_chainFr1 $i echo done $c end # NET FUGU (FR1) BLASTZ (DONE, 2004-09-01, hartera) # RELOADED NET WITH CORRECT TABLE NAME (2004-09-02, hartera) ssh kksilo cd /cluster/data/tetNig1/bed/blastz.fr1/axtChain mkdir preNet cd chain foreach i (*.chain) echo preNetting $i /cluster/bin/i386/chainPreNet $i ../../S1.len ../../S2.len \ ../preNet/$i end cd .. mkdir n1 cd preNet foreach i (*.chain) set n = $i:r.net echo primary netting $i /cluster/bin/i386/chainNet $i -minSpace=1 ../../S1.len ../../S2.len \ ../n1/$n /dev/null end cd .. cat n1/*.net | /cluster/bin/i386/netSyntenic stdin noClass.net # memory usage 300011520, utime 1360 s/100, stime 247 # Add classification info using db tables: ssh hgwdev cd /cluster/data/tetNig1/bed/blastz.fr1/axtChain # netClass looks for ancient repeats in one of the databases # hg17 has this table - hand-curated by Arian but this is for # human-rodent comparisons so do not use here, use -noAr option time netClass noClass.net tetNig1 fr1 fuguFr1.net -noAr netFilter -minGap=10 fuguFr1.net | hgLoadNet tetNig1 netFr1 stdin # MAKE Human Proteins track ssh kksilo mkdir -p /cluster/data/tetNig1/blastDb cd /cluster/data/tetNig1 cat */*/*.lft > jkStuff/subLift.lft for i in */*/*_*_*.fa; do ln -s `pwd`/$i blastDb; done # remove *_random_#.fa (not listed) cd blastDb for i in *.fa; do formatdb -i $i -p F 2> /dev/null; done rm *.fa ssh kkr1u00 mkdir -p /iscratch/i/tetNig1/blastDb cd /cluster/data/tetNig1/blastDb cp * /iscratch/i/tetNig1/blastDb (~kent/bin/iSync) 2>&1 > sync.out mkdir -p /cluster/data/tetNig1/bed/tblastn.hg17KG cd /cluster/data/tetNig1/bed/tblastn.hg17KG ls -1S /iscratch/i/tetNig1/blastDb/*.nsq | sed "s/\.nsq//" > tetra.lst exit # back to kksilo cd /cluster/data/tetNig1/bed/tblastn.hg17KG mkdir kgfa # calculate a reasonable number of jobs calc `wc /cluster/data/hg17/bed/blat.hg17KG/hg17KG.psl | awk "{print \\\$1}"`/\(150000/`wc tetra.lst | awk "{print \\\$1}"`\) # 40262/(150000/979) = 262.776653 split -l 263 /cluster/data/hg17/bed/blat.hg17KG/hg17KG.psl kgfa/kg cd kgfa for i in *; do pslxToFa $i $i.fa; rm $i; done cd .. ls -1S kgfa/*.fa > kg.lst mkdir blastOut for i in `cat kg.lst`; do mkdir blastOut/`basename $i .fa`; done tcsh cat << '_EOF_' > blastGsub #LOOP blastSome $(path1) {check in line $(path2)} {check out exists blastOut/$(root2)/q.$(root1).psl } #ENDLOOP '_EOF_' cat << '_EOF_' > blastSome #!/bin/sh BLASTMAT=/iscratch/i/blast/data export BLASTMAT g=`basename $2` f=/tmp/`basename $3`.$g for eVal in 0.1 0.01 0.001 0.0001 0.00001 0.000001 1E-09 1E-11 do if /scratch/blast/blastall -M BLOSUM80 -m 0 -F no -e $eVal -p tblastn -d $1 -i $2 -o $f.8 then mv $f.8 $f.1 break; fi done if test -f $f.1 then if /cluster/bin/i386/blastToPsl $f.1 $f.2 then liftUp -nosort -type=".psl" -nohead $f.3 /cluster/data/tetNig1/jkStuff/subLift.lft warn $f.2 liftUp -nosort -type=".psl" -nohead $f.4 /cluster/data/tetNig1/jkStuff/liftAll.lft warn $f.3 liftUp -nosort -type=".psl" -pslQ -nohead $3.tmp /cluster/data/hg17/bed/blat.hg17KG/protein.lft warn $f.4 mv $3.tmp $3 rm -f $f.1 $f.2 $f.3 $f.4 exit 0 fi fi rm -f $f.1 $f.2 $3.tmp $f.8 $f.3 $f.4 exit 1 '_EOF_' chmod +x blastSome gensub2 tetra.lst kg.lst blastGsub blastSpec # exit from tcsh exit ssh kk cd /cluster/data/tetNig1/bed/tblastn.hg17KG para create blastSpec para try, push # Completed: 144760 of 144760 jobs # CPU time in finished jobs: 22218348s 370305.81m 6171.76h 257.16d 0.705 y # IO & Wait Time: 3507367s 58456.11m 974.27h 40.59d 0.111 y # Average job time: 178s 2.96m 0.05h 0.00d # Longest job: 2810s 46.83m 0.78h 0.03d # Submission to last job: 41709s 695.15m 11.59h 0.48d tcsh cat << '_EOF_' > chainGsub #LOOP chainSome $(path1) #ENDLOOP '_EOF_' cat << '_EOF_' > chainSome (cd $1; cat q.*.psl | simpleChain -prot -outPsl -maxGap=50000 stdin ../c.`basename $1`.psl) '_EOF_' chmod +x chainSome ls -1dS `pwd`/blastOut/kg?? > chain.lst gensub2 chain.lst single chainGsub chainSpec ssh kki cd /cluster/data/tetNig1/bed/tblastn.hg17KG para create chainSpec para push # Completed: 261 of 261 jobs # CPU time in finished jobs: 828s 13.81m 0.23h 0.01d 0.000 y # IO & Wait Time: 19521s 325.34m 5.42h 0.23d 0.001 y # Average job time: 78s 1.30m 0.02h 0.00d # Longest job: 270s 4.50m 0.07h 0.00d # Submission to last job: 3878s 64.63m 1.08h 0.04d exit # back to kksilo cd /cluster/data/tetNig1/bed/tblastn.hg17KG/blastOut for i in kg?? do awk "(\$13 - \$12)/\$11 > 0.6 {print}" c.$i.psl > c60.$i.psl sort -rn c60.$i.psl | pslUniq stdin u.$i.psl awk "((\$1 / \$11) ) > 0.60 { print }" c60.$i.psl > m60.$i.psl echo $i done cat u.*.psl m60* | liftUp -nosort -type=".psl" -nohead stdout /cluster/data/tetNig1/jkStuff/liftAll.lft warn stdin | \ sort -T /tmp -k 14,14 -k 17,17n -k 17,17n | uniq > ../blastHg17KG.psl cd .. ssh hgwdev cd /cluster/data/tetNig1/bed/tblastn.hg17KG hgLoadPsl tetNig1 blastHg17KG.psl exit # back to kksilo rm -rf blastOut # End tblastn # MAKE DOWNLOADABLE SEQUENCE FILES (DONE, 2004-09-08, hartera) ssh kksilo cd /cluster/data/tetNig1 #- Build the .zip files cat << '_EOF_' > jkStuff/zipAll.csh rm -rf zip mkdir zip # chrom AGP's zip -j zip/chromAgp.zip [0-9A-Z]*/chr*.agp # chrom RepeatMasker out files zip -j zip/chromOut.zip [0-9A-Z]*/chr*.fa.out # soft masked chrom fasta zip -j zip/chromFa.zip [0-9A-Z]*/chr*.fa # hard masked chrom fasta zip -j zip/chromFaMasked.zip [0-9A-Z]*/chr*.fa.masked # chrom TRF output files cd bed/simpleRepeat zip ../../zip/chromTrf.zip trfMask/chr*.bed cd ../.. # get GenBank native mRNAs cd /cluster/data/genbank ./bin/i386/gbGetSeqs -db=tetNig1 -native GenBank mrna \ /cluster/data/tetNig1/zip/mrna.fa # get GenBank xeno mRNAs ./bin/i386/gbGetSeqs -db=tetNig1 -xeno GenBank mrna \ /cluster/data/tetNig1/zip/xenoMrna.fa # get native GenBank ESTs ./bin/i386/gbGetSeqs -db=tetNig1 -native GenBank est \ /cluster/data/tetNig1/zip/est.fa cd /cluster/data/tetNig1/zip # zip GenBank native and xeno mRNAs and native ESTs zip -j mrna.zip mrna.fa zip -j xenoMrna.zip xenoMrna.fa zip -j est.zip est.fa '_EOF_' # << this line makes emacs coloring happy nice csh ./jkStuff/zipAll.csh |& tee ./jkStuff/zipAll.log cd jkStuff #- Look at zipAll.log to make sure all file lists look reasonable. # Copy the .zip files to # hgwdev:/usr/local/apache/... cd /cluster/data/tetNig1/zip #- Check zip file integrity: foreach f (*.zip) unzip -t $f > $f.test tail -1 $f.test end wc -l *.zip.test # 29 chromAgp.zip.test # 29 chromFa.zip.test # 29 chromFaMasked.zip.test # 29 chromOut.zip.test # 87 chromTrf.zip.test # 3 est.zip.test # 3 mrna.zip.test # 3 xenoMrna.zip.test # 212 total ssh hgwdev /cluster/data/tetNig1/zip set gp = /usr/local/apache/htdocs/goldenPath/tetNig1 mkdir -p $gp/bigZips cp -p *.zip $gp/bigZips mkdir -p $gp/chromosomes foreach f (../*/chr*.fa) zip -j $gp/chromosomes/$f:t.zip $f end cd $gp/bigZips md5sum *.zip > md5sum.txt cd $gp/chromosomes md5sum *.zip > md5sum.txt # Take a look at bigZips/* and chromosomes/*, update their README.txt's # MAKE VSHG17 DOWNLOADABLES (DONE, 2004-09-08, hartera) ssh kksilo # zip chains and nets cd /cluster/data/tetNig1/bed/blastz.hg17.swap/axtChain cp all.chain hg17.chain zip -j /cluster/data/tetNig1/zip/hg17.chain.zip hg17.chain rm hg17.chain zip -j /cluster/data/tetNig1/zip/hg17.net.zip hg17.net ssh hgwdev # copy chains and nets to downloads area set gp = /usr/local/apache/htdocs/goldenPath/tetNig1 mkdir -p $gp/vsHg17 cd $gp/vsHg17 mv /cluster/data/tetNig1/zip/hg17*.zip . md5sum *.zip > md5sum.txt # move axt files to downloads area and zip cd /cluster/data/tetNig1/bed/blastz.hg17.swap/axtChrom mkdir -p $gp/vsHg17/axtChrom cp -p *.axt $gp/vsHg17/axtChrom cd $gp/vsHg17/axtChrom gzip *.axt md5sum *.gz > md5sum.txt # Copy over & edit README.txt w/pointers to chain, net formats. # MAKE VSFR1 DOWNLOADABLES (DONE, 2004-09-08, hartera) ssh kksilo # zip chains and nets cd /cluster/data/tetNig1/bed/blastz.fr1/axtChain cp all.chain fr1.chain zip -j /cluster/data/tetNig1/zip/fr1.chain.zip fr1.chain rm fr1.chain cp fuguFr1.net fr1.net zip -j /cluster/data/tetNig1/zip/fr1.net.zip fr1.net rm fr1.net ssh hgwdev # copy chains and nets to downloads area set gp = /usr/local/apache/htdocs/goldenPath/tetNig1 mkdir -p $gp/vsFr1 cd $gp/vsFr1 mv /cluster/data/tetNig1/zip/fr1*.zip . md5sum *.zip > md5sum.txt # move axt files to downloads area and zip cd /cluster/data/tetNig1/bed/blastz.fr1/axtChrom mkdir -p $gp/vsFr1/axtChrom cp -p *.axt $gp/vsFr1/axtChrom cd $gp/vsFr1/axtChrom gzip *.axt md5sum *.gz > md5sum.txt # Copy over & edit README.txt w/pointers to chain, net formats. # BLASTZ SWAP FOR mm5 vs tetNig1 BLASTZ TO CREATE tetNig1 vs MOUSE mm5 BLASTZ # (DONE, 2004-09-08, hartera) ssh kksilo mkdir -p /cluster/store8/tetNig1/blastz.mm5.swap.2004-09-08 ln -s /cluster/store8/tetNig1/blastz.mm5.swap.2004-09-08 \ /cluster/data/tetNig1/bed ln -s /cluster/data/tetNig1/bed/blastz.mm5.swap.2004-09-08 \ /cluster/data/tetNig1/bed/blastz.mm5.swap ssh kolossus cd /cluster/data/tetNig1/bed/blastz.mm5.swap # use axtChrom from blastzTetNig1 on mm5 set aliDir = /cluster/data/mm5/bed/blastz.tetNig1 cp $aliDir/S1.len S2.len cp $aliDir/S2.len S1.len mkdir unsorted axtChrom cat $aliDir/axtChrom/chr*.axt \ | axtSwap stdin $aliDir/S1.len $aliDir/S2.len stdout \ | axtSplitByTarget stdin unsorted # Sort the shuffled .axt files. foreach f (unsorted/*.axt) echo sorting $f:t:r axtSort $f axtChrom/$f:t end du -sh $aliDir/axtChrom unsorted axtChrom # 522M /cluster/data/mm5/bed/blastz.tetNig1/axtChrom # 523M unsorted # 522M axtChrom rm -r unsorted # translate sorted axt files into psl ssh kolossus cd /cluster/data/tetNig1/bed/blastz.mm5.swap mkdir -p pslChrom set tbl = "blastzMm5" foreach f (axtChrom/chr*.axt) set c=$f:t:r echo "Processing chr $c" /cluster/bin/i386/axtToPsl $f S1.len S2.len pslChrom/${c}_${tbl}.psl end # Load database tables ssh hgwdev cd /cluster/data/tetNig1/bed/blastz.mm5.swap/pslChrom foreach f (./*.psl) /cluster/bin/i386/hgLoadPsl tetNig1 $f echo "$f Done" end # featureBits -chrom=chr1 tetNig1 all_mrna blastzMm5 -enrichment # all_mrna 2.637%, blastzMm5 12.633%, both 0.852%, cover 32.30%, enrich 2.56x # CHAIN MOUSE (mm5) BLASTZ (DONE, 2004-09-09, hartera) # Run axtChain on little cluster ssh kki cd /cluster/data/tetNig1/bed/blastz.mm5.swap mkdir -p axtChain/run1 cd axtChain/run1 mkdir out chain ls -1S /cluster/data/tetNig1/bed/blastz.mm5.swap/axtChrom/*.axt \ > input.lst cat << '_EOF_' > gsub #LOOP doChain {check in exists $(path1)} {check out line+ chain/$(root1).chain} {check out line+ out/$(root1).out} #ENDLOOP '_EOF_' # << this line makes emacs coloring happy # Make our own linear gap file with reduced gap penalties, # in hopes of getting longer chains - works well for species at # chicken-human distance or greater cat << '_EOF_' > ../../chickenHumanTuned.gap tablesize^V 11 smallSize^V 111 position^V 1^V 2^V 3^V 11^V 111^V 2111^V 12111^V 32111^V 72111^V 152111^V 252111 qGap^V 325^V 360^V 400^V 450^V 600^V 1100^V 3600^V 7600^V 15600^V 31600^V 56600 tGap^V 325^V 360^V 400^V 450^V 600^V 1100^V 3600^V 7600^V 15600^V 31600^V 56600 bothGap^V 625^V 660^V 700^V 750^V 900^V 1400^V 4000^V 8000^V 16000^V 32000^V 57000 '_EOF_' # << this line makes emacs coloring happy cat << '_EOF_' > doChain #!/bin/csh axtChain -linearGap=../../chickenHumanTuned.gap $1 \ /iscratch/i/tetNig1/nib \ /iscratch/i/mus/mm5/softNib $2 >& $3 '_EOF_' # << this line makes emacs coloring happy chmod a+x doChain gensub2 input.lst single gsub jobList para create jobList para try, check, push, check... # para time # Completed: 27 of 27 jobs # CPU time in finished jobs: 2124s 35.40m 0.59h 0.02d 0.000 y # IO & Wait Time: 59s 0.99m 0.02h 0.00d 0.000 y # Average job time: 81s 1.35m 0.02h 0.00d # Longest job: 123s 2.05m 0.03h 0.00d # Submission to last job: 3725s 62.08m 1.03h 0.04d # now on the cluster server, sort chains ssh kksilo cd /cluster/data/tetNig1/bed/blastz.mm5.swap/axtChain chainMergeSort run1/chain/*.chain > all.chain chainSplit chain all.chain # take a look at score distr's foreach f (chain/*.chain) grep chain $f | awk '{print $2;}' | sort -nr > /tmp/score.$f:t:r echo $f:t:r >> hist5000.out textHistogram -binSize=5000 /tmp/score.$f:t:r >> hist5000.out echo "" end # apart from chrUn_random, not too many chains with score < 5000 # load chr 1 into table ssh hgwdev cd /cluster/data/tetNig1/bed/blastz.mm5.swap/axtChain/chain hgLoadChain tetNig1 chr1_chainMm5 chr1.chain # no RefSeqs for Tetraodon so use mrna table # featureBits -chrom=chr1 tetNig1 all_mrna chainMm5Link -enrichment # all_mrna 2.637%, chainMm5Link 11.552%, both 0.823%, cover 31.19%, enrich 2.70x # try filtering with minScore < 5000 ssh kksilo cd /cluster/data/tetNig1/bed/blastz.mm5.swap/axtChain mv all.chain all.chain.unfiltered chainFilter -minScore=5000 all.chain.unfiltered > all.chain chainSplit chainFilt5k all.chain ssh hgwdev cd /cluster/data/tetNig1/bed/blastz.mm5.swap/axtChain/chainFilt5k hgLoadChain tetNig1 chr1_chainMm5Filt5k chr1.chain # featureBits -chrom=chr1 tetNig1 all_mrna chainMm5Filt5kLink -enrichment # all_mrna 2.637%, chainMm5Filt5kLink 10.492%, both 0.780%, cover 29.58%, # enrich 2.82x # use chains with minScore=5000 filtering # remove test chain tables for chr1 ssh hgwdev cd /cluster/data/tetNig1/bed/blastz.mm5.swap/axtChain hgsql -e "drop table chr1_chainMm5;" tetNig1 hgsql -e "drop table chr1_chainMm5Link;" tetNig1 hgsql -e "drop table chr1_chainMm5Filt5k;" tetNig1 hgsql -e "drop table chr1_chainMm5Filt5kLink;" tetNig1 rm -r chain mv chainFilt5k chain cd /cluster/data/tetNig1/bed/blastz.mm5.swap/axtChain/chain foreach i (*.chain) set c = $i:r hgLoadChain tetNig1 ${c}_chainMm5 $i echo done $c end # NET MOUSE (mm5) BLASTZ (DONE, 2004-09-09, hartera) ssh kksilo cd /cluster/data/tetNig1/bed/blastz.mm5.swap/axtChain mkdir preNet cd chain foreach i (*.chain) echo preNetting $i /cluster/bin/i386/chainPreNet $i ../../S1.len ../../S2.len \ ../preNet/$i end cd .. mkdir n1 cd preNet foreach i (*.chain) set n = $i:r.net echo primary netting $i /cluster/bin/i386/chainNet $i -minSpace=1 ../../S1.len ../../S2.len \ ../n1/$n /dev/null end cd .. cat n1/*.net | /cluster/bin/i386/netSyntenic stdin noClass.net # memory usage 67969024, utime 336 s/100, stime 36 # Add classification info using db tables: cd /cluster/data/tetNig1/bed/blastz.mm5.swap/axtChain # netClass looks for ancient repeats in one of the databases # hg17 has this table - hand-curated by Arian but this is for # human-rodent comparisons so do not use here, use -noAr option mkdir -p /cluster/bluearc/mm5/linSpecRep.notInTetraodon mkdir -p /cluster/bluearc/tetNig1/linSpecRep.notInMouse cp -p /iscratch/i/mm5/linSpecRep.notInTetraodon/* \ /cluster/bluearc/mm5/linSpecRep.notInTetraodon cp -p /iscratch/i/tetNig1/linSpecRep.notInMouse/* \ /cluster/bluearc/tetNig1/linSpecRep.notInMouse ssh hgwdev cd /cluster/data/tetNig1/bed/blastz.mm5.swap/axtChain time netClass noClass.net tetNig1 mm5 mouseMm5.net \ -tNewR=/cluster/bluearc/tetNig1/linSpecRep.notInMouse \ -qNewR=/cluster/bluearc/mm5/linSpecRep.notInTetraodon -noAr # 50.960u 34.880s 2:25.82 58.8% 0+0k 0+0io 200pf+0w netFilter -minGap=10 mouseMm5.net | hgLoadNet tetNig1 netMm5 stdin # featureBits tetNig1 all_mrna netHg17 -enrichment # all_mrna 3.204%, netHg17 42.716%, both 1.698%, cover 53.00%, enrich 1.24x # MAKE VSMM5 DOWNLOADABLES (DONE, 2004-09-10, hartera) ssh kksilo # zip chains and nets cd /cluster/data/tetNig1/bed/blastz.mm5.swap/axtChain cp all.chain mm5.chain zip -j /cluster/data/tetNig1/zip/mm5.chain.zip mm5.chain rm mm5.chain cp mouseMm5.net mm5.net zip -j /cluster/data/tetNig1/zip/mm5.net.zip mm5.net rm mm5.net ssh hgwdev # copy chains and nets to downloads area set gp = /usr/local/apache/htdocs/goldenPath/tetNig1 mkdir -p $gp/vsMm5 cd $gp/vsMm5 mv /cluster/data/tetNig1/zip/mm5*.zip . md5sum *.zip > md5sum.txt # move axt files to downloads area and zip cd /cluster/data/tetNig1/bed/blastz.mm5.swap/axtChrom mkdir -p $gp/vsMm5/axtChrom cp -p *.axt $gp/vsMm5/axtChrom cd $gp/vsMm5/axtChrom gzip *.axt md5sum *.gz > md5sum.txt # Copy over & edit README.txt w/pointers to chain, net formats. # CLEANUP HUMAN (hg17) BLASTZ (DONE, 2004-09-10, hartera) ssh kksilo cd /cluster/data/tetNig1/bed/blastz.hg17.swap nice rm axtChain/run1/chain/* & nice gzip {axt,psl}Chrom/* axtChain/{all.chain,*.net} & # CLEANUP FUGU (fr1) BLASTZ (DONE, 2004-09-10, hartera) ssh kksilo cd /cluster/data/tetNig1/bed/blastz.fr1 nice rm -rf raw & nice rm -rf lav & nice rm -rf test* & nice rm axtChain/run1/chain/* & nice gzip {axt,psl}Chrom/* axtChain/{all.chain,*.net} & # CLEANUP MOUSE (mm5) BLASTZ (DONE, 2004-09-10, hartera) ssh kksilo cd /cluster/data/tetNig1/bed/blastz.mm5 nice rm -rf raw & nice rm -rf lav & nice rm -rf test* & nice rm axtChain/run1/chain/* & nice gzip {axt,psl}Chrom/* axtChain/{all.chain,*.net} & # GENSCAN GENE PREDICTION ANNOTATIONS (DONE, 2004-09-10, hartera) # Provided by Genoscope, Genscan was run on masked Tetraodon genomic DNA # using parameters calibrated for Tetraodon genes. ssh kksilo mkdir -p /cluster/data/tetNig1/bed/Genoscope/genscan cd /cluster/data/tetNig1/bed/Genoscope/genscan awk '{if ($2 == "GSC") print;}' /cluster/data/tetNig1/tetraodon.all.gff > genscan.gff foreach c (`cat /cluster/data/tetNig1/chrom.lst`) echo "Processing chr$c ... " awk '{if ($1 == "'chr${c}'") print;}' genscan.gff >> chr${c}.gff end # need to remove CDS from name then load foreach f (./chr*.gff) perl -pi.bak -e 's/CDS G/G/' $f end # load database with gene predictions ssh hgwdev cd /cluster/data/tetNig1/bed/Genoscope/genscan # Reloaded without -genePredExt 1/6/05: nice ldHgGene tetNig1 genscan chr*.gff # Read 56120 transcripts in 227294 lines in 27 files # 56120 groups 27 seqs 1 sources 2 feature types # 28060 gene predictions # GENOSCOPE GAZE ANNOTATIONS (DONE, 2004-09-10, hartera) # GAZE annotations have been automatically generated by the GAZE software # (Howe et al.(2002). Genome Res., 12: 1418-27) using a custom designed gene # model. GAZE integrates annotation information from Geneid, Genescan, # Exofish (Human, Mouse, Fugu), Genewise (Human, Mouse) and cDNAs (Tetraodon). ssh kksilo mkdir -p /cluster/data/tetNig1/bed/Genoscope/gaze cd /cluster/data/tetNig1/bed/Genoscope/gaze awk '{if ($2 == "GSTEN") print;}' /cluster/data/tetNig1/tetraodon.all.gff > gaze.gff # remove ; as separators between names of gene, protein, mRNA etc. perl -pi.bak -e "s/\s\;\s/\t/g;" gaze.gff foreach c (`cat /cluster/data/tetNig1/chrom.lst`) echo "Processing chr$c ... " awk '{if ($1 == "'chr${c}'") print;}' gaze.gff >> chr${c}.gff end # need to remove CDS from name then load foreach f (./chr*.gff) perl -pi.bak -e 's/CDS G/G/' $f end # load database with gene predictions ssh hgwdev cd /cluster/data/tetNig1/bed/Genoscope/gaze # loaded chr1 only into rahseq first to check then load all chroms # Reloaded without -genePredExt 1/6/05: nice ldHgGene tetNig1 gaze chr*.gff # GALT 09-15-04 index on name(10) going to name(17) as all have long prefix echo "drop index name on gaze ;" | hgsql tetNig1 echo "create index name on gaze (name(17)) ;" | hgsql tetNig1 # Read 91905 transcripts in 265439 lines in 27 files # 91905 groups 27 seqs 1 sources 4 feature types # 27918 gene predictions # HOX GENES TRACK (DONE, 2004-09-10, hartera) # Provided by Genoscope: Tetraodon HOX genes were annotated by human experts # using sequence similarities with human, mouse and zebrafish HOX protein and # nucleic sequence. ssh kksilo mkdir -p /cluster/data/tetNig1/bed/Genoscope/hoxGenes cd /cluster/data/tetNig1/bed/Genoscope/hoxGenes awk '{if ($2 == "HOX") print;}' /cluster/data/tetNig1/tetraodon.all.gff > hoxGenes.gff # remove ; as separators between names of gene, protein, mRNA etc. perl -pi.bak -e "s/\s\;\s/\t/g;" hoxGenes.gff foreach c (`cat /cluster/data/tetNig1/chrom.lst`) echo "Processing chr$c ... " awk '{if ($1 == "'chr${c}'") print;}' hoxGenes.gff >> chr${c}.gff end # need to remove CDS from name then load foreach f (./chr*.gff) perl -pi.bak -e 's/CDS H/H/' $f end # load database with gene predictions ssh hgwdev cd /cluster/data/tetNig1/bed/Genoscope/hoxGenes # loaded chr1 only into rahseq first to check then load all chroms # Reloaded without -genePredExt 1/6/05: nice ldHgGene tetNig1 hoxGenes chr*.gff # Read 96 transcripts in 142 lines in 27 files # 96 groups 6 seqs 1 sources 2 feature types # 48 gene predictions # CYTOKINE GENES TRACK (DONE, 2004-09-10, hartera) # Provided by Genoscope: Tetraodon Cytokine genes were annotated by human # experts using sequence similarities with human, mouse and zebrafish # Cytokine protein and nucleic sequence. ssh kksilo mkdir -p /cluster/data/tetNig1/bed/Genoscope/cytokines cd /cluster/data/tetNig1/bed/Genoscope/cytokines awk '{if ($2 == "CYT") print;}' /cluster/data/tetNig1/tetraodon.all.gff > cytokines.gff # remove ; as separators between names of gene, protein, mRNA etc. perl -pi.bak -e "s/\s\;\s/\t/g;" cytokines.gff foreach c (`cat /cluster/data/tetNig1/chrom.lst`) echo "Processing chr$c ... " awk '{if ($1 == "'chr${c}'") print;}' cytokines.gff >> chr${c}.gff end # need to remove CDS from name then load foreach f (./chr*.gff) perl -pi.bak -e 's/CDS A/A/' $f end # load database with gene predictions ssh hgwdev cd /cluster/data/tetNig1/bed/Genoscope/cytokines # loaded chr1 only into rahseq first to check then load all chroms # Reloaded without -genePredExt 1/6/05: nice ldHgGene tetNig1 cytokines chr*.gff # GENEID GENES (DONE 9/22/04 angie) ssh kksilo mkdir /cluster/data/tetNig1/bed/geneid cd /cluster/data/tetNig1/bed/geneid foreach chr (`awk '{print $1;}' ../../chrom.sizes`) wget http://genome.imim.es/genepredictions/T.nigroviridis/tetNigFeb2004/geneid_v1.2/$chr.gtf wget http://genome.imim.es/genepredictions/T.nigroviridis/tetNigFeb2004/geneid_v1.2/$chr.prot end # Add ".1" suffix to each item in .prot's, to match transcript_id's in gtf cp /dev/null geneidPep.fa foreach f (chr*.prot) perl -wpe 's/^(>chr\S+)/$1.1/' $f >> geneidPep.fa end ssh hgwdev cd /cluster/data/tetNig1/bed/geneid ldHgGene -gtf -genePredExt tetNig1 geneid chr*.gtf hgPepPred tetNig1 generic geneidPep geneidPep.fa # GENOSCOPE GENEWISE HUMAN IPI (DONE, 2004-10-14, hartera) ssh kksilo mkdir -p /cluster/data/tetNig1/bed/Genoscope/humanIPI cd /cluster/data/tetNig1/bed/Genoscope/humanIPI # Create README to describe data from Genoscope tetraodon.all.gff.readme awk '{if ($2 == "GWS_H") print;}' /cluster/data/tetNig1/tetraodon.all.gff \ > humanIPI.gff # remove "exon" from name field sed -e 's/exon GWS/GWS/g' humanIPI.gff > humanIPIformat.gff # load database with gene predictions ssh hgwdev cd /cluster/data/tetNig1/bed/Genoscope/humanIPI nice ldHgGene -genePredExt tetNig1 humanIPI humanIPIformat.gff # Read 43384 transcripts in 185494 lines in 1 files # 43384 groups 27 seqs 1 sources 2 feature types # 21692 gene predictions # add to trackDb and write html page # GENOSCOPE HUMAN IPI ECOTIGS (DONE, 2004-10-18, hartera) ssh kksilo mkdir -p /cluster/data/tetNig1/bed/Genoscope/humanIPIEcotigs cd /cluster/data/tetNig1/bed/Genoscope/humanIPIEcotigs # Create README to describe data from Genoscope tetraodon.all.gff.readme awk '{if ($2 == "EP3_H") print;}' /cluster/data/tetNig1/tetraodon.all.gff \ > humanIPIEcotigs.gff # remove "exon" from name field sed -e 's/exon EP3/EP3/g' humanIPIEcotigs.gff > humanIPIEcotigsformat.gff # load database with gene predictions - LOAD IN RAHSEQ FIRST TO CHECK ssh hgwdev cd /cluster/data/tetNig1/bed/Genoscope/humanIPIEcotigs nice ldHgGene -genePredExt tetNig1 humanIPIEcotigs humanIPIEcotigsformat.gff # Read 55800 transcripts in 204355 lines in 1 files # 55800 groups 27 seqs 1 sources 2 feature types # 27900 gene predictions # GENOSCOPE FUGU (Takifugu) ECOTIGS (in progress, 2004-10-18, hartera) ssh kksilo mkdir -p /cluster/data/tetNig1/bed/Genoscope/fuguEcotigs cd /cluster/data/tetNig1/bed/Genoscope/fuguEcotigs # Create README to describe data from Genoscope tetraodon.all.gff.readme awk '{if ($2 == "EG3_F") print;}' /cluster/data/tetNig1/tetraodon.all.gff \ > fuguEcotigs.gff # remove "HSP" from name field, change HSP to "exon" and match to "transcript" sed -e 's/HSP EG3/EG3/g' fuguEcotigs.gff | sed -e 's/HSP/exon/' \ | sed -e 's/match/transcript/' > fuguEcotigsformat.gff # load database with gene predictions # load in when know which version of Fugu this is so can name appropriately ssh hgwdev cd /cluster/data/tetNig1/bed/Genoscope/fuguEcotigs nice ldHgGene -genePredExt tetNig1 fuguEcotigs fuguEcotigsformat.gff # Read 35552 transcripts in 210128 lines in 1 files # 35552 groups 27 seqs 1 sources 2 feature types # 17776 gene predictions # Adding a photograph to the description # Permission to use photos obtained from Hugues Roest Crollius: # Email message attached below ssh hgwdev cd /cluster/data/tetNig1/html wget \ 'http://www.genoscope.cns.fr/externe/Francais/Projets/Projet_C/img/tetra23.jpg' \ -O photo_tetraodon.jpg convert -crop 1320x600+280+40 -sharpen 0 -normalize \ -geometry 300x200 -quality 80 photo_tetraodon.jpg \ Tetraodon_nigroviridis.jpg cp -p Tetraodon_nigroviridis.jpg /usr/local/apache/htdocs/images # add links to this image in the description.html page, request push # After checking with a couple of journals who have used our pictures for # their covers, it appears that none holds copyrights so you are free to use # any of the pictures on the followings pages: # http://www.genoscope.cns.fr/externe/English/Projets/Projet_C/album.html # For some of the images on the first page we can provide you with high # resolution versions if needed. # I hope this helps. # Best wishes, # Hugues Roest Crollius # ------------------------------------------------------------------------- # Hugues Roest Crollius # Laboratoire Dynamique et Organisation des Genomes (Dyogen) # CNRS UMR8541 # Departement de Biologie # Ecole Normale Superieure Tel: +33 (0)1 44 32 23 70 # 46 rue d'Ulm, 75230 Paris Cedex 05 Email: hrc@ens.fr # SELF BLASTZ (DONE, 2005-01-12, hartera) ssh kksilo rm /cluster/data/tetNig1/bed/blastzSelf mkdir -p /cluster/store8/tetNig1/bed/blastzSelf.2005-01-11 ln -s /cluster/store8/tetNig1/bed/blastzSelf.2005-01-11 \ /cluster/data/tetNig1/bed/blastzSelf cd /cluster/data/tetNig1/bed/blastzSelf # try using contigs mkdir -p /cluster/data/tetNig1/bed/blastzSelf/contigSeqs cd /cluster/data/tetNig1/bed/blastzSelf/contigSeqs foreach c (`cat /cluster/data/tetNig1/chrom.lst`) foreach d (/cluster/data/tetNig1/$c/chr${c}*_?{,?}) echo "splitting $d" set contig = $d:t echo "$contig" ~/bin/i386/faSplit gap $d/$contig.fa 500000 ${contig}_ -lift=$contig.lft \ -minGapSize=100 end end # 1025 files for chr*.fa and chr*.lft - 940 for chr*.fa, 85 for chr*.lft cat chr*.fa >> tetNig1Contigs.fa cat chr*.lft >> 500kbcontigs.lft rm chr*.fa chr*.lft ssh kkr1u00 mkdir -p /iscratch/i/tetNig1/contigs mv /cluster/data/tetNig1/bed/blastzSelf/contigSeqs/tetNig1Contigs.fa \ /iscratch/i/tetNig1/contigs iSync ssh kk cd /cluster/data/tetNig1/bed/blastzSelf # for this blastz run, M=50 cat << '_EOF_' > DEF # tetraodon vs. tetraodon export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/home/angie/schwartzbin:/cluster/home/kent/bin/i386 ALIGN=blastz-run1 BLASTZ=blastz BLASTZ_L=5000 BLASTZ_H=2500 BLASTZ_ABRIDGE_REPEATS=0 # TARGET # Tetraodon SEQ1_DIR=/iscratch/i/tetNig1/nib # not used SEQ1_RMSK= # not used SEQ1_FLAG= SEQ1_SMSK= SEQ1_IN_CONTIGS=0 SEQ1_CHUNK=500000 SEQ1_LAP=500 # QUERY # Tetraodon SEQ2_DIR=/iscratch/i/tetNig1/contigs # not currently used SEQ2_RMSK= # not currently used SEQ2_FLAG= SEQ2_SMSK= SEQ2_IN_CONTIGS=1 SEQ2_CHUNK= SEQ2_LAP= BASE=/cluster/data/tetNig1/bed/blastzSelf DEF=$BASE/DEF RAW=$BASE/raw CDBDIR=$BASE SEQ1_LEN=$BASE/S1.len SEQ2_LEN=$BASE/S2.len #DEBUG=1 '_EOF_' # << this line makes emacs coloring happy # Save the DEF file in the current standard place - TO BE DONE chmod +x DEF cp DEF ~angie/hummus/DEF.tetNig1-tetNig1.2005-01-11 # prepare first cluster run bash # if a csh/tcsh user . ./DEF /cluster/data/tetNig1/jkStuff/BlastZ_run0.sh cd run.0 # check batch looks ok then para try, check, push, check, .... # para time # Completed: 820 of 820 jobs # CPU time in finished jobs: 1611164s 26852.74m 447.55h 18.65d 0.051 y # IO & Wait Time: 11448s 190.80m 3.18h 0.13d 0.000 y # Average job time: 1979s 32.98m 0.55h 0.02d # Longest job: 4231s 70.52m 1.18h 0.05d # Submission to last job: 14753s 245.88m 4.10h 0.17d # second cluster run: lift raw alignments -> lav dir cd /cluster/data/tetNig1/bed/blastzSelf bash # if a csh/tcsh user . ./DEF /cluster/data/tetNig1/jkStuff/BlastZ_run1.sh cd run.1 para try, check, push, check etc. # para time # Completed: 820 of 820 jobs # CPU time in finished jobs: 2741s 45.68m 0.76h 0.03d 0.000 y # IO & Wait Time: 13346s 222.44m 3.71h 0.15d 0.000 y # Average job time: 20s 0.33m 0.01h 0.00d # Longest job: 106s 1.77m 0.03h 0.00d # Submission to last job: 926s 15.43m 0.26h 0.01d # third run: lav -> axt ssh kki cd /cluster/data/tetNig1/bed/blastzSelf mkdir axtChrom run.2 cd run.2 cat << '_EOF_' > do.csh #!/bin/csh -ef cd $1 cat `ls -1 *.lav | sort -g` \ | lavToAxt -fa stdin /iscratch/i/tetNig1/nib \ /iscratch/i/tetNig1/contigs/tetNig1Contigs.fa stdout \ | axtSort stdin $2 '_EOF_' # << this line makes emacs coloring happy chmod a+x do.csh cat << '_EOF_' > gsub #LOOP ./do.csh {check in exists $(path1)} {check out line+ /cluster/data/tetNig1/bed/blastzSelf/axtChrom/$(root1).axt} #ENDLOOP '_EOF_' # << this line makes emacs coloring happy \ls -1Sd ../lav/chr* > chrom.list gensub2 chrom.list single gsub jobList wc -l jobList head jobList para create jobList para try, check, push, check,... # para time # Completed: 27 of 27 jobs # CPU time in finished jobs: 337s 5.61m 0.09h 0.00d 0.000 y # IO & Wait Time: 324s 5.41m 0.09h 0.00d 0.000 y # Average job time: 24s 0.41m 0.01h 0.00d # Longest job: 273s 4.55m 0.08h 0.00d # Submission to last job: 1675s 27.92m 0.47h 0.02d # do liftup here #- Lift up the 500KB chunk .out's to 5MB ("pseudo-contig") level ssh kksilo cd /cluster/data/tetNig1/bed/blastzSelf foreach d (axtChrom/chr*.axt) echo $d liftUp -axtQ ${d}.out.axt ./contigSeqs/500kbcontigs.lft warn $d \ > /dev/null end #- Lift pseudo-contigs to chromosome level # check this is working correctly set tetra = "/cluster/data/tetNig1" foreach c (`cat ${tetra}/chrom.lst`) echo lifting $c liftUp -axtQ ./axtChrom/chr${c}.out2.axt \ $tetra/jkStuff/liftAll.lft warn \ ./axtChrom/chr${c}.axt.out.axt > /dev/null end cd axtChrom # need to drop Self alignments after lifting up co-ordinates foreach c (`cat ${tetra}/chrom.lst`) echo "Processing $c..." axtDropSelf chr${c}.out2.axt chr${c}.dropped.axt mv chr${c}.axt chr${c}.axt.old mv chr${c}.dropped.axt chr${c}.axt rm chr${c}.axt.out.axt end # translate sorted axt files into psl ssh kolossus cd /cluster/data/tetNig1/bed/blastzSelf mkdir -p pslChrom set tbl = "blastzSelf" foreach f (axtChrom/chr*.axt) set c=$f:t:r echo "Processing chr $c" /cluster/bin/i386/axtToPsl $f S1.len S1.len pslChrom/${c}_${tbl}.psl end # Load database tables ssh hgwdev cd /cluster/data/tetNig1/bed/blastzSelf/pslChrom foreach f (./*.psl) /cluster/bin/i386/hgLoadPsl -noTNameIx tetNig1 $f echo "$f Done" end # not in contigs: blastzSelf # featureBits -chrom=chr1 tetNig1 all_mrna blastzSelf -enrichment # all_mrna 2.601%, blastzSelf 16.728%, both 0.654%, cover 25.13%, enrich 1.50x # featureBits -chrom=chr1 tetNig1 blastzSelf # 2704737 bases of 16169091 (16.728%) in intersection # blastzSelfC, query is in contigs, M=50 # featureBits -chrom=chr1 tetNig1 all_mrna blastzSelfC -enrichment # all_mrna 2.601%, blastzSelfC 16.208%, both 0.638%, cover 24.51%, enrich 1.51x # featureBits -chrom=chr1 tetNig1 blastzSelfC # 2620707 bases of 16169091 (16.208%) in intersection # blastzSelfCM100, query is in contigs, M=100 # featureBits -chrom=chr1 tetNig1 all_mrna blastzSelfCM100 -enrichment # all_mrna 2.601%, blastzSelfCM100 16.319%, both 0.639%, cover 24.56%, # enrich 1.51x # featureBits -chrom=chr1 tetNig1 blastzSelfCM100 # 2638606 bases of 16169091 (16.319%) in intersection # gain little by using M=100 except for a lot of extra alignments so use # M=50 instead. # L=5000 # featureBits -chrom=chr1 tetNig1 blastzSelfL5k # 2488953 bases of 16169091 (15.393%) in intersection # L=8000, L8k # featureBits -chrom=chr1 tetNig1 blastzSelfL8k # 2303191 bases of 16169091 (14.244%) in intersection # rows in table # blastzSelf: 517242 # blastzSelfC: 86073 # blastzSelfCM100: 132429 # blastzSelfCL5k: 62933 # blastzSelfL8k: 43137 # use blastz with contigs, M=50 and L=5000. Using L=5000 removes some of the # short low scoring alignments but not too stringently. After chaining, # further removal of low scoring alignments can be done. # CHAIN SELF BLASTZ (DONE, 2005-01-14, hartera) # Run axtChain on little cluster ssh kki cd /cluster/data/tetNig1/bed/blastzSelf mkdir -p axtChain/run1 cd axtChain/run1 mkdir out chain rm axtChrom/chr*.out2.axt rm axtChrom/chr*.old ls -1S /cluster/data/tetNig1/bed/blastzSelf/axtChrom/*.axt \ > input.lst cat << '_EOF_' > gsub #LOOP doChain {check in exists $(path1)} {check out line+ chain/$(root1).chain} {check out line+ out/$(root1).out} #ENDLOOP '_EOF_' # << this line makes emacs coloring happy # axtChain now using human/chicken gap scoring matrix as default cat << '_EOF_' > doChain #!/bin/csh axtChain $1 \ /iscratch/i/tetNig1/nib \ /iscratch/i/tetNig1/nib $2 >& $3 '_EOF_' # << this line makes emacs coloring happy chmod a+x doChain gensub2 input.lst single gsub jobList para create jobList para try, check, push, check... # Completed: 27 of 27 jobs # CPU time in finished jobs: 864s 14.40m 0.24h 0.01d 0.000 y # IO & Wait Time: 137s 2.28m 0.04h 0.00d 0.000 y # Average job time: 37s 0.62m 0.01h 0.00d # Longest job: 559s 9.32m 0.16h 0.01d # Submission to last job: 1758s 29.30m 0.49h 0.02d # now on the cluster server, sort chains ssh kksilo cd /cluster/data/tetNig1/bed/blastzSelf/axtChain chainMergeSort run1/chain/*.chain > all.chain chainSplit chainRaw all.chain # apply chainAntiRepeat to remove chains that are the result of repeats # and degenerate DNA mkdir chain cd chainRaw foreach f (*.chain) set c = $f:r echo $c nice chainAntiRepeat /cluster/bluearc/tetNig1/nib \ /cluster/bluearc/tetNig1/nib $f \ ../chain/$c.chain end cd .. # take a look at score distr's,try also with larger bin size. foreach f (chain/*.chain) grep chain $f | awk '{print $2;}' | sort -nr > /tmp/score.$f:t:r echo $f:t:r >> hist10k.out textHistogram -binSize=10000 /tmp/score.$f:t:r >> hist10k.out echo "" end # trim to minScore=10000 to cut some of the fluff mkdir chainFilt foreach f (chain/*.chain) chainFilter -minScore=10000 $f > chainFilt/$f:t end # load tables ssh hgwdev cd /cluster/data/tetNig1/bed/blastzSelf/axtChain/chainFilt foreach i (*.chain) set c = $i:r hgLoadChain tetNig1 ${c}_chainSelf $i echo done $c end # filtering on minScore=5000 # featureBits -chrom=chr1 tetNig1 chainSelf5kLink # 2377755 bases of 16169091 (14.706%) in intersection # rows = 362482 # filtering on minScore=10000, use this # featureBits -chrom=chr1 tetNig1 chainSelfLink # 2199465 bases of 16169091 (13.603%) in intersection # rows = 233729 # NET SELF BLASTZ (DONE, 2005-01-14, hartera) ssh kksilo cd /cluster/data/tetNig1/bed/blastzSelf/axtChain mkdir preNet cd chainFilt foreach i (*.chain) echo preNetting $i /cluster/bin/i386/chainPreNet $i ../../S1.len ../../S1.len \ ../preNet/$i end cd .. mkdir n1 cd preNet foreach i (*.chain) set n = $i:r.net echo primary netting $i /cluster/bin/i386/chainNet $i -minSpace=1 ../../S1.len ../../S1.len \ ../n1/$n /dev/null end cd .. cat n1/*.net | /cluster/bin/i386/netSyntenic stdin noClass.net # memory usage 78983168, utime 485 s/100, stime 55 # Add classification info using db tables: # netClass looks for ancient repeats in one of the databases # hg17 has this table - hand-curated by Arian but this is for # human-rodent comparisons so do not use here, use -noAr option ssh hgwdev cd /cluster/data/tetNig1/bed/blastzSelf/axtChain time netClass noClass.net tetNig1 tetNig1 self.net -noAr # 14.530u 4.550s 0:31.26 61.0% 0+0k 0+0io 218pf+0w netFilter -minGap=10 self.net | hgLoadNet tetNig1 netSelf stdin # featureBits tetNig1 netSelf # 184618536 bases of 342403326 (53.918%) in intersection # MAKE Human Proteins track (hg17 DONE 2005-1-15 braney) ssh kksilo mkdir -p /cluster/data/tetNig1/blastDb cd /cluster/data/tetNig1/blastDb for i in `cat ../chrom.lst`; do ls -1 ../$i/*/*.fa ; done | sed "/Un_random/d" > list for i in `cat list` do ln -s $i . formatdb -i `basename $i` -p F done rm *.log *.fa list cd .. for i in `cat chrom.lst`; do cat $i/chr*/*.lft ; done > jkStuff/subChr.lft ssh kkr1u00 mkdir -p /iscratch/i/tetNig1/blastDb cd /cluster/data/tetNig1/blastDb for i in nhr nin nsq; do cp *.$i /iscratch/i/tetNig1/blastDb ; echo $i; done cd iSync > sync.out mkdir -p /cluster/data/tetNig1/bed/tblastn.hg17KG cd /cluster/data/tetNig1/bed/tblastn.hg17KG echo /iscratch/i/tetNig1/blastDb/*.nsq | xargs ls -S | sed "s/\.nsq//" > query.lst # back to kksilo exit cd /cluster/data/tetNig1/bed/tblastn.hg17KG calc `wc /cluster/data/hg17/bed/blat.hg17KG/hg17KG.psl | awk "{print \\\$1}"`/\(300000/`wc query.lst | awk "{print \\\$1}"`\) # 42156/(300000/593) = 83.328360 mkdir -p /cluster/bluearc/tetNig1/bed/tblastn.hg17KG/kgfa split -l 83 /cluster/data/hg17/bed/blat.hg17KG/hg17KG.psl /cluster/bluearc/tetNig1/bed/tblastn.hg17KG/kgfa/kg ln -s /cluster/bluearc/tetNig1/bed/tblastn.hg17KG/kgfa kgfa cd kgfa for i in *; do pslxToFa $i $i.fa; rm $i; done cd .. ls -1S kgfa/*.fa > kg.lst mkdir -p /cluster/bluearc/tetNig1/bed/tblastn.hg17KG/blastOut ln -s /cluster/bluearc/tetNig1/bed/tblastn.hg17KG/blastOut for i in `cat kg.lst`; do mkdir blastOut/`basename $i .fa`; done tcsh cat << '_EOF_' > blastGsub #LOOP blastSome $(path1) {check in line $(path2)} {check out exists blastOut/$(root2)/q.$(root1).psl } #ENDLOOP '_EOF_' cat << '_EOF_' > blastSome #!/bin/sh BLASTMAT=/iscratch/i/blast/data export BLASTMAT g=`basename $2` f=/tmp/`basename $3`.$g for eVal in 0.01 0.001 0.0001 0.00001 0.000001 1E-09 1E-11 do if /scratch/blast/blastall -M BLOSUM80 -m 0 -F no -e $eVal -p tblastn -d $1 -i $2 -o $f.8 then mv $f.8 $f.1 break; fi done if test -f $f.1 then if /cluster/bin/i386/blastToPsl $f.1 $f.2 then liftUp -nosort -type=".psl" -nohead $f.3 ../../jkStuff/subChr.lft carry $f.2 liftUp -nosort -type=".psl" -nohead $f.4 ../../jkStuff/liftAll.lft carry $f.3 liftUp -nosort -type=".psl" -pslQ -nohead $3.tmp /cluster/data/hg17/bed/blat.hg17KG/protein.lft warn $f.4 if pslCheck -prot $3.tmp then mv $3.tmp $3 rm -f $f.1 $f.2 $f.3 $f.4 fi exit 0 fi fi rm -f $f.1 $f.2 $3.tmp $f.8 $f.3 $f.4 exit 1 '_EOF_' chmod +x blastSome gensub2 query.lst kg.lst blastGsub blastSpec # I ended up doing the scaffolds separately from the chopped up chrom segments # but next time I wouldn't gensub2 scaffold.lst kg.lst blastGsub scaffoldBlastSpec ssh kk cd /cluster/data/tetNig1/bed/tblastn.hg17KG para create blastSpec para push # Completed: 301244 of 301244 jobs # CPU time in finished jobs: 25047824s 417463.74m 6957.73h 289.91d 0.794 y # IO & Wait Time: 2658184s 44303.06m 738.38h 30.77d 0.084 y # Average job time: 92s 1.53m 0.03h 0.00d # Longest job: 226899s 3781.65m 63.03h 2.63d # Submission to last job: 250711s 4178.52m 69.64h 2.90d cat << '_EOF_' > chainGsub #LOOP chainSome $(path1) #ENDLOOP '_EOF_' cat << '_EOF_' > chainSome (cd $1; cat q.*.psl | simpleChain -prot -outPsl -maxGap=50000 stdin ../c.`basename $1`.psl) '_EOF_' chmod +x chainSome ls -1dS `pwd`/blastOut/kg?? > chain.lst gensub2 chain.lst single chainGsub chainSpec ssh kki cd /cluster/data/tetNig1/bed/tblastn.hg17KG para create chainSpec para push # Completed: 1287 of 1287 jobs # CPU time in finished jobs: 72236s 1203.94m 20.07h 0.84d 0.002 y # IO & Wait Time: 22886s 381.43m 6.36h 0.26d 0.001 y # Average job time: 74s 1.23m 0.02h 0.00d # Longest job: 3374s 56.23m 0.94h 0.04d # Submission to last job: 5982s 99.70m 1.66h 0.07d exit # back to kksilo cd /cluster/data/tetNig1/bed/tblastn.hg17KG/blastOut # again some weirdness because I did the NA and Un scaffolds separately for i in kg?? do cat c.$i.psl | awk "(\$13 - \$12)/\$11 > 0.6 {print}" > cs60.$i.psl sort -rn cs60.$i.psl | pslUniq stdin us.$i.psl awk "((\$1 / \$11) ) > 0.60 { print }" cs60.$i.psl > ms60.$i.psl echo $i done for i in kg?? do cat $i/c.*.psl | awk "(\$13 - \$12)/\$11 > 0.6 {print}" > c60.$i.psl sort -rn c60.$i.psl | pslUniq stdin u.$i.psl awk "((\$1 / \$11) ) > 0.60 { print }" c60.$i.psl > m60.$i.psl echo $i done cat us.*.psl ms60* | sort -T /tmp -k 14,14 -k 17,17n -k 17,17n | uniq > /cluster/data/tetNig1/bed/tblastn.hg17KG/blastHg17KG.psl cd .. # lift the chrUn and chrNA scaffolds ssh hgwdev cd /cluster/data/tetNig1/bed/tblastn.hg17KG hgLoadPsl tetNig1 blastHg17KG.psl # back to kksilo rm -rf blastOut # End tblastn # GENOSCOPE HUMAN (hg17) ECORES (DONE, 2005-02-07, hartera) ssh kksilo mkdir -p /cluster/data/tetNig1/bed/Genocscope/ecoresHg17 cd /cluster/data/tetNig1/bed/Genoscope/ecoresHg17 wget --timestamp \ http://www.genoscope.cns.fr/externe//4ucsc/ExofishTnig1Hs35 # this is in gff format # remove "Ecotig" from name field sed -e 's/Ecotig EG/EG/g' ExofishTnig1Hs35 > ExofishTnig1Hs35.gff # need to have tabs between fields not a space to load file into table sed -e 's/ /\t/g' ExofishTnig1Hs35.gff > Tnig1Hs35format.gff # if "ecore" is changed to "CDS" and "ecotig" to "transcript" this loads # correctly into the table. sed -e 's/ecore/CDS/' Tnig1Hs35format.gff | sed -e 's/ecotig/transcript/' \ > tetNig1vsHg17.gff ssh hgwdev cd /cluster/data/tetNig1/bed/Genoscope/ecoresHg17 nice ldHgGene tetNig1 ecoresHg17 tetNig1vsHg17.gff # Read 36417 transcripts in 184981 lines in 1 files # 36417 groups 27 seqs 1 sources 2 feature types # 36417 gene predictions # added ecoresHg17 entry to trackDb.ra in trackDb/tetraodon # and created ecoresHg17.html. Genoscope will not be maintaining this # newest data in their Exofish comparative browser display. # ACCESSIONS FOR CONTIGS (DONE 2005-03-17 kate) # Used for Assembly track details, and also ENCODE AGP's cd /cluster/data/tetNig1/bed mkdir -p contigAcc cd contigAcc # extract WGS contig to accession mapping from Entrez Nucleotide summaries # To get the summary file, access the Genbank page for the project # by searching: # genus[ORGN] AND WGS[KYWD] # At the end of the page, click on the list of accessions for the contigs. # Select summary format, and send to file. # output to file .acc grep SCAF tetra.acc | wc -l # 25773 # edit hg/encode/regionAgp/contigAccession.pl to recognize tetra format # and install in /cluster/bin/scripts contigAccession tetra.acc > contigAcc.tab wc -l contigAcc.tab # 25773 grep SCAF /cluster/data/tetNig1/chr.agp | wc -l # 25773 hgsql tetNig1 < $HOME/kent/src/hg/lib/contigAcc.sql echo "LOAD DATA LOCAL INFILE 'contigAcc.tab' INTO TABLE contigAcc" | \ hgsql tetNig1 hgsql tetNig1 -e "SELECT COUNT(*) FROM contigAcc" # 25773 ######################################################################### # MOUSE NET/CHAINS MM6 - Info contained in makeMm6.doc (200503 Hiram) # MAKE 2BIT FILES FOR 500 kb SEQUENCE CONTIGS, CHROMS CONTIGS AND RANDOM # CHROMOSOME SCAFFOLDS # (DONE, 2005-08-24, hartera) ssh hgwdev # 2bit file from 500 kb contigs file used for blastz alignments faToTwoBit /cluster/bluearc/tetNig1/contigs/tetNig1Contigs.fa \ /cluster/bluearc/tetNig1/contigs/tetNig1Contigs.2bit # then make FASTA file and 2bit file for just the ordered chroms and # not the randoms. ssh kkr1u00 mkdir -p /cluster/data/tetNig1/twoBit cd /cluster/data/tetNig1/twoBit grep '>' /iscratch/i/tetNig1/contigs/tetNig1Contigs.fa | sed -e 's/>//' \ > tetNig1.contigs grep -v "random" tetNig1.contigs > tetNig1.contigs.norandoms wc -l tetNig1.contigs.norandoms # 507 tetNig1.contigs.norandoms # get these sequence without the randoms, run randoms as scaffolds faSomeRecords /iscratch/i/tetNig1/contigs/tetNig1Contigs.fa \ tetNig1.contigs.norandoms tetNig1ContigNoRandoms.fa faToTwoBit tetNig1ContigsNoRandoms.fa tetNig1ContigsNoRandoms.2bit mv tetNig1ContigsNoRandoms.2bit /cluster/bluearc/tetNig1/contigs/ cp /cluster/bluearc/tetNig1/contigs/tetNig1ContigsNoRandoms.2bit \ /iscratch/i/tetNig1/contigs/ # get list of scaffolds for randoms - need to make faFrag list to get # sequences: cd /cluster/data/tetNig1/twoBit mkdir -p /cluster/bluearc/tetNig1/randomScaffolds mkdir -p /cluster/bluearc/tetNig1/randomFa mkdir run cd run grep 'random' /cluster/data/tetNig1/chrom.lst > chroms.random foreach c (`cat chroms.random`) set dir=/cluster/data/tetNig1 echo $c cp $dir/${c}/chr{$c}.fa /cluster/bluearc/tetNig1/randomFa/ end cat > getSeqs.csh << 'EOF' #!/bin/csh -ef set inFa=$1 set st=$2 set end=$3 set out=$4 faFrag -mixed $inFa $st $end $out > ${out}.log 'EOF' chmod +x getSeqs.csh awk '{if (($1 ~ /random/) && ($5 ~ /SCAF/)) print "/cluster/data/tetNig1/twoBit/run/getSeqs.csh","/cluster/bluearc/tetNig1/randomFa/"$1".fa",$2-1,$3,"/cluster/bluearc/tetNig1/randomScaffolds/"$5".fa"}' /cluster/data/tetNig1/chr.agp \ > jobList para create jobList awk '{if ($1 ~ /random/ && ($5 !~ /GAP/)) print $5;}' \ /cluster/data/tetNig1/chr.agp > randoms.scaffolds wc -l jobList # 24571 jobList wc -l randoms.scaffolds # 24571 randoms.scaffolds para try, check, push, check etc. # para time # Completed: 24571 of 24571 jobs # CPU time in finished jobs: 249907s 4165.12m 69.42h 2.89d 0.008 y # IO & Wait Time: 93508s 1558.46m 25.97h 1.08d 0.003 y # Average job time: 14s 0.23m 0.00h 0.00d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 3689s 61.48m 1.02h 0.04d # Submission to last job: 3952s 65.87m 1.10h 0.05d # once have all the sequences, cat together and make a 2bit file ssh kolossus cd /cluster/bluearc/tetNig1/randomScaffolds # replace FASTA header with scaffold name and cat sequences together foreach f (SCAF*.fa) set s=$f:r perl -pi.bak -e "s/>chr[0-9Un]+_random:[0-9]+\-[0-9]+/>$s/" $f cat $f >> tetraRandomsScaffolds.fa end faToTwoBit tetraRandomsScaffolds.fa tetraRandomsScaffolds.2bit # then cat to the contigs file without randoms and make a 2bit file cat tetraRandomsScaffolds.fa \ /cluster/data/tetNig1/twoBit/tetNig1ContigNoRandoms.fa \ > tetNig1ChrContigsRandomScafs.fa faToTwoBit tetNig1ChrContigsRandomScafs.fa \ tetNig1ChrContigsRandomScafs.2bit ssh kkr1u00 cd /cluster/bluearc/tetNig1/randomScaffolds # copy 2bit files to iservers cp *.2bit /iscratch/i/tetNig1/contigs cp *.2bit /cluster/bluearc/tetNig1/contigs # rsync to the iservers - this takes a long time these days # so do this with rsync foreach i (2 3 4 5 6 7 8) rsync -a --progress --delete --rsh=rsh \ /iscratch/i/tetNig1/contigs/ kkr${i}u00:/iscratch/i/tetNig1/contigs/ end # make a 2bit file with the full chroms and the random scaffolds ssh kolossus cd /cluster/data/tetNig1/twoBit grep -v "random" /cluster/data/tetNig1/chrom.lst > chromsNoRandoms.lst foreach c (`cat chromsNoRandoms.lst`) cat /cluster/data/tetNig1/${c}/chr${c}.fa >> tetNig1chromsNoRandoms.fa end # then add scaffolds FASTAs cat tetNig1chromsNoRandoms.fa \ /cluster/bluearc/tetNig1/randomScaffolds/tetraRandomsScaffolds.fa \ > tetNig1ChromsRandomScafs.fa faToTwoBit tetNig1ChromsRandomScafs.fa \ tetNig1ChromsRandomScafs.2bit mkdir -p /cluster/bluearc/tetNig1/twoBit mv tetNig1ChromsRandomScafs* /cluster/bluearc/tetNig1/twoBit/ ssh kkstore03 mkdir /cluster/data/tetNig1/liftScaffoldsToChrom cd /cluster/data/tetNig1/liftScaffoldsToChrom foreach c (`cat chromsNoRandoms.lst`) awk 'BEGIN {FS="\t"} {OFS="\t"} {print $2-1,$6,$3,$1;}' \ /cluster/data/tetNig1/${c}/${c}.agp > ${c}.lft awk '{if ($1 == "'${c}'") print $2;}' chrom.sizes ######################################################################### # MOUSE NET/CHAINS MM7 - Info contained in makeMm7.doc (200503 Hiram) ######################################################################### # Prep for the Mm7 net/chains, need a lift file for the randoms # (DONE - 2005-09-20 - Hiram) ssh kkstore003 cd /cluster/data/tetNig1 # We need the lift information from the random.agp files cp -p /cluster/data/cb2/scripts/agpToLift.pl ./jkStuff mkdir liftScaffoldsToChrom for AGP in ?_random/*_random.agp ??_random/*_random.agp do CHR=`dirname ${AGP}` grep -v fragment "${AGP}" | ./jkStuff/agpToLift.pl /dev/stdin \ > liftScaffoldsToChrom/${CHR}.lft done cat liftScaffoldsToChrom/*.lft > jkStuff/chromsAndScafs.lft cp -p jkStuff/chromsAndScafs.lft \ /san/sanvol1/scratch/tetNig1/chromsAndScafs/chromsAndScafs.lft # BLASTZ SWAP FOR ZEBRAFISH (danRer3) (DONE, 2005-10-20, hartera) # CREATE CHAIN AND NET TRACKS, AXTNET, MAFNET, LIFTOVER AND ALIGNMENT DOWNLOADS. # do swap of danRer3 vs. tetNig1 chain and net alignments to # create tetNig1 vs. danRer3. see makeDanRer3.doc for details. ssh hgwdev cd /cluster/data/danRer3/bed/blastz.tetNig1/chromsAndScafsRun nohup nice /cluster/bin/scripts/doBlastzChainNet.pl `pwd`/DEF \ -bigClusterHub=pk \ -swap -chainMinScore=5000 >& doSwap.log & # Start: Thu Oct 20 11:59 Finished: Oct 20 12:25 # Parameters used for danRer3 vs tetNig1 BLASTZ: # BLASTZ_M=50 # BLASTZ_H=2500 # BLASTZ_Q=/cluster/data/blastz/HoxD55.q # #BLASTZ_ABRIDGE_REPEATS=1 if SMSK is specified # BLASTZ_ABRIDGE_REPEATS=0 # Add entries for danRer3 chains and net to trackDb.ra for monDom2 and # Modify track descriptions to describe the process using scaffolds for danRer3 # chrNA and chrUn and the fact that dynamic masking was used for the Blastz # alignments - see makeDanRer3.doc for details. ############################################################################ ## BLASTZ swap from mm8 alignments (DONE - 2006-02-19 - Hiram) ssh kk cd /cluster/data/mm8/bed/blastzTetNig1.2006-02-19 time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=kk -chainMinScore=5000 -chainLinearGap=loose \ -swap `pwd`/DEF > swap.out 2>&1 & time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=kk -chainMinScore=5000 -chainLinearGap=loose \ -swap -continue=net `pwd`/DEF > swap-net.out 2>&1 & time nice -n +19 featureBits tetNig1 chainMm8Link # 47024263 bases of 342403326 (13.734%) in intersection ########################################################################### # BLASTZ SWAP TO CREATE CHAIN AND NET ALIGNMENTS FOR ZEBRAFISH (danRer4) # AND CREATE AXNET, MAFNET, LIFTOVER AND ALIGNMENT DOWNLOADS # (DONE, 2006-04-30, hartera) ssh pk # Used these BLASTZ parameters - see also makeDanRer4.doc # BLASTZ_H=2500 # BLASTZ_M=50 # BLASTZ_Q=/cluster/data/blastz/HoxD55.q # and no lineage-specific repeats. # swap directory for alignments is: # /cluster/data/tetNig1/bed/blastz.danRer4.swap cd /cluster/data/danRer4/bed/blastz.tetNig1.2006-04-29 nice /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \ -swap `pwd`/DEF >& doSwap.log & # Took about 22 minutes. # Check coverage compared to danRer3: featureBits tetNig1 chainDanRer4Link # 82462283 bases of 342403326 (24.083%) in intersection featureBits tetNig1 chainDanRer3Link # 76373873 bases of 342403326 (22.305%) in intersection # No refGene table for tetNig1 so use mrna: featureBits -chrom=chr1 tetNig1 mrna chainDanRer4Link -enrichment # mrna 2.624%, chainDanRer4Link 22.894%, both 1.297%, cover 49.42%, # enrich 2.16x featureBits -chrom=chr1 tetNig1 mrna chainDanRer3Link -enrichment # mrna 2.624%, chainDanRer3Link 21.321%, both 1.205%, cover 45.92%, # enrich 2.15x featureBits -chrom=chr1 tetNig1 mrna netDanRer4 -enrichment # mrna 2.624%, netDanRer4 78.279%, both 2.278%, cover 86.81%, enrich 1.11x featureBits -chrom=chr1 tetNig1 mrna netDanRer3 -enrichment # mrna 2.624%, netDanRer3 95.070%, both 2.435%, cover 92.80%, enrich 0.98x # Similar coverage for chains and nets for danRer4 and danRer3. # nets coverage is a little lower for danRer4 than for danRer3. # Try also Human Proteins (hg17): featureBits -chrom=chr1 tetNig1 blastHg17KG netDanRer4 -enrichment # blastHg17KG 5.728%, netDanRer4 78.279%, both 5.666%, cover 98.92%, # enrich 1.26x featureBits -chrom=chr1 tetNig1 blastHg17KG netDanRer3 -enrichment # blastHg17KG 5.728%, netDanRer3 95.070%, both 5.687%, cover 99.29%, # enrich 1.04x # Similar coverage here too. Looked at chain and net tracks for danRer4 # on the tetNig1 Browser and they look ok. ######################################################################### ## Reorder Fish organisms (DONE - 2006-12-22 - Hiram) hgsql -h genome-testdb hgcentraltest \ -e "update dbDb set orderKey = 460 where name = 'tetNig1';" ######################################################################### # BLASTZ/CHAIN/NET FR2 (DONE - 2007-01-29 - Hiram) ## Align to fr2 scaffolds, ## results lifted to fr2 chrUn coordinates ssh kkstore03 mkdir /cluster/data/tetNig1/bed/blastz.fr2.2007-01-25 cd /cluster/data/tetNig1/bed/blastz.fr2.2007-01-25 cat << '_EOF_' > DEF # Tetraodon vs. Fugu # Try "human-fugu" (more distant, less repeat-killed than mammal) params # +M=50: BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=6000 BLASTZ_K=2200 BLASTZ_M=50 BLASTZ_Q=/cluster/data/blastz/HoxD55.q # TARGET: Tetraodon tetNig1 SEQ1_DIR=/san/sanvol1/scratch/tetNig1/tetNig1.sdTrf.2bit SEQ1_LEN=/san/sanvol1/scratch/tetNig1/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 SEQ1_LIMIT=1 # QUERY: Fugu fr2 # Align to the scaffolds, results lifed up to chrUn.sdTrf coordinates SEQ2_DIR=/san/sanvol1/scratch/fr2/fr2.2bit SEQ2_LEN=/san/sanvol1/scratch/fr2/chrom.sizes SEQ2_CTGDIR=/san/sanvol1/scratch/fr2/fr2.scaffolds.2bit SEQ2_CTGLEN=/san/sanvol1/scratch/fr2/fr2.scaffolds.sizes SEQ2_LIFT=/san/sanvol1/scratch/fr2/liftAll.lft SEQ2_CHUNK=20000000 SEQ2_LIMIT=30 SEQ2_LAP=0 BASE=/cluster/data/tetNig1/bed/blastz.fr2.2007-01-25 TMPDIR=/scratch/tmp '_EOF_' # << this line keeps emacs coloring happy time doBlastzChainNet.pl DEF -chainMinScore=2000 -chainLinearGap=loose \ -tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \ -verbose=2 -bigClusterHub=pk \ -blastzOutRoot /cluster/bluearc/tetNig1Fr2 > do.log 2>&1 & # real 259m39.548s ## Swap to fr2 mkdir /cluster/data/fr2/bed/blastz.tetNig1.swap cd /cluster/data/fr2/bed/blastz.tetNig1.swap time doBlastzChainNet.pl -verbose=2 \ /cluster/data/tetNig1/bed/blastz.fr2.2007-01-25/DEF \ -chainMinScore=2000 -chainLinearGap=loose \ -tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \ -bigClusterHub=pk -swap > swap.log 2>&1 & time doBlastzChainNet.pl -verbose=2 \ /cluster/data/tetNig1/bed/blastz.fr2.2007-01-25/DEF \ -chainMinScore=2000 -chainLinearGap=loose \ -tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \ -continue=net -bigClusterHub=pk -swap > net_swap.log 2>&1 & # real 40m40.471s ssh hgwdev cd /cluster/data/tetNig1/bed/blastz.fr2.2007-01-25 time nice -n +19 featureBits tetNig1 chainFr2Link \ > fb.tetNig1.chainFr2Link.txt 2>&1 # 246828605 bases of 342403326 (72.087%) in intersection cd /cluster/data/fr2/bed/blastz.tetNig1.swap time nice -n +19 featureBits fr2 chainTetNig1Link \ > fb.fr2.chainTetNig1.txt 2>&1 # 247086553 bases of 393312790 (62.822%) in intersection ######################################################################### ## SWAP oryLat1 to tetNig1 (DONE - 2007-01-18 - Hiram) ssh kkstore04 cd /cluster/data/oryLat1/bed/blastz.tetNig1.2006-12-13 cat fb.oryLat1.chainTetNig1Link.txt # 160388236 bases of 700386597 (22.900%) in intersection # And the swap ssh kkstore04 cd /cluster/data/tetNig1/bed mv blastz.oryLat1.swap blastz.oryLat1.swap.2006-12-08 mkdir blastz.oryLat1.swap cd /cluster/data/tetNig1/bed/blastz.oryLat1.swap time doBlastzChainNet.pl -verbose=2 \ /cluster/data/oryLat1/bed/blastz.tetNig1.2006-12-13/DEF \ -chainMinScore=2000 -chainLinearGap=loose \ -tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \ -swap -bigClusterHub=kk > swap.log 2>&1 & time doBlastzChainNet.pl -verbose=2 \ /cluster/data/oryLat1/bed/blastz.tetNig1.2006-12-13/DEF \ -chainMinScore=2000 -chainLinearGap=loose \ -tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \ -continue=net -swap -bigClusterHub=kk > net-swap.log 2>&1 & ssh hgwdev cd /cluster/data/tetNig1/bed/blastz.oryLat1.swap nice -n 19 featureBits -noRandom tetNig1 chainOryLat1Link \ >fb.tetNig1.oryLat1.txt 2>&1 # 130327582 bases of 342403326 (38.063%) in intersection ########################################################################## # N-SCAN gene predictions (nscanGene) - (2007-08-16 markd) cd /cluster/data/tetNig1/bed/nscan/ # obtained NSCAN predictions from michael brent's group # at WUSTL wget http://mblab.wustl.edu/predictions/Tetraodon/tetNig1.gtf wget http://mblab.wustl.edu/predictions/Tetraodon/tetNig1.prot.fa bzip2 tetNig1.* # load tracks. Note that these have *utr features, rather than # exon features. currently ldHgGene creates separate genePred exons # for these. ldHgGene -bin -gtf -genePredExt tetNig1 nscanGene tetNig1.gtf.bz2 # load protein hgPepPred tetNig1 generic nscanPep tetNig1.prot.fa.bz2 rm *.tab # update trackDb; need a tetNig1-specific page to describe informants tetraodon/tetNig1/nscanGene.html (copy from mm8 and edit) tetraodon/tetNig1/trackDb.ra ############################################################################## # SWAP mm9 alignments to tetNig1 (DONE - 2007-09-07 - Hiram) # the original was done at: cd /cluster/data/mm9/bed/blastzTetNig1.2007-09-06 cat fb.mm9.chainTetNig1Link.txt # 46206292 bases of 2620346127 (1.763%) in intersection mkdir /cluster/data/tetNig1/bed/blastz.mm9.swap cd /cluster/data/tetNig1/bed/blastz.mm9.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /cluster/data/mm9/bed/blastzTetNig1.2007-09-06/DEF \ -chainMinScore=5000 \ -qRepeats=windowmaskerSdust -chainLinearGap=loose \ -swap -bigClusterHub=kk > swap.log 2>&1 & # real 19m58.885s cat fb.tetNig1.chainMm9Link.txt # 42256263 bases of 342403326 (12.341%) in intersection ############################################################################## # SWAP DANRER5 BLASTZ RESULT TO GET DANRER5 CHAINS, NETS, # AXTNET, MAFNET AND ALIGNMENT DOWNLOADS (DONE, 2007-09-17, hartera) ssh kkstore04 mkdir /cluster/data/tetNig1/bed/blastz.swap.danRer5 cd /cluster/data/tetNig1/bed/blastz.swap.danRer5 # blastz parameters used to align tetNig1 as query to danRer5 as target: # BLASTZ_H=2500 # BLASTZ_M=50 # BLASTZ_Q=/cluster/data/blastz/HoxD55.q # Results for tetNig1 blastz on danRer5 are in: # /cluster/data/danRer5/bed/blastz.tetNig1.2007-09-16 time nice /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \ -qRepeats=windowmaskerSdust -swap \ /cluster/data/danRer5/bed/blastz.tetNig1.2007-09-16/DEF \ >& swap.log & # 0.141u 0.097s 12:30.60 0.0% 0+0k 0+0io 7pf+0w ssh hgwdev cat \ /cluster/data/tetNig1/bed/blastz.danRer5.swap/fb.tetNig1.chainDanRer5Link.txt # 70448527 bases of 342403326 (20.575%) in intersection # look at coverage of mrna, there is no native RefSeqs # track for tetraodon, tetNig1. featureBits tetNig1 all_mrna chainDanRer5Link -enrichment # all_mrna 3.104%, chainDanRer5Link 20.575%, both 1.382%, cover 44.53%, enrich # 2.16x featureBits tetNig1 all_mrna chainDanRer4Link -enrichment # all_mrna 3.104%, chainDanRer4Link 24.083%, both 1.522%, cover 49.02%, enrich # 2.04x featureBits tetNig1 all_mrna netDanRer5 -enrichment # all_mrna 3.104%, netDanRer5 70.075%, both 2.400%, cover 77.31%, enrich 1.10x featureBits tetNig1 all_mrna netDanRer4 -enrichment # all_mrna 3.104%, netDanRer4 69.480%, both 2.445%, cover 78.78%, enrich 1.13x # clean up a little ssh kkstore04 cd /cluster/data/tetNig1/bed mv ./blastz.swap.danRer5/swap.log ./blastz.danRer5.swap rm -r blastz.swap.danRer5 ln -s blastz.danRer5.swap blastz.danRer5 ############################################################################ # TRANSMAP vertebrate.2008-05-20 build (2008-05-24 markd) vertebrate-wide transMap alignments were built Tracks are created and loaded by a single Makefile. This is available from: svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2008-05-20 see doc/builds.txt for specific details. ############################################################################ ############################################################################ # TRANSMAP vertebrate.2008-06-07 build (2008-06-30 markd) vertebrate-wide transMap alignments were built Tracks are created and loaded by a single Makefile. This is available from: svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2008-06-30 see doc/builds.txt for specific details. ############################################################################ # HUMAN (hg18) PROTEINS TRACK (DONE braney 2008-07-18) ssh kkstore03 # bash if not using bash shell already mkdir /cluster/data/tetNig1/blastDb cd /cluster/data/tetNig1 cat noUn/chr*fa > temp.fa faSplit gap temp.fa 1000000 blastDb/x -lift=blastDb.lft rm temp.fa cat randomContigs/*.fa > temp.fa faSplit sequence temp.fa 150 blastDb/y rm temp.fa cd blastDb for i in *.fa do /cluster/bluearc/blast229/formatdb -i $i -p F done rm *.fa mkdir -p /san/sanvol1/scratch/tetNig1/blastDb cd /cluster/data/tetNig1/blastDb for i in nhr nin nsq; do echo $i cp *.$i /san/sanvol1/scratch/tetNig1/blastDb done mkdir -p /cluster/data/tetNig1/bed/tblastn.hg18KG cd /cluster/data/tetNig1/bed/tblastn.hg18KG echo /san/sanvol1/scratch/tetNig1/blastDb/*.nsq | xargs ls -S | sed "s/\.nsq//" > query.lst wc -l query.lst # 386 query.lst # we want around 150000 jobs calc `wc /cluster/data/hg18/bed/blat.hg18KG/hg18KG.psl | awk "{print \\\$1}"`/\(150000/`wc query.lst | awk "{print \\\$1}"`\) # 36727/(150000/386) = 94.510813 mkdir -p /cluster/bluearc/tetNig1/bed/tblastn.hg18KG/kgfa split -l 95 /cluster/data/hg18/bed/blat.hg18KG/hg18KG.psl /cluster/bluearc/tetNig1/bed/tblastn.hg18KG/kgfa/kg ln -s /cluster/bluearc/tetNig1/bed/tblastn.hg18KG/kgfa kgfa cd kgfa for i in *; do nice pslxToFa $i $i.fa; rm $i; done cd .. ls -1S kgfa/*.fa > kg.lst mkdir -p /cluster/bluearc/tetNig1/bed/tblastn.hg18KG/blastOut ln -s /cluster/bluearc/tetNig1/bed/tblastn.hg18KG/blastOut for i in `cat kg.lst`; do mkdir blastOut/`basename $i .fa`; done cd /cluster/data/tetNig1/bed/tblastn.hg18KG tcsh cat << '_EOF_' > blastGsub #LOOP blastSome $(path1) {check in line $(path2)} {check out exists blastOut/$(root2)/q.$(root1).psl } #ENDLOOP '_EOF_' cat << '_EOF_' > blastSome #!/bin/sh BLASTMAT=/cluster/bluearc/blast229/data export BLASTMAT g=`basename $2` f=/tmp/`basename $3`.$g for eVal in 0.01 0.001 0.0001 0.00001 0.000001 1E-09 1E-11 do if /cluster/bluearc/blast229/blastall -M BLOSUM80 -m 0 -F no -e $eVal -p tblastn -d $1 -i $2 -o $f.8 then mv $f.8 $f.1 break; fi done if test -f $f.1 then if /cluster/bin/i386/blastToPsl $f.1 $f.2 then liftUp -nosort -type=".psl" -nohead $f.3 /cluster/data/tetNig1/blastDb.lft carry $f.2 liftUp -nosort -type=".psl" -pslQ -nohead $3.tmp /cluster/data/hg18/bed/blat.hg18KG/protein.lft warn $f.3 if pslCheck -prot $3.tmp then mv $3.tmp $3 rm -f $f.1 $f.2 $f.3 $f.4 fi exit 0 fi fi rm -f $f.1 $f.2 $3.tmp $f.8 $f.3 $f.4 exit 1 '_EOF_' # << happy emacs chmod +x blastSome gensub2 query.lst kg.lst blastGsub blastSpec exit ssh pk cd /cluster/data/tetNig1/bed/tblastn.hg18KG para create blastSpec # para try, check, push, check etc. para time # Completed: 136074 of 136074 jobs # CPU time in finished jobs: 2945512s 49091.87m 818.20h 34.09d 0.093 y # IO & Wait Time: 569398s 9489.96m 158.17h 6.59d 0.018 y # Average job time: 26s 0.43m 0.01h 0.00d # Longest finished job: 881s 14.68m 0.24h 0.01d # Submission to last job: 146804s 2446.73m 40.78h 1.70d ssh kkstore03 cd /cluster/data/tetNig1/bed/tblastn.hg18KG mkdir chainRun cd chainRun tcsh cat << '_EOF_' > chainGsub #LOOP chainOne $(path1) #ENDLOOP '_EOF_' cat << '_EOF_' > chainOne (cd $1; cat q.*.psl | simpleChain -prot -outPsl -maxGap=100000 stdin /cluster/bluearc/tetNig1/bed/tblastn.hg18KG/blastOut/c.`basename $1`.psl) '_EOF_' chmod +x chainOne ls -1dS /cluster/bluearc/tetNig1/bed/tblastn.hg18KG/blastOut/kg?? > chain.lst gensub2 chain.lst single chainGsub chainSpec # do the cluster run for chaining ssh memk cd /cluster/data/tetNig1/bed/tblastn.hg18KG/chainRun para create chainSpec para try, check, push, check etc. # Completed: 387 of 387 jobs # CPU time in finished jobs: 1982s 33.04m 0.55h 0.02d 0.000 y # IO & Wait Time: 25746s 429.10m 7.15h 0.30d 0.001 y # Average job time: 72s 1.19m 0.02h 0.00d # Longest finished job: 165s 2.75m 0.05h 0.00d # Submission to last job: 2784s 46.40m 0.77h 0.03d ssh kkstore03 cd /cluster/data/tetNig1/bed/tblastn.hg18KG/blastOut for i in kg?? do cat c.$i.psl | awk "(\$13 - \$12)/\$11 > 0.6 {print}" > c60.$i.psl sort -rn c60.$i.psl | pslUniq stdin u.$i.psl awk "((\$1 / \$11) ) > 0.60 { print }" c60.$i.psl > m60.$i.psl echo $i done sort -T /tmp -k 14,14 -k 16,16n -k 17,17n u.*.psl m60* | uniq > /cluster/data/tetNig1/bed/tblastn.hg18KG/unliftBlastHg18KG.psl cd .. pslCheck unliftBlastHg18KG.psl liftUp -nohead temp.psl ../../randomContigs/tetNig1.randomContigs.lift carry unliftBlastHg18KG.psl sort -T /tmp -k 14,14 -k 16,16n -k 17,17n temp.psl > blastHg18KG.psl rm temp.psl # load table ssh hgwdev cd /cluster/data/tetNig1/bed/tblastn.hg18KG hgLoadPsl tetNig1 blastHg18KG.psl # check coverage featureBits tetNig1 blastHg18KG # 19028060 bases of 342403326 (5.557%) in intersection featureBits tetNig1 ensGene:cds blastHg18KG -enrichment # ensGene:cds 9.909%, blastHg18KG 5.557%, both 4.906%, cover 49.51%, enrich 8.91x featureBits tetNig1 nscanGene:cds blastHg18KG -enrichment # nscanGene:cds 9.388%, blastHg18KG 5.557%, both 3.568%, cover 38.01%, enrich 6.84x ssh kkstore03 rm -rf /cluster/data/tetNig1/bed/tblastn.hg18KG/blastOut rm -rf /cluster/bluearc/tetNig1/bed/tblastn.hg18KG/blastOut #end tblastn ########################################################################## # SWAP BLASTZ Medaka oryLat2 (DONE - 2008-08-27 - Hiram) ssh kkstore03 # not too important since everything moved to hive screen # use screen to control this job cd /cluster/data/oryLat2/bed/blastz.tetNig1.2008-08-26 cat fb.oryLat2.chainTetNig1Link.txt # 163171121 bases of 700386597 (23.297%) in intersection # And, for the swap: mkdir /cluster/data/tetNig1/bed/blastz.oryLat2.swap cd /cluster/data/tetNig1/bed/blastz.oryLat2.swap time doBlastzChainNet.pl -chainMinScore=3000 -chainLinearGap=medium \ /cluster/data/oryLat2/bed/blastz.tetNig1.2008-08-26/DEF \ -swap -tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \ -verbose=2 -smallClusterHub=pk -bigClusterHub=pk > swap.log 2>&1 & # real 22m48.350s cat fb.tetNig1.chainOryLat2Link.txt # 139059872 bases of 342403326 (40.613%) in intersection ########################################################################## # add NCBI identifiers to the dbDb (DONE - 2008-10-21 - Hiram) hgsql -e 'update dbDb set sourceName="Genoscope and Broad Institute V7 (NCBI project 12350, CAAE01000000)" where name="tetNig1";' hgcentraltest ########################################################################### ############################################################################ # TRANSMAP vertebrate.2009-07-01 build (2009-07-21 markd) vertebrate-wide transMap alignments were built Tracks are created and loaded by a single Makefile. This is available from: svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2009-07-01 see doc/builds.txt for specific details. ############################################################################ ############################################################################ # TRANSMAP vertebrate.2009-09-13 build (2009-09-20 markd) vertebrate-wide transMap alignments were built Tracks are created and loaded by a single Makefile. This is available from: svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2009-09-13 see doc/builds.txt for specific details. ############################################################################ # LIFTOVER TO TetNig2 (DONE - 2010-01-15 - Hiram ) mkdir /hive/data/genomes/tetNig1/bed/blat.tetNig2.2010-01-15 cd /hive/data/genomes/tetNig1/bed/blat.tetNig2.2010-01-15 # -debug run to create run dir, preview scripts... doSameSpeciesLiftOver.pl -debug tetNig1 tetNig2 # Real run: time nice -n +19 doSameSpeciesLiftOver.pl -verbose=2 \ -bigClusterHub=pk -dbHost=hgwdev -workhorse=hgwdev \ tetNig1 tetNig2 > do.log 2>&1 # real 36m16.693s #############################################################################