# for emacs: -*- mode: sh; -*-

# This file describes how we made the browser database on the
# Fugu Rubripes (Japanese pufferfish), whole genome shotgun assembly dated
# 26 August 2002, from the DOE Joint Genome Institute (v3.0).
# This release contains ~320Mbases, in 20,378 scaffolds.
# Size distribution is: 2 scaffolds ~1Mbase, 17 are 500-1000kbase.
# Kate Rosenbloom 4/03
#
# NOTE(review): commands below are a historical record of a csh/tcsh
# session on the UCSC build hosts; they are not a runnable script.

# NOTE: this doc may have genePred loads that fail to include
# the bin column.  Please correct that for the next build by adding
# a bin column when you make any of these tables:
#
# mysql> SELECT tableName, type FROM trackDb WHERE type LIKE "%Pred%";
# +-----------+---------------------+
# | tableName | type                |
# +-----------+---------------------+
# | ensGene   | genePred ensPep     |
# | genscan   | genePred genscanPep |
# +-----------+---------------------+

# DOWNLOAD THE SEQUENCE FROM JGI (04/22/03 KRR)

ssh eieio
set fugudir = /cluster/store5/Fugu_Rubripes_V3
mkdir $fugudir
cd $fugudir
# NOTE: there are two downloads available on the site -- one
# is named indicating it is masked, and contains additional N's.
# The other file (the one we are using) contains lower-case,
# indicating it is soft-masked.  (As an aside, the mask locations are not
# consistent between the two files).
# We use the soft-masked file, since it has more sequence; however
# we will translate to upper case and repeat mask ourselves,
# so that we can generate a Repeats track.
#
# The file consists of fasta records, each named scaffold_#.
# Unfortunately, the ordering of scaffolds is alphabetic, not numeric,
# so scaffold_1 is followed by scaffold_10, not scaffold_2.
# Also, the scaffold numbering has holes -- the highest numbered
# scaffold is 32607.
wget ftp://ftp.jgi-psf.org/pub/JGI_data/Fugu/fugu_v3.fasta.Z
ln -s $fugudir ~/fr1
cd ~/fr1
gunzip fugu_v3.fasta.Z
# trim off garbage at end of file (list of scaffolds),
# this appears to be an artifact of the release process
mv fugu_v3.fasta fugu_v3.orig.fasta
grep -v '^scaffold' fugu_v3.orig.fasta > fugu_v3.fasta
# force to upper case (see above), but preserve lower case headers
tr '[a-z]' '[A-Z]' < fugu_v3.fasta | \
    sed 's/>SCAFFOLD_/>scaffold_/' > fugu_v3.upper.fasta

# SPLIT THE FASTA FILE INTO SCAFFOLDS FOR FURTHER PROCESSING (DONE KRR)

ssh eieio
set fugudir = /cluster/store5/Fugu_Rubripes_V3
cd $fugudir
mkdir scaffolds
# NOTE: must give a record count larger than the number of scaffolds
# next time, use "faSplit byName" to do this
faSplit sequence fugu_v3.upper.fasta 30000 scaffolds/
# rename files to match scaffold (in fasta header)
# and store in directories (00 .. 32) for convenience and readability
# This produces directories with 200 - 1000 files per dir.
# The directory is the leading two digits of the scaffold number,
# (when represented as a zero-filled 5-digit number)
cd scaffolds
foreach d1 (0 1 2 3)
    foreach d2 (0 1 2 3 4 5 6 7 8 9)
        foreach file ($d1$d2*.fa)
            set num = `sed -n '1s/>scaffold_//p' $file`
            set dir = `printf "%05d" $num | sed 's/...$//'`
            mkdir -p $dir
            echo "moving $file to $dir/scaffold_$num"
            mv $file $dir/scaffold_${num}.fa
        end
    end
end
# remake a whole-genome fasta file, with the scaffold records
# ordered more sensibly (numeric, not alphabetic)
# NOTE(review): the "cd scaffolds" below assumes we are back in
# $fugudir here, but no "cd .." is recorded after the rename loops --
# presumably this was re-run from $fugudir; verify if replaying.
set ordered_file = fugu_v3.ordered.fa
cp /dev/null $ordered_file
cd scaffolds
foreach d ([0-9][0-9])
    set filelist = `(cd $d; ls *.fa | sort -n -k 1.10)`
    foreach f ($filelist)
        echo "concatenating $f"
        cat $d/$f >> ../$ordered_file
    end
end
cd ..

# split into ~500KB "super-scaffolds"
# each file has one or more scaffolds.  Fasta records are preserved
# This creates 577 files, size range ~51000 to ~1.2Mb
rm -fr superscaffolds
mkdir superscaffolds
faSplit about $ordered_file 500000 superscaffolds/ss_

# RUN REPEAT MASKER ON THE SUPER-SCAFFOLDS (6/2/03 KRR)
# note: fugu library ("puffer.lib") is dated 7/9/2002

ssh eieio
cd ~/fr1
# make the run directory, output directory, and job list
mkdir -p RMRun/Un
cp /dev/null RMRun/RMJobs
set fugudir = /cluster/store5/Fugu_Rubripes_V3
cd superscaffolds
foreach f (ss_*.fa)
    echo /cluster/bin/scripts/RMFugu \
        $fugudir/superscaffolds $f $fugudir/RMRun/Un \
        '{'check out line+ $fugudir/RMRun/Un/$f.out'}' \
        >> ../RMRun/RMJobs
end
# do the run
ssh kk
cd ~/fr1/RMRun
para create RMJobs
para try
para check
para push
para check,...

# PROCESS SUPER-SCAFFOLDS FOR SIMPLE REPEATS (6/2/03 KRR)
# TRF runs pretty quickly now... it takes a few hours total runtime,
# so instead of binrsyncing and para-running, just do this on the
# local fileserver

ssh eieio
mkdir ~/fr1/bed/simpleRepeat
cd ~/fr1/bed/simpleRepeat
mkdir trf
cp /dev/null jobs.csh
set fugudir = /cluster/store5/Fugu_Rubripes_V3
foreach f ($fugudir/superscaffolds/*.fa)
    set fout = $f:t:r.bed
    echo $fout
    echo "/cluster/bin/i386/trfBig -trf=/cluster/bin/i386/trf $f /dev/null -bedAt=trf/$fout -tempDir=/tmp" \
        >> jobs.csh
end
tcsh jobs.csh >&! jobs.log &
# check on this with
tail -f jobs.log
wc -l jobs.csh
ls -1 trf | wc -l

# FILTER SIMPLE REPEATS INTO MASK (6/3/03 KRR)
# make a filtered version
# of the trf output:
# keep trf's with period <= 12:

ssh eieio
cd ~/fr1/bed/simpleRepeat
mkdir -p trfMask
foreach f (trf/*.bed)
    echo "filtering $f"
    awk '{if ($5 <= 12) print;}' $f > trfMask/$f:t
end

# MASK SUPER-SCAFFOLDS USING REPEATMASKER AND FILTERED TRF FILES (6/3/03 KRR)

ssh eieio
cd ~/fr1
mkdir superscaffolds.masked
foreach p (superscaffolds/ss_*.fa)
    set f = $p:t
    echo "masking $f"
    maskOutFa $p RMRun/Un/$f.out superscaffolds.masked/$f -soft
    maskOutFa superscaffolds.masked/$f bed/simpleRepeat/trfMask/${f:r}.bed superscaffolds.masked/$f -softAdd
end

# CREATE FASTA FOR UNORDERED CHROM FROM MASKED SUPER-SCAFFOLDS (6/3/03 KRR)
# Note: scaffolds are separated by 1000 Bp gaps

ssh eieio
cd ~/fr1
# remake a whole-genome fasta file from the masked super-scaffolds
set masked_file = fugu_v3.masked.fa
cd superscaffolds.masked
cat `ls ss_*.fa | sort -n -k 1.4` > ../$masked_file
cd ..
# generate AGP and lift files from the chromosome fasta
scaffoldFaToAgp $masked_file
# gap size is 1000, total gaps: 20379
# chrom size is 349519338
mkdir Un
mv fugu_v3.masked.agp Un/chrUn.agp
mv fugu_v3.masked.lft Un/chrUn.lft
# reran scaffoldFaToAgp with new version to generate a gap
# file that contains gaps within contig
mv fugu_v3.masked.gap Un/chrUn.gap
# Create chromosome FA file from AGP and file of masked scaffolds
cd Un
agpToFa -simpleMultiMixed chrUn.agp chrUn chrUn.fa ../fugu_v3.masked.fa

# CREATING DATABASE (5/2/03 KRR)

# Create the database.
ssh hgwdev
echo 'create database fr1' | hgsql ''
# Make a semi-permanent read-only alias:
alias fr1 "mysql -u hguser -phguserstuff -A fr1"
# Make sure there is at least 5 gig free for the database
df -h /var/lib/mysql

# STORE SEQUENCE AND ASSEMBLY INFORMATION (6/3/03 KRR)

# Translate to nib
ssh eieio
cd ~/fr1
mkdir nib
faToNib -softMask Un/chrUn.fa nib/chrUn.nib
# Make symbolic links from /gbdb/fr1/nib to the real nibs.
ssh hgwdev
mkdir -p /gbdb/fr1/nib
set fugudir = /cluster/store5/Fugu_Rubripes_V3
ln -s $fugudir/nib/chrUn.nib /gbdb/fr1/nib
# Load /gbdb/fr1/nib paths into database and save size info.
ssh hgwdev
hgsql fr1 < ~/src/hg/lib/chromInfo.sql
cd ~/fr1
# NOTE: last arg here may be in error
hgNibSeq -preMadeNib fr1 /gbdb/fr1/nib chrUn.nib
echo "select chrom,size from chromInfo" | hgsql -N fr1 > chrom.sizes

# create assembly and gap tracks
ssh hgwdev
hgGoldGapGl -noGl fr1 ~ fr1
# reload gap track so that it also includes gaps for N's in scaffolds
hgLoadGap fr1 /cluster/data/fr1

# make Map Contigs track for comparison to Assembly track
# cd /cluster/data/fr1/Un
# mkdir -p lift
# grep scaffold chrUn.lft > lift/ordered.lft
# hgCtgPos fr1 /cluster/data/fr1

# CREATING GRP TABLE FOR TRACK GROUPING (5/2/03 KRR)

# Copy all the data from the table "grp"
# in the existing database "rn1" to the new database
ssh hgwdev
echo "create table grp (PRIMARY KEY(NAME)) select * from rn1.grp" \
    | hgsql fr1

# CREATE REPEAT TRACKS (6/3/03 KRR)

ssh eieio
cd ~/fr1
# merge and lift up the repeatmasker output files to chrom coordinates
liftUp Un/chrUn.fa.out Un/chrUn.lft warn RMRun/Un/*.out
# load into the database (chrUn_rmsk table)
ssh hgwdev
hgLoadOut fr1 Un/chrUn.fa.out
# lift the simple repeat output to chrom coordinates
ssh eieio
cd ~/fr1
cd bed/simpleRepeat
liftUp simpleRepeat.bed ~/fr1/Un/chrUn.lft warn trf/*.bed
# load into the database (simpleRepeat table)
ssh hgwdev
cd ~/fr1/bed/simpleRepeat
hgLoadBed fr1 simpleRepeat simpleRepeat.bed \
    -sqlTable=$HOME/src/hg/lib/simpleRepeat.sql

# MAKE GCPERCENT (6/3/03 KRR)

ssh hgwdev
set fugudir = /cluster/store5/Fugu_Rubripes_V3
mkdir -p $fugudir/bed/gcPercent
cd $fugudir/bed/gcPercent
hgsql fr1 < ~/src/hg/lib/gcPercent.sql
# load gcPercent table
hgGcPercent fr1 ../../nib

# MAKE HGCENTRALTEST ENTRY AND TRACKDB TABLE FOR FUGU (5/6/03 KRR)

echo 'INSERT INTO defaultDb VALUES ("Fugu", "fr1");' \
    | hgsql -h genome-testdb hgcentraltest
# Warning: genome and organism fields must correspond
# with defaultDb values
# Note: for next assembly, set scientificName column to "Fugu rubripes"
echo 'INSERT INTO dbDb \
    (name, description, nibPath, organism, \
    defaultPos, active, orderKey, genome) values \
    ("fr1", "Aug. 2002", "/gbdb/fr1/nib", "Fugu", \
    "chrUn:827700-845800", 1, 10, "Fugu");' \
    | hgsql -h genome-testdb hgcentraltest

# Make trackDb table so browser knows what tracks to expect:
ssh hgwdev
cd ~/src/hg/makeDb/trackDb
cvs up -d -P
# Edit that makefile to add fr1 in all the right places and do
make update
# go public on genome-test
#make alpha
cvs commit makefile
# Add trackDb directories
mkdir fugu
mkdir fugu/fr1
cvs add fugu
cvs add fugu/fr1
cvs commit fugu

# MAKE RELATIONAL RNA TABLES

hgLoadRna new fr1

# MAKE HGCENTRALTEST BLATSERVERS ENTRY FOR FUGU (6/4/03 KRR)

ssh hgwdev
# Get appropriate hostname from cluster admins
echo 'insert into blatServers values("fr1", "blat11", "17783", "1"); \
    insert into blatServers values("fr1", "blat11", "17782", "0");' \
    | hgsql -h genome-testdb hgcentraltest

# MAKE AND STORE mRNA ALIGNMENTS (6/8/03 KRR)

# Load up the local disks of the small cluster with mrna.fa
# and masked super-scaffolds
# from /cluster/store5/mrna.134/org/Takifugu_rubripes
# note: there are 76 mrna sequences, and 24398 EST's
ssh kkr1u00
cd /iscratch/i/fugu
set mrnaDir = /cluster/store5/mrna.134/org/Takifugu_rubripes
mkdir mrna
cp -p $mrnaDir/mrna.fa mrna
mkdir trfFa
cp -p ~/fr1/superscaffolds.masked/*.fa trfFa
# note: this isn't currently working for me
# (permission errors) -- Hiram did it
~kent/bin/iSync

# generate alignment job
# TODO: try aligning whole mrna file against whole chrUn nib
# on file server (eieio), instead of using cluster.
# this would be simpler, and probably fast enough
cd ~/fr1/bed
mkdir mrna
cd mrna
mkdir psl
ls -1S /iscratch/i/fugu/trfFa/*.fa > genome.lst
ls -1S /iscratch/i/fugu/mrna/*.fa > mrna.lst
cat << '_EOF_' > gsub
#LOOP
/cluster/bin/i386/blat -mask=lower -ooc={check in exists /scratch/hg/h/11.ooc} {check in exists+ $(path1)} {check in line+ $(path2)} {check out line+ psl/$(root1)_$(root2).psl}
#ENDLOOP
'_EOF_'
# << happy emacs
gensub2 genome.lst mrna.lst gsub spec
para create spec
para try
para check
para push
# this just takes a few minutes

ssh eieio
cd ~/fr1/bed/mrna
pslSort dirs raw.psl /tmp psl
# psl with 578 files
# 578 files in 1 dirs
# Got 578 files 24 files per mid file
# Writing /tmp/tmp0.psl
# ...
# Writing /tmp/tmp24.psl
# writing raw.psl
# Cleaning up temp files
pslReps -minAli=0.98 -sizeMatters -nearTop=0.005 raw.psl scaffolds.psl \
    /dev/null
# Processing raw.psl to scaffolds.psl and /dev/null
# Processed 161 alignments
# lift up PSL files to chrom coordinates
liftUp -nohead all_mrna.psl ~/fr1/Un/chrUn.lft warn scaffolds.psl
# Got 40758 lifts in /cluster/home/kate/fr1/Un/chrUn.lft
# Lifting scaffolds.psl
pslSortAcc nohead chrom /tmp all_mrna.psl
# Processing all_mrna.psl
# Processed 89 lines into 1 temp files

# load mrna tables
ssh hgwdev
cd ~/fr1/bed/mrna/chrom
# rename psl file to required format (must be chr*_mrna)
mv chrUn.psl chrUn_mrna.psl
# load alignments
hgLoadPsl -noTNameIx fr1 chrUn_mrna.psl
cd ..
hgLoadPsl fr1 all_mrna.psl -nobin
# prepare for data load
set mrnaDir = mrna.134
mkdir /gbdb/fr1/mrna.134
#ln -s /cluster/store5/$mrnaDir/org/Takifugu_rubripes /gbdb/fr1/$mrnaDir
ln -s /cluster/store5/$mrnaDir/org/Takifugu_rubripes/mrna.fa /gbdb/fr1/$mrnaDir
# load mRna into database
# WARNING: had to use -ignore flag... check on this
# NOTE(review): no -ignore flag appears in the recorded command below --
# confirm which form was actually run.
hgLoadRna add -type=mRNA fr1 /gbdb/fr1/$mrnaDir/mrna.fa \
    /cluster/store5/$mrnaDir/org/Takifugu_rubripes/mrna.ra
# Adding data of type: mRNA
# ...

# MAKE AND STORE EST ALIGNMENTS (6/8/03 KRR)

# setup BlueArc with EST files
ssh kk
mkdir -p /cluster/bluearc/fugu/fr1/est
cd /cluster/bluearc/fugu/fr1/est
# split up est file into 100Kb chunks
# (makes ~120 query files * ~600 target superscaffolds = 70000 jobs)
faSplit about /cluster/store5/mrna.134/org/Takifugu_rubripes/est.fa \
    100000 e
# TODO: try aligning est files against whole chrUn nib
# this would be easier on the filesystem, and probably fast enough
cd ~/fr1/bed
mkdir est
cd est
mkdir psl
ls -1S /iscratch/i/fugu/trfFa/*.fa > genome.lst
ls -1S /cluster/bluearc/fugu/fr1/est/*.fa > est.lst
cat << '_EOF_' > gsub
#LOOP
/cluster/bin/i386/blat -mask=lower -ooc={check in exists /scratch/hg/h/11.ooc} {check in exists+ $(path1)} {check in line+ $(path2)} {check out line+ psl/$(root1)_$(root2).psl}
#ENDLOOP
'_EOF_'
# << happy emacs
gensub2 genome.lst est.lst gsub spec
para create spec
para try
para check
para push
para check...
# just a few minutes to complete

# process alignments
ssh eieio
cd ~/fr1/bed/est
pslSort dirs raw.psl /tmp psl
pslReps -minAli=0.98 -sizeMatters -nearTop=0.005 raw.psl scaffolds.psl \
    /dev/null
# lift up PSL files to chrom coordinates
liftUp -nohead all_est.psl ~/fr1/Un/chrUn.lft warn scaffolds.psl
# Got 40758 lifts in /cluster/home/kate/fr1/Un/chrUn.lft
# Lifting scaffolds.psl
pslSortAcc nohead chrom /tmp all_est.psl
# Processing all_est.psl
# Processed 20501 lines into 1 temp files

# load into database
ssh hgwdev
cd ~/fr1/bed/est/chrom
# rename psl file to required format (must be chr*_est)
mv chrUn.psl chrUn_est.psl
# load alignments
hgLoadPsl -noTNameIx fr1 chrUn_est.psl
cd ..
hgLoadPsl fr1 all_est.psl -nobin
set mrnaDir = mrna.134
ln -s /cluster/store5/$mrnaDir/org/Takifugu_rubripes/est.fa /gbdb/fr1/$mrnaDir
cd ~/fr1/bed/est
rm *.tab
hgLoadRna add -type=EST fr1 /gbdb/fr1/$mrnaDir/est.fa \
    /cluster/store5/$mrnaDir/org/Takifugu_rubripes/est.ra
unset mrnaDir
# note: there is no Genbank RefSeq directory for this organism

# PRODUCE CROSS_SPECIES MRNA ALIGNMENT (6/9/03 KRR)

# Aligns non-Fugu mRNA's against the masked genome
# This uses Genbank mRNA files already downloaded
# to /cluster/store5/genbank.134
# Files are unpacked to /cluster/store5/mrna.134

# use fileserver where genbank.134 resides
ssh eieio
set genbankdir = /cluster/store5/genbank.134
set mrnadir = /cluster/store5/mrna.134
# FIXED: was "cd $mrnaDir" -- csh variables are case-sensitive and
# $mrnaDir was unset above; the lower-case $mrnadir just set is intended
cd $mrnadir
# generate filter file for unpacking entries
# note: skip this if fuguXenoRna.fil exists
cat << '_EOF_' > fuguXenoRna.fil
restrict mol=mRNA & !(org="Takifugu rubripes")
hide mol
'_EOF_'
# << happy emacs
# unpack non-Fugu mRNAs
gunzip -c $genbankdir/gb{pri,rod,v,mam,inv,bct,htc,pat,phg,pln}* \
    | gbToFaRa fuguXenoRna.fil \
        fuguXenoRna.fa fuguXenoRna.ra fuguXenoRna.ta stdin

# copy masked chromosome nib to bluearc for cluster run
# NOTE: probably want to do this earlier
# so it is available for all alignments
ssh kk
cd /cluster/bluearc/fugu/fr1
mkdir chromNib
cp ~/fr1/nib/* chromNib
# split masked scaffold file into 50Mb chunks
# to produce 7 files
mkdir trfFa
faSplit about ~/fr1/fugu_v3.masked.fa 50000000 trfFa/s
# split big xeno mrna file into 700Kb chunks on bluearc for cluster run
# (makes ~1000 query files * 7 target files = ~7000 jobs)
# This is not too many to handle in a single directory (psl)
mkdir xenoRnaSplit
faSplit about $mrnadir/fuguXenoRna.fa 700000 xenoRnaSplit/m
cd ~/fr1/bed
mkdir xenoMrna
cd xenoMrna
mkdir psl
ls -1S /cluster/bluearc/fugu/fr1/trfFa/s*.fa > genome.lst
ls -1S /cluster/bluearc/fugu/fr1/xenoRnaSplit/m*.fa > mrna.lst
cat << '_EOF_' > gsub
#LOOP
/cluster/bin/i386/blat -q=rnax -t=dnax -mask=lower {check in exists+ $(path1)} {check in line+ $(path2)} {check out line+ psl/$(root1)_$(root2).psl}
#ENDLOOP
'_EOF_'
# << happy emacs
gensub2 genome.lst mrna.lst gsub spec
para create spec
para try
para check
para push
para check...
# Completed: 6629 of 6629 jobs
# Avg. job: 2.7 min
# Longest job: 4.7 min
# Submission to last job: 49 min

# process alignments
ssh eieio
cd ~/fr1/bed/xenoMrna
# sort alignments
pslSort dirs raw.psl /tmp psl
# Got 6629 files 81 files per mid file
# analyse repeats and generate genome-wide best alignment
# note different parameters from previous alignments
pslReps -minAli=0.25 raw.psl scaffolds.psl /dev/null
# ...........Processed 1675350 alignments
# lift up PSL files to chrom coordinates
liftUp -nohead chrom.psl ~/fr1/Un/chrUn.lft warn scaffolds.psl
# Got 40758 lifts in /cluster/home/kate/fr1/Un/chrUn.lft
# extract psl file for each accession, into a dir
pslSortAcc nohead chrom /tmp chrom.psl
# merge psl files
pslCat -dir chrom > xenoMrna.psl
# load alignments into database
ssh hgwdev
cd ~/fr1/bed/xenoMrna
rm *.tab
hgLoadPsl fr1 xenoMrna.psl
# load xeno rna into database
set mrnadir = mrna.134
set datadir = /cluster/store5
# FIXED: link destination was "/gbdb/fr1/$mrnaDir" (unset variable);
# use the lower-case $mrnadir set above
ln -s $datadir/$mrnadir/fuguXenoRna.fa /gbdb/fr1/$mrnadir
hgLoadRna add -type=xenoRna fr1 /gbdb/fr1/$mrnadir/fuguXenoRna.fa \
    $datadir/$mrnadir/fuguXenoRna.ra

# HUMAN HG16 BLAT ALIGNMENT (DONE 2003-09-09 kate)

ssh kk
mkdir /cluster/data/fr1/bed/blatHg16
cd /cluster/data/fr1/bed/blatHg16
# create output dir and subdirs (prevent directory overload)
mkdir psl
foreach f (/iscratch/i/fugu/trfFa/*.fa)
    set c=$f:t:r
    echo $c
    mkdir -p psl/$c
end
mkdir run
cd run
ls -1S /iscratch/i/fugu/trfFa/*.fa > fugu.lst
# 578 files
ls -1S /scratch/hg/gs.17/build34/trfFa/*.fa > human.lst
# 491 files
cat << 'EOF' > gsub
#LOOP
/cluster/bin/i386/blat -mask=lower -qMask=lower -q=dnax -t=dnax {check in line+ $(path1)} {check in line+ $(path2)} {check out line+ /cluster/data/fr1/bed/blatHg16/psl/$(root1)/$(root1)_$(root2).psl}
#ENDLOOP
'EOF'
gensub2 fugu.lst human.lst gsub spec
# 283798 lines
para create spec
para try
para check
para push
para check

# merge to single Fugu chrUn chrom
ssh eieio
# FIXED: was "cd /cluster/data/fr1/bed/blastHg16" -- the directory
# created above is blatHg16
cd /cluster/data/fr1/bed/blatHg16
pslCat -dir psl/ss_* | \
    liftUp -type=.psl stdout \
        /cluster/data/fr1/Un/chrUn.lft warn stdin | \
    pslSortAcc nohead chrom temp stdin
# 15 minutes ?
# load into database
# FIXED: was "cd $fr1dir/chrom" ($fr1dir is never set in this section)
# followed by loading "chrUn_blatHg15.psl" -- wrong db name, and the
# file produced by pslSortAcc is chrom/chrUn.psl.  Rename and load
# following the blatTetra recipe below.
ssh hgwdev
cd /cluster/data/fr1/bed/blatHg16/chrom
mv chrUn.psl chrUn_blatHg16.psl
hgLoadPsl -noTNameIx fr1 chrUn_blatHg16.psl
# clean up the per-scaffold psl output
cd /cluster/data/fr1/bed/blatHg16
rm -fr psl

# TETRAODON BLAT ALIGNMENT (DONE 2003-09-29 kate)

# Blat using 2002 Tetraodon nigroviridis (V6) from genoscope
# repeat and trf-masked contigs
ssh kk
mkdir /cluster/data/fr1/bed/blatTetra
cd /cluster/data/fr1/bed/blatTetra
# create output dir and subdirs (prevent directory overload)
mkdir psl
foreach f (/iscratch/i/fugu/trfFa/*.fa)
    set c=$f:t:r
    echo $c
    mkdir -p psl/$c
end
mkdir run
cd run
ls -1S /iscratch/i/fugu/trfFa/*.fa > fugu.lst
# 578 files
ls -1S /iscratch/i/tetra/*.fa > tetra.lst
# 632 files
cat << 'EOF' > gsub
#LOOP
/cluster/bin/i386/blat -mask=lower -qMask=lower -q=dnax -t=dnax {check in line+ $(path1)} {check in line+ $(path2)} {check out line+ /cluster/data/fr1/bed/blatTetra/psl/$(root1)/$(root1)_$(root2).psl}
#ENDLOOP
'EOF'
gensub2 fugu.lst tetra.lst gsub spec
wc -l spec
# 365296 spec
para create spec
para try
para check
para push
para check

# merge to single Fugu chrUn chrom
ssh eieio
cd /cluster/data/fr1/bed/blatTetra
pslCat -dir psl/ss_* | \
    liftUp -type=.psl stdout \
        /cluster/data/fr1/Un/chrUn.lft warn stdin | \
    pslSortAcc nohead chrom temp stdin
# 15 minutes ?
rm -fr psl
# load into database
ssh hgwdev
cd /cluster/data/fr1/bed/blatTetra/chrom
mv chrUn.psl chrUn_blatTetra.psl
hgLoadPsl -noTNameIx fr1 chrUn_blatTetra.psl
# Add entry to trackDb.ra

# make symlinks in /gbdb and load tetra sequence data
# NOTE(review): the "cat *" below assumes cwd holds the per-contig
# tetra fasta files (presumably /cluster/data/tetra) -- verify.
cat * > ../tetra.masked.fa
cd /gbdb/fr1
ln -s /cluster/data/tetra/tetra.masked.fa
# load tetra sequences into database
ssh hgwdev
cd /cluster/data/fr1/bed/blatTetra
hgLoadSeq fr1 /gbdb/fr1/tetra.masked.fa
# Warning: load of seq did not go as planned: 108177 record(s), 1147 row(s) skipped, 13534 warning(s) loading .

# HUMAN HG16 BLAT SWAPPED FROM HUMAN BROWSER

ssh eieio
set hg16dir = /cluster/data/hg16/bed/blatFr1
set fr1dir = /cluster/data/fr1/bed/blatHg16.swp
mkdir -p $fr1dir/{psl,chrom}
# swap alignments (for each human chrom)
cd $hg16dir/chrom
foreach f (chr?{,?}{,_random}_blatFr1.psl)
    pslSwap $f $fr1dir/psl/$f
end
# merge to single Fugu chrUn chrom
cd $fr1dir/psl
pslCat chr?{,?}{,_random}_blatFr1.psl > \
    $fr1dir/chrom/scaffolds.psl
# lift target side (fugu) to chrom coordinates
cd $fr1dir/chrom
liftUp chrUn_blatHg16.psl /cluster/data/fr1/Un/chrUn.lft \
    warn scaffolds.psl
# Got 40758 lifts
# load into database
ssh hgwdev
cd /cluster/data/fr1/bed/blatHg16.swp/chrom
hgLoadPsl -noTNameIx fr1 chrUn_blatHg16.psl
# Add entry to trackDb.ra

# make hg16 symlink in /gbdb and load human sequence data
set gbdbdir=/gbdb/fr1/hg16
mkdir $gbdbdir
cd $gbdbdir
foreach f (/cluster/data/hg16/?{,?})
    set c = $f:t
    if (-e $f/chr${c}.fa) then
        ln -s $f/chr${c}.fa
    endif
    if (-e $f/chr${c}_random.fa) then
        ln -s $f/chr${c}_random.fa
    endif
end
# load human sequences into database
ssh hgwdev
cd /cluster/data/fr1/bed/blatHg16.swp
hgLoadSeq fr1 /gbdb/fr1/hg16/*.fa

# PRODUCE TETRAODON BLASTZ (6/13/03 IN PROGRESS KRR)

# NOTE: the tetra sequence wasn't repeatmasked -> must be redone
# FIXED: was "cd eieio" -- every other section logs into the
# fileserver with ssh; there is no "eieio" directory
ssh eieio
mkdir ~/fr1/bed/blastzTetra
# translate the Genoscope fasta files to upper-case
# so sequence doesn't look like it's masked out (all-lower)
cd /iscratch/i/fish
foreach f (*.fa)
    echo $f
    tr '[a-z]' '[A-Z]' < $f > /cluster/bluearc/tetra/$f
end
(cat README; echo "NOTE: Upper-cased files from /iscratch/i/fish") > \
    /cluster/bluearc/tetra/README
# copy fugu files to cluster filesystem
set clusterdir = /cluster/bluearc
mkdir -p $clusterdir/fugu/fr1/rmsk
cd $clusterdir/fugu/fr1/rmsk
rm *.out
cp ~/fr1/Un/chrUn.fa.out .

# generate DEF file
cd ~/fr1/bed/blastzTetra
cat << '_EOF_' > DEF
# Tetraodon vs. Fugu
# TODO: angie's home dir out of this
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/home/angie/schwartzbin:/cluster/bin/i386
ALIGN=blastz-run
BLASTZ=blastz
BLASTZ_H=2000
#BLASTZ_ABRIDGE_REPEATS=1 if SMSK is specified
BLASTZ_ABRIDGE_REPEATS=0
# TARGET - Fugu
# soft-masked chrom nib
SEQ1_DIR=/cluster/bluearc/fugu/fr1/chromNib
# repeat masker output file for chrom
SEQ1_RMSK=/cluster/bluearc/fugu/fr1/rmsk
# lineage-specific repeats -- (repeats in fugu, not in tetraodon)
# we don't have that information for these species
SEQ1_SMSK=/dev/null
# currently unused
SEQ1_FLAG=-fish
SEQ1_IN_CONTIGS=0
# 10 MB chunk for target
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY - Tetraodon
SEQ2_DIR=/cluster/bluearc/tetra
# no repeatmasker output for these sequences
SEQ2_RMSK=/dev/null
SEQ2_SMSK=/dev/null
# currently unused
SEQ2_FLAG=-fish
# set to 1 if multiple contigs are in 1 fasta file
SEQ2_IN_CONTIGS=1
# 10 Mbase for query
SEQ2_CHUNK=10000000
SEQ2_LAP=0
BASE=/cluster/store5/Fugu_Rubripes_V3/bed/blastzTetra
DEF=$BASE/DEF
RAW=$BASE/raw
CDBDIR=$BASE
SEQ1_LEN=$BASE/S1.len
SEQ2_LEN=$BASE/S2.len
#DEBUG=1
'_EOF_'
# << happy emacs
# save the DEF file in the current standard place
chmod +x DEF
cp DEF ~angie/hummus/DEF.fr1-tetra.2003-06-17

# setup cluster run
ssh kk
cd ~/fr1/bed/blastzTetra
# source the DEF file
bash
. ./DEF
# follow the next set of directions slavishly
mkdir -p $BASE/run
# give up on avoiding angie's directories
# tcl script
# creates xdir.sh and joblist run/j
~angie/hummus/make-joblist $DEF > $BASE/run/j
# Computing genome sizes
# Writing mkdir script: /cluster/store5/Fugu_Rubripes_V3/bed/blastzTetra/xdir.sh
# Writing job list
# xdir.sh makes a bunch of result directories in $BASE/raw/
# based on chrom name and CHUNK size
sh $BASE/xdir.sh
cd $BASE/run
# now edit j to prefix path to executable name
# NOTE: we should have a controlled version of schwartz bin executables
sed -e 's#^#/cluster/home/schwartz/bin/#' j > j2
wc -l j*
head j2
# *** make sure the j2 edits are OK, then use it:
mv j2 j
# para create will create the file: 'batch' for the cluster run
para create j
# Checking input files
# 1400 jobs written to batch
para try
para check
para push
# ... etc ...
# about 3 hrs. per job

# post-process blastz
# normalize
ssh kk
cd ~/fr1/bed/blastzTetra
# source the DEF file again in case you are coming back to this
bash
. ./DEF
# a new run directory
mkdir -p $BASE/run.1
# another obscure script creates a new job list:
~angie/hummus/do.out2lav $DEF >$BASE/run.1/j
cd $BASE/run.1
# add path to executable in the job list:
sed -e 's/^/\/cluster\/home\/schwartz\/bin\//' j > j2
wc -l j*
head j2
# make sure the edited j2 is OK, then use it:
mv j2 j
para create j
para try
para check
para push
# etc.
# 15 minute jobs

# Translate the .lav files
# into axt files
ssh kkstore
set base="/cluster/store5/Fugu_Rubripes_V3/bed/blastzTetra"
set tbl="blastzTetra"
# generate single tetra fa file required for lavToAxt
cd /cluster/bluearc
cat tetra/* > tetra-all.fa
# copy lav files to bluearc to speed up job
ssh eieio
mkdir -p /cluster/bluearc/fugu/fr1/blastzTetra/lav
cp ~/fr1/bed/blastzTetra/lav/chrUn/* /cluster/bluearc/fugu/fr1/blastzTetra/lav
# cluster job for converting lav files
# these take a long time because query is in fa files
# (with 880K sequences), not a chrom nib
ssh kk
cd ~/fr1/bed/blastzTetra
mkdir -p axtLav
mkdir run.lavToAxt
cd run.lavToAxt
# create file lists
ls -1S /cluster/bluearc/fugu/fr1/blastzTetra/lav/*.lav > lav.lst
echo "" > dummy.lst
# Create template file, gsub, for gensub2.
cat << 'EOF' > gsub
#LOOP
/cluster/bin/i386/lavToAxt {check in line+ $(path1)} /cluster/bluearc/fugu/fr1/chromNib -fa /cluster/bluearc/tetra-all.fa {check out line+ /cluster/store5/Fugu_Rubripes_V3/bed/blastzTetra/axtLav/$(root1).axt}
#ENDLOOP
'EOF'
gensub2 lav.lst dummy.lst gsub jobList
para create jobList
para try
para check
para push
# 35 hours (too long -- need to split up better).
cd $base
mkdir -p axtChrom
cat axtLav/*.axt | /cluster/bin/i386/axtSort stdin axtChrom/chrUn.axt
#foreach c (lav/*)
#pushd $c
#set chr=$c:t
#set out=$base/axtChrom/$chr.axt
#echo "out=$out"
#echo "Translating $chr lav to $out"
#echo cat `ls -1 *.lav | sort -g` \
#| /cluster/bin/i386/lavToAxt stdin $fugudir/nib \
#-fa /cluster/bluearc/tetra-all.fa stdout \
#| /cluster/bin/i386/axtSort stdin $out
#popd
#end

# Translate the sorted axt files into psl:
cd $base
mkdir -p pslChrom
foreach f (axtChrom/chr*.axt)
    set c=$f:t:r
    /cluster/bin/i386/axtToPsl $f S1.len S2.len pslChrom/${c}_${tbl}.psl
end
# load these blastz results
ssh hgwdev
cd ~/fr1/bed/blastzTetra/pslChrom
/cluster/bin/i386/hgLoadPsl -noTNameIx fr1 chr*_*.psl
# STOPPED HERE -- due to empty output data

# PRODUCING GENSCAN PREDICTIONS (6/17/03 KRR)

# Note: Genscan input files should be hard-masked
ssh eieio
# hard mask the super-scaffolds
mkdir -p ~/fr1/superscaffolds.hardmasked
cd ~/fr1/superscaffolds
foreach f (ss_*.fa)
    # FIXED: was "echo $" (dropped variable name)
    echo $f
    maskOutFa $f hard ../superscaffolds.hardmasked/$f
end
# setup the genscan directory
mkdir -p ~/fr1/bed/genscan
cd ~/fr1/bed/genscan
# Make 3 subdirectories for genscan to put output files in
mkdir -p gtf pep subopt
# Generate a list file, genome.list, of all the contigs
# *that do not have pure Ns* (due to heterochromatin, unsequenceable
# stuff) which would cause genscan to run forever.
rm -f genome.list
touch genome.list
foreach f ( `ls -1S /cluster/store5/Fugu_Rubripes_V3/superscaffolds.hardmasked/ss_*.fa` )
    egrep '[ACGT]' $f > /dev/null
    if ($status == 0) echo $f >> genome.list
end
# Log into kkr1u00 (not kk!).  kkr1u00 is the driver node for the small
# cluster (kkr2u00 -kkr8u00.  Genscan has problem running on the
# big cluster, due to limitation of memory and swap space on each
# processing node).
ssh kkr1u00
cd ~/fr1/bed/genscan
# Create template file, gsub, for gensub2.
# NOTE: According to the README for Genscan,
# the HumanIso.smat parameter files is used for all vertebrates
cat << '_EOF_' > gsub
#LOOP
/cluster/home/kent/bin/i386/gsBig {check in line+ $(path1)} {check out line gtf/$(root1).gtf} -trans={check out line pep/$(root1).pep} -subopt={check out line subopt/$(root1).bed} -exe=/cluster/home/fanhsu/projects/compbio/bin/genscan-linux/genscan -par=/cluster/home/fanhsu/projects/compbio/bin/genscan-linux/HumanIso.smat -tmp=/tmp -window=2400000
#ENDLOOP
'_EOF_'
# << happy emacs
echo "" > dummy.list
gensub2 genome.list dummy.list gsub jobList
para create jobList
para try
para check
para push

# Convert output to chrom coordinates
ssh eieio
cd ~/fr1/bed/genscan
liftUp genscan.gtf ~/fr1/Un/chrUn.lft warn gtf/*.gtf
liftUp genscanSubopt.bed ~/fr1/Un/chrUn.lft warn subopt/*.bed > /dev/null
cat pep/*.pep > genscan.pep

# Load into the database
ssh hgwdev
cd ~/fr1/bed/genscan
ldHgGene fr1 genscan genscan.gtf
# Reading genscan.gtf
# Read 36106 transcripts in 243984 lines in 1 files
# 36106 groups 1 seqs 1 sources 1 feature types
# 36106 gene predictions
hgPepPred fr1 generic genscanPep genscan.pep
hgLoadBed fr1 genscanSubopt genscanSubopt.bed > /dev/null

# CPGISLANDS (6/17/03 KRR)

ssh eieio
mkdir -p /cluster/data/fr1/bed/cpgIsland
cd /cluster/data/fr1/bed/cpgIsland
# Build software emailed from Asif Chinwalla (achinwal@watson.wustl.edu)
# NOTE: we should keep this centrally
cp -rp /cluster/data/mm3/bed/cpgIsland/cpg_dist .
cd cpg_dist
# gcc readseq.c cpg_lh.c -o cpglh.exe
# hard-mask the chrom fa
# cpglh.exe requires hard-masked (N) .fa's.
# There may be warnings about "bad character" for IUPAC ambiguous
# characters like R, S, etc.  Ignore the warnings.
cd ~/fr1/Un
maskOutFa chrUn.fa hard chrUn.hardmasked.fa
# run cpgislands on the hard-masked chrom fa
cd ~/fr1/bed/cpgIsland
./cpglh.exe ~/fr1/Un/chrUn.hardmasked.fa > chrUn.cpg
cp ~kent/mm3/bed/cpgIsland/filter.awk .
awk -f filter.awk chr*.cpg > cpgIsland.bed
# load annotations into database
ssh hgwdev
cd ~/fr1/bed/cpgIsland
# NOTE: this should go into a shared lib dir
cp $HOME/kent/src/hg/lib/cpgIsland.sql .
hgLoadBed fr1 cpgIsland -tab -noBin \
    -sqlTable=cpgIsland.sql cpgIsland.bed
# Reading cpgIsland.bed
# Loaded 41723 elements
# Sorted
# Saving bed.tab
# Loading fr1

# LOAD ENSEMBL GENE PREDICTIONS (DONE 2003-08-11 kate)

# The downloads were provided by Shawn Hoon (shawnh@fugu-sg.org)
# of the Singapore Institute of Molecular and Cell Biology (IMCB)
# Fugu genome project, part of the international Fugu Genome Consortium
# Other contact is: Venkatesh Byrappa (mcbbv@imcb.a-star.edu.sg)
# In future releases, these annotations will need to be downloaded from Ensembl,
# using Ensmart
ssh hgwdev
cd /cluster/data/fr1
mkdir -p bed/ensembl
cd bed/ensembl
# download and load gene predictions
wget http://www.fugu-sg.org/~shawnh/fugu_download/fugu_rubripes_15_2_gtf.tar.gz
tar xvfz fugu_rubripes_15_2_gtf.tar.gz
cd fugu_rubripes_core_15_2
foreach f (*.gtf)
    echo $f
    sed 's/^Chr_scaffold/scaffold/' $f >> ../ensGene.scaffold.gtf
end
cd ..
/cluster/bin/i386/liftUp ensGene.gtf /cluster/data/fr1/Un/chrUn.lft warn ensGene.scaffold.gtf
/cluster/bin/i386/ldHgGene fr1 ensGene ensGene.gtf
# 38510 gene predictions
# download and load predicted peptides
#wget http://www.fugu-sg.org/~shawnh/fugu_download/Fugu_rubripes.FUGU2.pep.fa.gz
#gunzip Fugu_rubripes.FUGU2.pep.fa.gz
sed 's/Translation:SINFRUP/SINFRUT/g' < Fugu_rubripes.FUGU2.pep.fa > ensembl.pep
hgPepPred fr1 generic ensPep ensembl.pep
#

# MAKE DOWNLOADABLE SEQUENCE FILES (IN PROGRESS 2003-08-29 kate)

ssh eieio
cd /cluster/data/fr1
# Build the .zip files
mkdir jkStuff
cp /cluster/data/hg15/jkStuff/zipAll.sh jkStuff
# Edit jkStuff/zipAll.sh to point to the correct paths.
./jkStuff/zipAll.sh >&! zipAll.log
tail -f zipAll.log
# Look at zipAll.log to make sure all file lists look reasonable.
# Check zip file integrity:
mkdir zip
mv *.zip* zip
cd zip
foreach f (*.zip)
    unzip -t $f > $f.test
    tail -1 $f.test
end
wc -l *.zip.test
#   3 chromAgp.zip.test
#   3 chromFa.zip.test
#   3 chromFaMasked.zip.test
#   3 chromOut.zip.test
#   3 chromTrf.zip.test
# 580 contigFa.zip.test
# 580 contigFaMasked.zip.test
# 580 contigTrf.zip.test
#   3 liftAll.zip.test
#   3 mrna.zip.test

# Copy the .zip files to hgwdev:/usr/local/apache/...
ssh hgwdev
cd /cluster/data/fr1/zip
set webDir = /usr/local/apache/htdocs/goldenPath/fr1
mkdir -p $webDir/bigZips
cp -p *.zip $webDir/bigZips
# Create rmsk database table dump for early users (Victor Solovyev)
cd $webDir
chmod o+w .
mysqldump --user=SECRET --password=SECRET --all --tab=. \
    fr1 chrUn_rmsk
# change ownership (was mysql)
mkdir database
cp chrUn_rmsk.* database
rm chrUn_rmsk.*
# restore dir permission
chmod o-w .
unset webDir
# No RefSeqs for Fugu, so no upstream files.
# create checksum file
md5sum *.zip > md5sum.txt
# Take a look at bigZips/* update the README.txt

# move fr1 to incremental genbank/refseq updates:
# need to create a lift file to splice chrUn into fragments larger
# than the scaffolds or there will be far too many jobs
cd /cluster/data/fr1/Un
./makeFragLift chrUn.lft >chrUn-frag.lft
cd /cluster/data/genbank/
# edit etc/genbank.conf to add fr1
# align mrnas, then ests
nice ./bin/gbAlignStep -initial -type=mrna -clusterRootDir=/cluster/bluearc/genbank -iserver=no fr1
nice ./bin/gbAlignStep -initial -type=est -clusterRootDir=/cluster/bluearc/genbank -iserver=no fr1

# TBA PROJECTION - NON-MAMMAL SPECIES INCLUDED (Fugu & Chicken) 2003-10-20 kate

# Note: CFTR region in fugu is spread across 4 non-consecutive scaffolds
# These are hg16/rn3/mm3 plus NISC sequences.  See hg16 doc for details.
ssh hgwdev mkdir -p /cluster/data/fr1/bed/nisc/cftr set table = tbaFishBirdCFTR mkdir -p /gbdb/fr1/$table cd /gbdb/fr1/$table foreach i (1 2 3 4) ln -s /cluster/data/nisc/targets/cftr/CFTR.non-mammal/fugu$i.out \ /cluster/data/fr1/bed/nisc/cftr/tbaFishBird.$i.maf ln -s /cluster/data/fr1/bed/nisc/cftr/tbaFishBird.$i.maf \ /gbdb/fr1/${table}/tba.$i.maf end cd /cluster/data/fr1/bed/nisc/cftr /cluster/bin/i386/hgLoadMaf -WARN fr1 $table # 60 warnings # ECORES FROM GENOSCOPE [in progress, hartera, 2004-03-29] # download data from http://www.genoscope.cns.fr/externe/tetraodon/Data3/ecores # ecotigFH - ecores on Fugu, genome conserved with Human, hg16 # ecotigFM - ecores on Fugu, genome conserved with Mouse, mm3 # ecotigFR - ecores on Fugu, genome conserved with Rat, rn3 ssh hgwdev mkdir /cluster/data/fr1/bed/ecores/ # add parse_ecotig.pl to this directory # MOUSE mkdir /cluster/data/fr1/bed/ecores/mm3 cd /cluster/data/fr1/bed/ecores/mm3/ # download data for ecotigFM to this directory # parse ecotig files to produce a bed format file perl ../parse_ecotig.pl < ecotigFM > ecotigFM.bed # remove "CONTIG:" and change from upper to lower case for "SCAFFOLD" perl -pi.bak -e 's/CONTIG://g; s/SCAFFOLD/scaffold/g;' ecotigFM.bed # use liftUp to convert scaffold co-ordinates to chrUn co-ordinates liftUp -type=.bed ecotigFMlift.bed /cluster/data/fr1/Un/chrUn.lft warn ecotigFM.bed # only converts the chromStart and chromEnd fields, use fixLift.pl to # convert rest of the bed file perl ../fixLift.pl < ecotigFMlift.bed > ecotigFMliftFixed.bed hgLoadBed -tab fr1 ecoresMm3 ecotigFMliftFixed.bed # clean up rm *.bak # RAT mkdir /cluster/data/fr1/bed/ecores/rn3 cd /cluster/data/fr1/bed/ecores/rn3/ # download data for ecotigFR to this directory # parse ecotig files to produce a bed format file perl ../parse_ecotig.pl < ecotigFR > ecotigFR.bed # remove "CONTIG:" and change from upper to lower case for "SCAFFOLD" perl -pi.bak -e 's/CONTIG://g; s/SCAFFOLD/scaffold/g;' ecotigFR.bed # use liftUp to 
convert scaffold co-ordinates to chrUn co-ordinates liftUp -type=.bed ecotigFRlift.bed /cluster/data/fr1/Un/chrUn.lft warn ecotigFR.bed # only converts the chromStart and chromEnd fields, use fixLift.pl to # convert rest of the bed file perl ../fixLift.pl < ecotigFRlift.bed > ecotigFRliftFixed.bed hgLoadBed -tab fr1 ecoresRn3 ecotigFRliftFixed.bed # clean up rm *.bak # HUMAN mkdir /cluster/data/fr1/bed/ecores/hg16 cd /cluster/data/fr1/bed/ecores/hg16/ # download data for ecotigFH to this directory # parse ecotig files to produce a bed format file perl ../parse_ecotig.pl < ecotigFH > ecotigFH.bed # remove "CONTIG:" and change from upper to lower case for "SCAFFOLD" perl -pi.bak -e 's/CONTIG://g; s/SCAFFOLD/scaffold/g;' ecotigFH.bed # use liftUp to convert scaffold co-ordinates to chrUn co-ordinates liftUp -type=.bed ecotigFHlift.bed /cluster/data/fr1/Un/chrUn.lft warn ecotigFH.bed # only converts the chromStart and chromEnd fields, use fixLift.pl to # convert rest of the bed file perl ../fixLift.pl < ecotigFHlift.bed > ecotigFHliftFixed.bed hgLoadBed -tab fr1 ecoresHg16 ecotigFHliftFixed.bed # clean up rm *.bak # add entries in kent/src/hg/makeDb/trackDb/fugu/trackDb.ra # add html for details pages to this directory: # ecoresMm3.html, ecoresRn3.html and ecoresHg16.html # SWAP ZEBRAFISH-FUGU BLASTZ TO FUGU-ZEBRAFISH (danRer1) # (DONE, 2004-06-16, hartera) ssh kolossus mkdir /cluster/data/fr1/bed/blastz.danRer1.swap.2004-06-16 cd /cluster/data/fr1/bed/blastz.danRer1.swap.2004-06-16 set aliDir = /cluster/data/danRer1/bed/blastz.fr1 cp $aliDir/S1.len S2.len cp $aliDir/S2.len S1.len mkdir unsorted axtChrom cat $aliDir/axtChrom/chr*.axt \ | axtSwap stdin $aliDir/S1.len $aliDir/S2.len stdout \ | axtSplitByTarget stdin unsorted # Sort the shuffled .axt files. 
foreach f (unsorted/*.axt) echo sorting $f:t:r axtSort $f axtChrom/$f:t end du -sh $aliDir/axtChrom unsorted axtChrom # 1.6G /cluster/data/danRer1/bed/blastz.fr1/axtChrom # 1.6G unsorted # 1.6G axtChrom rm -r unsorted # translate sorted axt files into psl ssh kolossus cd /cluster/data/fr1/bed/blastz.danRer1.swap.2004-06-16 mkdir -p pslChrom set tbl = "blastzDanRer1" foreach f (axtChrom/chr*.axt) set c=$f:t:r echo "Processing chr $c" /cluster/bin/i386/axtToPsl $f S1.len S2.len pslChrom/${c}_${tbl}.psl end # Load database table ssh hgwdev cd /cluster/data/fr1/bed/blastz.danRer1.swap.2004-06-16/pslChrom foreach f (./*.psl) /cluster/bin/i386/hgLoadPsl fr1 $f echo "$f Done" end # MAKE Human Proteins track ssh eieio mkdir -p /cluster/data/fr1/blastDb cd /cluster/data/fr1/blastDb for i in ../superscaffolds/*.fa; do ln -s $i .; done for i in *.fa; do formatdb -i $i -p F 2> /dev/null; done rm *.fa *.log ssh kkr1u00 mkdir -p /iscratch/i/fr1/blastDb cp /cluster/data/fr1/blastDb/* /iscratch/i/fr1/blastDb (~kent/bin/iSync) 2>&1 > sync.out mkdir -p /cluster/data/fr1/bed/tblastn.hg16KG cd /cluster/data/fr1/bed/tblastn.hg16KG ls -1S /iscratch/i/fr1/blastDb/*.nsq | sed "s/\.nsq//" > fugu.lst exit # back to kksilo cd /cluster/data/fr1/bed/tblastn.hg16KG mkdir kgfa # calculate a reasonable number of jobs calc `wc /cluster/data/hg16/bed/blat.hg16KG/hg16KG.psl | awk "{print \\\$1}"`/\(150000/`wc fugu.lst | awk "{print \\\$1}"`\) # 41117/(150000/578) = 158.437507 split -l 158 /cluster/data/hg16/bed/blat.hg16KG/hg16KG.psl kgfa/kg cd kgfa for i in *; do pslxToFa $i $i.fa; rm $i; done cd .. 
ls -1S kgfa/*.fa > kg.lst mkdir blastOut for i in `cat kg.lst`; do mkdir blastOut/`basename $i .fa`; done cat << '_EOF_' > blastGsub #LOOP blastSome $(path1) {check in line $(path2)} {check out exists blastOut/$(root2)/q.$(root1).psl } #ENDLOOP '_EOF_' # << happy emacs cat << '_EOF_' > blastSome #!/bin/sh BLASTMAT=/iscratch/i/blast/data export BLASTMAT f=/tmp/`basename $3` for eVal in 0.01 0.001 0.0001 0.00001 0.000001 1E-09 1E-11 do if /scratch/blast/blastall -M BLOSUM80 -m 0 -F no -e $eVal -p tblastn -d $1 -i $2 -o $f.8 then mv $f.8 $f.1 break; fi done if test -f $f.1 then if /cluster/bin/i386/blastToPsl $f.1 $f.2 then liftUp -nosort -type=".psl" -pslQ -nohead $3.tmp /cluster/data/hg16/bed/blat.hg16KG/protein.lft warn $f.2 mv $3.tmp $3 rm -f $f.1 $f.2 exit 0 fi fi rm -f $f.1 $f.2 $3.tmp $f.8 exit 1 '_EOF_' # << happy emacs chmod +x blastSome gensub2 fugu.lst kg.lst blastGsub blastSpec ssh kk cd /cluster/data/fr1/bed/tblastn.hg16KG para create blastSpec para shove # jobs will need to be restarted, but they should all finish # Completed: 150858 of 150858 jobs # CPU time in finished jobs: 10404860s 173414.33m 2890.24h 120.43d 0.330 y # IO & Wait Time: 2468643s 41144.06m 685.73h 28.57d 0.078 y # Average job time: 85s 1.42m 0.02h 0.00d # Longest job: 411s 6.85m 0.11h 0.00d # Submission to last job: 26364s 439.40m 7.32h 0.31d cat << '_EOF_' > chainGsub #LOOP chainSome $(path1) #ENDLOOP '_EOF_' # << happy emacs cat << '_EOF_' > chainSome (cd $1; cat q.*.psl | simpleChain -prot -outPsl -maxGap=10000 stdin ../c.`basename $1`.psl) '_EOF_' # << happy emacs chmod +x chainSome ls -1dS `pwd`/blastOut/kg?? 
> chain.lst gensub2 chain.lst single chainGsub chainSpec ssh kki cd /cluster/data/fr1/bed/tblastn.hg16KG para create chainSpec para push # Completed: 261 of 261 jobs # CPU time in finished jobs: 828s 13.81m 0.23h 0.01d 0.000 y # IO & Wait Time: 19521s 325.34m 5.42h 0.23d 0.001 y # Average job time: 78s 1.30m 0.02h 0.00d # Longest job: 270s 4.50m 0.07h 0.00d # Submission to last job: 3878s 64.63m 1.08h 0.04d exit # back to kksilo cd /cluster/data/fr1/bed/tblastn.hg16KG/blastOut for i in kg?? do awk "(\$13 - \$12)/\$11 > 0.6 {print}" c.$i.psl > c60.$i.psl sort -rn c60.$i.psl | pslUniq stdin u.$i.psl awk "((\$1 / \$11) ) > 0.60 { print }" c60.$i.psl > m60.$i.psl echo $i done cat u.*.psl m60* | liftUp -nosort -type=".psl" -nohead stdout /cluster/data/fr1/jkStuff/liftAll.lft warn stdin | \ sort -T /tmp -k 14,14 -k 16,16n -k 17,17n | uniq > ../preblastHg16KG.psl cd .. blatDir=/cluster/data/hg16/bed/blat.hg16KG protDat -kg preblastHg16KG.psl $blatDir/hg16KG.psl $blatDir/kg.mapNames blastHg16KG.psl ssh hgwdev cd /cluster/data/fr1/bed/tblastn.hg16KG hgLoadPsl fr1 blastHg16KG.psl exit # back to kksilo rm -rf blastOut # End tblastn # BLASTZ SWAP FOR danRer2 vs. fr1 BLASTZ TO CREATE fr1 vs. danRer2 # (DONE, 2004-12-15, hartera) ssh kolossus mkdir -p /cluster/data/fr1/bed/blastz.danRer2.swap cd /cluster/data/fr1/bed/blastz.danRer2.swap # use axtChrom from blastzFr1 on danRer2 set aliDir = /cluster/data/danRer2/bed/blastz.fr1 cp $aliDir/S1.len S2.len cp $aliDir/S2.len S1.len mkdir unsorted axtChrom cat $aliDir/axtChrom/chr*.axt \ | axtSwap stdin $aliDir/S1.len $aliDir/S2.len stdout \ | axtSplitByTarget stdin unsorted # Sort the shuffled .axt files. 
foreach f (unsorted/*.axt) echo sorting $f:t:r axtSort $f axtChrom/$f:t end du -sh $aliDir/axtChrom unsorted axtChrom # 867M /cluster/data/danRer2/bed/blastz.fr1/axtChrom # 868M unsorted # 868M axtChrom rm -r unsorted # translate sorted axt files into psl cd /cluster/data/fr1/bed/blastz.danRer2.swap mkdir -p pslChrom set tbl = "blastzDanRer2" foreach f (axtChrom/chr*.axt) set c=$f:t:r echo "Processing chr $c" /cluster/bin/i386/axtToPsl $f S1.len S2.len pslChrom/${c}_${tbl}.psl end # Load database tables ssh hgwdev cd /cluster/data/fr1/bed/blastz.danRer2.swap/pslChrom foreach f (./*.psl) /cluster/bin/i386/hgLoadPsl fr1 $f echo "$f Done" end # featureBits fr1 all_mrna chrUn_blastzDanRer2 -enrichment # all_mrna 0.123%, chrUn_blastzDanRer2 17.731%, both 0.092%, cover 74.45%, # enrich 4.20x # featureBits fr1 all_mrna chrUn_blastzDanRer1 -enrichment # all_mrna 0.123%, chrUn_blastzDanRer1 17.372%, both 0.091%, cover 74.21%, # enrich 4.27x # CHAIN ZEBRAFISH (danRer2) BLASTZ (DONE, 2004-12-15, hartera) # APPLY chainAntiRepeat TO REMOVE CHAINS THAT ARE THE RESULTS OF REPEATS # AND DEGENERATE DNA (DONE, 2004-12-23, hartera) # Run axtChain on little cluster ssh kki cd /cluster/data/fr1/bed/blastz.danRer2.swap mkdir -p axtChain/run1 cd axtChain/run1 mkdir out chain ls -1S /cluster/data/fr1/bed/blastz.danRer2.swap/axtChrom/*.axt \ > input.lst cat << '_EOF_' > gsub #LOOP doChain {check in exists $(path1)} {check out line+ chain/$(root1).chain} {check out line+ out/$(root1).out} #ENDLOOP '_EOF_' # << happy emacs cat << '_EOF_' > doChain #!/bin/csh axtChain $1 /cluster/bluearc/fugu/fr1/chromNib \ /iscratch/i/danRer2/nib $2 >& $3 '_EOF_' # << happy emacs chmod a+x doChain gensub2 input.lst single gsub jobList para create jobList para try, check ... 
# para time # CPU time in finished jobs: 158s 2.63m 0.04h 0.00d 0.000 y # IO & Wait Time: 11s 0.19m 0.00h 0.00d 0.000 y # Average job time: 169s 2.82m 0.05h 0.00d # Longest job: 169s 2.82m 0.05h 0.00d # Submission to last job: 169s 2.82m 0.05h 0.00d # now on the cluster server, sort chains ssh kksilo cd /cluster/data/fr1/bed/blastz.danRer2.swap/axtChain chainMergeSort run1/chain/*.chain > all.chain chainSplit chain all.chain # take a look at score distr's foreach f (chain/*.chain) grep chain $f | awk '{print $2;}' | sort -nr > /tmp/score.$f:t:r echo $f:t:r >> hist5000.out textHistogram -binSize=5000 /tmp/score.$f:t:r >> hist5000.out echo "" end # filter with minScore=5000 mv all.chain all.chain.unfiltered chainFilter -minScore=5000 all.chain.unfiltered > all.chainFilt5k rm -r chain chainSplit chain all.chainFilt5k # remove repeats from chains and reload into database # (2004-12-23, hartera) ssh kksilo /cluster/data/fr1/bed/blastz.danRer2.swap/axtChain gunzip chain/chr*.gz mv chain chainRaw mkdir chain cd chainRaw foreach f (*.chain) set c = $f:r echo $c nice chainAntiRepeat /cluster/bluearc/fugu/fr1/chromNib \ /cluster/bluearc/danRer2/nib $f \ ../chain/$c.chain end cd .. 
chainMergeSort ./chain/*.chain > all.chain.antirepeat chainSplit chainAR all.chain.antirepeat # Load chains into database ssh hgwdev cd /cluster/data/fr1/bed/blastz.danRer2.swap/axtChain/chainAR foreach i (*.chain) set c = $i:r hgLoadChain fr1 ${c}_chainDanRer2 $i echo done $c end # featureBits fr1 all_mrna chrUn_chainDanRer2Link -enrichment # all_mrna 0.123%, chrUn_chainDanRer2Link 15.517%, both 0.089%, cover 72.00%, # enrich 4.64x # after chainAntiRepeat: # featureBits fr1 all_mrna chrUn_chainDanRer2Link -enrichment # all_mrna 0.126%, chrUn_chainDanRer2Link 15.505%, both 0.091%, cover 72.03%, # enrich 4.65x # NET ZEBRAFISH (danRer2) BLASTZ (DONE, 2004-12-15, hartera) # RE-DO NET WITH CHAINS FILTERED BY chainAntiRepeat (DONE, 2004-12-23,hartera) ssh kksilo cd /cluster/data/fr1/bed/blastz.danRer2.swap/axtChain rm -r preNet mkdir preNet cd chainAR foreach i (*.chain) echo preNetting $i /cluster/bin/i386/chainPreNet $i ../../S1.len ../../S2.len \ ../preNet/$i end cd .. mkdir n1 cd preNet foreach i (*.chain) set n = $i:r.net echo primary netting $i /cluster/bin/i386/chainNet $i -minSpace=1 ../../S1.len ../../S2.len \ ../n1/$n /dev/null end cd .. 
cat n1/*.net | /cluster/bin/i386/netSyntenic stdin noClass.net # memory usage 77627392, utime 389 s/100, stime 103 # Add classification info using db tables: ssh hgwdev cd /cluster/data/fr1/bed/blastz.danRer2.swap/axtChain # use -noAr option - don't look for ancient repeats netClass -noAr noClass.net fr1 danRer2 zfishdanRer2.net # Load the nets into database ssh hgwdev cd /cluster/data/fr1/bed/blastz.danRer2.swap/axtChain netFilter -minGap=10 zfishdanRer2.net | hgLoadNet fr1 netDanRer2 stdin # featureBits fr1 all_mrna netDanRer2 -enrichment # all_mrna 0.123%, netDanRer2 31.390%, both 0.096%, cover 77.94%, enrich 2.48x # featureBits fr1 all_mrna netDanRer2 -enrichment # all_mrna 0.126%, netDanRer2 31.379%, both 0.098%, cover 77.94%, enrich 2.48x # BLASTZ FR1 CLEANUP (DONE, 2004-12-15, hartera) # RE-DONE (DONE, 2004-12-23, hartera) ssh kksilo cd /cluster/data/fr1/bed/blastz.danRer2.swap nice rm axtChain/run1/chain/* & nice rm -fr axtChain/n1 axtChain/noClass.net axtChain/preNet & nice gzip axtChrom/* pslChrom/* axtChain/all.chainFilt5k axtChain/all.chain.unfiltered axtChain/*.net axtChain/chain/*.chain & nice gzip axtChain/all.chain.antirepeat axtChain/chainAR/*.chain & nice rm -fr axtChain/chain axtChain/chainRaw & # MAKE VSDANRER2 DOWNLOADABLES (DONE, 2004-12-23, hartera) ssh hgwdev cd /cluster/data/fr1/bed/blastz.danRer2.swap/axtChrom set gp = /usr/local/apache/htdocs/goldenPath/fr1 mkdir -p $gp/vsDanRer2/axtChrom gunzip chrUn.axt.gz cp -p *.axt $gp/vsDanRer2/axtChrom cd $gp/vsDanRer2/axtChrom gzip *.axt md5sum *.gz > md5sum.txt # copy chains and nets to downloads area cd /cluster/data/fr1/bed/blastz.danRer2.swap/axtChain gzip -c all.chain.antirepeat > \ /cluster/data/fr1/zip/zebrafishDanRer2.chain.gz gzip -c zfishdanRer2.net > /cluster/data/fr1/zip/zebrafishDanRer2.net.gz cd $gp/vsDanRer2 mv /cluster/data/fr1/zip/zebrafish*.gz . md5sum *.gz > md5sum.txt # Copy over & edit README.txt w/pointers to chain, net formats. 
# MAKE Human Proteins track (hg17 DONE ssh eieio mkdir -p /cluster/data/fr1/blastDb cd /cluster/data/fr1/blastDb ln -s ../superscaffolds/*.fa . for i in ss* do formatdb -i $i -p F done rm *.log *.fa ssh kkr1u00 mkdir -p /iscratch/i/fr1/blastDb cp /cluster/data/fr1/blastDb/* /iscratch/i/fr1/blastDb (iSync) > sync.out exit # back to eieio ssh kk mkdir -p /cluster/data/fr1/bed/tblastn.hg17KG cd /cluster/data/fr1/bed/tblastn.hg17KG ls -1S /iscratch/i/fr1/blastDb/*.nsq | sed "s/\.nsq//" > query.lst calc `wc /cluster/data/hg17/bed/blat.hg17KG/hg17KG.psl | awk "{print \\\$1}"`/\(200000/`wc query.lst | awk "{print \\\$1}"`\) # 42156/(200000/578) = 121.830840 mkdir -p /cluster/bluearc/fr1/bed/tblastn.hg17KG/kgfa split -l 122 /cluster/data/hg17/bed/blat.hg17KG/hg17KG.psl /cluster/bluearc/fr1/bed/tblastn.hg17KG/kgfa/kg ln -s /cluster/bluearc/fr1/bed/tblastn.hg17KG/kgfa kgfa cd kgfa for i in *; do pslxToFa $i $i.fa; rm $i; done cd .. ls -1S kgfa/*.fa > kg.lst mkdir blastOut for i in `cat kg.lst`; do mkdir blastOut/`basename $i .fa`; done tcsh cat << '_EOF_' > blastGsub #LOOP blastSome $(path1) {check in line $(path2)} {check out exists blastOut/$(root2)/q.$(root1).psl } #ENDLOOP '_EOF_' # << happy emacs cat << '_EOF_' > blastSome #!/bin/sh BLASTMAT=/iscratch/i/blast/data export BLASTMAT g=`basename $2` f=/tmp/`basename $3`.$g for eVal in 0.01 0.001 0.0001 0.00001 0.000001 1E-09 1E-11 do if /scratch/blast/blastall -M BLOSUM80 -m 0 -F no -e $eVal -p tblastn -d $1 -i $2 -o $f.8 then mv $f.8 $f.1 break; fi done if test -f $f.1 then if /cluster/bin/i386/blastToPsl $f.1 $f.2 then liftUp -nosort -type=".psl" -pslQ -nohead $3.tmp /cluster/data/hg17/bed/blat.hg17KG/protein.lft warn $f.2 if pslCheck -prot $3.tmp then mv $3.tmp $3 rm -f $f.1 $f.2 fi exit 0 fi fi rm -f $f.1 $f.2 $3.tmp $f.8 exit 1 '_EOF_' # << happy emacs chmod +x blastSome gensub2 query.lst kg.lst blastGsub blastSpec cd /cluster/data/fr1/bed/tblastn.hg17KG para create blastSpec para push # Completed: 199988 of 199988 
jobs # CPU time in finished jobs: 10762864s 179381.06m 2989.68h 124.57d 0.341 y # IO & Wait Time: 2421413s 40356.89m 672.61h 28.03d 0.077 y # Average job time: 66s 1.10m 0.02h 0.00d # Longest job: 819s 13.65m 0.23h 0.01d # Submission to last job: 36973s 616.22m 10.27h 0.43d cat << '_EOF_' > chainGsub #LOOP chainSome $(path1) #ENDLOOP '_EOF_' # << happy emacs cat << '_EOF_' > chainSome (cd $1; cat q.*.psl | simpleChain -prot -outPsl -maxGap=50000 stdin ../c.`basename $1`.psl) '_EOF_' # << happy emacs chmod +x chainSome ls -1dS `pwd`/blastOut/kg?? > chain.lst gensub2 chain.lst single chainGsub chainSpec ssh kki cd /cluster/data/fr1/bed/tblastn.hg17KG para create chainSpec para push # Completed: 346 of 346 jobs # CPU time in finished jobs: 2576s 42.94m 0.72h 0.03d 0.000 y # IO & Wait Time: 34240s 570.66m 9.51h 0.40d 0.001 y # Average job time: 106s 1.77m 0.03h 0.00d # Longest job: 379s 6.32m 0.11h 0.00d # Submission to last job: 2654s 44.23m 0.74h 0.03d exit # back to eieio cd /cluster/data/fr1/bed/tblastn.hg17KG/blastOut for i in kg?? do awk "(\$13 - \$12)/\$11 > 0.6 {print}" c.$i.psl > c60.$i.psl sort -rn c60.$i.psl | pslUniq stdin u.$i.psl awk "((\$1 / \$11) ) > 0.60 { print }" c60.$i.psl > m60.$i.psl echo $i done # NOTE(review): sort keys fixed -- the transcript repeated "-k 17,17n" twice; # the second key should be the psl tStart field (-k 16,16n), matching the # equivalent hg16 tblastn post-processing step earlier in this doc cat u.*.psl m60* | sort -T /tmp -k 14,14 -k 16,16n -k 17,17n | uniq | liftUp stdout ../../../Un/chrUn.lft warn stdin > /cluster/data/fr1/bed/tblastn.hg17KG/blastHg17KG.psl cd .. 
liftUp Un/chrUn.fa.out Un/chrUn.lft warn RMRun/Un/*.out ssh hgwdev cd /cluster/data/fr1/bed/tblastn.hg17KG hgLoadPsl fr1 blastHg17KG.psl # back to eieio rm -rf blastOut # End tblastn ######################################################################### # MOUSE NET/CHAINS MM6 - Info contained in makeMm6.doc (200503 Hiram) # SPLIT MASKED chrUn SEQUENCE INTO SCAFFOLDS AND MAKE SCAFFOLDS 2BIT FILE # (DONE, 2005-08-13, hartera) # ADDED SOFT-MASKED SCAFFOLDS 2BIT FILE AND LIFT FILE AND chrUn 2BIT FILE # TO SAN FOR CLUSTER RUNS AND CREATE FILE OF SCAFFOLD SIZES # (DONE, 2005-08-13 and 2006-04-28, hartera) ssh eieio cd /cluster/data/fr1/Un # for chrUn get masked sequence for soft-masked mkdir scaffoldsSoftMask mkdir -p /panasas/store/fr1/scaffolds set outDir=/panasas/store/fr1/scaffolds awk 'BEGIN {FS="\t"}{if ($5 != "N") \ print "/cluster/data/fr1/Un/scaffoldsSoftMask/getMaskedFaFrags.csh /iscratch/i/fr1/chrUn.fa",$2-1, $3, "'$outDir/'"$6".fa", "'$outDir/'"$6".log";}' \ chrUn.agp >> ./scaffoldsSoftMask/faFragSoftMask # check number of scaffolds grep "scaffold" chrUn.agp | wc -l # 20379 wc -l ./scaffoldsSoftMask/faFragSoftMask # 20379 ./scaffoldsSoftMask/faFragSoftMask # since there are so many, then use kilokluster # need to make a wrapper round faFrag as parasol can not # deal with redirects. cd scaffoldsSoftMask # NOTE(review): csh "set" takes a bare variable name on the left-hand side; # the transcript had a stray $ on each name (set $inFa=$1), which fails with # "inFa: Undefined variable" -- corrected below cat << '_EOF_' > getMaskedFaFrags.csh #!/bin/csh -fe set inFa = $1 set st = $2 set end = $3 set outFa = $4 set outLog = $5 faFrag -mixed $inFa $st $end $outFa >& $outLog '_EOF_' # << happy emacs chmod +x getMaskedFaFrags.csh ssh kkr1u00 cp /cluster/data/fr1/Un/chrUn.fa /iscratch/i/fr1 /cluster/bin/iSync ssh kk cd /cluster/data/fr1/Un/scaffoldsSoftMask mkdir run cd run cp ../faFragSoftMask jobList para create jobList para try, check, push, check etc. 
# para time # Completed: 20379 of 20379 jobs # CPU time in finished jobs: 518059s 8634.32m 143.91h 6.00d 0.016 y # IO & Wait Time: 2831188s 47186.46m 786.44h 32.77d 0.090 y # Average job time: 164s 2.74m 0.05h 0.00d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 469s 7.82m 0.13h 0.01d # Submission to last job: 10106s 168.43m 2.81h 0.12d # check a few sequences to see that they are correct ssh kolossus cd /cluster/data/fr1/Un/scaffoldsSoftMask foreach f (/panasas/store/fr1/scaffolds/scaf*.fa) set g=$f:t:r echo $g >> names perl -pi.bak -e "s/>chr[0-9A-Za-z\-\:]+/>$g/" $f cat $f >> /panasas/store/fr1/scaffolds/scaffoldMaskedUnFr1.fa end grep ">" /panasas/store/fr1/scaffolds/scaffoldMaskedUnFr1.fa | wc -l # 20379 # then make 2bit file for blastz cluster runs faToTwoBit /panasas/store/fr1/scaffolds/scaffoldMaskedUnFr1.fa \ /cluster/data/fr1/fr1UnScaffolds.2bit ssh kkr1u00 mkdir -p /iscratch/i/fr1/UnScaffolds cp /cluster/data/fr1/fr1UnScaffolds.2bit /iscratch/i/fr1/UnScaffolds/ /cluster/bin/iSync # add to the san for cluster runs on pk ssh hgwdev mkdir -p /san/sanvol1/scratch/fr1 # already added to /san/sanvol1/scratch/fr1/UnScaffolds # (2005-08-13, hiram) - file is fr1UnScaffolds.2bit # also put lift file and chrUn 2 bit file on the san (2006-04-28, hartera) cp /cluster/data/fr1/Un/lift/ordered.lft \ /san/sanvol1/scratch/fr1/UnScaffolds cp /cluster/data/fr1/fr1.2bit /san/sanvol1/scratch/fr1/ # create scaffolds sizes file (2006-04-28, hartera) cd /san/sanvol1/scratch/fr1/UnScaffolds twoBitInfo fr1UnScaffolds.2bit scaffolds.sizes # BLASTZ SWAP FOR FUGU (fr1) vs ZEBRAFISH (danRer3) (DONE, 2005-08-18, hartera) # CREATE CHAIN AND NET TRACKS, AXTNET, MAFNET AND ALIGNMENT DOWNLOADS # RECREATE DOWNLOADS AS THE ZEBRFISH DOWNLOADS DIRECTORY HAS BEEN DELETED # (DONE, 2005-11-17, hartera) ssh hgwdev # do swap of danRer3 vs fr1 chain and net alignments to # create fr1 vs danRer3. see makeDanRer3.doc for details. 
# there are no lineage-specific repeats for these species cd /cluster/data/danRer3/bed/blastz.fr1 nohup nice /cluster/bin/scripts/doBlastzChainNet.pl `pwd`/DEF \ -workhorse kkr1u00 -swap -chainMinScore=5000 >& doSwap.log & # Start: Aug 18 13:11 # Finish: Aug 18 13:29 # Blastz parameters are as for danRer3 vs. fr1 - see makeDanRer3.doc # BLASTZ_H=2000 # BLASTZ_Q=/cluster/data/blastz/HoxD55.q # BLASTZ_ABRIDGE_REPEATS=0 # Run directory files are already on /cluster/data. Remake downloads # for zebrafish (danRer3) alignments since these have been removed from # the downloads directory. (hartera, 2005-11-17) cd /cluster/data/danRer3/bed/blastz.fr1 nice /cluster/bin/scripts/doBlastzChainNet.pl `pwd`/DEF \ -swap -continue download -stop download >& doSwapDownload.log & # MAKE Human Proteins track (DONE 2005-06-27 braney) mkdir -p /cluster/data/fr1/bed/tblastn.hg17KG cd /cluster/data/fr1/bed/tblastn.hg17KG ls -1S /iscratch/i/fr1/blastDb/*.nsq | sed "s/\.nsq//" > target.lst mkdir kgfa # calculate a reasonable number of jobs calc `wc /cluster/data/hg17/bed/blat.hg17KG/hg17KG.psl | awk "{print \\\$1}"`/\(150000/`wc target.lst | awk "{print \\\$1}"`\) # 37365/(150000/578) = 143.979800 split -l 144 /cluster/data/hg17/bed/blat.hg17KG/hg17KG.psl kgfa/kg cd kgfa for i in *; do pslxToFa $i $i.fa; rm $i; done cd .. 
ls -1S kgfa/*.fa > kg.lst rm -rf /cluster/bluearc/fr1/bed/tblastn.hg17KG/blastOut mkdir -p /cluster/bluearc/fr1/bed/tblastn.hg17KG/blastOut ln -s /cluster/bluearc/fr1/bed/tblastn.hg17KG/blastOut for i in `cat kg.lst`; do mkdir blastOut/`basename $i .fa`; done cat << '_EOF_' > blastGsub #LOOP blastSome $(path1) {check in line $(path2)} {check out exists blastOut/$(root2)/q.$(root1).psl } #ENDLOOP '_EOF_' # << happy emacs cat << '_EOF_' > blastSome #!/bin/sh BLASTMAT=/iscratch/i/blast/data export BLASTMAT f=/tmp/`basename $3` for eVal in 0.01 0.001 0.0001 0.00001 0.000001 1E-09 1E-11 do if /scratch/blast/blastall -M BLOSUM80 -m 0 -F no -e $eVal -p tblastn -d $1 -i $2 -o $f.8 then mv $f.8 $f.1 break; fi done if test -f $f.1 then if /cluster/bin/i386/blastToPsl $f.1 $f.2 then liftUp -nosort -type=".psl" -pslQ -nohead $3.tmp /cluster/data/hg17/bed/blat.hg17KG/protein.lft warn $f.2 mv $3.tmp $3 rm -f $f.1 $f.2 exit 0 fi fi rm -f $f.1 $f.2 $3.tmp $f.8 exit 1 '_EOF_' # << happy emacs chmod +x blastSome gensub2 target.lst kg.lst blastGsub blastSpec ssh kk cd /cluster/data/fr1/bed/tblastn.hg17KG para create blastSpec para push # Completed: 150280 of 150280 jobs # CPU time in finished jobs: 4621712s 77028.53m 1283.81h 53.49d 0.147 y # IO & Wait Time: 1350989s 22516.48m 375.27h 15.64d 0.043 y # Average job time: 40s 0.66m 0.01h 0.00d # Longest finished job: 204s 3.40m 0.06h 0.00d # Submission to last job: 72366s 1206.10m 20.10h 0.84d cat << '_EOF_' > chainGsub #LOOP chainSome $(path1) #ENDLOOP '_EOF_' # << happy emacs cat << '_EOF_' > chainSome (cd $1; cat q.*.psl | simpleChain -prot -outPsl -maxGap=25000 stdin ../c.`basename $1`.psl) '_EOF_' # << happy emacs chmod +x chainSome ls -1dS `pwd`/blastOut/kg?? 
> chain.lst gensub2 chain.lst single chainGsub chainSpec ssh kki cd /cluster/data/fr1/bed/tblastn.hg17KG para create chainSpec para push # Completed: 260 of 260 jobs # CPU time in finished jobs: 614s 10.24m 0.17h 0.01d 0.000 y # IO & Wait Time: 12566s 209.43m 3.49h 0.15d 0.000 y # Average job time: 51s 0.84m 0.01h 0.00d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 156s 2.60m 0.04h 0.00d # Submission to last job: 969s 16.15m 0.27h 0.01d ssh eieio cd /cluster/data/fr1/bed/tblastn.hg17KG/blastOut for i in kg?? do awk "(\$13 - \$12)/\$11 > 0.6 {print}" c.$i.psl > c60.$i.psl sort -rn c60.$i.psl | pslUniq stdin u.$i.psl awk "((\$1 / \$11) ) > 0.60 { print }" c60.$i.psl > m60.$i.psl echo $i done cat u.*.psl m60* | liftUp -nosort -type=".psl" -nohead stdout /cluster/data/fr1/jkStuff/liftAll.lft warn stdin | \ sort -u -T /tmp -k 14,14 -k 16,16n -k 17,17n > /cluster/data/fr1/bed/tblastn.hg17KG/blastHg17KG.psl cd .. ssh hgwdev cd /cluster/data/fr1/bed/tblastn.hg17KG hgLoadPsl fr1 blastHg17KG.psl exit rm -rf blastOut # End tblastn # BLASTZ DANRER1 CLEANUP (DONE, 2006-02-07, hartera) ssh kkstore02 cd /cluster/data/fr1/bed/blastz.danRer1.swap.2004-06-16 rm pslChrom/psl.tab nice gzip axtChrom/* pslChrom/* & ############################################################################ ## BLASTZ swap from mm8 alignments (DONE - 2006-02-19 - Hiram) ssh pk cd /cluster/data/mm8/bed/blastzFr1.2006-02-19 time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \ -swap `pwd`/DEF > swap.out 2>&1 & time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \ -swap -continue=net `pwd`/DEF > swap.net.out 2>&1 & time nice -n +19 featureBits fr1 chainMm8Link # 42671288 bases of 315518167 (13.524%) in intersection ############################################################################ ## BLASTZ swap from hg17 alignments (DONE 2006-04-09 markd) 
mkdir /cluster/data/fr1/bed/blastz.hg17.swap ln -s blastz.hg17.swap /cluster/data/fr1/bed/blastz.hg17 cd /cluster/data/fr1/bed/blastz.hg17.swap # cp /cluster/data/hg17/bed/blastz.fr1/DEF . # edit to change /iscratch/i/hg17/bothMaskedNibs to /scratch/hg/hg17/bothMaskedNibs time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 -stop=net \ -swap -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \ ./DEF >& swap.out& # create the net file (DONE 2006-04-10 markd) ssh hgwdev cd /cluster/data/fr1/bed/blastz.hg17.swap/axtChain nice netClass -verbose=0 -noAr noClass.net fr1 hg17 fr1.hg17.net nice gzip fr1.hg17.net ########################################################################### # BLASTZ SWAP TO CREATE CHAIN AND NET ALIGNMENTS FOR ZEBRAFISH (danRer4) # AND CREATE AXTNET, MAFNET, LIFTOVER AND ALIGNMENT DOWNLOADS # (DONE, 2006-04-29, hartera) ssh pk # Used these BLASTZ parameters - see also makeDanRer4.doc # BLASTZ_H=2000 # BLASTZ_M=50 # BLASTZ_Q=/cluster/data/blastz/HoxD55.q # and no lineage-specific repeats. # Alignments are in: /cluster/data/fr1/bed/blastz.danRer4.swap cd /cluster/data/danRer4/bed/blastz.fr1.2006-04-28 nice /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \ -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \ -swap `pwd`/DEF >& doSwap.log & # Took about 30 minutes. # check coverage compared to danRer3: featureBits fr1 chainDanRer4Link # 86236583 bases of 315518167 (27.332%) in intersection # There is no refGene table for fr1 so use mrna: featureBits fr1 mrna chainDanRer4Link -enrichment # mrna 0.187%, chainDanRer4Link 27.332%, both 0.153%, cover 82.08%, # enrich 3.00x featureBits fr1 mrna netDanRer4 -enrichment # mrna 0.187%, netDanRer4 69.291%, both 0.172%, cover 92.16%, enrich 1.33x featureBits fr1 mrna netDanRer3 -enrichment # mrna 0.187%, netDanRer3 68.354%, both 0.171%, cover 91.52%, enrich 1.34x # Runs out of memory for danRer3 chains. For nets, similar coverage for # danRer4 and danRer3 nets on fr1. 
##################################################################### # SEGMENTAL DUPLICATIONS (DONE 6/30/06 angie) # File emailed from Xinwei She mkdir /cluster/data/fr1/bed/genomicSuperDups cd /cluster/data/fr1/bed/genomicSuperDups liftUp -type=.bed stdout /cluster/data/fr1/jkStuff/liftAll.lft warn \ fr1_genomicSuperDup.tab \ | sed -e 's/\t_\t/\t-\t/' \ | awk '($3 - $2) >= 500 && ($9 - $8) >= 500 {print;}' \ | hgLoadBed fr1 genomicSuperDups stdin \ -sqlTable=$HOME/kent/src/hg/lib/genomicSuperDups.sql ######################################################################### ## Reorder Fish organisms (DONE - 2006-12-22 - Hiram) hgsql -h genome-testdb hgcentraltest \ -e "update dbDb set orderKey = 465 where name = 'fr1';" ######################################################################### ## BLASTZ SWAP MEDAKA oryLat1 to Fr1 (DONE - 2007-01-11 - Hiram) ssh kkstore02 mkdir /cluster/data/fr1/bed/blastz.oryLat1.swap cd /cluster/data/fr1/bed/blastz.oryLat1.swap time doBlastzChainNet.pl -verbose=2 \ /cluster/data/oryLat1/bed/blastz.fr1.2006-12-22/DEF \ -chainMinScore=2000 -chainLinearGap=loose \ -tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \ -swap -bigClusterHub=pk > swap.log 2>&1 & # real 22m4.224s ######################################################################### ## MEDAKA ORYLAT1 BLASTZ (WORKING - 2007-01-11 - Hiram) ## The above swap doesn't contain the Medaka chrUn scaffolds # and it would be good to have the Fr1 alignments back on Medaka ## So, to get them, run this blastz of the Medaka chroms and chrUn ## Scaffolds, then swap it back to Medaka, no Db loads as we are ## going to extract the chrUn chains and nets out of this business cat << '_EOF_' > DEF # Fugu vs. 
Medaka chroms and chrUn in scaffolds # Try "human-fugu" (more distant, less repeat-killed than mammal) params # +M=50: BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=6000 BLASTZ_K=2200 BLASTZ_M=50 BLASTZ_Q=/cluster/data/blastz/HoxD55.q # TARGET: Fugu fr1 # Align to the scaffolds, results lifed up to chrUn.sdTrf coordinates # 20,397 scaffolds, largest 1.1M bases, smallest 107 bases SEQ1_DIR=/san/sanvol1/scratch/fr1/chrUn.sdTrf.2bit SEQ1_LEN=/san/sanvol1/scratch/fr1/chrom.sizes SEQ1_CTGDIR=/san/sanvol1/scratch/fr1/fr1.UnScaffolds.sdTrf.2bit SEQ1_CTGLEN=/san/sanvol1/scratch/fr1/scaffold.sizes SEQ1_LIFT=/san/sanvol1/scratch/fr1/UnScaffolds/ordered.lft SEQ1_CHUNK=1000000 SEQ1_LAP=10000 SEQ1_LIMIT=10 # QUERY: Medaka oryLat1, chrUn in scaffolds # 36,494 scaffolds, 25 chroms SEQ2_DIR=/san/sanvol1/scratch/oryLat1/oryLat1.sdTrf.2bit SEQ2_LEN=/san/sanvol1/scratch/oryLat1/oryLat1.sdTrf.sizes SEQ2_CTGDIR=/san/sanvol1/scratch/oryLat1/oryLat1UnScaffolds.2bit SEQ2_CTGLEN=/san/sanvol1/scratch/oryLat1/oryLat1UnScaffolds.sizes SEQ2_LIFT=/san/sanvol1/scratch/oryLat1/chrUn.lift SEQ2_CHUNK=1000000 SEQ2_LAP=0 SEQ2_LIMIT=10 BASE=/cluster/data/fr1/bed/blastz.oryLat1.2007-01-11 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs time doBlastzChainNet.pl -verbose=2 DEF \ -chainMinScore=2000 -chainLinearGap=loose \ -tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \ -stop=net -bigClusterHub=pk > do.log 2>&1 & ########################################################################## ## Swap Stickleback gasAcu1 alignments back to fr1 ## (DONE - 2006-12-11 - Hiram) mkdir /cluster/data/fr1/bed/blastz.gasAcu1.swap cd /cluster/data/fr1/bed/blastz.gasAcu1.swap time doBlastzChainNet.pl \ /cluster/data/gasAcu1/bed/blastz.fr1.2006-12-08/DEF \ -chainMinScore=2000 -chainLinearGap=loose \ -tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \ -swap -bigClusterHub=pk > do.log 2>&1 & time doBlastzChainNet.pl \ /cluster/data/gasAcu1/bed/blastz.fr1.2006-12-08/DEF \ -continue=net -chainMinScore=2000 
-chainLinearGap=loose \ -tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \ -swap -bigClusterHub=pk > net.log 2>&1 & # real 24m33.761s # user 0m0.125s # sys 0m0.080s ssh hgwdev cd /cluster/data/fr1/bed/blastz.gasAcu1.swap time nice -n 19 featureBits fr1 chainGasAcu1Link \ > fb.fr1.chainGasAcu1Link.txt 2>&1 & # 148005715 bases of 315518167 (46.909%) in intersection ####################################################################### ## Stickleback gasAcu1 BLASTZ (WORKING - 2007-01-11 - Hiram) ## The above swap doesn't contain the the Stickleback random contigs ## and it would be good to have the Fr1 alignments back on Stickleback ## So, to get them, run this blastz of the Stickleback chroms and random ## contigs, then swap it back to Stickleback, no Db loads as we are ## going to extract the random chains and nets out of this business ## (WORKIN - 2007-01-11 - Hiram) ssh kkstore02 mkdir /cluster/data/fr1/bed/blastz.gasAcu1.2007-01-11 cd /cluster/data/fr1/bed/blastz.gasAcu1.2007-01-11 cat << '_EOF_' > DEF # Fugu vs. 
Stickleback chroms and randoms in scaffolds # Try "human-fugu" (more distant, less repeat-killed than mammal) params # +M=50: BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=6000 BLASTZ_K=2200 BLASTZ_M=50 BLASTZ_Q=/cluster/data/blastz/HoxD55.q # TARGET: Fugu fr1 # Align to the scaffolds, results lifed up to chrUn.sdTrf coordinates # 20,397 scaffolds, largest 1.1M bases, smallest 107 bases SEQ1_DIR=/san/sanvol1/scratch/fr1/chrUn.sdTrf.2bit SEQ1_LEN=/san/sanvol1/scratch/fr1/chrom.sizes SEQ1_CTGDIR=/san/sanvol1/scratch/fr1/fr1.UnScaffolds.sdTrf.2bit SEQ1_CTGLEN=/san/sanvol1/scratch/fr1/scaffold.sizes SEQ1_LIFT=/san/sanvol1/scratch/fr1/UnScaffolds/ordered.lft SEQ1_CHUNK=1000000 SEQ1_LAP=10000 SEQ1_LIMIT=10 # QUERY: Stickleback gasAcu1 chroms and chrUn in contigs # The largest chrom is 32M bases, the largest contig 418,670 bases # The smallest chrom chrM is 15,742 bases, smallest contig 60 bases # there are 5,115 chroms and contig pieces # total size 1,089,690,673 bases # 47107538 N's 1042583135 real 834274605 upper 208308530 lower SEQ2_DIR=/san/sanvol1/scratch/gasAcu1/gasAcu1.sdTrf.2bit SEQ2_LEN=/san/sanvol1/scratch/gasAcu1/gasAcu1.sdTrf.sizes SEQ2_CTGDIR=/san/sanvol1/scratch/gasAcu1/gasAcu1.randomContigs.sdTrf.2bit SEQ2_CTGLEN=/san/sanvol1/scratch/gasAcu1/gasAcu1.randomContigs.sdTrf.sizes SEQ2_LIFT=/san/sanvol1/scratch/gasAcu1/chrUn.extraCloneGap.lift SEQ2_CHUNK=1000000 SEQ2_LIMIT=20 SEQ2_LAP=0 BASE=/cluster/data/fr1/bed/blastz.gasAcu1.2007-01-11 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs time doBlastzChainNet.pl -verbose=2 DEF -smallClusterHub=pk \ -chainMinScore=2000 -chainLinearGap=loose \ -tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \ -blastzOutRoot /cluster/bluearc/fr1GasAcu1Un \ -stop=net -bigClusterHub=pk > do.log 2>&1 & XXXX - running 2007-01-11 20:55 time doBlastzChainNet.pl -verbose=2 DEF -smallClusterHub=pk \ -chainMinScore=2000 -chainLinearGap=loose \ -tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \ -blastzOutRoot /cluster/bluearc/fr1GasAcu1Un \ 
-continue=cat -stop=net -bigClusterHub=pk > cat-net.log 2>&1 & # real 170m43.012s ## An interesting failure in this one that did not stop it: # HgStepManager: executing step 'chainMerge'. # ssh -x kolossus nice 'chainMergeSort # /cluster/data/fr1/bed/blastz.gasAcu1.2007-01-11/axtChain/run/chain/*.chain | # nice gzip -c > # /cluster/data/fr1/bed/blastz.gasAcu1.2007-01-11/axtChain/fr1.gasAcu1.all.chain.gz' # bash: /bin/nice: Argument list too long ## To work-around this: ssh kolossus cd /cluster/data/fr1/bed/blastz.gasAcu1.2007-01-11/axtChain/run/chain chainMergeSort *.chain | gzip -c > ../../fr1.gasAcu1.all.chain.gz cd ../.. chainSplit chain fr1.gasAcu1.all.chain.gz time ./netChains.csh > netChains.out 2>&1 & # real 12m14.200s ## OK, that is sufficient to get going with the swap mkdir /cluster/data/gasAcu1/bed/blastz.fr1.swap cd /cluster/data/gasAcu1/bed/blastz.fr1.swap time doBlastzChainNet.pl -verbose=2 \ /cluster/data/fr1/bed/blastz.gasAcu1.2007-01-11/DEF \ -chainMinScore=2000 -chainLinearGap=loose \ -tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \ -swap -stop=net -bigClusterHub=pk > swap.log 2>&1 & ########################################################################### ## Chicken/Fugu chain/net swap - (DONE - 2007-03-12 - Hiram) mkdir /cluster/data/fr1/bed/blastz.galGal3.swap cd /cluster/data/fr1/bed/blastz.galGal3.swap time doBlastzChainNet.pl -verbose=2 -qRepeats=windowmaskerSdust \ /cluster/data/galGal3/bed/blastz.fr1.2007-03-09/DEF \ -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \ -swap > swap.log 2>&1 & # real 3m14.494s cat fb.fr1.chainGalGal3Link.txt # 33536400 bases of 315518167 (10.629%) in intersection ######################################################################### # MAKE 11.OOC FILE FOR BLAT (DONE - 2007-04-09 - Hiram) # This will find repeats within the genome that should not be matched # against. Uses 11-mers. 
# Use -repMatch=128 (based on size -- for human we use 1024, and # fugu size is ~12% of human judging by gapless fr2 vs. hg18 # genome sizes from featureBits. ssh kolossus blat /cluster/data/fr1/fr1.2bit /dev/null /dev/null -tileSize=11 \ -makeOoc=/cluster/data/fr1/11.ooc -repMatch=128 # Wrote 5850 overused 11-mers to /cluster/data/fr1/11.ooc cp -p /cluster/data/fr1/11.ooc /cluster/bluearc/fr1 ######################################################################### # CLEANUP OF DANRER2 BLASTZ (DONE, 2007-06-25, hartera) ssh kkstore02 cd /cluster/store5/Fugu_Rubripes_V3/bed/blastz.danRer2.swap/ # split chain files can be recreated from all.chain files so remove chains rm -r chainAR cd .. gzip axtChrom/*.axt # removed about 0.9 G of data