# for emacs: -*- mode: sh; -*-

# DOWNLOAD SEQUENCE (DONE 10/15/04 Fan)
    ssh kksilo
    mkdir /cluster/store8/xenTro1
    cd /cluster/data
    ln -s /cluster/store8/xenTro1 xenTro1
    cd /cluster/data/xenTro1
    mkdir downloads
    cd downloads
    wget --timestamp ftp://ftp.jgi-psf.org/pub/JGI_data/Frog/v3.0/xenopus.041015.fasta.gz
    wget --timestamp ftp://ftp.jgi-psf.org/pub/JGI_data/Frog/v3.0/xenopus.041015.allmasked.gz
    wget --timestamp ftp://ftp.jgi-psf.org/pub/JGI_data/Frog/v3.0/repeats_lib/xt3.lib1.fasta.gz
    gzip -d *.gz
    cd ..

# SPLIT SCAFFOLDS (DONE 10/17/04, Fan)
    ssh kksilo
    cd /cluster/data/xenTro1
    mkdir scaffolds html
    # /cluster/bin/i386/faSplit byname downloads/xenopus.041015.fasta scaffolds -outDirDepth=3
    # Tried the above and, as noted by Heather before, not sure why the pieces didn't get put
    # under the scaffolds directory; removed 1, 2, 3, ... and tried it differently:
    cd scaffolds
    /cluster/bin/i386/faSplit byname ../downloads/xenopus.041015.fasta scaffolds -outDirDepth=3
    cd ..
    # generate list
    find scaffolds -print > scaffolds.list.0
    grep fa scaffolds.list.0 > scaffolds.list

# MAKE 2BIT NIB FILE (DONE 10/18/04, Fan)
    ssh kksilo
    cd /cluster/data/xenTro1/downloads
    /cluster/bin/i386/faToTwoBit xenopus.041015.fasta xenTro1.2bit
    # The xenopus.041015.fasta file has 70 bases per line instead of 50.
    # Created a specialized twoBitToFa70 program to generate a 70-base-per-line fa file.
    /cluster/bin/i386/twoBitToFa70 xenTro1.2bit check70.fa
    # diff got a memory exhausted error on kksilo;
    # went to kolossus and did the diff between check70.fa and xenopus.041015.fasta.
    # They are identical except that xenopus.041015.fasta has some empty lines at the end
    # of some records.
    mv xenTro1.2bit ..

    ssh hgwdev
    mkdir /gbdb/xenTro1
    ln -s /cluster/data/xenTro1/html /gbdb/xenTro1
    ln -s /cluster/data/xenTro1/xenTro1.2bit /gbdb/xenTro1

# CREATE DATABASE
    ssh hgwdev
    hgsql hg17
    create database xenTro1;
    use xenTro1;
    create table grp (PRIMARY KEY(NAME)) select * from hg17.grp;

    # add rows to hgcentraltest on hgwbeta in dbDb and defaultDb
    # set orderKey past chicken, before zebrafish
    ## re-work orderKey 2007-02-20 to get Lizard above frog - Hiram
    hgsql -e 'update dbDb set orderKey="446" where name="xenTro1";' \
	hgcentraltest

# CREATE AGP FILES AND GAP/GOLD TABLES (DONE JK 10/23/04)
    ssh kksilo
    cd /cluster/data/xenTro1
    hgFakeAgp downloads/xenopus.041015.fasta xenTro1.agp

    ssh hgwdev
    cd /cluster/data/xenTro1
    hgGoldGapGl xenTro1 xenTro1.agp

# CREATE CHROMINFO TABLE (DONE 10/18/04, Fan)
    # an improvement here would be to have getMaxCoord output the 2bit file name
    ssh hgwdev
    cd /cluster/data/xenTro1
    faSize -detailed=on downloads/xenopus.041015.fasta \
      | awk '{printf("%s\t%s\t%s\n", $1, $2, "/gbdb/xenTro1/xenTro1.2bit");}' \
      > chromInfo.tab
    hgsql xenTro1 < ~/src/hg/lib/chromInfo.sql
    hgsql xenTro1 -e 'load data local infile "chromInfo.tab" into table chromInfo'

# REPEATMASKER (DONE 10/20/04, Fan)
    # using the split scaffold fa files generated earlier

    # do a trial run
    ssh hgwdev
    cd /cluster/data/xenTro1/scaffolds/0/0/0
    /cluster/bluearc/RepeatMasker/RepeatMasker -ali -s -spec bos SCAFFOLD1000.fa

    # configure
    cd /cluster/data/xenTro1
    cp -p downloads/xt3.lib1.fasta /cluster/bluearc/RepeatMasker/Libraries/xt3_JGI.lib
    cp /cluster/data/anoGam1/jkStuff/RMAnopheles jkStuff/RMXenTro
    # edit jkStuff/RMXenTro to change all references anoGam1 --> xenTro1
    # and use the following line to run RepeatMasker:
    # /cluster/bluearc/RepeatMasker/RepeatMasker -ali -s -xsmall -lib /cluster/bluearc/RepeatMasker/Libraries/xt3_JGI.lib $2
    mkdir RMRun
    # /bin/csh makeJoblist-RM.csh
    # (What is the above line for?  Presumably it generates the RMJobs list built explicitly below.)
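    # The contents of jkStuff/RMXenTro are not reproduced in this doc.  Based only on
    # the comments above, a minimal sketch of what the edited wrapper presumably looks
    # like (hypothetical reconstruction, not the script that was actually run; it takes
    # a working directory and a fa file name, as used by the RMJobs lines below, and
    # leaves the .out file next to the input fa):
cat << '_EOF_' > jkStuff/RMXenTro.example
#!/bin/csh -fe
# $1 = directory holding the scaffold fa file, $2 = fa file name
cd $1
/cluster/bluearc/RepeatMasker/RepeatMasker -ali -s -xsmall \
    -lib /cluster/bluearc/RepeatMasker/Libraries/xt3_JGI.lib $2
'_EOF_'
    # << this line keeps emacs coloring happy
    chmod a+x jkStuff/RMXenTro.example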
    cd scaffolds
    foreach i (0 1 2 3 4 5 6 7 8 9)
      cd $i
      foreach j (0 1 2 3 4 5 6 7 8 9)
        cd $j
        foreach k (0 1 2 3 4 5 6 7 8 9)
          cd $k
          foreach f (*.fa)
            echo /cluster/data/xenTro1/jkStuff/RMXenTro \
              /cluster/data/xenTro1/scaffolds/$i/$j/$k $f \
              '{'check out line+ /cluster/data/xenTro1/scaffolds/$i/$j/$k/$f.out'}' \
              >> /cluster/data/xenTro1/RMRun/RMJobs
          end
          cd ..
        end
        cd ..
      end
      cd ..
    end

    # do the run
    ssh kk
    cd /cluster/data/xenTro1/RMRun
    para create RMJobs
    para try
    para check
    para push
    para check
    para time
# Completed: 27064 of 27064 jobs
# CPU time in finished jobs: 8152873s 135881.22m 2264.69h 94.36d 0.259 y
# IO & Wait Time: 287040s 4784.00m 79.73h 3.32d 0.009 y
# Average job time: 312s 5.20m 0.09h 0.00d
# Longest job: 46100s 768.33m 12.81h 0.53d
# Submission to last job: 46220s 770.33m 12.84h 0.53d

    # Our RepeatMasker results are not exactly the same as JGI's, but close.
    # JGI has not provided the .out files requested earlier as of 10/24/04.
    # With Jim's approval, proceeded with our RepeatMasker results.
    # Note: Jim earlier went ahead with JGI's repeat-masked result for
    # his blastz run.
#   +-----------+-----------+-------------+------------+------------+------------+
#   | sum(a)    | sum(b)    | sum(abBoth) | sum(aOnly) | sum(bOnly) | sum(total) |
#   +-----------+-----------+-------------+------------+------------+------------+
#   | 177141183 | 177367102 |   174224057 |    2917126 |    3143045 | 1630713514 |
#   +-----------+-----------+-------------+------------+------------+------------+

    # concatenate into one output file; took about 90 minutes
    ssh kksilo
    cd /cluster/data/xenTro1
    mkdir repeats
    /bin/tcsh concatRM.tcsh
    cd repeats
    # get the header (first 2 lines) to make hgLoadOut happy
    head -2 repeats.all > repeats.out
    # filter out the "no repetitive sequence found ..." message lines
    fgrep scaffold_ repeats.all | grep -v 'no repetitive' >> repeats.out

    ssh hgwdev
    cd /cluster/data/xenTro1/repeats
    hgLoadOut -nosplit xenTro1 repeats.out
    hgsql xenTro1
    rename table repeats_rmsk to rmsk;
    # select count(*) from rmsk;
    # 1140362
    # select count(*) from rmsk where repName like "TE_ORF%";
    # 164354

# SIMPLE REPEATS (DONE 10/22/04, Fan)
    # put the results throughout the scaffold 0/0/0 directories,
    # same as RepeatMasker, to avoid too many files in the same directory
    ssh kksilo
    cd /cluster/data/xenTro1
    mkdir bed
    cd bed
    mkdir simpleRepeat
    cd simpleRepeat
    # /bin/csh makeJoblist-trf.csh
    # do the run; took about 12 hours
    tcsh trf-run.csh > trf.log
    # concatenate into one output file; took about an hour
    /bin/tcsh concatTRF.csh

    # load
    ssh hgwdev
    cd /cluster/data/xenTro1/bed/simpleRepeat
    /cluster/bin/i386/hgLoadBed xenTro1 simpleRepeat trf.bed \
      -sqlTable=/cluster/home/fanhsu/src/hg/lib/simpleRepeat.sql
    # Reading trf.all
    # Loaded 505308 elements of size 16
    # Sorted
    # Saving bed.tab
    # Loading xenTro1
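    # The makeJoblist-trf.csh / trf-run.csh / concatTRF.csh scripts are not reproduced
    # in this doc.  A minimal sketch of what the concatenation step presumably does
    # (hypothetical, assuming each scaffold directory ends up with a per-scaffold TRF
    # .bed file next to its .fa, analogous to the RepeatMasker .out files):
cat << '_EOF_' > concatTRF.example.csh
#!/bin/csh -fe
cd /cluster/data/xenTro1
# gather the per-scaffold TRF bed files into one genome-wide bed
find scaffolds -name '*.bed' -print | xargs cat > bed/simpleRepeat/trf.bed
'_EOF_'
    # << this line keeps emacs coloring happy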
# CREATE MASKED FA USING REPEATMASKER AND FILTERED TRF FILES (DONE 10/22/04, Fan)
    ssh kksilo
    cd /cluster/data/xenTro1
    /cluster/bin/i386/maskOutFa -soft downloads/xenopus.041015.fasta repeats/repeats.out xenTro1.softmask.fa
    # 1 warning about a negative rEnd:
    # WARNING: negative rEnd: -18 scaffold_69:2353795-2354532 XL1723L
    # matches: select count(*) from rmsk where repEnd < 0;
    /cluster/bin/i386/maskOutFa -softAdd xenTro1.softmask.fa bed/simpleRepeat/trf.bed xenTro1.softmask2.fa
    # hard masking (Ns instead of lower case) for download files

# REGENERATE 2BIT NIB (DONE, 10/22/04, Fan)
    ssh kksilo
    cd /cluster/data/xenTro1
    /cluster/bin/i386/faToTwoBit xenTro1.softmask2.fa xenTro1.softmask.2bit
    /cluster/bin/i386/twoBitToFa xenTro1.softmask.2bit check.softmask.fa
    diff -q xenTro1.softmask2.fa check.softmask.fa
    # They are the same.
    mv xenTro1.2bit xenTro1.2bit.unmasked
    mv xenTro1.softmask.2bit xenTro1.2bit

# MAKE 11.OOC FILE FOR BLAT (DONE, 10/22/04, Fan)
    ssh kkr1u00
    mkdir /cluster/data/xenTro1/bed/ooc
    cd /cluster/data/xenTro1/bed/ooc
    ls -1 /cluster/data/xenTro1/xenTro1.2bit > nib.lst
    mkdir /cluster/bluearc/xenTro1
    /cluster/bin/i386/blat nib.lst /dev/null /dev/null -tileSize=11 \
      -makeOoc=/cluster/bluearc/xenTro1/11.ooc -repMatch=540
    # Wrote 26600 overused 11-mers to /cluster/bluearc/xenTro1/11.ooc
    cp -p /cluster/bluearc/xenTro1/11.ooc /iscratch/i/xenTro1/
    iSync

# GENBANK (DONE, 10/26/04, Fan)
    ssh hgwdev
    cd /cluster/home/fanhsu/kent/src/hg/makeDb/genbank
    # check for missing commits
    diff /cluster/data/genbank/etc/genbank.conf etc/genbank.conf
    # edit etc/genbank.conf and add these lines, starting with the comment:

# xenTro1 (X. tropicalis)
# could get rid of the nib in this path, 2bit isn't a nib (4bit) ???
# xenTro1 (X. tropicalis) 27064 scaffolds!
# This uses the mondo 2bit file functionality, where multiple
# sequences from a two bit are aligned from a single job.
# Genbank X. tropicalis
#   5365 mRNAs
# 423107 ESTs
# partition 2bit into 5000 parts for initial build
xenTro1.genome = /iscratch/i/xenTro1/nib/xenTro1.2bit
xenTro1.mondoTwoBitParts = 5000
xenTro1.lift = no
xenTro1.refseq.mrna.native.load = no
xenTro1.genbank.mrna.xeno.load = no
xenTro1.genbank.est.xeno.load = no
xenTro1.downloadDir = xenTro1
xenTro1.perChromTables = no

    cvs update etc/genbank.conf
    cvs commit etc/genbank.conf
    # revision: 1.64
    # edit src/lib/gbGenome.c
    make
    cvs commit -m "added frog" src/lib/gbGenome.c
    # revision 1.21
    # edit src/align/gbBlat
    make
    cvs commit -m "added frog" src/align/gbBlat
    # revision 1.33
    make install-server

    ssh eieio
    cd /cluster/data/genbank
    bin/gbAlignStep -srcDb=genbank -type=mrna -initial -verbose=1 xenTro1 &
    # logged to
    # /cluster/data/genbank/var/build/logs/YYMMDDHHMMSS.xenTro1.initalign.log

    # load
    ssh hgwdev
    cd /cluster/data/genbank
    nice bin/gbDbLoadStep -verbose=1 -drop -initialLoad xenTro1
    # logged to /cluster/data/genbank/var/dbload/hgwdev/logs/YYMMDDHHMMSS.dbload.log

    # ESTs required modification to the build process to deal with the huge
    # number of scaffolds.  Once completed, the following line was
    # added to genbank.conf to enable its use:
    #   xenTro1.mondoTwoBitParts = 5000
    ssh eieio
    cd /cluster/data/genbank
    bin/gbAlignStep -initial -srcDb=genbank -type=est xenTro1

    # load the ESTs, reloading mRNAs as well
    ssh hgwdev
    cd /cluster/data/genbank
    nice bin/gbDbLoadStep -drop -initialLoad xenTro1

    # add xenTro1 to the list of databases to align in
    # ~/src/hg/makeDb/genbank/etc/align-genbank
    # CVS update and check in the file.
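    # (Optional sanity check, not part of the original log: confirm the native mRNA
    # and EST alignment tables were populated by the loads above.)
    hgsql xenTro1 -e 'select count(*) from all_mrna'
    hgsql xenTro1 -e 'select count(*) from all_est'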
# GC5BASE - (DONE 2004-10-27 - Hiram)
    ssh kksilo
    mkdir /cluster/data/xenTro1/bed/gc5Base
    cd /cluster/data/xenTro1/bed/gc5Base
    hgGcPercent -wigOut -doGaps -file=stdout -win=5 xenTro1 \
      /cluster/data/xenTro1 | wigBedToBinary stdin gc5Base.wig gc5Base.wib
    # Calculating gcPercent with window size 5
    # Using twoBit: /cluster/data/xenTro1/xenTro1.2bit
    # File stdout created
    # Converted stdin, upper limit 100.00, lower limit 0.00
    # real    332m59.680s
    # user    150m45.460s
    # sys     157m28.750s

    # Now load those results into the database
    ssh hgwdev
    cd /cluster/data/xenTro1/bed/gc5Base
    mkdir /gbdb/xenTro1/wib
    ln -s `pwd`/gc5Base.wib /gbdb/xenTro1/wib
    hgLoadWiggle -pathPrefix=/gbdb/xenTro1/wib xenTro1 gc5Base gc5Base.wig

# PRODUCE X. LAEVIS MRNA ALIGNMENTS (IN PROGRESS 11/6/05 JK)
    # Make working directory and get Laevis RNA
    ssh hgwdev
    mkdir /san/sanvol1/xenTro1/xenLaeRna
    cd /san/sanvol1/xenTro1/xenLaeRna
    hgSpeciesRna hg17 Xenopus laevis xenLaeRna.fa
    cp /cluster/data/xenTro1/xenTro1.2bit /san/sanvol1/scratch/xenTro1

    # Split RNA into 4
    mkdir split
    faSplit sequence xenLaeRna.fa 4 split/xl

    # Do blatz run on pk.
    mkdir /san/sanvol1/xenTro1/xenLaeRna
    mkdir run1
    cd run1
    mkdir psl
    awk '{print $1;}' /cluster/data/xenTro1/chrom.sizes > genome.lst
    ls -1 ../split/*.fa > mrna.lst
cat << '_EOF_' > gsub
#LOOP
blatz -rna -minScore=6000 /san/sanvol1/scratch/xenTro1/xenTro1.2bit:$(file1) $(path2) -out=psl {check out line psl/$(root1)_$(root2).psl}
#ENDLOOP
'_EOF_'
    # << this line keeps emacs coloring happy
    gensub2 genome.lst mrna.lst gsub spec
    para create spec
    # Then do usual para try/para push/para time
#Completed: 108256 of 108256 jobs
#CPU time in finished jobs: 1933198s 32219.96m 537.00h 22.37d 0.061 y
#IO & Wait Time: 275698s 4594.97m 76.58h 3.19d 0.009 y
#Average job time: 20s 0.34m 0.01h 0.00d
#Longest running job: 0s 0.00m 0.00h 0.00d
#Longest finished job: 986s 16.43m 0.27h 0.01d
#Submission to last job: 24061s 401.02m 6.68h 0.28d

    # Do sort and near-besting
    catDir psl | pslFilter -minScore=100 -minAli=255 -noHead stdin stdout | sort -k 10 > ../filtered.psl
    cd ..
    pslReps filtered.psl nearBest.psl /dev/null -nohead -minAli=0.5 -nearTop=0.01 -minCover=0.255
    sort -k 14 nearBest.psl > xenLaeRna.psl

    # Clean up
    rm -r run1/psl

    # Load into database
    ssh hgwdev
    cd /cluster/data/xenTro1/bed/xenLaeRna
    ln -s /cluster/data/xenTro1/bed/xenLaeRna/xenLaeRna.fa /gbdb/xenTro1/xenLaeRna.fa
    hgLoadSeq xenTro1 /gbdb/xenTro1/xenLaeRna.fa
    hgLoadPsl xenTro1 xenLaeRna.psl
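    # (Optional sanity check, not part of the original log: validate the near-best
    # PSL and see how much of the assembly it covers once loaded.)
    pslCheck xenLaeRna.psl
    featureBits xenTro1 xenLaeRna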
# PRODUCE X. LAEVIS MRNA ALIGNMENTS (DONE 1/14/05 JK)
    # Make working directory and get Laevis RNA and DNA
    ssh hgwdev
    mkdir -p /cluster/data/xenTro1/bed/xenLaeRna
    cd /cluster/data/xenTro1/bed/xenLaeRna
    hgSpeciesRna hg17 Xenopus laevis xenLaeRna.fa

    # Split RNA into 3 and load on temp device
    ssh kkr1u00
    cd /iscratch/i/xenTro1
    mkdir xenLaeRna
    cd xenLaeRna
    faSplit sequence /cluster/data/xenTro1/bed/xenLaeRna/xenLaeRna.fa 3 xl
    iSync

    # Do blatz cluster run
    ssh kk
    cd /cluster/data/xenTro1/bed/xenLaeRna
    mkdir run1
    cd run1
    mkdir psl
    awk '{print $1;}' /cluster/data/xenTro1/chrom.sizes > genome.lst
    ls -1 /iscratch/i/xenTro1/xenLaeRna/*.fa > mrna.lst
cat << '_EOF_' > gsub
#LOOP
blatz -rna -minScore=6000 /iscratch/i/xenTro1/nib/xenTro1.2bit:$(file1) $(path2) -out=psl {check out line psl/$(root1)_$(root2).psl}
#ENDLOOP
'_EOF_'
    # << this line keeps emacs coloring happy
    gensub2 genome.lst mrna.lst gsub spec
    para create spec
    # Then do usual para try/para push/para time
#CPU time in finished jobs: 3896883s 64948.04m 1082.47h 45.10d 0.124 y
#IO & Wait Time: 1031103s 17185.06m 286.42h 11.93d 0.033 y
#Average job time: 61s 1.01m 0.02h 0.00d
#Longest job: 3276s 54.60m 0.91h 0.04d
#Submission to last job: 5989s 99.82m 1.66h 0.07d

    # Do sort and near-besting on file server
    ssh kksilo
    cd /cluster/data/xenTro1/bed/xenLaeRna/run1
    catDir psl | pslFilter -minScore=100 -minAli=255 -noHead stdin stdout | sort -k 10 > ../filtered.psl
    cd ..
    pslReps filtered.psl nearBest.psl /dev/null -nohead -minAli=0.5 -nearTop=0.01 -minCover=0.255
    sort -k 14 nearBest.psl > xenLaeRna.psl

    # Clean up
    rm -r run1/psl

    # Load into database
    ssh hgwdev
    cd /cluster/data/xenTro1/bed/xenLaeRna
    ln -s /cluster/data/xenTro1/bed/xenLaeRna/xenLaeRna.fa /gbdb/xenTro1/xenLaeRna.fa
    hgLoadSeq xenTro1 /gbdb/xenTro1/xenLaeRna.fa
    hgLoadPsl xenTro1 xenLaeRna.psl

# PRODUCING GENSCAN PREDICTIONS (DONE 10/27/04 Fan)
# GENSCAN RELOADED WITHOUT -genePredExt (DONE 01-06-05 angie)
# MOVE TIMESTAMP ON GENSCANPEP TO SATISFY ALL.JOINER (DONE 03-03-05 alisq, kuhn)
    ssh hgwdev
    mkdir /cluster/data/xenTro1/bed/genscan
    cd /cluster/data/xenTro1/bed/genscan
    # Check out hg3rdParty/genscanlinux to get latest genscan:
    cvs co hg3rdParty/genscanlinux

    # create hard-masked .fa files
    ssh kksilo
    cd /cluster/store8/xenTro1
    maskOutFa xenTro1.softmask2.fa hard xenTro1.hardmask2.fa
    /cluster/bin/i386/faSplit about xenTro1.hardmask2.fa 2000000 hardMasked2/

    # Run on small cluster (more mem than big cluster).
    ssh kki
    cd /cluster/data/xenTro1/bed/genscan
    # Make 3 subdirectories for genscan to put its output files in
    mkdir gtf pep subopt
    # Generate a list file, genome.list, of all the hard-masked contigs that
    # *do not* consist of all-N's (which would cause genscan to blow up)
    rm -f genome.list
    touch genome.list
    foreach f ( `ls -1S /cluster/data/xenTro1/hardMasked2/*` )
      egrep '[ACGT]' $f > /dev/null
      if ($status == 0) echo $f >> genome.list
    end
    wc -l genome.list

    # Create template file, gsub, for gensub2.  For example (3-line file):
cat << '_EOF_' > gsub
#LOOP
/cluster/bin/x86_64/gsBig {check in line+ $(path1)} {check out line gtf/$(root1).gtf} -trans={check out line pep/$(root1).pep} -subopt={check out line subopt/$(root1).bed} -exe=hg3rdParty/genscanlinux/genscan -par=hg3rdParty/genscanlinux/HumanIso.smat -tmp=/tmp -window=2400000
#ENDLOOP
'_EOF_'
    # << this line makes emacs coloring happy
    gensub2 genome.list single gsub jobList
    para create jobList
    para try, check, push, check, ...
# Completed: 645 of 648 jobs
# Crashed: 3 jobs
# CPU time in finished jobs: 43701s 728.35m 12.14h 0.51d 0.001 y
# IO & Wait Time: 4362s 72.70m 1.21h 0.05d 0.000 y
# Average job time: 75s 1.24m 0.02h 0.00d
# Longest job: 435s 7.25m 0.12h 0.01d
# Submission to last job: 3641s 60.68m 1.01h 0.04d

    # If there are crashes, diagnose with "para problems".
    # If a job crashes due to genscan running out of memory, re-run it
    # manually with "-window=1200000" instead of "-window=2400000".
    ssh kkr7u00
    cd /cluster/data/xenTro1/bed/genscan
    /cluster/bin/x86_64/gsBig /cluster/data/xenTro1/hardMasked2/05.fa gtf/05.gtf -trans=pep/05.pep \
      -subopt=subopt/05.bed -exe=hg3rdParty/genscanlinux/genscan \
      -par=hg3rdParty/genscanlinux/HumanIso.smat -tmp=/tmp -window=1200000
    /cluster/bin/x86_64/gsBig /cluster/data/xenTro1/hardMasked2/76.fa gtf/76.gtf -trans=pep/76.pep \
      -subopt=subopt/76.bed -exe=hg3rdParty/genscanlinux/genscan \
      -par=hg3rdParty/genscanlinux/HumanIso.smat -tmp=/tmp -window=1200000
    /cluster/bin/x86_64/gsBig /cluster/data/xenTro1/hardMasked2/116.fa gtf/116.gtf -trans=pep/116.pep \
      -subopt=subopt/116.bed -exe=hg3rdParty/genscanlinux/genscan \
      -par=hg3rdParty/genscanlinux/HumanIso.smat -tmp=/tmp -window=1200000
    ls -1 gtf | wc -l
    # 648
    endsInLf gtf/*

    # Concatenate these into genome-level files as so:
    ssh kksilo
    cd /cluster/data/xenTro1/bed/genscan
    cat gtf/*.gtf > genscan.gtf
    cat pep/*.pep > genscan.pep
    cat subopt/*.bed > genscanSubopt.bed

    # Load into the database as so:
    ssh hgwdev
    cd /cluster/data/xenTro1/bed/genscan
    # Reloaded without -genePredExt 1/6/05:
    ldHgGene -gtf xenTro1 genscan genscan.gtf
    hgPepPred xenTro1 generic genscanPep genscan.pep
    hgLoadBed xenTro1 genscanSubopt genscanSubopt.bed
    nice featureBits xenTro1 genscan
    # 63766483 bases of 1381238994 (4.617%) in intersection

# MAKE Human Proteins track (hg17) (DONE 2004-11-04 braney)
    ssh kksilo
    mkdir -p /cluster/data/xenTro1/blastDb
    cd /cluster/data/xenTro1/blastDb
    faSplit sequence ../downloads/xen*fasta 500 x
    for i in x*
    do
	formatdb -i $i -p F
    done
    rm *.log *.fa

    ssh kkr1u00
    mkdir -p /iscratch/i/xenTro1/blastDb
    cp /cluster/data/xenTro1/blastDb/* /iscratch/i/xenTro1/blastDb
    (iSync) > sync.out
    exit

    # back to kksilo
    mkdir -p /cluster/data/xenTro1/bed/tblastn.hg17KG
    cd /cluster/data/xenTro1/bed/tblastn.hg17KG
    ls -1S /iscratch/i/xenTro1/blastDb/*.nsq | sed "s/\.nsq//" > frog.lst
    mkdir kgfa
    calc `wc /cluster/data/hg16/bed/blat.hg16KG/hg16KG.psl | awk "{print \\\$1}"`/\(150000/`wc frog.lst | awk "{print \\\$1}"`\)
    # 41117/(150000/498) = 136.508440
    mkdir -p /cluster/bluearc/xenTro1/bed/tblastn.hg17KG/kgfa
    split -l 137 /cluster/data/hg17/bed/blat.hg17KG/hg17KG.psl /cluster/bluearc/xenTro1/bed/tblastn.hg17KG/kgfa/kg
    ln -s /cluster/bluearc/xenTro1/bed/tblastn.hg17KG/kgfa kgfa
    cd kgfa
    for i in *; do pslxToFa $i $i.fa; rm $i; done
    cd ..
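    # (Optional sanity check, not part of the original log: confirm the query set was
    # split into fasta chunks of roughly 137 alignments each.)
    ls -1 kgfa/*.fa | wc -l
    grep -c '^>' kgfa/*.fa | head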
    ls -1S kgfa/*.fa > kg.lst
    mkdir blastOut
    for i in `cat kg.lst`; do mkdir blastOut/`basename $i .fa`; done
    tcsh
cat << '_EOF_' > blastGsub
#LOOP
blastSome $(path1) {check in line $(path2)} {check out exists blastOut/$(root2)/q.$(root1).psl }
#ENDLOOP
'_EOF_'
cat << '_EOF_' > blastSome
#!/bin/sh
BLASTMAT=/iscratch/i/blast/data
export BLASTMAT
g=`basename $2`
f=/tmp/`basename $3`.$g
for eVal in 0.01 0.001 0.0001 0.00001 0.000001 1E-09 1E-11
do
    if /scratch/blast/blastall -M BLOSUM80 -m 0 -F no -e $eVal -p tblastn -d $1 -i $2 -o $f.8
    then
	mv $f.8 $f.1
	break;
    fi
done
if test -f $f.1
then
    if /cluster/bin/i386/blastToPsl $f.1 $f.2
    then
	liftUp -nosort -type=".psl" -pslQ -nohead $3.tmp /cluster/data/hg17/bed/blat.hg17KG/protein.lft warn $f.2
	if pslCheck -prot $3.tmp
	then
	    mv $3.tmp $3
	    rm -f $f.1 $f.2
	fi
	exit 0
    fi
fi
rm -f $f.1 $f.2 $3.tmp $f.8
exit 1
'_EOF_'
    chmod +x blastSome
    gensub2 frog.lst kg.lst blastGsub blastSpec

    ssh kk
    cd /cluster/data/xenTro1/bed/tblastn.hg17KG
    para create blastSpec
    para push
# Completed: 146412 of 146412 jobs
# CPU time in finished jobs: 43595624s 726593.73m 12109.90h 504.58d 1.382 y
# IO & Wait Time: 1997243s 33287.38m 554.79h 23.12d 0.063 y
# Average job time: 311s 5.19m 0.09h 0.00d
# Longest job: 1860s 31.00m 0.52h 0.02d
# Submission to last job: 54876s 914.60m 15.24h 0.64d

cat << '_EOF_' > chainGsub
#LOOP
chainSome $(path1)
#ENDLOOP
'_EOF_'
cat << '_EOF_' > chainSome
(cd $1; cat q.*.psl | simpleChain -prot -outPsl -maxGap=50000 stdin ../c.`basename $1`.psl)
'_EOF_'
    chmod +x chainSome
    ls -1dS `pwd`/blastOut/kg?? > chain.lst
    gensub2 chain.lst single chainGsub chainSpec

    ssh kki
    cd /cluster/data/xenTro1/bed/tblastn.hg17KG
    para create chainSpec
    para push
# Completed: 294 of 294 jobs
# CPU time in finished jobs: 893s 14.88m 0.25h 0.01d 0.000 y
# IO & Wait Time: 20958s 349.30m 5.82h 0.24d 0.001 y
# Average job time: 74s 1.24m 0.02h 0.00d
# Longest job: 156s 2.60m 0.04h 0.00d
# Submission to last job: 1727s 28.78m 0.48h 0.02d
    exit
    # back to kksilo
    cd /cluster/data/xenTro1/bed/tblastn.hg17KG/blastOut
    for i in kg??
    do
	awk "(\$13 - \$12)/\$11 > 0.6 {print}" c.$i.psl > c60.$i.psl
	sort -rn c60.$i.psl | pslUniq stdin u.$i.psl
	awk "((\$1 / \$11) ) > 0.60 { print }" c60.$i.psl > m60.$i.psl
	echo $i
    done
    cat u.*.psl m60* | sort -T /tmp -k 14,14 -k 17,17n -k 17,17n | uniq > ../blastHg17KG.psl
    cd ..

    ssh hgwdev
    cd /cluster/data/xenTro1/bed/tblastn.hg17KG
    hgLoadPsl xenTro1 blastHg17KG.psl

    # back to kksilo
    rm -rf blastOut
# End tblastn

# PRODUCING FUGU BLAT ALIGNMENTS (DONE 10/27/04 Fan)
    ssh kksilo
    cd /cluster/data/xenTro1
    mkdir softMasked2
    cd softMasked2
    /cluster/bin/i386/faSplit about ../xenTro1.softmask2.fa 2000000 ./

    ssh kkr1u00
    rm -rf /iscratch/i/xenTro1/softMasked2
    mkdir /iscratch/i/xenTro1/softMasked2
    foreach d (/cluster/data/xenTro1/softMasked2/*)
      cp $d /iscratch/i/xenTro1/softMasked2
    end
    iSync

    ssh kk
    mkdir /cluster/data/xenTro1/bed/blatFr1
    cd /cluster/data/xenTro1/bed/blatFr1
    ls -1S /cluster/bluearc/fugu/fr1/trfFa/*.fa > fugu.lst
    ls -1S /iscratch/i/xenTro1/softMasked2/*.fa > frog.lst
cat << '_EOF_' > gsub
#LOOP
blat -mask=lower -q=dnax -t=dnax {check in exists $(path1)} {check in line+ $(path2)} {check out line+ psl/$(root1)_$(root2).psl}
#ENDLOOP
'_EOF_'
    # << this line makes emacs coloring happy
    mkdir psl
    gensub2 frog.lst fugu.lst gsub spec
    para create spec
    para try, check, push, check, ...
# Completed: 4536 of 4536 jobs
# CPU time in finished jobs: 3466126s 57768.77m 962.81h 40.12d 0.110 y
# IO & Wait Time: 458981s 7649.68m 127.49h 5.31d 0.015 y
# Average job time: 865s 14.42m 0.24h 0.01d
# Longest job: 3763s 62.72m 1.05h 0.04d
# Submission to last job: 7688s 128.13m 2.14h 0.09d

    # Sort alignments:
    ssh kksilo
    cd /cluster/data/xenTro1/bed/blatFr1
    # added the liftUp step, and reloaded blatFr1 table.  11/16/04 Fan.
    pslCat -dir psl \
      | liftUp -type=.psl -pslQ stdout /cluster/data/fr1/jkStuff/liftAll.lft \
        warn stdin \
      | pslSortAcc nohead chrom /tmp stdin

    # load psl into database:
    ssh hgwdev
    cd /cluster/data/xenTro1/bed/blatFr1/chrom
    foreach d (/cluster/data/xenTro1/bed/blatFr1/chrom/*)
      cat $d >> blatFr1_all.psl
    end
    # Special case: fr1 has only chrUn, so no need to index tName!
    hgLoadPsl -noTNameIx -fastLoad -table=blatFr1 xenTro1 blatFr1_all.psl
    # load of blatFr1 did not go as planned: 1360912 record(s), 0 row(s) skipped, 1 warning(s) loading psl.tab
    featureBits xenTro1 blatFr1
    # 27096524 bases of 1381238994 (1.962%) in intersection
    featureBits hg17 blatFr1
    # 21977081 bases of 2866216770 (0.767%) in intersection

# MAKE HGCENTRALTEST BLATSERVERS ENTRY FOR xenTro1 (DONE 10/28/04 Fan)
    ssh hgwdev
    echo 'insert into blatServers values("xenTro1", "blat13", "17780", 1, 0); \
          insert into blatServers values("xenTro1", "blat13", "17781", 0, 1);' \
      | hgsql -h genome-testdb hgcentraltest

# ALIGN TO CHICKEN (BLASTZ/CHAIN/NET DONE 10/28/04 JK)
    ssh kk
    mkdir -p /cluster/data/xenTro1/bed/zb.galGal2
    cd /cluster/data/xenTro1/bed/zb.galGal2
    # Create DEF file for run of scaffolds vs. chicken chromosomes
cat << '_EOF_' > DEF
# chicken vs. xenopus
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/i386:/cluster/home/angie/schwartzbin

ALIGN=blastz-run
BLASTZ=blastz

# Set up blastz parameters using parameters between chicken and fish,
# but not abridging repeats since can't do that with scaffolds, and
# it's not very relevant at this evolutionary distance.
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=8000
BLASTZ_K=2200
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
BLASTZ_ABRIDGE_REPEATS=0

# TARGET: Chicken
SEQ1_DIR=/iscratch/i/galGal2/nib
SEQ1_RMSK=
SEQ1_FLAG=
SEQ1_SMSK=
SEQ1_IN_CONTIGS=0
SEQ1_CHUNK=10000000
SEQ1_LAP=10000

# QUERY: Xenopus
SEQ2_DIR=/scratch/hg/xenTro1/maskedPieces
SEQ2_RMSK=
SEQ2_FLAG=
SEQ2_SMSK=
SEQ2_IN_CONTIGS=1
SEQ2_CHUNK=10000000
SEQ2_LAP=0

BASE=/cluster/data/xenTro1/bed/zb.galGal2

DEF=$BASE/DEF
RAW=$BASE/raw
CDBDIR=$BASE
SEQ1_LEN=$BASE/S1.len
SEQ2_LEN=$BASE/S2.len
'_EOF_'
    # << this line keeps emacs coloring happy
    bash
    source DEF
    mkdir $RAW run.0
    cp /cluster/data/galGal2/chrom.sizes $BASE/S1.len
    cp /cluster/data/xenTro1/chrom.sizes $BASE/S2.len
    /cluster/home/angie/hummus/make-joblist $DEF > $BASE/run.0/j
    # (Next time use /cluster/bin/scripts/blastz-make-joblist instead)
    sh ./xdir.sh
    cd run.0
    sed -e 's@^blastz-run@/cluster/bin/penn/blastz-run@' j > jobList
    para create jobList
    para try, push check, para push, para check....
# Completed: 52397 of 52397 jobs
# CPU time in finished jobs: 19113995s 318566.59m 5309.44h 221.23d 0.606 y
# IO & Wait Time: 212544s 3542.40m 59.04h 2.46d 0.007 y
# Average job time: 369s 6.15m 0.10h 0.00d
# Longest job: 3764s 62.73m 1.05h 0.04d
# Submission to last job: 39454s 657.57m 10.96h 0.46d

    # second cluster run: lift raw alignments -> lav dir
    ssh kki
    cd /cluster/data/xenTro1/bed/zb.galGal2
    bash	# if a csh/tcsh user
    source DEF
    mkdir run.1 lav
    /cluster/bin/scripts/blastz-make-out2lav $DEF $BASE > $BASE/run.1/jobList
    cd run.1
    para create jobList
    para try, push check, para push, para check....
# Completed: 151 of 151 jobs
# CPU time in finished jobs: 1583s 26.38m 0.44h 0.02d 0.000 y
# IO & Wait Time: 2086s 34.77m 0.58h 0.02d 0.000 y
# Average job time: 24s 0.40m 0.01h 0.00d
# Longest job: 115s 1.92m 0.03h 0.00d
# Submission to last job: 703s 11.72m 0.20h 0.01d

    # Convert lav to axt.  This takes 5 or 10 minutes
    ssh kksilo
    cd /cluster/data/xenTro1/bed/zb.galGal2
    mkdir axtTemp
    cd lav
    foreach i (*)
      catDir $i \
        | lavToAxt stdin /cluster/data/galGal2/nib \
          /cluster/data/xenTro1/xenTro1.2bit ../axtTemp/$i.axt
      echo done $i
    end

# CHAIN CHICKEN BLASTZ
    # Run axtChain on little cluster
    # Note, for this one might as well do it serially on the file server too.
    # The longest job (chr19) takes up 90% of the time.
    ssh kki
    cd /cluster/data/xenTro1/bed/zb.galGal2
    mkdir -p axtChain/run1
    cd axtChain/run1
    mkdir out chainRaw
    ls -1S /cluster/data/xenTro1/bed/zb.galGal2/axtTemp/*.axt \
      > input.lst
cat << '_EOF_' > gsub
#LOOP
doChain {check in exists $(path1)} {check out line+ chainRaw/$(root1).chain} {check out line+ out/$(root1).out}
#ENDLOOP
'_EOF_'
    # << this line makes emacs coloring happy

    # Reuse gap penalties from chicken run.
cat << '_EOF_' > temp.gap
tablesize 11
smallSize 111
position 1 2 3 11 111 2111 12111 32111 72111 152111 252111
qGap 325 360 400 450 600 1100 3600 7600 15600 31600 56600
tGap 325 360 400 450 600 1100 3600 7600 15600 31600 56600
bothGap 625 660 700 750 900 1400 4000 8000 16000 32000 57000
'_EOF_'
    # << this line makes emacs coloring happy
    sed 's/ */\t/g' temp.gap > ../../xenopusHumanTuned.gap
    rm -f temp.gap
cat << '_EOF_' > doChain
#!/bin/csh
axtChain -scoreScheme=/cluster/data/blastz/HoxD55.q \
    -linearGap=../../xenopusHumanTuned.gap \
    -minScore=5000 $1 \
    /iscratch/i/galGal2/nib \
    /iscratch/i/xenTro1/xenTro1.2bit $2 > $3
'_EOF_'
    # << this line makes emacs coloring happy
    chmod a+x doChain
    gensub2 input.lst single gsub jobList
    para create jobList
    para try
    para check
    para push
# Completed: 52 of 54 jobs
# Crashed: 2 jobs
# CPU time in finished jobs: 495s 8.25m 0.14h 0.01d 0.000 y
# IO & Wait Time: 281s 4.68m 0.08h 0.00d 0.000 y
# Average job time: 15s 0.25m 0.00h 0.00d
# Longest job: 215s 3.58m 0.06h 0.00d
# Submission to last job: 215s 3.58m 0.06h 0.00d
#
# Note, the two crashes were from chr6_random and chr8_random, which simply
# had no alignments and thus produced empty files, so no problem.

    # On the cluster server weed the chains for repeats
    ssh kksilo
    cd /cluster/data/xenTro1/bed/zb.galGal2/axtChain
    mkdir chain
    cd chainRaw
    foreach i (*.chain)
      chainAntiRepeat /cluster/data/galGal2/nib /cluster/data/xenTro1/xenTro1.2bit $i ../chain/$i
    end

    # now on the cluster server, sort chains.  The chainMergeSort and chainSplit
    # steps both take about 10 minutes
    ssh kksilo
    cd /cluster/data/xenTro1/bed/zb.galGal2/axtChain
    chainMergeSort run1/chain/*.chain > all.chain
    rm run1/chain/*.chain
    rm -r ../axtTemp
    chainSplit chain all.chain
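    # (Optional spot-check, not part of the original log: count chains per chicken
    # chromosome using the tName field of the chain header lines in all.chain.)
    awk '/^chain/ {print $3}' all.chain | sort | uniq -c | sort -rn | head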
    # Load chains into database.  This takes an hour
    ssh hgwdev
    cd /cluster/data/xenTro1/bed/zb.galGal2/axtChain/chain
    foreach i (*.chain)
      set c = $i:r
      echo loading $c
      hgLoadChain galGal2 ${c}_chainXenTro1 $i
    end
    featureBits galGal2 chainXenTro1Link

# NET XENOPUS BLASTZ
    ssh kksilo
    cd /cluster/data/xenTro1/bed/zb.galGal2/axtChain
    chainPreNet all.chain ../S1.len ../S2.len stdout \
      | chainNet stdin -minSpace=1 ../S1.len ../S2.len stdout /dev/null \
      | netSyntenic stdin noClass.net

    # Create axt files for the net as so:
    netSplit noClass.net noClass
    cd noClass
    mkdir ../../axtNet
    foreach i (*.net)
      netToAxt $i ../chain/$i:r.chain /cluster/data/galGal2/nib /cluster/data/xenTro1/xenTro1.2bit ../../axtNet/$i:r.axt
    end
    cd ..
    rm -r noClass

    # Add classification info using db tables: (Stopped here, need gap tables)
    ssh hgwdev
    cd /cluster/data/xenTro1/bed/zb.galGal2/axtChain
    netClass -noAr noClass.net galGal2 xenTro1 galGal2.net
    rm noClass.net

    # Load the nets into database
    ssh hgwdev
    cd /cluster/data/xenTro1/bed/zb.galGal2/axtChain
    netFilter -minGap=10 galGal2.net | hgLoadNet galGal2 netXenTro1 stdin
    # index had NULL cardinality; analyze table to fix (DONE 1/18/2005 Heather)
    hgsql hg17
    analyze table netXenTro1;

# SWAP EVERYTHING AND REMAKE NET FROM XENOPUS REFERENCE
    ssh kksilo
    cd /cluster/data/xenTro1/bed
    mkdir -p bz.galGal2/axtChain
    chainSwap zb.galGal2/axtChain/all.chain bz.galGal2/axtChain/all.chain
    cd bz.galGal2/axtChain
    chainPreNet all.chain ../../zb.galGal2/S2.len ../../zb.galGal2/S1.len stdout \
      | chainNet stdin -minSpace=1 ../../zb.galGal2/S2.len ../../zb.galGal2/S1.len stdout /dev/null \
      | netSyntenic stdin noClass.net
    mkdir ../axtNet
    netToAxt noClass.net all.chain /cluster/data/xenTro1/xenTro1.2bit /cluster/data/galGal2/nib \
      ../axtNet/all.axt

    # Classify net and load into database
    ssh hgwdev
    cd /cluster/data/xenTro1/bed/bz.galGal2/axtChain
    netClass -noAr noClass.net xenTro1 galGal2 xenTro1.net
    rm noClass.net
    netFilter -minGap=10 xenTro1.net | hgLoadNet xenTro1 netGalGal2 stdin

    # Load chains into database (takes about 2 hours).
    hgLoadChain -tIndex xenTro1 chainGalGal2 all.chain
    rm chain.tab link.tab

# ALIGN TO HUMAN (BLASTZ/CHAIN/NET DONE 10/28/04 JK)
    ssh kk
    mkdir -p /cluster/data/xenTro1/bed/zb.hg17
    cd /cluster/data/xenTro1/bed/zb.hg17
    # Create DEF file for run of scaffolds vs. human chromosomes
cat << '_EOF_' > DEF
# human vs. xenopus
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/i386:/cluster/home/angie/schwartzbin

ALIGN=blastz-run
BLASTZ=blastz

# Set up blastz parameters using parameters between chicken and fish,
# but not abridging repeats since can't do that with scaffolds, and
# it's not very relevant at this evolutionary distance.
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=8000
BLASTZ_K=2200
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
BLASTZ_ABRIDGE_REPEATS=0

# TARGET: Human
SEQ1_DIR=/scratch/hg/gs.18/build35/bothMaskedNibs
SEQ1_RMSK=
SEQ1_FLAG=
SEQ1_SMSK=
SEQ1_IN_CONTIGS=0
SEQ1_CHUNK=10000000
SEQ1_LAP=10000

# QUERY: Xenopus
SEQ2_DIR=/scratch/hg/xenTro1/maskedPieces
SEQ2_RMSK=
SEQ2_FLAG=
SEQ2_SMSK=
SEQ2_IN_CONTIGS=1
SEQ2_CHUNK=10000000
SEQ2_LAP=0

BASE=/cluster/data/xenTro1/bed/zb.hg17

DEF=$BASE/DEF
RAW=$BASE/raw
CDBDIR=$BASE
SEQ1_LEN=$BASE/S1.len
SEQ2_LEN=$BASE/S2.len
'_EOF_'
    # << this line keeps emacs coloring happy
    bash
    source DEF
    mkdir $RAW run.0
    cp /cluster/data/hg17/chrom.sizes $BASE/S1.len
    cp /cluster/data/xenTro1/chrom.sizes $BASE/S2.len
    /cluster/home/angie/hummus/make-joblist $DEF > $BASE/run.0/j
    sh ./xdir.sh
    cd run.0
    sed -e 's@^blastz-run@/cluster/bin/penn/blastz-run@' j > jobList
    para create jobList
    para try, push check, para push, para check....
# Completed: 118327 of 118327 jobs
# CPU time in finished jobs: 28921705s 482028.42m 8033.81h 334.74d 0.917 y
# IO & Wait Time: 400616s 6676.93m 111.28h 4.64d 0.013 y
# Average job time: 248s 4.13m 0.07h 0.00d
# Longest job: 5415s 90.25m 1.50h 0.06d
# Submission to last job: 35773s 596.22m 9.94h 0.41d

    # second cluster run: lift raw alignments -> lav dir
    ssh kki
    cd /cluster/data/xenTro1/bed/zb.hg17
    bash	# if a csh/tcsh user
    source DEF
    mkdir run.1 lav
    /cluster/bin/scripts/blastz-make-out2lav $DEF $BASE > $BASE/run.1/jobList
    cd run.1
    wc -l jobList
    para create jobList
    para try, push check, para push, para check....
# Completed: 341 of 341 jobs
# CPU time in finished jobs: 4477s 74.62m 1.24h 0.05d 0.000 y
# IO & Wait Time: 12372s 206.19m 3.44h 0.14d 0.000 y
# Average job time: 49s 0.82m 0.01h 0.00d
# Longest job: 305s 5.08m 0.08h 0.00d
# Submission to last job: 1113s 18.55m 0.31h 0.01d

    # third run: lav -> axt
    # (Note - this seems to be i/o bound even on a single CPU.  Next time
    # I'd suggest just running it serially on the file server -jk )
    ssh kki
    cd /cluster/data/xenTro1/bed/zb.hg17
    mkdir axtChrom run.2
    cd run.2
cat << 'EOF' > do.csh
#!/bin/csh -ef
cd $1
set chr = $1:t
cat `ls -1 *.lav | sort -g` \
  | lavToAxt stdin \
      /iscratch/i/hg17/bothMaskedNibs /iscratch/i/xenTro1/xenTro1.2bit ../../axtChrom/$chr.axt
'EOF'
    # << this line keeps emacs coloring happy
    chmod a+x do.csh
    cp /dev/null jobList
    foreach d (../lav/chr*)
      echo "do.csh $d" >> jobList
    end
    para create jobList
    # 46 jobs
    para try
    para check
    para push

# CHAIN HUMAN BLASTZ
    # Run axtChain on little cluster
    # Note, for this one might as well do it serially on the file server too.
    # The longest job (chr19) takes up 90% of the time.
    ssh kki
    cd /cluster/data/xenTro1/bed/zb.hg17
    mkdir -p axtChain/run1
    cd axtChain/run1
    mkdir out chainRaw
    ls -1S /cluster/data/xenTro1/bed/zb.hg17/axtChrom/*.axt \
      > input.lst
cat << '_EOF_' > gsub
#LOOP
doChain {check in exists $(path1)} {check out line+ chainRaw/$(root1).chain} {check out line+ out/$(root1).out}
#ENDLOOP
'_EOF_'
    # << this line makes emacs coloring happy

    # Reuse gap penalties from chicken run.
cat << '_EOF_' > temp.gap
tablesize 11
smallSize 111
position 1 2 3 11 111 2111 12111 32111 72111 152111 252111
qGap 325 360 400 450 600 1100 3600 7600 15600 31600 56600
tGap 325 360 400 450 600 1100 3600 7600 15600 31600 56600
bothGap 625 660 700 750 900 1400 4000 8000 16000 32000 57000
'_EOF_'
    # << this line makes emacs coloring happy
    sed 's/ */\t/g' temp.gap > ../../xenopusHumanTuned.gap
    rm -f temp.gap
cat << '_EOF_' > doChain
#!/bin/csh
axtChain -scoreScheme=/cluster/data/blastz/HoxD55.q \
    -linearGap=../../xenopusHumanTuned.gap \
    -minScore=5000 $1 \
    /iscratch/i/hg17/bothMaskedNibs \
    /iscratch/i/xenTro1/xenTro1.2bit $2 > $3
'_EOF_'
    # << this line makes emacs coloring happy
    chmod a+x doChain
    gensub2 input.lst single gsub jobList
    para create jobList
    para try
    para check
    para push
# Completed: 46 of 46 jobs
# CPU time in finished jobs: 2396s 39.94m 0.67h 0.03d 0.000 y
# IO & Wait Time: 7221s 120.34m 2.01h 0.08d 0.000 y
# Average job time: 209s 3.48m 0.06h 0.00d
# Longest job: 2042s 34.03m 0.57h 0.02d
# Submission to last job: 2042s 34.03m 0.57h 0.02d

    # On the cluster server weed the chains for repeats
    ssh kksilo
    cd /cluster/data/xenTro1/bed/zb.hg17/axtChain
    mkdir chain
    cd chainRaw
    foreach i (*.chain)
      chainAntiRepeat /cluster/data/hg17/nib /cluster/data/xenTro1/xenTro1.2bit $i ../chain/$i
    end

    # now on the cluster server, sort chains.  The chainMergeSort and chainSplit
    # steps both take about 10 minutes
    ssh kksilo
    cd /cluster/data/xenTro1/bed/zb.hg17/axtChain
    chainMergeSort run1/chain/*.chain > all.chain
    chainSplit chain all.chain
    rm run1/chain/*.chain

    # Load chains into database.  This takes an hour
    ssh hgwdev
    cd /cluster/data/xenTro1/bed/zb.hg17/axtChain/chain
    foreach i (*.chain)
      set c = $i:r
      echo loading $c
      hgLoadChain hg17 ${c}_chainXenTro1 $i
    end
    featureBits hg17 chainXenTro1Link

# NET XENOPUS BLASTZ
    ssh kksilo
    cd /cluster/data/xenTro1/bed/zb.hg17/axtChain
    chainPreNet all.chain ../S1.len ../S2.len stdout \
      | chainNet stdin -minSpace=1 ../S1.len ../S2.len stdout /dev/null \
      | netSyntenic stdin noClass.net

    # Create axt files for the net as so:
    netSplit noClass.net noClass
    cd noClass
    mkdir ../../axtNet
    foreach i (*.net)
      netToAxt $i ../chain/$i:r.chain /cluster/data/hg17/nib /cluster/data/xenTro1/xenTro1.2bit ../../axtNet/$i:r.axt
    end
    cd ..
    rm -r noClass

    # Add classification info using db tables:
    ssh hgwdev
    cd /cluster/data/xenTro1/bed/zb.hg17/axtChain
    netClass -noAr noClass.net hg17 xenTro1 human.net
    rm noClass.net

    # Load the nets into database
    ssh hgwdev
    cd /cluster/data/xenTro1/bed/zb.hg17/axtChain
    netFilter -minGap=10 human.net | hgLoadNet hg17 netXenTro1 stdin

# MAKE NET MAFS FOR MULTIZ (2004-12-15 kate)
    ssh kksilo
    cd /cluster/data/xenTro1/bed/zb.hg17/axtNet
    mkdir -p ../mafNet
    cat *.axt | axtSort stdin stdout | axtToMaf -tSplit stdin \
      -tPrefix=hg17. /cluster/data/hg17/chrom.sizes \
      -qPrefix=xenTro1. /cluster/data/xenTro1/chrom.sizes ../mafNet

    # redo axt's and maf's to remove overlaps (2006-04-07 kate)
    ssh kkstore04
    cd /cluster/data/xenTro1/bed/zb.hg17
    mv axtNet axtNet.old
    mv mafNet mafNet.old
    mkdir -p axtNet mafNet
    cd axtChain
    netSplit human.net net/
    chainSplit chain/ all.chain
cat > fix.csh << 'EOF'
date
foreach f (net/chr*.net)
    set c = $f:t:r
    echo $c
    netToAxt net/$c.net chain/$c.chain \
        /cluster/data/hg17/nib /cluster/data/xenTro1/xenTro1.2bit stdout | \
        axtSort stdin ../axtNet/$c.axt
    echo "Complete: $c.net -> $c.axt"
    axtToMaf ../axtNet/$c.axt \
        /cluster/data/hg17/chrom.sizes /cluster/data/xenTro1/chrom.sizes \
        ../mafNet/$c.maf -tPrefix=hg17. -qPrefix=xenTro1.
end
date
'EOF'
    csh fix.csh >&! fix.log &
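    # (Optional sanity check, not part of the original log: once fix.csh finishes,
    # confirm the redone net mafs contain alignment blocks before copying to the san.)
    grep -c '^a ' ../mafNet/chr1.maf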
    cd /san/sanvol1/scratch/hg17/mafNet
    rm -fr xenTro1
    cp -rp /cluster/data/xenTro1/bed/zb.hg17/mafNet xenTro1

# SWAP EVERYTHING AND REMAKE NET FROM XENOPUS REFERENCE
    ssh kksilo
    cd /cluster/data/xenTro1/bed
    mkdir -p bz.hg17/axtChain
    chainSwap zb.hg17/axtChain/all.chain bz.hg17/axtChain/all.chain
    cd bz.hg17/axtChain
    chainPreNet all.chain ../../zb.hg17/S2.len ../../zb.hg17/S1.len stdout \
      | chainNet stdin -minSpace=1 ../../zb.hg17/S2.len ../../zb.hg17/S1.len stdout /dev/null \
      | netSyntenic stdin noClass.net
    mkdir ../axtNet
    netToAxt noClass.net all.chain /cluster/data/xenTro1/xenTro1.2bit /cluster/data/hg17/nib \
      ../axtNet/all.axt

    # Classify net and load into database
    ssh hgwdev
    cd /cluster/data/xenTro1/bed/bz.hg17/axtChain
    netClass -noAr noClass.net xenTro1 hg17 xenTro1.net
    rm noClass.net
    netFilter -minGap=10 xenTro1.net | hgLoadNet xenTro1 netHg17 stdin

    # Load chains into database (takes about 2 hours).
    hgLoadChain -tIndex xenTro1 chainHg17 all.chain
    rm chain.tab link.tab

    # compressed and created symlinks to make consistent with newer builds
    # 2006-04-09 markd
    ln -s zb.hg17 /cluster/data/xenTro1/bed/blastz.hg17
    cd /cluster/data/xenTro1/bed/zb.hg17/axtChain
    gzip all.chain human.net
    ln -s all.chain.gz xenTro1.hg17.all.chain.gz
    ln -s human.net.gz xenTro1.hg17.net.gz

# ALIGN TO MOUSE (BLASTZ/CHAIN/NET DONE 10/24/04 JK)
    ssh kk
    mkdir -p /cluster/data/xenTro1/bed/zb.mm5
    cd /cluster/data/xenTro1/bed/zb.mm5
    # Create DEF file for run of scaffolds vs. mouse chromosomes
cat << '_EOF_' > DEF
# mouse vs. xenopus
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/i386:/cluster/home/angie/schwartzbin

ALIGN=blastz-run
BLASTZ=blastz

# Set up blastz parameters using parameters between chicken and fish,
# but not abridging repeats since can't do that with scaffolds, and
# it's not very relevant at this evolutionary distance.
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=8000
BLASTZ_K=2200
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
BLASTZ_ABRIDGE_REPEATS=0

# TARGET: Mouse
SEQ1_DIR=/scratch/mus/mm5/softNib
SEQ1_RMSK=
SEQ1_FLAG=
SEQ1_SMSK=
SEQ1_IN_CONTIGS=0
SEQ1_CHUNK=10000000
SEQ1_LAP=10000

# QUERY: Xenopus
SEQ2_DIR=/scratch/hg/xenTro1/maskedPieces
SEQ2_RMSK=
SEQ2_FLAG=
SEQ2_SMSK=
SEQ2_IN_CONTIGS=1
SEQ2_CHUNK=10000000
SEQ2_LAP=0

BASE=/cluster/data/xenTro1/bed/zb.mm5

DEF=$BASE/DEF
RAW=$BASE/raw
CDBDIR=$BASE
SEQ1_LEN=$BASE/S1.len
SEQ2_LEN=$BASE/S2.len
'_EOF_'
    # << this line keeps emacs coloring happy
    bash
    source DEF
    mkdir $RAW run.0
    cp /cluster/data/mm5/chrom.sizes $BASE/S1.len
    cp /cluster/data/xenTro1/chrom.sizes $BASE/S2.len
    /cluster/home/angie/hummus/make-joblist $DEF > $BASE/run.0/j
    sh ./xdir.sh
    cd run.0
    sed -e 's@^blastz-run@/cluster/bin/penn/blastz-run@' j > jobList
    para create jobList
    para try, push check, para push, para check....
# Completed: 118327 of 118327 jobs
# CPU time in finished jobs: 26625995s 443766.59m 7396.11h 308.17d 0.844 y
# IO & Wait Time: 407224s 6787.06m 113.12h 4.71d 0.013 y
# Average job time: 228s 3.81m 0.06h 0.00d
# Longest job: 2642s 44.03m 0.73h 0.03d
# Submission to last job: 47229s 787.15m 13.12h 0.55d

    # second cluster run: lift raw alignments -> lav dir
    ssh kki
    cd /cluster/data/xenTro1/bed/zb.mm5
    bash	# if a csh/tcsh user
    source DEF
    mkdir run.1 lav
    /cluster/bin/scripts/blastz-make-out2lav $DEF $BASE > $BASE/run.1/jobList
    cd run.1
    wc -l jobList
    para create jobList
    para try, push check, para push, para check....
# Completed: 341 of 341 jobs
# CPU time in finished jobs: 3787s 63.12m 1.05h 0.04d 0.000 y
# IO & Wait Time: 6585s 109.75m 1.83h 0.08d 0.000 y
# Average job time: 30s 0.51m 0.01h 0.00d
# Longest job: 163s 2.72m 0.05h 0.00d
# Submission to last job: 752s 12.53m 0.21h 0.01d

    # Convert lav to axt.  This takes 30 minutes
    ssh kksilo
    cd /cluster/data/xenTro1/bed/zb.mm5
    mkdir axtTemp
    cd lav
    foreach i (*)
      catDir $i \
        | lavToAxt stdin /cluster/data/mm5/nib \
          /cluster/data/xenTro1/xenTro1.2bit ../axtTemp/$i.axt
      echo done $i
    end

# CHAIN MOUSE BLASTZ
    # Run axtChain on little cluster
    # The longest job (chr19) takes up 90% of the time.
    ssh kki
    cd /cluster/data/xenTro1/bed/zb.mm5
    mkdir -p axtChain/run1
    cd axtChain/run1
    mkdir out chainRaw
    ls -1S /cluster/data/xenTro1/bed/zb.mm5/axtTemp/*.axt \
      > input.lst
cat << '_EOF_' > gsub
#LOOP
doChain {check in exists $(path1)} {check out line+ chainRaw/$(root1).chain} {check out line+ out/$(root1).out}
#ENDLOOP
'_EOF_'
    # << this line makes emacs coloring happy

    # Reuse gap penalties from chicken run.
cat << '_EOF_' > temp.gap
tablesize 11
smallSize 111
position 1 2 3 11 111 2111 12111 32111 72111 152111 252111
qGap 325 360 400 450 600 1100 3600 7600 15600 31600 56600
tGap 325 360 400 450 600 1100 3600 7600 15600 31600 56600
bothGap 625 660 700 750 900 1400 4000 8000 16000 32000 57000
'_EOF_'
    # << this line makes emacs coloring happy
    sed 's/ */\t/g' temp.gap > ../../xenopusHumanTuned.gap
    rm -f temp.gap
cat << '_EOF_' > doChain
#!/bin/csh
axtChain -scoreScheme=/cluster/data/blastz/HoxD55.q \
    -linearGap=../../xenopusHumanTuned.gap \
    -minScore=5000 $1 \
    /scratch/mus/mm5/softNib \
    /iscratch/i/xenTro1/xenTro1.2bit $2 > $3
'_EOF_'
    # << this line makes emacs coloring happy
    chmod a+x doChain
    gensub2 input.lst single gsub jobList
    para create jobList
    para try
    para check
    para push
# Completed: 46 of 46 jobs
# CPU time in finished jobs: 2396s 39.94m 0.67h 0.03d 0.000 y
# IO & Wait Time: 7221s 120.34m 2.01h 0.08d 0.000 y
# Average job time: 209s 3.48m 0.06h 0.00d
# Longest job: 2042s 34.03m 0.57h 0.02d
# Submission to last job: 2042s 34.03m 0.57h 0.02d

    # On the cluster server weed the chains for repeats
    ssh kksilo
    cd /cluster/data/xenTro1/bed/zb.mm5/axtChain
    mkdir chain
    cd chainRaw
    foreach i (*.chain)
      chainAntiRepeat /cluster/data/mm5/nib /cluster/data/xenTro1/xenTro1.2bit $i ../chain/$i
    end

    # now on the cluster server, sort chains.  The chainMergeSort and chainSplit
    # steps both take about 10 minutes
    ssh kksilo
    cd /cluster/data/xenTro1/bed/zb.mm5/axtChain
    chainMergeSort run1/chain/*.chain > all.chain
    chainSplit chain all.chain
    rm run1/chain/*.chain

    # Load chains into database.  This takes an hour
    ssh hgwdev
    cd /cluster/data/xenTro1/bed/zb.mm5/axtChain/chain
    foreach i (*.chain)
      set c = $i:r
      echo loading $c
      hgLoadChain mm5 ${c}_chainXenTro1 $i
    end
    featureBits mm5 chainXenTro1Link

# NET XENOPUS BLASTZ
    ssh kksilo
    cd /cluster/data/xenTro1/bed/zb.mm5/axtChain
    chainPreNet all.chain ../S1.len ../S2.len stdout \
      | chainNet stdin -minSpace=1 ../S1.len ../S2.len stdout /dev/null \
      | netSyntenic stdin noClass.net

    # Create axt files for the net as so:
    netSplit noClass.net noClass
    cd noClass
    mkdir ../../axtNet
    foreach i (*.net)
      netToAxt $i ../chain/$i:r.chain /cluster/data/mm5/nib /cluster/data/xenTro1/xenTro1.2bit ../../axtNet/$i:r.axt
    end
    cd ..
    rm -r noClass

    # Add classification info using db tables: (Stopped here, need gap tables)
    ssh hgwdev
    cd /cluster/data/xenTro1/bed/zb.mm5/axtChain
    netClass -noAr noClass.net mm5 xenTro1 mm5.net
    rm noClass.net

    # Load the nets into database
    ssh hgwdev
    cd /cluster/data/xenTro1/bed/zb.mm5/axtChain
    netFilter -minGap=10 mm5.net | hgLoadNet mm5 netXenTro1 stdin

# SWAP EVERYTHING AND REMAKE NET FROM XENOPUS REFERENCE
    ssh kksilo
    cd /cluster/data/xenTro1/bed
    mkdir -p bz.mm5/axtChain
    chainSwap zb.mm5/axtChain/all.chain bz.mm5/axtChain/all.chain
    cd bz.mm5/axtChain
    chainPreNet all.chain ../../zb.mm5/S2.len ../../zb.mm5/S1.len stdout \
      | chainNet stdin -minSpace=1 ../../zb.mm5/S2.len ../../zb.mm5/S1.len stdout /dev/null \
      | netSyntenic stdin noClass.net
    mkdir ../axtNet
    netToAxt noClass.net all.chain /cluster/data/xenTro1/xenTro1.2bit /cluster/data/mm5/nib \
      ../axtNet/all.axt

    # Classify net and load into database
    ssh hgwdev
    cd /cluster/data/xenTro1/bed/bz.mm5/axtChain
    netClass -noAr noClass.net xenTro1 mm5 xenTro1.net
    rm noClass.net
    netFilter -minGap=10 xenTro1.net | hgLoadNet xenTro1 netMm5 stdin

    # Load chains into database (takes about 2 hours).
    hgLoadChain -tIndex xenTro1 chainMm5 all.chain
    rm chain.tab link.tab

# ALIGN TO ZEBRAFISH (BLASTZ/CHAIN/NET DONE 10/24/04 JK)
    ssh kk
    mkdir -p /cluster/data/xenTro1/bed/zb.danRer1
    cd /cluster/data/xenTro1/bed/zb.danRer1
    # Create DEF file for run of scaffolds vs. zebrafish chromosomes
cat << '_EOF_' > DEF
# zebrafish vs. xenopus
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/i386:/cluster/home/angie/schwartzbin

ALIGN=blastz-run
BLASTZ=blastz

# Set up blastz parameters using parameters between chicken and fish,
# but not abridging repeats since can't do that with scaffolds, and
# it's not very relevant at this evolutionary distance.
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
BLASTZ_ABRIDGE_REPEATS=0

# TARGET: Zebrafish
SEQ1_DIR=/iscratch/i/danRer1/nib
SEQ1_RMSK=
SEQ1_FLAG=
SEQ1_SMSK=
SEQ1_IN_CONTIGS=0
SEQ1_CHUNK=10000000
SEQ1_LAP=10000

# QUERY: Xenopus
SEQ2_DIR=/scratch/hg/xenTro1/maskedPieces
SEQ2_RMSK=
SEQ2_FLAG=
SEQ2_SMSK=
SEQ2_IN_CONTIGS=1
SEQ2_CHUNK=10000000
SEQ2_LAP=0

BASE=/cluster/data/xenTro1/bed/zb.danRer1

DEF=$BASE/DEF
RAW=$BASE/raw
CDBDIR=$BASE
SEQ1_LEN=$BASE/S1.len
SEQ2_LEN=$BASE/S2.len
'_EOF_'
    # << this line keeps emacs coloring happy
    bash
    source DEF
    mkdir $RAW run.0
    cp /cluster/data/danRer1/chrom.sizes $BASE/S1.len
    cp /cluster/data/xenTro1/chrom.sizes $BASE/S2.len
    /cluster/home/angie/hummus/make-joblist $DEF > $BASE/run.0/j
    sh ./xdir.sh
    cd run.0
    sed -e 's@^blastz-run@/cluster/bin/penn/blastz-run@' j > jobList
    para create jobList
    para try, push check, para push, para check....
# Completed: 58990 of 58990 jobs
# CPU time in finished jobs: 10946601s 182443.35m 3040.72h 126.70d 0.347 y
# IO & Wait Time: 389169s 6486.15m 108.10h 4.50d 0.012 y
# Average job time: 192s 3.20m 0.05h 0.00d
# Longest job: 5028s 83.80m 1.40h 0.06d
# Submission to last job: 138773s 2312.88m 38.55h 1.61d

    # second cluster run: lift raw alignments -> lav dir
    ssh kki
    cd /cluster/data/xenTro1/bed/zb.danRer1
    bash	# if a csh/tcsh user
    source DEF
    mkdir run.1 lav
    /cluster/bin/scripts/blastz-make-out2lav $DEF $BASE > $BASE/run.1/jobList
    cd run.1
    para create jobList
# Completed: 170 of 170 jobs
# CPU time in finished jobs: 5013s 83.55m 1.39h 0.06d 0.000 y
# IO & Wait Time: 14173s 236.22m 3.94h 0.16d 0.000 y
# Average job time: 113s 1.88m 0.03h 0.00d
# Longest job: 333s 5.55m 0.09h 0.00d
# Submission to last job: 2837s 47.28m 0.79h 0.03d

    # Convert lav to axt.  This takes about 2 hours
    ssh kksilo
    cd /cluster/data/xenTro1/bed/zb.danRer1
    mkdir axtTemp
    cd lav
    foreach i (*)
      catDir $i \
        | lavToAxt stdin /cluster/data/danRer1/nib \
          /cluster/data/xenTro1/xenTro1.2bit ../axtTemp/$i.axt
      echo done $i
    end

# CHAIN ZEBRAFISH BLASTZ
    # Run axtChain on little cluster
    # Note, for this one might as well do it serially on the file server too.
    # The longest job (chr19) takes up 90% of the time.
    ssh kki
    cd /cluster/data/xenTro1/bed/zb.danRer1
    mkdir -p axtChain/run1
    cd axtChain/run1
    mkdir out chainRaw
    ls -1S /cluster/data/xenTro1/bed/zb.danRer1/axtTemp/*.axt \
      > input.lst
cat << '_EOF_' > gsub
#LOOP
doChain {check in exists $(path1)} {check out line+ chainRaw/$(root1).chain} {check out line+ out/$(root1).out}
#ENDLOOP
'_EOF_'
    # << this line makes emacs coloring happy

    # Reuse gap penalties from chicken run.
cat << '_EOF_' > temp.gap
tablesize 11
smallSize 111
position 1 2 3 11 111 2111 12111 32111 72111 152111 252111
qGap 325 360 400 450 600 1100 3600 7600 15600 31600 56600
tGap 325 360 400 450 600 1100 3600 7600 15600 31600 56600
bothGap 625 660 700 750 900 1400 4000 8000 16000 32000 57000
'_EOF_'
    # << this line makes emacs coloring happy
    sed 's/ */\t/g' temp.gap > ../../xenopusHumanTuned.gap
    rm -f temp.gap
cat << '_EOF_' > doChain
#!/bin/csh
axtChain -scoreScheme=/cluster/data/blastz/HoxD55.q \
    -linearGap=../../xenopusHumanTuned.gap \
    -minScore=5000 $1 \
    /iscratch/i/danRer1/nib \
    /iscratch/i/xenTro1/xenTro1.2bit $2 > $3
'_EOF_'
    # << this line makes emacs coloring happy
    chmod a+x doChain
    gensub2 input.lst single gsub jobList
    para create jobList
    para try
    para check
    para push
# Completed: 52 of 54 jobs
# Crashed: 2 jobs
# CPU time in finished jobs: 495s 8.25m 0.14h 0.01d 0.000 y
# IO & Wait Time: 281s 4.68m 0.08h 0.00d 0.000 y
# Average job time: 15s 0.25m 0.00h 0.00d
# Longest job: 215s 3.58m 0.06h 0.00d
# Submission to last job: 215s 3.58m 0.06h 0.00d
#
# Note, the two crashes were from chr6_random and chr8_random, which simply
# had no alignments and thus produced empty files, so no problem.

    # On the cluster server weed the chains for repeats
    ssh kksilo
    cd /cluster/data/xenTro1/bed/zb.danRer1/axtChain
    mkdir chain
    cd chainRaw
    foreach i (*.chain)
      chainAntiRepeat /cluster/data/danRer1/nib /cluster/data/xenTro1/xenTro1.2bit $i ../chain/$i
    end

    # now on the cluster server, sort chains.  The chainMergeSort and chainSplit
    # steps both take about 10 minutes
    ssh kksilo
    cd /cluster/data/xenTro1/bed/zb.danRer1/axtChain
    chainMergeSort run1/chain/*.chain > all.chain
    chainSplit chain all.chain
    rm run1/chain/*.chain
    rm -r ../axtTemp
    # Load chains into database.  This takes an hour or three
    ssh hgwdev
    cd /cluster/data/xenTro1/bed/zb.danRer1/axtChain/chain
    foreach i (*.chain)
      set c = $i:r
      echo loading $c
      hgLoadChain danRer1 ${c}_chainXenTro1 $i
    end
    featureBits danRer1 chainXenTro1Link

# NET XENOPUS/ZEBRAFISH BLASTZ
    ssh kksilo
    cd /cluster/data/xenTro1/bed/zb.danRer1/axtChain
    chainPreNet all.chain ../S1.len ../S2.len stdout \
      | chainNet stdin -minSpace=1 ../S1.len ../S2.len stdout /dev/null \
      | netSyntenic stdin noClass.net

    # Create axt files for the net as so:
    netSplit noClass.net noClass
    cd noClass
    mkdir ../../axtNet
    foreach i (*.net)
      netToAxt $i ../chain/$i:r.chain /cluster/data/danRer1/nib /cluster/data/xenTro1/xenTro1.2bit ../../axtNet/$i:r.axt
    end
    cd ..
    rm -r noClass

    # Add classification info using db tables:
    ssh hgwdev
    cd /cluster/data/xenTro1/bed/zb.danRer1/axtChain
    netClass -noAr noClass.net danRer1 xenTro1 danRer1.net
    rm noClass.net

    # Load the nets into database
    ssh hgwdev
    cd /cluster/data/xenTro1/bed/zb.danRer1/axtChain
    netFilter -minGap=10 danRer1.net | hgLoadNet danRer1 netXenTro1 stdin

# SWAP EVERYTHING AND REMAKE NET FROM XENOPUS REFERENCE
    ssh kksilo
    cd /cluster/data/xenTro1/bed
    mkdir -p bz.danRer1/axtChain
    chainSwap zb.danRer1/axtChain/all.chain bz.danRer1/axtChain/all.chain
    cd bz.danRer1/axtChain
    chainPreNet all.chain ../../zb.danRer1/S2.len ../../zb.danRer1/S1.len stdout \
      | chainNet stdin -minSpace=1 ../../zb.danRer1/S2.len ../../zb.danRer1/S1.len stdout /dev/null \
      | netSyntenic stdin noClass.net
    mkdir ../axtNet
    netToAxt noClass.net all.chain /cluster/data/xenTro1/xenTro1.2bit /cluster/data/danRer1/nib \
      ../axtNet/all.axt

    # Classify net and load into database
    ssh hgwdev
    cd /cluster/data/xenTro1/bed/bz.danRer1/axtChain
    netClass -noAr noClass.net xenTro1 danRer1 xenTro1.net
    rm noClass.net
    netFilter -minGap=10 xenTro1.net | hgLoadNet xenTro1 netDanRer1 stdin

    # Load chains into database (takes about 8 hours).  These are so large
    # we have to split them first.
    ssh kksilo
    cd /cluster/store8/xenTro1/bed/bz.danRer1/axtChain
    chainSplit -lump=10 lumps all.chain

    ssh hgwdev
    cd /cluster/store8/xenTro1/bed/bz.danRer1/axtChain/lumps
    hgLoadChain -tIndex xenTro1 chainDanRer1 000.chain
    rm 000.chain
    foreach i (*.chain)
      echo loading $i
      hgLoadChain -tIndex -oldTable xenTro1 chainDanRer1 $i
    end
    rm chain.tab link.tab

# CREATE MULTIPLE ALIGNMENTS WITH MULTIZ (DONE 10/24/04 JK)
    ssh kksilo
    cd /cluster/data/xenTro1/bed
    mkdir -p multiz
    axtSort bz.galGal2/axtNet/all.axt stdout | axtToMaf -tSplit stdin \
      -tPrefix=xenTro1. /cluster/data/xenTro1/chrom.sizes \
      -qPrefix=galGal2. /cluster/data/galGal2/chrom.sizes \
      multiz/galGal2
    axtSort bz.hg17/axtNet/all.axt stdout | axtToMaf -tSplit stdin \
      -tPrefix=xenTro1. /cluster/data/xenTro1/chrom.sizes \
      -qPrefix=hg17. /cluster/data/hg17/chrom.sizes \
      multiz/hg17
    axtSort bz.mm5/axtNet/all.axt stdout | axtToMaf -tSplit stdin \
      -tPrefix=xenTro1. /cluster/data/xenTro1/chrom.sizes \
      -qPrefix=mm5. /cluster/data/mm5/chrom.sizes \
      multiz/mm5
    axtSort bz.danRer1/axtNet/all.axt stdout | axtToMaf -tSplit stdin \
      -tPrefix=xenTro1. /cluster/data/xenTro1/chrom.sizes \
      -qPrefix=danRer1. /cluster/data/danRer1/chrom.sizes \
      multiz/danRer1
    # Do multiz in a tizzy (takes about an hour)
    ssh kksilo
    cd /cluster/data/xenTro1/bed/multiz
    echo galGal2 hg17 mm5 danRer1 > dirList
    awk '{print $1;}' /cluster/data/xenTro1/chrom.sizes > scaffoldList
    multizzy dirList scaffoldList mtz
    catDir mtz | mafFilter stdin -minScore=500 > multiz5way.maf

    # Load into database
    ssh hgwdev
    cd /cluster/data/xenTro1/bed/multiz
    mkdir -p /gbdb/xenTro1/multiz5way
    ln -s /cluster/data/xenTro1/bed/multiz/multiz5way.maf /gbdb/xenTro1/multiz5way
    hgLoadMaf xenTro1 multiz5way

    # Create pairwise maf files (no longer split across scaffolds)
    ssh kksilo
    cd /cluster/data/xenTro1/bed/multiz
    mkdir -p pairwise
    catDir galGal2 > pairwise/galGal2.maf
    catDir hg17 > pairwise/hg17.maf
    catDir mm5 > pairwise/mm5.maf
    catDir danRer1 > pairwise/danRer1.maf

    # Load pairwise mafs into database
    ssh hgwdev
    set h = /cluster/data/xenTro1/bed/multiz/pairwise
    cd $h
    foreach i (*.maf)
      set o = $i:r
      set t = ${o}_netBlastz
      mkdir -p /gbdb/xenTro1/multiz5way/$t
      ln -s $h/$i /gbdb/xenTro1/multiz5way/$t
      hgLoadMaf xenTro1 $t -pathPrefix=/gbdb/xenTro1/multiz5way/$t
    end
    mkdir -p /gbdb/xenTro1/multiz5way/galGal2_netBlastz
    ln -s galGal2.maf /gbdb/xenTro1/multiz5way/galGal2_netBlastz
    hgLoadMaf xenTro1 galGal2_netBlastz -pathPrefix=/gbdb/xenTro1/multiz5way/galGal2_netBlastz
    mkdir -p /gbdb/xenTro1/multiz5way/hg17_netBlastz
    ln -s hg17.maf /gbdb/xenTro1/multiz5way/hg17_netBlastz
    hgLoadMaf xenTro1 hg17_netBlastz -pathPrefix=/gbdb/xenTro1/multiz5way/hg17_netBlastz
    mkdir -p /gbdb/xenTro1/multiz5way/mm5_netBlastz
    ln -s mm5.maf /gbdb/xenTro1/multiz5way/mm5_netBlastz
    hgLoadMaf xenTro1 mm5_netBlastz -pathPrefix=/gbdb/xenTro1/multiz5way/mm5_netBlastz
    mkdir -p /gbdb/xenTro1/multiz5way/danRer1_netBlastz
    ln -s danRer1.maf /gbdb/xenTro1/multiz5way/danRer1_netBlastz
    hgLoadMaf xenTro1 danRer1_netBlastz -pathPrefix=/gbdb/xenTro1/multiz5way/danRer1_netBlastz

# CREATE CONSERVATION WIGGLE WITH PHASTCONS (DONE 10/25/04 JK)
    # Estimate phastCons parameters
    ssh kksilo
    cd /cluster/data/xenTro1/bed/multiz
    mkdir cons
    cd cons
    # Create a starting-tree.mod based on scaffold_1.
    twoBitToFa ../../xenTro1.2bit -seq=scaffold_1 scaffold_1.fa
    /cluster/bin/phast/msa_split ../mtz/scaffold_1.maf --refseq scaffold_1.fa --in-format MAF --windows 100000000,1000 --out-format SS --between-blocks 5000 --out-root s1
    /cluster/bin/phast/phyloFit -i SS s1.*.ss --tree "((((hg17,mm5),galGal2),xenTro1),danRer1)" --out-root starting-tree.mod
    rm scaffold_1.fa s1.*.ss

    # Create directory full of sequence in individual scaffold files on bluearc.
    ssh kksilo
    mkdir -p /cluster/bluearc/xenTro1/scaffoldFa
    cd /cluster/bluearc/xenTro1/scaffoldFa
    twoBitToFa /cluster/data/xenTro1/xenTro1.2bit stdout | faSplit byname stdin foo

    # Create big bad bloated SS files in bluearc (takes ~45 minutes)
    mkdir -p /cluster/bluearc/xenTro1/cons/ss
    cd /cluster/bluearc/xenTro1/cons/ss
    foreach i (`awk '{print $1}' /cluster/data/xenTro1/chrom.sizes`)
      if (-e /cluster/data/xenTro1/bed/multiz/mtz/$i.maf) then
        echo msa_split $i
        /cluster/bin/phast/msa_split /cluster/data/xenTro1/bed/multiz/mtz/$i.maf \
          --refseq /cluster/bluearc/xenTro1/scaffoldFa/$i.fa \
          --in-format MAF --windows 1000000,0 --between-blocks 5000 \
          --out-format SS --out-root $i
      endif
    end

    # Create a random list of 50 1 mb regions
    ls -l | awk '$5 > 4000000 {print $9;}' | randomLines stdin 50 ../randomSs

    # Set up parasol directory to calculate trees on these 50 regions
    ssh kk9
    cd /cluster/bluearc/xenTro1/cons
    mkdir treeRun1
    cd treeRun1
    mkdir tree log

    # Create little script that calls phastCons with right arguments
cat > makeTree << '_EOF_'
/cluster/bin/phast/phastCons ../ss/$1.ss \
    /cluster/data/xenTro1/bed/multiz/cons/starting-tree.mod \
    --gc 0.410 --nrates 1,1 --no-post-probs --ignore-missing \
    --expected-lengths 12 --target-coverage 0.35 \
    --quiet --log log/$1 --estimate-trees tree/$1
'_EOF_'
    chmod a+x makeTree

    # Create gensub file
cat > gsub << '_EOF_'
#LOOP
makeTree $(root1)
#ENDLOOP
'_EOF_'

    # Make cluster job and run it
    gensub2 ../randomSs single gsub jobList
    para create jobList
    para try/push/check/etc
# Completed: 50 of 50 jobs
# CPU time in finished jobs: 2004s 33.40m 0.56h 0.02d 0.000 y
# IO & Wait Time: 257s 4.28m 0.07h 0.00d 0.000 y
# Average job time: 45s 0.75m 0.01h 0.00d
# Longest job: 91s 1.52m 0.03h 0.00d
# Submission to last job: 92s 1.53m 0.03h 0.00d

    # Now combine parameter estimates.  We can average the .mod files
    # using phyloBoot.  This must be done separately for the conserved
    # and nonconserved models
    ls tree/*.cons.mod > cons.txt
    /cluster/bin/phast/phyloBoot --read-mods '*cons.txt' --output-average ../ave.cons.mod > cons_summary.txt
    ls tree/*.noncons.mod > noncons.txt
    /cluster/bin/phast/phyloBoot --read-mods '*noncons.txt' --output-average ../ave.noncons.mod > noncons_summary.txt
    cd ..
    cp ave.*.mod /cluster/data/xenTro1/bed/multiz/cons

    # Create cluster dir to do main phastCons run
    mkdir consRun1
    cd consRun1
    mkdir ppRaw bed

    # Create script to run phastCons with right parameters
cat > doPhast << '_EOF_'
mkdir -p ppRaw/$2
/cluster/bin/phast/phastCons ../ss/$1.ss ../ave.cons.mod,../ave.noncons.mod \
    --expected-lengths 12 --target-coverage 0.35 --quiet --seqname $2 \
    --idpref $2 --viterbi bed/$1.bed --score --require-informative 0 \
    ppRaw/$2/$1.pp
'_EOF_'
    chmod a+x doPhast

    # Create gsub file
cat > gsub << '_EOF_'
#LOOP
doPhast $(file1) $(root1)
#ENDLOOP
'_EOF_'

    # Create parasol batch and run it
    ls -1 ../ss | sed 's/.ss//' > in.lst
    gensub2 in.lst single gsub jobList
    para create jobList
    # para try/check/push/etc.
# Completed: 12869 of 12869 jobs
# CPU time in finished jobs:      11470s     191.17m     3.19h    0.13d  0.000 y
# IO & Wait Time:                 40795s     679.91m    11.33h    0.47d  0.001 y
# Average job time:                   4s       0.07m     0.00h    0.00d
# Longest job:                       20s       0.33m     0.01h    0.00d
# Submission to last job:           675s      11.25m     0.19h    0.01d

# combine predictions and transform scores to be in 0-1000 interval
catDir bed | awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", $1, $2, $3, $5, $5;}' | \
    /cluster/bin/scripts/lodToBedScore /dev/stdin > \
    /cluster/data/xenTro1/bed/multiz/mostConserved.bed

# Figure out how much is actually covered by the bed files as so:
awk '{print $3 - $2;}' /cluster/data/xenTro1/bed/multiz/mostConserved.bed | addCols stdin
# If the result of this divided by the non-N genome size (1.5G) isn't around 4%,
# do it again, adjusting the target-coverage phastCons parameter.  Actually when I
# took the target-coverage up to 0.61, it started putting negative numbers into the
# score column, so left it at 0.35, which results in 0.3% coverage.

# Load most conserved track into database
ssh hgwdev
cd /cluster/data/xenTro1/bed/multiz
hgLoadBed xenTro1 mostConserved mostConserved.bed

# Create merged posterior probability file and wiggle track data files
ssh kksilo
cd /cluster/bluearc/xenTro1/cons/consRun1
wigAsciiCrunch -dirDir ppRaw /cluster/data/xenTro1/bed/multiz/cons/phastCons5.pp
cd /cluster/data/xenTro1/bed/multiz/cons
wigAsciiToBinary -wibFile=phastCons5 phastCons5.pp

# Load gbdb and database with wiggle.
ssh hgwdev
ln -s /cluster/data/xenTro1/bed/multiz/cons/phastCons5.wib /gbdb/xenTro1/wib/phastCons5.wib
cd /cluster/data/xenTro1/bed/multiz/cons
hgLoadWiggle xenTro1 phastCons5 phastCons5.wig
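# (Not part of the original log.)  A minimal sanity check after loading: row
# counts for the two new conservation tables should be non-zero, and the
# mostConserved count should match the line count of mostConserved.bed.
hgsql xenTro1 -e 'select count(*) from mostConserved'
hgsql xenTro1 -e 'select count(*) from phastCons5'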
# RE-INDEX 12 TABLES TO ACCOMMODATE LONG SCAFFOLD NAMES
hgsql xenTro1 -e 'drop index qName on blatFr1'
hgsql xenTro1 -e 'create index qName on blatFr1(qName(16))'
hgsql xenTro1 -e 'drop index tName on all_est'
hgsql xenTro1 -e 'create index tName on all_est(tName(16),bin)'
hgsql xenTro1 -e 'drop index tName_2 on all_est'
hgsql xenTro1 -e 'create index tName_2 on all_est(tName(16),tStart)'
hgsql xenTro1 -e 'drop index tName_3 on all_est'
hgsql xenTro1 -e 'create index tName_3 on all_est(tName(16),tEnd)'
hgsql xenTro1 -e 'drop index chrom on estOrientInfo'
hgsql xenTro1 -e 'create index chrom on estOrientInfo(chrom(16),bin)'
hgsql xenTro1 -e 'drop index chrom_2 on estOrientInfo'
hgsql xenTro1 -e 'create index chrom_2 on estOrientInfo(chrom(16),chromStart)'
hgsql xenTro1 -e 'drop index chrom_3 on estOrientInfo'
hgsql xenTro1 -e 'create index chrom_3 on estOrientInfo(chrom(16),chromEnd)'
hgsql xenTro1 -e 'drop index chrom on gc5Base'
hgsql xenTro1 -e 'create index chrom on gc5Base(chrom(16))'
hgsql xenTro1 -e 'drop index name on genscan'
hgsql xenTro1 -e 'create index name on genscan(name(16))'
hgsql xenTro1 -e 'drop index name2 on genscan'
hgsql xenTro1 -e 'create index name2 on genscan(name2(16))'
hgsql xenTro1 -e 'drop index chrom on genscan'
hgsql xenTro1 -e 'create index chrom on genscan(chrom(16),txStart)'
hgsql xenTro1 -e 'drop index chrom_2 on genscan'
hgsql xenTro1 -e 'create index chrom_2 on genscan(chrom(16),txEnd)'
hgsql xenTro1 -e 'drop index chrom on genscanSubopt'
hgsql xenTro1 -e 'create index chrom on genscanSubopt(chrom(16),bin)'
hgsql xenTro1 -e 'drop index chrom_2 on genscanSubopt'
hgsql xenTro1 -e 'create index chrom_2 on genscanSubopt(chrom(16),chromStart)'
hgsql xenTro1 -e 'drop index chrom_3 on genscanSubopt'
hgsql xenTro1 -e 'create index chrom_3 on genscanSubopt(chrom(16),chromEnd)'
hgsql xenTro1 -e 'drop index tName on intronEst'
hgsql xenTro1 -e 'create index tName on intronEst(tName(16),bin)'
hgsql xenTro1 -e 'drop index tName_2 on intronEst'
hgsql xenTro1 -e 'create index tName_2 on intronEst(tName(16),tStart)'
hgsql xenTro1 -e 'drop index tName_3 on intronEst'
hgsql xenTro1 -e 'create index tName_3 on intronEst(tName(16),tEnd)'
hgsql xenTro1 -e 'drop index chrom on mostConserved'
hgsql xenTro1 -e 'create index chrom on mostConserved(chrom(16),bin)'
hgsql xenTro1 -e 'drop index chrom_2 on mostConserved'
hgsql xenTro1 -e 'create index chrom_2 on mostConserved(chrom(16),chromStart)'
hgsql xenTro1 -e 'drop index chrom_3 on mostConserved'
hgsql xenTro1 -e 'create index chrom_3 on mostConserved(chrom(16),chromEnd)'
hgsql xenTro1 -e 'drop index tName on all_mrna'
hgsql xenTro1 -e 'create index tName on all_mrna(tName(16),bin)'
hgsql xenTro1 -e 'drop index tName_2 on all_mrna'
hgsql xenTro1 -e 'create index tName_2 on all_mrna(tName(16),tStart)'
hgsql xenTro1 -e 'drop index tName_3 on all_mrna'
hgsql xenTro1 -e 'create index tName_3 on all_mrna(tName(16),tEnd)'
hgsql xenTro1 -e 'drop index chrom on mrnaOrientInfo'
hgsql xenTro1 -e 'create index chrom on mrnaOrientInfo(chrom(16),bin)'
hgsql xenTro1 -e 'drop index chrom_2 on mrnaOrientInfo'
hgsql xenTro1 -e 'create index chrom_2 on mrnaOrientInfo(chrom(16),chromStart)'
hgsql xenTro1 -e 'drop index chrom_3 on mrnaOrientInfo'
hgsql xenTro1 -e 'create index chrom_3 on mrnaOrientInfo(chrom(16),chromEnd)'
hgsql xenTro1 -e 'drop index chrom on multiz5way'
hgsql xenTro1 -e 'create index chrom on multiz5way(chrom(16),bin)'
hgsql xenTro1 -e 'drop index chrom_2 on multiz5way'
hgsql xenTro1 -e 'create index chrom_2 on multiz5way(chrom(16),chromStart)'
hgsql xenTro1 -e 'drop index chrom_3 on multiz5way'
hgsql xenTro1 -e 'create index chrom_3 on multiz5way(chrom(16),chromEnd)'
hgsql xenTro1 -e 'drop index chrom on phastCons5'
hgsql xenTro1 -e 'create index chrom on phastCons5(chrom(16))'

# MAKE DOWNLOADABLE SEQUENCE FILES (DONE, 2004-11-04, Fan)
ssh kksilo
cd /cluster/data/xenTro1
#- Build the .zip files
rm -rf zip
mkdir zip
# scaffolds
zip -j zip/scaffolds.zip downloads/xenopus.041015.fasta &
# RepeatMasker library
zip -j zip/RmLib.zip downloads/xt3.lib1.fasta &
# soft masked scaffold fasta
zip -j zip/softmask2.zip xenTro1.softmask2.fa &
# RepeatMasker out files
zip -j zip/repeats.zip repeats/repeats.out &
# hard masked scaffold fasta
zip -j zip/hardmask2.zip xenTro1.hardmask2.fa &
# TRF output files
cd bed/simpleRepeat
zip ../../zip/trf.zip trf.bed &
cd ../..
# get GenBank native mRNAs
cd /cluster/data/genbank
./bin/i386/gbGetSeqs -db=xenTro1 -native GenBank mrna \
    /cluster/data/xenTro1/zip/mrna.fa
# get GenBank xeno mRNAs
./bin/i386/gbGetSeqs -db=xenTro1 -xeno GenBank mrna \
    /cluster/data/xenTro1/zip/xenoMrna.fa &
# get native GenBank ESTs
./bin/i386/gbGetSeqs -db=xenTro1 -native GenBank est \
    /cluster/data/xenTro1/zip/est.fa &
cd /cluster/data/xenTro1/zip
# zip GenBank native and xeno mRNAs and native ESTs,
# after the above gbGetSeqs jobs are done.
zip -j mrna.zip mrna.fa
zip -j xenoMrna.zip xenoMrna.fa
zip -j est.zip est.fa

ssh hgwdev
cd /cluster/data/xenTro1/zip
set gp = /usr/local/apache/htdocs/goldenPath/xenTro1
mkdir -p $gp/bigZips
cp -p *.zip $gp/bigZips
cd $gp/bigZips
md5sum *.zip > md5sum.txt
# Take a look at bigZips, update README.txt.
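# (Not part of the original log.)  A quick integrity pass on the copied
# archives: md5sum -c re-verifies the checksums just written, and unzip -t
# exercises each zip (its last line reports "No errors detected").
cd /usr/local/apache/htdocs/goldenPath/xenTro1/bigZips
md5sum -c md5sum.txt
foreach z (*.zip)
    unzip -t $z | tail -1
end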
# MAKE VSHG17 DOWNLOADABLES (DONE 11/06/04 Fan)
ssh kksilo
cd /cluster/data/xenTro1/bed/bz.hg17/axtChain
gzip -c all.chain > /cluster/data/xenTro1/zip/human.chain.gz
gzip -c xenTro1.net > /cluster/data/xenTro1/zip/human.net.gz
mkdir /cluster/data/xenTro1/zip/axtNet
gzip -c ../axtNet/all.axt > /cluster/data/xenTro1/zip/axtNet/all.axt.gz
ssh hgwdev
mkdir /usr/local/apache/htdocs/goldenPath/xenTro1/vsHg17
cd /usr/local/apache/htdocs/goldenPath/xenTro1/vsHg17
mv /cluster/data/xenTro1/zip/human*.gz .
mv /cluster/data/xenTro1/zip/axtNet .
md5sum *.gz > md5sum.txt
# Copy over & edit README.txt w/pointers to chain, net formats.

# MAKE VSMM5 DOWNLOADABLES (DONE 11/06/04 Fan)
ssh kksilo
cd /cluster/data/xenTro1/bed/bz.mm5/axtChain
gzip -c all.chain > /cluster/data/xenTro1/zip/mouse.chain.gz
gzip -c xenTro1.net > /cluster/data/xenTro1/zip/mouse.net.gz
mkdir /cluster/data/xenTro1/zip/axtNet
gzip -c ../axtNet/all.axt > /cluster/data/xenTro1/zip/axtNet/all.axt.gz
ssh hgwdev
mkdir /usr/local/apache/htdocs/goldenPath/xenTro1/vsMm5
cd /usr/local/apache/htdocs/goldenPath/xenTro1/vsMm5
mv /cluster/data/xenTro1/zip/mouse*.gz .
mv /cluster/data/xenTro1/zip/axtNet .
md5sum *.gz > md5sum.txt
# Copy over & edit README.txt w/pointers to chain, net formats.

# MAKE VSGALGAL2 DOWNLOADABLES (DONE 11/06/04 Fan)
ssh kksilo
cd /cluster/data/xenTro1/bed/bz.galGal2/axtChain
gzip -c all.chain > /cluster/data/xenTro1/zip/chicken.chain.gz
gzip -c xenTro1.net > /cluster/data/xenTro1/zip/chicken.net.gz
mkdir /cluster/data/xenTro1/zip/axtNet
gzip -c ../axtNet/all.axt > /cluster/data/xenTro1/zip/axtNet/all.axt.gz
ssh hgwdev
mkdir /usr/local/apache/htdocs/goldenPath/xenTro1/vsGalGal2
cd /usr/local/apache/htdocs/goldenPath/xenTro1/vsGalGal2
mv /cluster/data/xenTro1/zip/chicken*.gz .
mv /cluster/data/xenTro1/zip/axtNet .
md5sum *.gz > md5sum.txt
# Copy over & edit README.txt w/pointers to chain, net formats.

# MAKE VSDANRER1 DOWNLOADABLES (DONE 11/06/04 Fan)
ssh kksilo
cd /cluster/data/xenTro1/bed/bz.danRer1/axtChain
gzip -c all.chain > /cluster/data/xenTro1/zip/zebrafish.chain.gz
gzip -c xenTro1.net > /cluster/data/xenTro1/zip/zebrafish.net.gz
mkdir /cluster/data/xenTro1/zip/axtNet
gzip -c ../axtNet/all.axt > /cluster/data/xenTro1/zip/axtNet/all.axt.gz
ssh hgwdev
mkdir /usr/local/apache/htdocs/goldenPath/xenTro1/vsDanRer1
cd /usr/local/apache/htdocs/goldenPath/xenTro1/vsDanRer1
mv /cluster/data/xenTro1/zip/zebrafish*.gz .
mv /cluster/data/xenTro1/zip/axtNet .
md5sum *.gz > md5sum.txt
# Copy over & edit README.txt w/pointers to chain, net formats.
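# (Not part of the original log.)  A quick check across the four pairwise
# download directories -- each should hold a chain, a net, an axtNet
# subdirectory, and md5sum.txt, and the gzip files should test clean:
cd /usr/local/apache/htdocs/goldenPath/xenTro1
foreach d (vsHg17 vsMm5 vsGalGal2 vsDanRer1)
    echo $d
    ls $d
    gzip -t $d/*.gz $d/axtNet/*.gz && echo "    gzip OK"
end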
# LOAD CPG ISLANDS (DONE 10/28/04 Fan)
ssh hgwdev
mkdir -p /cluster/data/xenTro1/bed/cpgIsland
cd /cluster/data/xenTro1/bed/cpgIsland
# Build software from Asif Chinwalla (achinwal@watson.wustl.edu)
cvs co hg3rdParty/cpgIslands
cd hg3rdParty/cpgIslands
make
mv cpglh.exe /cluster/data/xenTro1/bed/cpgIsland/

ssh kksilo
cd /cluster/data/xenTro1/bed/cpgIsland
foreach f (../../softMasked2/*.fa)
    set fout=$f:t:r:r.cpg
    echo running cpglh on $f to $fout
    ./cpglh.exe $f > $fout
end

# Transform cpglh output to bed +
cat << '_EOF_' > filter.awk
/* Input columns: */
/* chrom, start, end, len, CpG: cpgNum, perGc, cpg:gpc, observed:expected */
/* chr1\t 41776\t 42129\t 259\t CpG: 34\t 65.8\t 0.92\t 0.94 */
/* Output columns: */
/* chrom, start, end, name, length, cpgNum, gcNum, perCpg, perGc, obsExp */
/* chr1\t41775\t42129\tCpG: 34\t354\t34\t233\t19.2\t65.8\t0.94 */
{
    $2 = $2 - 1;
    width = $3 - $2;
    printf("%s\t%d\t%s\t%s %s\t%s\t%s\t%0.0f\t%0.1f\t%s\t%s\n",
           $1, $2, $3, $5, $6, width,
           $6, width*$7*0.01, 100.0*2*$6/width,
           $7, $9);
}
'_EOF_'
# << this line makes emacs coloring happy
awk -f filter.awk *.cpg > cpgIsland.bed

# load into database:
ssh hgwdev
cd /cluster/data/xenTro1/bed/cpgIsland
hgLoadBed xenTro1 cpgIslandExt -tab -noBin \
    -sqlTable=$HOME/kent/src/hg/lib/cpgIslandExt.sql cpgIsland.bed
wc -l cpgIsland.bed
# 44091 cpgIsland.bed
featureBits xenTro1 cpgIslandExt
# 19279778 bases of 1381238994 (1.396%) in intersection

# MAKE Human Proteins track (hg17 DONE 2005-1-15 braney)
ssh kksilo
mkdir -p /cluster/data/xenTro1/blastDb
cd /cluster/data/xenTro1/blastDb
for i in `cat ../chrom.lst`; do ls -1 ../$i/*/*.fa ; done > list
for i in `cat list`
do
    ln -s $i .
    formatdb -i `basename $i` -p F
done
rm *.log *.fa list

ssh kkr1u00
mkdir -p /iscratch/i/xenTro1/blastDb
cd /cluster/data/xenTro1/blastDb
for i in nhr nin nsq; do cp *.$i /iscratch/i/xenTro1/blastDb ; echo $i; done
cd
iSync > sync.out

mkdir -p /cluster/data/xenTro1/bed/tblastn.hg17KG
cd /cluster/data/xenTro1/bed/tblastn.hg17KG
echo /iscratch/i/xenTro1/blastDb/*.nsq | xargs ls -S | sed "s/\.nsq//" > query.lst

# back to kksilo
exit
cd /cluster/data/xenTro1/bed/tblastn.hg17KG
calc `wc /cluster/data/hg17/bed/blat.hg17KG/hg17KG.psl | awk "{print \\\$1}"`/\(300000/`wc query.lst | awk "{print \\\$1}"`\)
# 42156/(300000/498) = 69.978960
mkdir -p /cluster/bluearc/xenTro1/bed/tblastn.hg17KG/kgfa
split -l 70 /cluster/data/hg17/bed/blat.hg17KG/hg17KG.psl /cluster/bluearc/xenTro1/bed/tblastn.hg17KG/kgfa/kg
ln -s /cluster/bluearc/xenTro1/bed/tblastn.hg17KG/kgfa kgfa
cd kgfa
for i in *; do pslxToFa $i $i.fa; rm $i; done
cd ..
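# (Not part of the original log.)  Quick cross-check of the split: 42156
# entries at 70 per chunk should give roughly 603 query files, which also
# sets the chain job count further below.
ls kgfa/*.fa | wc -l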
ls -1S kgfa/*.fa > kg.lst
mkdir -p /cluster/bluearc/xenTro1/bed/tblastn.hg17KG/blastOut
ln -s /cluster/bluearc/xenTro1/bed/tblastn.hg17KG/blastOut
for i in `cat kg.lst`; do mkdir blastOut/`basename $i .fa`; done
tcsh
cat << '_EOF_' > blastGsub
#LOOP
blastSome $(path1) {check in line $(path2)} {check out exists blastOut/$(root2)/q.$(root1).psl }
#ENDLOOP
'_EOF_'
cat << '_EOF_' > blastSome
#!/bin/sh
BLASTMAT=/iscratch/i/blast/data
export BLASTMAT
g=`basename $2`
f=/tmp/`basename $3`.$g
for eVal in 0.01 0.001 0.0001 0.00001 0.000001 1E-09 1E-11
do
    if /scratch/blast/blastall -M BLOSUM80 -m 0 -F no -e $eVal -p tblastn -d $1 -i $2 -o $f.8
    then
        mv $f.8 $f.1
        break;
    fi
done
if test -f $f.1
then
    if /cluster/bin/i386/blastToPsl $f.1 $f.2
    then
        liftUp -nosort -type=".psl" -pslQ -nohead $3.tmp /cluster/data/hg17/bed/blat.hg17KG/protein.lft warn $f.2
        if pslCheck -prot $3.tmp
        then
            mv $3.tmp $3
            rm -f $f.1 $f.2
        fi
        exit 0
    fi
fi
rm -f $f.1 $f.2 $3.tmp $f.8
exit 1
'_EOF_'
chmod +x blastSome
gensub2 query.lst kg.lst blastGsub blastSpec

ssh kk
cd /cluster/data/xenTro1/bed/tblastn.hg17KG
para create blastSpec
para push
# Completed: 253435 of 253435 jobs
# CPU time in finished jobs:   58003988s  966733.13m  16112.22h  671.34d  1.839 y
# IO & Wait Time:              13196517s  219941.96m   3665.70h  152.74d  0.418 y
# Average job time:                 281s       4.68m      0.08h    0.00d
# Longest job:                     6238s     103.97m      1.73h    0.07d
# Submission to last job:        174503s    2908.38m     48.47h    2.02d

cat << '_EOF_' > chainGsub
#LOOP
chainSome $(path1)
#ENDLOOP
'_EOF_'
cat << '_EOF_' > chainSome
(cd $1; cat q.*.psl | simpleChain -prot -outPsl -maxGap=100000 stdin ../c.`basename $1`.psl)
'_EOF_'
chmod +x chainSome
ls -1dS `pwd`/blastOut/kg?? > chain.lst
gensub2 chain.lst single chainGsub chainSpec

ssh kki
cd /cluster/data/xenTro1/bed/tblastn.hg17KG
para create chainSpec
para push
# Completed: 603 of 603 jobs
# CPU time in finished jobs:        883s      14.72m     0.25h    0.01d  0.000 y
# IO & Wait Time:                 75640s    1260.66m    21.01h    0.88d  0.002 y
# Average job time:                 127s       2.12m     0.04h    0.00d
# Longest job:                      342s       5.70m     0.10h    0.00d
# Submission to last job:          5747s      95.78m     1.60h    0.07d

exit
# back to kksilo
cd /cluster/data/xenTro1/bed/tblastn.hg17KG/blastOut
for i in kg??
do
    cat c.$i.psl | awk "(\$13 - \$12)/\$11 > 0.6 {print}" > c60.$i.psl
    sort -rn c60.$i.psl | pslUniq stdin u.$i.psl
    awk "((\$1 / \$11) ) > 0.60 { print }" c60.$i.psl > m60.$i.psl
    echo $i
done
cat u.*.psl m60* | sort -T /tmp -k 14,14 -k 16,16n -k 17,17n | uniq > /cluster/data/xenTro1/bed/tblastn.hg17KG/blastHg17KG.psl
cd ..

ssh hgwdev
cd /cluster/data/xenTro1/bed/tblastn.hg17KG
hgLoadPsl xenTro1 blastHg17KG.psl

# back to kksilo
rm -rf blastOut
# End tblastn

# MAKE JGI FILTERED GENES (DONE 3/16/05 kent)
#
# Copy email attachment from Igor Grigoriev (IVGrigoriev@lbl.gov)
# to /cluster/data/xenTro1/bed/jgiFilteredModels/xenopus.FilteredModels
# and then
ssh hgwdev
cd /cluster/data/xenTro1/bed/jgiFilteredModels
hgJgiFilteredModels xenopus.FilteredModels jgiFilteredModels.gff
ldHgGene xenTro1 jgiFilteredModels jgiFilteredModels.gff
# Then edit trackDb to put in the track.
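# (Not part of the original log.)  Row-count sanity check on the two tracks
# just loaded:
hgsql xenTro1 -e 'select count(*) from blastHg17KG'
hgsql xenTro1 -e 'select count(*) from jgiFilteredModels'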
# MAKE NIBB PICS (DONE 3/17/05 kent)
#
# Load image data from NIBB (contact Naoto Ueno nueno@nibb.ac.jp)
# from CD's into /cluster/store11/visiGene/offLine/nibbFrog

# Get sequence of assembled ESTs from NIBB
ssh hgwdev
mkdir -p /cluster/data/xenTro1/bed/nibbPics
cd /cluster/data/xenTro1/bed/nibbPics
wget -O contigs.fa.gz http://xenopus.nibb.ac.jp/data/contigs.gz
wget -O f.fa.gz http://xenopus.nibb.ac.jp/data/f.seq.gz
wget -O r.fa.gz http://xenopus.nibb.ac.jp/data/r.seq.gz
wget -O i.fa.gz http://xenopus.nibb.ac.jp/data/i.seq.gz
gunzip contigs.fa.gz
gunzip f.fa.gz
gunzip r.fa.gz
gunzip i.fa.gz

# Figure out sequences used to probe images.
nibbImageProbes /cluster/store11/visiGene/offline/nibbFrog i.fa f.fa r.fa \
    nibbImageProbes.fa nibbImageProbes.tab

# Go to pk, create directory for cluster job on SAN, and set up sequence.
ssh pk
cd /san/sanvol1/scratch/xenTro1/
mkdir nibbPics
cd nibbPics
cp /cluster/data/xenTro1/bed/nibbPics/nibbImageProbes.fa .

# Set up blatz list files
mkdir run
cd run
mkdir psl
awk '{print $1;}' /cluster/data/xenTro1/chrom.sizes > genome.lst
ls -1 ../nibbImageProbes.fa > mrna.lst

# Make a little wrapper script that runs blatz and pslFilter
cat << '_EOF_' > doBlatz
#!/bin/tcsh -ef
mkdir -p /tmp/nibPic
blatz -rna -minScore=6000 /san/sanvol1/scratch/xenTro1/xenTro1.2bit:$1 ../$2.fa -out=psl stdout | pslFilter -minScore=100 -noHead stdin /tmp/nibPic/$1_$2.psl
mv /tmp/nibPic/$1_$2.psl $3
'_EOF_'
# << this line keeps emacs coloring happy
chmod a+x doBlatz

# Make gensub2 script
cat << '_EOF_' > gsub
#LOOP
doBlatz $(root1) $(root2) {check out exists psl/$(root1)_$(root2).psl}
#ENDLOOP
'_EOF_'
# << this line keeps emacs coloring happy

# Create parasol batch
gensub2 genome.lst mrna.lst gsub spec
para create spec
# Then do usual para try/para push/para time
# Completed: 27064 of 27064 jobs
# CPU time in finished jobs:     119714s    1995.23m    33.25h    1.39d  0.004 y
# IO & Wait Time:                 83315s    1388.59m    23.14h    0.96d  0.003 y
# Average job time:                   8s       0.13m     0.00h    0.00d
# Longest running job:                0s       0.00m     0.00h    0.00d
# Longest finished job:             254s       4.23m     0.07h    0.00d
# Submission to last job:          2338s      38.97m     0.65h    0.03d

# Do sorting and filtering
catDir psl | sort -k 10 | pslReps stdin nearBest.psl /dev/null -nohead \
    -minAli=0.75 -nearTop=0.001 -minCover=0.25 -minNearTopSize=80
sort -k 14 nearBest.psl > /cluster/data/xenTro1/bed/nibbPics/nibbImageProbes.psl

# MAKE NIBB EST MAPPINGS

# Go to pk, create directory for cluster job on SAN, and set up sequence.
ssh pk
cd /san/sanvol1/scratch/xenTro1/
mkdir nibPics
cd nibPics
mkdir split
faSplit sequence /cluster/data/xenTro1/bed/nibbPics/contigs.fa 16 split/xl
# Note things would go smoother using 4 instead of 16 in the faSplit above.
# I was worried about hippos, but there were none.
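# (Not part of the original log.)  Confirm the split really produced about 16
# pieces; with 27064 scaffolds that predicts the 27064 x 16 = 433024 cluster
# jobs reported below.
ls split/*.fa | wc -l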
# Set up blatz list files
mkdir run2
cd run2
mkdir psl
awk '{print $1;}' /cluster/data/xenTro1/chrom.sizes > genome.lst
ls -1 ../split/*.fa > mrna.lst

# Make a little wrapper script that runs blatz and pslFilter
cat << '_EOF_' > doBlatz
#!/bin/tcsh -ef
mkdir -p /tmp/nibPic
blatz -rna -minScore=6000 /san/sanvol1/scratch/xenTro1/xenTro1.2bit:$1 ../split/$2.fa -out=psl stdout | pslFilter -minScore=100 -noHead stdin /tmp/nibPic/$1_$2.psl
mv /tmp/nibPic/$1_$2.psl $3
'_EOF_'
# << this line keeps emacs coloring happy
chmod a+x doBlatz

# Make gensub2 script
cat << '_EOF_' > gsub
#LOOP
doBlatz $(root1) $(root2) {check out exists psl/$(root1)_$(root2).psl}
#ENDLOOP
'_EOF_'
# << this line keeps emacs coloring happy

# Create parasol batch
gensub2 genome.lst mrna.lst gsub spec
para create spec
# Then do usual para try/para push/para time
# Completed: 433024 of 433024 jobs
# CPU time in finished jobs:    5489117s   91485.29m  1524.75h   63.53d  0.174 y
# IO & Wait Time:               1109930s   18498.83m   308.31h   12.85d  0.035 y
# Average job time:                  15s       0.25m     0.00h    0.00d
# Longest running job:                0s       0.00m     0.00h    0.00d
# Longest finished job:             959s      15.98m     0.27h    0.01d
# Submission to last job:         23019s     383.65m     6.39h    0.27d

# Merge results into one file.  This took at least an hour (left it going overnight).
catDir psl > ../raw.psl

# Sort and do additional filtering.
cd ..
sort -k 10 raw.psl | pslReps stdin nearBest.psl /dev/null -nohead \
    -minAli=0.80 -nearTop=0.001 -minCover=0.25 -minNearTopSize=100
sort -k 14 nearBest.psl > contigs.psl

# Convert names
nibbNameFix contigs.fa contigs.psl nibbEstContigs.fa nibbEstContigs.psl
# Still TODO - massage names a little to get rid of NCBIisms in a few places
# and to decide how to link to NIBB.

# Clean up.  This also takes a good while.
rm -r run2/psl split
rm raw.psl nearBest.psl

#########################################################################
# MOUSE NET/CHAINS MM6 - Info contained in makeMm6.doc (200503 Hiram)

############################################################################
## BLASTZ swap from mm8 alignments (DONE - 2006-02-21 - Hiram)
ssh kk
cd /cluster/data/mm8/bed/blastzXenTro1.2006-02-21
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
    -bigClusterHub=kk -chainMinScore=5000 -chainLinearGap=loose \
    -swap `pwd`/DEF > swap.out 2>&1 &

time nice -n +19 featureBits xenTro1 chainMm8Link
# 59307185 bases of 1381238994 (4.294%) in intersection

##########################################################################
# GenBank gbMiscDiff table (markd 2007-01-10)
# Supports `NCBI Clone Validation' section of mgcGenes details page
# genbank release 157.0 now contains misc_diff fields for MGC clones
# reloading mRNAs results in gbMiscDiff table being created.
./bin/gbDbLoadStep -reload -srcDb=genbank -type=mrna xenTro1
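# (Not part of the original log.)  Confirm the reload actually created the
# table:
hgsql xenTro1 -e 'show tables like "gbMiscDiff"'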