# for emacs: -*- mode: sh; -*-

# Ciona Intestinalis V2.0 from JGI

# NOTE:  this doc may have genePred loads that fail to include
# the bin column.  Please correct that for the next build by adding
# a bin column when you make any of these tables:
#
# mysql> SELECT tableName, type FROM trackDb WHERE type LIKE "%Pred%";
# +-------------+---------------------------------+
# | tableName   | type                            |
# +-------------+---------------------------------+
# | refGene     | genePred refPep refMrna         |
# | xenoRefGene | genePred xenoRefPep xenoRefMrna |
# +-------------+---------------------------------+

# This file is a record of commands typed on several hosts (ssh ...) and in
# several shells (tcsh/csh sections are marked); it is not meant to be
# executed top to bottom as a single script.

# DOWNLOAD SEQUENCE
    ssh kkstore02
    mkdir /cluster/store11/ci2
    ln -s /cluster/store11/ci2 /cluster/data
    cd /cluster/data/ci2
    # The faSplit commands below read fromJGI/*fasta, so the download is
    # placed in a fromJGI/ subdirectory.
    mkdir fromJGI
    cd fromJGI
    wget "ftp://ftp.jgi-psf.org/pub/JGI_data/Ciona/v2.0/ciona050324.unmasked.fasta.gz"
    gunzip ciona050324.unmasked.fasta.gz
    cd ..
    mkdir chunks500k jkStuff scaffolds nib
    cd scaffolds
    # one .fa per scaffold, then one .nib per scaffold
    faSplit byname ../fromJGI/*fasta .
    for i in *.fa
    do
        faToNib $i ../nib/`basename $i .fa`.nib
    done
    cd ..
    # chunks500k/x*.fa pieces for the cluster runs; the lift file written
    # here (jkStuff/chunks500k.lft) is used by the liftUp steps below.
    faSplit gap fromJGI/*fasta 500000 chunks500k/x -lift=jkStuff/chunks500k.lft -minGapSize=100

# REPEAT MASKING
    #- Make the run directory and job list:
    cd /cluster/data/ci2
    tcsh
    # RMCiona <dir> <file>: copies <file> to /tmp/ci2/<file>/, runs
    # RepeatMasker there, copies the result files back to <dir>, cleans up.
    cat << '_EOF_' > jkStuff/RMCiona
#!/bin/csh -fe

cd $1
pushd .
/bin/mkdir -p /tmp/ci2/$2
/bin/cp $2 /tmp/ci2/$2/
cd /tmp/ci2/$2
/cluster/bluearc/RepeatMasker/RepeatMasker -ali -s $2
popd
/bin/cp /tmp/ci2/$2/$2.out ./
if (-e /tmp/ci2/$2/$2.align) /bin/cp /tmp/ci2/$2/$2.align ./
if (-e /tmp/ci2/$2/$2.tbl) /bin/cp /tmp/ci2/$2/$2.tbl ./
if (-e /tmp/ci2/$2/$2.cat) /bin/cp /tmp/ci2/$2/$2.cat ./
/bin/rm -fr /tmp/ci2/$2/*
/bin/rmdir --ignore-fail-on-non-empty /tmp/ci2/$2
/bin/rmdir --ignore-fail-on-non-empty /tmp/ci2
'_EOF_'
    # << this line makes emacs coloring happy
    chmod +x jkStuff/RMCiona
    exit
    mkdir RMRun
    # one parasol job line per chunk
    for i in chunks500k/*.fa
    do
        d=`dirname $i`
        f=`basename $i`
        echo "/cluster/data/ci2/jkStuff/RMCiona /cluster/data/ci2/$d $f {check out line+ /cluster/data/ci2/$d/$f.out} "
    done > RMRun/RMJobs

    #- Do the run
    ssh kk
    cd /cluster/data/ci2/RMRun
    para create RMJobs
    para try, para check, para check, para push, para check,...
# Completed: 4617 of 4617 jobs
# CPU time in finished jobs:     133300s    2221.66m    37.03h    1.54d  0.004 y
# IO & Wait Time:                703794s   11729.90m   195.50h    8.15d  0.022 y
# Average job time:                 181s       3.02m     0.05h    0.00d
# Longest finished job:             651s      10.85m     0.18h    0.01d
# Submission to last job:          2948s      49.13m     0.82h    0.03d

    featureBits ci2 rmsk
    # 22881665 bases of 141233565 (16.201%) in intersection

# SIMPLE REPEATS (TRF)
    ssh kkstore02
    mkdir -p /cluster/data/ci2/bed/simpleRepeat
    cd /cluster/data/ci2/bed/simpleRepeat
    mkdir trf
    tcsh
    cp /dev/null jobs.csh
    foreach d (/cluster/data/ci2/chunks500k)
      foreach f ($d/*.fa)
        set fout = $f:t:r.bed
        echo $fout
        echo "/cluster/bin/i386/trfBig -trf=/cluster/bin/i386/trf $f /dev/null -bedAt=trf/$fout -tempDir=/tmp" >> jobs.csh
      end
    end
    screen
    # capture the run's output in jobs.log, which the tail below follows
    csh -ef jobs.csh >&! jobs.log &
    # check on this with
    tail -f jobs.log
    wc -l jobs.csh
    ls -1 trf | wc -l
    endsInLf trf/*

    # When job is done do:
    # trf/*.bed were made from chunks500k pieces; jkStuff/chunks500k.lft is
    # the lift file written by "faSplit gap" above (no liftAll.lft is made
    # anywhere in this doc).
    liftUp simpleRepeat.bed /cluster/data/ci2/jkStuff/chunks500k.lft error trf/*.bed

    # Load into the database:
    ssh hgwdev
    hgLoadBed ci2 simpleRepeat /cluster/data/ci2/bed/simpleRepeat/simpleRepeat.bed -sqlTable=$HOME/kent/src/hg/lib/simpleRepeat.sql
    featureBits ci2 simpleRepeat
    # 6047914 bases of 141233565 (4.282%) in intersection

# Create the database.
    ssh hgwdev
    echo 'create database ci2' | hgsql ''

# CREATING GRP TABLE FOR TRACK GROUPING
    echo "create table grp (PRIMARY KEY(NAME)) select * from hg17.grp" | hgsql ci2

    echo 'insert into blatServers values("ci2", "blat10", "17780", "1"); \
          insert into blatServers values("ci2", "blat10", "17781", "0");' \
    | hgsql -h genome-testdb hgcentraltest

# Add dbDb and defaultDb entries:
    # scientificName spelling corrected to "Ciona intestinalis"
    echo 'insert into dbDb (name, description, nibPath, organism, defaultPos, active, orderKey, genome, scientificName, \
          htmlPath, hgNearOk) values("ci2", "March 2005", "/gbdb/ci2/nib", "Ciona", "", 1, \
          44, "C. intestinalis", "Ciona intestinalis", "/gbdb/ci2/html/description.html", 0);' \
    | hgsql -h genome-testdb hgcentraltest

# STORING O+O SEQUENCE AND ASSEMBLY INFORMATION

    # Make symbolic links in /gbdb/ci2/nib to the real nibs.
    ssh hgwdev
    mkdir /gbdb/ci2
    ln -s /cluster/data/ci2/nib /gbdb/ci2/nib
    # Load /gbdb/ci2/nib paths into database
    cd /cluster/data/ci2
    hgsql ci2 < ~kent/src/hg/lib/chromInfo.sql
    hgNibSeq -preMadeNib ci2 /gbdb/ci2/nib scaffolds/*.fa

# make gcPercent table
    ssh hgwdev
    mkdir -p /cluster/data/ci2/bed/gcPercent
    cd /cluster/data/ci2/bed/gcPercent
    hgsql ci2 < ~/kent/src/hg/lib/gcPercent.sql
    hgGcPercent ci2 ../../nib

#dropped gcPercent table (DONE braney 2005-08-19)

### AUTO UPDATE GENBANK MRNA RUN  (DONE markd 2005-08-17)
    # genbank done with revised alignment procedure

    # Update genbank config and source in CVS:
    cd ~/kent/src/hg/makeDb/genbank
    cvs update etc

    # Edit etc/genbank.conf and add these lines:
    # NOTE: braney created a ooc and this was added after the initial alignment
# ci2 (ciona intestinalis)
ci2.serverGenome = /cluster/data/ci2/ci2.2bit
ci2.clusterGenome = /iscratch/i/ci2/ci2.2bit
ci2.ooc = no
ci2.maxIntron = 20000
ci2.lift = no
ci2.refseq.mrna.native.load = yes
ci2.refseq.mrna.xeno.load = yes
ci2.refseq.mrna.xeno.pslCDnaFilter = -minCover=0.25 -coverNearTop=0.005 -minId=0.15 -idNearTop=0.005 -maxRepMatch=0.4 -bestOverlap
ci2.genbank.mrna.xeno.load = yes
ci2.genbank.est.xeno.load = no
ci2.downloadDir = ci2
ci2.perChromTables = no

    cvs ci etc/genbank.conf
    # Install to /cluster/data/genbank:
    make etc-update

    ssh kkstore02
    cd /cluster/data/genbank
    # do mrnas and ests in one run
    nice bin/gbAlignStep -initial ci2 &

    # Load results:
    ssh hgwdev
    cd /cluster/data/genbank
    nice bin/gbDbLoadStep -drop -initialLoad ci2

### gap and repeats tables
    ssh hgwdev
    mkdir -p /cluster/data/ci2/bed/gapRmsk
    cd /cluster/data/ci2/bed/gapRmsk
    simpleGap /cluster/data/ci2/nib gap.bed repeats.bed
    # "if exists" so the first run, when there is no gap table yet, does not fail
    echo "drop table if exists gap;" | hgsql ci2
    hgsql ci2 < ~/kent/src/hg/lib/gap.sql
    hgLoadBed -oldTable ci2 gap gap.bed
    echo "create index chrom on gap (chrom(13), bin) ;" | hgsql ci2
    echo "create index chrom_2 on gap (chrom(13), chromStart);" | hgsql ci2
    echo "create index chrom_3 on gap (chrom(13), chromEnd);" | hgsql ci2

# do RepeatMasking
    # NOTE(review): the commands that build all.out and load the rmsk table
    # are not recorded in this doc; only the index changes are.
    cd /cluster/data/ci2
    echo "drop index bin on rmsk;" | hgsql ci2
    echo "drop index genoStart on rmsk;" | hgsql ci2
    echo "drop index genoEnd on rmsk;" | hgsql ci2
    echo "create index chrom_2 on rmsk (genoName(13), genoStart);" | hgsql ci2
    echo "create index chrom_3 on rmsk (genoName(13), genoEnd);" | hgsql ci2

    # TRF again, this time per scaffold
    ssh kkstore02
    mkdir -p /cluster/data/ci2/bed/simpleRepeat
    cd /cluster/data/ci2/bed/simpleRepeat
    mkdir trf
    for i in ../../scaffolds/*.fa
    do
        trfBig $i /dev/null -bedAt=trf/`basename $i .fa`.bed > /dev/null 2>&1 ; echo $i;
    done
    cat trf/* > simpleRepeat.bed

    ssh hgwdev
    hgLoadBed ci2 simpleRepeat /cluster/data/ci2/bed/simpleRepeat/simpleRepeat.bed \
        -sqlTable=$HOME/kent/src/hg/lib/simpleRepeat.sql

    ssh kkstore02
    cd /cluster/data/ci2/bed/simpleRepeat
    mkdir -p trfMask
    cd trf
    # keep only rows whose 5th column is <= 12 for masking
    for i in *.bed
    do
        awk '{if ($5 <= 12) print;}' $i > ../trfMask/$i
    done
    cd ../trfMask
    liftUp ../../../all.trfMask.bed ../../../jkStuff/chunks500k.lft error *.bed

# use RepeatMasker and simpleRepeat to build masked fa's
    cd /cluster/data/ci2
    mkdir maskedScaffolds
    cat scaffolds/*.fa | maskOutFa -soft stdin all.out all.tmp.fa
    maskOutFa -softAdd all.tmp.fa all.trfMask.bed all.masked.fa
    # NOTE(review): no recorded step splits all.masked.fa into
    # maskedScaffolds/*.fa, which the next section reads - confirm how
    # that directory was populated.

# Rebuild the nib files, using the soft masking in the fa:
    mkdir -p /cluster/data/ci2/nib
    cd /cluster/data/ci2/maskedScaffolds
    for i in *.fa
    do
        faToNib -softMask $i ../nib/`basename $i .fa`.nib
    done

# Make one big 2bit file as well, and make a link to it in /gbdb/ci2
    faToTwoBit *.fa ../ci2.2bit
    ln -s /cluster/data/ci2/ci2.2bit /gbdb/ci2/

### SNAP GENE PREDICTIONS FROM COLIN DEWEY
    ssh hgwdev
    mkdir /cluster/data/ci2/bed/snap
    cd /cluster/data/ci2/bed/snap
    # contact: Colin Dewey
    wget "http://hanuman.math.berkeley.edu/~cdewey/tracks/SNAP.CioInt_2.gff.gz"
    gunzip SNAP.CioInt_2.gff.gz
    ldHgGene -gtf -frame -id -geneName ci2 snapGene SNAP.CioInt_2.gff
    # 31491546 bases of 141233565 (22.297%) in intersection
    # in ci1 28699953 bases of 113192845 (25.355%) in intersection

# MAKE DOWNLOADABLE SEQUENCE FILES (re-DONE braney 2008-09-17)
    ssh kkstore02
    cd /cluster/data/ci2
    #- Build the .zip files
    # NOTE(review): zipAll.csh reads hardMaskedScaffolds/, which no recorded
    # step in this doc creates - confirm.
    csh
    cat << '_EOF_' > jkStuff/zipAll.csh
rm -rf zip
mkdir zip
zip -j zip/Scaffold.out.zip all.out
cd maskedScaffolds
zip -j ../zip/ScaffoldFa.zip *.fa
cd ../hardMaskedScaffolds
zip -j ../zip/ScaffoldFaMasked.zip *.fa
cd ../bed/simpleRepeat
zip -j ../../zip/ScaffoldTrf.zip simpleRepeat.bed
'_EOF_'
    # << this line makes emacs coloring happy
    chmod +x ./jkStuff/zipAll.csh
    ./jkStuff/zipAll.csh |& tee zipAll.log
    cd zip
    #- Look at zipAll.log to make sure all file lists look reasonable.
    #- Check zip file integrity:
    foreach f (*.zip)
      unzip -t $f > $f.test
      tail -1 $f.test
    end
    wc -l *.zip.test
    rm *.zip.test

    #- Copy the .zip files to hgwdev:/usr/local/apache/...
# Publish the zip files under the goldenPath download tree (csh syntax).
    ssh hgwdev
    cd /cluster/data/ci2/zip
    set gp = /usr/local/apache/htdocs/goldenPath/ci2
    mkdir -p $gp/bigZips
    cp -p *.zip $gp/bigZips
    # mkdir -p $gp/scaffolds
    # foreach f ( ../*/chr*.fa )
    #   zip -j $gp/chromosomes/$f:t.zip $f
    # end

    cd $gp/bigZips
    md5sum *.zip > md5sum.txt
    # cd $gp/chromosomes
    # md5sum *.zip > md5sum.txt
    # Take a look at bigZips/* and chromosomes/*, update their README.txt's

# MAKE 11.OOC FILE FOR BLAT
    mkdir /cluster/bluearc/ci2
    blat /cluster/data/ci2/ci2.2bit /dev/null /dev/null -tileSize=11 \
        -makeOoc=/cluster/bluearc/ci2/11.ooc -repMatch=100
    # Wrote 10830 overused 11-mers to /cluster/bluearc/ci2/11.ooc
    cp -p /cluster/bluearc/ci2/*.ooc /iscratch/i/ci2/
    iSync

#######################################################################
# GC5BASE (DONE - 2005-08-19 - Hiram)
    ssh kkstore02
    mkdir /cluster/data/ci2/bed/gc5Base
    cd /cluster/data/ci2/bed/gc5Base
    hgGcPercent -wigOut -doGaps -file=stdout -win=5 ci2 \
        /cluster/data/ci2 | wigEncode stdin gc5Base.wig gc5Base.wib
    ssh hgwdev
    cd /cluster/data/ci2/bed/gc5Base
    mkdir /gbdb/ci2/wib
    ln -s `pwd`/gc5Base.wib /gbdb/ci2/wib
    hgLoadWiggle ci2 gc5Base gc5Base.wig

#######################################################################
### AUTO UPDATE GENBANK MRNA RUN  (DONE markd 2005-08-31)
    # redo genbank revised alignment procedure once again to
    # pickup local near best pslCDnaFilter

    # Update genbank config and source in CVS:
    cd ~/kent/src/hg/makeDb/genbank
    cvs update etc

    # Edit etc/genbank.conf and add these lines:
# ci2 (ciona intestinalis)
ci2.serverGenome = /cluster/data/ci2/ci2.2bit
ci2.clusterGenome = /iscratch/i/ci2/ci2.2bit
ci2.ooc = /iscratch/i/ci2/11.ooc
ci2.maxIntron = 20000
ci2.lift = no
ci2.refseq.mrna.native.pslCDnaFilter = ${lowCover.refseq.mrna.native.pslCDnaFilter}
ci2.refseq.mrna.xeno.pslCDnaFilter = ${lowCover.refseq.mrna.xeno.pslCDnaFilter}
ci2.genbank.mrna.native.pslCDnaFilter = ${lowCover.genbank.mrna.native.pslCDnaFilter}
ci2.genbank.mrna.xeno.pslCDnaFilter = ${lowCover.genbank.mrna.xeno.pslCDnaFilter}
ci2.genbank.est.native.pslCDnaFilter = ${lowCover.genbank.est.native.pslCDnaFilter}
ci2.genbank.est.xeno.pslCDnaFilter = ${lowCover.genbank.est.xeno.pslCDnaFilter}
ci2.refseq.mrna.native.load = yes
ci2.refseq.mrna.xeno.load = yes
ci2.genbank.mrna.xeno.load = yes
ci2.genbank.est.xeno.load = no
ci2.downloadDir = ci2
ci2.perChromTables = no

    cvs ci etc/genbank.conf
    # Install to /cluster/data/genbank:
    make etc-update

    ssh kkstore02
    cd /cluster/data/genbank
    # do mrnas and ests in one run
    nice bin/gbAlignStep -initial ci2 &

    # Load results:
    ssh hgwdev
    cd /cluster/data/genbank
    nice bin/gbDbLoadStep -drop -initialLoad ci2

# Note: download sequences are made as part of the first
# genbank update on hgdownload.

############################################################################
#  Adding Ensembl Genes (DONE - 2008-02-22 - Hiram)
    ssh kkstore02
    cd /cluster/data/ci2
    cat << '_EOF_' > ci2.ensGene.ra
# required db and ensVersion variables
db ci2
ensVersion 48
# optional nameTranslation, the sed command that will transform
#       Ensemble names to UCSC names.  With quotes just to make sure.
nameTranslation "s/^\([0-9][pq]\)/chr0\1/; s/^\([0-9][0-9][pq]\)/chr\1/; "
'_EOF_'
#  << happy emacs

    # run against the .ra file written just above (was oryCun1.ensGene.ra,
    # a file this section never creates)
    doEnsGeneUpdate.pl ci2.ensGene.ra

##########################################################################pubStart
# Publications track (DONE - 04-27-12 - Max)

# article download and conversion is run every night on hgwdev:
# 22 22 * * * /hive/data/inside/literature/pubtools/pubCronDailyUpdate.sh
# the script downloads files into /hive/data/outside/literature/{PubMedCentral,ElsevierConsyn}/
# then converts them to text into /hive/data/outside/literature/{pmc,elsevier}

# all configuration of the pipeline is in /hive/data/inside/literature/pubtools/lib/pubConf.py

# data processing was run manually like this
export PATH=/cluster/home/max/bin/x86_64:/cluster/bin/x86_64:/cluster/home/max/software/bin/:/cluster/software/bin:/cluster/home/max/projects/pubtools:/cluster/home/max/bin/x86_64:/hive/groups/recon/local/bin:/usr/local/bin:/usr/bin:/bin:/usr/bin/X11:/cluster/home/max/usr/src/scripts:/cluster/home/max/usr/src/oneshot:/cluster/home/max/bin:/cluster/bin/scripts:.:/cluster/home/max/usr/bin:/usr/lib64/qt-3.3/bin:/usr/kerberos/bin:/usr/local/bin:/bin:/usr/bin:/usr/lpp/mmfs/bin/:/opt/dell/srvadmin/bin:/cluster/bin/scripts:/hive/users/hiram/cloud/ec2-api-tools-1.3-51254/bin:/cluster/home/max/bin:/usr/bin/X11:/usr/java/jdk1.6.0_20/bin:/cluster/home/max/bin:/hive/data/inside/literature/pubtools/
# pmc
cd /hive/data/inside/literature/pubtools/runs/pmcBlat/
pubBlat init /hive/data/inside/literature/blat/pmc/ /hive/data/inside/literature/text/pmc
ssh swarm
cd /hive/data/inside/literature/pubtools/runs/pmcBlat/
pubBlat steps:annot-tables
exit
pubBlat load
# elsevier
cd /hive/data/inside/literature/pubtools/runs/elsBlat/
pubBlat init /hive/data/inside/literature/blat/elsevier/ /hive/data/inside/literature/text/elsevier
ssh swarm
cd /hive/data/inside/literature/pubtools/runs/elsBlat/
pubBlat steps:annot-tables
exit
pubBlat load
#--pubEnd