#!/bin/sh

# Strongylocentrotus purpuratus (California purple sea urchin)
# Baylor College of Medicine HGSC
# Spur_v3.1 (June 15th, 2011)

# Project ID: 13728
# http://www.ncbi.nlm.nih.gov/bioproject/13728

# Assembly   Type       Number   N50(kb)  Bases+Gaps(Mb)  Bases(Mb)
# Spur_v3.0  Scaffolds  31,238   404,330  936,069,451     816,170,552
# Spur_v3.0  Contigs    174,512  13,472   816,170,552     816,170,552

# Spur_3.1 (June, 2011)
# Contamination removed version of Spur_3.0.
#
# Spur_v3.0 (March, 2011)
# This release is an improved assembly using a variety of Illumina
# libraries with different mate-pair distances for scaffolding and gap
# filling.

# (Referencing hetGla1 for up-to-date procedures.)

################################################################
# Downloads Sequence (DONE - 2011-12-20 - kord)
# Fetch the Spur_v3.1 assembly tarball from SpBase, unpack it, and record
# the contig statistics reported by faSize.
mkdir -p /cluster/data/strPur4/downloads
cd /cluster/data/strPur4/downloads
# The URL is quoted because it contains '?', and -O names the output file
# explicitly: without it wget saves to "downloadfile.php?file=..." and the
# tar command below would not find Spur_v3.1_assembly.tar.
wget -O Spur_v3.1_assembly.tar \
  "http://www.spbase.org/SpBase/download/downloadfile.php?file=Spur_v3.1_assembly.tar"
tar -xvf Spur_v3.1_assembly.tar
# Spur_v3.1_assembly/
# Spur_v3.1_assembly/Contigs/
# Spur_v3.1_assembly/Contigs/acc_ctg_num.tbl.gz
# Spur_v3.1_assembly/Contigs/Spur_3.1.contig.fa.gz
# Spur_v3.1_assembly/Contigs/Spur_3.1.AGP.gz
# Spur_v3.1_assembly/Contigs/Spur_3.1.contig.qual.gz
# Spur_v3.1_assembly/README.Spur_3.1.txt
# Spur_v3.1_assembly/LinearScaffolds/
# Spur_v3.1_assembly/LinearScaffolds/Spur_3.1.LinearScaffold.fa.gz
# Spur_v3.1_assembly/LinearScaffolds/Spur_3.1.LinearScaffold.qual.gz

faSize Spur_v3.1_assembly/Contigs/Spur_3.1.contig.fa.gz
# 815995273 bases (59015 N's 815936258 real 815936258 upper 0 lower) in 174772 sequences in 1 files
# Total size: mean 4668.9 sd 8818.7 min 51 (AAGJ04169930.1) max 237383 (AAGJ04011348.1) median 1448
# N count: mean 0.3 sd 2.0
# U count: mean 4668.6 sd 8818.3
# L count: mean 0.0 sd 0.0
# %0.00 masked total, %0.00 masked real

cd ..
# Strip the accession version suffix ('.1') from the contig names.
# Header ('#') lines of the AGP are copied through untouched and written
# first; the remaining lines get the first ".1" removed.
contigDir=downloads/Spur_v3.1_assembly/Contigs
{
  zcat "$contigDir/Spur_3.1.AGP.gz" | grep "^#"
  zcat "$contigDir/Spur_3.1.AGP.gz" | grep -v "^#" | sed -e 's/\.1//'
} > ucsc.agp
gzip ucsc.agp

# Sequence file: keep only the scaffold/contig name on each header line by
# dropping ".1" and everything after it.
zcat "$contigDir/Spur_3.1.contig.fa.gz" | sed -e 's/\.1.*//' > ucsc.fa

# Quality file: same name clean-up as for the sequence file.
zcat "$contigDir/Spur_3.1.contig.qual.gz" | sed -e 's/\.1.*//' > ucsc.qual.fa

gzip ucsc.qual.fa ucsc.fa

faSize ucsc.fa.gz
# 815995273 bases (59015 N's 815936258 real 815936258 upper 0 lower) in 174772 sequences in 1 files
# Total size: mean 4668.9 sd 8818.7 min 51 (AAGJ04169930) max 237383 (AAGJ04011348) median 1448
# N count: mean 0.3 sd 2.0
# U count: mean 4668.6 sd 8818.3
# L count: mean 0.0 sd 0.0
# %0.00 masked total, %0.00 masked real

# Give the three prepared files their final ucsc.spur3_1.* names.
for suffix in agp fa qual.fa; do
  mv "ucsc.$suffix.gz" "ucsc.spur3_1.$suffix.gz"
done

#########################################################################
# Initial database build (DONE - 2011-12-20 - kord)
# List the existing orderKey values to pick one for the new assembly.
hgsql -e "select orderKey,name from dbDb order by orderKey;" hgcentraltest \
  | less
# 804 ci2
# 805 ci1
# 814 cioSav2
# 816 cioSav1
# 818 strPur3
# 819 strPur2
# 820 strPur1
# 822 ce10
# 823 ce9

cd /hive/data/genomes/strPur4
# The delimiter is quoted so the configuration text is written verbatim.
cat > strPur4.config.ra << '_EOF_'
# Config parameters for makeGenomeDb.pl:
db strPur4
clade deuterostome
genomeCladePriority 20
scientificName Strongylocentrotus purpuratus
commonName S. purpuratus
assemblyDate Jun. 2011
assemblyLabel Baylor College of Medicine Human Genome Sequencing Center (BCM-HGSC) Spur_v3.1
assemblyShortLabel BCM-HGSC Spur_v3.1
orderKey 817
mitoAcc NC_001453
fastaFiles /hive/data/genomes/strPur4/ucsc.spur3_1.fa.gz
agpFiles /hive/data/genomes/strPur4/ucsc.spur3_1.agp.gz
#qualFiles /hive/data/genomes/strPur4/ucsc.spur3_1.qual.fa.gz
dbDbSpeciesDir urchin
taxId 7668
subsetLittleIds Y
_EOF_

# Runs in the background; all output goes to makeGenomeDb.log.
time makeGenomeDb.pl -noGoldGapSplit -workhorse=hgwdev strPur4.config.ra \
  > makeGenomeDb.log 2>&1 &
# real 6m34.244s
# user 0m0.078s
# sys 0m0.084s

# NOTES -- STUFF THAT YOU WILL HAVE TO DO --
#
# Template trackDb.ra and .html's have been created, but they all need editing!
#
# cd /cluster/data/strPur4/TemporaryTrackDbCheckout/kent/src/hg/makeDb/trackDb/urchin/strPur4
#
# Search for '***' notes in each file in and make corrections (sometimes the
# files used for a previous assembly might make a better template):
# description.html /cluster/data/strPur4/html/{trackDb.ra,gap.html,gold.html}
#
# Then copy these files to your ~/kent/src/hg/makeDb/trackDb/urchin/strPur4
# - cd ~/kent/src/hg/makeDb/trackDb
# - edit makefile to add strPur4 to DBS.
# - git add urchin/strPur4/*.{ra,html}
# - git commit -m "Added strPur4 to DBS." makefile
# - git commit -m "Initial descriptions for strPur4." urchin/strPur4/*.{ra,html}
# - git pull; git push
# - Run make update DBS=strPur4 and make alpha when done.
# - (optional) Clean up /cluster/data/strPur4/TemporaryTrackDbCheckout

# Expose the unmasked 2bit under /gbdb until a masked one is available.
ln -s "$(pwd)/strPur4.unmasked.2bit" /gbdb/strPur4/strPur4.2bit

# Copy the generated trackDb templates into the personal kent checkout.
trackDbSub=kent/src/hg/makeDb/trackDb/urchin/strPur4
mkdir "/cluster/home/kord/$trackDbSub"
cp /cluster/data/strPur4/TemporaryTrackDbCheckout/"$trackDbSub"/* \
  "/cluster/home/kord/$trackDbSub"

#########################################################################
# RepeatMasker (DONE 2011-12-20 - kord)
# Runs doRepeatMasker.pl in the background from its own build directory;
# progress and errors are written to do.log.
rmskDir=/hive/data/genomes/strPur4/bed/repeatMasker
mkdir "$rmskDir"
cd "$rmskDir"
time nice -n +19 doRepeatMasker.pl \
  -buildDir="$(pwd)" \
  -workhorse=hgwdev \
  -bigClusterHub=swarm \
  -noSplit \
  strPur4 > do.log 2>&1 &
# real 142m59.141s
# user 0m0.243s
# sys 0m0.363s

cat faSize.rmsk.txt
# 936580645 bases (120628737 N's 815951908 real 671935715 upper 144016193 lower)
# in 32009 sequences in 1 files
# Total size: mean 29259.9 sd 117676.2 min 116 (Scaffold17086) max 2423607
# (Scaffold5) median 1006
# N count: mean 3768.6 sd 15822.1
# U count: mean 20992.1 sd 87536.4
# L count: mean 4499.2 sd 17290.8
# %15.38 masked total, %17.65 masked real

grep -i versi do.log
# RepeatMasker version development-$Id: RepeatMasker,v 1.26 2011/09/26 16:19:44 angie Exp $
# grep version of RepeatMasker$ /scratch/data/RepeatMasker/RepeatMasker
# April 26 2011 (open-3-3-0) version of RepeatMasker

# NOTE(review): the command below queries strPur2, and its reported total
# (907085737) is not the strPur4 size shown by faSize above (936580645) --
# confirm whether strPur4 was intended.
featureBits -countGaps strPur2 rmsk
# 115258247 bases of 907085737 (12.706%) in intersection
# why is it different than the faSize above ?
# because rmsk masks out some N's as well as bases, the count above
# separates out the N's from the bases, it doesn't show lower case N's

#########################################################################
# simpleRepeats (DONE - 2011-12-21 - Kord)
# Runs doSimpleRepeat.pl in the background from its own build directory;
# progress and errors are written to do.log.
trfDir=/hive/data/genomes/strPur4/bed/simpleRepeat
mkdir "$trfDir"
cd "$trfDir"
time nice -n +19 doSimpleRepeat.pl \
  -buildDir="$(pwd)" \
  -workhorse=hgwdev \
  -dbHost=hgwdev \
  -bigClusterHub=swarm \
  -smallClusterHub=memk \
  strPur4 > do.log 2>&1 &
# real 93m24.992s
# user 0m0.085s
# sys 0m0.205s

# Closing message printed by the run:
# *** All done!
# *** Steps were performed in /hive/data/genomes/strPur4/bed/simpleRepeat
# *** Please check the log file to see if trf had warnings that we
# should pass on to Gary Benson (ideally with a small test case).
# *** IMPORTANT: Developer, it is up to you to make use of trfMask.bed!
# Here is a typical command sequence, to be run after RepeatMasker
# (or WindowMasker etc.) has completed:
#
# cd /hive/data/genomes/strPur4
# twoBitMask strPur4.rmsk.2bit -add bed/simpleRepeat/trfMask.bed strPur4.2bit
#
# *** Again, it is up to you to incorporate trfMask.bed into the final 2bit!
cat fb.simpleRepeat
# 63920123 bases of 816010923 (7.833%) in intersection

# add to rmsk after it is done:
cd /hive/data/genomes/strPur4
twoBitMask strPur4.rmsk.2bit -add bed/simpleRepeat/trfMask.bed strPur4.2bit
# safe to ignore warnings about >=13 fields
twoBitToFa strPur4.2bit stdout | faSize stdin > strPur4.2bit.faSize.txt
cat strPur4.2bit.faSize.txt
# double check with featureBits
featureBits -countGaps strPur4 gap
# (the report recorded below is in faSize format, i.e. the content of
# strPur4.2bit.faSize.txt; no featureBits result was recorded)
# 936580645 bases (120628737 N's 815951908 real 670818208 upper 145133700 lower)
# in 32009 sequences in 1 files
# Total size: mean 29259.9 sd 117676.2 min 116 (Scaffold17086) max 2423607
# (Scaffold5) median 1006
# N count: mean 3768.6 sd 15822.1
# U count: mean 20957.2 sd 87408.0
# L count: mean 4534.2 sd 17420.4
# %15.50 masked total, %17.79 masked real

# replace the /gbdb link to the unmasked 2bit with the masked one
rm /gbdb/strPur4/strPur4.2bit
ln -s "$(pwd)/strPur4.2bit" /gbdb/strPur4/strPur4.2bit

#########################################################################
# SPBASE.ORG gene predictions (DONE - 2011-12-21 - Kord)
# Download the SpBase 3.1 GFF3 files, convert the GLEAN and qiang sets to a
# form ldHgGene accepts, load them, and build the peptide tables.
cd /cluster/data/strPur4/downloads
# The URL is quoted because it contains '?', and -O names the output file
# explicitly: without it wget saves to "downloadfile.php?file=..." and the
# tar command below would not find SpBase3.1.gff3.tar.
wget -O SpBase3.1.gff3.tar \
  "http://www.spbase.org/SpBase/download/downloadfile.php?file=SpBase3.1.gff3.tar"
tar -xvf SpBase3.1.gff3.tar
# SpBase3.1.gff3/
# SpBase3.1.gff3/GLEAN-3.1.gff3-chado-UTR
# SpBase3.1.gff3/contigs-3.1.gff3
# SpBase3.1.gff3/EST-3.1.gff3
# SpBase3.1.gff3/region-3.1.gff3
# SpBase3.1.gff3/BAC-END.gff3~
# SpBase3.1.gff3/exact-match-3.0-3.1
# SpBase3.1.gff3/3.0-3.1-map-b
# SpBase3.1.gff3/qiang-3.1.gff3
# SpBase3.1.gff3/3.0-3.1-map
# SpBase3.1.gff3/BAC-END.gff3
# SpBase3.1.gff3/CVS/
# SpBase3.1.gff3/CVS/Repository
# SpBase3.1.gff3/CVS/Entries
# SpBase3.1.gff3/CVS/Root
# SpBase3.1.gff3/eBAC-3.1.gff3
# SpBase3.1.gff3/GLEAN-3.1.gff3-chado

mkdir ../gff
cd ../gff

# drop double quotes and the "gene_id " tag, and relabel exon features as CDS
sed -e 's/\"//g' -e 's/gene_id //g' -e 's/exon/CDS/g' \
  ../downloads/SpBase3.1.gff3/GLEAN-3.1.gff3-chado-UTR \
  > GLEAN-3.1.gff3-chado-UTR.ucsc.gff3
gffPeek GLEAN-3.1.gff3-chado-UTR.ucsc.gff3
# source:
#   GLEAN3 245885
# feature:
#   3UTR 706
#   CDS 186609
#   gene 29285
#   transcript 29285
# strand:
#   + 123018
#   - 122867
# frame:
#   . 245885

# keep only the first ';'-separated attribute, remove the "ID=" tag, cut at
# the first ':' and strip a trailing "gn" / "tr" from each line
cut -d';' -f1 GLEAN-3.1.gff3-chado-UTR.ucsc.gff3 \
  | sed 's/ID=//g' \
  | cut -d':' -f1 \
  | sed -e 's/gn$//g' -e 's/tr$//g' \
  > GLEAN-3.1.gff3-chado-UTR.ucsc.clean.gff3
./numberDuplicatesInGff.py -i GLEAN-3.1.gff3-chado-UTR.ucsc.clean.gff3 \
  -o GLEAN-3.1.gff3-chado-UTR.ucsc.clean.fixDups.gff3
# Corrected 163 duplicate names in 29285 genes.

# Remove the transcript and gene lines. -E is required for the '|'
# alternation: with plain grep the pattern is a literal string and nothing
# is filtered. The ldHgGene report recorded below (all 245885 lines, 4
# feature types) comes from the original run, which lacked -E.
grep -Ev "transcript|gene" GLEAN-3.1.gff3-chado-UTR.ucsc.clean.fixDups.gff3 \
  > GLEAN-3.1.gff3-chado-UTR_gene.gff3
ldHgGene strPur4 GLEAN_3_1_chado_UTR_gene GLEAN-3.1.gff3-chado-UTR_gene.gff3
# Reading GLEAN-3.1.gff3-chado-UTR.ucsc.clean.fixDups.gff3
# Read 29285 transcripts in 245885 lines in 1 files
# 29285 groups 4213 seqs 1 sources 4 feature types
# 29122 gene predictions

# create peptide predictions
twoBitToFa ../strPur4.rmsk.2bit ../strPur4.rmsk.fa
hgPepPred strPur4 generic GLEAN_3_1_chado_UTR_genePep \
  /hive/data/genomes/strPur4/strPur4.rmsk.fa
# Processing /hive/data/genomes/strPur4/strPur4.rmsk.fa

## Adding a second gene prediction set for comparison from spbase.org (spbase6)
# The input is read from the unpacked tarball, where the listing above shows
# it (SpBase3.1.gff3/qiang-3.1.gff3); nothing above copies it into this
# directory.
sed 's/exon/CDS/g' ../downloads/SpBase3.1.gff3/qiang-3.1.gff3 \
  | cut -d';' -f1 \
  | sed 's/ID=//g' \
  | cut -d':' -f1 \
  | sed 's/\"//g' \
  | grep -Ev transcript > qiang-3.1.cds.gff3
ldHgGene strPur4 qiang_3_1_gene qiang-3.1.cds.gff3
# Reading qiang-3.1.cds.gff3
# Read 50161 transcripts in 255110 lines in 1 files
# 50161 groups 3838 seqs 1 sources 2 feature types
# 29070 gene predictions

# NOTE(review): this peptide table is named ...genePred while the GLEAN one
# above is named ...genePep -- confirm the name is intentional.
hgPepPred strPur4 generic qiang_3_1_genePred \
  /hive/data/genomes/strPur4/strPur4.rmsk.fa

#########################################################################
# Create cpgIslandExt (DONE - 2011-12-21 - Kord)
mkdir /cluster/data/strPur4/bed/cpgIslands
cd /cluster/data/strPur4/bed/cpgIslands
# runs in the background; output goes to strPur4eMods.log
doCpgIslands.pl strPur4 -dbHost=hgwdev -workhorse=hgwdev \
  -buildDir="$(pwd)" > strPur4eMods.log 2>&1 &

featureBits strPur4 cpgIslandExt
# 8293997 bases of 816010923 (1.016%) in intersection

##########################################################################
##
# WINDOWMASKER (DONE - 2011-12-21 - Kord)
# Run WindowMasker, load the sdust track, strip gap regions from the mask,
# and produce a cleanly masked 2bit.
mkdir /hive/data/genomes/strPur4/bed/windowMasker
cd /hive/data/genomes/strPur4/bed/windowMasker
# runs in the background; output goes to do.log
time nice -n +19 doWindowMasker.pl -buildDir="$(pwd)" -workhorse=hgwdev \
  -dbHost=hgwdev strPur4 > do.log 2>&1 &
# real 56m3.925s
# user 0m0.068s
# sys 0m0.148s

twoBitToFa strPur4.wmsk.2bit stdout | faSize stdin
# 936580645 bases (120628737 N's 815951908 real 513579450 upper 302372458 lower) in 32009 sequences in 1 files
# Total size: mean 29259.9 sd 117676.2 min 116 (Scaffold17086) max 2423607 (Scaffold5) median 1006
# %32.28 masked total, %37.06 masked real

twoBitToFa strPur4.wmsk.sdust.2bit stdout | faSize stdin
# 936580645 bases (120628737 N's 815951908 real 506302518 upper 309649390 lower) in 32009 sequences in 1 files
# Total size: mean 29259.9 sd 117676.2 min 116 (Scaffold17086) max 2423607 (Scaffold5) median 1006
# %33.06 masked total, %37.95 masked real

hgLoadBed strPur4 windowmaskerSdust windowmasker.sdust.bed.gz
# Loaded 5445810 elements of size 3

featureBits -countGaps strPur4 windowmaskerSdust
# 430238784 bases of 936580645 (45.937%) in intersection

# eliminate the gaps from the masking
featureBits strPur4 -not gap -bed=notGap.bed
# 816010923 bases of 816010923 (100.000%) in intersection
time nice -n +19 featureBits strPur4 windowmaskerSdust notGap.bed \
  -bed=stdout | gzip -c > cleanWMask.bed.gz
# 309669062 bases of 816010923 (37.949%) in intersection
#
# real 16m13.749s
# user 14m47.332s
# sys 0m56.948s

# reload track to get it clean
hgLoadBed strPur4 windowmaskerSdust cleanWMask.bed.gz
# Loaded 5453517 elements of size 4
featureBits -countGaps strPur4 windowmaskerSdust
# 309669062 bases of 936580645 (33.064%) in intersection

# mask the sequence with this clean mask
zcat cleanWMask.bed.gz \
  | twoBitMask ../../strPur4.unmasked.2bit stdin \
    -type=.bed strPur4.cleanWMSdust.2bit
twoBitToFa strPur4.cleanWMSdust.2bit stdout | faSize stdin \
  > strPur4.cleanWMSdust.faSize.txt
cat strPur4.cleanWMSdust.faSize.txt
# 936580645 bases (120628737 N's 815951908 real 506302518 upper 309649390 lower) in 32009 sequences in 1 files
# Total size: mean 29259.9 sd 117676.2 min 116 (Scaffold17086) max 2423607 (Scaffold5) median 1006
# %33.06 masked total, %37.95 masked real

# how much does this window masker and repeat masker overlap:
featureBits -countGaps strPur4 rmsk windowmaskerSdust
# 110294593 bases of 936580645 (11.776%) in intersection

#########################################################################
# MASK SEQUENCE WITH WM+TRF (DONE - 2011-12-21 - Kord)
cd /hive/data/genomes/strPur4
twoBitMask -add bed/windowMasker/strPur4.cleanWMSdust.2bit \
  bed/simpleRepeat/trfMask.bed strPur4.2bit
# safe to ignore the warnings about BED file with >=13 fields
twoBitToFa strPur4.2bit stdout | faSize stdin > faSize.strPur4.txt
cat faSize.strPur4.txt
# 936580645 bases (120628737 N's 815951908 real 505914972 upper 310036936 lower) in 32009 sequences in 1 files
# Total size: mean 29259.9 sd 117676.2 min 116 (Scaffold17086) max 2423607 (Scaffold5) median 1006
# %33.10 masked total, %38.00 masked real

# create symlink to gbdb
# (interactive login; the commands that follow were typed on hgwdev)
ssh hgwdev
rm /gbdb/strPur4/strPur4.2bit
ln -s "$(pwd)/strPur4.2bit" /gbdb/strPur4/strPur4.2bit

###############################################################################
# Add Illumina alignments for S. droebachiensis (DONE - 2012-01-12 - Kord)
# Each alignment section links the BAM and its .bai index into /gbdb and
# registers the /gbdb path with hgBbiDbLink.
mkdir -p /gbdb/strPur4/Sdroe/ssaha2.sdro_d206a_KMK001.2011-12-22
cd /gbdb/strPur4/Sdroe/ssaha2.sdro_d206a_KMK001.2011-12-22
ln -s /hive/data/genomes/strPur4/sam/ssaha2.sdro_d206a_KMK001.2011-12-22/mapped.merge.sorted.bam
ln -s /hive/data/genomes/strPur4/sam/ssaha2.sdro_d206a_KMK001.2011-12-22/mapped.merge.sorted.bam.bai
hgBbiDbLink strPur4 bamSdroeGdna \
  /gbdb/strPur4/Sdroe/ssaha2.sdro_d206a_KMK001.2011-12-22/mapped.merge.sorted.bam

# add new strongylocentrotus group:
hgsql strPur4 -e "INSERT INTO grp VALUES ('strongylocentrotus', 'Strongylocentrotus spp. Assembly and Analysis', 6.6);"

make DBS=strPur4 update

###############################################################################
# Add Illumina alignments for S. pallidus (DONE - 2012-01-12 - Kord)
mkdir -p /gbdb/strPur4/Spall/ssaha2.spall_p20a_KMK002.2011-12-20
cd /gbdb/strPur4/Spall/ssaha2.spall_p20a_KMK002.2011-12-20
ln -s /hive/data/genomes/strPur4/sam/ssaha2.spall_p20a_KMK002.2011-12-20/mapped.merge.sorted.bam
ln -s /hive/data/genomes/strPur4/sam/ssaha2.spall_p20a_KMK002.2011-12-20/mapped.merge.sorted.bam.bai
hgBbiDbLink strPur4 bamSpallGdna \
  /gbdb/strPur4/Spall/ssaha2.spall_p20a_KMK002.2011-12-20/mapped.merge.sorted.bam

make DBS=strPur4 update

###############################################################################
# Add Illumina alignments for S. intermedius ( DONE - 2012-01-18 - Kord)
mkdir -p /gbdb/strPur4/Sint/ssaha2.sint_int1_KMK013.2012-01-06
cd /gbdb/strPur4/Sint/ssaha2.sint_int1_KMK013.2012-01-06
ln -s /hive/data/genomes/strPur4/sam/ssaha2.sint_int1_KMK013.2012-01-06/mapped.merge.sorted.bam
ln -s /hive/data/genomes/strPur4/sam/ssaha2.sint_int1_KMK013.2012-01-06/mapped.merge.sorted.bam.bai
hgBbiDbLink strPur4 bamSintGdna \
  /gbdb/strPur4/Sint/ssaha2.sint_int1_KMK013.2012-01-06/mapped.merge.sorted.bam

make DBS=strPur4 update

###############################################################################
# Add Illumina alignments for S. franciscanus ( DONE - 2012-03-16 - Kord)
mkdir -p /gbdb/strPur4/Sfran/ssaha2.sfra_sf4_KMK010.2012-02-18
cd /gbdb/strPur4/Sfran/ssaha2.sfra_sf4_KMK010.2012-02-18
ln -s /hive/data/genomes/strPur4/sam/ssaha2.sfra_sf4_KMK010.2012-02-18/mapped.merge.sorted.bam
ln -s /hive/data/genomes/strPur4/sam/ssaha2.sfra_sf4_KMK010.2012-02-18/mapped.merge.sorted.bam.bai
# table name and file path are separate arguments (the space between them
# was missing, which fused them into one word)
hgBbiDbLink strPur4 bamSfranGdna \
  /gbdb/strPur4/Sfran/ssaha2.sfra_sf4_KMK010.2012-02-18/mapped.merge.sorted.bam

make DBS=strPur4 update

###############################################################################
# Add Illumina alignments for S.
# nudus ( DONE - 2012-03-16 - Kord)
# Each alignment section links the BAM and its .bai index into /gbdb and
# registers the /gbdb path with hgBbiDbLink.
mkdir -p /gbdb/strPur4/Snud/ssaha2.snud_nu1_KMK011.2012-02-02
cd /gbdb/strPur4/Snud/ssaha2.snud_nu1_KMK011.2012-02-02
ln -s /hive/data/genomes/strPur4/sam/ssaha2.snud_nu1_KMK011.2012-02-02/mapped.merge.sorted.bam
ln -s /hive/data/genomes/strPur4/sam/ssaha2.snud_nu1_KMK011.2012-02-02/mapped.merge.sorted.bam.bai
hgBbiDbLink strPur4 bamSnudGdna \
  /gbdb/strPur4/Snud/ssaha2.snud_nu1_KMK011.2012-02-02/mapped.merge.sorted.bam

###############################################################################
# Add Illumina alignments for A. fragilis ( DONE - 2012-03-16 - Kord)
mkdir -p /gbdb/strPur4/Afrag/ssaha2.afra_af10_KMK012.2012-02-10
cd /gbdb/strPur4/Afrag/ssaha2.afra_af10_KMK012.2012-02-10
ln -s /hive/data/genomes/strPur4/sam/ssaha2.afra_af10_KMK012.2012-02-10/mapped.merge.sorted.bam
ln -s /hive/data/genomes/strPur4/sam/ssaha2.afra_af10_KMK012.2012-02-10/mapped.merge.sorted.bam.bai
hgBbiDbLink strPur4 bamAfraGdna \
  /gbdb/strPur4/Afrag/ssaha2.afra_af10_KMK012.2012-02-10/mapped.merge.sorted.bam

###############################################################################
# Add Illumina alignments for P. depressus ( DONE - 2012-03-27 - Kord)
mkdir -p /gbdb/strPur4/Pdep/ssaha2.pdep_pd5_KMK010.2012-03-21
cd /gbdb/strPur4/Pdep/ssaha2.pdep_pd5_KMK010.2012-03-21
ln -s /hive/data/genomes/strPur4/sam/ssaha2.pdep_pd5_KMK010.2012-03-21/mapped.merge.sorted.bam
ln -s /hive/data/genomes/strPur4/sam/ssaha2.pdep_pd5_KMK010.2012-03-21/mapped.merge.sorted.bam.bai
hgBbiDbLink strPur4 bamPdepGdna \
  /gbdb/strPur4/Pdep/ssaha2.pdep_pd5_KMK010.2012-03-21/mapped.merge.sorted.bam

###############################################################################
# Add Illumina alignments for H. pulcherrimus ( DONE - 2012-03-27 - Kord)
# NOTE(review): this header previously repeated "P. depressus" from the
# section above; the species name here is inferred from the Hpul / hpul_hp5
# paths and the bamHpulGdna table -- confirm.
mkdir -p /gbdb/strPur4/Hpul/ssaha2.hpul_hp5_KMK16.2012-03-14
cd /gbdb/strPur4/Hpul/ssaha2.hpul_hp5_KMK16.2012-03-14
ln -s /hive/data/genomes/strPur4/sam/ssaha2.hpul_hp5_KMK16.2012-03-14/mapped.merge.sorted.bam
ln -s /hive/data/genomes/strPur4/sam/ssaha2.hpul_hp5_KMK16.2012-03-14/mapped.merge.sorted.bam.bai
hgBbiDbLink strPur4 bamHpulGdna \
  /gbdb/strPur4/Hpul/ssaha2.hpul_hp5_KMK16.2012-03-14/mapped.merge.sorted.bam

########################################################################
# MAKE 11.OOC FILE FOR BLAT/GENBANK (DONE - 2012-01-18 - Kord)
# Use -repMatch=300, based on size -- for human we use 1024
# use the "real" number from the faSize measurement,
# hg19 is 2897316137, calculate the ratio factor for 1024:
calc \( 815995273 / 2897316137 \) \* 1024
# ( 815995273 / 2897316137 ) * 1024 = 288.397648
# round up to 300
cd /hive/data/genomes/strPur4
blat strPur4.2bit /dev/null /dev/null -tileSize=11 \
  -makeOoc=jkStuff/strPur4.11.ooc -repMatch=300
# Wrote 36670 overused 11-mers to jkStuff/strPur4.11.ooc

# copy all of this stuff to the klusters:
# there are no non-bridged gaps
hgsql -N -e "select bridge from gap;" strPur4 | sort | uniq -c
# 142764 yes
# If there were:
# cd /hive/data/genomes/oreNil1/jkStuff
# gapToLift oreNil1 nonBridged.lift -bedFile=nonBridged.bed
cd /hive/data/genomes/strPur4
mkdir /hive/data/staging/data/strPur4
cp -p jkStuff/strPur4.11.ooc chrom.sizes \
  strPur4.2bit /hive/data/staging/data/strPur4
# request rsync copy from cluster admin

##########################################################################
# BLATSERVERS ENTRY (DONE - 2012-01-18 - Kord)
# After getting a blat server assigned by the Blat Server Gods,
#
# blatx
# trans port 17844
# untrans port 17845

# The SQL is one single-quoted argument, so it may span lines as-is; the
# backslashes that used to end each line inside the quotes were passed to
# the client literally and have been dropped.
hgsql -e 'INSERT INTO blatServers (db, host, port, isTrans, canPcr)
  VALUES ("strPur4", "blatx", "17844", "1", "0");
  INSERT INTO blatServers (db, host, port, isTrans, canPcr)
  VALUES ("strPur4", "blatx", "17845", "0", "1");' \
  hgcentraltest
# mysql> SELECT * FROM blatServers WHERE db="strPur4";
# +---------+-------+-------+---------+--------+
# | db      | host  | port  | isTrans | canPcr |
# +---------+-------+-------+---------+--------+
# | strPur4 | blatx | 17844 | 1       | 0      |
# | strPur4 | blatx | 17845 | 0       | 1      |
# +---------+-------+-------+---------+--------+
# 2 rows in set (0.00 sec)

# test it with some sequence

###############################################################################
# Add Illumina alignments for S. intermedius chrM ( DONE - 2012-02-02 - Kord)
mkdir -p /gbdb/strPur4/Sint/ssaha2.sint_int1_KMK013.chrM.2012-01-31
# (was "cdp", a typo for cd; without the cd the links below land in the
# wrong directory)
cd /gbdb/strPur4/Sint/ssaha2.sint_int1_KMK013.chrM.2012-01-31
ln -s /hive/data/genomes/strPur4/sam/ssaha2.sint_int1_KMK013.chrM.2012-01-31/mapped.mq25.mated.sorted.bam
ln -s /hive/data/genomes/strPur4/sam/ssaha2.sint_int1_KMK013.chrM.2012-01-31/mapped.mq25.mated.sorted.bam.bai
# Register the /gbdb symlink, as every other alignment section does; the
# original command pointed at the /hive file the symlink refers to.
hgBbiDbLink strPur4 bamSintGdnaChrM \
  /gbdb/strPur4/Sint/ssaha2.sint_int1_KMK013.chrM.2012-01-31/mapped.mq25.mated.sorted.bam

###############################################################################
# Add the mitochondria genes ( Done - 2012-02-02 - Kord)
# bring them over from strPur2
# (interactive login; the commands that follow were typed on hgwdev)
ssh hgwdev
cd /hive/data/genomes/strPur4/gff
cp /hive/data/genomes/strPur2/gff/X12631.fix.gff .
# load the GFF
ldHgGene strPur4 mitoGene X12631.fix.gff
# Reading X12631.fix.gff
# Read 10 transcripts in 14 lines in 1 files
# 10 groups 1 seqs 1 sources 2 feature types
# 10 gene predictions

# create peptide predictions from the mitoGene table
cd ..
hgPepPred strPur4 generic mitoGenePep strPur4.rmsk.fa
# Processing strPur4.rmsk.fa

########################################################################
# codeml sites Pvalues
# Encode every ../download/codeml.<chrom>.wig into a .wig/.wib pair, load
# the .wig files as the codemlSites table, and link the .wib files into
# /gbdb.
mkdir -p /hive/data/genomes/strPur4/codeml/2012-04-01/download
mkdir -p /hive/data/genomes/strPur4/codeml/2012-04-01/load
cd /hive/data/genomes/strPur4/codeml/2012-04-01/load
# Iterate over the files with a glob instead of parsing ls output, and take
# the name between "codeml." and ".wig" with parameter expansion so that
# names containing dots are kept whole.
for wigIn in ../download/codeml.*.wig; do
  # an unmatched glob stays literal; skip it rather than encode nothing
  [ -e "$wigIn" ] || continue
  chrom=${wigIn#../download/codeml.}
  chrom=${chrom%.wig}
  wigEncode "$wigIn" "codeml.${chrom}.wig" "codeml.${chrom}.wib"
done
hgLoadWiggle -pathPrefix=/gbdb/strPur4/wib/codemlSites strPur4 codemlSites *.wig
# make sure the link directory exists, then replace any earlier links
mkdir -p /gbdb/strPur4/wib/codemlSites
rm -f /gbdb/strPur4/wib/codemlSites/*
ln -s "$(pwd)"/*.wib /gbdb/strPur4/wib/codemlSites

########################################################################
# SpBase.org EST alignments (TODO - TBD - Kord)

########################################################################
# Create kluster run files (TODO - TBD - Kord)

#########################################################################
# RefSeq alignments (TODO - TBD - Kord)

#########################################################################
# Genscan gene predictions (TODO - TBD - Kord)

#########################################################################
# LIFTOVER TO strPur2 (TODO - TBD - Kord )

#########################################################################
# LASTZ Drosophila droMel3 (TODO - TBD - Kord)

#########################################################################
# LASTZ Tunicate ci2 (TODO - TBD - Kord)