# for emacs: -*- mode: sh; -*- # This file describes browser build for the felCat5 # Felis catus - Cat # DATE: 13-Sep-2011 # ORGANISM: Felis catus # TAXID: 9685 # ASSEMBLY LONG NAME: Felis_catus-6.2 # ASSEMBLY SHORT NAME: Felis_catus-6.2 # ASSEMBLY SUBMITTER: International Cat Genome Sequencing Consortium # ASSEMBLY TYPE: Haploid # NUMBER OF ASSEMBLY-UNITS: 1 # ASSEMBLY ACCESSION: GCA_000181335.2 # FTP-RELEASE DATE: 20-Dec-2011 # rsync://ftp.ncbi.nlm.nih.gov/genbank/genomes/Eukaryotes/vertebrates_mammals/Felis_catus/Felis_catus-6.2/ # Genome ID: # http://www.ncbi.nlm.nih.gov/genome/78 # http://www.ncbi.nlm.nih.gov/bioproject/16726 # http://www.ncbi.nlm.nih.gov/nuccore/359264388 # http://www.ncbi.nlm.nih.gov/Traces/wgs/?val=AANG02 # Genome Coverage : 2x Sanger; 12x 454 # Taxonomy: # http://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=9685 # Mitochondrial sequence: # http://www.ncbi.nlm.nih.gov/bioproject/10762 - chrMt NC_001700.1 # Assembly ID: 320798 # http://www.ncbi.nlm.nih.gov/genome/assembly/320798/ ############################################################################# # fetch sequence from genbank (DONE - 2012-04-11 - Hiram) mkdir /hive/data/genomes/felCat5/genbank cd /hive/data/genomes/felCat5/genbank time rsync -a -P \ rsync://ftp.ncbi.nlm.nih.gov/genbank/genomes/Eukaryotes/vertebrates_mammals/Felis_catus/Felis_catus-6.2/ ./ # real 25m17.609s # measure sequence to be used here faSize Primary_Assembly/assembled_chromosomes/FASTA/*.fa.gz \ Primary_Assembly/unlocalized_scaffolds/FASTA/*.fa.gz \ Primary_Assembly/unplaced_scaffolds/FASTA/*.fa.gz # 2455524127 bases (91244929 N's 2364279198 real 2364279198 upper # 0 lower) in 5499 sequences in 39 files # Total size: mean 446540.1 sd 8151278.3 # min 205 (gi|361624552|gb|JH413284.1|) # max 239302903 (gi|362110686|gb|CM001378.1|) median 1243 ############################################################################# # fixup names for UCSC standards (DONE - 2012-04-11 - Hiram) mkdir /hive/data/genomes/felCat5/ucsc cd /hive/data/genomes/felCat5/ucsc ######################## Assembled Chromosomes cat << '_EOF_' > toUcsc.pl #!/bin/env perl use strict; use warnings; my %accToChr; open (FH, "<../genbank/Primary_Assembly/assembled_chromosomes/chr2acc") or die "can not read Primary_Assembly/assembled_chromosomes/chr2acc"; while (my $line = ) { next if ($line =~ m/^#/); chomp $line; my ($chrN, $acc) = split('\s+', $line); $accToChr{$acc} = $chrN; } close (FH); foreach my $acc (keys %accToChr) { my $chrN = $accToChr{$acc}; print "$acc $accToChr{$acc}\n"; open (FH, "zcat ../genbank/Primary_Assembly/assembled_chromosomes/AGP/chr${chrN}.agp.gz|") or die "can not read chr${chrN}.agp.gz"; open (UC, "|gzip -c >chr${chrN}.agp.gz") or die "can not write to chr${chrN}.agp.gz"; while (my $line = ) { if ($line =~ m/^#/) { print UC $line; } else { $line =~ s/^$acc/chr${chrN}/; print UC $line; } } close (FH); close (UC); open (FH, "zcat ../genbank/Primary_Assembly/assembled_chromosomes/FASTA/chr${chrN}.fa.gz|") or die "can not read chr${chrN}.fa.gz"; open (UC, "|gzip -c >chr${chrN}.fa.gz") or die "can not write to chr${chrN}.fa.gz"; while (my $line = ) { if ($line =~ m/^>/) { printf UC ">chr${chrN}\n"; } else { print UC $line; } } close (FH); close (UC); } '_EOF_' # << happy emacs chmod +x toUcsc.pl time ./toUcsc.pl # real 12m6.675s ######################## Unplaced scaffolds cat << '_EOF_' > unplaced.pl #!/bin/env perl use strict; use warnings; my $agpFile = "../genbank/Primary_Assembly/unplaced_scaffolds/AGP/unplaced.scaf.agp.gz"; my $fastaFile = "../genbank/Primary_Assembly/unplaced_scaffolds/FASTA/unplaced.scaf.fa.gz"; open (FH, "zcat $agpFile|") or die "can not read $agpFile"; open (UC, "|gzip -c >unplaced.agp.gz") or die "can not write to unplaced.agp.gz"; while (my $line = ) { if ($line =~ m/^#/) { print UC $line; } else { $line =~ s/\.1//; printf UC "chrUn_%s", $line; } } close (FH); close (UC); open (FH, "zcat $fastaFile|") or die "can not read $fastaFile"; open (UC, "|gzip -c >unplaced.fa.gz") or die "can not write to unplaced.fa.gz"; while (my $line = ) { if ($line =~ m/^>/) { chomp $line; $line =~ s/.*gb\|//; $line =~ s/\.1\|.*//; printf UC ">chrUn_$line\n"; } else { print UC $line; } } close (FH); close (UC); '_EOF_' # << happy emacs chmod +x unplaced.pl time ./unplaced.pl # real 0m3.062s # make sure none of the names got to be over 31 characers long: zcat unplaced.agp.gz | grep -v "^#" | cut -f1 | awk '{print length($0)}' \ | sort -rn | uniq -c | head # 5621 14 ######################## Unlocalized scaffolds cat << '_EOF_' > unlocalized.pl #!/bin/env perl use strict; use warnings; my %accToChr; my %chrNames; open (FH, "<../genbank/Primary_Assembly/unlocalized_scaffolds/unlocalized.chr2scaf") or die "can not read Primary_Assembly/unlocalized_scaffolds/unlocalized.chr2scaf"; while (my $line = ) { next if ($line =~ m/^#/); chomp $line; my ($chrN, $acc) = split('\s+', $line); $accToChr{$acc} = $chrN; $chrNames{$chrN} += 1; } close (FH); foreach my $chrN (keys %chrNames) { my $agpFile = "../genbank/Primary_Assembly/unlocalized_scaffolds/AGP/chr$chrN.unlocalized.scaf.agp.gz"; my $fastaFile = "../genbank/Primary_Assembly/unlocalized_scaffolds/FASTA/chr$chrN.unlocalized.scaf.fa.gz"; open (FH, "zcat $agpFile|") or die "can not read $agpFile"; open (UC, "|gzip -c >chr${chrN}_random.agp.gz") or die "can not write to chr${chrN}_random.agp"; while (my $line = ) { if ($line =~ m/^#/) { print UC $line; } else { chomp $line; my (@a) = split('\t', $line); my $acc = $a[0]; my $accNo1 = $acc; $accNo1 =~ s/.1$//; die "ERROR: acc not .1: $acc" if ($accNo1 =~ m/\./); die "ERROR: chrN $chrN not correct for $acc" if ($accToChr{$acc} ne $chrN); my $ucscName = "chr${chrN}_${accNo1}_random"; printf UC "%s", $ucscName; for (my $i = 1; $i < scalar(@a); ++$i) { printf UC "\t%s", $a[$i]; } printf UC "\n"; } } close (FH); close (UC); printf "chr%s\n", $chrN; open (FH, "zcat $fastaFile|") or die "can not read $fastaFile"; open (UC, "|gzip -c >chr${chrN}_random.fa.gz") or die "can not write to chr${chrN}_random.fa"; while (my $line = ) { if ($line =~ m/^>/) { chomp $line; my $acc = $line; $acc =~ s/.*gb\|//; $acc =~ s/\|.*//; my $accNo1 = $acc; $accNo1 =~ s/.1$//; die "ERROR: acc not .1: $acc" if ($accNo1 =~ m/\./); die "ERROR: chrN $chrN not correct for $acc" if ($accToChr{$acc} ne $chrN); my $ucscName = "chr${chrN}_${accNo1}_random"; printf UC ">$ucscName\n"; } else { print UC $line; } } close (FH); close (UC); } '_EOF_' # << happy emacs chmod +x unlocalized.pl time ./unlocalized.pl # real 0m6.459s # verify nothing lost from original: faSize *.fa.gz # 2455524127 bases (91244929 N's 2364279198 real 2364279198 upper # 0 lower) in 5499 sequences in 39 files # Total size: mean 446540.1 sd 8151278.3 min 205 (chrUn_JH413284) # max 239302903 (chrA1) median 1243 # make sure none of the names got to be over 31 characers long: zcat *.agp.gz | grep -v "^#" | cut -f1 | awk '{print length($0)}' \ | sort -rn | uniq -c | head # 2860 21 # 255 20 # 5621 14 # 1632 5 # 187 4 ############################################################################# # Initial database build (DONE - 2012-04-11 - Hiram) cd /hive/data/genomes/felCat5 cat << '_EOF_' > felCat5.config.ra # Config parameters for makeGenomeDb.pl: db felCat5 clade mammal genomeCladePriority 17 scientificName Felis catus commonName Cat assemblyDate Sep. 2011 assemblyLabel International Cat Genome Sequencing Consortium Felis_catus-6.2 (NCBI project 16726, accession AANG00000000.2, GCA_000181335.2) assemblyShortLabel ICGSC Felis_catus 6.2 ncbiAssemblyName Felis_catus-6.2 ncbiAssemblyId 320798 orderKey 2418 mitoAcc NC_001700 fastaFiles /hive/data/genomes/felCat5/ucsc/*.fa.gz agpFiles /hive/data/genomes/felCat5/ucsc/*.agp.gz # qualFiles none dbDbSpeciesDir cat taxId 9685 '_EOF_' # << happy emacs # verify sequence and agp are OK time makeGenomeDb.pl -workhorse=hgwdev -fileServer=hgwdev -dbHost=hgwdev \ -stop=agp felCat5.config.ra > agp.log 2>&1 # real 2m49.310s # verify OK: tail -1 agp.log # *** All done! (through the 'agp' step) time makeGenomeDb.pl -continue=db -workhorse=hgwdev -fileServer=hgwdev \ -dbHost=hgwdev felCat5.config.ra > db.log 2>&1 # real 18m37.337s ########################################################################## # running repeat masker (DONE - 2012-04-11 - Hiram) mkdir /hive/data/genomes/felCat5/bed/repeatMasker cd /hive/data/genomes/felCat5/bed/repeatMasker time doRepeatMasker.pl -buildDir=`pwd` -noSplit \ -bigClusterHub=swarm -dbHost=hgwdev -workhorse=hgwdev \ -smallClusterHub=encodek felCat5 > do.log 2>&1 & # about 17h 35m cat faSize.rmsk.txt # 2455541136 bases (91244929 N's 2364296207 real 1342935452 upper # 1021360755 lower) in 5500 sequences in 1 files # Total size: mean 446462.0 sd 8150539.1 min 205 (chrUn_JH413284) # max 239302903 (chrA1) median 1243 # %41.59 masked total, %43.20 masked real egrep -i "versi|relea" do.log # April 26 2011 (open-3-3-0) version of RepeatMasker # CC RELEASE 20110920; # RepeatMasker version development-$Id: RepeatMasker,v 1.26 2011/09/26 16:19:44 angie Exp $ featureBits -countGaps felCat5 rmsk # 1023331634 bases of 2455541136 (41.674%) in intersection # why is it different than the faSize above ? # because rmsk masks out some N's as well as bases, the count above # separates out the N's from the bases, it doesn't show lower case N's ########################################################################## # running simple repeat (DONE - 2012-04-11 - Hiram) mkdir /hive/data/genomes/felCat5/bed/simpleRepeat cd /hive/data/genomes/felCat5/bed/simpleRepeat time doSimpleRepeat.pl -buildDir=`pwd` -bigClusterHub=swarm \ -dbHost=hgwdev -workhorse=hgwdev -smallClusterHub=encodek \ felCat5 > do.log 2>&1 & # about 14 minutes cat fb.simpleRepeat # 39293231 bases of 2403678276 (1.635%) in intersection # add to rmsk after it is done: cd /hive/data/genomes/felCat5 twoBitMask felCat5.rmsk.2bit \ -add bed/simpleRepeat/trfMask.bed felCat5.2bit # you can safely ignore the warning about fields >= 13 twoBitToFa felCat5.2bit stdout | faSize stdin > faSize.felCat5.2bit.txt cat faSize.felCat5.2bit.txt # 2455541136 bases (91244929 N's 2364296207 real 1341962038 upper # 1022334169 lower) in 5500 sequences in 1 files # Total size: mean 446462.0 sd 8150539.1 min 205 (chrUn_JH413284) # max 239302903 (chrA1) median 1243 # %41.63 masked total, %43.24 masked real rm /gbdb/felCat5/felCat5.2bit ln -s `pwd`/felCat5.2bit /gbdb/felCat5/felCat5.2bit ######################################################################### # Verify all gaps are marked, add any N's not in gap as type 'other' # (DONE - 2012-04-11 - Hiram) mkdir /hive/data/genomes/felCat5/bed/gap cd /hive/data/genomes/felCat5/bed/gap time nice -n +19 findMotif -motif=gattaca -verbose=4 \ -strand=+ ../../felCat5.unmasked.2bit > findMotif.txt 2>&1 # real 0m24.662s grep "^#GAP " findMotif.txt | sed -e "s/^#GAP //" > allGaps.bed featureBits felCat5 -not gap -bed=notGap.bed # 2403678276 bases of 2403678276 (100.000%) in intersection # real 0m12.025s time featureBits felCat5 allGaps.bed notGap.bed -bed=new.gaps.bed # 39382069 bases of 2403678276 (1.638%) in intersection # real 2m51.724s # what is the highest index in the existing gap table: hgsql -N -e "select ix from gap;" felCat5 | sort -n | tail -1 # 264 cat << '_EOF_' > mkGap.pl #!/bin/env perl use strict; use warnings; my $ix=`hgsql -N -e "select ix from gap;" felCat5 | sort -n | tail -1`; chomp $ix; open (FH,") { my ($chrom, $chromStart, $chromEnd, $rest) = split('\s+', $line); ++$ix; printf "%s\t%d\t%d\t%d\tN\t%d\tother\tyes\n", $chrom, $chromStart, $chromEnd, $ix, $chromEnd-$chromStart; } close (FH); '_EOF_' # << happy emacs chmod +x ./mkGap.pl ./mkGap.pl > other.bed featureBits -countGaps felCat5 other.bed # 39382069 bases of 2455541136 (1.604%) in intersection wc -l other.bed # 195055 hgLoadBed -sqlTable=$HOME/kent/src/hg/lib/gap.sql \ -noLoad felCat5 otherGap other.bed # Read 195055 elements of size 8 from other.bed # starting with this many hgsql -e "select count(*) from gap;" felCat5 # 2528 hgsql felCat5 -e 'load data local infile "bed.tab" into table gap;' # result count: hgsql -e "select count(*) from gap;" felCat5 # 197583 # == 2528 + 195055 # verify we aren't adding gaps where gaps already exist # this would output errors if that were true: gapToLift -minGap=1 felCat5 nonBridged.lift -bedFile=nonBridged.bed # see example in danRer7.txt # there are no non-bridged gaps here: hgsql -N -e "select bridge from gap;" felCat5 | sort | uniq -c # 900 no # 196683 yes ########################################################################## ## WINDOWMASKER (DONE - 2012-04-11 - Hiram) mkdir /hive/data/genomes/felCat5/bed/windowMasker cd /hive/data/genomes/felCat5/bed/windowMasker time nice -n +19 doWindowMasker.pl -buildDir=`pwd` -workhorse=hgwdev \ -dbHost=hgwdev felCat5 > do.log 2>&1 & # about 2h 47m # Masking statistics twoBitToFa felCat5.wmsk.2bit stdout | faSize stdin # 2455541136 bases (91244929 N's 2364296207 real 1589489056 upper # 774807151 lower) in 5500 sequences in 1 files # Total size: mean 446462.0 sd 8150539.1 min 205 (chrUn_JH413284) # max 239302903 (chrA1) median 1243 # %31.55 masked total, %32.77 masked real twoBitToFa felCat5.wmsk.sdust.2bit stdout | faSize stdin # 2455541136 bases (91244929 N's 2364296207 real 1577768583 upper # 786527624 lower) in 5500 sequences in 1 files # Total size: mean 446462.0 sd 8150539.1 min 205 (chrUn_JH413284) # max 239302903 (chrA1) median 1243 # %32.03 masked total, %33.27 masked real hgLoadBed felCat5 windowmaskerSdust windowmasker.sdust.bed.gz # Read 13534078 elements of size 3 from windowmasker.sdust.bed.gz featureBits -countGaps felCat5 windowmaskerSdust # 877772553 bases of 2455541136 (35.747%) in intersection # eliminate the gaps from the masking featureBits felCat5 -not gap -bed=notGap.bed # 2364296207 bases of 2364296207 (100.000%) in intersection time nice -n +19 featureBits felCat5 windowmaskerSdust notGap.bed \ -bed=stdout | gzip -c > cleanWMask.bed.gz # 786527624 bases of 2364296207 (33.267%) in intersection # real 4m29.505s # reload track to get it clean hgLoadBed felCat5 windowmaskerSdust cleanWMask.bed.gz # Read 13586894 elements of size 4 from cleanWMask.bed.gz featureBits -countGaps felCat5 windowmaskerSdust # 786527624 bases of 2455541136 (32.031%) in intersection zcat cleanWMask.bed.gz \ | twoBitMask ../../felCat5.unmasked.2bit stdin \ -type=.bed felCat5.cleanWMSdust.2bit twoBitToFa felCat5.cleanWMSdust.2bit stdout | faSize stdin \ > felCat5.cleanWMSdust.faSize.txt cat felCat5.cleanWMSdust.faSize.txt # 2455541136 bases (91244929 N's 2364296207 real 1577768583 upper # 786527624 lower) in 5500 sequences in 1 files # Total size: mean 446462.0 sd 8150539.1 min 205 (chrUn_JH413284) # max 239302903 (chrA1) median 1243 # %32.03 masked total, %33.27 masked real # how much does this window masker and repeat masker overlap: featureBits -countGaps felCat5 rmsk windowmaskerSdust # 553445549 bases of 2455541136 (22.539%) in intersection ######################################################################### # MASK SEQUENCE WITH WM+TRF (DONE - 2012-04-11 - Hiram) # Not masking with WM here since RM masked well # since rmsk masks so very little of the genome, use the clean WM mask # on the genome sequence # cd /hive/data/genomes/felCat5 # twoBitMask -add bed/windowMasker/felCat5.cleanWMSdust.2bit \ # bed/simpleRepeat/trfMask.bed felCat5.2bit # safe to ignore the warnings about BED file with >=13 fields # twoBitToFa felCat5.2bit stdout | faSize stdin > faSize.felCat5.txt # cat faSize.felCat5.txt # 927696114 bases (111611440 N's 816084674 real 607935500 upper # 208149174 lower) in 5678 sequences in 1 files # Total size: mean 163384.3 sd 1922194.0 min 1000 (AERX01077754-1) # max 51042256 (chrLG7) median 2009 # %22.44 masked total, %25.51 masked real # create symlink to gbdb # rm /gbdb/felCat5/felCat5.2bit # ln -s `pwd`/felCat5.2bit /gbdb/felCat5/felCat5.2bit ######################################################################## # cpgIslands - (DONE - 2011-04-23 - Hiram) mkdir /hive/data/genomes/felCat5/bed/cpgIslands cd /hive/data/genomes/felCat5/bed/cpgIslands time doCpgIslands.pl felCat5 > do.log 2>&1 # real 28m6.499s cat fb.felCat5.cpgIslandExt.txt # 39523291 bases of 2364296207 (1.672%) in intersection ######################################################################### # genscan - (DONE - 2011-04-25 - Hiram) mkdir /hive/data/genomes/felCat5/bed/genscan cd /hive/data/genomes/felCat5/bed/genscan time doGenscan.pl felCat5 > do.log 2>&1 # real 275m20.499s # several failing jobs, as found from 'para problems' on swarm: ./runGsBig.csh chrA2 000 gtf/000/chrA2.gtf pep/000/chrA2.pep subopt/000/chrA2.bed ./runGsBig.csh chrB1 000 gtf/000/chrB1.gtf pep/000/chrB1.pep subopt/000/chrB1.bed ./runGsBig.csh chrC1 000 gtf/000/chrC1.gtf pep/000/chrC1.pep subopt/000/chrC1.bed ./runGsBig.csh chrD1 000 gtf/000/chrD1.gtf pep/000/chrD1.pep subopt/000/chrD1.bed ./runGsBig.csh chrD2 000 gtf/000/chrD2.gtf pep/000/chrD2.pep subopt/000/chrD2.bed # reruning with window size of 2000000 A2 completed # reruning with window size of 1500000 B1 C1 D1 D2 still failing # real 202m31.369s # reruning with window size of 1000000 # real 86m4.437s # reruning with window size of 500000 # reruning with window size of 250000 with -tmp=/dev/shm # real 62m34.760s # reruning with window size of 100000 with -tmp=/dev/shm # real 65m27.077s # reruning with window size of 50000 with -tmp=/dev/shm # real 64m44.787s # This is not working. Need to run these split mkdir /hive/data/genomes/felCat5/bed/genscan/splitRun cd /hive/data/genomes/felCat5/bed/genscan/splitRun gapToLift felCat5 felCat5.nonBridged.lift -bedFile=felCat5.nonBridged.bed for C in A2 B1 C1 D1 D2 do echo chr${C} cd /hive/data/genomes/felCat5/bed/genscan/splitRun grep -w "chr${C}" felCat5.nonBridged.lift | grep -v random \ | sed -e "s/chr${C}./chr${C}_/" > chr${C}.nonBridged.lift mkdir chr${C} faToTwoBit ../hardMaskedFa/000/chr${C}.fa chr${C}/chr${C}.2bit ~/kent/src/hg/utils/lft2BitToFa.pl chr${C}/chr${C}.2bit \ chr${C}.nonBridged.lift > chr${C}/chr${C}.nonBridged.fa cd /hive/data/genomes/felCat5/bed/genscan/splitRun/chr${C} mkdir split${C} faSplit sequence chr${C}.nonBridged.fa 100 split${C}/chr${C}_ done for C in A2 B1 C1 D1 D2 do echo chr${C} cd /hive/data/genomes/felCat5/bed/genscan/splitRun/chr${C} echo '#!/bin/sh' > cmdList.sh export NL=-1 ls split${C} | while read F do NL=`echo $NL | awk '{print $1+1}'` if [ "${NL}" -gt 7 ]; then NL=0 echo "echo waiting before $F" >> cmdList.sh echo wait >> cmdList.sh fi echo "/cluster/bin/x86_64/gsBig split${C}/${F} gtf/${F}.gtf -trans=pep/${F}.pep -subopt=subopt/${F}.bed -exe=/scratch/data/genscan/genscan -par=/scratch/data/genscan/HumanIso.smat -tmp=/dev/shm -window=2400000 &" done >> cmdList.sh echo "echo waiting at end" >> cmdList.sh echo "wait" >> cmdList.sh chmod +x cmdList.sh rm -fr gtf pep subopt mkdir gtf pep subopt done # running each of these individually # chrA2 real 18m18.896s # chrB1 real 26m41.138s # chrC1 real 27m18.315s # chrD1 real 16m40.449s # chrD2 real 9m49.652s # collecting the results: for C in chrA2 chrB1 chrC1 chrD1 chrD2 do cd /hive/data/genomes/felCat5/bed/genscan/splitRun/${C} cat gtf/${C}_*.gtf | liftUp -type=.gtf stdout ../${C}.nonBridged.lift error stdin \ | sed -e "s/${C}_0$[0-4]$\./${C}.\1/g" > ${C}.gtf cat subopt/${C}_*.bed | liftUp -type=.bed stdout ../${C}.nonBridged.lift error stdin \ | sed -e "s/${C}_0$[0-4]$\./${C}.\1/g" > ${C}.subopt.bed cat pep/${C}_*.pep | sed -e "s/${C}_0$[0-4]$\./${C}.\1/g" > ${C}.pep ls -l ../../gtf/00?/${C}.gtf ../../pep/00?/${C}.pep ../../subopt/00?/${C}.bed ls -l ${C}.gtf ${C}.pep ${C}.subopt.bed done # after verifying the sizes of the files seem same compared to what # happened in the main run: for C in chrA2 chrB1 chrC1 chrD1 chrD2 do cd /hive/data/genomes/felCat5/bed/genscan/splitRun/${C} ls -l ../../gtf/00?/${C}.gtf ../../pep/00?/${C}.pep ../../subopt/00?/${C}.bed ls -l ${C}.gtf ${C}.pep ${C}.subopt.bed cd /hive/data/genomes/felCat5/bed/genscan/splitRun done # this is tricky, it is counting on the file existing, empty or otherwise for C in chrA2 chrB1 chrC1 chrD1 chrD2 do cd /hive/data/genomes/felCat5/bed/genscan/splitRun/${C} D=`ls ../../gtf/00?/${C}.gtf` rm -f "${D}" cp -p ${C}.gtf "${D}" D=`ls ../../pep/00?/${C}.pep` rm -f "${D}" cp -p ${C}.pep "${D}" D=`ls ../../subopt/00?/${C}.bed` rm -f "${D}" cp -p ${C}.subopt.bed "${D}" cd /hive/data/genomes/felCat5/bed/genscan/splitRun done # Now, we can continue time doGenscan.pl -continue=makeBed -workhorse=hgwdev -dbHost=hgwdev \ felCat5 > makeBed.log 2>&1 # real 3m40.678s cat fb.felCat5.genscan.txt # 56450443 bases of 2364296207 (2.388%) in intersection cat fb.felCat5.genscanSubopt.txt # 50327551 bases of 2364296207 (2.129%) in intersection ######################################################################### # MAKE 11.OOC FILE FOR BLAT/GENBANK (DONE - 2012-05-04 - Hiram) # Use -repMatch=900, based on size -- for human we use 1024 # use the "real" number from the faSize measurement, # hg19 is 2897316137, calculate the ratio factor for 1024: calc $ 2364296207 / 2897316137 $ \* 1024 # ( 2364296207 / 2897316137 ) * 1024 = 835.614480 # round up to 850 (felCat4 was 700) cd /hive/data/genomes/felCat5 time blat felCat5.2bit /dev/null /dev/null -tileSize=11 \ -makeOoc=jkStuff/felCat5.11.ooc -repMatch=850 # Wrote 24227 overused 11-mers to jkStuff/felCat5.11.ooc # felCat4 had: Wrote 23033 overused 11-mers to jkStuff/felCat4.11.ooc # real 1m10.025s # there are non-bridged gaps, create lift file needed for genbank hgsql -N -e "select bridge from gap;" felCat5 | sort | uniq -c # 900 no # 196683 yes cd /hive/data/genomes/felCat5/jkStuff gapToLift felCat5 felCat5.nonBridged.lift -bedFile=felCat5.nonBridged.bed # largest non-bridged contig: awk '{print $3-$2,$0}' felCat5.nonBridged.bed | sort -nr | head # 22000847 chrA1 191124230 213125077 chrA1.58 ######################################################################### # AUTO UPDATE GENBANK (DONE - 2012-05-04 - Hiram) # examine the file: /cluster/data/genbank/data/organism.lst # for your species to see what counts it has for: # organism mrnaCnt estCnt refSeqCnt # Felis catus 1081 919 354 # to decide which "native" mrna or ests you want to specify in genbank.conf ssh hgwdev cd $HOME/kent/src/hg/makeDb/genbank git pull # edit etc/genbank.conf to add: # felCat5 felCat5.serverGenome = /hive/data/genomes/felCat5/felCat5.2bit felCat5.clusterGenome = /hive/data/genomes/felCat5/felCat5.2bit felCat5.ooc = /hive/data/genomes/felCat5/jkStuff/felCat5.11.ooc felCat5.lift = /hive/data/genomes/felCat5/jkStuff/felCat5.nonBridged.lift felCat5.perChromTables = no felCat5.refseq.mrna.native.pslCDnaFilter = ${ordered.refseq.mrna.native.pslCDnaFilter} felCat5.refseq.mrna.xeno.pslCDnaFilter = ${ordered.refseq.mrna.xeno.pslCDnaFilter} felCat5.genbank.mrna.native.pslCDnaFilter = ${ordered.genbank.mrna.native.pslCDnaFilter} felCat5.genbank.mrna.xeno.pslCDnaFilter = ${ordered.genbank.mrna.xeno.pslCDnaFilter} felCat5.genbank.est.native.pslCDnaFilter = ${ordered.genbank.est.native.pslCDnaFilter} felCat5.genbank.est.xeno.pslCDnaFilter = ${ordered.genbank.est.xeno.pslCDnaFilter} felCat5.downloadDir = felCat5 felCat5.refseq.mrna.native.load = yes felCat5.refseq.mrna.xeno.load = yes felCat5.refseq.mrna.xeno.loadDesc = yes # felCat5.upstreamGeneTbl = refGene # felCat5.upstreamMaf = multiz6way # /hive/data/genomes/felCat5/bed/multiz6way/species.list # end of section added to etc/genbank.conf git commit -m "adding felCat5 rat" etc/genbank.conf git push make etc-update ssh hgwdev # used to do this on "genbank" machine screen -S felCat5 # long running job managed in screen cd /cluster/data/genbank time nice -n +19 ./bin/gbAlignStep -initial felCat5 & # var/build/logs/2012.05.04-15:01:45.felCat5.initalign.log # real 1299m14.421s # load database when finished ssh hgwdev cd /cluster/data/genbank time nice -n +19 ./bin/gbDbLoadStep -drop -initialLoad felCat5 & # logFile: var/dbload/hgwdev/logs/2012.05.08-09:58:52.dbload.log # real 41m1.805s # enable daily alignment and update of hgwdev (DONE - 2012-02-09 - Hiram) cd ~/kent/src/hg/makeDb/genbank git pull # add felCat5 to: vi etc/align.dbs etc/hgwdev.dbs git commit -m "Added felCat5." etc/align.dbs etc/hgwdev.dbs git push make etc-update ######################################################################### # set default position to RHO gene displays (DONE - 2012-07-23 - Hiram) hgsql -e \ 'update dbDb set defaultPos="chrA2:53225502-53231571" where name="felCat5";' \ hgcentraltest ############################################################################ # pushQ entry (DONE - 2012-07-23 - Hiram) mkdir /hive/data/genomes/felCat5/pushQ cd /hive/data/genomes/felCat5/pushQ # Mark says don't let the transMap track get there time makePushQSql.pl felCat5 2> stderr.txt | grep -v transMap > felCat5.sql # real 3m42.309s scp -p felCat5.sql hgwbeta:/tmp ssh hgwbeta cd /tmp hgsql qapushq < felCat5.sql ########################################################################### # construct lift file Ensembl names to UCSC names (DONE - 2013-02-15 - Chin) cd /hive/data/genomes/felCat5/jkStuff cat ../chrom.sizes | while read L do ucName=`echo "${L}" | awk '{print $1}'` ucSize=`echo "${L}" | awk '{print $2}'` ensName=`echo $L | sed -e 's/^chrM/MT/; s/^chr[0-9A-Za-z]*_//; s/_random//; s/^chr//; s/^$[JA][HA][CZ0-9]*$/\1.1/;' | awk '{print $1}'` ensSize=`echo $L | awk '{print $2}'` echo -e "0\t$ensName\t$ensSize\t$ucName\t$ucSize" done > ensToUcsc.lift ############################################################################