# for emacs: -*- mode: sh; -*- # Anopheles gambiae -- complete chromosomes from Ensembl, apparently from # May 2004: # ftp://ftp.ensembl.org/pub/current_mosquito/data/fasta/dna # (NCBI still has just scaffolds from 2002 here...?! # ftp://ftp.ncbi.nih/gov/genbank/genomes/Anopheles_gambiae/Assembly_scaffolds) # # NOTE: this doc may have genePred loads that fail to include # the bin column. Please correct that for the next build by adding # a bin column when you make any of these tables: # # mysql> SELECT tableName, type FROM trackDb WHERE type LIKE "%Pred%"; # +-----------+---------------------+ # | tableName | type | # +-----------+---------------------+ # | ensGene | genePred ensPep | # | geneid | genePred geneidPep | # | genscan | genePred genscanPep | # +-----------+---------------------+ # DOWNLOAD SEQUENCE (DONE 6/7/04 angie) ssh kksilo mkdir /cluster/store7/anoGam1 cd /cluster/data ln -s /cluster/store7/anoGam1 anoGam1 cd /cluster/data/anoGam1 mkdir downloads cd downloads foreach c (2L 2R 3L 3R X UNKN) wget ftp://ftp.ensembl.org/pub/current_mosquito/data/fasta/dna/Anopheles_gambiae.MOZ2a.may.dna.chromosome.$c.fa.gz wget ftp://ftp.ensembl.org/pub/current_mosquito/data/fasta/dna/Anopheles_gambiae.MOZ2a.may.dna_rm.chromosome.$c.fa.gz end gunzip *.fa.gz # sanity check -- make sure the 2 versions are the same modulo masking: foreach f (*.dna.*) set g = `echo $f | sed -e 's/\.dna\./\.dna_rm\./'` faCmp $f $g end # Is their masking actually different? foreach f (*.dna.*) set g = `echo $f | sed -e 's/\.dna\./\.dna_rm\./'` faCmp -softMask $f $g end # Nope! ? Well, OK, clean up then. rm Ano*.dna_rm.* # Put in our usual chrom dir structure. foreach c (2L 2R 3L 3R X) mkdir ../$c sed -e 's/^>.*$/>'chr$c/ A*.dna.chromosome.$c.fa > ../$c/chr$c.fa end mkdir ../U sed -e 's/^>.*$/>'chrU/ A*.dna.chromosome.UNKN.fa > ../U/chrU.fa cd .. foreach c (?{,?}) diff $c/chr$c.fa downloads/A*.dna.*.$c*.fa end # download mitochondrion sequence mkdir M cd M # go to http://www.ncbi.nih.gov/ and search Genome for # "anopheles gambiae mitochondrion complete". That shows the gi number: # 5834911 # Use that number in the entrez linking interface to get fasta: wget -O chrM.fa \ 'http://www.ncbi.nlm.nih.gov/entrez/query.fcgi?cmd=Text&db=Nucleotide&uid=5834911&dopt=FASTA' # Edit chrM.fa: make sure the long fancy header line says it's the # Anopheles gambiae mitochondrion complete genome, and then replace the # header line with just ">chrM". cd .. # tidy up downloads nice gzip downloads/* & # GET ASSEMBLY INFO FROM ENSEMBL (DONE 6/7/04 angie) # Since there's no AGP, extract what we can from Ensembl... looks like # they give offsets of both their "chunks" and of the real scaffolds. ssh kksilo cd /cluster/data/anoGam1/bed/ensChunks wget ftp://ftp.ensembl.org/pub/current_mosquito/data/mysql/anopheles_gambiae_core_22_2b/assembly.txt.table.gz wget ftp://ftp.ensembl.org/pub/current_mosquito/data/mysql/anopheles_gambiae_core_22_2b/anopheles_gambiae_core_22_2b.sql.gz wget ftp://ftp.ensembl.org/pub/current_mosquito/data/mysql/anopheles_gambiae_core_22_2b/seq_region.txt.table.gz gunzip *.gz ssh hgwdev cd /cluster/data/anoGam1/bed/ensChunks hgsql -e "create database ensAnoGam" hgsql ensAnoGam < anopheles_gambiae_core_22_2b.sql hgsql ensAnoGam \ -e 'load data local infile "assembly.txt.table" into table assembly' hgsql ensAnoGam \ -e 'load data local infile "seq_region.txt.table" into table seq_region' hgsql ensAnoGam -e 'create table assembly1 \ select assembly.asm_seq_region_id,seq_region.name, \ assembly.asm_start,assembly.asm_end,assembly.cmp_start,\ assembly.cmp_end,assembly.ori from assembly,seq_region \ where assembly.cmp_seq_region_id = seq_region.seq_region_id' hgsql ensAnoGam -e 'create table assembly2 \ select seq_region.name as chrom, \ assembly1.asm_start as chromStart, assembly1.asm_end as chromEnd, \ assembly1.name, \ assembly1.cmp_start as fragStart, assembly1.cmp_end as fragEnd, \ assembly1.ori from assembly1,seq_region \ where assembly1.asm_seq_region_id = seq_region.seq_region_id' hgsql ensAnoGam -e 'update assembly2 set chrom = "U" where chrom = "UNKN"' hgsql ensAnoGam -e 'update assembly2 set chrom = concat("chr", chrom)' hgsql ensAnoGam -N -e 'select * from assembly2 order by chrom,chromStart' \ | perl -we '$i = 0; \ while (<>) { \ chomp; @w = split; $w[6] =~ s/1$//; $w[6] =~ s/^$/+/; ++$i; \ print "$w[0]\t$w[1]\t$w[2]\t$i\tW\t$w[3]\t$w[4]\t$w[5]\t$w[6]\n"; \ }' \ > ensAll.agp # Get just the scaffold entries, and put gap records in there... ssh kksilo cd /cluster/data/anoGam1 awk '$6 !~ /_[0-9]+$/ {print;}' ensAll.agp \ | perl -we '$i = 0; $prevChr = ""; \ while (<>) { \ @w = split; ++$i; \ if ($w[0] eq $prevChr && defined $prevEnd) { \ $gapEnd = $w[1] - 1; $size = $w[1] - $prevEnd; \ print "$w[0]\t$prevEnd\t$gapEnd\t$i\tN\t$size\tclone\tno\n"; $i++; \ } \ $prevEnd = $w[2] + 1; $prevChr = $w[0]; \ print; \ }' \ > ensScaffold.agp cd ../.. foreach c (?{,?}) grep "^chr$c" bed/ensChunks/ensScaffold.agp > $c/chr$c.agp end # clean up 0-length inappropriate chrM.agp: rm M/chrM.agp # checkAgpAndFa prints out way too much info -- keep the end/stderr only: foreach agp (?{,?}/chr*.agp) if (-e $agp) then set fa = $agp:r.fa echo checking consistency of $agp and $fa checkAgpAndFa $agp $fa | tail -1 endif end # clean up ensAnoGam database ssh hgwdev hgsql -e "drop database ensAnoGam" # BREAK UP SEQUENCE INTO 5 MB CHUNKS AT CHUNKS OF N'S (DONE 6/7/04 angie) # Since we didn't get real AGP with gap info, we don't have enough info # to help us break into ~5MB chunks at proper gap boundaries. So just # use runs of N's as gaps. ssh kksilo cd /cluster/data/anoGam1 foreach c (?{,?}) faSplit gap -minGapSize=100 -lift=$c/chr$c.lft \ $c/chr$c.fa 5000000 $c/chr${c}_ end # put chunks into usual "contig" dir structure foreach ctg (?{,?}/chr?{,?}_?{,?}.fa) mkdir $ctg:r mv $ctg $ctg:r/ end # MAKE JKSTUFF AND BED DIRECTORIES (DONE 6/7/04 angie) # This used to hold scripts -- better to keep them inline here so # they're in CVS. Now it should just hold lift file(s) and # temporary scripts made by copy-paste from this file. mkdir /cluster/data/anoGam1/jkStuff # This is where most tracks will be built: mkdir /cluster/data/anoGam1/bed # CREATING DATABASE (DONE 6/7/04 angie) # Create the database. ssh hgwdev # Make sure there is at least 5 gig free for the database df -h /var/lib/mysql hgsql -e 'create database anoGam1' # CREATING GRP TABLE FOR TRACK GROUPING (DONE 6/7/04 angie) ssh hgwdev echo "create table grp (PRIMARY KEY(NAME)) select * from hg16.grp" \ | hgsql anoGam1 # MAKE CHROMINFO TABLE WITH (TEMPORARILY UNMASKED) NIBS (DONE 6/7/04 angie) # Make nib/, unmasked until RepeatMasker and TRF steps are done. # Do this now so we can load up RepeatMasker and run featureBits; # can also load up other tables that don't depend on masking. ssh kksilo cd /cluster/data/anoGam1 mkdir nib foreach c (?{,?}) foreach f ($c/chr${c}{,_random}.fa) if (-e $f) then echo "nibbing $f" /cluster/bin/i386/faToNib $f nib/$f:t:r.nib endif end end # Make symbolic links from /gbdb/anoGam1/nib to the real nibs. ssh hgwdev mkdir -p /gbdb/anoGam1/nib foreach f (/cluster/data/anoGam1/nib/chr*.nib) ln -s $f /gbdb/anoGam1/nib end # Load /gbdb/anoGam1/nib paths into database and save size info. cd /cluster/data/anoGam1 hgsql anoGam1 < $HOME/kent/src/hg/lib/chromInfo.sql hgNibSeq -preMadeNib anoGam1 /gbdb/anoGam1/nib */chr*.fa echo "select chrom,size from chromInfo" | hgsql -N anoGam1 > chrom.sizes # take a look at chrom.sizes, should be 18 lines wc chrom.sizes # REPEATMASKER (DONE 6/8/04 angie) #- Split sequence into 500kb chunks, at gaps if possible: ssh kksilo cd /cluster/data/anoGam1 foreach c (?{,?}) foreach d ($c/chr${c}*_?{,?}) cd $d echo "splitting $d" set contig = $d:t faSplit gap $contig.fa 500000 ${contig}_ -lift=$contig.lft \ -minGapSize=100 cd ../.. end end #- Make the run directory and job list: cd /cluster/data/anoGam1 cat << '_EOF_' > jkStuff/RMAnopheles #!/bin/csh -fe cd $1 pushd . /bin/mkdir -p /tmp/anoGam1/$2 /bin/cp $2 /tmp/anoGam1/$2/ cd /tmp/anoGam1/$2 /cluster/bluearc/RepeatMasker/RepeatMasker -ali -s -spec anopheles $2 popd /bin/cp /tmp/anoGam1/$2/$2.out ./ if (-e /tmp/anoGam1/$2/$2.align) /bin/cp /tmp/anoGam1/$2/$2.align ./ if (-e /tmp/anoGam1/$2/$2.tbl) /bin/cp /tmp/anoGam1/$2/$2.tbl ./ if (-e /tmp/anoGam1/$2/$2.cat) /bin/cp /tmp/anoGam1/$2/$2.cat ./ /bin/rm -fr /tmp/anoGam1/$2/* /bin/rmdir --ignore-fail-on-non-empty /tmp/anoGam1/$2 /bin/rmdir --ignore-fail-on-non-empty /tmp/anoGam1 '_EOF_' # << this line makes emacs coloring happy chmod +x jkStuff/RMAnopheles mkdir RMRun cp /dev/null RMRun/RMJobs foreach c (?{,?}) foreach d ($c/chr${c}*_?{,?}) set ctg = $d:t foreach f ( $d/${ctg}_?{,?}.fa ) set f = $f:t echo /cluster/data/anoGam1/jkStuff/RMAnopheles \ /cluster/data/anoGam1/$d $f \ '{'check out line+ /cluster/data/anoGam1/$d/$f.out'}' \ >> RMRun/RMJobs end end end #- Do the run ssh kk9 cd /cluster/data/anoGam1/RMRun para create RMJobs para try, para check, para check, para push, para check,... #Completed: 747 of 747 jobs #Average job time: 1597s 26.61m 0.44h 0.02d #Longest job: 2788s 46.47m 0.77h 0.03d #Submission to last job: 20691s 344.85m 5.75h 0.24d #- Lift up the 500KB chunk .out's to 5MB ("pseudo-contig") level ssh kksilo cd /cluster/data/anoGam1 foreach d (*/chr*_?{,?}) set contig = $d:t echo $contig liftUp $d/$contig.fa.out $d/$contig.lft warn $d/${contig}_*.fa.out \ > /dev/null end #- Lift pseudo-contigs to chromosome level foreach c (?{,?}) echo lifting $c cd $c liftUp chr$c.fa.out chr$c.lft warn \ `awk '{print $2 "/" $2 ".fa.out";}' chr$c.lft` \ > /dev/null cd .. end #- Load the .out files into the database with: ssh hgwdev cd /cluster/data/anoGam1 hgLoadOut anoGam1 */chr*.fa.out # VERIFY REPEATMASKER RESULTS (DONE 6/9/04 angie) # Eyeball some repeat annotations in the browser, compare to lib seqs. # Run featureBits on anoGam1 and on a comparable genome build, and compare: ssh hgwdev featureBits anoGam1 rmsk #21785530 bases of 278268413 (7.829%) in intersection # compare to dm1: featureBits dm1 rmsk #15452875 bases of 126527731 (12.213%) in intersection # MAKE HGCENTRALTEST ENTRY AND TRACKDB TABLE (DONE 6/7/04 angie) # Warning: genome and organism fields must correspond # with defaultDb values hgsql -h genome-testdb hgcentraltest \ -e 'INSERT INTO dbDb \ (name, description, nibPath, organism, \ defaultPos, active, orderKey, genome, scientificName, \ htmlPath, hgNearOk, hgPbOk, sourceName) values \ ("anoGam1", "Feb. 2003", "/gbdb/anoGam1/nib", "A. gambiae", \ "chr3R:51042001-51053000", 1, 59, "A. gambiae", \ "Anopheles gambiae", "/gbdb/anoGam1/html/description.html", \ 0, 0, "MOZ2");' hgsql -h genome-testdb hgcentraltest \ -e 'INSERT INTO defaultDb (genome, name) values ("A. gambiae", "anoGam1");' # Make trackDb table so browser knows what tracks to expect: ssh hgwdev cd ~/src/hg/makeDb/trackDb cvs up -d -P # Edit that makefile to add anoGam1 in all the right places and do make update mkdir /gbdb/anoGam1/html # go public on genome-test cvs commit makefile make alpha # Add trackDb directories mkdir anopheles mkdir anopheles/anoGam1 cvs add anopheles cvs add anopheles/anoGam1 cvs commit anopheles/anoGam1 # GOLD AND GAP TRACKS (DONE 6/7/04 angie) ssh hgwdev cd /cluster/data/anoGam1 cp /dev/null chrom.lst foreach f (?{,?}/chr*.agp chrM) echo $f:t:r >> chrom.lst end hgGoldGapGl -noGl -chromLst=chrom.lst anoGam1 /cluster/data/anoGam1 . # featureBits fails if there's no chrM_gap, so make one: # echo "create table chrM_gap like chr1_gap" | hgsql anoGam1 # oops, that won't work until v4.1, so do this for the time being: hgsql anoGam1 -e 'create table chrM_gap select * from chr2L_gap where 0=1' # MAKE LIFTALL.LFT (DONE 6/8/04 angie) ssh kksilo cd /cluster/data/anoGam1 cat ?{,?}/chr*.lft > jkStuff/liftAll.lft # SIMPLE REPEATS (TRF) (DONE 6/8/04 angie) ssh kksilo mkdir /cluster/data/anoGam1/bed/simpleRepeat cd /cluster/data/anoGam1/bed/simpleRepeat mkdir trf cp /dev/null jobs.csh foreach d (/cluster/data/anoGam1/?{,?}/chr*_?{,?}) set ctg = $d:t foreach f ($d/${ctg}.fa) set fout = $f:t:r.bed echo $fout echo "/cluster/bin/i386/trfBig -trf=/cluster/bin/i386/trf $f /dev/null -bedAt=trf/$fout -tempDir=/tmp" \ >> jobs.csh end end tcsh jobs.csh >&! jobs.log & # check on this with tail -f jobs.log wc -l jobs.csh ls -1 trf | wc -l liftUp simpleRepeat.bed ../../jkStuff/liftAll.lft warn \ trf/*.bed > /dev/null # Load this into the database as so ssh hgwdev hgLoadBed anoGam1 simpleRepeat \ /cluster/data/anoGam1/bed/simpleRepeat/simpleRepeat.bed \ -sqlTable=$HOME/src/hg/lib/simpleRepeat.sql featureBits anoGam1 simpleRepeat #5668988 bases of 278268413 (2.037%) in intersection # FILTER SIMPLE REPEATS (TRF) INTO MASK (DONE 6/8/04 angie) # make a filtered version of the trf output: # keep trf's with period <= 12: ssh kksilo cd /cluster/data/anoGam1/bed/simpleRepeat mkdir trfMask foreach f (trf/*.bed) echo -n "filtering $f... " awk '{if ($5 <= 12) print;}' $f > trfMask/$f:t end # Lift up filtered trf output to chrom coords: mkdir trfMaskChrom foreach f (../../?{,?}/chr*.fa) set c = $f:t:r echo $c liftUp trfMaskChrom/$c.bed ../../jkStuff/liftAll.lft warn \ trfMask/${c}_[0-9]*.bed > /dev/null end # MASK FA USING REPEATMASKER AND FILTERED TRF FILES (DONE 6/8/04 angie) ssh kksilo cd /cluster/data/anoGam1 # Soft-mask (lower-case) the contig and chr .fa's, # then make hard-masked versions from the soft-masked. set trfCtg=bed/simpleRepeat/trfMask set trfChr=bed/simpleRepeat/trfMaskChrom foreach f (*/chr*.fa) echo "repeat- and trf-masking $f" maskOutFa -soft $f $f.out $f set chr = $f:t:r maskOutFa -softAdd $f $trfChr/$chr.bed $f echo "hard-masking $f" maskOutFa $f hard $f.masked end foreach c (?{,?}) echo "repeat- and trf-masking contigs of chr$c, chr${c}_random" foreach d ($c/chr*_?{,?}) set ctg=$d:t set f=$d/$ctg.fa maskOutFa -soft $f $f.out $f maskOutFa -softAdd $f $trfCtg/$ctg.bed $f maskOutFa $f hard $f.masked end end #- Rebuild the nib files, using the soft masking in the fa: foreach f (*/chr*.fa) faToNib -softMask $f nib/$f:t:r.nib end # Make one big 2bit file as well, and make a link to it in # /gbdb/anoGam1/nib because hgBlat looks there: faToTwoBit */chr*.fa anoGam1.2bit ssh hgwdev ln -s /cluster/data/anoGam1/anoGam1.2bit /gbdb/anoGam1/nib/ # MAKE DOWNLOADABLE SEQUENCE FILES (DONE 6/17/04 angie) ssh kksilo cd /cluster/data/anoGam1 #- Build the .zip files -- no genbank for now. cat << '_EOF_' > jkStuff/zipAll.csh rm -rf zip mkdir zip zip -j zip/chromAgp.zip [0-9A-Z]*/chr*.agp zip -j zip/chromOut.zip */chr*.fa.out zip -j zip/chromFa.zip */chr*.fa zip -j zip/chromFaMasked.zip */chr*.fa.masked cd bed/simpleRepeat zip ../../zip/chromTrf.zip trfMaskChrom/chr*.bed cd ../.. '_EOF_' # << this line makes emacs coloring happy csh ./jkStuff/zipAll.csh |& tee zipAll.log cd zip #- Look at zipAll.log to make sure all file lists look reasonable. #- Check zip file integrity: foreach f (*.zip) unzip -t $f > $f.test tail -1 $f.test end wc -l *.zip.test #- Copy the .zip files to hgwdev:/usr/local/apache/... ssh hgwdev cd /cluster/data/anoGam1/zip set gp = /usr/local/apache/htdocs/goldenPath/anoGam1 mkdir -p $gp/bigZips cp -p *.zip $gp/bigZips mkdir -p $gp/chromosomes foreach f ( ../*/chr*.fa ) nice zip -j $gp/chromosomes/$f:t.zip $f end cd $gp/bigZips nice md5sum *.zip > md5sum.txt cd $gp/chromosomes nice md5sum *.zip > md5sum.txt # Take a look at bigZips/* and chromosomes/*, update their README.txt's # PUT MASKED SEQUENCE OUT FOR CLUSTER RUNS (DONE 6/8/04 angie) ssh kkr1u00 # Chrom-level mixed nibs that have been repeat- and trf-masked: rm -rf /iscratch/i/anoGam1/nib mkdir -p /iscratch/i/anoGam1/nib cp -p /cluster/data/anoGam1/nib/chr*.nib /iscratch/i/anoGam1/nib iSync # bluearc too (rack 9 can't see iscratch): ssh kksilo rm -rf /cluster/bluearc/anoGam1/nib mkdir -p /cluster/bluearc/anoGam1/nib cp -p /cluster/data/anoGam1/nib/chr*.nib /cluster/bluearc/anoGam1/nib # LOAD ENSEMBL GENES (DONE 6/9/04 angie) mkdir /cluster/data/anoGam1/bed/ensembl cd /cluster/data/anoGam1/bed/ensembl # Get the ensembl gene data from # http://www.ensembl.org/Anopheles_gambiae/martview # Follow this sequence through the pages: # Page 1) Make sure that the Anopheles_gambiae choice is selected. Hit next. # Page 2) Uncheck the "Limit to" box in the region choice. Then hit next. # Page 3) Choose the "Structures" box. # Page 4) Choose GTF as the ouput. choose gzip compression. hit export. # Save as ensembl.gff.gz # Add "chr" to front of each line in the gene data gtf file to make # it compatible with our software. gunzip -c ensembl.gff.gz \ | perl -wpe 's/^UNKN/U/; s/^([2-3][LR]?|X|U)/chr$1/ \ || die "Line $. doesnt start with anopheles chrom:\n$_"' \ > ensGene.gtf ssh hgwdev ldHgGene -gtf -genePredExt anoGam1 ensGene \ /cluster/data/anoGam1/bed/ensembl/ensGene.gtf # ensGtp associates geneId/transcriptId/proteinId for hgPepPred and # hgKnownToSuper. Use ensMart to create it as above, except: # Page 3) Choose the "Features" box. In "Ensembl Attributes", check # Ensembl Gene ID, Ensembl Transcript ID, Ensembl Peptide ID. # Choose Text, tab-separated as the output format. Result name ensGtp. # Save file as ensGtp.txt.gz gunzip ensGtp.txt.gz hgsql anoGam1 < ~/kent/src/hg/lib/ensGtp.sql hgsql anoGam1 -e 'load data local infile "ensGtp.txt" into table ensGtp' # delete header line to avoid joinerCheck error hgsql anoGam1 -e 'delete from ensGtp where gene = "Ensembl Gene ID"' # Load Ensembl peptides: # Get them from ensembl as above in the gene section except for # Page 3) Choose the "Sequences" box. # Page 4) Transcripts/Proteins. Peptide. Format = FASTA. # Save file as ensemblPep.fa.gz gunzip -c ensemblPep.fa.gz \ | perl -wpe 's/^(>ENSANGT\d+\.\d+).*/$1/' \ > ensPep.fa hgPepPred anoGam1 generic ensPep ensPep.fa # PRODUCING GENSCAN PREDICTIONS (DONE 6/9/04 angie) # Run on small cluster -- genscan needs big mem. ssh hgwdev mkdir /cluster/data/anoGam1/bed/genscan cd /cluster/data/anoGam1/bed/genscan # Check out hg3rdParty/genscanlinux to get latest genscan: cvs co hg3rdParty/genscanlinux # Run on small cluster (more mem than big cluster). ssh kki cd /cluster/data/anoGam1/bed/genscan # Make 3 subdirectories for genscan to put their output files in mkdir gtf pep subopt # Generate a list file, genome.list, of all the hard-masked contigs that # *do not* consist of all-N's (which would cause genscan to blow up) cp /dev/null genome.list foreach f ( `ls -1S /cluster/data/anoGam1/*/chr*_*/chr*_?{,?}.fa.masked` ) egrep '[ACGT]' $f > /dev/null if ($status == 0) echo $f >> genome.list end wc -l genome.list # Create template file, gsub, for gensub2. For example (3-line file): cat << '_EOF_' > gsub #LOOP gsBig {check in line+ $(path1)} {check out line gtf/$(root1).gtf} -trans={check out line pep/$(root1).pep} -subopt={check out line subopt/$(root1).bed} -exe=hg3rdParty/genscanlinux/genscan -par=hg3rdParty/genscanlinux/HumanIso.smat -tmp=/tmp -window=2400000 #ENDLOOP '_EOF_' # << this line makes emacs coloring happy gensub2 genome.list single gsub jobList para create jobList para try, check, push, check, ... #Completed: 68 of 68 jobs #Average job time: 133s 2.21m 0.04h 0.00d #Longest job: 167s 2.78m 0.05h 0.00d #Submission to last job: 720s 12.00m 0.20h 0.01d # Convert these to chromosome level files as so: ssh kksilo cd /cluster/data/anoGam1/bed/genscan liftUp genscan.gtf ../../jkStuff/liftAll.lft warn gtf/*.gtf liftUp genscanSubopt.bed ../../jkStuff/liftAll.lft warn subopt/*.bed cat pep/*.pep > genscan.pep # Load into the database as so: ssh hgwdev cd /cluster/data/anoGam1/bed/genscan # Reloaded without -genePredExt 1/6/05: ldHgGene -gtf anoGam1 genscan genscan.gtf hgPepPred anoGam1 generic genscanPep genscan.pep hgLoadBed anoGam1 genscanSubopt genscanSubopt.bed # MYTOUCH FIX - jen - 2006-01-24 sudo mytouch anoGam1 genscanPep 0501071300.00 # SWAP BLASTZ MELANOGASTER-ANOPHELES TO ANOPHELES-MEL (DONE 6/10/04 angie) ssh kolossus mkdir /cluster/data/anoGam1/bed/blastz.dm1.swap.2004-06-10 cd /cluster/data/anoGam1/bed/blastz.dm1.swap.2004-06-10 set aliDir = /cluster/data/dm1/bed/blastz.anoGam1.2004-06-10 cp $aliDir/S1.len S2.len cp $aliDir/S2.len S1.len mkdir unsorted axtChrom cat $aliDir/axtChrom/chr*.axt \ | axtSwap stdin $aliDir/S1.len $aliDir/S2.len stdout \ | axtSplitByTarget stdin unsorted # Sort the shuffled .axt files. foreach f (unsorted/*.axt) echo sorting $f:t:r axtSort $f axtChrom/$f:t end du -sh $aliDir/axtChrom unsorted axtChrom #157M /cluster/data/dm1/bed/blastz.anoGam1.2004-06-10/axtChrom #157M unsorted #157M axtChrom rm -r unsorted # CHAIN MELANOGASTER BLASTZ (DONE 6/10/04 angie) # Run axtChain on little cluster ssh kki cd /cluster/data/anoGam1/bed/blastz.dm1.swap.2004-06-10 mkdir -p axtChain/run1 cd axtChain/run1 mkdir out chain ls -1S /cluster/data/anoGam1/bed/blastz.dm1.swap.2004-06-10/axtChrom/*.axt \ > input.lst cat << '_EOF_' > gsub #LOOP doChain {check in exists $(path1)} {check out line+ chain/$(root1).chain} {check out exists out/$(root1).out} #ENDLOOP '_EOF_' # << this line makes emacs coloring happy # Make our own linear gap file with reduced gap penalties: cat << '_EOF_' > ../../chickenHumanTuned.gap tablesize 11 smallSize 111 position 1 2 3 11 111 2111 12111 32111 72111 152111 252111 qGap 325 360 400 450 600 1100 3600 7600 15600 31600 56600 tGap 325 360 400 450 600 1100 3600 7600 15600 31600 56600 bothGap 625 660 700 750 900 1400 4000 8000 16000 32000 57000 '_EOF_' # << this line makes emacs coloring happy cat << '_EOF_' > doChain #!/bin/csh axtChain -scoreScheme=/cluster/data/blastz/HoxD55.q \ -linearGap=../../chickenHumanTuned.gap \ -verbose=0 $1 \ /iscratch/i/anoGam1/nib \ /cluster/bluearc/drosophila/dm1/nib $2 > $3 '_EOF_' # << this line makes emacs coloring happy chmod a+x doChain gensub2 input.lst single gsub jobList para create jobList para try, check, push, check... #Completed: 7 of 7 jobs #Average job time: 10s 0.17m 0.00h 0.00d #Longest job: 13s 0.22m 0.00h 0.00d #Submission to last job: 13s 0.22m 0.00h 0.00d # now on the cluster server, sort chains ssh kksilo cd /cluster/data/anoGam1/bed/blastz.dm1.swap.2004-06-10/axtChain chainMergeSort run1/chain/*.chain > all.chain chainSplit chain all.chain du -sh chain run1/chain rm run1/chain/*.chain # take a look at score distr's foreach f (chain/*.chain) grep chain $f | awk '{print $2;}' | sort -nr > /tmp/score.$f:t:r echo $f:t:r textHistogram -binSize=10000 /tmp/score.$f:t:r echo "" end # Load chains into database ssh hgwdev cd /cluster/data/anoGam1/bed/blastz.dm1.swap.2004-06-10/axtChain/chain foreach i (*.chain) set c = $i:r echo loading $c hgLoadChain anoGam1 ${c}_chainDm1 $i end # NET MELANOGASTER BLASTZ (DONE 6/10/04 angie) ssh kksilo cd /cluster/data/anoGam1/bed/blastz.dm1.swap.2004-06-10/axtChain chainPreNet all.chain ../S1.len ../S2.len stdout \ | chainNet stdin -minSpace=1 ../S1.len ../S2.len stdout /dev/null \ | netSyntenic stdin noClass.net # Add classification info using db tables: ssh hgwdev cd /cluster/data/anoGam1/bed/blastz.dm1.swap.2004-06-10/axtChain netClass -noAr noClass.net anoGam1 dm1 melanogaster.net # Make a 'syntenic' subset: ssh kksilo cd /cluster/data/anoGam1/bed/blastz.dm1.swap.2004-06-10/axtChain rm noClass.net netFilter -syn melanogaster.net > melanogasterSyn.net # Load the nets into database ssh hgwdev cd /cluster/data/anoGam1/bed/blastz.dm1.swap.2004-06-10/axtChain netFilter -minGap=10 melanogaster.net | hgLoadNet anoGam1 netDm1 stdin netFilter -minGap=10 melanogasterSyn.net | hgLoadNet anoGam1 netSyntenyDm1 stdin # MAKE AXTNET FOR DOWNLOAD (DONE 8/20/04 angie) ssh kksilo cd /cluster/data/anoGam1/bed/blastz.dm1.swap.2004-06-10/axtChain netSplit melanogaster.net net cd /cluster/data/anoGam1/bed/blastz.dm1.swap.2004-06-10 mkdir axtNet foreach f (axtChain/net/*) set chr = $f:t:r netToAxt $f axtChain/chain/$chr.chain /cluster/data/anoGam1/nib \ /cluster/data/dm1/nib stdout \ | axtSort stdin axtNet/$chr.axt end # MAKE VSDM1 DOWNLOADABLES (DONE 6/9/04 angie) ssh kksilo cd /cluster/data/anoGam1/bed/blastz.dm1.swap.2004-06-10/axtChain ln all.chain melanogaster.chain zip /cluster/data/anoGam1/zip/melanogaster.chain.zip melanogaster.chain rm melanogaster.chain zip /cluster/data/anoGam1/zip/melanogaster.net.zip melanogaster.net ssh hgwdev mkdir /usr/local/apache/htdocs/goldenPath/anoGam1/vsDm1 cd /usr/local/apache/htdocs/goldenPath/anoGam1/vsDm1 mv /cluster/data/anoGam1/zip/melanogaster*.zip . md5sum *.zip > md5sum.txt # Copy over & edit README.txt w/pointers to chain, net formats. # Add axtChrom.zip (as advertised in original README) and axtNet/*.gz # (new std offering) 8/20/04: mkdir axtNet foreach f (/cluster/data/anoGam1/bed/blastz.dm1.swap.2004-06-10/axtNet/*) gzip -c $f > axtNet/$f:t.gz end zip -j axtChrom.zip \ /cluster/data/anoGam1/bed/blastz.dm1.swap.2004-06-10/axtChrom/*.axt md5sum *.zip *.gz */*.gz > md5sum.txt # MAKE 11.OOC FILE FOR BLAT (DONE 8/2/04 angie) # Use -repMatch=100 (based on size -- for human we use 1024, and # mosquito size is ~9.2% of human) ssh kkr1u00 mkdir /cluster/data/anoGam1/bed/ooc cd /cluster/data/anoGam1/bed/ooc ls -1 /cluster/data/anoGam1/nib/chr*.nib > nib.lst blat nib.lst /dev/null /dev/null -tileSize=11 \ -makeOoc=/cluster/bluearc/anoGam1/11.ooc -repMatch=100 #Wrote 11692 overused 11-mers to /cluster/bluearc/anoGam1/11.ooc cp -p /cluster/bluearc/anoGam1/*.ooc /iscratch/i/anoGam1/ iSync # AUTO UPDATE GENBANK MRNA RUN (DONE 8/2/04 angie) ssh hgwdev # Update genbank config and source in CVS: cd ~/kent/src/hg/makeDb/genbank cvsup . # See if /cluster/data/genbank/etc/genbank.conf has had any un-checked-in # edits, check them in if necessary: diff /cluster/data/genbank/etc/genbank.conf etc/genbank.conf # Edit etc/genbank.conf and add these lines: # anoGam1 (A. gambiae) anoGam1.genome = /iscratch/i/anoGam1/nib/chr*.nib anoGam1.lift = /cluster/data/anoGam1/jkStuff/liftAll.lft anoGam1.refseq.mrna.native.load = no anoGam1.genbank.mrna.xeno.load = yes anoGam1.genbank.est.xeno.load = no anoGam1.downloadDir = anoGam1 cvs ci etc/genbank.conf # Since A. gambiae is a new species for us, edit src/lib/gbGenome.c. # Pick some other browser species, & monkey-see monkey-do. cvs diff src/lib/gbGenome.c make cvs ci src/lib/gbGenome.c # Edit src/align/gbBlat to add /iscratch/i/anoGam1/11.ooc cvs diff src/align/gbBlat make cvs ci src/align/gbBlat # Install to /cluster/data/genbank: make install-server ssh eieio cd /cluster/data/genbank # This is an -initial run, mRNA only: nice bin/gbAlignStep -srcDb=genbank -type=mrna -initial anoGam1 & tail -f [its logfile] # Load results: ssh hgwdev cd /cluster/data/genbank nice bin/gbDbLoadStep -verbose=1 -drop -initialLoad anoGam1 featureBits anoGam1 mrna #4879150 bases of 278268413 (1.753%) in intersection featureBits anoGam1 xenoMrna #9229323 bases of 278268413 (3.317%) in intersection # Clean up: rm -rf work/initial.anoGam1 ssh eieio # -initial for ESTs (now with /cluster/store7 and iservers): nice bin/gbAlignStep -srcDb=genbank -type=est -initial anoGam1 & tail -f [its logfile] # Load results: ssh hgwdev cd /cluster/data/genbank nice bin/gbDbLoadStep -verbose=1 anoGam1 & # Not bad... certainly better than, say, D. yakuba: featureBits anoGam1 intronEst #8712776 bases of 278268413 (3.131%) in intersection featureBits anoGam1 est #19664902 bases of 278268413 (7.067%) in intersection # Clean up: rm -rf work/initial.anoGam1 # MAKE HGCENTRALTEST BLATSERVERS ENTRY (DONE 7/30/04 angie) ssh hgwdev echo 'insert into blatServers values("anoGam1", "blat8", "17780", 1, 0); \ insert into blatServers values("anoGam1", "blat8", "17781", 0, 0);' \ | hgsql -h genome-testdb hgcentraltest # MAKE Drosophila Proteins track (Done 08-07-2004 braney) ssh kksilo mkdir -p /cluster/data/anoGam1/blastDb cd /cluster/data/anoGam1/blastDb for i in ../*/*/*_[0-9]*_[0-9]*.fa; do ln -s $i .; done for i in *.fa; do formatdb -i $i -p F 2> /dev/null; done rm *.fa *.log ssh kkr1u00 mkdir -p /iscratch/i/anoGam1/blastDb cp /cluster/data/anoGam1/blastDb/* /iscratch/i/anoGam1/blastDb (~kent/bin/iSync) 2>&1 > sync.out mkdir -p /cluster/data/anoGam1/bed/tblastn.dm1FB cd /cluster/data/anoGam1/bed/tblastn.dm1FB ls -1S /iscratch/i/anoGam1/blastDb/*.nsq | sed "s/\.nsq//" > skeeter.lst exit # back to kksilo cd /cluster/data/anoGam1/bed/tblastn.dm1FB mkdir fbfa # calculate a reasonable number of jobs (didn't actually do this for anoGam1) calc `wc /cluster/data/dm1/bed/blat.dm1FB/dm1FB.psl | awk "{print \\\$1}"`/\(164630/`wc skeeter.lst | awk "{print \\\$1}"`\) # 18735/(150000/463) = 57.828700 split -l 58 /cluster/data/dm1/bed/blat.dm1FB/dm1FB.psl fbfa/fb cd fbfa for i in *; do pslxToFa $i $i.fa; rm $i; done cd .. ls -1S fbfa/*.fa > fb.lst mkdir blastOut for i in `cat fb.lst`; do mkdir blastOut/`basename $i .fa`; done cat << '_EOF_' > blastGsub #LOOP blastSome $(path1) {check in line $(path2)} {check out exists blastOut/$(root2)/q.$(root1).psl } #ENDLOOP '_EOF_' cat << '_EOF_' > blastSome #!/bin/sh BLASTMAT=/iscratch/i/blast/data export BLASTMAT f=/tmp/`basename $3` for eVal in 0.01 0.001 0.0001 0.00001 0.000001 1E-09 1E-11 do if /scratch/blast/blastall -M BLOSUM80 -m 0 -F no -e $eVal -p tblastn -d $1 -i $2 -o $f.8 then mv $f.8 $f.1 break; fi done if test -f $f.1 then if /cluster/bin/i386/blastToPsl $f.1 $f.2 then liftUp -nosort -type=".psl" -nohead $f.3 /cluster/data/anoGam1/jkStuff/liftAll.lft warn $f.2 liftUp -nosort -type=".psl" -pslQ -nohead $3.tmp /iscratch/i/dm1/protein.lft warn $f.3 mv $3.tmp $3 rm -f $f.1 $f.2 $f.3 exit 0 fi fi rm -f $f.1 $f.2 $3.tmp $f.3 $f.8 exit 1 '_EOF_' chmod +x blastSome gensub2 skeeter.lst fb.lst blastGsub blastSpec ssh kk cd /cluster/data/anoGam1/bed/tblastn.dm1FB para create blastSpec para shove # jobs will need to be restarted, but they should all finish exit # back to kksilo cat << '_EOF_' > chainGsub #LOOP chainSome $(path1) #ENDLOOP '_EOF_' cat << '_EOF_' > chainSome (cd $1; cat q.*.psl | simpleChain -prot -outPsl -maxGap=20000 stdin ../c.`basename $1`.psl) '_EOF_' chmod +x chainSome ls -1dS `pwd`/blastOut/fb?? > chain.lst gensub2 chain.lst single chainGsub chainSpec ssh kki cd /cluster/data/anoGam1/bed/tblastn.dm1FB para create chainSpec para push exit # back to kksilo cd /cluster/data/anoGam1/bed/tblastn.dm1FB/blastOut for i in fb?? do awk "(\$13 - \$12)/\$11 > 0.6 {print}" c.$i.psl > c60.$i.psl sort -rn c60.$i.psl | pslUniq stdin u.$i.psl awk "((\$1 / \$11) ) > 0.60 { print }" c60.$i.psl > m60.$i.psl echo $i done cat u.*.psl m60* | sort -T /tmp -k 14,14 -k 16,16n -k 17,17n |uniq > /cluster/data/anoGam1/bed/tblastn.dm1FB/preblastDm1FB.psl cd .. blatDir=/cluster/data/dm1/bed/blat.dm1FB protDat -fb preblastDm1FB.psl $blatDir/dm1FB.psl $blatDir/dm1FB.mapNames blastDm1FB.psl ssh hgwdev cd /cluster/data/anoGam1/bed/tblastn.dm1FB hgLoadPsl anoGam1 blastDm1FB.psl exit # back to kksilo rm -rf blastOut # End tblastn # MAKE GCPERCENT (DONE 10/5/04 angie) ssh hgwdev mkdir /cluster/data/anoGam1/bed/gcPercent cd /cluster/data/anoGam1/bed/gcPercent # create and load gcPercent table hgsql anoGam1 < ~/src/hg/lib/gcPercent.sql hgGcPercent anoGam1 ../../nib # ANOEST FROM ZDOBNOV (DONE 6/2/05 angie) ssh hgwdev mkdir /cluster/data/anoGam1/bed/anoEst cd /cluster/data/anoGam1/bed/anoEst wget http://komar.embl.de/Data/ag_clusters_GFF.txt perl -wpe 'chomp; @words = split; \ if (scalar(@words) != 9 || $words[0] eq "") {$_ = ""; next;} \ $words[0] = "chr$words[0]"; \ $words[6] = "+" if ($words[6] eq "1"); \ $words[6] = "-" if ($words[6] eq "-1"); \ $words[8] = $words[8] . $words[6] if ($words[8] eq "NCLAG018843" \ || $words[8] eq "NCLAG026485" || $words[8] eq "NCLAG028206" \ || $words[8] eq "NCLAG030852" || $words[8] eq "TCLAG029693" \ || $words[8] eq "TCLAG068099"); \ $_ = join("\t", @words) . "\n";' \ ag_clusters_GFF.txt > anoEst.gff # Two tracks, separated by ID. From Evgeny's email 4/17/05: # TCLAG.. - clusters have at least one EST that 'uniquely' aligns # to this genomic region, and # NCLAG.. - all the rest that did not get into TCLAGs, eg mostly # transposon derived and some resent paralogs (or haplotype artifacts). # NCLAG.. hidden by default and TCLAG..s expanded. awk '$9 ~ /^TCLAG/ {print;}' anoEst.gff > anoEstTcl.gff awk '$9 ~ /^NCLAG/ {print;}' anoEst.gff > anoEstNcl.gff wc -l *.gff # 229268 anoEst.gff # 154371 anoEstNcl.gff # 74897 anoEstTcl.gff expr 154371 + 74897 #229268 # These are not really meant to be genePreds (no CDS/UTR), so # translate into bed 12 via genePred.tab. Also fix chrom names... ldHgGene anoGam1 anoEstTcl anoEstTcl.gff -out=stdout \ | $HOME/kent/src/utils/genePredToBed/genePredToBed \ | sed -e 's/^chrMIAGCG/chrM/; s/chrUNKN/chrU/;' \ > anoEstTcl.bed ldHgGene anoGam1 anoEstNcl anoEstNcl.gff -out=stdout \ | $HOME/kent/src/utils/genePredToBed/genePredToBed \ | sed -e 's/^chrMIAGCG/chrM/; s/chrUNKN/chrU/;' \ > anoEstNcl.bed # List of TCLAG* ids considered to be expressed, saved from email # 6/2/05 to /cluster/data/anoGam1/bed/anoEst/expressed_cl_ids.txt. # OK, load 'em up: hgLoadBed anoGam1 anoEstTcl anoEstTcl.bed hgLoadBed anoGam1 anoEstNcl anoEstNcl.bed # Scores are not real so set to 0: hgsql anoGam1 -e 'update anoEstTcl set score = 0' hgsql anoGam1 -e 'update anoEstNcl set score = 0' tail +2 expressed_cl_ids.txt > anoEstExpressed.tab hgsql anoGam1 -e \ 'CREATE TABLE anoEstExpressed ( \ name char(11) not null, \ PRIMARY KEY(name(11)) \ ); \ load data local infile "anoEstExpressed.tab" into table anoEstExpressed' # MAKE Drosophila Proteins track (DONE 06-27-05 braney) ssh kk mkdir -p /cluster/data/anoGam1/bed/tblastn.dm2FB cd /cluster/data/anoGam1/bed/tblastn.dm2FB ls -1S /iscratch/i/anoGam1/blastDb/*.nsq | sed "s/\.nsq//" > skeeter.lst mkdir fbfa # calculate a reasonable number of jobs (didn't actually do this for anoGam1) calc `wc /cluster/data/dm2/bed/blat.dm2FB/dm2FB.psl | awk "{print \\\$1}"`/\(164630/`wc skeeter.lst | awk "{print \\\$1}"`\) # 18929/(164630/815) = 93.707921 split -l 94 /cluster/data/dm2/bed/blat.dm2FB/dm2FB.psl fbfa/fb cd fbfa for i in *; do pslxToFa $i $i.fa; rm $i; done cd .. ls -1S fbfa/*.fa > fb.lst mkdir blastOut for i in `cat fb.lst`; do mkdir blastOut/`basename $i .fa`; done cat << '_EOF_' > blastGsub #LOOP blastSome $(path1) {check in line $(path2)} {check out exists blastOut/$(root2)/q.$(root1).psl } #ENDLOOP '_EOF_' cat << '_EOF_' > blastSome #!/bin/sh BLASTMAT=/iscratch/i/blast/data export BLASTMAT g=`basename $2` f=/tmp/`basename $3`.$g for eVal in 0.01 0.001 0.0001 0.00001 0.000001 1E-09 1E-11 do if /scratch/blast/blastall -M BLOSUM80 -m 0 -F no -e $eVal -p tblastn -d $1 -i $2 -o $f.8 then mv $f.8 $f.1 break; fi done if test -f $f.1 then if /cluster/bin/i386/blastToPsl $f.1 $f.2 then liftUp -nosort -type=".psl" -nohead $f.3 /cluster/data/anoGam1/jkStuff/subChr.lft warn $f.2 liftUp -nosort -type=".psl" -nohead $f.4 /cluster/data/anoGam1/jkStuff/liftAll.lft warn $f.3 liftUp -nosort -type=".psl" -pslQ -nohead $3.tmp /cluster/data/dm2/bed/blat.dm2FB/protein.lft warn $f.4 mv $3.tmp $3 rm -f $f.1 $f.2 $f.3 exit 0 fi fi rm -f $f.1 $f.2 $3.tmp $f.3 $f.8 exit 1 '_EOF_' chmod +x blastSome gensub2 skeeter.lst fb.lst blastGsub blastSpec ssh kk cd /cluster/data/anoGam1/bed/tblastn.dm2FB para create blastSpec para shove # jobs will need to be restarted, but they should all finish exit # back to kksilo cat << '_EOF_' > chainGsub #LOOP chainSome $(path1) #ENDLOOP '_EOF_' cat << '_EOF_' > chainSome (cd $1; cat q.*.psl | simpleChain -prot -outPsl -maxGap=20000 stdin ../c.`basename $1`.psl) '_EOF_' chmod +x chainSome ls -1dS `pwd`/blastOut/fb?? > chain.lst gensub2 chain.lst single chainGsub chainSpec ssh kki cd /cluster/data/anoGam1/bed/tblastn.dm2FB para create chainSpec para push exit # back to kksilo cd /cluster/data/anoGam1/bed/tblastn.dm2FB/blastOut for i in fb?? do awk "(\$13 - \$12)/\$11 > 0.6 {print}" c.$i.psl > c60.$i.psl sort -rn c60.$i.psl | pslUniq stdin u.$i.psl awk "((\$1 / \$11) ) > 0.60 { print }" c60.$i.psl > m60.$i.psl echo $i done sort -T /tmp -k 14,14 -k 16,16n -k 17,17n u.*.psl m60* | uniq > /cluster/data/anoGam1/bed/tblastn.dm2FB/blastDm2FB.psl cd .. ssh hgwdev cd /cluster/data/anoGam1/bed/tblastn.dm2FB hgLoadPsl anoGam1 blastDm2FB.psl exit # back to kksilo rm -rf blastOut # End tblastn # GENEID PREDICTIONS FROM IMIM (DONE 7/26/05 angie) ssh hgwdev mkdir /cluster/data/anoGam1/bed/geneid cd /cluster/data/anoGam1/bed/geneid foreach chr (`awk '{print $1;}' ../../chrom.sizes`) wget http://genome.imim.es/genepredictions/A.gambiae/golden_path_200302/geneidv1.2/$chr.gtf end ldHgGene -gtf -genePredExt anoGam1 geneid *.gtf # SWAP/CHAIN/NET DM2 (DONE 1/13/06 angie) ssh hgwdev mkdir /cluster/data/anoGam1/bed/blastz.dm2.swap cd /cluster/data/anoGam1/bed/blastz.dm2.swap doBlastzChainNet.pl -swap /cluster/data/dm2/bed/blastz.anoGam1/DEF >& do.log & tail -f do.log ln -s blastz.dm2.swap /cluster/data/anoGam1/bed/blastz.dm2 ########################################################################### # SWAP/CHAIN/NET DM3 (DONE 6/7/07 angie) ssh kkstore03 mkdir /cluster/data/anoGam1/bed/blastz.dm3.swap cd /cluster/data/anoGam1/bed/blastz.dm3.swap doBlastzChainNet.pl -swap /cluster/data/dm3/bed/blastz.anoGam1/DEF >& do.log & tail -f do.log ln -s blastz.dm3.swap /cluster/data/anoGam1/bed/blastz.dm3